diff --git a/deepmd/pt/entrypoints/main.py b/deepmd/pt/entrypoints/main.py index a317cea6a9..5583ee0326 100644 --- a/deepmd/pt/entrypoints/main.py +++ b/deepmd/pt/entrypoints/main.py @@ -50,6 +50,12 @@ from deepmd.pt.utils.stat import ( make_stat_input, ) +from deepmd.utils.argcheck import ( + normalize, +) +from deepmd.utils.compat import ( + update_deepmd_input, +) from deepmd.utils.path import ( DPPath, ) @@ -67,6 +73,11 @@ def get_trainer( force_load=False, init_frz_model=None, ): + # argcheck + if "model_dict" not in config.get("model", {}): + config = update_deepmd_input(config, warning=True, dump="input_v2_compat.json") + config = normalize(config) + # Initialize DDP local_rank = os.environ.get("LOCAL_RANK") if local_rank is not None: diff --git a/deepmd/pt/model/descriptor/dpa1.py b/deepmd/pt/model/descriptor/dpa1.py index b616d20cd8..6850c550fe 100644 --- a/deepmd/pt/model/descriptor/dpa1.py +++ b/deepmd/pt/model/descriptor/dpa1.py @@ -43,7 +43,7 @@ def __init__( post_ln=True, ffn=False, ffn_embed_dim=1024, - activation="tanh", + activation_function="tanh", scaling_factor=1.0, head_num=1, normalize=True, @@ -51,8 +51,30 @@ def __init__( return_rot=False, concat_output_tebd: bool = True, type: Optional[str] = None, + # not implemented + resnet_dt: bool = False, + type_one_side: bool = True, + precision: str = "default", + trainable: bool = True, + exclude_types: Optional[List[List[int]]] = None, + stripped_type_embedding: bool = False, + smooth_type_embdding: bool = False, ): super().__init__() + if resnet_dt: + raise NotImplementedError("resnet_dt is not supported.") + if not type_one_side: + raise NotImplementedError("type_one_side is not supported.") + if precision != "default" and precision != "float64": + raise NotImplementedError("precision is not supported.") + if not trainable: + raise NotImplementedError("trainable == False is not supported.") + if exclude_types is not None and exclude_types != []: + raise NotImplementedError("exclude_types is not supported.") + if stripped_type_embedding: + raise NotImplementedError("stripped_type_embedding is not supported.") + if smooth_type_embdding: + raise NotImplementedError("smooth_type_embdding is not supported.") del type self.se_atten = DescrptBlockSeAtten( rcut, @@ -71,7 +93,7 @@ def __init__( post_ln=post_ln, ffn=ffn, ffn_embed_dim=ffn_embed_dim, - activation=activation, + activation_function=activation_function, scaling_factor=scaling_factor, head_num=head_num, normalize=normalize, diff --git a/deepmd/pt/model/descriptor/dpa2.py b/deepmd/pt/model/descriptor/dpa2.py index b1df56a004..55bb77b366 100644 --- a/deepmd/pt/model/descriptor/dpa2.py +++ b/deepmd/pt/model/descriptor/dpa2.py @@ -197,7 +197,7 @@ def __init__( tebd_input_mode="concat", # tebd_input_mode='dot_residual_s', set_davg_zero=repinit_set_davg_zero, - activation=repinit_activation, + activation_function=repinit_activation, ) self.repformers = DescrptBlockRepformers( repformer_rcut, @@ -223,7 +223,7 @@ def __init__( attn2_hidden=repformer_attn2_hidden, attn2_nhead=repformer_attn2_nhead, attn2_has_gate=repformer_attn2_has_gate, - activation=repformer_activation, + activation_function=repformer_activation, update_style=repformer_update_style, set_davg_zero=repformer_set_davg_zero, smooth=True, diff --git a/deepmd/pt/model/descriptor/repformer_layer.py b/deepmd/pt/model/descriptor/repformer_layer.py index 55a2cba708..08fcb17b09 100644 --- a/deepmd/pt/model/descriptor/repformer_layer.py +++ b/deepmd/pt/model/descriptor/repformer_layer.py @@ -313,7 +313,7 @@ def
__init__( attn2_hidden: int = 16, attn2_nhead: int = 4, attn2_has_gate: bool = False, - activation: str = "tanh", + activation_function: str = "tanh", update_style: str = "res_avg", set_davg_zero: bool = True, # TODO smooth: bool = True, @@ -332,7 +332,7 @@ def __init__( self.set_davg_zero = set_davg_zero self.do_bn_mode = do_bn_mode self.bn_momentum = bn_momentum - self.act = get_activation_fn(activation) + self.act = get_activation_fn(activation_function) self.update_g1_has_grrg = update_g1_has_grrg self.update_g1_has_drrd = update_g1_has_drrd self.update_g1_has_conv = update_g1_has_conv diff --git a/deepmd/pt/model/descriptor/repformers.py b/deepmd/pt/model/descriptor/repformers.py index ad523bcc2d..2425139e16 100644 --- a/deepmd/pt/model/descriptor/repformers.py +++ b/deepmd/pt/model/descriptor/repformers.py @@ -76,7 +76,7 @@ def __init__( attn2_hidden: int = 16, attn2_nhead: int = 4, attn2_has_gate: bool = False, - activation: str = "tanh", + activation_function: str = "tanh", update_style: str = "res_avg", set_davg_zero: bool = True, # TODO smooth: bool = True, @@ -109,7 +109,7 @@ def __init__( self.set_davg_zero = set_davg_zero self.g1_dim = g1_dim self.g2_dim = g2_dim - self.act = get_activation_fn(activation) + self.act = get_activation_fn(activation_function) self.direct_dist = direct_dist self.add_type_ebd_to_seq = add_type_ebd_to_seq @@ -140,7 +140,7 @@ def __init__( attn2_has_gate=attn2_has_gate, attn2_hidden=attn2_hidden, attn2_nhead=attn2_nhead, - activation=activation, + activation_function=activation_function, update_style=update_style, smooth=smooth, ) diff --git a/deepmd/pt/model/descriptor/se_atten.py b/deepmd/pt/model/descriptor/se_atten.py index 0b32bd9341..a2197213ad 100644 --- a/deepmd/pt/model/descriptor/se_atten.py +++ b/deepmd/pt/model/descriptor/se_atten.py @@ -53,7 +53,7 @@ def __init__( post_ln=True, ffn=False, ffn_embed_dim=1024, - activation="tanh", + activation_function="tanh", scaling_factor=1.0, head_num=1, normalize=True, @@ -86,7 +86,7 @@ def __init__( self.post_ln = post_ln self.ffn = ffn self.ffn_embed_dim = ffn_embed_dim - self.activation = activation + self.activation = activation_function # TODO: To be fixed: precision should be given from inputs self.prec = torch.float64 self.scaling_factor = scaling_factor diff --git a/deepmd/tf/descriptor/se_atten.py b/deepmd/tf/descriptor/se_atten.py index 1c3c48e484..35b354c8da 100644 --- a/deepmd/tf/descriptor/se_atten.py +++ b/deepmd/tf/descriptor/se_atten.py @@ -152,6 +152,16 @@ def __init__( multi_task: bool = False, stripped_type_embedding: bool = False, smooth_type_embdding: bool = False, + # not implemented + post_ln=True, + ffn=False, + ffn_embed_dim=1024, + scaling_factor=1.0, + head_num=1, + normalize=True, + temperature=None, + return_rot=False, + concat_output_tebd: bool = True, **kwargs, ) -> None: if not set_davg_zero and not (stripped_type_embedding and smooth_type_embdding): @@ -159,6 +169,24 @@ def __init__( "Set 'set_davg_zero' False in descriptor 'se_atten' " "may cause unexpected incontinuity during model inference!" 
) + if not post_ln: + raise NotImplementedError("post_ln is not supported.") + if ffn: + raise NotImplementedError("ffn is not supported.") + if ffn_embed_dim != 1024: + raise NotImplementedError("ffn_embed_dim is not supported.") + if scaling_factor != 1.0: + raise NotImplementedError("scaling_factor is not supported.") + if head_num != 1: + raise NotImplementedError("head_num is not supported.") + if not normalize: + raise NotImplementedError("normalize is not supported.") + if temperature is not None: + raise NotImplementedError("temperature is not supported.") + if return_rot: + raise NotImplementedError("return_rot is not supported.") + if not concat_output_tebd: + raise NotImplementedError("concat_output_tebd is not supported.") DescrptSeA.__init__( self, rcut, diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index dbe4881952..8366f7bb38 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -45,6 +45,9 @@ "bfloat16": None, } +doc_only_tf_supported = "(Supported Backend: TensorFlow) " +doc_only_pt_supported = "(Supported Backend: PyTorch) " + def list_to_doc(xx): items = [] @@ -109,7 +112,7 @@ def __init__(self) -> None: self.__plugin = Plugin() def register( - self, name: str, alias: Optional[List[str]] = None + self, name: str, alias: Optional[List[str]] = None, doc: str = "" ) -> Callable[[], List[Argument]]: """Register a descriptor argument plugin. @@ -135,7 +138,7 @@ def descrpt_some_descrpt_args(): # convert alias to hashed item if isinstance(alias, list): alias = tuple(alias) - return self.__plugin.register((name, alias)) + return self.__plugin.register((name, alias, doc)) def get_all_argument(self, exclude_hybrid: bool = False) -> List[Argument]: """Get all arguments. @@ -151,11 +154,11 @@ def get_all_argument(self, exclude_hybrid: bool = False) -> List[Argument]: all arguments """ arguments = [] - for (name, alias), metd in self.__plugin.plugins.items(): + for (name, alias, doc), metd in self.__plugin.plugins.items(): if exclude_hybrid and name == "hybrid": continue arguments.append( - Argument(name=name, dtype=dict, sub_fields=metd(), alias=alias) + Argument(name=name, dtype=dict, sub_fields=metd(), alias=alias, doc=doc) ) return arguments @@ -163,7 +166,7 @@ def get_all_argument(self, exclude_hybrid: bool = False) -> List[Argument]: descrpt_args_plugin = ArgsPlugin() -@descrpt_args_plugin.register("loc_frame") +@descrpt_args_plugin.register("loc_frame", doc=doc_only_tf_supported) def descrpt_local_frame_args(): doc_sel_a = "A list of integers. The length of the list should be the same as the number of atom types in the system. `sel_a[i]` gives the selected number of type-i neighbors. The full relative coordinates of the neighbors are used by the descriptor." doc_sel_r = "A list of integers. The length of the list should be the same as the number of atom types in the system. `sel_r[i]` gives the selected number of type-i neighbors. Only relative distance of the neighbors are used by the descriptor. sel_a[i] + sel_r[i] is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius." @@ -244,7 +247,9 @@ def descrpt_se_a_args(): ] -@descrpt_args_plugin.register("se_e3", alias=["se_at", "se_a_3be", "se_t"]) +@descrpt_args_plugin.register( + "se_e3", alias=["se_at", "se_a_3be", "se_t"], doc=doc_only_tf_supported +) def descrpt_se_t_args(): doc_sel = 'This parameter set the number of selected neighbors for each type of atom. It can be:\n\n\ - `List[int]`. 
The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. `sel[i]` is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius. It is noted that the total sel value must be less than 4096 in a GPU environment.\n\n\ @@ -283,7 +288,7 @@ def descrpt_se_t_args(): ] -@descrpt_args_plugin.register("se_a_tpe", alias=["se_a_ebd"]) +@descrpt_args_plugin.register("se_a_tpe", alias=["se_a_ebd"], doc=doc_only_tf_supported) def descrpt_se_a_tpe_args(): doc_type_nchanl = "number of channels for type embedding" doc_type_nlayer = "number of hidden layers of type embedding net" @@ -348,7 +353,7 @@ def descrpt_se_r_args(): ] -@descrpt_args_plugin.register("hybrid") +@descrpt_args_plugin.register("hybrid", doc=doc_only_tf_supported) def descrpt_hybrid_args(): doc_list = "A list of descriptor definitions" @@ -376,12 +381,25 @@ def descrpt_se_atten_common_args(): doc_neuron = "Number of neurons in each hidden layers of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built." doc_axis_neuron = "Size of the submatrix of G (embedding matrix)." doc_activation_function = f'The activation function in the embedding net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())} Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version. If you set "None" or "none" here, no activation function will be used.' - doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection' - doc_type_one_side = r"If true, the embedding network parameters vary by types of neighbor atoms only, so there will be $N_\text{types}$ sets of embedding network parameters. Otherwise, the embedding network parameters vary by types of centric atoms and types of neighbor atoms, so there will be $N_\text{types}^2$ sets of embedding network parameters." - doc_precision = f"The precision of the embedding net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision." - doc_trainable = "If the parameters in the embedding net is trainable" + doc_resnet_dt = ( + doc_only_tf_supported + 'Whether to use a "Timestep" in the skip connection' + ) + doc_type_one_side = ( + doc_only_tf_supported + + r"If true, the embedding network parameters vary by types of neighbor atoms only, so there will be $N_\text{types}$ sets of embedding network parameters. Otherwise, the embedding network parameters vary by types of centric atoms and types of neighbor atoms, so there will be $N_\text{types}^2$ sets of embedding network parameters." + ) + doc_precision = ( + doc_only_tf_supported + + f"The precision of the embedding net parameters, supported options are {list_to_doc(PRECISION_DICT.keys())} Default follows the interface precision." + ) + doc_trainable = ( + doc_only_tf_supported + "If the parameters in the embedding net is trainable" + ) doc_seed = "Random seed for parameter initialization" - doc_exclude_types = "The excluded pairs of types which have no interaction with each other. For example, `[[0, 1]]` means no interaction between type 0 and type 1." + doc_exclude_types = ( + doc_only_tf_supported + + "The excluded pairs of types which have no interaction with each other. For example, `[[0, 1]]` means no interaction between type 0 and type 1." 
+ ) doc_attn = "The length of hidden vectors in attention layers" doc_attn_layer = "The number of attention layers. Note that model compression of `se_atten` is only enabled when attn_layer==0 and stripped_type_embedding is True" doc_attn_dotr = "Whether to do dot product with the normalized relative coordinates" @@ -432,7 +450,7 @@ def descrpt_se_atten_common_args(): ] -@descrpt_args_plugin.register("se_atten") +@descrpt_args_plugin.register("se_atten", alias=["dpa1"]) def descrpt_se_atten_args(): doc_stripped_type_embedding = "Whether to strip the type embedding into a separated embedding network. Setting it to `False` will fall back to the previous version of `se_atten` which is non-compressible." doc_smooth_type_embdding = "When using stripped type embedding, whether to dot smooth factor on the network output of type embedding to keep the network smooth, instead of setting `set_davg_zero` to be True." @@ -445,22 +463,60 @@ def descrpt_se_atten_args(): bool, optional=True, default=False, - doc=doc_stripped_type_embedding, + doc=doc_only_tf_supported + doc_stripped_type_embedding, ), Argument( "smooth_type_embdding", bool, optional=True, default=False, - doc=doc_smooth_type_embdding, + doc=doc_only_tf_supported + doc_smooth_type_embdding, ), Argument( "set_davg_zero", bool, optional=True, default=True, doc=doc_set_davg_zero ), + # pt only + Argument("tebd_dim", int, optional=True, default=8, doc=doc_only_pt_supported), + Argument( + "tebd_input_mode", + str, + optional=True, + default="concat", + doc=doc_only_pt_supported, + ), + Argument( + "post_ln", bool, optional=True, default=True, doc=doc_only_pt_supported + ), + Argument("ffn", bool, optional=True, default=False, doc=doc_only_pt_supported), + Argument( + "ffn_embed_dim", int, optional=True, default=1024, doc=doc_only_pt_supported + ), + Argument( + "scaling_factor", + float, + optional=True, + default=1.0, + doc=doc_only_pt_supported, + ), + Argument("head_num", int, optional=True, default=1, doc=doc_only_pt_supported), + Argument( + "normalize", bool, optional=True, default=True, doc=doc_only_pt_supported + ), + Argument("temperature", float, optional=True, doc=doc_only_pt_supported), + Argument( + "return_rot", bool, optional=True, default=False, doc=doc_only_pt_supported + ), + Argument( + "concat_output_tebd", + bool, + optional=True, + default=True, + doc=doc_only_pt_supported, + ), ] -@descrpt_args_plugin.register("se_atten_v2") +@descrpt_args_plugin.register("se_atten_v2", doc=doc_only_tf_supported) def descrpt_se_atten_v2_args(): doc_set_davg_zero = "Set the normalization average to zero. This option should be set when `se_atten` descriptor or `atom_ener` in the energy fitting is used" @@ -472,12 +528,272 @@ def descrpt_se_atten_v2_args(): ] -@descrpt_args_plugin.register("se_a_ebd_v2", alias=["se_a_tpe_v2"]) +@descrpt_args_plugin.register("dpa2", doc=doc_only_pt_supported) +def descrpt_dpa2_args(): + # Generated by GitHub Copilot + doc_repinit_rcut = "The cut-off radius of the repinit block" + doc_repinit_rcut_smth = "From this position the inverse distance smoothly decays to 0 at the cut-off. Used in the repinit block." + doc_repinit_nsel = "Maximally possible number of neighbors for repinit block." + doc_repformer_rcut = "The cut-off radius of the repformer block" + doc_repformer_rcut_smth = "From this position the inverse distance smoothly decays to 0 at the cut-off. Used in the repformer block." + doc_repformer_nsel = "Maximally possible number of neighbors for repformer block."
+ doc_tebd_dim = "The dimension of atom type embedding" + doc_concat_output_tebd = ( + "Whether to concat type embedding at the output of the descriptor." + ) + doc_repinit_neuron = "repinit block: the number of neurons in the embedding net." + doc_repinit_axis_neuron = ( + "repinit block: the number of dimensions of the split in the symmetrization op." + ) + doc_repinit_activation = ( + "repinit block: the activation function in the embedding net" + ) + doc_repformer_nlayers = "repformers block: the number of repformer layers" + doc_repformer_g1_dim = "repformers block: the dimension of single-atom rep" + doc_repformer_g2_dim = "repformers block: the dimension of invariant pair-atom rep" + doc_repformer_axis_dim = ( + "repformers block: the number of dimensions of the split in the symmetrization ops." + ) + doc_repformer_do_bn_mode = "repformers block: whether to do batch normalization in the repformer layers" + doc_repformer_bn_momentum = "repformers block: momentum in the batch normalization" + doc_repformer_update_g1_has_conv = ( + "repformers block: update the g1 rep with the convolution term" + ) + doc_repformer_update_g1_has_drrd = ( + "repformers block: update the g1 rep with the drrd term" + ) + doc_repformer_update_g1_has_grrg = ( + "repformers block: update the g1 rep with the grrg term" + ) + doc_repformer_update_g1_has_attn = ( + "repformers block: update the g1 rep with the localized self-attention" + ) + doc_repformer_update_g2_has_g1g1 = ( + "repformers block: update the g2 rep with the g1xg1 term" + ) + doc_repformer_update_g2_has_attn = ( + "repformers block: update the g2 rep with the gated self-attention" + ) + doc_repformer_update_h2 = "repformers block: update the h2 rep" + doc_repformer_attn1_hidden = ( + "repformers block: the hidden dimension of localized self-attention" + ) + doc_repformer_attn1_nhead = ( + "repformers block: the number of heads in localized self-attention" + ) + doc_repformer_attn2_hidden = ( + "repformers block: the hidden dimension of gated self-attention" + ) + doc_repformer_attn2_nhead = ( + "repformers block: the number of heads in gated self-attention" + ) + doc_repformer_attn2_has_gate = ( + "repformers block: whether to use a gate in the gated self-attention" + ) + doc_repformer_activation = "repformers block: the activation function in the MLPs." + doc_repformer_update_style = "repformers block: the style of updating a rep; can be res_avg or res_incr. res_avg updates a rep `u` with: u = 1/\\sqrt{n+1} (u + u_1 + u_2 + ...
+ u_n)" + doc_repformer_set_davg_zero = "repformers block: set the avg to zero in statistics" + doc_repformer_add_type_ebd_to_seq = ( + "repformers block: concatenate the type embedding at the output" + ) + return [ + Argument("repinit_rcut", float, doc=doc_repinit_rcut), + Argument("repinit_rcut_smth", float, doc=doc_repinit_rcut_smth), + Argument("repinit_nsel", int, doc=doc_repinit_nsel), + Argument("repformer_rcut", float, doc=doc_repformer_rcut), + Argument("repformer_rcut_smth", float, doc=doc_repformer_rcut_smth), + Argument("repformer_nsel", int, doc=doc_repformer_nsel), + Argument("tebd_dim", int, optional=True, default=8, doc=doc_tebd_dim), + Argument( + "concat_output_tebd", + bool, + optional=True, + default=True, + doc=doc_concat_output_tebd, + ), + Argument( + "repinit_neuron", + list, + optional=True, + default=[25, 50, 100], + doc=doc_repinit_neuron, + ), + Argument( + "repinit_axis_neuron", + int, + optional=True, + default=16, + doc=doc_repinit_axis_neuron, + ), + Argument("repinit_set_davg_zero", bool, optional=True, default=True), + Argument( + "repinit_activation", + str, + optional=True, + default="tanh", + doc=doc_repinit_activation, + ), + Argument( + "repformer_nlayers", + int, + optional=True, + default=3, + doc=doc_repformer_nlayers, + ), + Argument( + "repformer_g1_dim", + int, + optional=True, + default=128, + doc=doc_repformer_g1_dim, + ), + Argument( + "repformer_g2_dim", int, optional=True, default=16, doc=doc_repformer_g2_dim + ), + Argument( + "repformer_axis_dim", + int, + optional=True, + default=4, + doc=doc_repformer_axis_dim, + ), + Argument( + "repformer_do_bn_mode", + str, + optional=True, + default="no", + doc=doc_repformer_do_bn_mode, + ), + Argument( + "repformer_bn_momentum", + float, + optional=True, + default=0.1, + doc=doc_repformer_bn_momentum, + ), + Argument( + "repformer_update_g1_has_conv", + bool, + optional=True, + default=True, + doc=doc_repformer_update_g1_has_conv, + ), + Argument( + "repformer_update_g1_has_drrd", + bool, + optional=True, + default=True, + doc=doc_repformer_update_g1_has_drrd, + ), + Argument( + "repformer_update_g1_has_grrg", + bool, + optional=True, + default=True, + doc=doc_repformer_update_g1_has_grrg, + ), + Argument( + "repformer_update_g1_has_attn", + bool, + optional=True, + default=True, + doc=doc_repformer_update_g1_has_attn, + ), + Argument( + "repformer_update_g2_has_g1g1", + bool, + optional=True, + default=True, + doc=doc_repformer_update_g2_has_g1g1, + ), + Argument( + "repformer_update_g2_has_attn", + bool, + optional=True, + default=True, + doc=doc_repformer_update_g2_has_attn, + ), + Argument( + "repformer_update_h2", + bool, + optional=True, + default=False, + doc=doc_repformer_update_h2, + ), + Argument( + "repformer_attn1_hidden", + int, + optional=True, + default=64, + doc=doc_repformer_attn1_hidden, + ), + Argument( + "repformer_attn1_nhead", + int, + optional=True, + default=4, + doc=doc_repformer_attn1_nhead, + ), + Argument( + "repformer_attn2_hidden", + int, + optional=True, + default=16, + doc=doc_repformer_attn2_hidden, + ), + Argument( + "repformer_attn2_nhead", + int, + optional=True, + default=4, + doc=doc_repformer_attn2_nhead, + ), + Argument( + "repformer_attn2_has_gate", + bool, + optional=True, + default=False, + doc=doc_repformer_attn2_has_gate, + ), + Argument( + "repformer_activation", + str, + optional=True, + default="tanh", + doc=doc_repformer_activation, + ), + Argument( + "repformer_update_style", + str, + optional=True, + default="res_avg", + 
doc=doc_repformer_update_style, + ), + Argument( + "repformer_set_davg_zero", + bool, + optional=True, + default=True, + doc=doc_repformer_set_davg_zero, + ), + Argument( + "repformer_add_type_ebd_to_seq", + bool, + optional=True, + default=False, + doc=doc_repformer_add_type_ebd_to_seq, + ), + ] + + +@descrpt_args_plugin.register( + "se_a_ebd_v2", alias=["se_a_tpe_v2"], doc=doc_only_tf_supported +) def descrpt_se_a_ebd_v2_args(): return descrpt_se_a_args() -@descrpt_args_plugin.register("se_a_mask") +@descrpt_args_plugin.register("se_a_mask", doc=doc_only_tf_supported) def descrpt_se_a_mask_args(): doc_sel = 'This parameter sets the number of selected neighbors for each type of atom. It can be:\n\n\ - `List[int]`. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. `sel[i]` is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius. It is noted that the total sel value must be less than 4096 in a GPU environment.\n\n\ @@ -637,7 +953,7 @@ def fitting_ener(): ] -@fitting_args_plugin.register("dos") +@fitting_args_plugin.register("dos", doc=doc_only_tf_supported) def fitting_dos(): doc_numb_fparam = "The dimension of the frame parameter. If set to >0, file `fparam.npy` should be included to provided the input fparams." doc_numb_aparam = "The dimension of the atomic parameter. If set to >0, file `aparam.npy` should be included to provided the input aparams." @@ -684,7 +1000,7 @@ def fitting_dos(): ] -@fitting_args_plugin.register("polar") +@fitting_args_plugin.register("polar", doc=doc_only_tf_supported) def fitting_polar(): doc_neuron = "The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built." doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())} Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version. If you set "None" or "none" here, no activation function will be used.' @@ -738,7 +1054,7 @@ def fitting_polar(): # return fitting_polar() -@fitting_args_plugin.register("dipole") +@fitting_args_plugin.register("dipole", doc=doc_only_tf_supported) def fitting_dipole(): doc_neuron = "The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built." doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())} Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version. If you set "None" or "none" here, no activation function will be used.' 
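The descriptor and fitting registrations above all funnel through the extended `ArgsPlugin.register(..., doc=...)`, which stores the backend note next to the name/alias pair and prepends it to the `Argument` built in `get_all_argument()`. A minimal sketch of the pattern follows; the descriptor name and its single field are hypothetical, while `descrpt_args_plugin`, `Argument`, and the `doc_only_*` prefixes come from this patch:

```python
from dargs import Argument

# Hypothetical registration: the doc= keyword added in this patch tags the
# whole argument group with a backend-support note that is prepended to the
# generated documentation for this descriptor.
@descrpt_args_plugin.register("my_descrpt", doc=doc_only_pt_supported)
def descrpt_my_descrpt_args():
    return [
        Argument("rcut", float, optional=True, default=6.0, doc="Cut-off radius."),
    ]

# ArgsPlugin.get_all_argument() then builds, for this entry:
#   Argument(name="my_descrpt", dtype=dict, sub_fields=[...], alias=None,
#            doc="(Supported Backend: PyTorch) ")
```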
@@ -900,16 +1216,30 @@ def model_args(exclude_hybrid=False): default=10, doc=doc_data_bias_nsample, ), - Argument("use_srtab", str, optional=True, doc=doc_use_srtab), - Argument("smin_alpha", float, optional=True, doc=doc_smin_alpha), - Argument("sw_rmin", float, optional=True, doc=doc_sw_rmin), - Argument("sw_rmax", float, optional=True, doc=doc_sw_rmax), + Argument( + "use_srtab", + str, + optional=True, + doc=doc_only_tf_supported + doc_use_srtab, + ), + Argument( + "smin_alpha", + float, + optional=True, + doc=doc_only_tf_supported + doc_smin_alpha, + ), + Argument( + "sw_rmin", float, optional=True, doc=doc_only_tf_supported + doc_sw_rmin + ), + Argument( + "sw_rmax", float, optional=True, doc=doc_only_tf_supported + doc_sw_rmax + ), Argument( "srtab_add_bias", bool, optional=True, default=True, - doc=doc_srtab_add_bias, + doc=doc_only_tf_supported + doc_srtab_add_bias, ), Argument( "type_embedding", @@ -917,7 +1247,7 @@ def model_args(exclude_hybrid=False): type_embedding_args(), [], optional=True, - doc=doc_type_embedding, + doc=doc_only_tf_supported + doc_type_embedding, ), Argument( "modifier", @@ -925,7 +1255,7 @@ def model_args(exclude_hybrid=False): [], [modifier_variant_type_args()], optional=True, - doc=doc_modifier, + doc=doc_only_tf_supported + doc_modifier, ), Argument( "compress", @@ -933,7 +1263,7 @@ def model_args(exclude_hybrid=False): [], [model_compression_type_args()], optional=True, - doc=doc_compress_config, + doc=doc_only_tf_supported + doc_compress_config, fold_subdoc=True, ), Argument("spin", dict, spin_args(), [], optional=True, doc=doc_spin), @@ -997,7 +1327,7 @@ def multi_model_args() -> Argument: ), Argument("fitting_net_dict", dict, doc=doc_fitting_net_dict), ], - doc="Multiple-task model.", + doc=doc_only_tf_supported + "Multiple-task model.", ) return ca @@ -1016,6 +1346,7 @@ def pairwise_dprc() -> Argument: qm_model_args, qmmm_model_args, ], + doc=doc_only_tf_supported, ) return ca @@ -1028,6 +1359,7 @@ def frozen_model_args() -> Argument: [ Argument("model_file", str, optional=False, doc=doc_model_file), ], + doc=doc_only_tf_supported, ) return ca @@ -1047,7 +1379,7 @@ def pairtab_model_args() -> Argument: Argument("rcut", float, optional=False, doc=doc_rcut), Argument("sel", [int, List[int], str], optional=False, doc=doc_sel), ], - doc="Pairwise tabulation energy model.", + doc=doc_only_tf_supported + "Pairwise tabulation energy model.", ) return ca @@ -1076,6 +1408,7 @@ def linear_ener_model_args() -> Argument: doc=doc_weights, ), ], + doc=doc_only_tf_supported, ) return ca @@ -1390,7 +1723,7 @@ def loss_ener_spin(): ] -@loss_args_plugin.register("dos") +@loss_args_plugin.register("dos", doc=doc_only_tf_supported) def loss_dos(): doc_start_pref_dos = start_pref("Density of State (DOS)") doc_limit_pref_dos = limit_pref("Density of State (DOS)") @@ -1465,7 +1798,7 @@ def loss_dos(): # YWolfeee: Modified to support tensor type of loss args. -@loss_args_plugin.register("tensor") +@loss_args_plugin.register("tensor", doc=doc_only_tf_supported) def loss_tensor(): # doc_global_weight = "The prefactor of the weight of global loss. It should be larger than or equal to 0. If only `pref` is provided or both are not provided, training will be global mode, i.e. the shape of 'polarizability.npy` or `dipole.npy` should be #frams x [9 or 3]." # doc_local_weight = "The prefactor of the weight of atomic loss. It should be larger than or equal to 0. If only `pref_atomic` is provided, training will be atomic mode, i.e. 
the shape of `polarizability.npy` or `dipole.npy` should be #frames x ([9 or 3] x #selected atoms). If both `pref` and `pref_atomic` are provided, training will be combined mode, and atomic label should be provided as well." @@ -1746,13 +2079,19 @@ def training_args(): # ! modified by Ziyao: data configuration isolated. Argument( "time_training", bool, optional=True, default=True, doc=doc_time_training ), - Argument("profiling", bool, optional=True, default=False, doc=doc_profiling), + Argument( + "profiling", + bool, + optional=True, + default=False, + doc=doc_only_tf_supported + doc_profiling, + ), Argument( "profiling_file", str, optional=True, default="timeline.json", - doc=doc_profiling_file, + doc=doc_only_tf_supported + doc_profiling_file, ), Argument( "enable_profiler", @@ -1776,10 +2115,38 @@ def training_args(): # ! modified by Ziyao: data configuration isolated. ), Argument("data_dict", dict, optional=True, doc=doc_data_dict), Argument("fitting_weight", dict, optional=True, doc=doc_fitting_weight), + Argument("warmup_steps", int, optional=True, doc=doc_only_pt_supported), + Argument("gradient_max_norm", float, optional=True, doc=doc_only_pt_supported), + Argument("stat_file", str, optional=True, doc=doc_only_pt_supported), + ] + variants = [ + Variant( + "opt_type", + choices=[ + Argument("Adam", dict, [], [], optional=True), + Argument( + "LKF", + dict, + [ + Argument( + "kf_blocksize", + int, + optional=True, + doc=doc_only_pt_supported, + ), + ], + [], + optional=True, + ), + ], + optional=True, + default_tag="Adam", + doc=doc_only_pt_supported, + ) ] doc_training = "The training options." - return Argument("training", dict, args, [], doc=doc_training) + return Argument("training", dict, args, variants, doc=doc_training) def make_index(keys): diff --git a/examples/water/se_atten/input_torch.json b/examples/water/se_atten/input_torch.json index bc948cc2a0..7e9cf06f35 100644 --- a/examples/water/se_atten/input_torch.json +++ b/examples/water/se_atten/input_torch.json @@ -17,6 +17,7 @@ ], "tebd_dim": 8, "axis_neuron": 16, + "type_one_side": true, "attn": 128, "attn_layer": 2, "attn_dotr": true, @@ -24,7 +25,7 @@ "post_ln": true, "ffn": false, "ffn_embed_dim": 1024, - "activation": "tanh", + "activation_function": "tanh", "scaling_factor": 1.0, "head_num": 1, "normalize": true, @@ -78,11 +79,6 @@ "numb_btch": 3, "_comment": "that's all" }, - "wandb_config": { - "wandb_enabled": false, - "entity": "dp_model_engineering", - "project": "DPA" - }, "numb_steps": 1000000, "seed": 10, "disp_file": "lcurve.out", diff --git a/source/tests/common/test_examples.py b/source/tests/common/test_examples.py index ad06925eab..49abcf2f90 100644 --- a/source/tests/common/test_examples.py +++ b/source/tests/common/test_examples.py @@ -42,6 +42,9 @@ p_examples / "dprc" / "normal" / "input.json", p_examples / "dprc" / "pairwise" / "input.json", p_examples / "dprc" / "generalized_force" / "input.json", + p_examples / "water" / "se_e2_a" / "input_torch.json", + p_examples / "water" / "se_atten" / "input_torch.json", + p_examples / "water" / "dpa2" / "input_torch.json", ) diff --git a/source/tests/pt/model/models/dpa1.json b/source/tests/pt/model/models/dpa1.json index dd838ac692..5d2c65c214 100644 --- a/source/tests/pt/model/models/dpa1.json +++ b/source/tests/pt/model/models/dpa1.json @@ -21,7 +21,7 @@ "post_ln": true, "ffn": false, "ffn_embed_dim": 10, - "activation": "tanh", + "activation_function": "tanh", "scaling_factor": 1.0, "head_num": 1, "normalize": true, diff --git 
a/source/tests/pt/model/models/dpa2_hyb.json b/source/tests/pt/model/models/dpa2_hyb.json index b5d53b0246..ee69ed4d69 100644 --- a/source/tests/pt/model/models/dpa2_hyb.json +++ b/source/tests/pt/model/models/dpa2_hyb.json @@ -25,7 +25,7 @@ "post_ln": true, "ffn": false, "ffn_embed_dim": 10, - "activation": "tanh", + "activation_function": "tanh", "scaling_factor": 1.0, "head_num": 1, "normalize": true, diff --git a/source/tests/pt/model/test_jit.py b/source/tests/pt/model/test_jit.py index f13dade183..a1aa9658fc 100644 --- a/source/tests/pt/model/test_jit.py +++ b/source/tests/pt/model/test_jit.py @@ -85,15 +85,15 @@ def setUp(self): self.config["training"]["training_data"]["systems"] = data_file self.config["training"]["validation_data"]["systems"] = data_file self.config["model"] = deepcopy(model_dpa2) - self.config["model"]["descriptor"]["rcut"] = self.config["model"]["descriptor"][ - "repinit_rcut" - ] - self.config["model"]["descriptor"]["rcut_smth"] = self.config["model"][ - "descriptor" - ]["repinit_rcut_smth"] - self.config["model"]["descriptor"]["sel"] = self.config["model"]["descriptor"][ - "repinit_nsel" - ] + # self.config["model"]["descriptor"]["rcut"] = self.config["model"]["descriptor"][ + # "repinit_rcut" + # ] + # self.config["model"]["descriptor"]["rcut_smth"] = self.config["model"][ + # "descriptor" + # ]["repinit_rcut_smth"] + # self.config["model"]["descriptor"]["sel"] = self.config["model"]["descriptor"][ + # "repinit_nsel" + # ] self.config["training"]["numb_steps"] = 10 self.config["training"]["save_freq"] = 10 diff --git a/source/tests/pt/model/test_permutation.py b/source/tests/pt/model/test_permutation.py index b97cb349ad..45790bf43d 100644 --- a/source/tests/pt/model/test_permutation.py +++ b/source/tests/pt/model/test_permutation.py @@ -115,12 +115,13 @@ "post_ln": True, "ffn": False, "ffn_embed_dim": 512, - "activation": "tanh", + "activation_function": "tanh", "scaling_factor": 1.0, "head_num": 1, "normalize": False, "temperature": 1.0, "set_davg_zero": True, + "type_one_side": True, }, "fitting_net": { "neuron": [24, 24, 24], @@ -149,7 +150,7 @@ "post_ln": True, "ffn": False, "ffn_embed_dim": 1024, - "activation": "tanh", + "activation_function": "tanh", "scaling_factor": 1.0, "head_num": 1, "normalize": True, diff --git a/source/tests/pt/model/water/se_atten.json b/source/tests/pt/model/water/se_atten.json index 3ed80ae892..6b6fca50d3 100644 --- a/source/tests/pt/model/water/se_atten.json +++ b/source/tests/pt/model/water/se_atten.json @@ -16,6 +16,7 @@ 100 ], "axis_neuron": 16, + "type_one_side": true, "attn": 64, "attn_layer": 2, "attn_dotr": true, @@ -23,7 +24,7 @@ "post_ln": true, "ffn": false, "ffn_embed_dim": 512, - "activation": "tanh", + "activation_function": "tanh", "scaling_factor": 1.0, "head_num": 1, "normalize": false, diff --git a/source/tests/pt/test_training.py b/source/tests/pt/test_training.py index 2186467788..f86691cde6 100644 --- a/source/tests/pt/test_training.py +++ b/source/tests/pt/test_training.py @@ -79,15 +79,15 @@ def setUp(self): self.config["training"]["training_data"]["systems"] = data_file self.config["training"]["validation_data"]["systems"] = data_file self.config["model"] = deepcopy(model_dpa2) - self.config["model"]["descriptor"]["rcut"] = self.config["model"]["descriptor"][ - "repinit_rcut" - ] - self.config["model"]["descriptor"]["rcut_smth"] = self.config["model"][ - "descriptor" - ]["repinit_rcut_smth"] - self.config["model"]["descriptor"]["sel"] = self.config["model"]["descriptor"][ - "repinit_nsel" - ] + # 
self.config["model"]["descriptor"]["rcut"] = self.config["model"]["descriptor"][ + # "repinit_rcut" + # ] + # self.config["model"]["descriptor"]["rcut_smth"] = self.config["model"][ + # "descriptor" + # ]["repinit_rcut_smth"] + # self.config["model"]["descriptor"]["sel"] = self.config["model"]["descriptor"][ + # "repinit_nsel" + # ] self.config["training"]["numb_steps"] = 1 self.config["training"]["save_freq"] = 1