diff --git a/deepmd/dpmodel/model/model.py b/deepmd/dpmodel/model/model.py
index 6dea9041fc..1a883452af 100644
--- a/deepmd/dpmodel/model/model.py
+++ b/deepmd/dpmodel/model/model.py
@@ -36,6 +36,10 @@ def get_standard_model(data: dict) -> EnergyModel:
     data : dict
         The data to construct the model.
     """
+    if "type_embedding" in data:
+        raise ValueError(
+            "In the DP backend, type_embedding is not at the model level, but within the descriptor. See type embedding documentation for details."
+        )
     data["descriptor"]["type_map"] = data["type_map"]
     data["descriptor"]["ntypes"] = len(data["type_map"])
     fitting_type = data["fitting_net"].pop("type")
diff --git a/deepmd/jax/model/model.py b/deepmd/jax/model/model.py
index 983815100c..8b7d375841 100644
--- a/deepmd/jax/model/model.py
+++ b/deepmd/jax/model/model.py
@@ -35,6 +35,10 @@ def get_standard_model(data: dict):
         The data to construct the model.
     """
     data = deepcopy(data)
+    if "type_embedding" in data:
+        raise ValueError(
+            "In the JAX backend, type_embedding is not at the model level, but within the descriptor. See type embedding documentation for details."
+        )
     descriptor_type = data["descriptor"].pop("type")
     data["descriptor"]["type_map"] = data["type_map"]
     data["descriptor"]["ntypes"] = len(data["type_map"])
diff --git a/deepmd/pt/model/model/__init__.py b/deepmd/pt/model/model/__init__.py
index 03896f3e59..2a6fcffe7e 100644
--- a/deepmd/pt/model/model/__init__.py
+++ b/deepmd/pt/model/model/__init__.py
@@ -73,6 +73,10 @@
 
 
 def _get_standard_model_components(model_params, ntypes):
+    if "type_embedding" in model_params:
+        raise ValueError(
+            "In the PyTorch backend, type_embedding is not at the model level, but within the descriptor. See type embedding documentation for details."
+        )
     # descriptor
     model_params["descriptor"]["ntypes"] = ntypes
     model_params["descriptor"]["type_map"] = copy.deepcopy(model_params["type_map"])
diff --git a/deepmd/tf/model/model.py b/deepmd/tf/model/model.py
index 5e3f99bc2d..7b61fb6690 100644
--- a/deepmd/tf/model/model.py
+++ b/deepmd/tf/model/model.py
@@ -841,6 +841,10 @@ def serialize(self, suffix: str = "") -> dict:
             Name suffix to identify this descriptor
         """
         if self.typeebd is not None:
+            if not self.descrpt.explicit_ntypes:
+                raise RuntimeError(
+                    "type embedding for descriptors without mixed types is not supported in other backends"
+                )
             self.descrpt.type_embedding = self.typeebd
             self.fitting.tebd_dim = self.typeebd.neuron[-1]
         if self.spin is not None:
diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py
index 3c4a4e6204..a37d3e646a 100644
--- a/deepmd/utils/argcheck.py
+++ b/deepmd/utils/argcheck.py
@@ -1772,7 +1772,7 @@ def model_args(exclude_hybrid=False):
     doc_data_stat_nbatch = "The model determines the normalization from the statistics of the data. This key specifies the number of `frames` in each `system` used for statistics."
     doc_data_stat_protect = "Protect parameter for atomic energy regression."
     doc_data_bias_nsample = "The number of training samples in a system to compute and change the energy bias."
-    doc_type_embedding = "The type embedding."
+    doc_type_embedding = "The type embedding. In other backends, the type embedding is already included in the descriptor."
     doc_modifier = "The modifier of model output."
     doc_use_srtab = "The table for the short-range pairwise interaction added on top of DP. The table is a text data file with (N_t + 1) * N_t / 2 + 1 columes. The first colume is the distance between atoms. The second to the last columes are energies for pairs of certain types. For example we have two atom types, 0 and 1. The columes from 2nd to 4th are for 0-0, 0-1 and 1-1 correspondingly."
     doc_smin_alpha = "The short-range tabulated interaction will be switched according to the distance of the nearest neighbor. This distance is calculated by softmin. This parameter is the decaying parameter in the softmin. It is only required when `use_srtab` is provided."
diff --git a/doc/data/system.md b/doc/data/system.md
index b5db516726..b50c6fa256 100644
--- a/doc/data/system.md
+++ b/doc/data/system.md
@@ -1,6 +1,6 @@
 # System
 
-DeePMD-kit takes a **system** as the data structure. A snapshot of a system is called a **frame**. A system may contain multiple frames with the same atom types and numbers, i.e. the same formula (like `H2O`). To contains data with different formulas, one usually needs to divide data into multiple systems, which may sometimes result in sparse-frame systems. See a [new system format](../model/train-se-atten.md#data-format) to further combine different systems with the same atom numbers, when training with descriptor `se_atten`.
+DeePMD-kit takes a **system** as the data structure. A snapshot of a system is called a **frame**. A system may contain multiple frames with the same atom types and numbers, i.e. the same formula (like `H2O`). To contain data with different formulas, one usually needs to divide data into multiple systems, which may sometimes result in sparse-frame systems.
 
 A system should contain system properties, input frame properties, and labeled frame properties. The system property contains the following property:
 
diff --git a/doc/freeze/compress.md b/doc/freeze/compress.md
index 4f30458df1..9fa8cd4b3a 100644
--- a/doc/freeze/compress.md
+++ b/doc/freeze/compress.md
@@ -112,9 +112,8 @@ The model compression interface requires the version of DeePMD-kit used in the o
 
 **Acceptable descriptor type**
 
-Descriptors with `se_e2_a`, `se_e3`, `se_e2_r` and `se_atten_v2` types are supported by the model compression feature. `Hybrid` mixed with the above descriptors is also supported.
-
-Notice: Model compression for the `se_atten_v2` descriptor is exclusively designed for models with the training parameter {ref}`attn_layer <model[standard]/descriptor[se_atten_v2]/attn_layer>` set to 0.
+Not all descriptors support model compression.
+See the documentation of a specific descriptor to see whether it supports model compression.
 
 **Available activation functions for descriptor:**
 
diff --git a/doc/model/dpa2.md b/doc/model/dpa2.md
index 27ffc1b14d..a4f1e8ad04 100644
--- a/doc/model/dpa2.md
+++ b/doc/model/dpa2.md
@@ -21,3 +21,12 @@ otherwise the communication between GPU cards falls back to the slower CPU imple
 ## Data format
 
 DPA-2 supports both the [standard data format](../data/system.md) and the [mixed type data format](../data/system.md#mixed-type).
+
+## Type embedding
+
+Type embedding is within this descriptor with the {ref}`tebd_dim <model[standard]/descriptor[dpa2]/tebd_dim>` argument.
+
+## Model compression
+
+Model compression is supported, but only the `repinit` part is compressed.
+The effect is limited.
diff --git a/doc/model/train-hybrid.md b/doc/model/train-hybrid.md
index da3b40487b..3c9fff557e 100644
--- a/doc/model/train-hybrid.md
+++ b/doc/model/train-hybrid.md
@@ -48,3 +48,13 @@ A complete training input script of this example can be found in the directory
 ```bash
 $deepmd_source_dir/examples/water/hybrid/input.json
 ```
+
+## Type embedding
+
+Type embedding is different between the TensorFlow backend and other backends.
+In the TensorFlow backend, all descriptors share the same type embedding that is defined at the model level.
+In other backends, each descriptor has its own type embedding and their parameters may be different.
+
+## Model compression
+
+Model compression is supported if all sub-descriptors support model compression.
diff --git a/doc/model/train-se-a-mask.md b/doc/model/train-se-a-mask.md
index 93edfc999e..a26ca8e46e 100644
--- a/doc/model/train-se-a-mask.md
+++ b/doc/model/train-se-a-mask.md
@@ -84,3 +84,11 @@ And the `loss` section in the training input script should be set as follows.
     "_comment": " that's all"
   }
 ```
+
+## Type embedding
+
+Same as [`se_e2_a`](./train-se-e2-a.md).
+
+## Model compression
+
+Same as [`se_e2_a`](./train-se-e2-a.md).
diff --git a/doc/model/train-se-atten.md b/doc/model/train-se-atten.md
index 3e88a4e950..d8d7c3c98c 100644
--- a/doc/model/train-se-atten.md
+++ b/doc/model/train-se-atten.md
@@ -4,8 +4,6 @@
 **Supported backends**: TensorFlow {{ tensorflow_icon }}, PyTorch {{ pytorch_icon }}, JAX {{ jax_icon }}, DP {{ dpmodel_icon }}
 :::
 
-## DPA-1: Pretraining of Attention-based Deep Potential Model for Molecular Simulation
-
 ![ALT](../images/model_se_atten.png "model_se_atten")
 
 Here we propose DPA-1, a Deep Potential model with a novel attention mechanism, which is highly effective for representing the conformation and chemical spaces of atomic systems and learning the PES.
@@ -68,11 +66,9 @@ Then layer normalization is added in a residual way to finally obtain the self-a
 
 [^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/).
 
-## Introduction to new features of DPA-1
-
-Next, we will list the detailed settings in input.json and the data format, especially for large systems with dozens of elements. An example of DPA-1 input can be found [here](../../examples/water/se_atten/input.json).
+## Descriptor `"se_atten"`
 
-### Descriptor `"se_atten"`
+Next, we will list the detailed settings in input.json and the data format, especially for large systems with dozens of elements. An example of DPA-1 input can be found in `examples/water/se_atten/input.json`.
 
 The notation of `se_atten` is short for the smooth edition of Deep Potential with an attention mechanism.
 This descriptor was described in detail in [the DPA-1 paper](https://arxiv.org/abs/2208.08236) and the images above.
@@ -126,7 +122,7 @@ We highly recommend using the version 2.0 of the attention-based descriptor `"se
       "set_davg_zero": false
 ```
 
-You can use descriptor `"se_atten_v2"` and do not need to set `tebd_input_mode` and `smooth_type_embedding`. In `"se_atten_v2"`, `tebd_input_mode` is forced to be `"strip"` and `smooth_type_embedding` is forced to be `"true"`. When `tebd_input_mode` is `"strip"`, the embedding matrix $\mathcal{G}^i$ is constructed as:
+You can use descriptor `"se_atten_v2"`, in which case you are not allowed to set `tebd_input_mode` and `smooth_type_embedding`. In `"se_atten_v2"`, `tebd_input_mode` is forced to be `"strip"` and `smooth_type_embedding` is forced to be `"true"`. When `tebd_input_mode` is `"strip"`, the embedding matrix $\mathcal{G}^i$ is constructed as:
 
 ```math
    (\mathcal{G}^i)_j = \mathcal{N}_{e,2}(s(r_{ij})) + \mathcal{N}_{e,2}(s(r_{ij})) \odot ({N}_{e,2}(\{\mathcal{A}^i, \mathcal{A}^j\}) \odot s(r_{ij})) \quad \mathrm{or}
@@ -140,25 +136,28 @@ Practical evidence demonstrates that `"se_atten_v2"` offers better and more stab
 
 Notice: Model compression for the `se_atten_v2` descriptor is exclusively designed for models with the training parameter {ref}`attn_layer <model[standard]/descriptor[se_atten_v2]/attn_layer>` set to 0.
 
-### Fitting `"ener"`
+## Type embedding
 
-DPA-1 only supports `"ener"` fitting type, and you can refer [here](train-energy.md) for detailed information.
+DPA-1 only supports models with type embeddings.
 
-### Type embedding
-
-DPA-1 only supports models with type embeddings. And the default setting is as follows:
+In the TensorFlow backend, the {ref}`type_embedding <model/type_embedding>` section will be used. If it is not set, the following default parameters will be used:
 
 ```json
 "type_embedding":{
-            "neuron":           [8],
-            "resnet_dt":        false,
-            "seed":             1
-        }
+    "neuron":           [8],
+    "resnet_dt":        false,
+    "seed":             1
+}
 ```
 
-You can add these settings in input.json if you want to change the default ones, see [here](train-se-e2-a-tebd.md) for detailed information.
+In other backends, type embedding is within this descriptor with the {ref}`tebd_dim <model[standard]/descriptor[se_atten_v2]/tebd_dim>` argument.
+
+## Difference between TensorFlow and other backends
 
-### Type map
+TensorFlow and other backends have different implementations for {ref}`smooth_type_embedding <model[standard]/descriptor[se_atten_v2]/smooth_type_embedding>`.
+The results are inconsistent when `smooth_type_embedding` is `true`.
+
+## Type map
 
 For training large systems, especially those with dozens of elements, the {ref}`type <model/type_map>` determines the element index of training data:
 
@@ -176,6 +175,10 @@ which should include all the elements in the dataset you want to train on.
 
 DPA-1 supports both the [standard data format](../data/system.md) and the [mixed type data format](../data/system.md#mixed-type).
 
+## Model compression
+
+Model compression is supported only when there is no attention layer (`attn_layer` is 0) and `tebd_input_mode` is `strip`.
+
 ## Training example
 
 Here we upload the AlMgCu example shown in the paper, you can download it here:
diff --git a/doc/model/train-se-e2-a-tebd.md b/doc/model/train-se-e2-a-tebd.md
index a6291bb238..00726c0d3e 100644
--- a/doc/model/train-se-e2-a-tebd.md
+++ b/doc/model/train-se-e2-a-tebd.md
@@ -1,7 +1,7 @@
-# Type embedding approach {{ tensorflow_icon }}
+# Type embedding approach {{ tensorflow_icon }} {{ pytorch_icon }} {{ jax_icon }} {{ dpmodel_icon }}
 
 :::{note}
-**Supported backends**: TensorFlow {{ tensorflow_icon }}
+**Supported backends**: TensorFlow {{ tensorflow_icon }}, PyTorch {{ pytorch_icon }}, JAX {{ jax_icon }}, DP {{ dpmodel_icon }}
 :::
 
 We generate specific a type embedding vector for each atom type so that we can share one descriptor embedding net and one fitting net in total, which decline training complexity largely.
@@ -63,8 +63,9 @@ In this way, all chemical species share the same network parameters through the
 
 [^1]: This section is built upon Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A. Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang, [J. Chem. Phys. 159, 054801 (2023)](https://doi.org/10.1063/5.0155600) licensed under a [Creative Commons Attribution (CC BY) license](http://creativecommons.org/licenses/by/4.0/).
 
-## Instructions
+## Instructions for TensorFlow backend {{ tensorflow_icon }}
 
+In the TensorFlow backend, the type embedding is at the model level.
 The {ref}`model <model>` defines how the model is constructed, adding a section of type embedding net:
 
 ```json
@@ -106,6 +107,10 @@ $deepmd_source_dir/examples/water/se_e2_a_tebd/input.json
 
 See [here](../development/type-embedding.md) for further explanation of `type embedding`.
 
-:::{note}
-You can't apply the compression method while using the atom type embedding.
-:::
+See documentation for each descriptor for details.
+
+## Instructions for other backends
+
+In other backends, the type embedding is within the descriptor itself.
+
+See documentation for each descriptor for details.
diff --git a/doc/model/train-se-e2-a.md b/doc/model/train-se-e2-a.md
index d4a4510a31..2cc537e349 100644
--- a/doc/model/train-se-e2-a.md
+++ b/doc/model/train-se-e2-a.md
@@ -94,3 +94,13 @@ The construction of the descriptor is given by section {ref}`descriptor <model[s
 - The {ref}`axis_neuron <model[standard]/descriptor[se_e2_a]/axis_neuron>` specifies the size of the submatrix of the embedding matrix, the axis matrix as explained in the [DeepPot-SE paper](https://arxiv.org/abs/1805.09003)
 - If the option {ref}`resnet_dt <model[standard]/descriptor[se_e2_a]/resnet_dt>` is set to `true`, then a timestep is used in the ResNet.
 - {ref}`seed <model[standard]/descriptor[se_e2_a]/seed>` gives the random seed that is used to generate random numbers when initializing the model parameters.
+
+## Type embedding support
+
+Type embedding is only supported in the TensorFlow backend.
+`se_e2_a` with type embedding and [`se_atten`](./train-se-atten.md) (or its updated version) without any attention layer are mathematically equivalent, so `se_atten` can be a substitute in other backends.
+
+## Model compression
+
+Model compression is supported when type embedding is not used.
+To use model compression with type embedding in the TensorFlow backend, use `se_a_tebd_v2` instead.
diff --git a/doc/model/train-se-e2-r.md b/doc/model/train-se-e2-r.md
index baff6d6331..dbaf8bf364 100644
--- a/doc/model/train-se-e2-r.md
+++ b/doc/model/train-se-e2-r.md
@@ -69,3 +69,11 @@ The training input script is very similar to that of [`se_e2_a`](train-se-e2-a.m
 ```
 
 The type of the descriptor is set by the key {ref}`type <model[standard]/descriptor/type>`.
+
+## Type embedding support
+
+Type embedding is only supported in the TensorFlow backend.
+
+## Model compression
+
+Model compression is supported when type embedding is not used.
diff --git a/doc/model/train-se-e3-tebd.md b/doc/model/train-se-e3-tebd.md
index 49d0d80f42..9f9bb70161 100644
--- a/doc/model/train-se-e3-tebd.md
+++ b/doc/model/train-se-e3-tebd.md
@@ -76,3 +76,11 @@ The training input script is very similar to that of [`se_e2_a`](train-se-e2-a.m
 ```
 
 The type of the descriptor is set by the key {ref}`type <model[standard]/descriptor/type>`.
+
+## Type embedding
+
+Type embedding is within this descriptor with the {ref}`tebd_dim <model[standard]/descriptor[se_e3_tebd]/tebd_dim>` argument.
+
+## Model compression
+
+Model compression is not supported.
diff --git a/doc/model/train-se-e3.md b/doc/model/train-se-e3.md
index 714d75259a..380973e5d9 100644
--- a/doc/model/train-se-e3.md
+++ b/doc/model/train-se-e3.md
@@ -64,3 +64,11 @@ The training input script is very similar to that of [`se_e2_a`](train-se-e2-a.m
 ```
 
 The type of the descriptor is set by the key {ref}`type <model[standard]/descriptor/type>`.
+
+## Type embedding
+
+Use [`se_e3_tebd`](./train-se-e3-tebd.md) for type embedding support.
+
+## Model compression
+
+Model compression is supported.
diff --git a/examples/water/dpa2/input_torch_large.json b/examples/water/dpa2/input_torch_large.json
index 568cbc1a94..8ca443d9f5 100644
--- a/examples/water/dpa2/input_torch_large.json
+++ b/examples/water/dpa2/input_torch_large.json
@@ -19,7 +19,7 @@
         ],
         "axis_neuron": 12,
         "activation_function": "tanh",
-        "three_body_sel": 40,
+        "three_body_sel": 47,
         "three_body_rcut": 4.0,
         "three_body_rcut_smth": 3.5,
         "use_three_body": true
@@ -27,7 +27,7 @@
       "repformer": {
         "rcut": 4.0,
         "rcut_smth": 3.5,
-        "nsel": 40,
+        "nsel": 47,
         "nlayers": 12,
         "g1_dim": 128,
         "g2_dim": 32,
diff --git a/examples/water/dpa2/input_torch_medium.json b/examples/water/dpa2/input_torch_medium.json
index 5b739e6f27..9d249788a4 100644
--- a/examples/water/dpa2/input_torch_medium.json
+++ b/examples/water/dpa2/input_torch_medium.json
@@ -19,7 +19,7 @@
         ],
         "axis_neuron": 12,
         "activation_function": "tanh",
-        "three_body_sel": 40,
+        "three_body_sel": 47,
         "three_body_rcut": 4.0,
         "three_body_rcut_smth": 3.5,
         "use_three_body": true
@@ -27,7 +27,7 @@
       "repformer": {
         "rcut": 4.0,
         "rcut_smth": 3.5,
-        "nsel": 40,
+        "nsel": 47,
         "nlayers": 6,
         "g1_dim": 128,
         "g2_dim": 32,
diff --git a/examples/water/dpa2/input_torch_small.json b/examples/water/dpa2/input_torch_small.json
index 98147030b6..e7b644ba30 100644
--- a/examples/water/dpa2/input_torch_small.json
+++ b/examples/water/dpa2/input_torch_small.json
@@ -19,7 +19,7 @@
         ],
         "axis_neuron": 12,
         "activation_function": "tanh",
-        "three_body_sel": 40,
+        "three_body_sel": 47,
         "three_body_rcut": 4.0,
         "three_body_rcut_smth": 3.5,
         "use_three_body": true
@@ -27,7 +27,7 @@
       "repformer": {
         "rcut": 4.0,
         "rcut_smth": 3.5,
-        "nsel": 40,
+        "nsel": 47,
         "nlayers": 3,
         "g1_dim": 128,
         "g2_dim": 32,