diff --git a/deepmd/descriptor/se_a.py b/deepmd/descriptor/se_a.py index c29b667143..d22db2d41b 100644 --- a/deepmd/descriptor/se_a.py +++ b/deepmd/descriptor/se_a.py @@ -147,6 +147,7 @@ def __init__( ) self.sel_a = sel self.rcut_r = rcut + self.register_buffer("buffer_rcut", paddle.to_tensor(rcut, dtype="float64")) self.rcut_r_smth = rcut_smth self.filter_neuron = neuron self.n_axis_neuron = axis_neuron @@ -175,10 +176,12 @@ def __init__( self.sel_a.extend(self.sel_a_spin) else: self.ntypes_spin = 0 + self.register_buffer("buffer_ntypes_spin", paddle.to_tensor(self.ntypes_spin)) # descrpt config self.sel_r = [0 for ii in range(len(self.sel_a))] self.ntypes = len(self.sel_a) + self.register_buffer("buffer_ntypes", paddle.to_tensor(self.ntypes)) assert self.ntypes == len(self.sel_r) self.rcut_a = -1 # numb of neighbors and numb of descrptors diff --git a/deepmd/entrypoints/freeze.py b/deepmd/entrypoints/freeze.py index 121b6c77a6..3582c3eb96 100755 --- a/deepmd/entrypoints/freeze.py +++ b/deepmd/entrypoints/freeze.py @@ -424,7 +424,20 @@ def freeze_graph( False, ], ) - paddle.jit.save(st_model, output) + print(f"st_model.descrpt.buffer_rcut.name = {st_model.descrpt.buffer_rcut.name}") + print( + f"st_model.descrpt.buffer_ntypes.name = {st_model.descrpt.buffer_ntypes.name}" + ) + print( + f"st_model.fitting.buffer_dfparam.name = {st_model.fitting.buffer_dfparam.name}" + ) + print( + f"st_model.fitting.buffer_daparam.name = {st_model.fitting.buffer_daparam.name}" + ) + # Skip pruning of the program so that parameters not used in the forward pass, such as rcut and ntypes, are preserved and can be retrieved on the C++ side + skip_prune_program = True + print(f"==>> skip_prune_program = {skip_prune_program}") + paddle.jit.save(st_model, output, skip_prune_program=skip_prune_program) print(f"Saved to path: {output}") diff --git a/deepmd/fit/ener.py b/deepmd/fit/ener.py index 036bdc54f9..a13ddec13d 100644 --- a/deepmd/fit/ener.py +++ b/deepmd/fit/ener.py @@ -140,7 +140,9 @@ def __init__( # .add("precision", str, default = "default")\ # .add("trainable", [list, bool], default = True) self.numb_fparam = numb_fparam + self.register_buffer("buffer_dfparam", paddle.to_tensor(self.numb_fparam)) self.numb_aparam = numb_aparam + self.register_buffer("buffer_daparam", paddle.to_tensor(self.numb_aparam)) self.n_neuron = neuron self.resnet_dt = resnet_dt self.rcond = rcond diff --git a/deepmd/infer/deep_eval.py b/deepmd/infer/deep_eval.py index 6d41b91506..7b0b0d5536 100644 --- a/deepmd/infer/deep_eval.py +++ b/deepmd/infer/deep_eval.py @@ -94,17 +94,23 @@ def __init__( for k, v in load_state_dict.items(): if k in self.model.state_dict(): if load_state_dict[k].dtype != self.model.state_dict()[k].dtype: - # print(f"convert dtype from {load_state_dict[k].dtype} to {self.model.state_dict()[k].dtype}") + print( + f"convert {k}'s dtype from {load_state_dict[k].dtype} to {self.model.state_dict()[k].dtype}" + ) load_state_dict[k] = load_state_dict[k].astype( self.model.state_dict()[k].dtype ) if list(load_state_dict[k].shape) != list( self.model.state_dict()[k].shape ): - # print(f"convert shape from {load_state_dict[k].shape} to {self.model.state_dict()[k].shape}") + print( + f"convert {k}'s shape from {load_state_dict[k].shape} to {self.model.state_dict()[k].shape}" + ) load_state_dict[k] = load_state_dict[k].reshape( self.model.state_dict()[k].shape ) + # print(f"==>> Load pretrained model successfully from: {str(model_file)}") + # exit() self.model.set_state_dict(load_state_dict) self.load_prefix = load_prefix diff --git a/deepmd/infer/deep_pot.py b/deepmd/infer/deep_pot.py index 909a9d23ac..b0ade1fc1a 
100644 --- a/deepmd/infer/deep_pot.py +++ b/deepmd/infer/deep_pot.py @@ -636,6 +636,17 @@ def _eval_inner( eval_inputs["box"], # [45] paddle.float64 eval_inputs["default_mesh"], # [6] paddle.int32 ) + # print(eval_inputs["coord"].shape) + # print(eval_inputs["type"].shape) + # print(eval_inputs["natoms_vec"].shape) + # print(eval_inputs["box"].shape) + # print(eval_inputs["default_mesh"].shape) + # np.save("/workspace/hesensen/deepmd_backend/python_infer_data/coord.npy", eval_inputs["coord"].numpy()) + # np.save("/workspace/hesensen/deepmd_backend/python_infer_data/type.npy", eval_inputs["type"].numpy()) + # np.save("/workspace/hesensen/deepmd_backend/python_infer_data/natoms_vec.npy", eval_inputs["natoms_vec"].numpy()) + # np.save("/workspace/hesensen/deepmd_backend/python_infer_data/box.npy", eval_inputs["box"].numpy()) + # np.save("/workspace/hesensen/deepmd_backend/python_infer_data/default_mesh.npy", eval_inputs["default_mesh"].numpy()) + # exit() eval_outputs = { "atom_ener": eval_outputs[0], "atom_virial": eval_outputs[1], @@ -656,6 +667,12 @@ def _eval_inner( # "xx1": eval_outputs[9], # "hidden1": eval_outputs[10], } + + # for k, v in eval_outputs.items(): + # print(k, v.shape) + # np.save(f"/workspace/hesensen/deepmd_backend/python_infer_data/st_model_{k}.npy", v.numpy()) + # print(f"finished save {k}") + # exit() else: eval_outputs = self.model( eval_inputs["coord"], # [2880] paddle.float64 diff --git a/examples/water/lmp/Model_1000000_with_buffer.pdiparams b/examples/water/lmp/Model_1000000_with_buffer.pdiparams new file mode 100644 index 0000000000..3b4a25e6ba Binary files /dev/null and b/examples/water/lmp/Model_1000000_with_buffer.pdiparams differ diff --git a/examples/water/lmp/Model_1000000_with_buffer.pdmodel b/examples/water/lmp/Model_1000000_with_buffer.pdmodel new file mode 100644 index 0000000000..fbc0689402 Binary files /dev/null and b/examples/water/lmp/Model_1000000_with_buffer.pdmodel differ diff --git a/examples/water/lmp/in.lammps b/examples/water/lmp/in.lammps index ea3b5d52cd..5883016634 100644 --- a/examples/water/lmp/in.lammps +++ b/examples/water/lmp/in.lammps @@ -12,7 +12,7 @@ mass 1 16 mass 2 2 # See https://deepmd.rtfd.io/lammps/ for usage -pair_style deepmd frozen_model.pb +pair_style deepmd Model_1000000_with_buffer # If atom names (O H in this example) are not set in the pair_coeff command, the type_map defined by the training parameter will be used by default. 
pair_coeff * * O H diff --git a/examples/water/lmp/model.pb b/examples/water/lmp/model.pb new file mode 100644 index 0000000000..fa246dffba Binary files /dev/null and b/examples/water/lmp/model.pb differ diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index ccf9641795..7a617d4c02 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -2,6 +2,98 @@ cmake_minimum_required(VERSION 3.16) project(DeePMD) +macro(safe_set_static_flag) + foreach(flag_var + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) + if(${flag_var} MATCHES "/MD") + string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") + endif(${flag_var} MATCHES "/MD") + endforeach(flag_var) +endmacro() + +if(NOT DEFINED PADDLE_LIB) + message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib") +endif() +set(PADDLE_LIB ${PADDLE_LIB} CACHE PATH "/path/paddle/lib") + +include_directories("${PADDLE_LIB}/") +set(PADDLE_LIB_THIRD_PARTY_PATH "${PADDLE_LIB}/third_party/install/") + +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/include") +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/include") +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/include") +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/include") + +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib") +link_directories("${PADDLE_LIB}/paddle/lib") + +# add custom operators +option(USE_TENSORRT "Compile demo with TensorRT." OFF) + +if(WITH_GPU) + if(NOT WIN32) + set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library") + else() + if(CUDA_LIB STREQUAL "") + set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\lib\\x64") + endif() + endif(NOT WIN32) +endif() + +if (NOT WIN32) + if (USE_TENSORRT AND WITH_GPU) + include_directories("${TENSORRT_INCLUDE_DIR}") + link_directories("${TENSORRT_LIB_DIR}") + endif() +endif(NOT WIN32) + + +if(WITH_STATIC_LIB) + set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX}) +else() + if(WIN32) + set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX}) + else() + set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX}) + endif() +endif() + +if (NOT WIN32) + set(EXTERNAL_LIB "-lrt -ldl -lpthread") + set(DEPS ${DEPS} + ${MATH_LIB} ${MKLDNN_LIB} + glog gflags protobuf xxhash + ${EXTERNAL_LIB}) +else() + set(DEPS ${DEPS} + ${MATH_LIB} ${MKLDNN_LIB} + glog gflags_static libprotobuf xxhash ${EXTERNAL_LIB}) + set(DEPS ${DEPS} shlwapi.lib) +endif(NOT WIN32) + +if(WITH_GPU) + if(NOT WIN32) + if (USE_TENSORRT) + set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX}) + endif() + set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX}) + else() + if(USE_TENSORRT) + set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX}) + endif() + set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX} ) + set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX} ) + set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX} ) + endif() +endif() + + 
option(BUILD_TESTING "Build test and enable converage" OFF) set(DEEPMD_C_ROOT "" @@ -175,6 +267,7 @@ if(BUILD_CPP_IF) set(LIB_DEEPMD_CC "deepmd_cc") set(LIB_DEEPMD_C "deepmd_c") if(USE_CUDA_TOOLKIT) + set(LIB_DEEPMD_OP_DEVICE "deepmd_paddle_op_cuda") set(LIB_DEEPMD_OP_DEVICE "deepmd_op_cuda") elseif(USE_ROCM_TOOLKIT) set(LIB_DEEPMD_OP_DEVICE "deepmd_op_rocm") @@ -260,6 +353,33 @@ if(BUILD_CPP_IF) endif() endif(BUILD_CPP_IF) +# if(WIN32) +# if(USE_TENSORRT) +# add_custom_command(TARGET ${DEMO_NAME} POST_BUILD +# COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_SHARED_LIBRARY_SUFFIX} +# ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} +# COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX} +# ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} +# ) +# endif() +# if(WITH_MKL) +# add_custom_command(TARGET ${DEMO_NAME} POST_BUILD +# COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/mklml.dll ${CMAKE_BINARY_DIR}/Release +# COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/libiomp5md.dll ${CMAKE_BINARY_DIR}/Release +# COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_PATH}/lib/mkldnn.dll ${CMAKE_BINARY_DIR}/Release +# ) +# else() +# add_custom_command(TARGET ${DEMO_NAME} POST_BUILD +# COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_LIB_PATH}/lib/openblas.dll ${CMAKE_BINARY_DIR}/Release +# ) +# endif() +# if(NOT WITH_STATIC_LIB) +# add_custom_command(TARGET ${DEMO_NAME} POST_BUILD +# COMMAND ${CMAKE_COMMAND} -E copy "${PADDLE_LIB}/paddle/lib/paddle_fluid.dll" ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} +# ) +# endif() +# endif() + # uninstall target configure_file( "${CMAKE_CURRENT_SOURCE_DIR}/cmake/cmake_uninstall.cmake.in" diff --git a/source/api_c/src/c_api.cc b/source/api_c/src/c_api.cc index 613af8c276..8f890bc5b3 100644 --- a/source/api_c/src/c_api.cc +++ b/source/api_c/src/c_api.cc @@ -54,11 +54,25 @@ DP_DeepPot* DP_NewDeepPotWithParam2(const char* c_model, const char* c_file_content, const int size_file_content) { std::string model(c_model); + printf("==>> [DP_NewDeepPotWithParam2]\n"); std::string file_content(c_file_content, c_file_content + size_file_content); DP_NEW_OK(DP_DeepPot, deepmd::DeepPot dp(model, gpu_rank, file_content); DP_DeepPot* new_dp = new DP_DeepPot(dp); return new_dp;) } +// DP_DeepPot* DP_NewDeepPotWithParam3(const char* c_pdmodel_path, +// const char* c_pdiparams_path, +// const int gpu_rank, +// const char* c_file_content, +// const int size_file_content) { +// std::string pdmodel(c_pdmodel_path); +// std::string pdiparams(c_pdiparams_path); +// printf("==>> [DP_NewDeepPotWithParam3 Paddle ver]\n"); +// std::string file_content(c_file_content, c_file_content + size_file_content); +// DP_NEW_OK(DP_DeepPot, deepmd::DeepPot dp(pdmodel, pdiparams, gpu_rank, file_content); +// DP_DeepPot* new_dp = new DP_DeepPot(dp); return new_dp;) +// } + DP_DeepPotModelDevi::DP_DeepPotModelDevi() {} DP_DeepPotModelDevi::DP_DeepPotModelDevi(deepmd::DeepPotModelDevi& dp) : dp(dp) { diff --git a/source/api_cc/include/DeepPot.h b/source/api_cc/include/DeepPot.h index 1c440e2668..a5bc0d8aa8 100644 --- a/source/api_cc/include/DeepPot.h +++ b/source/api_cc/include/DeepPot.h @@ -291,12 +291,18 @@ class DeepPot { void get_type_map(std::string& type_map); private: + std::shared_ptr predictor = nullptr; + paddle_infer::Config config; + int math_lib_num_threads; tensorflow::Session* session; int num_intra_nthreads, num_inter_nthreads; tensorflow::GraphDef* graph_def; bool inited; template VT get_scalar(const std::string& name) const; + template + VT 
paddle_get_scalar(const std::string& name) const; + // VALUETYPE get_rcut () const; // int get_ntypes () const; double rcut; diff --git a/source/api_cc/include/common.h b/source/api_cc/include/common.h index bed8e97e82..04621e05af 100644 --- a/source/api_cc/include/common.h +++ b/source/api_cc/include/common.h @@ -14,6 +14,7 @@ #else #include "tf_public.h" #endif +#include "paddle/include/paddle_inference_api.h" namespace deepmd { @@ -191,6 +192,16 @@ VT session_get_scalar(tensorflow::Session* session, const std::string name, const std::string scope = ""); +/** + * @brief Get the value of a tensor. + * @param[in] predictor Paddle inference predictor. + * @param[in] name The name of the tensor. + * @return The value of the tensor. + **/ +template +VT predictor_get_scalar(const std::shared_ptr& predictor, + const std::string name_); + /** * @brief Get the vector of a tensor. * @param[out] o_vec The output vector. @@ -215,6 +226,16 @@ int session_get_dtype(tensorflow::Session* session, const std::string name, const std::string scope = ""); +/** + * @brief Get the type of a tensor. + * @param[in] predictor Paddle inference predictor. + * @param[in] name The name of the tensor. + * @return The type of the tensor. + **/ +paddle_infer::DataType predictor_get_dtype( + const std::shared_ptr& predictor, + const std::string& name_); + /** * @brief Get input tensors. * @param[out] input_tensors Input tensors. @@ -270,6 +291,35 @@ int session_input_tensors( const int ago, const std::string scope = ""); +/** + * @brief Send input data into paddle tensor handles. + * @param[in] predictor The paddle predictor pointer. + * @param[in] dcoord_ Coordinates of atoms. + * @param[in] ntypes Number of atom types. + * @param[in] datype_ Atom types. + * @param[in] dlist Neighbor list. + * @param[in] fparam_ Frame parameters. + * @param[in] aparam_ Atom parameters. + * @param[in] atommap Atom map. + * @param[in] nghost Number of ghost atoms. + * @param[in] ago Update the internal neighbour list if ago is 0. + * @param[in] scope The scope of the tensors. + */ +template +int predictor_input_tensors( + const std::shared_ptr& predictor, + const std::vector& dcoord_, + const int& ntypes, + const std::vector& datype_, + const std::vector& dbox, + InputNlist& dlist, + const std::vector& fparam_, + const std::vector& aparam_, + const deepmd::AtomMap& atommap, + const int nghost, + const int ago, + const std::string scope = ""); + /** * @brief Get input tensors for mixed type. * @param[out] input_tensors Input tensors. 
diff --git a/source/api_cc/src/DeepPot.cc b/source/api_cc/src/DeepPot.cc index d8f0d8a8fe..2ee6217c84 100644 --- a/source/api_cc/src/DeepPot.cc +++ b/source/api_cc/src/DeepPot.cc @@ -4,6 +4,8 @@ #include "AtomMap.h" #include "device.h" +#include "paddle/include/paddle_inference_api.h" +// #include "glog/logging.h" using namespace tensorflow; using namespace deepmd; @@ -115,6 +117,7 @@ template void run_model( const int nframes, const int nghost); +/* This function receives the forwarded arguments and performs the actual computation */ template static void run_model( std::vector& dener, @@ -215,6 +218,166 @@ static void run_model( nframes, nall); } +// start of paddle_run_model +template +static void paddle_run_model( + std::vector& dener, + std::vector& dforce_, + std::vector& dvirial, + std::vector& datom_energy_, + std::vector& datom_virial_, + const std::shared_ptr& predictor, + const deepmd::AtomMap& atommap, + const int& nframes, + const int& nghost = 0) { + unsigned nloc = atommap.get_type().size(); + unsigned nall = nloc + nghost; + dener.resize(nframes); + if (nloc == 0) { + // no backward map needed + // dforce of size nall * 3 + dforce_.resize(nframes * nall * 3); + fill(dforce_.begin(), dforce_.end(), (VALUETYPE)0.0); + // dvirial of size 9 + dvirial.resize(nframes * 9); + fill(dvirial.begin(), dvirial.end(), (VALUETYPE)0.0); + // datom_energy_ of size nall + datom_energy_.resize(nframes * nall); + fill(datom_energy_.begin(), datom_energy_.end(), (VALUETYPE)0.0); + // datom_virial_ of size nall * 9 + datom_virial_.resize(nframes * nall * 9); + fill(datom_virial_.begin(), datom_virial_.end(), (VALUETYPE)0.0); + return; + } + + /* Running inference */ + if (!predictor->Run()) { + throw deepmd::deepmd_exception("Paddle inference failed"); + } + + /* Get output handles*/ + auto output_names = predictor->GetOutputNames(); + auto output_atom_ener_tensor = predictor->GetOutputHandle(output_names[0]); + auto output_atom_virial_tensor = predictor->GetOutputHandle(output_names[1]); + auto output_atype_tensor = predictor->GetOutputHandle(output_names[2]); + auto output_coord_tensor = predictor->GetOutputHandle(output_names[3]); + auto output_energy_tensor = predictor->GetOutputHandle(output_names[4]); + auto output_force_tensor = predictor->GetOutputHandle(output_names[5]); + auto output_virial_tensor = predictor->GetOutputHandle(output_names[6]); + + // get the shape information of the output tensors + std::vector output_atom_ener_shape = output_atom_ener_tensor->shape(); + int output_atom_ener_size = + std::accumulate(output_atom_ener_shape.begin(), + output_atom_ener_shape.end(), 1, std::multiplies()); + std::vector output_atom_virial_shape = + output_atom_virial_tensor->shape(); + int output_atom_virial_size = std::accumulate( + output_atom_virial_shape.begin(), output_atom_virial_shape.end(), 1, + std::multiplies()); + std::vector output_atype_shape = output_atype_tensor->shape(); + int output_atype_size = + std::accumulate(output_atype_shape.begin(), output_atype_shape.end(), 1, + std::multiplies()); + std::vector output_coord_shape = output_coord_tensor->shape(); + int output_coord_size = + std::accumulate(output_coord_shape.begin(), output_coord_shape.end(), 1, + std::multiplies()); + std::vector output_energy_shape = output_energy_tensor->shape(); + int output_energy_size = + std::accumulate(output_energy_shape.begin(), output_energy_shape.end(), 1, + std::multiplies()); + std::vector output_force_shape = output_force_tensor->shape(); + int output_force_size = + std::accumulate(output_force_shape.begin(), output_force_shape.end(), 1, + std::multiplies()); + std::vector 
output_virial_shape = output_virial_tensor->shape(); + int output_virial_size = + std::accumulate(output_virial_shape.begin(), output_virial_shape.end(), 1, + std::multiplies()); + + // get data of output_atom_ener + std::vector output_atom_ener_data; + output_atom_ener_data.resize(output_atom_ener_size); + output_atom_ener_tensor->CopyToCpu(output_atom_ener_data.data()); + // get data of output_atom_virial + std::vector output_atom_virial_data; + output_atom_virial_data.resize(output_atom_virial_size); + output_atom_virial_tensor->CopyToCpu(output_atom_virial_data.data()); + // get data of output_atype + // std::vector output_atype_data; + // output_atype_data.resize(output_atype_size); + // output_atype_tensor->CopyToCpu(output_atype_data.data()); + // get data of output_coord + std::vector output_coord_data; + output_coord_data.resize(output_coord_size); + output_coord_tensor->CopyToCpu(output_coord_data.data()); + // get data of output_energy + std::vector output_energy_data; + output_energy_data.resize(output_energy_size); + output_energy_tensor->CopyToCpu(output_energy_data.data()); + // get data of output_force + std::vector output_force_data; + output_force_data.resize(output_force_size); + output_force_tensor->CopyToCpu(output_force_data.data()); + // get data of output_virial + // std::vector output_virial_data; + // output_virial_data.resize(output_virial_size); + // output_virial_tensor->CopyToCpu(output_virial_data.data()); + + std::vector dforce(nframes * 3 * nall); + std::vector datom_energy(nframes * nall, 0); + std::vector datom_virial(nframes * 9 * nall); + dvirial.resize(nframes * 9); + for (int ii = 0; ii < nframes; ++ii) { + dener[ii] = output_energy_data[ii]; + } + for (int ii = 0; ii < nframes * nall * 3; ++ii) { + dforce[ii] = output_force_data[ii]; + } + for (int ii = 0; ii < nframes; ++ii) { + for (int jj = 0; jj < nloc; ++jj) { + datom_energy[ii * nall + jj] = output_atom_ener_data[ii * nloc + jj]; + } + } + for (int ii = 0; ii < nframes * nall * 9; ++ii) { + datom_virial[ii] = output_atom_virial_data[ii]; + } + // set dvirial to zero, prevent input vector is not zero (#1123) + std::fill(dvirial.begin(), dvirial.end(), (VALUETYPE)0.); + for (int kk = 0; kk < nframes; ++kk) { + for (int ii = 0; ii < nall; ++ii) { + dvirial[kk * 9 + 0] += + (VALUETYPE)1.0 * datom_virial[kk * nall * 9 + 9 * ii + 0]; + dvirial[kk * 9 + 1] += + (VALUETYPE)1.0 * datom_virial[kk * nall * 9 + 9 * ii + 1]; + dvirial[kk * 9 + 2] += + (VALUETYPE)1.0 * datom_virial[kk * nall * 9 + 9 * ii + 2]; + dvirial[kk * 9 + 3] += + (VALUETYPE)1.0 * datom_virial[kk * nall * 9 + 9 * ii + 3]; + dvirial[kk * 9 + 4] += + (VALUETYPE)1.0 * datom_virial[kk * nall * 9 + 9 * ii + 4]; + dvirial[kk * 9 + 5] += + (VALUETYPE)1.0 * datom_virial[kk * nall * 9 + 9 * ii + 5]; + dvirial[kk * 9 + 6] += + (VALUETYPE)1.0 * datom_virial[kk * nall * 9 + 9 * ii + 6]; + dvirial[kk * 9 + 7] += + (VALUETYPE)1.0 * datom_virial[kk * nall * 9 + 9 * ii + 7]; + dvirial[kk * 9 + 8] += + (VALUETYPE)1.0 * datom_virial[kk * nall * 9 + 9 * ii + 8]; + } + } + dforce_ = dforce; + datom_energy_ = datom_energy; + datom_virial_ = datom_virial; + atommap.backward(dforce_.begin(), dforce.begin(), 3, nframes, + nall); + atommap.backward(datom_energy_.begin(), datom_energy.begin(), 1, + nframes, nall); + atommap.backward(datom_virial_.begin(), datom_virial.begin(), 9, + nframes, nall); +} + template void run_model( std::vector& dener, std::vector& dforce_, @@ -263,6 +426,51 @@ template void run_model( const int& nframes, const int& nghost); +/*start 
paddle run_model*/ +template void paddle_run_model( + std::vector& dener, + std::vector& dforce_, + std::vector& dvirial, + std::vector& datom_energy_, + std::vector& datom_virial_, + const std::shared_ptr& predictor, + const deepmd::AtomMap& atommap, + const int& nframes, + const int& nghost); + +template void paddle_run_model( + std::vector& dener, + std::vector& dforce_, + std::vector& dvirial, + std::vector& datom_energy_, + std::vector& datom_virial_, + const std::shared_ptr& predictor, + const deepmd::AtomMap& atommap, + const int& nframes, + const int& nghost); + +template void paddle_run_model( + std::vector& dener, + std::vector& dforce_, + std::vector& dvirial, + std::vector& datom_energy_, + std::vector& datom_virial_, + const std::shared_ptr& predictor, + const deepmd::AtomMap& atommap, + const int& nframes, + const int& nghost); + +template void paddle_run_model( + std::vector& dener, + std::vector& dforce_, + std::vector& dvirial, + std::vector& datom_energy_, + std::vector& datom_virial_, + const std::shared_ptr& predictor, + const deepmd::AtomMap& atommap, + const int& nframes, + const int& nghost); + // end multiple frames // start single frame @@ -325,6 +533,7 @@ template void run_model( const int nframes, const int nghost); +/*Forwarding function of tensorflow*/ template static void run_model( ENERGYTYPE& dener, @@ -346,6 +555,27 @@ static void run_model( dener = dener_[0]; } +/*Forwarding function of paddle*/ +template +static void paddle_run_model( + ENERGYTYPE& dener, + std::vector& dforce_, + std::vector& dvirial, + std::vector& datom_energy_, + std::vector& datom_virial_, + const std::shared_ptr& predictor, + const deepmd::AtomMap& atommap, + const int& nframes = 1, + const int& nghost = 0) { + assert(nframes == 1); + std::vector dener_(1); + // call multi-frame version + paddle_run_model(dener_, dforce_, dvirial, + datom_energy_, datom_virial_, + predictor, atommap, nframes, nghost); + dener = dener_[0]; +} + template void run_model( ENERGYTYPE& dener, std::vector& dforce_, @@ -394,6 +624,51 @@ template void run_model( const int& nframes, const int& nghost); +/*start paddle */ +template void paddle_run_model( + ENERGYTYPE& dener, + std::vector& dforce_, + std::vector& dvirial, + std::vector& datom_energy_, + std::vector& datom_virial_, + const std::shared_ptr& predictor, + const deepmd::AtomMap& atommap, + const int& nframes, + const int& nghost); + +template void paddle_run_model( + ENERGYTYPE& dener, + std::vector& dforce_, + std::vector& dvirial, + std::vector& datom_energy_, + std::vector& datom_virial_, + const std::shared_ptr& predictor, + const deepmd::AtomMap& atommap, + const int& nframes, + const int& nghost); + +template void paddle_run_model( + ENERGYTYPE& dener, + std::vector& dforce_, + std::vector& dvirial, + std::vector& datom_energy_, + std::vector& datom_virial_, + const std::shared_ptr& predictor, + const deepmd::AtomMap& atommap, + const int& nframes, + const int& nghost); + +template void paddle_run_model( + ENERGYTYPE& dener, + std::vector& dforce_, + std::vector& dvirial, + std::vector& datom_energy_, + std::vector& datom_virial_, + const std::shared_ptr& predictor, + const deepmd::AtomMap& atommap, + const int& nframes, + const int& nghost); + // end single frame DeepPot::DeepPot() @@ -417,6 +692,28 @@ void DeepPot::init(const std::string& model, << std::endl; return; } + std::string pdmodel_path = ""; + std::string pdiparams_path = ""; + bool use_paddle_inference = false; + std::string tf_model = model; + if (model.find(".pb") == 
std::string::npos) { + pdmodel_path = model + ".pdmodel"; + pdiparams_path = model + ".pdiparams"; + use_paddle_inference = true; + tf_model = "model.pb"; + } + math_lib_num_threads = 1; + + if (use_paddle_inference) { + config.SetModel(pdmodel_path, pdiparams_path); + config.SwitchIrOptim(true); + config.EnableUseGpu(8192, 0); + // std::cout << "IR Optim is: " << config.ir_optim() << std::endl; + // config.EnableMKLDNN(); + config.EnableMemoryOptim(); + // config.EnableProfile(); + predictor = paddle_infer::CreatePredictor(config); + } SessionOptions options; get_env_nthreads(num_intra_nthreads, num_inter_nthreads); options.config.set_inter_op_parallelism_threads(num_inter_nthreads); @@ -424,7 +721,7 @@ void DeepPot::init(const std::string& model, deepmd::load_op_library(); if (file_content.size() == 0) - check_status(ReadBinaryProto(Env::Default(), model, graph_def)); + check_status(ReadBinaryProto(Env::Default(), tf_model, graph_def)); else (*graph_def).ParseFromString(file_content); int gpu_num = -1; @@ -443,19 +740,47 @@ void DeepPot::init(const std::string& model, #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM check_status(NewSession(options, &session)); check_status(session->Create(*graph_def)); + if (use_paddle_inference) { + /* + tensorflow::DT_DOUBLE = 2 + tensorflow::DT_FLOAT = 1 + paddle_infer::DataType::FLOAT64 = 7 + paddle_infer::DataType::FLOAT32 = 0 + * st_model.descrpt.buffer_rcut.name = generated_tensor_0 + * st_model.descrpt.buffer_ntypes.name = generated_tensor_2 + * st_model.fitting.buffer_dfparam.name = generated_tensor_9 + * st_model.fitting.buffer_daparam.name = generated_tensor_10 + **/ + model_version = "0.0"; + dtype = predictor_get_dtype(predictor, "generated_tensor_0"); + if (dtype == paddle_infer::DataType::FLOAT64) { + rcut = paddle_get_scalar("generated_tensor_0"); + dtype = tensorflow::DT_DOUBLE; + } else { + rcut = paddle_get_scalar("generated_tensor_0"); + dtype = tensorflow::DT_FLOAT; + } + ntypes = paddle_get_scalar("generated_tensor_2"); + // ntypes_spin = paddle_get_scalar("buffer_ntypes_spin"); + ntypes_spin = 0; + dfparam = paddle_get_scalar("generated_tensor_9"); + daparam = paddle_get_scalar("generated_tensor_10"); + model_type = "ener"; + return ; + } try { model_version = get_scalar("model_attr/model_version"); } catch (deepmd::tf_exception& e) { // no model version defined in old models model_version = "0.0"; } - if (!model_compatable(model_version)) { - throw deepmd::deepmd_exception( - "incompatable model: version " + model_version + - " in graph, but version " + global_model_version + - " supported " - "See https://deepmd.rtfd.io/compatability/ for details."); - } + // if (!model_compatable(model_version)) { + // throw deepmd::deepmd_exception( + // "incompatable model: version " + model_version + + // " in graph, but version " + global_model_version + + // " supported " + // "See https://deepmd.rtfd.io/compatability/ for details."); + // } dtype = session_get_dtype(session, "descrpt_attr/rcut"); if (dtype == tensorflow::DT_DOUBLE) { rcut = get_scalar("descrpt_attr/rcut"); @@ -488,6 +813,11 @@ VT DeepPot::get_scalar(const std::string& name) const { return session_get_scalar(session, name); } +template +VT DeepPot::paddle_get_scalar(const std::string& name) const { + return predictor_get_scalar(predictor, name); +} + template void DeepPot::validate_fparam_aparam( const int& nframes, @@ -937,12 +1267,26 @@ void DeepPot::compute(ENERGYVTYPE& dener, } if (dtype == tensorflow::DT_DOUBLE) { - int ret = session_input_tensors(input_tensors, dcoord, 
ntypes, - datype, dbox, nlist, fparam, aparam, - atommap, nghost_real, ago); - assert(nloc_real == ret); - run_model(dener, dforce, dvirial, datom_energy, datom_virial, - session, input_tensors, atommap, nframes, nghost_real); + int ret = 0; + if (predictor == nullptr) { + /* run tensorflow inference if paddle predictor is nullptr*/ + int ret = session_input_tensors(input_tensors, dcoord, ntypes, + datype, dbox, nlist, fparam, + aparam, atommap, nghost_real, + ago); + assert(nloc_real == ret); + run_model(dener, dforce, dvirial, datom_energy, datom_virial, + session, input_tensors, atommap, nframes, nghost_real); + } + /* run paddle inference if paddle predictor exist*/ + else if (predictor != nullptr) { + int ret = predictor_input_tensors(predictor, dcoord, ntypes, datype, dbox, + nlist, fparam, aparam, atommap, + nghost_real, ago); + assert(nloc_real == ret); + paddle_run_model(dener, dforce, dvirial, datom_energy, datom_virial, + predictor, atommap, nframes, nghost_real); + } } else { int ret = session_input_tensors(input_tensors, dcoord, ntypes, datype, dbox, nlist, fparam, aparam, diff --git a/source/api_cc/src/common.cc b/source/api_cc/src/common.cc index 380a2910f6..38c63ac788 100644 --- a/source/api_cc/src/common.cc +++ b/source/api_cc/src/common.cc @@ -40,18 +40,24 @@ static std::vector split(const std::string& input_, bool deepmd::model_compatable(std::string& model_version) { std::vector words_mv = split(model_version, "."); std::vector words_gmv = split(global_model_version, "."); - if (words_mv.size() != 2) { - throw deepmd::deepmd_exception("invalid graph model version string " + - model_version); - } - if (words_gmv.size() != 2) { - throw deepmd::deepmd_exception("invalid supported model version string " + - global_model_version); - } - int model_version_major = atoi(words_mv[0].c_str()); - int model_version_minor = atoi(words_mv[1].c_str()); - int MODEL_VERSION_MAJOR = atoi(words_gmv[0].c_str()); - int MODEL_VERSION_MINOR = atoi(words_gmv[1].c_str()); + // if (words_mv.size() != 2) { + // throw deepmd::deepmd_exception("invalid graph model version string " + + // model_version); + // } + // if (words_gmv.size() != 2) { + // throw deepmd::deepmd_exception("invalid supported model version string " + + // global_model_version); + // } + // int model_version_major = atoi(words_mv[0].c_str()); + // int model_version_minor = atoi(words_mv[1].c_str()); + // int MODEL_VERSION_MAJOR = atoi(words_gmv[0].c_str()); + // int MODEL_VERSION_MINOR = atoi(words_gmv[1].c_str()); + int model_version_major = 1; + int model_version_minor = 1; + int MODEL_VERSION_MAJOR = 1; + int MODEL_VERSION_MINOR = 1; + printf(">>> debug\n"); + return true; if (model_version_major != MODEL_VERSION_MAJOR || model_version_minor > MODEL_VERSION_MINOR) { return false; diff --git a/source/lib/CMakeLists.txt b/source/lib/CMakeLists.txt index af88cb5ae6..beefef619e 100644 --- a/source/lib/CMakeLists.txt +++ b/source/lib/CMakeLists.txt @@ -13,6 +13,8 @@ if(USE_CUDA_TOOLKIT) add_definitions("-DGOOGLE_CUDA") add_subdirectory(src/cuda) set(EXTRA_LIBS ${EXTRA_LIBS} deepmd_op_cuda) + add_subdirectory(paddle_src/) + set(EXTRA_LIBS ${EXTRA_LIBS} deepmd_paddle_op_cuda) target_link_libraries(${libname} INTERFACE deepmd_dyn_cudart ${EXTRA_LIBS}) # gpu_cuda.h target_include_directories( diff --git a/source/lib/paddle_src/CMakeLists.txt b/source/lib/paddle_src/CMakeLists.txt new file mode 100644 index 0000000000..92f40f510c --- /dev/null +++ b/source/lib/paddle_src/CMakeLists.txt @@ -0,0 +1,248 @@ +# required cmake version 
+cmake_minimum_required(VERSION 3.16) +# project name +project(deepmd_paddle_op_cuda) + +# SET(CUDA_SEPARABLE_COMPILATION ON) +find_package(CUDA REQUIRED) +if(NOT CUDA_FOUND) + message(STATUS "CUDA not found. Project will not be built.") +endif(NOT CUDA_FOUND) + +# take dynamic open cudart library replace of static one so it's not required +# when using CPUs +# add_subdirectory(cudart) +# important: it must be before cuda_add_library and any link target to cudart +set(CUDA_LIBRARIES deepmd_dyn_cudart) + +# set c++ version c++11 +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CUDA_STANDARD 11) +# nvcc -o libdeeppaddle_md_op_cuda.so -I/usr/local/cub-1.8.0 -rdc=true -DHIGH_PREC=true +# -gencode arch=compute_61,code=sm_61 -shared -Xcompiler -fPIC deepmd_op.cu +# -L/usr/local/cuda/lib64 -lcudadevrt very important here! Include path to cub. +# for searching device compute capability, +# https://developer.nvidia.com/cuda-gpus + +# cub has been included in CUDA Toolkit 11, we do not need to include it any +# more see https://github.com/NVIDIA/cub +include_directories("${PADDLE_LIB}/paddle/include") +set(LIB_PATH "paddle/lib") +find_library(PADDLE_FLUID_SHARED_LIB NAMES "libpaddle_inference.so" PATHS ${PADDLE_INFERENCE_DIR}/${LIB_PATH}) +if(${CUDA_VERSION_MAJOR} LESS_EQUAL "10") + include_directories(cub) +endif() + +message(STATUS "CUDA major version is " ${CUDA_VERSION_MAJOR}) + +if(${CUDA_VERSION_MAJOR} GREATER "11" + OR (${CUDA_VERSION_MAJOR} STREQUAL "11" AND ${CUDA_VERSION_MINOR} + GREATER_EQUAL "5")) + # nvcc flags + set(CUDA_NVCC_FLAGS + -arch=all; # embeds a compiled code image for all supported architectures + # (sm_*), and a PTX program for the highest major virtual + # architecture + -O3; + -Xcompiler + -fPIC; + ${CUDA_NVCC_FLAGS}) +elseif(${CUDA_VERSION_MAJOR} STREQUAL "11" AND ${CUDA_VERSION_MINOR} GREATER + "0") + # nvcc flags + set(CUDA_NVCC_FLAGS + -gencode + arch=compute_52,code=sm_52; # Tesla M40, Tesla M40, Quadro M6000... + -gencode + arch=compute_53,code=sm_53; + -gencode + arch=compute_60,code=sm_60; # Pascal – GP100/Tesla P100 – DGX-1 (Generic + # Pascal) + -gencode + arch=compute_61,code=sm_61; # Pascal - GTX 1080, GTX 1070, GTX 1060, GTX + # 1050, GTX 1030, Titan Xp, Tesla P40, Tesla + # P4, Discrete GPU on the NVIDIA Drive PX2 + -gencode + arch=compute_70,code=sm_70; # Volta - GV100/Tesla V100, GTX 1180 (GV104) + -gencode + arch=compute_75,code=sm_75; # Turing - RTX 2080, Titan RTX, Quadro R8000 + -gencode + arch=compute_80,code=sm_80; # Ampere - A100 + -gencode + arch=compute_86,code=sm_86; # Ampere - RTX 3090 + -O3; + -Xcompiler + -fPIC; + ${CUDA_NVCC_FLAGS}) +elseif(${CUDA_VERSION_MAJOR} STREQUAL "11" AND ${CUDA_VERSION_MINOR} STREQUAL + "0") + # nvcc flags + set(CUDA_NVCC_FLAGS + -gencode + arch=compute_52,code=sm_52; # Tesla M40, Tesla M40, Quadro M6000... 
+ -gencode + arch=compute_53,code=sm_53; + -gencode + arch=compute_60,code=sm_60; # Pascal – GP100/Tesla P100 – DGX-1 (Generic + # Pascal) + -gencode + arch=compute_61,code=sm_61; # Pascal - GTX 1080, GTX 1070, GTX 1060, GTX + # 1050, GTX 1030, Titan Xp, Tesla P40, Tesla + # P4, Discrete GPU on the NVIDIA Drive PX2 + -gencode + arch=compute_70,code=sm_70; # Volta - GV100/Tesla V100, GTX 1180 (GV104) + -gencode + arch=compute_75,code=sm_75; # Turing - RTX 2080, Titan RTX, Quadro R8000 + -gencode + arch=compute_80,code=sm_80; # Ampere - A100 + -O3; + -Xcompiler + -fPIC; + ${CUDA_NVCC_FLAGS}) +elseif(${CUDA_VERSION_MAJOR} STREQUAL "10") + set(CUDA_NVCC_FLAGS + -gencode + arch=compute_30,code=sm_30; # Tesla K10, Quadro K600 K420 K410, + -gencode + arch=compute_35,code=sm_35; # Tesla K20 K40, TITAN Z Black, GTX 780Ti 780 + -gencode + arch=compute_37,code=sm_37; # Tesla K80 + -gencode + arch=compute_50,code=sm_50; # Quadro 620 1200 + -gencode + arch=compute_52,code=sm_52; # Tesla M40 M40, Quadro M6000 M5000 M4000 + # M2000, TITAN X, GTX 980Ti 980 970 960 950 + -gencode + arch=compute_53,code=sm_53; # Jetson TX1, Tegra X1 + -gencode + arch=compute_60,code=sm_60; # Pascal – GP100/Tesla P100 – DGX-1 (Generic + # Pascal) + -gencode + arch=compute_61,code=sm_61; # Pascal - GTX 1080, GTX 1070, GTX 1060, GTX + # 1050, GTX 1030, Titan Xp, Tesla P40, Tesla + # P4, Discrete GPU on the NVIDIA Drive PX2 + -gencode + arch=compute_70,code=sm_70; # Volta - GV100/Tesla V100, GTX 1180 (GV104) + -gencode + arch=compute_75,code=sm_75; # Turing - RTX 2080, Titan RTX, Quadro R8000 + -O3; + -Xcompiler + -fPIC; + ${CUDA_NVCC_FLAGS}) +elseif(${CUDA_VERSION_MAJOR} STREQUAL "9") + set(CUDA_NVCC_FLAGS + -gencode + arch=compute_30,code=sm_30; + -gencode + arch=compute_35,code=sm_35; + -gencode + arch=compute_37,code=sm_37; + -gencode + arch=compute_50,code=sm_50; + -gencode + arch=compute_52,code=sm_52; # Tesla M40, Tesla M40, Quadro M6000... + -gencode + arch=compute_53,code=sm_53; + -gencode + arch=compute_60,code=sm_60; # Pascal – GP100/Tesla P100 – DGX-1 (Generic + # Pascal) + -gencode + arch=compute_61,code=sm_61; # Pascal - GTX 1080, GTX 1070, GTX 1060, GTX + # 1050, GTX 1030, Titan Xp, Tesla P40, Tesla + # P4, Discrete GPU on the NVIDIA Drive PX2 + -gencode + arch=compute_70,code=sm_70; # Volta - GV100/Tesla V100, GTX 1180 (GV104) + -O3; + -Xcompiler + -fPIC; + ${CUDA_NVCC_FLAGS}) +elseif(${CUDA_VERSION_MAJOR} STREQUAL "8") + set(CUDA_NVCC_FLAGS + -gencode + arch=compute_30,code=sm_30; + -gencode + arch=compute_35,code=sm_35; + -gencode + arch=compute_37,code=sm_37; + -gencode + arch=compute_50,code=sm_50; + -gencode + arch=compute_52,code=sm_52; # Tesla M40, Tesla M40, Quadro M6000... + -gencode + arch=compute_53,code=sm_53; + -gencode + arch=compute_60,code=sm_60; # Pascal – GP100/Tesla P100 – DGX-1 (Generic + # Pascal) + -gencode + arch=compute_61,code=sm_61; # Pascal - GTX 1080, GTX 1070, GTX 1060, GTX + # 1050, GTX 1030, Titan Xp, Tesla P40, Tesla + # P4, Discrete GPU on the NVIDIA Drive PX2 + -O3; + -Xcompiler + -fPIC; + ${CUDA_NVCC_FLAGS}) +elseif(${CUDA_VERSION_MAJOR} STREQUAL "7") + set(CUDA_NVCC_FLAGS + -gencode + arch=compute_30,code=sm_30; + -gencode + arch=compute_35,code=sm_35; + -gencode + arch=compute_37,code=sm_37; + -gencode + arch=compute_50,code=sm_50; + -gencode + arch=compute_52,code=sm_52; # Tesla M40, Tesla M40, Quadro M6000... 
+ -gencode + arch=compute_53,code=sm_53; + -O3; + -Xcompiler + -fPIC; + ${CUDA_NVCC_FLAGS}) +else() + message(FATAL_ERROR "unsupported CUDA_VERSION " ${CUDA_VERSION} + ", please use a newer version (>=7.0) of CUDA toolkit!") +endif() + +set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS} -std=c++11 -DCUB_IGNORE_DEPRECATED_CPP_DIALECT -DCUB_IGNORE_DEPRECATED_CPP_DIALECT" +) + +if(${CUDA_VERSION_MAJOR} LESS_EQUAL "11") + # check unsupported -std=c++17 + set(CMAKE_CXX_FLAGS_LIST "${CMAKE_CXX_FLAGS}") + separate_arguments(CMAKE_CXX_FLAGS_LIST) + if("-std=c++17" IN_LIST CMAKE_CXX_FLAGS_LIST) + message( + WARNING + "Environment variable CXXFLAGS contains flag --std=c++17 which is unsupported by CUDA ${CUDA_VERSION}. Such flag will be removed automatically." + ) + string(REPLACE "-std=c++17" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + endif() +endif() + +file(GLOB SOURCE_FILES "*.cu" "*.cc") +cuda_add_library(deepmd_paddle_op_cuda SHARED ${SOURCE_FILES}) +target_link_libraries(deepmd_paddle_op_cuda "${PADDLE_LIB}/paddle/lib/libpaddle_inference.so") +target_include_directories( + deepmd_paddle_op_cuda + PUBLIC $ + $ + "${PADDLE_LIB}/paddle/include/") +target_precompile_headers(deepmd_paddle_op_cuda PUBLIC [["device.h"]]) +if(APPLE) + set_target_properties(deepmd_paddle_op_cuda PROPERTIES INSTALL_RPATH @loader_path) +else() + set_target_properties(deepmd_paddle_op_cuda PROPERTIES INSTALL_RPATH "$ORIGIN" IMPORTED_LOCATION ${PADDLE_FLUID_SHARED_LIB}) +endif() + +if(BUILD_CPP_IF AND NOT BUILD_PY_IF) + install( + TARGETS deepmd_paddle_op_cuda + EXPORT ${CMAKE_PROJECT_NAME}Targets + DESTINATION lib/) +endif(BUILD_CPP_IF AND NOT BUILD_PY_IF) +if(BUILD_PY_IF) + install(TARGETS deepmd_paddle_op_cuda DESTINATION deepmd/op/) +endif(BUILD_PY_IF) + diff --git a/source/lib/paddle_src/neighbor_stat.cu b/source/lib/paddle_src/paddle_neighbor_stat.cc similarity index 64% rename from source/lib/paddle_src/neighbor_stat.cu rename to source/lib/paddle_src/paddle_neighbor_stat.cc index 6754f3efc9..0262a7bdd0 100644 --- a/source/lib/paddle_src/neighbor_stat.cu +++ b/source/lib/paddle_src/paddle_neighbor_stat.cc @@ -1,36 +1,22 @@ -// #include -// #include -// #include -#include -#include -#include -#include "paddle/extension.h" - #include "device.h" -#include "prod_virial.h" -#include "gpu_cuda.h" - -#include "paddle/extension.h" #include "errors.h" #include "neighbor_list.h" -#include "device.h" +#include "paddle/extension.h" #undef PADDLE_WITH_CUDA -// #define CHECK_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") #define CHECK_INPUT_CPU(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") -#define CHECK_INPUT_DIM(x, value) PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".") -// #define CHECK_INPUT_READY(x) PD_CHECK(x.IsInitialized(), #x " must be initialized before usage.") +#define CHECK_INPUT_DIM(x, value) \ + PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".") typedef double boxtensor_t; typedef double compute_t; std::vector NeighborStatOpCPUForward( - const paddle::Tensor& coord_tensor, - const paddle::Tensor& type_tensor, - const paddle::Tensor& natoms_tensor, - const paddle::Tensor& box_tensor, - const paddle::Tensor& mesh_tensor, - float rcut -) { + const paddle::Tensor& coord_tensor /*fp32/64*/, + const paddle::Tensor& type_tensor /*int32*/, + const paddle::Tensor& natoms_tensor /*int64*/, + const paddle::Tensor& box_tensor /*fp32/64*/, + const paddle::Tensor& mesh_tensor /*int32*/, + const float& rcut) { CHECK_INPUT_CPU(coord_tensor); CHECK_INPUT_CPU(type_tensor); 
CHECK_INPUT_CPU(natoms_tensor); @@ -42,7 +28,8 @@ std::vector NeighborStatOpCPUForward( CHECK_INPUT_DIM(natoms_tensor, 1); CHECK_INPUT_DIM(box_tensor, 2); CHECK_INPUT_DIM(mesh_tensor, 1); - PD_CHECK(natoms_tensor.shape()[0] >= 3, "number of atoms should be larger than (or equal to) 3"); + PD_CHECK(natoms_tensor.shape()[0] >= 3, + "number of atoms should be larger than (or equal to) 3"); const int64_t* natoms = natoms_tensor.data(); int64_t nloc = natoms[0]; @@ -50,13 +37,13 @@ std::vector NeighborStatOpCPUForward( int64_t nsamples = coord_tensor.shape()[0]; int64_t ntypes = natoms_tensor.shape()[0] - 2; - PD_CHECK(nsamples == type_tensor.shape()[0], "number of samples should match"); + PD_CHECK(nsamples == type_tensor.shape()[0], + "number of samples should match"); PD_CHECK(nsamples == box_tensor.shape()[0], "number of samples should match"); PD_CHECK(nall * 3 == coord_tensor.shape()[1], "number of atoms should match"); PD_CHECK(nall == type_tensor.shape()[1], "number of atoms should match"); PD_CHECK(9 == box_tensor.shape()[1], "number of box should be 9"); - // std::cout << "1" << std::endl; int nei_mode = 0; if (mesh_tensor.shape()[0] == 6) { // manual copied pbc @@ -74,23 +61,15 @@ std::vector NeighborStatOpCPUForward( std::vector max_nbor_size_shape = {nloc, ntypes}; paddle::Tensor max_nbor_size_tensor = paddle::zeros( - max_nbor_size_shape, - type_tensor.dtype(), - type_tensor.place() - ); - // std::cout << "2" << std::endl; - - const auto* coord = coord_tensor.data(); - // std::cout << "3" << std::endl; - const auto* type = type_tensor.data(); - // std::cout << "4" << std::endl; - const auto* box = box_tensor.data(); - // std::cout << "5" << std::endl; - const auto* mesh = mesh_tensor.data(); - // std::cout << "6" << std::endl; - auto *max_nbor_size = max_nbor_size_tensor.mutable_data(); - // std::cout << "7" << std::endl; + max_nbor_size_shape, type_tensor.dtype(), type_tensor.place()); + + const float* coord = coord_tensor.data(); + const int* type = type_tensor.data(); + const float* box = box_tensor.data(); + const int* mesh = mesh_tensor.data(); + int* max_nbor_size = max_nbor_size_tensor.data(); + // set region boxtensor_t boxt[9] = {0}; for (int dd = 0; dd < 9; ++dd) { boxt[dd] = box[dd]; @@ -132,7 +111,7 @@ std::vector NeighborStatOpCPUForward( std::vector bk_d_type = d_type; std::vector ncell, ngcell; copy_coord(d_coord3, d_type, nlist_map, ncell, ngcell, bk_d_coord3, - bk_d_type, rcut, region); + bk_d_type, rcut, region); b_nlist_map = true; std::vector nat_stt(3, 0); std::vector ext_stt(3), ext_end(3); @@ -150,18 +129,14 @@ std::vector NeighborStatOpCPUForward( int MAX_NNEI = 0; for (int ii = 0; ii < nloc; ii++) { - MAX_NNEI = MAX_NNEI < d_nlist_r[ii].size() ? d_nlist_r[ii].size() : MAX_NNEI; + MAX_NNEI = + MAX_NNEI < d_nlist_r[ii].size() ? 
d_nlist_r[ii].size() : MAX_NNEI; } - // allocate output tensor for deepmd-kit std::vector min_nbor_dist_shape = {nloc * MAX_NNEI}; paddle::Tensor min_nbor_dist_tensor = paddle::full( - min_nbor_dist_shape, - 10000.0, - coord_tensor.dtype(), - coord_tensor.place() - ); - auto* min_nbor_dist = min_nbor_dist_tensor.mutable_data(); + min_nbor_dist_shape, 10000.0, coord_tensor.dtype(), coord_tensor.place()); + auto* min_nbor_dist = min_nbor_dist_tensor.data(); #pragma omp parallel for for (int ii = 0; ii < nloc; ii++) { @@ -181,35 +156,21 @@ std::vector NeighborStatOpCPUForward( return {max_nbor_size_tensor, min_nbor_dist_tensor}; } - std::vector NeighborStatForward( - const paddle::Tensor& coord_tensor, /*float32*/ - const paddle::Tensor& type_tensor, /*int32*/ - const paddle::Tensor& natoms_tensor, /*int64*/ - const paddle::Tensor& box_tensor, /*float32*/ - const paddle::Tensor& mesh_tensor, /*int32*/ - float rcut -) { + const paddle::Tensor& coord_tensor, /*float32*/ + const paddle::Tensor& type_tensor, /*int32*/ + const paddle::Tensor& natoms_tensor, /*int64*/ + const paddle::Tensor& box_tensor, /*float32*/ + const paddle::Tensor& mesh_tensor, /*int32*/ + float rcut) { if (coord_tensor.is_cpu()) { - // std::cout << coord_tensor.dtype() << std::endl; - // std::cout << type_tensor.dtype() << std::endl; - // std::cout << natoms_tensor.dtype() << std::endl; - // std::cout << box_tensor.dtype() << std::endl; - // std::cout << mesh_tensor.dtype() << std::endl; - return NeighborStatOpCPUForward( - coord_tensor, - type_tensor, - natoms_tensor, - box_tensor, - mesh_tensor, - rcut - ); + return NeighborStatOpCPUForward(coord_tensor, type_tensor, natoms_tensor, + box_tensor, mesh_tensor, rcut); } else { - PD_THROW("Unsupported device type for forward function of custom relu operator."); + PD_THROW("NeighborStatForward only support CPU device."); } } - PD_BUILD_OP(neighbor_stat) .Inputs({"coord", "type", "natoms", "box", "mesh"}) .Outputs({"max_nbor_size", "min_nbor_dist"}) diff --git a/source/lib/paddle_src/paddle_prod_env_mat.cc b/source/lib/paddle_src/paddle_prod_env_mat.cc new file mode 100644 index 0000000000..824c993d46 --- /dev/null +++ b/source/lib/paddle_src/paddle_prod_env_mat.cc @@ -0,0 +1,540 @@ +#include +#include + +#include "coord.h" +#include "env_mat.h" +#include "fmt_nlist.h" +#include "gpu_cuda.h" +#include "neighbor_list.h" +#include "paddle/extension.h" +#include "prod_env_mat.h" +#include "region.h" +#include "utilities.h" + +typedef long long int_64; + +#define CHECK_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") +#define CHECK_INPUT_DIM(x, value) \ + PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".") +// #define CHECK_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") + +// void cum_sum(std::vector& sec, const std::vector& n_sel); + +template +static int _build_nlist_cpu(std::vector &ilist, + std::vector &numneigh, + std::vector &firstneigh, + std::vector> &jlist, + int &max_nnei, + int &mem_nnei, + const FPTYPE *coord, + const int &nloc, + const int &new_nall, + const int &max_nnei_trial, + const float &rcut_r) { + int tt; + for (tt = 0; tt < max_nnei_trial; ++tt) { + for (int ii = 0; ii < nloc; ++ii) { + jlist[ii].resize(mem_nnei); + firstneigh[ii] = &jlist[ii][0]; + } + deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]); + int ret = build_nlist_cpu(inlist, &max_nnei, coord, nloc, new_nall, + mem_nnei, rcut_r); + if (ret == 0) { + break; + } else { + mem_nnei *= 2; + } + } + return (tt != max_nnei_trial); +} + 
+template +static int _norm_copy_coord_cpu(std::vector &coord_cpy, + std::vector &type_cpy, + std::vector &idx_mapping, + int &nall, + int &mem_cpy, + const FPTYPE *coord, + const FPTYPE *box, + const int *type, + const int &nloc, + const int &max_cpy_trial, + const float &rcut_r) { + std::vector tmp_coord(nall * 3); + std::copy(coord, coord + nall * 3, tmp_coord.begin()); + deepmd::Region region; + init_region_cpu(region, box); + normalize_coord_cpu(&tmp_coord[0], nall, region); + int tt; + for (tt = 0; tt < max_cpy_trial; ++tt) { + coord_cpy.resize(mem_cpy * 3); + type_cpy.resize(mem_cpy); + idx_mapping.resize(mem_cpy); + int ret = + copy_coord_cpu(&coord_cpy[0], &type_cpy[0], &idx_mapping[0], &nall, + &tmp_coord[0], type, nloc, mem_cpy, rcut_r, region); + if (ret == 0) { + break; + } else { + mem_cpy *= 2; + } + } + return (tt != max_cpy_trial); +} + +template +static void _prepare_coord_nlist_cpu(FPTYPE const **coord, + std::vector &coord_cpy, + int const **type, + std::vector &type_cpy, + std::vector &idx_mapping, + deepmd::InputNlist &inlist, + std::vector &ilist, + std::vector &numneigh, + std::vector &firstneigh, + std::vector> &jlist, + int &new_nall, + int &mem_cpy, + int &mem_nnei, + int &max_nbor_size, + const FPTYPE *box, + const int *mesh_tensor_data, + const int &nloc, + const int &nei_mode, + const float &rcut_r, + const int &max_cpy_trial, + const int &max_nnei_trial) { + inlist.inum = nloc; + if (nei_mode != 3) { + // build nlist by myself + // normalize and copy coord + if (nei_mode == 1) { + int copy_ok = _norm_copy_coord_cpu(coord_cpy, type_cpy, idx_mapping, + new_nall, mem_cpy, *coord, box, *type, + nloc, max_cpy_trial, rcut_r); + PD_CHECK(copy_ok, "cannot allocate mem for copied coords"); + *coord = &coord_cpy[0]; + *type = &type_cpy[0]; + } + // build nlist + int build_ok = _build_nlist_cpu(ilist, numneigh, firstneigh, jlist, + max_nbor_size, mem_nnei, *coord, nloc, + new_nall, max_nnei_trial, rcut_r); + PD_CHECK(build_ok, "cannot allocate mem for nlist"); + inlist.ilist = &ilist[0]; + inlist.numneigh = &numneigh[0]; + inlist.firstneigh = &firstneigh[0]; + } else { + // copy pointers to nlist data + memcpy(&inlist.ilist, 4 + mesh_tensor_data, sizeof(int *)); + memcpy(&inlist.numneigh, 8 + mesh_tensor_data, sizeof(int *)); + memcpy(&inlist.firstneigh, 12 + mesh_tensor_data, sizeof(int **)); + max_nbor_size = max_numneigh(inlist); + } +} + +static void _map_nlist_cpu(int *nlist, + const int *idx_mapping, + const int &nloc, + const int &nnei) { + for (int ii = 0; ii < nloc; ++ii) { + for (int jj = 0; jj < nnei; ++jj) { + int record = nlist[ii * nnei + jj]; + if (record >= 0) { + nlist[ii * nnei + jj] = idx_mapping[record]; + } + } + } +} + +std::vector ProdEnvMatACUDAForward( + const paddle::Tensor &coord_tensor, + const paddle::Tensor &atype_tensor, + const paddle::Tensor &box_tensor, + const paddle::Tensor &mesh_tensor, + const paddle::Tensor &t_avg_tensor, + const paddle::Tensor &t_std_tensor, + const paddle::Tensor &natoms_tensor, + float rcut_a, + float rcut_r, + float rcut_r_smth, + std::vector sel_a, + std::vector sel_r); + +template +void deepmd::prod_env_mat_a_cpu(FPTYPE *em, + FPTYPE *em_deriv, + FPTYPE *rij, + int *nlist, + const FPTYPE *coord, + const int *type, + const InputNlist &inlist, + const int max_nbor_size, + const FPTYPE *avg, + const FPTYPE *std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec, + const int *f_type) { + if (f_type == NULL) { + f_type = type; + } + const int nnei = 
sec.back(); + const int nem = nnei * 4; + + // set & normalize coord + std::vector d_coord3(nall * 3); + for (int ii = 0; ii < nall; ++ii) { + for (int dd = 0; dd < 3; ++dd) { + d_coord3[ii * 3 + dd] = coord[ii * 3 + dd]; + } + } + + // set type + std::vector d_f_type(nall); + for (int ii = 0; ii < nall; ++ii) { + d_f_type[ii] = f_type[ii]; + } + + // build nlist + std::vector> d_nlist_a(nloc); + + assert(nloc == inlist.inum); + for (unsigned ii = 0; ii < nloc; ++ii) { + d_nlist_a[ii].reserve(max_nbor_size); + } + for (unsigned ii = 0; ii < nloc; ++ii) { + int i_idx = inlist.ilist[ii]; + for (unsigned jj = 0; jj < inlist.numneigh[ii]; ++jj) { + int j_idx = inlist.firstneigh[ii][jj]; + d_nlist_a[i_idx].push_back(j_idx); + } + } + +#pragma omp parallel for + for (int ii = 0; ii < nloc; ++ii) { + std::vector fmt_nlist_a; + int ret = format_nlist_i_cpu(fmt_nlist_a, d_coord3, d_f_type, ii, + d_nlist_a[ii], rcut, sec); + std::vector d_em_a; + std::vector d_em_a_deriv; + std::vector d_em_r; + std::vector d_em_r_deriv; + std::vector d_rij_a; + deepmd::env_mat_a_cpu(d_em_a, d_em_a_deriv, d_rij_a, d_coord3, d_f_type, ii, + fmt_nlist_a, sec, rcut_smth, rcut); + + // check sizes + assert(d_em_a.size() == nem); + assert(d_em_a_deriv.size() == nem * 3); + assert(d_rij_a.size() == nnei * 3); + assert(fmt_nlist_a.size() == nnei); + // record outputs + for (int jj = 0; jj < nem; ++jj) { + if (type[ii] >= 0) { + em[ii * nem + jj] = + (d_em_a[jj] - avg[type[ii] * nem + jj]) / std[type[ii] * nem + jj]; + } else { + em[ii * nem + jj] = 0; + } + } + for (int jj = 0; jj < nem * 3; ++jj) { + if (type[ii] >= 0) { + em_deriv[ii * nem * 3 + jj] = + d_em_a_deriv[jj] / std[type[ii] * nem + jj / 3]; + } else { + em_deriv[ii * nem * 3 + jj] = 0; + } + } + for (int jj = 0; jj < nnei * 3; ++jj) { + rij[ii * nnei * 3 + jj] = d_rij_a[jj]; + } + for (int jj = 0; jj < nnei; ++jj) { + nlist[ii * nnei + jj] = fmt_nlist_a[jj]; + } + } +} + +template +void prod_env_mat_a_cpu_forward_kernel(int nsamples, + int nloc, + int ndescrpt, + int nnei, + int nall, + int mem_cpy, + int mem_nnei, + int max_nbor_size, + const int *mesh_tensor_data, + int nei_mode, + float rcut_a, + float rcut_r, + float rcut_r_smth, + int max_cpy_trial, + int max_nnei_trial, + bool b_nlist_map, + const std::vector &sec_a, + const std::vector &sec_r, + data_t *p_em, + data_t *p_em_deriv, + data_t *p_rij, + int *p_nlist, + const data_t *p_coord, + const data_t *p_box, + const data_t *avg, + const data_t *std, + const int *p_type) { + for (size_t ff = 0; ff < nsamples; ++ff) { + data_t *em = p_em + ff * nloc * ndescrpt; + data_t *em_deriv = p_em_deriv + ff * nloc * ndescrpt * 3; + data_t *rij = p_rij + ff * nloc * nnei * 3; + int *nlist = p_nlist + ff * nloc * nnei; + const data_t *coord = p_coord + ff * nall * 3; + const data_t *box = p_box + ff * 9; + const int *type = p_type + ff * nall; + + deepmd::InputNlist inlist; + // some buffers, be freed after the evaluation of this frame + std::vector idx_mapping; + std::vector ilist(nloc), numneigh(nloc); + std::vector firstneigh(nloc); + std::vector> jlist(nloc); + std::vector coord_cpy; + std::vector type_cpy; + int frame_nall = nall; + // prepare coord and nlist + _prepare_coord_nlist_cpu( + &coord, coord_cpy, &type, type_cpy, idx_mapping, inlist, ilist, + numneigh, firstneigh, jlist, frame_nall, mem_cpy, mem_nnei, + max_nbor_size, box, mesh_tensor_data, nloc, nei_mode, rcut_r, + max_cpy_trial, max_nnei_trial); + // launch the cpu compute function + deepmd::prod_env_mat_a_cpu(em, em_deriv, rij, nlist, coord, 
type, inlist, + max_nbor_size, avg, std, nloc, frame_nall, + rcut_r, rcut_r_smth, sec_a); + // do nlist mapping if coords were copied + if (b_nlist_map) _map_nlist_cpu(nlist, &idx_mapping[0], nloc, nnei); + } +} + +std::vector ProdEnvMatACPUForward( + const paddle::Tensor &coord_tensor, + const paddle::Tensor &atype_tensor, + const paddle::Tensor &box_tensor, + const paddle::Tensor &mesh_tensor, + const paddle::Tensor &t_avg_tensor, + const paddle::Tensor &t_std_tensor, + const paddle::Tensor &natoms_tensor, + float rcut_a, + float rcut_r, + float rcut_r_smth, + std::vector sel_a, + std::vector sel_r) { + CHECK_INPUT(coord_tensor); + CHECK_INPUT(atype_tensor); + CHECK_INPUT(natoms_tensor); + CHECK_INPUT(box_tensor); + CHECK_INPUT(mesh_tensor); + CHECK_INPUT(t_avg_tensor); + CHECK_INPUT(t_std_tensor); + + std::vector sec_a; + std::vector sec_r; + int ndescrpt, ndescrpt_a, ndescrpt_r; + int nnei, nnei_a, nnei_r, max_nbor_size; + int mem_cpy, max_cpy_trial; + int mem_nnei, max_nnei_trial; + std::string device; + // int* array_int = NULL; + // unsigned long long* array_longlong = NULL; + // deepmd::InputNlist gpu_inlist; + // int* nbor_list_dev = NULL; + // float nloc_f, nall_f; + + deepmd::cum_sum(sec_a, sel_a); + deepmd::cum_sum(sec_r, sel_r); + ndescrpt_a = sec_a.back() * 4; + ndescrpt_r = sec_r.back() * 1; + ndescrpt = ndescrpt_a + ndescrpt_r; + nnei_a = sec_a.back(); + nnei_r = sec_r.back(); + nnei = nnei_a + nnei_r; + max_nbor_size = 1024; + max_cpy_trial = 100; + mem_cpy = 256; + max_nnei_trial = 100; + mem_nnei = 256; + + CHECK_INPUT_DIM(coord_tensor, 2); + CHECK_INPUT_DIM(atype_tensor, 2); + CHECK_INPUT_DIM(natoms_tensor, 1); + CHECK_INPUT_DIM(box_tensor, 2); + CHECK_INPUT_DIM(mesh_tensor, 1); + CHECK_INPUT_DIM(t_avg_tensor, 2); + CHECK_INPUT_DIM(t_std_tensor, 2); + + PD_CHECK(sec_r.back() == 0, + "Rotational free descriptor only support all-angular information: " + "sel_r should be all zero."); + PD_CHECK(natoms_tensor.shape()[0] >= 3, + "Number of atoms should be larger than (or equal to) 3"); + // Paddle Set device on Python not in custom op + const int *natoms = natoms_tensor.data(); + int nloc = natoms[0]; + int nall = natoms[1]; + int ntypes = natoms_tensor.shape()[0] - 2; // nloc and nall mean something. 
+ int nsamples = coord_tensor.shape()[0]; + // check the sizes + PD_CHECK(nsamples == atype_tensor.shape()[0], + "number of samples should match"); + PD_CHECK(nsamples == box_tensor.shape()[0], "number of samples should match"); + PD_CHECK(ntypes == t_avg_tensor.shape()[0], "number of avg should be ntype"); + PD_CHECK(ntypes == t_std_tensor.shape()[0], "number of std should be ntype"); + PD_CHECK(nall * 3 == coord_tensor.shape()[1], "number of atoms should match"); + PD_CHECK(nall == atype_tensor.shape()[1], "number of atoms should match"); + PD_CHECK(9 == box_tensor.shape()[1], "number of box should be 9"); + PD_CHECK(ndescrpt == t_avg_tensor.shape()[1], + "number of avg should be ndescrpt"); + PD_CHECK(ndescrpt == t_std_tensor.shape()[1], + "number of std should be ndescrpt"); + PD_CHECK(ntypes == int(sel_a.size()), + "number of types should match the length of sel array"); + PD_CHECK(ntypes == int(sel_r.size()), + "number of types should match the length of sel array"); + + int nei_mode = 0; + bool b_nlist_map = false; + if (mesh_tensor.shape()[0] == 16) { + // lammps neighbor list + nei_mode = 3; + } else if (mesh_tensor.shape()[0] == 6) { + // manual copied pbc + assert(nloc == nall); + nei_mode = 1; + b_nlist_map = true; + } else if (mesh_tensor.shape()[0] == 0) { + // no pbc + assert(nloc == nall); + nei_mode = -1; + } else { + PD_THROW("Invalid mesh tensor"); + } + + // Create output tensors shape + std::vector descrpt_shape{nsamples, (int64_t)nloc * ndescrpt}; + std::vector descrpt_deriv_shape{nsamples, + (int64_t)nloc * ndescrpt * 3}; + std::vector rij_shape{nsamples, (int64_t)nloc * nnei * 3}; + std::vector nlist_shape{nsamples, (int64_t)nloc * nnei}; + // define output tensor + paddle::Tensor descrpt_tensor = + paddle::empty(descrpt_shape, coord_tensor.dtype(), coord_tensor.place()); + + paddle::Tensor descrpt_deriv_tensor = paddle::empty( + descrpt_deriv_shape, coord_tensor.dtype(), coord_tensor.place()); + + paddle::Tensor rij_tensor = + paddle::empty(rij_shape, coord_tensor.dtype(), coord_tensor.place()); + + paddle::Tensor nlist_tensor = + paddle::empty(nlist_shape, paddle::DataType::INT32, coord_tensor.place()); + PD_DISPATCH_FLOATING_TYPES( + coord_tensor.type(), "prod_env_mat_a_cpu_forward_kernel", ([&] { + prod_env_mat_a_cpu_forward_kernel( + nsamples, nloc, ndescrpt, nnei, nall, mem_cpy, mem_nnei, + max_nbor_size, mesh_tensor.data(), nei_mode, rcut_a, rcut_r, + rcut_r_smth, max_cpy_trial, max_nnei_trial, b_nlist_map, sec_a, + sec_r, descrpt_tensor.data(), + descrpt_deriv_tensor.data(), rij_tensor.data(), + nlist_tensor.data(), coord_tensor.data(), + box_tensor.data(), t_avg_tensor.data(), + t_std_tensor.data(), atype_tensor.data()); + })); + + return {descrpt_tensor, descrpt_deriv_tensor, rij_tensor, nlist_tensor}; +} + +std::vector ProdEnvMatAForward( + const paddle::Tensor &coord_tensor, + const paddle::Tensor &atype_tensor, + const paddle::Tensor &mesh_tensor, + const paddle::Tensor &box_tensor, + const paddle::Tensor &t_avg_tensor, + const paddle::Tensor &t_std_tensor, + const paddle::Tensor &natoms_tensor, + float rcut_a, + float rcut_r, + float rcut_r_smth, + std::vector sel_a, + std::vector sel_r) { + if (coord_tensor.is_gpu()) { + return ProdEnvMatACUDAForward( + coord_tensor, atype_tensor, mesh_tensor, box_tensor, t_avg_tensor, + t_std_tensor, natoms_tensor.copy_to(paddle::CPUPlace(), false), rcut_a, + rcut_r, rcut_r_smth, sel_a, sel_r); + } else { + return ProdEnvMatACPUForward( + coord_tensor, atype_tensor, mesh_tensor, box_tensor, t_avg_tensor, + 
t_std_tensor, natoms_tensor, rcut_a, rcut_r, rcut_r_smth, sel_a, sel_r); + } +} + +std::vector> ProdEnvMatAInferShape( + std::vector coord_shape, + std::vector atype_shape, + std::vector box_shape, + std::vector mesh_shape, + std::vector t_avg_shape, + std::vector t_std_shape, + std::vector natoms_shape, + float rcut_a, + float rcut_r, + float rcut_r_smth, + const std::vector &sel_a, + const std::vector &sel_r) { + int64_t nloc = /*natoms[0]*/ 192; + // int64_t nall = /*natoms[1]*/ 192; + + std::vector sec_a; + std::vector sec_r; + deepmd::cum_sum(sec_a, sel_a); + deepmd::cum_sum(sec_r, sel_r); + + int64_t nsamples = coord_shape[0]; + int64_t ndescrpt_a = sec_a.back() * 4; + int64_t ndescrpt_r = sec_r.back() * 1; + int64_t ndescrpt = ndescrpt_a + ndescrpt_r; + + int64_t nnei_a = sec_a.back(); + int64_t nnei_r = sec_r.back(); + int64_t nnei = nnei_a + nnei_r; + + std::vector descrpt_shape = {nsamples, nloc * ndescrpt}; + std::vector descrpt_deriv_shape = {nsamples, nloc * ndescrpt * 3}; + std::vector rij_shape = {nsamples, nloc * nnei * 3}; + std::vector nlist_shape = {nsamples, nloc * nnei}; + return {descrpt_shape, descrpt_deriv_shape, rij_shape, nlist_shape}; +} + +std::vector ProdEnvMatAInferDtype( + paddle::DataType coord_dtype, + paddle::DataType atype_dtype, + paddle::DataType box_dtype, + paddle::DataType mesh_dtype, + paddle::DataType t_avg_dtype, + paddle::DataType t_std_dtype, + paddle::DataType natoms_dtype) { + return {coord_dtype, coord_dtype, coord_dtype, coord_dtype}; +} + +PD_BUILD_OP(prod_env_mat_a) + .Inputs({"coord", "atype", "box", "mesh", "t_avg", "t_std", "natoms"}) + .Outputs({"descrpt", "descrpt_deriv", "rij", "nlist"}) + .Attrs({"rcut_a: float", "rcut_r: float", "rcut_r_smth: float", + "sel_a: std::vector", "sel_r: std::vector"}) + .SetKernelFn(PD_KERNEL(ProdEnvMatAForward)) + .SetInferShapeFn(PD_INFER_SHAPE(ProdEnvMatAInferShape)) + .SetInferDtypeFn(PD_INFER_DTYPE(ProdEnvMatAInferDtype)); diff --git a/source/lib/paddle_src/prod_env_mat.cu b/source/lib/paddle_src/paddle_prod_env_mat.cu similarity index 62% rename from source/lib/paddle_src/prod_env_mat.cu rename to source/lib/paddle_src/paddle_prod_env_mat.cu index 81270a0c81..42f35bb43e 100644 --- a/source/lib/paddle_src/prod_env_mat.cu +++ b/source/lib/paddle_src/paddle_prod_env_mat.cu @@ -1,25 +1,28 @@ #include #include #include +#include #include "paddle/extension.h" #define GOOGLE_CUDA 1 #include -#include "utilities.h" +#include + #include "coord.h" #include "fmt_nlist.h" -#include "region.h" +#include "gpu_cuda.h" #include "neighbor_list.h" #include "prod_env_mat.h" -#include "gpu_cuda.h" -#include +#include "region.h" +#include "utilities.h" typedef long long int_64; #define CHECK_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") -#define CHECK_INPUT_DIM(x, value) PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".") -// #define CHECK_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") +#define CHECK_INPUT_ON_CPU(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") +#define CHECK_INPUT_DIM(x, value) \ + PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".") __device__ inline double _sqrt(double x) { return sqrt(x); } __device__ inline float _sqrt(float x) { return sqrtf(x); } @@ -27,71 +30,63 @@ __device__ inline double _rsqrt(double x) { return rsqrt(x); } __device__ inline float _rsqrt(float x) { return rsqrtf(x); } template -static int -_norm_copy_coord_gpu( - std::vector* tensor_list, - FPTYPE *&coord_cpy, - int *&type_cpy, - int *&idx_mapping, - int 
&nall, - int &mem_cpy, - const FPTYPE *coord, - const FPTYPE *box, - const int *type, - const int &nloc, - const int &max_cpy_trial, - const float &rcut_r); +static int _norm_copy_coord_gpu(std::vector* tensor_list, + FPTYPE*& coord_cpy, + int*& type_cpy, + int*& idx_mapping, + int& nall, + int& mem_cpy, + const FPTYPE* coord, + const FPTYPE* box, + const int* type, + const int& nloc, + const int& max_cpy_trial, + const float& rcut_r); template -static int -_build_nlist_gpu( - std::vector *tensor_list, - int *&ilist, - int *&numneigh, - int **&firstneigh, - int *&jlist, - int &max_nnei, - int &mem_nnei, - const FPTYPE *coord, - const int &nloc, - const int &new_nall, - const int &max_nnei_trial, - const float &rcut_r); - -static void -_map_nlist_gpu( - int *nlist, - const int *idx_mapping, - const int &nloc, - const int &nnei); +static int _build_nlist_gpu(std::vector* tensor_list, + int*& ilist, + int*& numneigh, + int**& firstneigh, + int*& jlist, + int& max_nnei, + int& mem_nnei, + const FPTYPE* coord, + const int& nloc, + const int& new_nall, + const int& max_nnei_trial, + const float& rcut_r); + +static void _map_nlist_gpu(int* nlist, + const int* idx_mapping, + const int& nloc, + const int& nnei); template -static void -_prepare_coord_nlist_gpu( - std::vector *tensor_list, - FPTYPE const **coord, - FPTYPE *&coord_cpy, - int const **type, - int *&type_cpy, - int *&idx_mapping, - deepmd::InputNlist &inlist, - int *&ilist, - int *&numneigh, - int **&firstneigh, - int *&jlist, - int *&nbor_list_dev, - int &new_nall, - int &mem_cpy, - int &mem_nnei, - int &max_nbor_size, - const FPTYPE *box, - const int *mesh_tensor_data, - const int mesh_tensor_size, - const int &nloc, - const int &nei_mode, - const float &rcut_r, - const int &max_cpy_trial, - const int &max_nnei_trial); +static void _prepare_coord_nlist_gpu(std::vector* tensor_list, + FPTYPE const** coord, + FPTYPE*& coord_cpy, + int const** type, + int*& type_cpy, + int*& idx_mapping, + deepmd::InputNlist& inlist, + int*& ilist, + int*& numneigh, + int**& firstneigh, + int*& jlist, + int*& nbor_list_dev, + int& new_nall, + int& mem_cpy, + int& mem_nnei, + int& max_nbor_size, + const FPTYPE* box, + const int* mesh_tensor_data, + const int mesh_tensor_size, + const int& nloc, + const int& nei_mode, + const float& rcut_r, + const int& max_cpy_trial, + const int& max_nnei_trial); template __device__ inline uint_64 encoding_nbor_info(const int type, @@ -161,7 +156,6 @@ __launch_bounds__(BLOCK_THREADS) __global__ items); } - template __device__ inline FPTYPE dev_dot(FPTYPE* arr1, FPTYPE* arr2) { return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2]; @@ -426,7 +420,6 @@ void format_nbor_list_4096(uint_64* key, DPErrcheck(cudaDeviceSynchronize()); } - template __global__ void compute_env_mat_a(FPTYPE* em, FPTYPE* em_deriv, @@ -663,7 +656,6 @@ void format_nbor_list_gpu_cuda(int* nlist, sizeof(uint_64) * int_64(nloc) * max_nbor_size)); DPErrcheck(cudaMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(), cudaMemcpyHostToDevice)); - get_i_idx<<>>(i_idx, nloc, gpu_inlist.ilist); DPErrcheck(cudaGetLastError()); DPErrcheck(cudaDeviceSynchronize()); @@ -688,7 +680,7 @@ void format_nbor_list_gpu_cuda(int* nlist, DPErrcheck(cudaGetLastError()); DPErrcheck(cudaDeviceSynchronize()); } -} +} // namespace deepmd namespace deepmd { @@ -769,94 +761,111 @@ void prod_env_mat_r_gpu_cuda(FPTYPE* em, DPErrcheck(cudaGetLastError()); DPErrcheck(cudaDeviceSynchronize()); } -} - +} // namespace deepmd template -void prod_env_mat_a_cuda_forward_kernel( - int 
nsamples, int nloc, int ndescrpt, int nnei, int nall, int mem_cpy, int mem_nnei, - int max_nbor_size, int nei_mode, float rcut_a, float rcut_r, float rcut_r_smth, int max_cpy_trial, - int max_nnei_trial, bool b_nlist_map, const std::vector& sec_a, - const std::vector& sec_r, deepmd::InputNlist gpu_inlist, int* nbor_list_dev, int* array_int, unsigned long long* array_longlong, - data_t *p_em, data_t *p_em_deriv, data_t *p_rij, int *p_nlist, - const data_t *p_coord, const data_t *p_box, const data_t *avg, - const data_t *std, const int *p_type, const paddle::Tensor& mesh_tensor) -{ - - for (int ff = 0; ff < nsamples; ++ff) - { - data_t *em = p_em + ff * nloc * ndescrpt; - data_t *em_deriv = p_em_deriv + ff * nloc * ndescrpt * 3; - data_t *rij = p_rij + ff * nloc * nnei * 3; - int *nlist = p_nlist + ff * nloc * nnei; - const data_t *coord = p_coord + ff * nall * 3; - const data_t *box = p_box + ff * 9; - const int *type = p_type + ff * nall; - - - int *idx_mapping = NULL; - int *ilist = NULL, *numneigh = NULL; - int **firstneigh = NULL; - deepmd::malloc_device_memory(firstneigh, nloc); - int *jlist = NULL; - data_t *coord_cpy; - int *type_cpy; - int frame_nall = nall; - int mesh_tensor_size = static_cast(mesh_tensor.size()); - std::vector tensor_list; - _prepare_coord_nlist_gpu( - &tensor_list, &coord, coord_cpy, &type, type_cpy, idx_mapping, - gpu_inlist, ilist, numneigh, firstneigh, jlist, nbor_list_dev, - frame_nall, mem_cpy, mem_nnei, max_nbor_size, - box, mesh_tensor.data(), mesh_tensor_size, nloc, nei_mode, rcut_r, max_cpy_trial, max_nnei_trial); - // allocate temp memory, temp memory must not be used after this operation! - std::vector int_temp_shape{int(sec_a.size()) + nloc * int(sec_a.size()) + nloc}; - auto int_temp = paddle::empty( - int_temp_shape, - paddle::DataType::FLOAT32, - paddle::GPUPlace() - ); - - array_int = int_temp.mutable_data(); - - deepmd::malloc_device_memory(array_longlong, nloc * GPU_MAX_NBOR_SIZE * 2); - // launch the gpu(nv) compute function - - deepmd::prod_env_mat_a_gpu_cuda( - em, em_deriv, rij, nlist, - coord, type, gpu_inlist, array_int, array_longlong, max_nbor_size, avg, std, nloc, frame_nall, rcut_r, rcut_r_smth, sec_a); - if (b_nlist_map) - _map_nlist_gpu(nlist, idx_mapping, nloc, nnei); - deepmd::delete_device_memory(firstneigh); - deepmd::delete_device_memory(array_longlong); - array_longlong = NULL; - } -} - -void cum_sum(std::vector& sec, const std::vector& n_sel) { - sec.resize(n_sel.size() + 1); - sec[0] = 0; - for (int ii = 1; ii < sec.size(); ++ii) { - sec[ii] = sec[ii - 1] + n_sel[ii - 1]; +void prod_env_mat_a_cuda_forward_kernel(int nsamples, + int nloc, + int ndescrpt, + int nnei, + int nall, + int mem_cpy, + int mem_nnei, + int max_nbor_size, + int nei_mode, + float rcut_a, + float rcut_r, + float rcut_r_smth, + int max_cpy_trial, + int max_nnei_trial, + bool b_nlist_map, + const std::vector& sec_a, + const std::vector& sec_r, + deepmd::InputNlist gpu_inlist, + int* nbor_list_dev, + int* array_int, + unsigned long long* array_longlong, + data_t* p_em, + data_t* p_em_deriv, + data_t* p_rij, + int* p_nlist, + const data_t* p_coord, + const data_t* p_box, + const data_t* avg, + const data_t* std, + const int* p_type, + const paddle::Tensor& mesh_tensor) { + for (int ff = 0; ff < nsamples; ++ff) { + data_t* em = p_em + ff * nloc * ndescrpt; + data_t* em_deriv = p_em_deriv + ff * nloc * ndescrpt * 3; + data_t* rij = p_rij + ff * nloc * nnei * 3; + int* nlist = p_nlist + ff * nloc * nnei; + const data_t* coord = p_coord + ff * nall * 3; + const 
data_t* box = p_box + ff * 9; + const int* type = p_type + ff * nall; + + int* idx_mapping = NULL; + int *ilist = NULL, *numneigh = NULL; + int** firstneigh = NULL; + deepmd::malloc_device_memory(firstneigh, nloc); + int* jlist = NULL; + data_t* coord_cpy; + int* type_cpy; + int frame_nall = nall; + int mesh_tensor_size = static_cast(mesh_tensor.size()); + std::vector tensor_list; + // std::vector tensor_list(7); // >>> + // 参照deepmd-kit-tf/source/op/prod_env_mat_multi_device.cc修改 + _prepare_coord_nlist_gpu( + &tensor_list, &coord, coord_cpy, &type, type_cpy, idx_mapping, + gpu_inlist, ilist, numneigh, firstneigh, jlist, nbor_list_dev, + frame_nall, mem_cpy, mem_nnei, max_nbor_size, box, + mesh_tensor.data(), mesh_tensor_size, nloc, nei_mode, rcut_r, + max_cpy_trial, max_nnei_trial); + + // allocate temp memory, temp memory must not be used after this operation! + std::vector int_temp_shape{int(sec_a.size()) + + nloc * int(sec_a.size()) + nloc}; + auto int_temp = paddle::empty(int_temp_shape, paddle::DataType::INT32, + paddle::GPUPlace()); + + array_int = int_temp.data(); + + deepmd::malloc_device_memory(array_longlong, nloc * GPU_MAX_NBOR_SIZE * 2); + // launch the gpu(nv) compute function + + deepmd::prod_env_mat_a_gpu_cuda(em, em_deriv, rij, nlist, coord, type, + gpu_inlist, array_int, array_longlong, + max_nbor_size, avg, std, nloc, frame_nall, + rcut_r, rcut_r_smth, sec_a); + if (b_nlist_map) _map_nlist_gpu(nlist, idx_mapping, nloc, nnei); + deepmd::delete_device_memory(firstneigh); + deepmd::delete_device_memory(array_longlong); + array_longlong = NULL; } } - -std::vector prod_env_mat_a_cuda_forward( - const paddle::Tensor& coord_tensor, - const paddle::Tensor& atype_tensor, - const paddle::Tensor& box_tensor, - const paddle::Tensor& mesh_tensor, - const paddle::Tensor& t_avg_tensor, - const paddle::Tensor& t_std_tensor, - const paddle::Tensor& natoms_tensor, - float rcut_a, - float rcut_r, - float rcut_r_smth, - std::vector sel_a, - std::vector sel_r -) -{ +// void cum_sum(std::vector& sec, const std::vector& n_sel) { +// sec.resize(n_sel.size() + 1); +// sec[0] = 0; +// for (int ii = 1; ii < sec.size(); ++ii) { +// sec[ii] = sec[ii - 1] + n_sel[ii - 1]; +// } +// } + +std::vector ProdEnvMatACUDAForward( + const paddle::Tensor& coord_tensor, + const paddle::Tensor& atype_tensor, + const paddle::Tensor& box_tensor, + const paddle::Tensor& mesh_tensor, + const paddle::Tensor& t_avg_tensor, + const paddle::Tensor& t_std_tensor, + const paddle::Tensor& natoms_tensor, + float rcut_a, + float rcut_r, + float rcut_r_smth, + std::vector sel_a, + std::vector sel_r) { std::vector sec_a; std::vector sec_r; int ndescrpt, ndescrpt_a, ndescrpt_r; @@ -864,18 +873,17 @@ std::vector prod_env_mat_a_cuda_forward( int mem_cpy, max_cpy_trial; int mem_nnei, max_nnei_trial; std::string device; - int *array_int = NULL; - unsigned long long *array_longlong = NULL; + int* array_int = NULL; + unsigned long long* array_longlong = NULL; deepmd::InputNlist gpu_inlist; - int *nbor_list_dev = NULL; - float nloc_f, nall_f; + int* nbor_list_dev = NULL; + // float nloc_f, nall_f; - cum_sum(sec_a, sel_a); - cum_sum(sec_r, sel_r); + deepmd::cum_sum(sec_a, sel_a); + deepmd::cum_sum(sec_r, sel_r); ndescrpt_a = sec_a.back() * 4; ndescrpt_r = sec_r.back() * 1; ndescrpt = ndescrpt_a + ndescrpt_r; - // std::cout << "ndescrpt = " << ndescrpt << std::endl; nnei_a = sec_a.back(); nnei_r = sec_r.back(); nnei = nnei_a + nnei_r; @@ -884,32 +892,11 @@ std::vector prod_env_mat_a_cuda_forward( mem_cpy = 256; max_nnei_trial = 100; 
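+  // Note (descriptive comment): mem_cpy and mem_nnei are only initial buffer sizes; _norm_copy_coord_gpu and _build_nlist_gpu below double them on failure and retry up to max_cpy_trial / max_nnei_trial attempts before reporting an allocation error.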
mem_nnei = 256; - // std::cout << "natoms.dtype = " << natoms.dtype() << std::endl; - // std::cout << "natoms.shape = "; - // for (auto &x: natoms) - // { - // std::cout << x << std::endl; - // } - // std::cout << std::endl; - - // std::cout << << std::endl; - // std::cout << "natoms.numel = " << natoms.numel() << std::endl; - // std::cout << "ckpt 1===============" << std::endl; - // auto* natoms = natoms.data(); - // std::cout << "natoms.numel() = " << natoms.numel() << std::endl; - // std::cout << "ckpt 2===============" << std::endl; - // std::cout << natoms[0] << std::endl; + CHECK_INPUT_ON_CPU(natoms_tensor); auto natoms = natoms_tensor.data(); - int nloc = natoms[0]; // TODO: 使用natoms[0] 会段错误 - // std::cout << "nloc = " << nloc << std::endl; - // std::cout << "ckpt 3===============" << std::endl; - int nall = natoms[1]; // TODO: 使用natoms[1] 会段错误 - // std::cout << "nall = " << nloc << std::endl; - // std::cout << "ckpt 4===============" << std::endl; - // int ntypes = natoms.shape()[0] - 2; - // std::cout << "ckpt 5===============" << std::endl; + int nloc = natoms[0]; + int nall = natoms[1]; int nsamples = coord_tensor.shape()[0]; - // std::cout << "ckpt 6===============" << std::endl; int nei_mode = 0; bool b_nlist_map = false; @@ -931,394 +918,261 @@ std::vector prod_env_mat_a_cuda_forward( // create output tensors auto descrpt_tensor = paddle::empty( - {nsamples, nloc * ndescrpt}, - coord_tensor.dtype(), - coord_tensor.place() - ); - // std::cout << "descrpt_tensor.shape = "; - // for (auto &x: descrpt_tensor.shape()) - // std::cout << x << " "; - // std::cout << std::endl; - - auto descrpt_deriv_tensor = paddle::empty( - {nsamples, nloc * ndescrpt * 3}, - coord_tensor.dtype(), - coord_tensor.place() - ); - // std::cout << "descrpt_deriv_tensor.shape = "; - // for (auto &x: descrpt_deriv_tensor.shape()) - // std::cout << x << " "; - // std::cout << std::endl; - - auto rij_tensor = paddle::empty( - {nsamples, nloc * nnei * 3}, - coord_tensor.dtype(), - coord_tensor.place() - ); - // std::cout << "rij_tensor.shape = "; - // for (auto &x: rij_tensor.shape()) - // std::cout << x << " "; - // std::cout << std::endl; + {nsamples, nloc * ndescrpt}, coord_tensor.dtype(), coord_tensor.place()); + + auto descrpt_deriv_tensor = + paddle::empty({nsamples, nloc * ndescrpt * 3}, coord_tensor.dtype(), + coord_tensor.place()); + + auto rij_tensor = paddle::empty({nsamples, nloc * nnei * 3}, + coord_tensor.dtype(), coord_tensor.place()); auto nlist_tensor = paddle::empty( - {nsamples, nloc * nnei}, - coord_tensor.dtype(), - coord_tensor.place() - ); - // std::cout << "nlist_tensor.shape = "; - // for (auto &x: nlist_tensor.shape()) - // std::cout << x << " "; - // std::cout << std::endl; + {nsamples, nloc * nnei}, paddle::DataType::INT32, coord_tensor.place()); // loop over samples PD_DISPATCH_FLOATING_TYPES( - coord_tensor.type(), "prod_env_mat_a_cuda_forward_kernel", ([&] { + coord_tensor.type(), "prod_env_mat_a_cuda_forward_kernel", ([&] { prod_env_mat_a_cuda_forward_kernel( - nsamples, nloc, ndescrpt, nnei, nall, mem_cpy, mem_nnei, max_nbor_size, - nei_mode, rcut_a, rcut_r, rcut_r_smth, max_cpy_trial, max_nnei_trial, b_nlist_map, sec_a, sec_r, - gpu_inlist, nbor_list_dev, array_int, array_longlong, - descrpt_tensor.mutable_data(), - descrpt_deriv_tensor.mutable_data(), - rij_tensor.mutable_data(), - nlist_tensor.mutable_data(), + nsamples, nloc, ndescrpt, nnei, nall, mem_cpy, mem_nnei, + max_nbor_size, nei_mode, rcut_a, rcut_r, rcut_r_smth, max_cpy_trial, + max_nnei_trial, b_nlist_map, sec_a, 
sec_r, gpu_inlist, + nbor_list_dev, array_int, array_longlong, + descrpt_tensor.data(), descrpt_deriv_tensor.data(), + rij_tensor.data(), nlist_tensor.data(), coord_tensor.data(), box_tensor.copy_to(paddle::CPUPlace(), false).data(), - t_avg_tensor.data(), - t_std_tensor.data(), - atype_tensor.data(), - mesh_tensor); - })); + // box_tensor.data(), + t_avg_tensor.data(), t_std_tensor.data(), + atype_tensor.data(), mesh_tensor); + })); return {descrpt_tensor, descrpt_deriv_tensor, rij_tensor, nlist_tensor}; } template -static int -_norm_copy_coord_gpu( - std::vector* tensor_list, - FPTYPE *&coord_cpy, - int *&type_cpy, - int *&idx_mapping, - int &nall, - int &mem_cpy, - const FPTYPE *coord, - const FPTYPE *box, - const int *type, - const int &nloc, - const int &max_cpy_trial, - const float &rcut_r) -{ - // Tensor FPTYPE_temp; - std::vector FPTYPE_temp_shape{nall*3}; - paddle::Tensor tmp_coord_tensor = paddle::Tensor(paddle::PlaceType::kGPU, FPTYPE_temp_shape); - FPTYPE *tmp_coord = tmp_coord_tensor.mutable_data(paddle::PlaceType::kGPU); - tensor_list->push_back(tmp_coord_tensor); - cudaMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3, cudaMemcpyDeviceToDevice); - - deepmd::Region region; - deepmd::init_region_cpu(region, box); - FPTYPE box_info[18]; - std::copy(region.boxt, region.boxt + 9, box_info); - std::copy(region.rec_boxt, region.rec_boxt + 9, box_info + 9); - int cell_info[23]; - deepmd::compute_cell_info(cell_info, rcut_r, region); - const int loc_cellnum = cell_info[21]; - const int total_cellnum = cell_info[22]; - - //Tensor double_temp; - std::vector double_temp_shape {18}; - paddle::Tensor double_temp_tensor = paddle::Tensor(paddle::PlaceType::kGPU, double_temp_shape); - FPTYPE *box_info_dev = double_temp_tensor.mutable_data(paddle::PlaceType::kGPU); - tensor_list->push_back(double_temp_tensor); - - //Tensor int_temp; - std::vector int_temp_shape {23+nloc*3+loc_cellnum+total_cellnum*3+total_cellnum*3+loc_cellnum+1+total_cellnum+1+nloc}; - paddle::Tensor int_temp_tensor = paddle::Tensor(paddle::PlaceType::kGPU, int_temp_shape); - int *cell_info_dev = int_temp_tensor.mutable_data(paddle::PlaceType::kGPU); - int *int_data_dev = cell_info_dev + 23; - tensor_list->push_back(int_temp_tensor); - - deepmd::memcpy_host_to_device(box_info_dev, box_info, 18); - deepmd::memcpy_host_to_device(cell_info_dev, cell_info, 23); - - deepmd::Region region_dev; - FPTYPE *new_boxt = region_dev.boxt; - FPTYPE *new_rec_boxt = region_dev.rec_boxt; - region_dev.boxt = box_info_dev; - region_dev.rec_boxt = box_info_dev + 9; - - deepmd::normalize_coord_gpu(tmp_coord, nall, region_dev); - - - int tt; - paddle::Tensor cpy_temp_tensor = paddle::Tensor(paddle::PlaceType::kGPU); - paddle::Tensor t_temp_tensor = paddle::Tensor(paddle::PlaceType::kGPU); - for (tt = 0; tt < max_cpy_trial; ++tt) - { - std::vector cpy_temp_shape {mem_cpy * 3}; - std::vector t_temp_shape {mem_cpy * 2}; - cpy_temp_tensor.reshape(cpy_temp_shape); - coord_cpy = cpy_temp_tensor.mutable_data(paddle::PlaceType::kGPU); - t_temp_tensor.reshape(t_temp_shape); - type_cpy = t_temp_tensor.mutable_data(paddle::PlaceType::kGPU); - - idx_mapping = type_cpy + mem_cpy; - int ret = deepmd::copy_coord_gpu( - coord_cpy, type_cpy, idx_mapping, &nall, int_data_dev, - tmp_coord, type, nloc, mem_cpy, loc_cellnum, total_cellnum, cell_info_dev, region_dev); - if (ret == 0) - { - break; - } - else - { - mem_cpy *= 2; - } +static int _norm_copy_coord_gpu(std::vector* tensor_list, + FPTYPE*& coord_cpy, + int*& type_cpy, + int*& idx_mapping, + int& nall, + int& 
mem_cpy, + const FPTYPE* coord, + const FPTYPE* box, + const int* type, + const int& nloc, + const int& max_cpy_trial, + const float& rcut_r) { + // Tensor FPTYPE_temp; + std::vector FPTYPE_temp_shape{nall * 3}; + + + // use type trait to determine the data type + paddle::Tensor tmp_coord_tensor; + if (std::is_same::value) { + tmp_coord_tensor = paddle::empty( + FPTYPE_temp_shape, paddle::DataType::FLOAT32, paddle::GPUPlace() + ); + } else if (std::is_same::value) { + tmp_coord_tensor = paddle::empty( + FPTYPE_temp_shape, paddle::DataType::FLOAT64, paddle::GPUPlace() + ); + } else { + PD_THROW("invalid data type"); + } + FPTYPE* tmp_coord = tmp_coord_tensor.data(); + tensor_list->push_back(tmp_coord_tensor); + cudaMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3, + cudaMemcpyDeviceToDevice); + + deepmd::Region region; + deepmd::init_region_cpu(region, box); + FPTYPE box_info[18]; + std::copy(region.boxt, region.boxt + 9, box_info); + std::copy(region.rec_boxt, region.rec_boxt + 9, box_info + 9); + int cell_info[23]; + deepmd::compute_cell_info(cell_info, rcut_r, region); + const int loc_cellnum = cell_info[21]; + const int total_cellnum = cell_info[22]; + + // Tensor double_temp; + std::vector double_temp_shape{18}; + paddle::Tensor double_temp_tensor = paddle::empty( + double_temp_shape, tmp_coord_tensor.dtype(), paddle::GPUPlace()); + FPTYPE* box_info_dev = double_temp_tensor.data(); + tensor_list->push_back(double_temp_tensor); + + // Tensor int_temp; + std::vector int_temp_shape{ + 23 + nloc * 3 + loc_cellnum + total_cellnum * 3 + total_cellnum * 3 + + loc_cellnum + 1 + total_cellnum + 1 + nloc}; + paddle::Tensor int_temp_tensor = paddle::empty( + int_temp_shape, paddle::DataType::INT32, paddle::GPUPlace()); + int* cell_info_dev = int_temp_tensor.data(); + int* int_data_dev = cell_info_dev + 23; + tensor_list->push_back(int_temp_tensor); + + deepmd::memcpy_host_to_device(box_info_dev, box_info, 18); + deepmd::memcpy_host_to_device(cell_info_dev, cell_info, 23); + + deepmd::Region region_dev; + FPTYPE* new_boxt = region_dev.boxt; + FPTYPE* new_rec_boxt = region_dev.rec_boxt; + region_dev.boxt = box_info_dev; + region_dev.rec_boxt = box_info_dev + 9; + + deepmd::normalize_coord_gpu(tmp_coord, nall, region_dev); + + int tt; + paddle::Tensor cpy_temp_tensor = paddle::Tensor(paddle::PlaceType::kGPU); + paddle::Tensor t_temp_tensor = paddle::Tensor(paddle::PlaceType::kGPU); + for (tt = 0; tt < max_cpy_trial; ++tt) { + std::vector cpy_temp_shape{mem_cpy * 3}; + std::vector t_temp_shape{mem_cpy * 2}; + cpy_temp_tensor.reshape(cpy_temp_shape); + coord_cpy = cpy_temp_tensor.mutable_data(paddle::PlaceType::kGPU); + t_temp_tensor.reshape(t_temp_shape); + type_cpy = t_temp_tensor.mutable_data(paddle::PlaceType::kGPU); + + idx_mapping = type_cpy + mem_cpy; + int ret = deepmd::copy_coord_gpu( + coord_cpy, type_cpy, idx_mapping, &nall, int_data_dev, tmp_coord, type, + nloc, mem_cpy, loc_cellnum, total_cellnum, cell_info_dev, region_dev); + if (ret == 0) { + break; + } else { + mem_cpy *= 2; } - tensor_list->push_back(cpy_temp_tensor); - tensor_list->push_back(t_temp_tensor); - region_dev.boxt = new_boxt; - region_dev.rec_boxt = new_rec_boxt; + } + tensor_list->push_back(cpy_temp_tensor); + tensor_list->push_back(t_temp_tensor); + region_dev.boxt = new_boxt; + region_dev.rec_boxt = new_rec_boxt; - return (tt != max_cpy_trial); + return (tt != max_cpy_trial); } template -static int -_build_nlist_gpu( - std::vector *tensor_list, - int *&ilist, - int *&numneigh, - int **&firstneigh, - int *&jlist, - int 
&max_nnei, - int &mem_nnei, - const FPTYPE *coord, - const int &nloc, - const int &new_nall, - const int &max_nnei_trial, - const float &rcut_r) -{ - //Tensor nlist_temp; - std::vector nlist_temp_shape {nloc * 2}; - paddle::Tensor nlist_temp_tensor = paddle::Tensor(paddle::PlaceType::kGPU, nlist_temp_shape); - ilist = nlist_temp_tensor.mutable_data(paddle::PlaceType::kGPU); - tensor_list->push_back(nlist_temp_tensor); - numneigh = ilist + nloc; - //Tensor jlist_temp; - int *ind_data = NULL; - - std::vector firstneigh_host(nloc); - int tt; - paddle::Tensor jlist_temp_tensor = paddle::Tensor(paddle::PlaceType::kGPU); - for (tt = 0; tt < max_nnei_trial; ++tt) - { - std::vector jlist_temp_shape {3 * nloc * mem_nnei}; - jlist_temp_tensor.reshape(jlist_temp_shape); - jlist = jlist_temp_tensor.mutable_data(paddle::PlaceType::kGPU); - ind_data = jlist + nloc * mem_nnei; - for (int ii = 0; ii < nloc; ++ii) - { - firstneigh_host[ii] = jlist + ii * mem_nnei; - } - deepmd::memcpy_host_to_device(firstneigh, firstneigh_host); - deepmd::InputNlist inlist(nloc, ilist, numneigh, firstneigh); - int ret = deepmd::build_nlist_gpu( - inlist, &max_nnei, ind_data, - coord, nloc, new_nall, mem_nnei, rcut_r); - if (ret == 0) - { - break; - } - else - { - mem_nnei *= 2; - } +static int _build_nlist_gpu(std::vector* tensor_list, + int*& ilist, + int*& numneigh, + int**& firstneigh, + int*& jlist, + int& max_nnei, + int& mem_nnei, + const FPTYPE* coord, + const int& nloc, + const int& new_nall, + const int& max_nnei_trial, + const float& rcut_r) { + // Tensor nlist_temp; + std::vector nlist_temp_shape{nloc * 2}; + paddle::Tensor nlist_temp_tensor = paddle::empty( + nlist_temp_shape, paddle::DataType::INT32, paddle::GPUPlace()); + ilist = nlist_temp_tensor.data(); + tensor_list->push_back(nlist_temp_tensor); + numneigh = ilist + nloc; + // Tensor jlist_temp; + int* ind_data = NULL; + + std::vector firstneigh_host(nloc); + int tt; + paddle::Tensor jlist_temp_tensor = paddle::Tensor(paddle::PlaceType::kGPU); + for (tt = 0; tt < max_nnei_trial; ++tt) { + std::vector jlist_temp_shape{3 * nloc * mem_nnei}; + jlist_temp_tensor.reshape(jlist_temp_shape); + jlist = jlist_temp_tensor.mutable_data(paddle::PlaceType::kGPU); + ind_data = jlist + nloc * mem_nnei; + for (int ii = 0; ii < nloc; ++ii) { + firstneigh_host[ii] = jlist + ii * mem_nnei; + } + deepmd::memcpy_host_to_device(firstneigh, firstneigh_host); + deepmd::InputNlist inlist(nloc, ilist, numneigh, firstneigh); + int ret = deepmd::build_nlist_gpu(inlist, &max_nnei, ind_data, coord, nloc, + new_nall, mem_nnei, rcut_r); + if (ret == 0) { + break; + } else { + mem_nnei *= 2; } - tensor_list->push_back(jlist_temp_tensor); - return (tt != max_nnei_trial); + } + tensor_list->push_back(jlist_temp_tensor); + return (tt != max_nnei_trial); } -static void -_map_nlist_gpu( - int *nlist, - const int *idx_mapping, - const int &nloc, - const int &nnei) -{ - deepmd::use_nlist_map(nlist, idx_mapping, nloc, nnei); +static void _map_nlist_gpu(int* nlist, + const int* idx_mapping, + const int& nloc, + const int& nnei) { + deepmd::use_nlist_map(nlist, idx_mapping, nloc, nnei); } template -static void -_prepare_coord_nlist_gpu( - std::vector *tensor_list, - FPTYPE const **coord, - FPTYPE *&coord_cpy, - int const **type, - int *&type_cpy, - int *&idx_mapping, - deepmd::InputNlist &inlist, - int *&ilist, - int *&numneigh, - int **&firstneigh, - int *&jlist, - int *&nbor_list_dev, - int &new_nall, - int &mem_cpy, - int &mem_nnei, - int &max_nbor_size, - const FPTYPE *box, - const int 
*mesh_tensor_data, - const int mesh_tensor_size, - const int &nloc, - const int &nei_mode, - const float &rcut_r, - const int &max_cpy_trial, - const int &max_nnei_trial) -{ - inlist.inum = nloc; - if (nei_mode != 3) - { - // build nlist by myself - // normalize and copy coord - if (nei_mode == 1) - { - int copy_ok = _norm_copy_coord_gpu( - tensor_list, coord_cpy, type_cpy, idx_mapping, new_nall, mem_cpy, - *coord, box, *type, nloc, max_cpy_trial, rcut_r); - PD_CHECK(copy_ok, "cannot allocate mem for copied coords"); - *coord = coord_cpy; - *type = type_cpy; - - } - - //build nlist - int build_ok = _build_nlist_gpu( - tensor_list, ilist, numneigh, firstneigh, jlist, max_nbor_size, mem_nnei, - *coord, nloc, new_nall, max_nnei_trial, rcut_r); - PD_CHECK(build_ok, "cannot allocate mem for nlist"); - if (max_nbor_size <= 1024) - { - max_nbor_size = 1024; - } - else if (max_nbor_size <= 2048) - { - max_nbor_size = 2048; - } - else - { - max_nbor_size = 4096; - } - inlist.ilist = ilist; - inlist.numneigh = numneigh; - inlist.firstneigh = firstneigh; +static void _prepare_coord_nlist_gpu(std::vector* tensor_list, + FPTYPE const** coord, + FPTYPE*& coord_cpy, + int const** type, + int*& type_cpy, + int*& idx_mapping, + deepmd::InputNlist& inlist, + int*& ilist, + int*& numneigh, + int**& firstneigh, + int*& jlist, + int*& nbor_list_dev, + int& new_nall, + int& mem_cpy, + int& mem_nnei, + int& max_nbor_size, + const FPTYPE* box, + const int* mesh_tensor_data, + const int mesh_tensor_size, + const int& nloc, + const int& nei_mode, + const float& rcut_r, + const int& max_cpy_trial, + const int& max_nnei_trial) { + inlist.inum = nloc; + if (nei_mode != 3) { + // build nlist by myself + // normalize and copy coord + if (nei_mode == 1) { + int copy_ok = _norm_copy_coord_gpu( + tensor_list, coord_cpy, type_cpy, idx_mapping, new_nall, mem_cpy, + *coord, box, *type, nloc, max_cpy_trial, rcut_r); + PD_CHECK(copy_ok, "cannot allocate mem for copied coords"); + *coord = coord_cpy; + *type = type_cpy; } - else - { - // update nbor list - deepmd::InputNlist inlist_temp; - inlist_temp.inum = nloc; - deepmd::env_mat_nbor_update( - inlist_temp, inlist, max_nbor_size, nbor_list_dev, - mesh_tensor_data, mesh_tensor_size); - // env_mat_nbor_update( - // inlist_temp, inlist, max_nbor_size, nbor_list_dev, - // mesh_tensor_data, mesh_tensor_size); - PD_CHECK((max_numneigh(inlist_temp) <= GPU_MAX_NBOR_SIZE), "Assert failed, max neighbor size of atom(lammps) " + std::to_string(max_numneigh(inlist_temp)) + " is larger than " + std::to_string(GPU_MAX_NBOR_SIZE) + ", which currently is not supported by deepmd-kit."); - } -} - -std::vector ProdEnvMatAForward( - const paddle::Tensor& coord_tensor, - const paddle::Tensor& atype_tensor, - const paddle::Tensor& mesh_tensor, - const paddle::Tensor& box_tensor, - const paddle::Tensor& t_avg_tensor, - const paddle::Tensor& t_std_tensor, - const paddle::Tensor& natoms_tensor, - float rcut_a, - float rcut_r, - float rcut_r_smth, - std::vector sel_a, - std::vector sel_r -) { - if (coord_tensor.is_gpu()) { - return prod_env_mat_a_cuda_forward( - coord_tensor, - atype_tensor, - mesh_tensor, - box_tensor, - t_avg_tensor, - t_std_tensor, - natoms_tensor, - rcut_a, - rcut_r, - rcut_r_smth, - sel_a, - sel_r - ); + // build nlist + int build_ok = _build_nlist_gpu(tensor_list, ilist, numneigh, firstneigh, + jlist, max_nbor_size, mem_nnei, *coord, + nloc, new_nall, max_nnei_trial, rcut_r); + PD_CHECK(build_ok, "cannot allocate mem for nlist"); + if (max_nbor_size <= 1024) { + max_nbor_size = 
1024; + } else if (max_nbor_size <= 2048) { + max_nbor_size = 2048; + } else { + max_nbor_size = 4096; + } + inlist.ilist = ilist; + inlist.numneigh = numneigh; + inlist.firstneigh = firstneigh; } else { - PD_THROW("Unsupported device type for forward function of custom relu operator."); + // update nbor list + deepmd::InputNlist inlist_temp; + inlist_temp.inum = nloc; + deepmd::env_mat_nbor_update(inlist_temp, inlist, max_nbor_size, + nbor_list_dev, mesh_tensor_data, + mesh_tensor_size); + // env_mat_nbor_update( + // inlist_temp, inlist, max_nbor_size, nbor_list_dev, + // mesh_tensor_data, mesh_tensor_size); + PD_CHECK((max_numneigh(inlist_temp) <= GPU_MAX_NBOR_SIZE), + "Assert failed, max neighbor size of atom(lammps) " + + std::to_string(max_numneigh(inlist_temp)) + + " is larger than " + std::to_string(GPU_MAX_NBOR_SIZE) + + ", which currently is not supported by deepmd-kit."); } } - - -std::vector> ProdEnvMatAInferShape( - std::vector coord_shape, - std::vector atype_shape, - std::vector box_shape, - std::vector mesh_shape, - std::vector t_avg_shape, - std::vector t_std_shape, - std::vector natoms_shape, - float rcut_a, - float rcut_r, - float rcut_r_smth, - const std::vector& sel_a, - const std::vector& sel_r -) { - int64_t nloc = /*natoms[0]*/ 192; - int64_t nall = /*natoms[1]*/ 192; - - std::vector sec_a; - std::vector sec_r; - cum_sum(sec_a, sel_a); - cum_sum(sec_r, sel_r); - - int64_t nsamples = coord_shape[0]; - int64_t ndescrpt_a = sec_a.back() * 4; - int64_t ndescrpt_r = sec_r.back() * 1; - int64_t ndescrpt = ndescrpt_a + ndescrpt_r; - - int64_t nnei_a = sec_a.back(); - int64_t nnei_r = sec_r.back(); - int64_t nnei = nnei_a + nnei_r; - - std::vector descrpt_shape = {nsamples, nloc * ndescrpt}; - std::vector descrpt_deriv_shape = {nsamples, nloc * ndescrpt * 3}; - std::vector rij_shape = {nsamples, nloc * nnei * 3}; - std::vector nlist_shape = {nsamples, nloc * nnei}; - return {descrpt_shape, descrpt_deriv_shape, rij_shape, nlist_shape}; -} - -std::vector ProdEnvMatAInferDtype( - paddle::DataType coord_dtype, - paddle::DataType atype_dtype, - paddle::DataType box_dtype, - paddle::DataType mesh_dtype, - paddle::DataType t_avg_dtype, - paddle::DataType t_std_dtype, - paddle::DataType natoms_dtype -) { - return {coord_dtype, coord_dtype, coord_dtype, coord_dtype}; -} - - -PD_BUILD_OP(prod_env_mat_a) - .Inputs({"coord", "atype", "box", "mesh", "t_avg", "t_std", "natoms"}) - .Outputs({"descrpt", "descrpt_deriv", "rij", "nlist"}) - .Attrs({"rcut_a: float", "rcut_r: float", "rcut_r_smth: float", "sel_a: std::vector", "sel_r: std::vector"}) - .SetKernelFn(PD_KERNEL(ProdEnvMatAForward)) - .SetInferShapeFn(PD_INFER_SHAPE(ProdEnvMatAInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(ProdEnvMatAInferDtype)); \ No newline at end of file diff --git a/source/lib/paddle_src/paddle_prod_force.cc b/source/lib/paddle_src/paddle_prod_force.cc new file mode 100644 index 0000000000..95c6671984 --- /dev/null +++ b/source/lib/paddle_src/paddle_prod_force.cc @@ -0,0 +1,142 @@ +#include "paddle/extension.h" +#include "prod_force.h" + +#define CHECK_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") +#define CHECK_INPUT_DIM(x, value) \ + PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".") + +template +void ProdForceSeAOpForwardCPUKernel(int nloc, + int nall, + int nframes, + int ndescrpt, + int nnei, + data_t* p_force, + const data_t* p_net_deriv, + const data_t* p_in_deriv, + const int* p_nlist) { + for (int kk = 0; kk < nframes; ++kk) { + data_t* force = p_force + kk * nall * 3; 
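+    // Note (descriptive comment): inputs and outputs are packed frame by frame; frame kk owns nall*3 force values, nloc*ndescrpt net_deriv values, nloc*ndescrpt*3 in_deriv values and nloc*nnei nlist entries, so each pointer below is offset by kk times the corresponding stride.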
+ const data_t* net_deriv = p_net_deriv + kk * nloc * ndescrpt; + const data_t* in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3; + const int* nlist = p_nlist + kk * nloc * nnei; + deepmd::prod_force_a_cpu(force, net_deriv, in_deriv, nlist, nloc, nall, + nnei, 0); + } +} + +std::vector ProdForceSeAOpCPUForward( + const paddle::Tensor& net_deriv_tensor, + const paddle::Tensor& in_deriv_tensor, + const paddle::Tensor& nlist_tensor, + const paddle::Tensor& natoms_tensor, + int n_a_sel, + int n_r_sel) { + CHECK_INPUT(net_deriv_tensor); + CHECK_INPUT(in_deriv_tensor); + CHECK_INPUT(nlist_tensor); + CHECK_INPUT(natoms_tensor); + + CHECK_INPUT_DIM(net_deriv_tensor, 2); + CHECK_INPUT_DIM(in_deriv_tensor, 2); + CHECK_INPUT_DIM(nlist_tensor, 2); + CHECK_INPUT_DIM(natoms_tensor, 1); + + PD_CHECK(natoms_tensor.shape()[0] >= 3, + "number of atoms should be larger than (or equal to) 3"); + const int* natoms = natoms_tensor.data(); + int nloc = natoms[0]; + int nall = natoms[1]; + int nframes = net_deriv_tensor.shape()[0]; + int ndescrpt = net_deriv_tensor.shape()[1] / nloc; + int nnei = nlist_tensor.shape()[1] / nloc; + + PD_CHECK(nframes == in_deriv_tensor.shape()[0], + "number of samples should match"); + PD_CHECK(nframes == nlist_tensor.shape()[0], + "number of samples should match"); + PD_CHECK(nloc * ndescrpt * 3 == in_deriv_tensor.shape()[1], + "number of descriptors should match"); + + std::vector force_shape{nframes, 3 * nall}; + paddle::Tensor force_tensor = paddle::empty( + force_shape, net_deriv_tensor.dtype(), net_deriv_tensor.place()); + + assert(nframes == force_shape[0]); + assert(nframes == net_deriv_tensor.shape()[0]); + assert(nframes == in_deriv_tensor.shape()[0]); + assert(nframes == nlist_tensor.shape()[0]); + assert(nall * 3 == force_shape[1]); + assert(nloc * ndescrpt == net_deriv_tensor.shape()[1]); + assert(nloc * ndescrpt * 3 == in_deriv_tensor.shape()[1]); + assert(nloc * nnei == nlist_tensor.shape()[1]); + assert(nnei * 4 == ndescrpt); + + PD_DISPATCH_FLOATING_TYPES( + net_deriv_tensor.type(), "prod_force_se_a_cpu_forward_kernel", ([&] { + ProdForceSeAOpForwardCPUKernel( + nloc, nall, nframes, ndescrpt, nnei, + force_tensor.mutable_data(), + net_deriv_tensor.data(), in_deriv_tensor.data(), + nlist_tensor.data()); + })); + + return {force_tensor}; +} + +std::vector ProdForceSeAOpCUDAForward( + const paddle::Tensor& net_deriv_tensor, + const paddle::Tensor& in_deriv_tensor, + const paddle::Tensor& nlist_tensor, + const paddle::Tensor& natoms_tensor, + int n_a_sel, + int n_r_sel); + +std::vector ProdForceSeAForward( + const paddle::Tensor& net_deriv_tensor, + const paddle::Tensor& in_deriv_tensor, + const paddle::Tensor& nlist_tensor, + const paddle::Tensor& natoms_tensor, + int n_a_sel, + int n_r_sel) { + if (net_deriv_tensor.place() == paddle::GPUPlace()) { + return ProdForceSeAOpCUDAForward(net_deriv_tensor, in_deriv_tensor, + nlist_tensor, natoms_tensor.copy_to(paddle::CPUPlace(), false), n_a_sel, + n_r_sel); + } else if (net_deriv_tensor.place() == paddle::CPUPlace()) { + return ProdForceSeAOpCPUForward(net_deriv_tensor, in_deriv_tensor, + nlist_tensor, natoms_tensor, n_a_sel, + n_r_sel); + } else { + PD_THROW("No Such kernel for ProdForceSeAForward."); + } +} + +std::vector> ProdForceSeAInferShape( + std::vector net_deriv_shape, + std::vector in_deriv_shape, + std::vector nlist_shape, + std::vector natoms_shape, + const int& n_a_sel, + const int& n_r_sel) { + int64_t nall = /*natoms[1]*/ 192; + int64_t nframes = net_deriv_shape[0]; + std::vector force_shape = {nframes, 3 * 
nall}; + return {force_shape}; +} + +std::vector ProdForceSeAInferDtype( + paddle::DataType net_deriv_dtype, + paddle::DataType in_deriv_dtype, + paddle::DataType nlist_dtype, + paddle::DataType natoms_dtype) { + return {net_deriv_dtype}; +} + +PD_BUILD_OP(prod_force_se_a) + .Inputs({"net_deriv", "in_deriv", "nlist", "natoms"}) + .Outputs({"force"}) + .Attrs({"n_a_sel: int", "n_r_sel: int"}) + .SetKernelFn(PD_KERNEL(ProdForceSeAForward)) + .SetInferShapeFn(PD_INFER_SHAPE(ProdForceSeAInferShape)) + .SetInferDtypeFn(PD_INFER_DTYPE(ProdForceSeAInferDtype)); diff --git a/source/lib/paddle_src/prod_force.cu b/source/lib/paddle_src/paddle_prod_force.cu similarity index 65% rename from source/lib/paddle_src/prod_force.cu rename to source/lib/paddle_src/paddle_prod_force.cu index 4416cef082..a767ce6f3f 100644 --- a/source/lib/paddle_src/prod_force.cu +++ b/source/lib/paddle_src/paddle_prod_force.cu @@ -1,11 +1,12 @@ -#include "paddle/extension.h" - #include "device.h" -#include "prod_force.h" #include "gpu_cuda.h" +#include "paddle/extension.h" +#include "prod_force.h" #define CHECK_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") -#define CHECK_INPUT_DIM(x, value) PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".") +#define CHECK_INPUT_ON_CPU(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") +#define CHECK_INPUT_DIM(x, value) \ + PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".") template __global__ void force_deriv_wrt_center_atom(FPTYPE* force, @@ -180,33 +181,33 @@ template void prod_force_r_gpu_cuda(double* force, const int nnei); } // namespace deepmd - template -void PdProdForceSeAOpForwardCUDAKernel( - int nloc, int nall, int nframes, int ndescrpt, int nnei, - data_t* p_force, const data_t* p_net_deriv, const data_t* p_in_deriv, const int* p_nlist -) { - for(int kk = 0; kk < nframes; ++kk){ - data_t * force = p_force + kk * nall * 3; - const data_t * net_deriv = p_net_deriv + kk * nloc * ndescrpt; - const data_t * in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3; - const int * nlist = p_nlist + kk * nloc * nnei; - deepmd::prod_force_a_gpu_cuda( - force, - net_deriv, in_deriv, nlist, nloc, nall, nnei - ); +void ProdForceSeAOpForwardCUDAKernel(int nloc, + int nall, + int nframes, + int ndescrpt, + int nnei, + data_t* p_force, + const data_t* p_net_deriv, + const data_t* p_in_deriv, + const int* p_nlist) { + for (int kk = 0; kk < nframes; ++kk) { + data_t* force = p_force + kk * nall * 3; + const data_t* net_deriv = p_net_deriv + kk * nloc * ndescrpt; + const data_t* in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3; + const int* nlist = p_nlist + kk * nloc * nnei; + deepmd::prod_force_a_gpu_cuda(force, net_deriv, in_deriv, nlist, nloc, nall, + nnei); } } - -std::vector PdProdForceSeAOpCUDAForward( - const paddle::Tensor& net_deriv_tensor, - const paddle::Tensor& in_deriv_tensor, - const paddle::Tensor& nlist_tensor, - const paddle::Tensor& natoms_tensor, - int n_a_sel, - int n_r_sel -) { +std::vector ProdForceSeAOpCUDAForward( + const paddle::Tensor& net_deriv_tensor, + const paddle::Tensor& in_deriv_tensor, + const paddle::Tensor& nlist_tensor, + const paddle::Tensor& natoms_tensor, + int n_a_sel, + int n_r_sel) { CHECK_INPUT(net_deriv_tensor); CHECK_INPUT(in_deriv_tensor); CHECK_INPUT(nlist_tensor); @@ -215,7 +216,10 @@ std::vector PdProdForceSeAOpCUDAForward( CHECK_INPUT_DIM(in_deriv_tensor, 2); CHECK_INPUT_DIM(natoms_tensor, 1); - PD_CHECK(natoms_tensor.shape()[0] >= 3, "number of atoms should be larger than (or equal to) 3"); + 
CHECK_INPUT_ON_CPU(natoms_tensor); + PD_CHECK(natoms_tensor.shape()[0] >= 3, + "number of atoms should be larger than (or equal to) 3"); + const int* natoms = natoms_tensor.data(); int nloc = natoms[0]; int nall = natoms[1]; @@ -223,81 +227,34 @@ std::vector PdProdForceSeAOpCUDAForward( int ndescrpt = net_deriv_tensor.shape()[1] / nloc; int nnei = nlist_tensor.shape()[1] / nloc; - PD_CHECK(nframes == in_deriv_tensor.shape()[0], "number of samples should match"); - PD_CHECK(nframes == nlist_tensor.shape()[0],"number of samples should match"); - PD_CHECK(nloc * ndescrpt * 3 == in_deriv_tensor.shape()[1], "number of descriptors should match"); + PD_CHECK(nframes == in_deriv_tensor.shape()[0], + "number of samples should match"); + PD_CHECK(nframes == nlist_tensor.shape()[0], + "number of samples should match"); + PD_CHECK(nloc * ndescrpt * 3 == in_deriv_tensor.shape()[1], + "number of descriptors should match"); - std::vector force_shape {nframes, 3 * nall}; - paddle::Tensor force_tensor = paddle::Tensor(paddle::PlaceType::kGPU, force_shape); + std::vector force_shape{nframes, 3 * nall}; + paddle::Tensor force_tensor = paddle::empty( + force_shape, net_deriv_tensor.dtype(), net_deriv_tensor.place()); - assert (nframes == force_shape[0]); - assert (nframes == net_deriv_tensor.shape()[0]); - assert (nframes == in_deriv_tensor.shape()[0]); - assert (nframes == nlist_tensor.shape()[0]); - assert (nall * 3 == force_shape[1]); - assert (nloc * ndescrpt == net_deriv_tensor.shape()[1]); - assert (nloc * ndescrpt * 3 == in_deriv_tensor.shape()[1]); - assert (nloc * nnei == nlist_tensor.shape()[1]); - assert (nnei * 4 == ndescrpt); + assert(nframes == force_shape[0]); + assert(nframes == net_deriv_tensor.shape()[0]); + assert(nframes == in_deriv_tensor.shape()[0]); + assert(nframes == nlist_tensor.shape()[0]); + assert(nall * 3 == force_shape[1]); + assert(nloc * ndescrpt == net_deriv_tensor.shape()[1]); + assert(nloc * ndescrpt * 3 == in_deriv_tensor.shape()[1]); + assert(nloc * nnei == nlist_tensor.shape()[1]); + assert(nnei * 4 == ndescrpt); PD_DISPATCH_FLOATING_TYPES( - net_deriv_tensor.type(), "pd_prod_force_se_a_cpu_forward_kernel", ([&] { - PdProdForceSeAOpForwardCUDAKernel( - nloc, nall, nframes, ndescrpt, nnei, - force_tensor.mutable_data(), net_deriv_tensor.data(), - in_deriv_tensor.data(), nlist_tensor.data()); - })); + net_deriv_tensor.type(), "prod_force_se_a_gpu_forward_kernel", ([&] { + ProdForceSeAOpForwardCUDAKernel( + nloc, nall, nframes, ndescrpt, nnei, force_tensor.data(), + net_deriv_tensor.data(), in_deriv_tensor.data(), + nlist_tensor.data()); + })); return {force_tensor}; } - - -std::vector PdProdForceSeAForward( - const paddle::Tensor& net_deriv_tensor, - const paddle::Tensor& in_deriv_tensor, - const paddle::Tensor& nlist_tensor, - const paddle::Tensor& natoms_tensor, - int n_a_sel, - int n_r_sel -) { - // if(net_deriv_tensor.place() == paddle::PlaceType::kCPU){ - // return PdProdForceSeAOpCPUForward(net_deriv_tensor, in_deriv_tensor, nlist_tensor, natoms_tensor, n_a_sel, n_r_sel); - // }else if(net_deriv_tensor.place() == paddle::PlaceType::kGPU){ - return PdProdForceSeAOpCUDAForward(net_deriv_tensor, in_deriv_tensor, nlist_tensor, natoms_tensor, n_a_sel, n_r_sel); - // }else{ - // PD_THROW("No Such kernel for PdFrodForceSeAForward!"); - // } -} - -std::vector> PdProdForceSeAInferShape( - std::vector net_deriv_shape, - std::vector in_deriv_shape, - std::vector nlist_shape, - std::vector natoms_shape, - const int &n_a_sel, - const int &n_r_sel -) { - // int64_t nloc = 
/*natoms[0]*/ 192; - int64_t nall = /*natoms[1]*/ 192; - int64_t nframes = net_deriv_shape[0]; - std::vector force_shape = {nframes, 3 * nall}; - return {force_shape}; -} - -std::vector PdProdForceSeAInferDtype( - paddle::DataType net_deriv_dtype, - paddle::DataType in_deriv_dtype, - paddle::DataType nlist_dtype, - paddle::DataType natoms_dtype -) { - return {net_deriv_dtype}; -} - - -PD_BUILD_OP(prod_force_se_a) - .Inputs({"net_deriv", "in_deriv", "nlist", "natoms"}) - .Outputs({"force"}) - .Attrs({"n_a_sel: int", "n_r_sel: int"}) - .SetKernelFn(PD_KERNEL(PdProdForceSeAForward)) - .SetInferShapeFn(PD_INFER_SHAPE(PdProdForceSeAInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(PdProdForceSeAInferDtype)); diff --git a/source/lib/paddle_src/paddle_prod_force_grad.cc b/source/lib/paddle_src/paddle_prod_force_grad.cc new file mode 100644 index 0000000000..f12c2c425d --- /dev/null +++ b/source/lib/paddle_src/paddle_prod_force_grad.cc @@ -0,0 +1,141 @@ +#include "paddle/extension.h" +#include "prod_force_grad.h" + +#define CHECK_INPUT_READY(x) \ + PD_CHECK(x.initialized(), #x " must be initialized before usage.") +#define CHECK_INPUT_DIM(x, value) \ + PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".") +#define CHECK_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") + +template +void ProdForceSeAOpCPUBackwardKernel(int nloc, + int nframes, + int ndescrpt, + int nnei, + const data_t* grad, + const data_t* net_deriv, + const data_t* in_deriv, + const int* nlist, + data_t* grad_net) { + // #pragma omp parallel for + // for (int kk = 0; kk < nframes; ++kk){ + // int grad_iter = kk * nloc * 3; + // int in_iter = kk * nloc * ndescrpt * 3; + // int nlist_iter = kk * nloc * nnei; + // int grad_net_iter = kk * nloc * ndescrpt; + + // deepmd::prod_force_grad_a_cpu( + // &grad_net[grad_net_iter], + // &grad[grad_iter], + // &in_deriv[in_iter], + // &nlist[nlist_iter], + // nloc, + // nnei + // ); + // } + + for (int kk = 0; kk < nframes; ++kk) { + data_t* p_grad_net = grad_net + kk * nloc * ndescrpt; + const data_t* p_grad = grad + kk * nloc * 3; + const data_t* p_in_deriv = in_deriv + kk * nloc * ndescrpt * 3; + const int* p_nlist = nlist + kk * nloc * nnei; + + deepmd::prod_force_grad_a_cpu(p_grad_net, p_grad, p_in_deriv, p_nlist, nloc, + nnei); + } +} + +std::vector ProdForceSeAOpCPUBackward( + const paddle::Tensor& grad_tensor, + const paddle::Tensor& net_deriv_tensor, + const paddle::Tensor& in_deriv_tensor, + const paddle::Tensor& nlist_tensor, + const paddle::Tensor& natoms_tensor, + int n_a_sel, + int n_r_sel) { + CHECK_INPUT_READY(grad_tensor); + CHECK_INPUT_READY(net_deriv_tensor); + CHECK_INPUT_READY(in_deriv_tensor); + CHECK_INPUT_READY(nlist_tensor); + CHECK_INPUT_READY(natoms_tensor); + + auto grad_shape = grad_tensor.shape(); + auto net_deriv_shape = net_deriv_tensor.shape(); + auto in_deriv_shape = in_deriv_tensor.shape(); + auto nlist_shape = nlist_tensor.shape(); + auto natoms_shape = natoms_tensor.shape(); + + CHECK_INPUT_DIM(grad_tensor, 2); + CHECK_INPUT_DIM(net_deriv_tensor, 2); + CHECK_INPUT_DIM(in_deriv_tensor, 2); + CHECK_INPUT_DIM(nlist_tensor, 2); + CHECK_INPUT_DIM(natoms_tensor, 1); + + PD_CHECK(natoms_shape[0] >= 3, + "number of atoms should be larger than (or equal to) 3"); + const int* natoms = natoms_tensor.data(); + + int nframes = net_deriv_shape[0]; + int nloc = natoms[0]; + int ndescrpt = net_deriv_shape[1] / nloc; + int nnei = nlist_shape[1] / nloc; + + PD_CHECK(nframes == grad_shape[0], "number of frames should match"); + PD_CHECK(nframes == 
in_deriv_shape[0], "number of samples should match"); + PD_CHECK(nframes == nlist_shape[0], "number of samples should match"); + PD_CHECK((nloc * 3) == grad_shape[1], + "input grad shape should be 3 x natoms"); + PD_CHECK(nloc * ndescrpt * 3 == in_deriv_shape[1], + "number of descriptors should match"); + PD_CHECK(nnei == (n_a_sel + n_r_sel), "number of neighbors should match"); + + std::vector grad_net_shape{nframes, (int64_t)nloc * ndescrpt}; + + paddle::Tensor grad_net_tensor = + paddle::empty(grad_net_shape, grad_tensor.dtype(), grad_tensor.place()); + + PD_DISPATCH_FLOATING_TYPES( + grad_tensor.type(), "prod_force_se_a_cpu_backward_kernel", ([&] { + ProdForceSeAOpCPUBackwardKernel( + nloc, nframes, ndescrpt, nnei, grad_tensor.data(), + net_deriv_tensor.data(), in_deriv_tensor.data(), + nlist_tensor.data(), grad_net_tensor.data()); + })); + return {grad_net_tensor}; +} + +std::vector ProdForceSeAOpCUDABackward( + const paddle::Tensor& force_grad_tensor, + const paddle::Tensor& net_deriv_tensor, + const paddle::Tensor& in_deriv_tensor, + const paddle::Tensor& nlist_tensor, + const paddle::Tensor& natoms_tensor, + int n_a_sel, + int n_r_sel); + +std::vector ProdForceSeABackward( + const paddle::Tensor& force_grad_tensor, + const paddle::Tensor& net_deriv_tensor, + const paddle::Tensor& in_deriv_tensor, + const paddle::Tensor& nlist_tensor, + const paddle::Tensor& natoms_tensor, + int n_a_sel, + int n_r_sel) { + if (net_deriv_tensor.place() == paddle::GPUPlace()) { + return ProdForceSeAOpCUDABackward(force_grad_tensor, net_deriv_tensor, + in_deriv_tensor, nlist_tensor, + natoms_tensor.copy_to(paddle::CPUPlace(), false), n_a_sel, n_r_sel); + } else if (net_deriv_tensor.place() == paddle::CPUPlace()) { + return ProdForceSeAOpCPUBackward(force_grad_tensor, net_deriv_tensor, + in_deriv_tensor, nlist_tensor, + natoms_tensor, n_a_sel, n_r_sel); + } else { + PD_THROW("No Such kernel for ProdForceSeABackward."); + } +} + +PD_BUILD_GRAD_OP(prod_force_se_a) + .Inputs({paddle::Grad("force"), "net_deriv", "in_deriv", "nlist", "natoms"}) + .Outputs({paddle::Grad("net_deriv")}) + .Attrs({"n_a_sel: int", "n_r_sel: int"}) + .SetKernelFn(PD_KERNEL(ProdForceSeABackward)); diff --git a/source/lib/paddle_src/prod_force_grad.cu b/source/lib/paddle_src/paddle_prod_force_grad.cu similarity index 59% rename from source/lib/paddle_src/prod_force_grad.cu rename to source/lib/paddle_src/paddle_prod_force_grad.cu index a1dad3dc3c..0504f115e6 100644 --- a/source/lib/paddle_src/prod_force_grad.cu +++ b/source/lib/paddle_src/paddle_prod_force_grad.cu @@ -1,11 +1,12 @@ -#include "paddle/extension.h" - #include "device.h" -#include "prod_force_grad.h" #include "gpu_cuda.h" +#include "paddle/extension.h" +#include "prod_force_grad.h" -#define CHECK_INPUT_DIM(x, value) PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".") +#define CHECK_INPUT_DIM(x, value) \ + PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".") #define CHECK_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") +#define CHECK_INPUT_ON_CPU(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") template __device__ inline FPTYPE dev_dot(const FPTYPE* arr1, const FPTYPE* arr2) { @@ -158,118 +159,100 @@ template void prod_force_grad_r_gpu_cuda(double* grad_net, } // namespace deepmd template -void PdProdForceSeAOpCUDABackwardKernel( - int nloc, int nframes, int ndescrpt, int nnei, - const data_t* p_grad, const data_t* p_net_deriv, const data_t* p_in_deriv, - const int* p_nlist, data_t* p_grad_net -) { - for (int_64 kk = 0; 
kk < nframes; ++kk) { - data_t* grad_net = p_grad_net + kk * nloc * ndescrpt; - const data_t* grad = p_grad + kk * nloc * 3; - const data_t* in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3; - const int* nlist = p_nlist + kk * nloc * nnei; - deepmd::prod_force_grad_a_gpu_cuda( - grad_net, grad, in_deriv, nlist, nloc, nnei - ); - } +void ProdForceSeAOpCUDABackwardKernel(int nloc, + int nframes, + int ndescrpt, + int nnei, + const data_t* p_grad, + const data_t* p_net_deriv, + const data_t* p_in_deriv, + const int* p_nlist, + data_t* p_grad_net) { + for (int_64 kk = 0; kk < nframes; ++kk) { + data_t* grad_net = p_grad_net + kk * nloc * ndescrpt; + const data_t* grad = p_grad + kk * nloc * 3; + const data_t* in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3; + const int* nlist = p_nlist + kk * nloc * nnei; + deepmd::prod_force_grad_a_gpu_cuda(grad_net, grad, in_deriv, nlist, nloc, + nnei); + } } +std::vector ProdForceSeAOpCUDABackward( + const paddle::Tensor& force_grad_tensor, + const paddle::Tensor& net_deriv_tensor, + const paddle::Tensor& in_deriv_tensor, + const paddle::Tensor& nlist_tensor, + const paddle::Tensor& natoms_tensor, + int n_a_sel, + int n_r_sel) { + auto grad_shape = force_grad_tensor.shape(); + auto net_deriv_shape = net_deriv_tensor.shape(); + auto in_deriv_shape = in_deriv_tensor.shape(); + auto nlist_shape = nlist_tensor.shape(); + auto natoms_shape = natoms_tensor.shape(); -std::vector PdProdForceSeAOpCUDABackward( - const paddle::Tensor& force_grad_tensor, - const paddle::Tensor& net_deriv_tensor, - const paddle::Tensor& in_deriv_tensor, - const paddle::Tensor& nlist_tensor, - const paddle::Tensor& natoms_tensor, - int n_a_sel, - int n_r_sel -) { - auto grad_shape = force_grad_tensor.shape(); - auto net_deriv_shape = net_deriv_tensor.shape(); - auto in_deriv_shape = in_deriv_tensor.shape(); - auto nlist_shape = nlist_tensor.shape(); - auto natoms_shape = natoms_tensor.shape(); + CHECK_INPUT_DIM(force_grad_tensor, 2); + CHECK_INPUT_DIM(net_deriv_tensor, 2); + CHECK_INPUT_DIM(in_deriv_tensor, 2); + CHECK_INPUT_DIM(nlist_tensor, 2); + CHECK_INPUT_DIM(natoms_tensor, 1); - CHECK_INPUT_DIM(force_grad_tensor, 2); - CHECK_INPUT_DIM(net_deriv_tensor, 2); - CHECK_INPUT_DIM(in_deriv_tensor, 2); - CHECK_INPUT_DIM(nlist_tensor, 2); - CHECK_INPUT_DIM(natoms_tensor, 1); + PD_CHECK(natoms_shape[0] >= 3, + "number of atoms should be larger than (or equal to) 3"); - PD_CHECK(natoms_shape[0] >= 3, "number of atoms should be larger than (or equal to) 3"); + CHECK_INPUT_ON_CPU(natoms_tensor); + const int* natoms = natoms_tensor.data(); + int nframes = net_deriv_shape[0]; + int nloc = natoms[0]; + int ndescrpt = net_deriv_shape[1] / nloc; + int nnei = nlist_shape[1] / nloc; - const int* natoms = nullptr; - // if(natoms_tensor.place() != paddle::PlaceType::kCPU){ - // natoms = natoms_tensor.copy_to(paddle::PlaceType::kCPU).data(); - // }else{ - natoms = natoms_tensor.data(); - // } - int nframes = net_deriv_shape[0]; - int nloc = natoms[0]; - int ndescrpt = net_deriv_shape[1] / nloc; - int nnei = nlist_shape[1] / nloc; + PD_CHECK(nframes == grad_shape[0], "number of frames should match"); + PD_CHECK(nframes == in_deriv_shape[0], "number of samples should match"); + PD_CHECK(nframes == nlist_shape[0], "number of samples should match"); + PD_CHECK(nloc * 3 == grad_shape[1], "input grad shape should be 3 x natoms"); + PD_CHECK(nloc * ndescrpt * 3 == in_deriv_shape[1], + "number of descriptors should match"); + PD_CHECK(nnei == (n_a_sel + n_r_sel), "number of neighbors should match"); - 
PD_CHECK(nframes == grad_shape[0], "number of frames should match"); - PD_CHECK(nframes == in_deriv_shape[0], "number of samples should match"); - PD_CHECK(nframes == nlist_shape[0],"number of samples should match"); - PD_CHECK(nloc * 3 == grad_shape[1], "input grad shape should be 3 x natoms"); - PD_CHECK(nloc * ndescrpt * 3 == in_deriv_shape[1], "number of descriptors should match"); - PD_CHECK(nnei == (n_a_sel + n_r_sel), "number of neighbors should match"); + std::vector grad_net_shape{nframes, nloc * ndescrpt}; + paddle::Tensor grad_net_tensor = paddle::empty( + grad_net_shape, force_grad_tensor.dtype(), force_grad_tensor.place()); - std::vector grad_net_shape {nframes, nloc * ndescrpt}; - // paddle::Tensor grad_net_tensor = paddle::Tensor(paddle::PlaceType::kCPU, grad_net_shape); - paddle::Tensor grad_net_tensor = paddle::empty( - grad_net_shape, - force_grad_tensor.dtype(), - force_grad_tensor.place() - ); - - // if(force_grad_tensor.place() == paddle::PlaceType::kCPU){ - // PD_DISPATCH_FLOATING_TYPES( - // force_grad_tensor.type(), "pd_prod_force_se_a_cpu_backward_kernel", ([&] { - // PdProdForceSeAOpCPUBackwardKernel( - // nloc, nframes, ndescrpt, nnei, - // force_grad_tensor.data(), - // net_deriv_tensor.data(), - // in_deriv_tensor.data(), - // nlist_tensor.data(), - // grad_net_tensor.mutable_data()); - // })); - // }else{ - PD_DISPATCH_FLOATING_TYPES( - force_grad_tensor.type(), "pd_prod_force_se_a_cuda_backward_kernel", ([&] { - PdProdForceSeAOpCUDABackwardKernel( - nloc, nframes, ndescrpt, nnei, - force_grad_tensor.data(), - net_deriv_tensor.data(), - in_deriv_tensor.data(), - nlist_tensor.data(), - grad_net_tensor.mutable_data()); + PD_DISPATCH_FLOATING_TYPES( + force_grad_tensor.type(), "prod_force_se_a_cuda_backward_kernel", ([&] { + ProdForceSeAOpCUDABackwardKernel( + nloc, nframes, ndescrpt, nnei, force_grad_tensor.data(), + net_deriv_tensor.data(), in_deriv_tensor.data(), + nlist_tensor.data(), grad_net_tensor.data()); })); - // } - return {grad_net_tensor}; + return {grad_net_tensor}; } +// std::vector ProdForceSeABackward( +// const paddle::Tensor& force_grad_tensor, +// const paddle::Tensor& net_deriv_tensor, +// const paddle::Tensor& in_deriv_tensor, +// const paddle::Tensor& nlist_tensor, +// const paddle::Tensor& natoms_tensor, +// int n_a_sel, +// int n_r_sel) { +// if (net_deriv_tensor.place() == paddle::GPUPlace()) { +// return ProdForceSeAOpCUDABackward(force_grad_tensor, net_deriv_tensor, +// in_deriv_tensor, nlist_tensor, +// natoms_tensor, n_a_sel, n_r_sel); +// } +// else if (net_deriv_tensor.place() == paddle::CPUPlace()) { +// return ProdForceSeAOpCPUBackward(force_grad_tensor, net_deriv_tensor, +// in_deriv_tensor, nlist_tensor, +// natoms_tensor, n_a_sel, n_r_sel); +// } else { +// PD_THROW("No Such kernel for ProdForceSeABackward."); +// } +// } -std::vector PdProdForceSeABackward( - const paddle::Tensor& force_grad_tensor, - const paddle::Tensor& net_deriv_tensor, - const paddle::Tensor& in_deriv_tensor, - const paddle::Tensor& nlist_tensor, - const paddle::Tensor& natoms_tensor, - int n_a_sel, - int n_r_sel -) { - return PdProdForceSeAOpCUDABackward( - force_grad_tensor, net_deriv_tensor, in_deriv_tensor, - nlist_tensor, natoms_tensor, n_a_sel, n_r_sel - ); -} - - -PD_BUILD_GRAD_OP(prod_force_se_a) - .Inputs({paddle::Grad("force"), "net_deriv", "in_deriv", "nlist", "natoms"}) - .Outputs({paddle::Grad("net_deriv")}) - .Attrs({"n_a_sel: int", "n_r_sel: int"}) - .SetKernelFn(PD_KERNEL(PdProdForceSeABackward)); - +// PD_BUILD_GRAD_OP(prod_force_se_a) 
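Note: the commented-out PD_BUILD_GRAD_OP block kept here is for reference only; the backward op of prod_force_se_a is registered once in paddle_prod_force_grad.cc, which dispatches either to the CPU kernel or to ProdForceSeAOpCUDABackward defined in this .cu file. The same PD_BUILD_OP / PD_BUILD_GRAD_OP pattern is used for prod_virial_se_a further below, so the following is a minimal, hypothetical sketch of how such custom ops could be JIT-compiled and exercised from Python. The module name, the dummy shapes, and the assumption that the deepmd kernel headers and library are already on the compiler's include/link path are illustrative only and not part of this change; the generated Python wrapper is named after the registered op.

# Minimal sketch (assumed names and shapes; not part of this PR).
import paddle
from paddle.utils.cpp_extension import load

# JIT-build the virial forward/backward ops from this PR's sources
# (assumes deepmd's kernel headers/libraries are visible to the compiler).
ops = load(
    name="deepmd_paddle_ops",  # hypothetical module name
    sources=[
        "source/lib/paddle_src/paddle_prod_virial.cc",
        "source/lib/paddle_src/paddle_prod_virial.cu",
        "source/lib/paddle_src/paddle_prod_virial_grad.cc",
        "source/lib/paddle_src/paddle_prod_virial_grad.cu",
    ],
)

# Dummy inputs whose shapes satisfy the PD_CHECKs in the op:
#   net_deriv [nframes, nloc*ndescrpt], in_deriv [nframes, nloc*ndescrpt*3],
#   rij [nframes, nloc*nnei*3], nlist [nframes, nloc*nnei] (int32),
#   natoms (int32, length >= 3, must live on CPU per CHECK_INPUT_ON_CPU).
nframes, nloc, nall, nnei = 1, 192, 192, 138
ndescrpt = nnei * 4  # se_a uses 4 descriptor entries per neighbor
net_deriv = paddle.rand([nframes, nloc * ndescrpt], dtype="float64")
in_deriv = paddle.rand([nframes, nloc * ndescrpt * 3], dtype="float64")
rij = paddle.rand([nframes, nloc * nnei * 3], dtype="float64")
nlist = paddle.zeros([nframes, nloc * nnei], dtype="int32")
natoms = paddle.to_tensor([nloc, nall, nloc], dtype="int32",
                          place=paddle.CPUPlace())

# n_a_sel = nnei, n_r_sel = 0 for the se_a path.
virial, atom_virial = ops.prod_virial_se_a(
    net_deriv, in_deriv, rij, nlist, natoms, nnei, 0)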
+// .Inputs({paddle::Grad("force"), "net_deriv", "in_deriv", "nlist", +// "natoms"}) .Outputs({paddle::Grad("net_deriv")}) .Attrs({"n_a_sel: int", +// "n_r_sel: int"}) .SetKernelFn(PD_KERNEL(ProdForceSeABackward)); diff --git a/source/lib/paddle_src/paddle_prod_virial.cc b/source/lib/paddle_src/paddle_prod_virial.cc new file mode 100644 index 0000000000..299f042694 --- /dev/null +++ b/source/lib/paddle_src/paddle_prod_virial.cc @@ -0,0 +1,155 @@ +#include "paddle/extension.h" +#include "prod_virial.h" + +#define CHECK_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") +#define CHECK_INPUT_DIM(x, value) \ + PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".") +#define CHECK_INPUT_READY(x) \ + PD_CHECK(x.initialized(), #x " must be initialized before usage.") + +template +void ProdVirialSeAOpForwardCPUKernel(int nloc, + int nall, + int ndescrpt, + int nnei, + int nframes, + data_t* p_virial, + data_t* p_atom_virial, + const data_t* p_net_deriv, + const data_t* p_in_deriv, + const data_t* p_rij, + const int* p_nlist) { + for (int kk = 0; kk < nframes; ++kk) { + data_t* virial = p_virial + kk * 9; + data_t* atom_virial = p_atom_virial + kk * nall * 9; + const data_t* net_deriv = p_net_deriv + kk * nloc * ndescrpt; + const data_t* in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3; + const data_t* rij = p_rij + kk * nloc * nnei * 3; + const int* nlist = p_nlist + kk * nloc * nnei; + deepmd::prod_virial_a_cpu(virial, atom_virial, net_deriv, in_deriv, rij, + nlist, nloc, nall, nnei); + } +} + +std::vector ProdVirialSeAOpCPUForward( + const paddle::Tensor& net_deriv_tensor, + const paddle::Tensor& in_deriv_tensor, + const paddle::Tensor& rij_tensor, + const paddle::Tensor& nlist_tensor, + const paddle::Tensor& natoms_tensor, + int n_a_sel, + int n_r_sel) { + CHECK_INPUT(net_deriv_tensor); + CHECK_INPUT(in_deriv_tensor); + CHECK_INPUT(rij_tensor); + CHECK_INPUT(nlist_tensor); + CHECK_INPUT(natoms_tensor); + + CHECK_INPUT_DIM(net_deriv_tensor, 2); + CHECK_INPUT_DIM(in_deriv_tensor, 2); + CHECK_INPUT_DIM(rij_tensor, 2); + CHECK_INPUT_DIM(nlist_tensor, 2); + CHECK_INPUT_DIM(natoms_tensor, 1); + + PD_CHECK(natoms_tensor.shape()[0] >= 3, + "number of atoms should be larger than (or equal to) 3"); + const int* natoms = natoms_tensor.data(); + int nloc = natoms[0]; + int nall = natoms[1]; + int nnei = nlist_tensor.shape()[1] / nloc; + int nframes = net_deriv_tensor.shape()[0]; + int ndescrpt = net_deriv_tensor.shape()[1] / nloc; + + PD_CHECK(nframes == in_deriv_tensor.shape()[0], + "number of samples should match"); + PD_CHECK(nframes == rij_tensor.shape()[0], "number of samples should match"); + PD_CHECK(nframes == nlist_tensor.shape()[0], + "number of samples should match"); + PD_CHECK(nloc * ndescrpt * 3 == in_deriv_tensor.shape()[1], + "number of descriptors should match"); + PD_CHECK((nloc * nnei * 3) == rij_tensor.shape()[1], + "dim of rij should be nnei * 3"); + + std::vector virial_shape{nframes, 9}; + std::vector atom_virial_shape{nframes, 9 * nall}; + paddle::Tensor virial_tensor = paddle::empty( + virial_shape, net_deriv_tensor.dtype(), net_deriv_tensor.place()); + paddle::Tensor atom_virial_tensor = paddle::empty( + atom_virial_shape, net_deriv_tensor.dtype(), net_deriv_tensor.place()); + + PD_DISPATCH_FLOATING_TYPES( + net_deriv_tensor.type(), "prod_virial_se_a_cpu_forward_kernel", ([&] { + ProdVirialSeAOpForwardCPUKernel( + nloc, nall, ndescrpt, nnei, nframes, virial_tensor.data(), + atom_virial_tensor.data(), net_deriv_tensor.data(), + in_deriv_tensor.data(), 
rij_tensor.data(), + nlist_tensor.data()); + })); + + return {virial_tensor, atom_virial_tensor}; +} + +std::vector ProdVirialSeAOpCUDAForward( + const paddle::Tensor& net_deriv_tensor, + const paddle::Tensor& in_deriv_tensor, + const paddle::Tensor& rij_tensor, + const paddle::Tensor& nlist_tensor, + const paddle::Tensor& natoms_tensor, + int n_a_sel, + int n_r_sel); + +std::vector ProdVirialSeAForward( + const paddle::Tensor& net_deriv_tensor, + const paddle::Tensor& in_deriv_tensor, + const paddle::Tensor& rij_tensor, + const paddle::Tensor& nlist_tensor, + const paddle::Tensor& natoms_tensor, + int n_a_sel, + int n_r_sel) { + if (net_deriv_tensor.is_gpu()) { + return ProdVirialSeAOpCUDAForward( + net_deriv_tensor, in_deriv_tensor, rij_tensor, nlist_tensor, + natoms_tensor.copy_to(paddle::CPUPlace(), false), n_a_sel, n_r_sel); + } else if (net_deriv_tensor.is_cpu()) { + return ProdVirialSeAOpCPUForward(net_deriv_tensor, in_deriv_tensor, + rij_tensor, nlist_tensor, natoms_tensor.copy_to(paddle::CPUPlace(), false), + n_a_sel, n_r_sel); + } else { + PD_THROW("Unsupported device type for ProdVirialSeAForward"); + } +} + +std::vector> ProdVirialSeAInferShape( + std::vector net_deriv_shape, + std::vector in_deriv_shape, + std::vector rij_shape, + std::vector nlist_shape, + std::vector natoms_shape, + const int& n_a_sel, + const int& n_r_sel) { + // int64_t nloc = /*natoms[0]*/ 192; + int64_t nall = /*natoms[1]*/ 192; + int64_t nframes = net_deriv_shape[0]; + + std::vector virial_shape = {nframes, 9}; + std::vector atom_virial_shape = {nframes, 9 * nall}; + + return {virial_shape, atom_virial_shape}; +} + +std::vector ProdVirialSeAInferDtype( + paddle::DataType net_deriv_dtype, + paddle::DataType in_deriv_dtype, + paddle::DataType rij_dtype, + paddle::DataType nlist_dtype, + paddle::DataType natoms_dtype) { + return {net_deriv_dtype, net_deriv_dtype}; +} + +PD_BUILD_OP(prod_virial_se_a) + .Inputs({"net_deriv", "in_deriv", "rij", "nlist", "natoms"}) + .Outputs({"virial", "atom_virial"}) + .Attrs({"n_a_sel: int", "n_r_sel: int"}) + .SetKernelFn(PD_KERNEL(ProdVirialSeAForward)) + .SetInferShapeFn(PD_INFER_SHAPE(ProdVirialSeAInferShape)) + .SetInferDtypeFn(PD_INFER_DTYPE(ProdVirialSeAInferDtype)); diff --git a/source/lib/paddle_src/paddle_prod_virial.cu b/source/lib/paddle_src/paddle_prod_virial.cu new file mode 100644 index 0000000000..23af9bbd75 --- /dev/null +++ b/source/lib/paddle_src/paddle_prod_virial.cu @@ -0,0 +1,255 @@ +#include +#include +#include + +#include "device.h" +#include "gpu_cuda.h" +#include "paddle/extension.h" +#include "prod_virial.h" + +#define CHECK_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") +#define CHECK_INPUT_ON_CPU(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") +#define CHECK_INPUT_DIM(x, value) \ + PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".") +#define CHECK_INPUT_READY(x) \ + PD_CHECK(x.initialized(), #x " must be initialized before usage.") + +template +__global__ void atom_virial_reduction(FPTYPE* virial, + const FPTYPE* atom_virial, + const int nall) { + unsigned int bid = blockIdx.x; + unsigned int tid = threadIdx.x; + __shared__ FPTYPE data[THREADS_PER_BLOCK]; + data[tid] = (FPTYPE)0.; + for (int ii = tid; ii < nall; ii += THREADS_PER_BLOCK) { + data[tid] += atom_virial[ii * 9 + bid]; + } + __syncthreads(); + // do reduction in shared memory + for (int ii = THREADS_PER_BLOCK >> 1; ii > 0; ii >>= 1) { + if (tid < ii) { + data[tid] += data[tid + ii]; + } + __syncthreads(); + } + // write result for this block to 
global memory + if (tid == 0) virial[bid] = data[0]; +} + +template +__global__ void virial_deriv_wrt_neighbors_a(FPTYPE* virial, + FPTYPE* atom_virial, + const FPTYPE* net_deriv, + const FPTYPE* in_deriv, + const FPTYPE* rij, + const int* nlist, + const int nloc, + const int nnei) { + // idx -> nloc + // idy -> nnei + // idz = dd0 * 3 + dd1 + // dd0 = idz / 3 + // dd1 = idz % 3 + const int_64 idx = blockIdx.x; + const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x; + const unsigned int idz = threadIdx.y; + const int ndescrpt = nnei * 4; + if (idy >= nnei) { + return; + } + int j_idx = nlist[idx * nnei + idy]; + if (j_idx < 0) { + return; + } + // atomicAdd( + // virial + idz, + // net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + // + idz / 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz % + // 3]); + FPTYPE virial_tmp = (FPTYPE)0.; + for (int idw = 0; idw < 4; ++idw) { + virial_tmp += net_deriv[idx * ndescrpt + idy * 4 + idw] * + rij[idx * nnei * 3 + idy * 3 + idz % 3] * + in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz / 3]; + } + atomicAdd(atom_virial + j_idx * 9 + idz, virial_tmp); +} + +template +__global__ void virial_deriv_wrt_neighbors_r(FPTYPE* virial, + FPTYPE* atom_virial, + const FPTYPE* net_deriv, + const FPTYPE* in_deriv, + const FPTYPE* rij, + const int* nlist, + const int nloc, + const int nnei) { + // idx -> nloc + // idy -> nnei + // idz = dd0 * 3 + dd1 + // dd0 = idz / 3 + // dd1 = idz % 3 + const int_64 idx = blockIdx.x; + const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x; + const unsigned int idz = threadIdx.y; + const int ndescrpt = nnei * 1; + + if (idy >= nnei) { + return; + } + int j_idx = nlist[idx * nnei + idy]; + if (j_idx < 0) { + return; + } + // atomicAdd( + // virial + idz, + // net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + // + idz / 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz % + // 3]); + atomicAdd(atom_virial + j_idx * 9 + idz, + net_deriv[idx * ndescrpt + idy] * + rij[idx * nnei * 3 + idy * 3 + idz % 3] * + in_deriv[idx * ndescrpt * 3 + idy * 3 + idz / 3]); +} + +namespace deepmd { +template +void prod_virial_a_gpu_cuda(FPTYPE* virial, + FPTYPE* atom_virial, + const FPTYPE* net_deriv, + const FPTYPE* in_deriv, + const FPTYPE* rij, + const int* nlist, + const int nloc, + const int nall, + const int nnei) { + DPErrcheck(cudaMemset(virial, 0, sizeof(FPTYPE) * 9)); + DPErrcheck(cudaMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall)); + + const int LEN = 16; + int nblock = (nnei + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(LEN, 9); + // compute virial of a frame + virial_deriv_wrt_neighbors_a<<>>( + virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nnei); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); + // reduction atom_virial to virial + atom_virial_reduction<<<9, TPB>>>(virial, atom_virial, nall); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); +} + +template +void prod_virial_r_gpu_cuda(FPTYPE* virial, + FPTYPE* atom_virial, + const FPTYPE* net_deriv, + const FPTYPE* in_deriv, + const FPTYPE* rij, + const int* nlist, + const int nloc, + const int nall, + const int nnei) { + DPErrcheck(cudaMemset(virial, 0, sizeof(FPTYPE) * 9)); + DPErrcheck(cudaMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall)); + + const int LEN = 16; + int nblock = (nnei + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(LEN, 9); + // compute virial of a frame + 
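+  // Launch layout: blockIdx.x indexes the local atom, blockIdx.y tiles the
+  // neighbor list in chunks of LEN, and threadIdx.y selects one of the 9
+  // virial components accumulated for that neighbor.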
virial_deriv_wrt_neighbors_r<FPTYPE><<<block_grid, thread_grid>>>(
+      virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nnei);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
+  // reduction atom_virial to virial
+  atom_virial_reduction<FPTYPE, TPB><<<9, TPB>>>(virial, atom_virial, nall);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
+}
+}  // namespace deepmd
+
+template <typename data_t>
+void ProdVirialSeAOpForwardCUDAKernel(int nloc,
+                                      int nall,
+                                      int ndescrpt,
+                                      int nnei,
+                                      int nframes,
+                                      data_t* p_virial,
+                                      data_t* p_atom_virial,
+                                      const data_t* p_net_deriv,
+                                      const data_t* p_in_deriv,
+                                      const data_t* p_rij,
+                                      const int* p_nlist) {
+  for (int kk = 0; kk < nframes; ++kk) {
+    data_t* virial = p_virial + kk * 9;
+    data_t* atom_virial = p_atom_virial + kk * nall * 9;
+    const data_t* net_deriv = p_net_deriv + kk * nloc * ndescrpt;
+    const data_t* in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3;
+    const data_t* rij = p_rij + kk * nloc * nnei * 3;
+    const int* nlist = p_nlist + kk * nloc * nnei;
+    deepmd::prod_virial_a_gpu_cuda(virial, atom_virial, net_deriv, in_deriv,
+                                   rij, nlist, nloc, nall, nnei);
+  }
+}
+
+std::vector<paddle::Tensor> ProdVirialSeAOpCUDAForward(
+    const paddle::Tensor& net_deriv_tensor,
+    const paddle::Tensor& in_deriv_tensor,
+    const paddle::Tensor& rij_tensor,
+    const paddle::Tensor& nlist_tensor,
+    const paddle::Tensor& natoms_tensor,
+    int n_a_sel,
+    int n_r_sel) {
+  CHECK_INPUT(net_deriv_tensor);
+  CHECK_INPUT(in_deriv_tensor);
+  CHECK_INPUT(rij_tensor);
+  CHECK_INPUT(nlist_tensor);
+  CHECK_INPUT_ON_CPU(natoms_tensor);  // TODO: require a CPU tensor from the
+  // Python side for now; copy_to from GPU returns a pointer with wrong data.
+
+  CHECK_INPUT_DIM(net_deriv_tensor, 2);
+  CHECK_INPUT_DIM(in_deriv_tensor, 2);
+  CHECK_INPUT_DIM(rij_tensor, 2);
+  CHECK_INPUT_DIM(nlist_tensor, 2);
+  CHECK_INPUT_DIM(natoms_tensor, 1);
+
+  PD_CHECK(natoms_tensor.shape()[0] >= 3,
+           "number of atoms should be larger than (or equal to) 3");
+  const int* natoms = natoms_tensor.data<int>();
+  int nloc = natoms[0];
+  int nall = natoms[1];
+  int nnei = nlist_tensor.shape()[1] / nloc;
+  int nframes = net_deriv_tensor.shape()[0];
+  int ndescrpt = net_deriv_tensor.shape()[1] / nloc;
+  PD_CHECK(nframes == in_deriv_tensor.shape()[0],
+           "number of samples should match");
+  PD_CHECK(nframes == rij_tensor.shape()[0], "number of samples should match");
+  PD_CHECK(nframes == nlist_tensor.shape()[0],
+           "number of samples should match");
+  PD_CHECK(nloc * ndescrpt * 3 == in_deriv_tensor.shape()[1],
+           "number of descriptors should match");
+  PD_CHECK((nloc * nnei * 3) == rij_tensor.shape()[1],
+           "dim of rij should be nnei * 3");
+
+  std::vector<int64_t> virial_shape{nframes, 9};
+  std::vector<int64_t> atom_virial_shape{nframes, 9 * nall};
+  paddle::Tensor virial_tensor =
+      paddle::Tensor(paddle::PlaceType::kGPU, virial_shape);
+  paddle::Tensor atom_virial_tensor =
+      paddle::Tensor(paddle::PlaceType::kGPU, atom_virial_shape);
+
+  PD_DISPATCH_FLOATING_TYPES(
+      net_deriv_tensor.type(), "prod_virial_se_a_cuda_forward_kernel", ([&] {
+        ProdVirialSeAOpForwardCUDAKernel<data_t>(
+            nloc, nall, ndescrpt, nnei, nframes,
+            virial_tensor.mutable_data<data_t>(),
+            atom_virial_tensor.mutable_data<data_t>(),
+            net_deriv_tensor.data<data_t>(), in_deriv_tensor.data<data_t>(),
+            rij_tensor.data<data_t>(), nlist_tensor.data<int>());
+      }));
+
+  return {virial_tensor, atom_virial_tensor};
+}
diff --git a/source/lib/paddle_src/paddle_prod_virial_grad.cc b/source/lib/paddle_src/paddle_prod_virial_grad.cc
new file mode 100644
index 0000000000..7bb442b694
--- /dev/null
+++ b/source/lib/paddle_src/paddle_prod_virial_grad.cc
@@ -0,0 +1,147 @@
+#include "paddle/extension.h"
+#include "prod_virial_grad.h"
+
+#define CHECK_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.")
+#define CHECK_INPUT_DIM(x, value) \
+  PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".")
+#define CHECK_INPUT_READY(x) \
+  PD_CHECK(x.initialized(), #x " must be initialized before usage.")
+
+template <typename data_t>
+void ProdVirialSeAOpCPUBackwardKernel(int nloc,
+                                      int nframes,
+                                      int ndescrpt,
+                                      int nnei,
+                                      const data_t* p_grad,
+                                      const data_t* p_net_deriv,
+                                      const data_t* p_in_deriv,
+                                      const data_t* p_rij,
+                                      const int* p_nlist,
+                                      data_t* p_grad_net) {
+  // #pragma omp parallel for
+  // for (int kk = 0; kk < nframes; ++kk) {
+  //   int grad_iter = kk * 9;
+  //   int in_iter = kk * nloc * ndescrpt * 3;
+  //   int rij_iter = kk * nloc * nnei * 3;
+  //   int nlist_iter = kk * nloc * nnei;
+  //   int grad_net_iter = kk * nloc * ndescrpt;
+
+  //   deepmd::prod_virial_grad_a_cpu(&grad_net[grad_net_iter],
+  //                                  &grad[grad_iter],
+  //                                  &in_deriv[in_iter], &rij[rij_iter],
+  //                                  &nlist[nlist_iter], nloc, nnei);
+  // }
+
+  for (int kk = 0; kk < nframes; ++kk) {
+    data_t* grad_net = p_grad_net + kk * nloc * ndescrpt;
+    const data_t* grad = p_grad + kk * 9;
+    const data_t* in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3;
+    const data_t* rij = p_rij + kk * nloc * nnei * 3;
+    const int* nlist = p_nlist + kk * nloc * nnei;
+    deepmd::prod_virial_grad_a_cpu(grad_net, grad, in_deriv, rij, nlist, nloc,
+                                   nnei);
+  }
+}
+
+std::vector<paddle::Tensor> ProdVirialSeAOpCPUBackward(
+    const paddle::Tensor& grad_tensor,
+    const paddle::Tensor& net_deriv_tensor,
+    const paddle::Tensor& in_deriv_tensor,
+    const paddle::Tensor& rij_tensor,
+    const paddle::Tensor& nlist_tensor,
+    const paddle::Tensor& natoms_tensor,
+    int n_a_sel,
+    int n_r_sel) {
+  CHECK_INPUT_READY(grad_tensor);
+  CHECK_INPUT_READY(net_deriv_tensor);
+  CHECK_INPUT_READY(in_deriv_tensor);
+  CHECK_INPUT_READY(rij_tensor);
+  CHECK_INPUT_READY(nlist_tensor);
+  CHECK_INPUT_READY(natoms_tensor);
+
+  auto grad_shape = grad_tensor.shape();
+  auto net_deriv_shape = net_deriv_tensor.shape();
+  auto in_deriv_shape = in_deriv_tensor.shape();
+  auto rij_shape = rij_tensor.shape();
+  auto nlist_shape = nlist_tensor.shape();
+
+  CHECK_INPUT_DIM(grad_tensor, 2);
+  CHECK_INPUT_DIM(net_deriv_tensor, 2);
+  CHECK_INPUT_DIM(in_deriv_tensor, 2);
+  CHECK_INPUT_DIM(rij_tensor, 2);
+  CHECK_INPUT_DIM(nlist_tensor, 2);
+  CHECK_INPUT_DIM(natoms_tensor, 1);
+
+  PD_CHECK(natoms_tensor.shape()[0] >= 3,
+           "number of atoms should be larger than (or equal to) 3");
+
+  const int* natoms = natoms_tensor.data<int>();
+
+  int nframes = net_deriv_shape[0];
+  int nloc = natoms[0];
+  int ndescrpt = net_deriv_shape[1] / nloc;
+  int nnei = nlist_shape[1] / nloc;
+
+  PD_CHECK(nframes == grad_shape[0], "number of frames should match");
+  PD_CHECK(nframes == in_deriv_shape[0], "number of samples should match");
+  PD_CHECK(nframes == rij_shape[0], "number of frames should match");
+  PD_CHECK(nframes == nlist_shape[0], "number of samples should match");
+  PD_CHECK(9 == grad_shape[1], "input grad shape should be nframes x 9");
+  PD_CHECK(nloc * ndescrpt * 3 == in_deriv_shape[1],
+           "number of descriptors should match");
+  PD_CHECK(nloc * nnei * 3 == rij_shape[1], "dim of rij should be nnei * 3");
+  PD_CHECK(nnei == (n_a_sel + n_r_sel), "number of neighbors should match");
+
+  std::vector<int64_t> grad_net_shape{nframes, (int64_t)nloc * ndescrpt};
+  paddle::Tensor grad_net_tensor =
+      paddle::empty(grad_net_shape, grad_tensor.dtype(), grad_tensor.place());
+
+  PD_DISPATCH_FLOATING_TYPES(
+      grad_tensor.type(),
"prod_force_se_a_cpu_backward_kernel", ([&] { + ProdVirialSeAOpCPUBackwardKernel( + nloc, nframes, ndescrpt, nnei, grad_tensor.data(), + net_deriv_tensor.data(), in_deriv_tensor.data(), + rij_tensor.data(), nlist_tensor.data(), + grad_net_tensor.data()); + })); + return {grad_net_tensor}; +} + +std::vector ProdVirialSeAOpCUDABackward( + const paddle::Tensor& virial_grad_tensor, + const paddle::Tensor& net_deriv_tensor, + const paddle::Tensor& in_deriv_tensor, + const paddle::Tensor& rij_tensor, + const paddle::Tensor& nlist_tensor, + const paddle::Tensor& natoms_tensor, + int n_a_sel, + int n_r_sel); + +std::vector ProdVirialSeABackward( + const paddle::Tensor& virial_grad_tensor, + const paddle::Tensor& net_deriv_tensor, + const paddle::Tensor& in_deriv_tensor, + const paddle::Tensor& rij_tensor, + const paddle::Tensor& nlist_tensor, + const paddle::Tensor& natoms_tensor, + int n_a_sel, + int n_r_sel) { + if (virial_grad_tensor.is_gpu()) { + return ProdVirialSeAOpCUDABackward( + virial_grad_tensor, net_deriv_tensor, in_deriv_tensor, rij_tensor, + nlist_tensor, natoms_tensor.copy_to(paddle::CPUPlace(), false), n_a_sel, n_r_sel); + } else if (virial_grad_tensor.is_cpu()) { + return ProdVirialSeAOpCPUBackward(virial_grad_tensor, net_deriv_tensor, + in_deriv_tensor, rij_tensor, nlist_tensor, + natoms_tensor, n_a_sel, n_r_sel); + } else { + PD_THROW("Unsupported device type for ProdVirialSeAForward"); + } +} + +PD_BUILD_GRAD_OP(prod_virial_se_a) + .Inputs({paddle::Grad("virial"), "net_deriv", "in_deriv", "rij", "nlist", + "natoms"}) + .Outputs({paddle::Grad("net_deriv")}) + .Attrs({"n_a_sel: int", "n_r_sel: int"}) + .SetKernelFn(PD_KERNEL(ProdVirialSeABackward)); diff --git a/source/lib/paddle_src/paddle_prod_virial_grad.cu b/source/lib/paddle_src/paddle_prod_virial_grad.cu new file mode 100644 index 0000000000..be0602e30e --- /dev/null +++ b/source/lib/paddle_src/paddle_prod_virial_grad.cu @@ -0,0 +1,191 @@ +#include +#include +#include + +#include "device.h" +#include "gpu_cuda.h" +#include "paddle/extension.h" +#include "prod_virial.h" + +#define CHECK_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") +#define CHECK_INPUT_CPU(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") +#define CHECK_INPUT_DIM(x, value) \ + PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".") +#define CHECK_INPUT_READY(x) \ + PD_CHECK(x.initialized(), #x " must be initialized before usage.") + +template +__device__ inline FPTYPE dev_dot9(const FPTYPE* arr1, const FPTYPE* arr2) { + FPTYPE result = (FPTYPE)0.0; + for (int ii = 0; ii < 9; ii++) { + result += arr1[ii] * arr2[ii]; + } + return result; +} + +template +__global__ void virial_grad_wrt_neighbors_a(FPTYPE* grad_net, + const FPTYPE* grad, + const FPTYPE* env_deriv, + const FPTYPE* rij, + const int* nlist, + const int nloc, + const int nnei) { + // idy -> nnei + const unsigned int tid = threadIdx.x; + const int_64 idx = blockIdx.x * blockDim.x + tid; + const unsigned int idy = blockIdx.y; + const unsigned int idw = threadIdx.y; + const int ndescrpt = nnei * 4; + __shared__ FPTYPE grad_one[9]; + if (tid < 9) { + grad_one[tid] = grad[tid]; + } + __syncthreads(); + if (idx >= nloc) { + return; + } + int j_idx = nlist[idx * nnei + idy]; + if (j_idx < 0) { + return; + } + FPTYPE tmp[9]; + for (int dd0 = 0; dd0 < 3; ++dd0) { + for (int dd1 = 0; dd1 < 3; ++dd1) { + tmp[dd0 * 3 + dd1] = + rij[idx * nnei * 3 + idy * 3 + dd1] * + env_deriv[idx * ndescrpt * 3 + idy * 4 * 3 + idw * 3 + dd0]; + } + } + grad_net[idx * ndescrpt + idy * 4 + idw] -= 
+ (FPTYPE)-1.0 * dev_dot9(grad_one, tmp); +} + +namespace deepmd { +template +void prod_virial_grad_a_gpu_cuda(FPTYPE* grad_net, + const FPTYPE* grad, + const FPTYPE* env_deriv, + const FPTYPE* rij, + const int* nlist, + const int nloc, + const int nnei) { + const int ndescrpt = nnei * 4; + DPErrcheck(cudaMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt)); + const int LEN = 128; + const int nblock = (nloc + LEN - 1) / LEN; + dim3 block_grid(nblock, nnei); + dim3 thread_grid(LEN, 4); + virial_grad_wrt_neighbors_a<<>>( + grad_net, grad, env_deriv, rij, nlist, nloc, nnei); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); +} + +template void prod_virial_grad_a_gpu_cuda(float* grad_net, + const float* grad, + const float* env_deriv, + const float* rij, + const int* nlist, + const int nloc, + const int nnei); +template void prod_virial_grad_a_gpu_cuda(double* grad_net, + const double* grad, + const double* env_deriv, + const double* rij, + const int* nlist, + const int nloc, + const int nnei); +} // namespace deepmd + +template +void ProdForceSeAOpGPUBackwardKernel(int nloc, + int nframes, + int ndescrpt, + int nnei, + const data_t* virial_grad, + const data_t* net_deriv, + const data_t* in_deriv, + const data_t* rij, + const int* nlist, + data_t* grad_net) { + data_t* p_grad_net = grad_net; + const data_t* p_grad = virial_grad; + const data_t* p_in_deriv = in_deriv; + const data_t* p_rij = rij; + const int* p_nlist = nlist; + for (int_64 kk = 0; kk < nframes; ++kk) { + data_t* grad_net = p_grad_net + kk * nloc * ndescrpt; + const data_t* virial_grad = p_grad + kk * 9; + const data_t* in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3; + const data_t* rij = p_rij + kk * nloc * nnei * 3; + const int* nlist = p_nlist + kk * nloc * nnei; + deepmd::prod_virial_grad_a_gpu_cuda(grad_net, virial_grad, in_deriv, rij, + nlist, nloc, nnei); + } +} + +std::vector ProdVirialSeAOpCUDABackward( + const paddle::Tensor& virial_grad_tensor, + const paddle::Tensor& net_deriv_tensor, + const paddle::Tensor& in_deriv_tensor, + const paddle::Tensor& rij_tensor, + const paddle::Tensor& nlist_tensor, + const paddle::Tensor& natoms_tensor, + int n_a_sel, + int n_r_sel) { + CHECK_INPUT_READY(virial_grad_tensor); + CHECK_INPUT_READY(net_deriv_tensor); + CHECK_INPUT_READY(in_deriv_tensor); + CHECK_INPUT_READY(rij_tensor); + CHECK_INPUT_READY(nlist_tensor); + CHECK_INPUT_READY(natoms_tensor); + + auto grad_shape = virial_grad_tensor.shape(); + auto net_deriv_shape = net_deriv_tensor.shape(); + auto in_deriv_shape = in_deriv_tensor.shape(); + auto rij_shape = rij_tensor.shape(); + auto nlist_shape = nlist_tensor.shape(); + auto natoms_shape = natoms_tensor.shape(); + + CHECK_INPUT_DIM(virial_grad_tensor, 2); + CHECK_INPUT_DIM(net_deriv_tensor, 2); + CHECK_INPUT_DIM(in_deriv_tensor, 2); + CHECK_INPUT_DIM(rij_tensor, 2); + CHECK_INPUT_DIM(nlist_tensor, 2); + CHECK_INPUT_DIM(natoms_tensor, 1); + + PD_CHECK(natoms_shape[0] >= 3, + "number of atoms should be larger than (or equal to) 3"); + + CHECK_INPUT_CPU(natoms_tensor); + const int* natoms = natoms_tensor.data(); + int nframes = net_deriv_shape[0]; + int nloc = natoms[0]; + int ndescrpt = net_deriv_shape[1] / nloc; + int nnei = nlist_shape[1] / nloc; + + PD_CHECK(nframes == grad_shape[0], "number of frames should match"); + PD_CHECK(nframes == in_deriv_shape[0], "number of samples should match"); + PD_CHECK(nframes == rij_shape[0], "number of frames should match"); + PD_CHECK(nframes == nlist_shape[0], "number of samples should match"); + PD_CHECK(9 
== grad_shape[1], "input grad shape should be 3 x natoms"); + PD_CHECK(nloc * ndescrpt * 3 == in_deriv_shape[1], + "number of descriptors should match"); + PD_CHECK(nloc * nnei * 3 == rij_shape[1], "dim of rij should be nnei * 3"); + PD_CHECK(nnei == (n_a_sel + n_r_sel), "number of neighbors should match"); + + std::vector grad_net_shape{nframes, nloc * ndescrpt}; + paddle::Tensor grad_net_tensor = paddle::empty( + grad_net_shape, virial_grad_tensor.dtype(), virial_grad_tensor.place()); + + PD_DISPATCH_FLOATING_TYPES( + virial_grad_tensor.type(), "prod_force_se_a_cuda_backward_kernel", ([&] { + ProdForceSeAOpGPUBackwardKernel( + nloc, nframes, ndescrpt, nnei, virial_grad_tensor.data(), + net_deriv_tensor.data(), in_deriv_tensor.data(), + rij_tensor.data(), nlist_tensor.data(), + grad_net_tensor.data()); + })); + return {grad_net_tensor}; +} diff --git a/source/lib/paddle_src/prod_env_mat.cc b/source/lib/paddle_src/prod_env_mat.cc deleted file mode 100644 index 7ebfd6cdc7..0000000000 --- a/source/lib/paddle_src/prod_env_mat.cc +++ /dev/null @@ -1,321 +0,0 @@ -#include "prod_env_mat.h" - -#include - -#include -#include - -#include "env_mat.h" -#include "fmt_nlist.h" - -using namespace deepmd; - -template -void deepmd::prod_env_mat_a_cpu(FPTYPE *em, - FPTYPE *em_deriv, - FPTYPE *rij, - int *nlist, - const FPTYPE *coord, - const int *type, - const InputNlist &inlist, - const int max_nbor_size, - const FPTYPE *avg, - const FPTYPE *std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec, - const int *f_type) { - if (f_type == NULL) { - f_type = type; - } - const int nnei = sec.back(); - const int nem = nnei * 4; - - // set & normalize coord - std::vector d_coord3(nall * 3); - for (int ii = 0; ii < nall; ++ii) { - for (int dd = 0; dd < 3; ++dd) { - d_coord3[ii * 3 + dd] = coord[ii * 3 + dd]; - } - } - - // set type - std::vector d_f_type(nall); - for (int ii = 0; ii < nall; ++ii) { - d_f_type[ii] = f_type[ii]; - } - - // build nlist - std::vector > d_nlist_a(nloc); - - assert(nloc == inlist.inum); - for (unsigned ii = 0; ii < nloc; ++ii) { - d_nlist_a[ii].reserve(max_nbor_size); - } - for (unsigned ii = 0; ii < nloc; ++ii) { - int i_idx = inlist.ilist[ii]; - for (unsigned jj = 0; jj < inlist.numneigh[ii]; ++jj) { - int j_idx = inlist.firstneigh[ii][jj]; - d_nlist_a[i_idx].push_back(j_idx); - } - } - -#pragma omp parallel for - for (int ii = 0; ii < nloc; ++ii) { - std::vector fmt_nlist_a; - int ret = format_nlist_i_cpu(fmt_nlist_a, d_coord3, d_f_type, ii, - d_nlist_a[ii], rcut, sec); - std::vector d_em_a; - std::vector d_em_a_deriv; - std::vector d_em_r; - std::vector d_em_r_deriv; - std::vector d_rij_a; - env_mat_a_cpu(d_em_a, d_em_a_deriv, d_rij_a, d_coord3, d_f_type, ii, - fmt_nlist_a, sec, rcut_smth, rcut); - - // check sizes - assert(d_em_a.size() == nem); - assert(d_em_a_deriv.size() == nem * 3); - assert(d_rij_a.size() == nnei * 3); - assert(fmt_nlist_a.size() == nnei); - // record outputs - for (int jj = 0; jj < nem; ++jj) { - if (type[ii] >= 0) { - em[ii * nem + jj] = - (d_em_a[jj] - avg[type[ii] * nem + jj]) / std[type[ii] * nem + jj]; - } else { - em[ii * nem + jj] = 0; - } - } - for (int jj = 0; jj < nem * 3; ++jj) { - if (type[ii] >= 0) { - em_deriv[ii * nem * 3 + jj] = - d_em_a_deriv[jj] / std[type[ii] * nem + jj / 3]; - } else { - em_deriv[ii * nem * 3 + jj] = 0; - } - } - for (int jj = 0; jj < nnei * 3; ++jj) { - rij[ii * nnei * 3 + jj] = d_rij_a[jj]; - } - for (int jj = 0; jj < nnei; ++jj) { - nlist[ii * nnei + jj] = 
fmt_nlist_a[jj]; - } - } -} - -template -void deepmd::prod_env_mat_r_cpu(FPTYPE *em, - FPTYPE *em_deriv, - FPTYPE *rij, - int *nlist, - const FPTYPE *coord, - const int *type, - const InputNlist &inlist, - const int max_nbor_size, - const FPTYPE *avg, - const FPTYPE *std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec) { - const int nnei = sec.back(); - const int nem = nnei * 1; - - // set & normalize coord - std::vector d_coord3(nall * 3); - for (int ii = 0; ii < nall; ++ii) { - for (int dd = 0; dd < 3; ++dd) { - d_coord3[ii * 3 + dd] = coord[ii * 3 + dd]; - } - } - - // set type - std::vector d_type(nall); - for (int ii = 0; ii < nall; ++ii) { - d_type[ii] = type[ii]; - } - - // build nlist - std::vector > d_nlist_a(nloc); - - assert(nloc == inlist.inum); - for (unsigned ii = 0; ii < nloc; ++ii) { - d_nlist_a[ii].reserve(max_nbor_size); - } - for (unsigned ii = 0; ii < nloc; ++ii) { - int i_idx = inlist.ilist[ii]; - for (unsigned jj = 0; jj < inlist.numneigh[ii]; ++jj) { - int j_idx = inlist.firstneigh[ii][jj]; - d_nlist_a[i_idx].push_back(j_idx); - } - } - -#pragma omp parallel for - for (int ii = 0; ii < nloc; ++ii) { - std::vector fmt_nlist_a; - int ret = format_nlist_i_cpu(fmt_nlist_a, d_coord3, d_type, ii, - d_nlist_a[ii], rcut, sec); - std::vector d_em_a; - std::vector d_em_a_deriv; - std::vector d_em_r; - std::vector d_em_r_deriv; - std::vector d_rij_a; - env_mat_r_cpu(d_em_a, d_em_a_deriv, d_rij_a, d_coord3, d_type, ii, - fmt_nlist_a, sec, rcut_smth, rcut); - - // check sizes - assert(d_em_a.size() == nem); - assert(d_em_a_deriv.size() == nem * 3); - assert(d_rij_a.size() == nnei * 3); - assert(fmt_nlist_a.size() == nnei); - // record outputs - for (int jj = 0; jj < nem; ++jj) { - em[ii * nem + jj] = (d_em_a[jj] - avg[d_type[ii] * nem + jj]) / - std[d_type[ii] * nem + jj]; - } - for (int jj = 0; jj < nem * 3; ++jj) { - em_deriv[ii * nem * 3 + jj] = - d_em_a_deriv[jj] / std[d_type[ii] * nem + jj / 3]; - } - for (int jj = 0; jj < nnei * 3; ++jj) { - rij[ii * nnei * 3 + jj] = d_rij_a[jj]; - } - for (int jj = 0; jj < nnei; ++jj) { - nlist[ii * nnei + jj] = fmt_nlist_a[jj]; - } - } -} - -template void deepmd::prod_env_mat_a_cpu(double *em, - double *em_deriv, - double *rij, - int *nlist, - const double *coord, - const int *type, - const InputNlist &inlist, - const int max_nbor_size, - const double *avg, - const double *std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec, - const int *f_type); - -template void deepmd::prod_env_mat_a_cpu(float *em, - float *em_deriv, - float *rij, - int *nlist, - const float *coord, - const int *type, - const InputNlist &inlist, - const int max_nbor_size, - const float *avg, - const float *std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec, - const int *f_type); - -template void deepmd::prod_env_mat_r_cpu(double *em, - double *em_deriv, - double *rij, - int *nlist, - const double *coord, - const int *type, - const InputNlist &inlist, - const int max_nbor_size, - const double *avg, - const double *std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec); - -template void deepmd::prod_env_mat_r_cpu(float *em, - float *em_deriv, - float *rij, - int *nlist, - const float *coord, - const int *type, - const InputNlist &inlist, - const int max_nbor_size, - const float *avg, - const float *std, - const int nloc, - const int nall, - 
const float rcut, - const float rcut_smth, - const std::vector sec); - -// #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM -void deepmd::env_mat_nbor_update(InputNlist &inlist, - InputNlist &gpu_inlist, - int &max_nbor_size, - int *&nbor_list_dev, - const int *mesh, - const int size) { - int *mesh_host = new int[size]; - memcpy_device_to_host(mesh, mesh_host, size); - memcpy(&inlist.ilist, 4 + mesh_host, sizeof(int *)); - memcpy(&inlist.numneigh, 8 + mesh_host, sizeof(int *)); - memcpy(&inlist.firstneigh, 12 + mesh_host, sizeof(int **)); - const int ago = mesh_host[0]; - if (ago == 0 || gpu_inlist.inum < inlist.inum) { - const int inum = inlist.inum; - if (gpu_inlist.inum < inum) { - delete_device_memory(gpu_inlist.ilist); - delete_device_memory(gpu_inlist.numneigh); - delete_device_memory(gpu_inlist.firstneigh); - malloc_device_memory(gpu_inlist.ilist, inum); - malloc_device_memory(gpu_inlist.numneigh, inum); - malloc_device_memory(gpu_inlist.firstneigh, inum); - } - memcpy_host_to_device(gpu_inlist.ilist, inlist.ilist, inum); - memcpy_host_to_device(gpu_inlist.numneigh, inlist.numneigh, inum); - int _max_nbor_size = max_numneigh(inlist); - if (_max_nbor_size <= 256) { - _max_nbor_size = 256; - } else if (_max_nbor_size <= 512) { - _max_nbor_size = 512; - } else if (_max_nbor_size <= 1024) { - _max_nbor_size = 1024; - } else if (_max_nbor_size <= 2048) { - _max_nbor_size = 2048; - } else { - _max_nbor_size = 4096; - } - if (nbor_list_dev == NULL || _max_nbor_size > max_nbor_size || - inum > gpu_inlist.inum) { - delete_device_memory(nbor_list_dev); - malloc_device_memory(nbor_list_dev, inum * _max_nbor_size); - } - // update info - gpu_inlist.inum = inum; - max_nbor_size = _max_nbor_size; - - // copy nbor list from host to the device - std::vector nbor_list_host(inum * max_nbor_size, 0); - int **_firstneigh = (int **)malloc(sizeof(int *) * inum); - for (int ii = 0; ii < inum; ii++) { - _firstneigh[ii] = nbor_list_dev + ii * max_nbor_size; - for (int jj = 0; jj < inlist.numneigh[ii]; jj++) { - nbor_list_host[ii * max_nbor_size + jj] = inlist.firstneigh[ii][jj]; - } - } - memcpy_host_to_device(nbor_list_dev, &nbor_list_host[0], - inum * max_nbor_size); - memcpy_host_to_device(gpu_inlist.firstneigh, _firstneigh, inum); - free(_firstneigh); - } - delete[] mesh_host; -} -// #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/source/lib/paddle_src/prod_env_mat.h b/source/lib/paddle_src/prod_env_mat.h deleted file mode 100644 index 3052dd2230..0000000000 --- a/source/lib/paddle_src/prod_env_mat.h +++ /dev/null @@ -1,140 +0,0 @@ -#pragma once -#include - -#include "device.h" -#include "neighbor_list.h" - -namespace deepmd { - -template -void prod_env_mat_a_cpu(FPTYPE *em, - FPTYPE *em_deriv, - FPTYPE *rij, - int *nlist, - const FPTYPE *coord, - const int *type, - const InputNlist &inlist, - const int max_nbor_size, - const FPTYPE *avg, - const FPTYPE *std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec, - const int *f_type = NULL); - -template -void prod_env_mat_r_cpu(FPTYPE *em, - FPTYPE *em_deriv, - FPTYPE *rij, - int *nlist, - const FPTYPE *coord, - const int *type, - const InputNlist &inlist, - const int max_nbor_size, - const FPTYPE *avg, - const FPTYPE *std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec); - -#if GOOGLE_CUDA -template -void prod_env_mat_a_gpu_cuda(FPTYPE *em, - FPTYPE *em_deriv, - FPTYPE *rij, - int *nlist, - const FPTYPE *coord, - const int *type, - const 
InputNlist &gpu_inlist, - int *array_int, - unsigned long long *array_longlong, - const int max_nbor_size, - const FPTYPE *avg, - const FPTYPE *std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec, - const int *f_type = NULL); - -template -void prod_env_mat_r_gpu_cuda(FPTYPE *em, - FPTYPE *em_deriv, - FPTYPE *rij, - int *nlist, - const FPTYPE *coord, - const int *type, - const InputNlist &gpu_inlist, - int *array_int, - unsigned long long *array_longlong, - const int max_nbor_size, - const FPTYPE *avg, - const FPTYPE *std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec); - -void env_mat_nbor_update(InputNlist &inlist, - InputNlist &gpu_inlist, - int &max_nbor_size, - int *&nbor_list_dev, - const int *mesh, - const int size); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM -template -void prod_env_mat_a_gpu_rocm(FPTYPE *em, - FPTYPE *em_deriv, - FPTYPE *rij, - int *nlist, - const FPTYPE *coord, - const int *type, - const InputNlist &gpu_inlist, - int *array_int, - unsigned long long *array_longlong, - const int max_nbor_size, - const FPTYPE *avg, - const FPTYPE *std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec, - const int *f_type = NULL); - -template -void prod_env_mat_r_gpu_rocm(FPTYPE *em, - FPTYPE *em_deriv, - FPTYPE *rij, - int *nlist, - const FPTYPE *coord, - const int *type, - const InputNlist &gpu_inlist, - int *array_int, - unsigned long long *array_longlong, - const int max_nbor_size, - const FPTYPE *avg, - const FPTYPE *std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec); - -void env_mat_nbor_update(InputNlist &inlist, - InputNlist &gpu_inlist, - int &max_nbor_size, - int *&nbor_list_dev, - const int *mesh, - const int size); -#endif // TENSORFLOW_USE_ROCM - -} // namespace deepmd diff --git a/source/lib/paddle_src/prod_virial.cc b/source/lib/paddle_src/prod_virial.cc deleted file mode 100644 index 8769ccf8f1..0000000000 --- a/source/lib/paddle_src/prod_virial.cc +++ /dev/null @@ -1,219 +0,0 @@ -#include "custom_op.h" - -REGISTER_OP("ProdVirial") - .Attr("T: {float, double} = DT_DOUBLE") - .Input("net_deriv: T") - .Input("in_deriv: T") - .Input("rij: T") - .Input("nlist: int32") - .Input("axis: int32") - .Input("natoms: int32") - .Attr("n_a_sel: int") - .Attr("n_r_sel: int") - .Output("virial: T") - .Output("atom_virial: T"); - -using namespace tensorflow; - -using CPUDevice = Eigen::ThreadPoolDevice; - -template -class ProdVirialOp : public OpKernel { - public: - explicit ProdVirialOp(OpKernelConstruction* context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("n_a_sel", &n_a_sel)); - OP_REQUIRES_OK(context, context->GetAttr("n_r_sel", &n_r_sel)); - n_a_shift = n_a_sel * 4; - } - - void Compute(OpKernelContext* context) override { - deepmd::safe_compute( - context, [this](OpKernelContext* context) { this->_Compute(context); }); - } - - void _Compute(OpKernelContext* context) { - // Grab the input tensor - const Tensor& net_deriv_tensor = context->input(0); - const Tensor& in_deriv_tensor = context->input(1); - const Tensor& rij_tensor = context->input(2); - const Tensor& nlist_tensor = context->input(3); - const Tensor& axis_tensor = context->input(4); - const Tensor& natoms_tensor = context->input(5); - - // set size of the sample - OP_REQUIRES(context, (net_deriv_tensor.shape().dims() == 2), - errors::InvalidArgument("Dim of 
net deriv should be 2")); - OP_REQUIRES(context, (in_deriv_tensor.shape().dims() == 2), - errors::InvalidArgument("Dim of input deriv should be 2")); - OP_REQUIRES(context, (rij_tensor.shape().dims() == 2), - errors::InvalidArgument("Dim of rij should be 2")); - OP_REQUIRES(context, (nlist_tensor.shape().dims() == 2), - errors::InvalidArgument("Dim of nlist should be 2")); - OP_REQUIRES(context, (axis_tensor.shape().dims() == 2), - errors::InvalidArgument("Dim of axis should be 2")); - OP_REQUIRES(context, (natoms_tensor.shape().dims() == 1), - errors::InvalidArgument("Dim of natoms should be 1")); - - OP_REQUIRES(context, (natoms_tensor.shape().dim_size(0) >= 3), - errors::InvalidArgument( - "number of atoms should be larger than (or equal to) 3")); - auto natoms = natoms_tensor.flat(); - - int nframes = net_deriv_tensor.shape().dim_size(0); - int nloc = natoms(0); - int nall = natoms(1); - int ndescrpt = net_deriv_tensor.shape().dim_size(1) / nloc; - int nnei = nlist_tensor.shape().dim_size(1) / nloc; - - // check the sizes - OP_REQUIRES(context, (nframes == in_deriv_tensor.shape().dim_size(0)), - errors::InvalidArgument("number of samples should match")); - OP_REQUIRES(context, (nframes == rij_tensor.shape().dim_size(0)), - errors::InvalidArgument("number of samples should match")); - OP_REQUIRES(context, (nframes == nlist_tensor.shape().dim_size(0)), - errors::InvalidArgument("number of samples should match")); - OP_REQUIRES(context, (nframes == axis_tensor.shape().dim_size(0)), - errors::InvalidArgument("number of samples should match")); - - OP_REQUIRES(context, - (nloc * ndescrpt * 12 == in_deriv_tensor.shape().dim_size(1)), - errors::InvalidArgument("number of descriptors should match")); - OP_REQUIRES(context, (nloc * nnei * 3 == rij_tensor.shape().dim_size(1)), - errors::InvalidArgument("dim of rij should be nnei * 3")); - OP_REQUIRES(context, (nnei == n_a_sel + n_r_sel), - errors::InvalidArgument("number of neighbors should match")); - OP_REQUIRES( - context, (nloc * 4 == axis_tensor.shape().dim_size(1)), - errors::InvalidArgument("number of axis type+id should be 2+2")); - - // Create an output tensor - TensorShape virial_shape; - virial_shape.AddDim(nframes); - virial_shape.AddDim(9); - Tensor* virial_tensor = NULL; - OP_REQUIRES_OK(context, - context->allocate_output(0, virial_shape, &virial_tensor)); - TensorShape atom_virial_shape; - atom_virial_shape.AddDim(nframes); - atom_virial_shape.AddDim(9 * nall); - Tensor* atom_virial_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, - &atom_virial_tensor)); - - // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); - auto rij = rij_tensor.flat(); - auto nlist = nlist_tensor.flat(); - auto axis = axis_tensor.flat(); - auto virial = virial_tensor->flat(); - auto atom_virial = atom_virial_tensor->flat(); - - // loop over samples -#pragma omp parallel for - for (int kk = 0; kk < nframes; ++kk) { - int net_iter = kk * nloc * ndescrpt; - int in_iter = kk * nloc * ndescrpt * 12; - int rij_iter = kk * nloc * nnei * 3; - int nlist_iter = kk * nloc * nnei; - int axis_iter = kk * nloc * 4; - int virial_iter = kk * 9; - int atom_virial_iter = kk * nall * 9; - - for (int ii = 0; ii < 9; ++ii) { - virial(virial_iter + ii) = 0.; - } - for (int ii = 0; ii < 9 * nall; ++ii) { - atom_virial(atom_virial_iter + ii) = 0.; - } - - // compute virial of a frame - for (int ii = 0; ii < nloc; ++ii) { - int i_idx = ii; - - // set axes - int axis0_type = axis(axis_iter + 
i_idx * 4 + 0); - int axis1_type = axis(axis_iter + i_idx * 4 + 2); - int axis_0 = axis(axis_iter + i_idx * 4 + 1); - int axis_1 = axis(axis_iter + i_idx * 4 + 3); - if (axis0_type == 1) axis_0 += n_a_sel; - if (axis1_type == 1) axis_1 += n_a_sel; - - // deriv wrt neighbors - for (int jj = 0; jj < nnei; ++jj) { - int j_idx = nlist(nlist_iter + i_idx * nnei + jj); - if (j_idx < 0) continue; - if (jj == axis_0) { - for (int aa = 0; aa < ndescrpt; ++aa) { - FPTYPE pref = -1.0 * net_deriv(net_iter + i_idx * ndescrpt + aa); - for (int dd0 = 0; dd0 < 3; ++dd0) { - for (int dd1 = 0; dd1 < 3; ++dd1) { - FPTYPE tmp_v = - pref * rij(rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * - in_deriv(in_iter + i_idx * ndescrpt * 12 + aa * 12 + 3 + - dd0); - virial(virial_iter + dd0 * 3 + dd1) += tmp_v; - atom_virial(atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) += - tmp_v; - } - } - } - } else if (jj == axis_1) { - for (int aa = 0; aa < ndescrpt; ++aa) { - FPTYPE pref = -1.0 * net_deriv(net_iter + i_idx * ndescrpt + aa); - for (int dd0 = 0; dd0 < 3; ++dd0) { - for (int dd1 = 0; dd1 < 3; ++dd1) { - FPTYPE tmp_v = - pref * rij(rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * - in_deriv(in_iter + i_idx * ndescrpt * 12 + aa * 12 + 6 + - dd0); - virial(virial_iter + dd0 * 3 + dd1) += tmp_v; - atom_virial(atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) += - tmp_v; - } - } - } - } else { - int aa_start, aa_end; - make_descript_range(aa_start, aa_end, jj); - for (int aa = aa_start; aa < aa_end; ++aa) { - FPTYPE pref = -1.0 * net_deriv(net_iter + i_idx * ndescrpt + aa); - for (int dd0 = 0; dd0 < 3; ++dd0) { - for (int dd1 = 0; dd1 < 3; ++dd1) { - FPTYPE tmp_v = - pref * rij(rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * - in_deriv(in_iter + i_idx * ndescrpt * 12 + aa * 12 + 9 + - dd0); - virial(virial_iter + dd0 * 3 + dd1) += tmp_v; - atom_virial(atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) += - tmp_v; - } - } - } - } - } - } - } - } - - private: - int n_r_sel, n_a_sel, n_a_shift; - inline void make_descript_range(int& idx_start, - int& idx_end, - const int& nei_idx) { - if (nei_idx < n_a_sel) { - idx_start = nei_idx * 4; - idx_end = nei_idx * 4 + 4; - } else { - idx_start = n_a_shift + (nei_idx - n_a_sel); - idx_end = n_a_shift + (nei_idx - n_a_sel) + 1; - } - } -}; - -#define REGISTER_CPU(T) \ - REGISTER_KERNEL_BUILDER( \ - Name("ProdVirial").Device(DEVICE_CPU).TypeConstraint("T"), \ - ProdVirialOp); -REGISTER_CPU(float); -REGISTER_CPU(double); diff --git a/source/lib/paddle_src/prod_virial.cu b/source/lib/paddle_src/prod_virial.cu deleted file mode 100644 index fe7abee63b..0000000000 --- a/source/lib/paddle_src/prod_virial.cu +++ /dev/null @@ -1,496 +0,0 @@ -#include -#include -#include -#include "paddle/extension.h" - -#include "device.h" -#include "prod_virial.h" -#include "gpu_cuda.h" - -#define CHECK_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") -#define CHECK_INPUT_DIM(x, value) PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".") -#define CHECK_INPUT_READY(x) PD_CHECK(x.IsInitialized(), #x " must be initialized before usage.") - -template -__global__ void atom_virial_reduction(FPTYPE* virial, - const FPTYPE* atom_virial, - const int nall) { - unsigned int bid = blockIdx.x; - unsigned int tid = threadIdx.x; - __shared__ FPTYPE data[THREADS_PER_BLOCK]; - data[tid] = (FPTYPE)0.; - for (int ii = tid; ii < nall; ii += THREADS_PER_BLOCK) { - data[tid] += atom_virial[ii * 9 + bid]; - } - __syncthreads(); - // do reduction in shared memory - for (int ii = THREADS_PER_BLOCK >> 1; ii > 0; ii >>= 
1) { - if (tid < ii) { - data[tid] += data[tid + ii]; - } - __syncthreads(); - } - // write result for this block to global memory - if (tid == 0) virial[bid] = data[0]; -} - -template -__global__ void virial_deriv_wrt_neighbors_a(FPTYPE* virial, - FPTYPE* atom_virial, - const FPTYPE* net_deriv, - const FPTYPE* in_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei) { - // idx -> nloc - // idy -> nnei - // idz = dd0 * 3 + dd1 - // dd0 = idz / 3 - // dd1 = idz % 3 - const int_64 idx = blockIdx.x; - const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x; - const unsigned int idz = threadIdx.y; - const int ndescrpt = nnei * 4; - if (idy >= nnei) { - return; - } - int j_idx = nlist[idx * nnei + idy]; - if (j_idx < 0) { - return; - } - // atomicAdd( - // virial + idz, - // net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 - // + idz / 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz % - // 3]); - FPTYPE virial_tmp = (FPTYPE)0.; - for (int idw = 0; idw < 4; ++idw) { - virial_tmp += net_deriv[idx * ndescrpt + idy * 4 + idw] * - rij[idx * nnei * 3 + idy * 3 + idz % 3] * - in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz / 3]; - } - atomicAdd(atom_virial + j_idx * 9 + idz, virial_tmp); -} - -template -__global__ void virial_deriv_wrt_neighbors_r(FPTYPE* virial, - FPTYPE* atom_virial, - const FPTYPE* net_deriv, - const FPTYPE* in_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei) { - // idx -> nloc - // idy -> nnei - // idz = dd0 * 3 + dd1 - // dd0 = idz / 3 - // dd1 = idz % 3 - const int_64 idx = blockIdx.x; - const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x; - const unsigned int idz = threadIdx.y; - const int ndescrpt = nnei * 1; - - if (idy >= nnei) { - return; - } - int j_idx = nlist[idx * nnei + idy]; - if (j_idx < 0) { - return; - } - // atomicAdd( - // virial + idz, - // net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 - // + idz / 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz % - // 3]); - atomicAdd(atom_virial + j_idx * 9 + idz, - net_deriv[idx * ndescrpt + idy] * - rij[idx * nnei * 3 + idy * 3 + idz % 3] * - in_deriv[idx * ndescrpt * 3 + idy * 3 + idz / 3]); -} - -namespace deepmd { -template -void prod_virial_a_gpu_cuda(FPTYPE* virial, - FPTYPE* atom_virial, - const FPTYPE* net_deriv, - const FPTYPE* in_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei) { - DPErrcheck(cudaMemset(virial, 0, sizeof(FPTYPE) * 9)); - DPErrcheck(cudaMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall)); - - const int LEN = 16; - int nblock = (nnei + LEN - 1) / LEN; - dim3 block_grid(nloc, nblock); - dim3 thread_grid(LEN, 9); - // compute virial of a frame - virial_deriv_wrt_neighbors_a<<>>( - virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nnei); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); - // reduction atom_virial to virial - atom_virial_reduction<<<9, TPB>>>(virial, atom_virial, nall); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); -} - -template -void prod_virial_r_gpu_cuda(FPTYPE* virial, - FPTYPE* atom_virial, - const FPTYPE* net_deriv, - const FPTYPE* in_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei) { - DPErrcheck(cudaMemset(virial, 0, sizeof(FPTYPE) * 9)); - DPErrcheck(cudaMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall)); - - const int LEN = 16; - int nblock = (nnei + 
LEN - 1) / LEN; - dim3 block_grid(nloc, nblock); - dim3 thread_grid(LEN, 9); - // compute virial of a frame - virial_deriv_wrt_neighbors_r<<>>( - virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nnei); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); - // reduction atom_virial to virial - atom_virial_reduction<<<9, TPB>>>(virial, atom_virial, nall); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); -} -} // namespace deepmd - -template -void PdProdVirialSeAOpForwardCUDAKernel( - int nloc, int nall, int ndescrpt, int nnei, int nframes, - data_t* p_virial, data_t* p_atom_virial, const data_t* p_net_deriv, - const data_t* p_in_deriv, const data_t* p_rij, const int* p_nlist){ - - for(int kk = 0; kk < nframes; ++kk){ - data_t * virial = p_virial + kk * 9; - data_t * atom_virial = p_atom_virial + kk * nall * 9; - const data_t * net_deriv = p_net_deriv + kk * nloc * ndescrpt; - const data_t * in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3; - const data_t * rij = p_rij + kk * nloc * nnei * 3; - const int * nlist = p_nlist + kk * nloc * nnei; - deepmd::prod_virial_a_gpu_cuda( - virial, atom_virial, - net_deriv, in_deriv, rij, nlist, nloc, nall, nnei); - } -} - - -std::vector PdProdVirialSeAOpCUDAForward( -const paddle::Tensor& net_deriv_tensor, -const paddle::Tensor& in_deriv_tensor, -const paddle::Tensor& rij_tensor, -const paddle::Tensor& nlist_tensor, -const paddle::Tensor& natoms_tensor, -int n_a_sel, -int n_r_sel -){ - CHECK_INPUT(net_deriv_tensor); - CHECK_INPUT(in_deriv_tensor); - CHECK_INPUT(rij_tensor); - CHECK_INPUT(nlist_tensor); - // CHECK_INPUT(natoms_tensor); // TODO: 暂时指定python端必须为cpu,gpu的copy_to会导致返回的指针数据不对 - - CHECK_INPUT_DIM(net_deriv_tensor, 2); - CHECK_INPUT_DIM(in_deriv_tensor, 2); - CHECK_INPUT_DIM(rij_tensor, 2); - CHECK_INPUT_DIM(nlist_tensor, 2); - CHECK_INPUT_DIM(natoms_tensor, 1); - - PD_CHECK(natoms_tensor.shape()[0] >= 3, "number of atoms should be larger than (or equal to) 3"); - const int* natoms = natoms_tensor.data(); - // printf("natoms_tensor.numel() = %d\n", natoms_tensor.numel()); - int nloc = natoms[0]; - // printf("nloc = %d\n", nloc); - int nall = natoms[1]; - // printf("nall = %d\n", nall); - int nnei = nlist_tensor.shape()[1] / nloc; - int nframes = net_deriv_tensor.shape()[0]; - int ndescrpt = net_deriv_tensor.shape()[1] / nloc; - PD_CHECK(nframes == in_deriv_tensor.shape()[0], "number of samples should match"); - PD_CHECK(nframes == rij_tensor.shape()[0], "number of samples should match"); - PD_CHECK(nframes == nlist_tensor.shape()[0],"number of samples should match"); - PD_CHECK(nloc * ndescrpt * 3 == in_deriv_tensor.shape()[1], "number of descriptors should match"); - PD_CHECK((nloc * nnei * 3) == rij_tensor.shape()[1], "dim of rij should be nnei * 3"); - - std::vector virial_shape {nframes, 9}; - std::vector atom_virial_shape {nframes, 9 * nall}; - paddle::Tensor virial_tensor = paddle::Tensor(paddle::PlaceType::kGPU, virial_shape); - paddle::Tensor atom_virial_tensor = paddle::Tensor(paddle::PlaceType::kGPU, atom_virial_shape); - - PD_DISPATCH_FLOATING_TYPES( - net_deriv_tensor.type(), "pd_prod_virial_se_a_cpu_forward_kernel", ([&] { - PdProdVirialSeAOpForwardCUDAKernel( - nloc, nall, ndescrpt, nnei, nframes, - virial_tensor.mutable_data(), atom_virial_tensor.mutable_data(), - net_deriv_tensor.data(), in_deriv_tensor.data(), - rij_tensor.data(), nlist_tensor.data()); - })); - - return {virial_tensor, atom_virial_tensor}; -} - -std::vector PdProdVirialSeAForward( - const paddle::Tensor& 
net_deriv_tensor, - const paddle::Tensor& in_deriv_tensor, - const paddle::Tensor& rij_tensor, - const paddle::Tensor& nlist_tensor, - const paddle::Tensor& natoms_tensor, - int n_a_sel, - int n_r_sel -) { - if (net_deriv_tensor.is_gpu()) { - // std::cout << natoms_tensor.dtype() << std::endl; - // const int* natoms = natoms_tensor.data(); - // printf("%d\n", natoms[0]); - return PdProdVirialSeAOpCUDAForward( - net_deriv_tensor, - in_deriv_tensor, - rij_tensor, - nlist_tensor, - natoms_tensor, - n_a_sel, - n_r_sel - ); - } else { - PD_THROW("Unsupported device type for forward function of custom relu operator."); - } -} - - -/*以下是反向代码*/ - -// template -// void PdProdForceSeAOpCPUBackwardKernel( -// int nloc, int nframes, int ndescrpt, int nnei, -// const data_t* grad, const data_t* net_deriv, -// const data_t* in_deriv, const data_t* rij, const int* nlist, -// data_t* grad_net){ - -// #pragma omp parallel for -// for (int kk = 0; kk < nframes; ++kk){ - -// int grad_iter = kk * 9; -// int in_iter = kk * nloc * ndescrpt * 3; -// int rij_iter = kk * nloc * nnei * 3; -// int nlist_iter = kk * nloc * nnei; -// int grad_net_iter = kk * nloc * ndescrpt; -// // int n_a_shift = n_a_sel * 4; - -// deepmd::prod_virial_grad_a_cpu( -// &grad_net[grad_net_iter], -// &grad[grad_iter], -// &in_deriv[in_iter], -// &rij[rij_iter], -// &nlist[nlist_iter], -// nloc, -// nnei); -// } -// } - - -// template -// void PdProdForceSeAOpGPUBackwardKernel( -// int nloc, int nframes, int ndescrpt, int nnei, -// const data_t* virial_grad, const data_t* net_deriv, -// const data_t* in_deriv, const data_t* rij, const int* nlist, -// data_t* grad_net -// ) { -// for (int_64 kk = 0; kk < nframes; ++kk) { -// FPTYPE* grad_net = p_grad_net + kk * nloc * ndescrpt; -// const FPTYPE* virial_grad = p_grad + kk * 9; -// const FPTYPE* in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3; -// const FPTYPE* rij = p_rij + kk * nloc * nnei * 3; -// const int* nlist = p_nlist + kk * nloc * nnei; -// if (device == "GPU") { -// deepmd::prod_virial_grad_a_gpu_cuda( -// grad_net, virial_grad, in_deriv, rij, nlist, nloc, nnei -// ); -// } -// } -// } - - -// std::vector PdProdVirialSeAOpCPUBackward( -// const paddle::Tensor& virial_grad_tensor, -// const paddle::Tensor& net_deriv_tensor, -// const paddle::Tensor& in_deriv_tensor, -// const paddle::Tensor& rij_tensor, -// const paddle::Tensor& nlist_tensor, -// const paddle::Tensor& natoms_tensor, -// int n_a_sel, -// int n_r_sel -// ){ -// // CHECK_INPUT_READY(virial_grad_tensor); -// // CHECK_INPUT_READY(net_deriv_tensor); -// // CHECK_INPUT_READY(in_deriv_tensor); -// // CHECK_INPUT_READY(rij_tensor); -// // CHECK_INPUT_READY(nlist_tensor); -// // CHECK_INPUT_READY(natoms_tensor); - -// auto grad_shape = virial_grad_tensor.shape(); -// auto net_deriv_shape = net_deriv_tensor.shape(); -// auto in_deriv_shape = in_deriv_tensor.shape(); -// auto rij_shape = rij_tensor.shape(); -// auto nlist_shape = nlist_tensor.shape(); -// auto natoms_shape = natoms_tensor.shape(); - -// CHECK_INPUT_DIM(virial_grad_tensor, 2); -// CHECK_INPUT_DIM(net_deriv_tensor, 2); -// CHECK_INPUT_DIM(in_deriv_tensor, 2); -// CHECK_INPUT_DIM(rij_tensor, 2); -// CHECK_INPUT_DIM(nlist_tensor, 2); -// CHECK_INPUT_DIM(natoms_tensor, 1); - -// PD_CHECK(natoms_shape[0] >= 3, "number of atoms should be larger than (or equal to) 3"); - -// const int* natoms = nullptr; -// // if(natoms_tensor.place() != paddle::PlaceType::kCPU){ -// // natoms = natoms_tensor.copy_to(paddle::PlaceType::kCPU).data(); -// // }else{ -// natoms = 
natoms_tensor.data(); -// // } -// int nframes = net_deriv_shape[0]; -// int nloc = natoms[0]; -// int ndescrpt = net_deriv_shape[1] / nloc; -// int nnei = nlist_shape[1] / nloc; - -// PD_CHECK(nframes == grad_shape[0], "number of frames should match"); -// PD_CHECK(nframes == in_deriv_shape[0], "number of samples should match"); -// PD_CHECK(nframes == rij_shape[0], "number of frames should match"); -// PD_CHECK(nframes == nlist_shape[0],"number of samples should match"); -// PD_CHECK(9 == grad_shape[1], "input grad shape should be 3 x natoms"); -// PD_CHECK(nloc * ndescrpt * 3 == in_deriv_shape[1], "number of descriptors should match"); -// PD_CHECK(nloc * nnei * 3 == rij_shape[1], "dim of rij should be nnei * 3"); -// PD_CHECK(nnei == (n_a_sel + n_r_sel), "number of neighbors should match"); - -// std::vector grad_net_shape {nframes, nloc * ndescrpt}; -// // paddle::Tensor grad_net_tensor = paddle::Tensor(paddle::PlaceType::kCPU, grad_net_shape); -// paddle::Tensor grad_net_tensor = paddle::empty( -// grad_net_shape, -// virial_grad_tensor.dtype(), -// virial_grad_tensor.place() -// ); - -// if(virial_grad_tensor.place() == paddle::PlaceType::kCPU){ -// PD_DISPATCH_FLOATING_TYPES( -// virial_grad_tensor.type(), "pd_prod_force_se_a_cpu_backward_kernel", ([&] { -// PdProdForceSeAOpCPUBackwardKernel( -// nloc, nframes, ndescrpt, nnei, -// virial_grad_tensor.data(), -// net_deriv_tensor.data(), -// in_deriv_tensor.data(), -// rij_tensor.data(), nlist_tensor.data(), -// grad_net_tensor.mutable_data()); -// })); -// }else{ -// PD_DISPATCH_FLOATING_TYPES( -// virial_grad_tensor.type(), "pd_prod_force_se_a_cpu_backward_kernel", ([&] { -// PdProdForceSeAOpGPUBackwardKernel( -// nloc, nframes, ndescrpt, nnei, -// virial_grad_tensor.data(), -// net_deriv_tensor.data(), -// in_deriv_tensor.data(), -// rij_tensor.data(), -// nlist_tensor.data(), -// grad_net_tensor.mutable_data()); -// })); -// } -// return {grad_net_tensor}; -// } - - -// std::vector PdProdVirialSeABackward( -// const paddle::Tensor& virial_grad_tensor, -// const paddle::Tensor& net_deriv_tensor, -// const paddle::Tensor& in_deriv_tensor, -// const paddle::Tensor& rij_tensor, -// const paddle::Tensor& nlist_tensor, -// const paddle::Tensor& natoms_tensor, -// int n_a_sel, -// int n_r_sel -// ) { -// return PdProdVirialSeAOpCPUBackward( -// virial_grad_tensor, net_deriv_tensor, in_deriv_tensor, -// rij_tensor, -// nlist_tensor, natoms_tensor, n_a_sel, n_r_sel -// ); -// } - -// std::vector> PdProdVirialSeAInferShape( -// // std::vector x_shape -// std::vector net_deriv_shape, -// std::vector in_deriv_shape, -// std::vector rij_shape, -// std::vector nlist_shape, -// std::vector natoms_shape, -// int n_a_sel, -// int n_r_sel -// ) { -// auto virial_shape {nframes, 9}; -// auto atom_virial_shape {nframes, 9 * nall}; -// return {virial_shape, atom_virial_shape}; -// } - -// std::vector PdProdVirialSeAInferDtype(paddle::DataType x_dtype, ...) 
{ -// return {x_dtype, ...}; -// } - -std::vector> PdProdVirialSeAInferShape( - std::vector net_deriv_shape, - std::vector in_deriv_shape, - std::vector rij_shape, - std::vector nlist_shape, - std::vector natoms_shape, - const int &n_a_sel, - const int &n_r_sel -) { - // int64_t nloc = /*natoms[0]*/ 192; - int64_t nall = /*natoms[1]*/ 192; - int64_t nframes = net_deriv_shape[0]; - - std::vector virial_shape = {nframes, 9}; - std::vector atom_virial_shape = {nframes, 9 * nall}; - - return {virial_shape, atom_virial_shape}; -} - -std::vector PdProdVirialSeAInferDtype( - paddle::DataType net_deriv_dtype, - paddle::DataType in_deriv_dtype, - paddle::DataType rij_dtype, - paddle::DataType nlist_dtype, - paddle::DataType natoms_dtype -) { - return {net_deriv_dtype, net_deriv_dtype}; -} - - -PD_BUILD_OP(prod_virial_se_a) - .Inputs({"net_deriv", "in_deriv", "rij", "nlist", "natoms"}) - .Outputs({"virial", "atom_virial"}) - .Attrs({"n_a_sel: int", "n_r_sel: int"}) - .SetKernelFn(PD_KERNEL(PdProdVirialSeAForward)) - .SetInferShapeFn(PD_INFER_SHAPE(PdProdVirialSeAInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(PdProdVirialSeAInferDtype)); - - -// PD_BUILD_GRAD_OP(prod_virial_se_a) -// .Inputs({paddle::Grad("virial"), "net_deriv", "in_deriv", "rij", "nlist", "natoms"}) -// .Outputs({paddle::Grad("net_deriv")}) -// .Attrs({"n_a_sel: int", "n_r_sel: int"}) -// .SetKernelFn(PD_KERNEL(PdProdVirialSeABackward)); - - diff --git a/source/lib/paddle_src/prod_virial.h b/source/lib/paddle_src/prod_virial.h deleted file mode 100644 index c51e333a47..0000000000 --- a/source/lib/paddle_src/prod_virial.h +++ /dev/null @@ -1,75 +0,0 @@ -#pragma once - -namespace deepmd { - -template -void prod_virial_a_cpu(FPTYPE* virial, - FPTYPE* atom_virial, - const FPTYPE* net_deriv, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei); - -template -void prod_virial_r_cpu(FPTYPE* virial, - FPTYPE* atom_virial, - const FPTYPE* net_deriv, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei); - -#if GOOGLE_CUDA -template -void prod_virial_a_gpu_cuda(FPTYPE* virial, - FPTYPE* atom_virial, - const FPTYPE* net_deriv, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei); - -template -void prod_virial_r_gpu_cuda(FPTYPE* virial, - FPTYPE* atom_virial, - const FPTYPE* net_deriv, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM -template -void prod_virial_a_gpu_rocm(FPTYPE* virial, - FPTYPE* atom_virial, - const FPTYPE* net_deriv, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei); - -template -void prod_virial_r_gpu_rocm(FPTYPE* virial, - FPTYPE* atom_virial, - const FPTYPE* net_deriv, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei); -#endif // TENSORFLOW_USE_ROCM - -} // namespace deepmd diff --git a/source/lib/paddle_src/prod_virial_grad.cc b/source/lib/paddle_src/prod_virial_grad.cc deleted file mode 100644 index 14ba158cc1..0000000000 --- a/source/lib/paddle_src/prod_virial_grad.cc +++ /dev/null @@ -1,138 +0,0 @@ -#include "prod_virial_grad.h" - -#include -#include - -#include "errors.h" - -inline void make_index_range(int& idx_start, - int& idx_end, - const int& 
nei_idx, - const int& nnei) { - if (nei_idx < nnei) { - idx_start = nei_idx * 4; - idx_end = nei_idx * 4 + 4; - } else { - throw deepmd::deepmd_exception("should no reach here"); - } -} - -template -void deepmd::prod_virial_grad_a_cpu(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei) { - const int ndescrpt = nnei * 4; - - // reset the frame to 0 - for (int ii = 0; ii < nloc; ++ii) { - for (int aa = 0; aa < ndescrpt; ++aa) { - grad_net[ii * ndescrpt + aa] = 0; - } - } - -// compute grad of one frame -#pragma omp parallel for - for (int ii = 0; ii < nloc; ++ii) { - int i_idx = ii; - - // loop over neighbors - for (int jj = 0; jj < nnei; ++jj) { - int j_idx = nlist[i_idx * nnei + jj]; - if (j_idx < 0) continue; - int aa_start, aa_end; - make_index_range(aa_start, aa_end, jj, nnei); - for (int aa = aa_start; aa < aa_end; ++aa) { - for (int dd0 = 0; dd0 < 3; ++dd0) { - for (int dd1 = 0; dd1 < 3; ++dd1) { - grad_net[i_idx * ndescrpt + aa] -= - -1.0 * grad[dd0 * 3 + dd1] * - rij[i_idx * nnei * 3 + jj * 3 + dd1] * - env_deriv[i_idx * ndescrpt * 3 + aa * 3 + dd0]; - } - } - } - } - } -} - -template void deepmd::prod_virial_grad_a_cpu(double* grad_net, - const double* grad, - const double* env_deriv, - const double* rij, - const int* nlist, - const int nloc, - const int nnei); - -template void deepmd::prod_virial_grad_a_cpu(float* grad_net, - const float* grad, - const float* env_deriv, - const float* rij, - const int* nlist, - const int nloc, - const int nnei); - -template -void deepmd::prod_virial_grad_r_cpu(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei) -// -// grad_net: nloc x ndescrpt -// grad: 9 -// env_deriv: nloc x ndescrpt x 3 -// rij: nloc x nnei x 3 -// nlist: nloc x nnei -// -{ - const int ndescrpt = nnei * 1; - - // reset the frame to 0 - for (int ii = 0; ii < nloc; ++ii) { - for (int aa = 0; aa < ndescrpt; ++aa) { - grad_net[ii * ndescrpt + aa] = 0; - } - } - -// compute grad of one frame -#pragma omp parallel for - for (int ii = 0; ii < nloc; ++ii) { - int i_idx = ii; - - // loop over neighbors - for (int jj = 0; jj < nnei; ++jj) { - int j_idx = nlist[i_idx * nnei + jj]; - if (j_idx < 0) continue; - for (int dd0 = 0; dd0 < 3; ++dd0) { - for (int dd1 = 0; dd1 < 3; ++dd1) { - grad_net[i_idx * ndescrpt + jj] -= - -1.0 * grad[dd0 * 3 + dd1] * - rij[i_idx * nnei * 3 + jj * 3 + dd1] * - env_deriv[i_idx * ndescrpt * 3 + jj * 3 + dd0]; - } - } - } - } -} - -template void deepmd::prod_virial_grad_r_cpu(double* grad_net, - const double* grad, - const double* env_deriv, - const double* rij, - const int* nlist, - const int nloc, - const int nnei); - -template void deepmd::prod_virial_grad_r_cpu(float* grad_net, - const float* grad, - const float* env_deriv, - const float* rij, - const int* nlist, - const int nloc, - const int nnei); diff --git a/source/lib/paddle_src/prod_virial_grad.cu b/source/lib/paddle_src/prod_virial_grad.cu deleted file mode 100644 index 6205c4bdf8..0000000000 --- a/source/lib/paddle_src/prod_virial_grad.cu +++ /dev/null @@ -1,536 +0,0 @@ -#include -#include -#include -#include "paddle/extension.h" - -#include "device.h" -#include "prod_virial.h" -#include "gpu_cuda.h" - -#define CHECK_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") -#define CHECK_INPUT_DIM(x, value) PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".") -#define CHECK_INPUT_READY(x) 
PD_CHECK(x.IsInitialized(), #x " must be initialized before usage.") - -// template -// __global__ void atom_virial_reduction(FPTYPE* virial, -// const FPTYPE* atom_virial, -// const int nall) { -// unsigned int bid = blockIdx.x; -// unsigned int tid = threadIdx.x; -// __shared__ FPTYPE data[THREADS_PER_BLOCK]; -// data[tid] = (FPTYPE)0.; -// for (int ii = tid; ii < nall; ii += THREADS_PER_BLOCK) { -// data[tid] += atom_virial[ii * 9 + bid]; -// } -// __syncthreads(); -// // do reduction in shared memory -// for (int ii = THREADS_PER_BLOCK >> 1; ii > 0; ii >>= 1) { -// if (tid < ii) { -// data[tid] += data[tid + ii]; -// } -// __syncthreads(); -// } -// // write result for this block to global memory -// if (tid == 0) virial[bid] = data[0]; -// } - -// template -// __global__ void virial_deriv_wrt_neighbors_a(FPTYPE* virial, -// FPTYPE* atom_virial, -// const FPTYPE* net_deriv, -// const FPTYPE* in_deriv, -// const FPTYPE* rij, -// const int* nlist, -// const int nloc, -// const int nnei) { -// // idx -> nloc -// // idy -> nnei -// // idz = dd0 * 3 + dd1 -// // dd0 = idz / 3 -// // dd1 = idz % 3 -// const int_64 idx = blockIdx.x; -// const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x; -// const unsigned int idz = threadIdx.y; -// const int ndescrpt = nnei * 4; -// if (idy >= nnei) { -// return; -// } -// int j_idx = nlist[idx * nnei + idy]; -// if (j_idx < 0) { -// return; -// } -// // atomicAdd( -// // virial + idz, -// // net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 -// // + idz / 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz % -// // 3]); -// FPTYPE virial_tmp = (FPTYPE)0.; -// for (int idw = 0; idw < 4; ++idw) { -// virial_tmp += net_deriv[idx * ndescrpt + idy * 4 + idw] * -// rij[idx * nnei * 3 + idy * 3 + idz % 3] * -// in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz / 3]; -// } -// atomicAdd(atom_virial + j_idx * 9 + idz, virial_tmp); -// } - -// template -// __global__ void virial_deriv_wrt_neighbors_r(FPTYPE* virial, -// FPTYPE* atom_virial, -// const FPTYPE* net_deriv, -// const FPTYPE* in_deriv, -// const FPTYPE* rij, -// const int* nlist, -// const int nloc, -// const int nnei) { -// // idx -> nloc -// // idy -> nnei -// // idz = dd0 * 3 + dd1 -// // dd0 = idz / 3 -// // dd1 = idz % 3 -// const int_64 idx = blockIdx.x; -// const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x; -// const unsigned int idz = threadIdx.y; -// const int ndescrpt = nnei * 1; - -// if (idy >= nnei) { -// return; -// } -// int j_idx = nlist[idx * nnei + idy]; -// if (j_idx < 0) { -// return; -// } -// // atomicAdd( -// // virial + idz, -// // net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 -// // + idz / 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz % -// // 3]); -// atomicAdd(atom_virial + j_idx * 9 + idz, -// net_deriv[idx * ndescrpt + idy] * -// rij[idx * nnei * 3 + idy * 3 + idz % 3] * -// in_deriv[idx * ndescrpt * 3 + idy * 3 + idz / 3]); -// } - -// namespace deepmd { -// template -// void prod_virial_a_gpu_cuda(FPTYPE* virial, -// FPTYPE* atom_virial, -// const FPTYPE* net_deriv, -// const FPTYPE* in_deriv, -// const FPTYPE* rij, -// const int* nlist, -// const int nloc, -// const int nall, -// const int nnei) { -// DPErrcheck(cudaMemset(virial, 0, sizeof(FPTYPE) * 9)); -// DPErrcheck(cudaMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall)); - -// const int LEN = 16; -// int nblock = (nnei + LEN - 1) / LEN; -// dim3 block_grid(nloc, nblock); -// dim3 thread_grid(LEN, 9); -// // compute 
virial of a frame -// virial_deriv_wrt_neighbors_a<<>>( -// virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nnei); -// DPErrcheck(cudaGetLastError()); -// DPErrcheck(cudaDeviceSynchronize()); -// // reduction atom_virial to virial -// atom_virial_reduction<<<9, TPB>>>(virial, atom_virial, nall); -// DPErrcheck(cudaGetLastError()); -// DPErrcheck(cudaDeviceSynchronize()); -// } - -// template -// void prod_virial_r_gpu_cuda(FPTYPE* virial, -// FPTYPE* atom_virial, -// const FPTYPE* net_deriv, -// const FPTYPE* in_deriv, -// const FPTYPE* rij, -// const int* nlist, -// const int nloc, -// const int nall, -// const int nnei) { -// DPErrcheck(cudaMemset(virial, 0, sizeof(FPTYPE) * 9)); -// DPErrcheck(cudaMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall)); - -// const int LEN = 16; -// int nblock = (nnei + LEN - 1) / LEN; -// dim3 block_grid(nloc, nblock); -// dim3 thread_grid(LEN, 9); -// // compute virial of a frame -// virial_deriv_wrt_neighbors_r<<>>( -// virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nnei); -// DPErrcheck(cudaGetLastError()); -// DPErrcheck(cudaDeviceSynchronize()); -// // reduction atom_virial to virial -// atom_virial_reduction<<<9, TPB>>>(virial, atom_virial, nall); -// DPErrcheck(cudaGetLastError()); -// DPErrcheck(cudaDeviceSynchronize()); -// } -// } // namespace deepmd - -// template -// void PdProdVirialSeAOpForwardCUDAKernel( -// int nloc, int nall, int ndescrpt, int nnei, int nframes, -// data_t* p_virial, data_t* p_atom_virial, const data_t* p_net_deriv, -// const data_t* p_in_deriv, const data_t* p_rij, const int* p_nlist){ - -// for(int kk = 0; kk < nframes; ++kk){ -// data_t * virial = p_virial + kk * 9; -// data_t * atom_virial = p_atom_virial + kk * nall * 9; -// const data_t * net_deriv = p_net_deriv + kk * nloc * ndescrpt; -// const data_t * in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3; -// const data_t * rij = p_rij + kk * nloc * nnei * 3; -// const int * nlist = p_nlist + kk * nloc * nnei; -// deepmd::prod_virial_a_gpu_cuda( -// virial, atom_virial, -// net_deriv, in_deriv, rij, nlist, nloc, nall, nnei); -// } -// } - -// std::vector PdProdVirialSeAOpCUDAForward( -// const paddle::Tensor& net_deriv_tensor, -// const paddle::Tensor& in_deriv_tensor, -// const paddle::Tensor& rij_tensor, -// const paddle::Tensor& nlist_tensor, -// const paddle::Tensor& natoms_tensor, -// int n_a_sel, -// int n_r_sel -// ){ -// CHECK_INPUT(net_deriv_tensor); -// CHECK_INPUT(in_deriv_tensor); -// CHECK_INPUT(rij_tensor); -// CHECK_INPUT(nlist_tensor); -// // CHECK_INPUT(natoms_tensor); // TODO: 暂时指定python端必须为cpu,gpu的copy_to会导致返回的指针数据不对 - -// CHECK_INPUT_DIM(net_deriv_tensor, 2); -// CHECK_INPUT_DIM(in_deriv_tensor, 2); -// CHECK_INPUT_DIM(rij_tensor, 2); -// CHECK_INPUT_DIM(nlist_tensor, 2); -// CHECK_INPUT_DIM(natoms_tensor, 1); - -// PD_CHECK(natoms_tensor.shape()[0] >= 3, "number of atoms should be larger than (or equal to) 3"); -// const int* natoms = natoms_tensor.data(); -// // printf("natoms_tensor.numel() = %d\n", natoms_tensor.numel()); -// int nloc = natoms[0]; -// // printf("nloc = %d\n", nloc); -// int nall = natoms[1]; -// // printf("nall = %d\n", nall); -// int nnei = nlist_tensor.shape()[1] / nloc; -// int nframes = net_deriv_tensor.shape()[0]; -// int ndescrpt = net_deriv_tensor.shape()[1] / nloc; -// PD_CHECK(nframes == in_deriv_tensor.shape()[0], "number of samples should match"); -// PD_CHECK(nframes == rij_tensor.shape()[0], "number of samples should match"); -// PD_CHECK(nframes == nlist_tensor.shape()[0],"number of samples 
should match"); -// PD_CHECK(nloc * ndescrpt * 3 == in_deriv_tensor.shape()[1], "number of descriptors should match"); -// PD_CHECK((nloc * nnei * 3) == rij_tensor.shape()[1], "dim of rij should be nnei * 3"); - -// std::vector virial_shape {nframes, 9}; -// std::vector atom_virial_shape {nframes, 9 * nall}; -// paddle::Tensor virial_tensor = paddle::Tensor(paddle::PlaceType::kGPU, virial_shape); -// paddle::Tensor atom_virial_tensor = paddle::Tensor(paddle::PlaceType::kGPU, atom_virial_shape); - -// PD_DISPATCH_FLOATING_TYPES( -// net_deriv_tensor.type(), "pd_prod_virial_se_a_cpu_forward_kernel", ([&] { -// PdProdVirialSeAOpForwardCUDAKernel( -// nloc, nall, ndescrpt, nnei, nframes, -// virial_tensor.mutable_data(), atom_virial_tensor.mutable_data(), -// net_deriv_tensor.data(), in_deriv_tensor.data(), -// rij_tensor.data(), nlist_tensor.data()); -// })); - -// return {virial_tensor, atom_virial_tensor}; -// } - -// std::vector PdProdVirialSeAForward( -// const paddle::Tensor& net_deriv_tensor, -// const paddle::Tensor& in_deriv_tensor, -// const paddle::Tensor& rij_tensor, -// const paddle::Tensor& nlist_tensor, -// const paddle::Tensor& natoms_tensor, -// int n_a_sel, -// int n_r_sel -// ) { -// if (net_deriv_tensor.is_gpu()) { -// // std::cout << natoms_tensor.dtype() << std::endl; -// // const int* natoms = natoms_tensor.data(); -// // printf("%d\n", natoms[0]); -// return PdProdVirialSeAOpCUDAForward( -// net_deriv_tensor, -// in_deriv_tensor, -// rij_tensor, -// nlist_tensor, -// natoms_tensor, -// n_a_sel, -// n_r_sel -// ); -// } else { -// PD_THROW("Unsupported device type for forward function of custom relu operator."); -// } -// } - - -/*以下是反向代码*/ - -// template -// void PdProdForceSeAOpCPUBackwardKernel( -// int nloc, int nframes, int ndescrpt, int nnei, -// const data_t* grad, const data_t* net_deriv, -// const data_t* in_deriv, const data_t* rij, const int* nlist, -// data_t* grad_net){ - -// #pragma omp parallel for -// for (int kk = 0; kk < nframes; ++kk){ - -// int grad_iter = kk * 9; -// int in_iter = kk * nloc * ndescrpt * 3; -// int rij_iter = kk * nloc * nnei * 3; -// int nlist_iter = kk * nloc * nnei; -// int grad_net_iter = kk * nloc * ndescrpt; -// // int n_a_shift = n_a_sel * 4; - -// deepmd::prod_virial_grad_a_cpu( -// &grad_net[grad_net_iter], -// &grad[grad_iter], -// &in_deriv[in_iter], -// &rij[rij_iter], -// &nlist[nlist_iter], -// nloc, -// nnei); -// } -// } - - -template -__device__ inline FPTYPE dev_dot9(const FPTYPE* arr1, const FPTYPE* arr2) { - FPTYPE result = (FPTYPE)0.0; - for (int ii = 0; ii < 9; ii++) { - result += arr1[ii] * arr2[ii]; - } - return result; -} - - -template -__global__ void virial_grad_wrt_neighbors_a(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei) { - // idy -> nnei - const unsigned int tid = threadIdx.x; - const int_64 idx = blockIdx.x * blockDim.x + tid; - const unsigned int idy = blockIdx.y; - const unsigned int idw = threadIdx.y; - const int ndescrpt = nnei * 4; - __shared__ FPTYPE grad_one[9]; - if (tid < 9) { - grad_one[tid] = grad[tid]; - } - __syncthreads(); - if (idx >= nloc) { - return; - } - int j_idx = nlist[idx * nnei + idy]; - if (j_idx < 0) { - return; - } - FPTYPE tmp[9]; - for (int dd0 = 0; dd0 < 3; ++dd0) { - for (int dd1 = 0; dd1 < 3; ++dd1) { - tmp[dd0 * 3 + dd1] = - rij[idx * nnei * 3 + idy * 3 + dd1] * - env_deriv[idx * ndescrpt * 3 + idy * 4 * 3 + idw * 3 + dd0]; - } - } - grad_net[idx * ndescrpt + idy * 4 + idw] 
-= - (FPTYPE)-1.0 * dev_dot9(grad_one, tmp); -} - - -namespace deepmd { -template -void prod_virial_grad_a_gpu_cuda(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei) { - const int ndescrpt = nnei * 4; - DPErrcheck(cudaMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt)); - const int LEN = 128; - const int nblock = (nloc + LEN - 1) / LEN; - dim3 block_grid(nblock, nnei); - dim3 thread_grid(LEN, 4); - virial_grad_wrt_neighbors_a<<>>( - grad_net, grad, env_deriv, rij, nlist, nloc, nnei); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); -} - - -template void prod_virial_grad_a_gpu_cuda(float* grad_net, - const float* grad, - const float* env_deriv, - const float* rij, - const int* nlist, - const int nloc, - const int nnei); -template void prod_virial_grad_a_gpu_cuda(double* grad_net, - const double* grad, - const double* env_deriv, - const double* rij, - const int* nlist, - const int nloc, - const int nnei); -} // namespace deepmd - - -template -void PdProdForceSeAOpGPUBackwardKernel( - int nloc, int nframes, int ndescrpt, int nnei, - const data_t* virial_grad, const data_t* net_deriv, - const data_t* in_deriv, const data_t* rij, const int* nlist, - data_t* grad_net -) { - data_t* p_grad_net = grad_net; - const data_t* p_grad = virial_grad; - const data_t* p_in_deriv = in_deriv; - const data_t* p_rij = rij; - const int* p_nlist = nlist; - for (int_64 kk = 0; kk < nframes; ++kk) { - data_t* grad_net = p_grad_net + kk * nloc * ndescrpt; - const data_t* virial_grad = p_grad + kk * 9; - const data_t* in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3; - const data_t* rij = p_rij + kk * nloc * nnei * 3; - const int* nlist = p_nlist + kk * nloc * nnei; - // if (device == "GPU") { - deepmd::prod_virial_grad_a_gpu_cuda( - grad_net, virial_grad, in_deriv, rij, nlist, nloc, nnei - ); - // } - } -} - - -std::vector PdProdVirialSeAOpCUDABackward( - const paddle::Tensor& virial_grad_tensor, - const paddle::Tensor& net_deriv_tensor, - const paddle::Tensor& in_deriv_tensor, - const paddle::Tensor& rij_tensor, - const paddle::Tensor& nlist_tensor, - const paddle::Tensor& natoms_tensor, - int n_a_sel, - int n_r_sel -){ - // CHECK_INPUT_READY(virial_grad_tensor); - // CHECK_INPUT_READY(net_deriv_tensor); - // CHECK_INPUT_READY(in_deriv_tensor); - // CHECK_INPUT_READY(rij_tensor); - // CHECK_INPUT_READY(nlist_tensor); - // CHECK_INPUT_READY(natoms_tensor); - - auto grad_shape = virial_grad_tensor.shape(); - auto net_deriv_shape = net_deriv_tensor.shape(); - auto in_deriv_shape = in_deriv_tensor.shape(); - auto rij_shape = rij_tensor.shape(); - auto nlist_shape = nlist_tensor.shape(); - auto natoms_shape = natoms_tensor.shape(); - - CHECK_INPUT_DIM(virial_grad_tensor, 2); - CHECK_INPUT_DIM(net_deriv_tensor, 2); - CHECK_INPUT_DIM(in_deriv_tensor, 2); - CHECK_INPUT_DIM(rij_tensor, 2); - CHECK_INPUT_DIM(nlist_tensor, 2); - CHECK_INPUT_DIM(natoms_tensor, 1); - - PD_CHECK(natoms_shape[0] >= 3, "number of atoms should be larger than (or equal to) 3"); - - const int* natoms = nullptr; - // if(natoms_tensor.place() != paddle::PlaceType::kCPU){ - // natoms = natoms_tensor.copy_to(paddle::PlaceType::kCPU).data(); - // }else{ - natoms = natoms_tensor.data(); - // } - int nframes = net_deriv_shape[0]; - int nloc = natoms[0]; - int ndescrpt = net_deriv_shape[1] / nloc; - int nnei = nlist_shape[1] / nloc; - - PD_CHECK(nframes == grad_shape[0], "number of frames should match"); - PD_CHECK(nframes == 
in_deriv_shape[0], "number of samples should match"); - PD_CHECK(nframes == rij_shape[0], "number of frames should match"); - PD_CHECK(nframes == nlist_shape[0],"number of samples should match"); - PD_CHECK(9 == grad_shape[1], "input grad shape should be 3 x natoms"); - PD_CHECK(nloc * ndescrpt * 3 == in_deriv_shape[1], "number of descriptors should match"); - PD_CHECK(nloc * nnei * 3 == rij_shape[1], "dim of rij should be nnei * 3"); - PD_CHECK(nnei == (n_a_sel + n_r_sel), "number of neighbors should match"); - - std::vector grad_net_shape {nframes, nloc * ndescrpt}; - // paddle::Tensor grad_net_tensor = paddle::Tensor(paddle::PlaceType::kCPU, grad_net_shape); - paddle::Tensor grad_net_tensor = paddle::empty( - grad_net_shape, - virial_grad_tensor.dtype(), - virial_grad_tensor.place() - ); - - // if(virial_grad_tensor.place() == paddle::PlaceType::kCPU){ - // PD_DISPATCH_FLOATING_TYPES( - // virial_grad_tensor.type(), "pd_prod_force_se_a_cpu_backward_kernel", ([&] { - // PdProdForceSeAOpCPUBackwardKernel( - // nloc, nframes, ndescrpt, nnei, - // virial_grad_tensor.data(), - // net_deriv_tensor.data(), - // in_deriv_tensor.data(), - // rij_tensor.data(), nlist_tensor.data(), - // grad_net_tensor.mutable_data()); - // })); - // }else{ - PD_DISPATCH_FLOATING_TYPES( - virial_grad_tensor.type(), "pd_prod_force_se_a_cuda_backward_kernel", ([&] { - PdProdForceSeAOpGPUBackwardKernel( - nloc, nframes, ndescrpt, nnei, - virial_grad_tensor.data(), - net_deriv_tensor.data(), - in_deriv_tensor.data(), - rij_tensor.data(), - nlist_tensor.data(), - grad_net_tensor.mutable_data()); - })); - // } - return {grad_net_tensor}; -} - - -std::vector PdProdVirialSeABackward( - const paddle::Tensor& virial_grad_tensor, - const paddle::Tensor& net_deriv_tensor, - const paddle::Tensor& in_deriv_tensor, - const paddle::Tensor& rij_tensor, - const paddle::Tensor& nlist_tensor, - const paddle::Tensor& natoms_tensor, - int n_a_sel, - int n_r_sel -) { - return PdProdVirialSeAOpCUDABackward( - virial_grad_tensor, net_deriv_tensor, in_deriv_tensor, - rij_tensor, - nlist_tensor, natoms_tensor, n_a_sel, n_r_sel - ); -} - - -// PD_BUILD_OP(prod_virial_se_a) -// .Inputs({"net_deriv", "in_deriv", "rij", "nlist", "natoms"}) -// .Outputs({"virial", "atom_virial"}) -// .Attrs({"n_a_sel: int", "n_r_sel: int"}) -// .SetKernelFn(PD_KERNEL(PdProdVirialSeAForward)); - - -PD_BUILD_GRAD_OP(prod_virial_se_a) - .Inputs({paddle::Grad("virial"), "net_deriv", "in_deriv", "rij", "nlist", "natoms"}) - .Outputs({paddle::Grad("net_deriv")}) - .Attrs({"n_a_sel: int", "n_r_sel: int"}) - .SetKernelFn(PD_KERNEL(PdProdVirialSeABackward)); diff --git a/source/lib/paddle_src/prod_virial_grad.h b/source/lib/paddle_src/prod_virial_grad.h deleted file mode 100644 index 0e2cc46baa..0000000000 --- a/source/lib/paddle_src/prod_virial_grad.h +++ /dev/null @@ -1,63 +0,0 @@ -#pragma once - -namespace deepmd { - -template -void prod_virial_grad_a_cpu(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei); - -template -void prod_virial_grad_r_cpu(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei); - -#if GOOGLE_CUDA -template -void prod_virial_grad_a_gpu_cuda(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei); - -template -void prod_virial_grad_r_gpu_cuda(FPTYPE* grad_net, - const FPTYPE* 
grad, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM -template -void prod_virial_grad_a_gpu_rocm(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei); - -template -void prod_virial_grad_r_gpu_rocm(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei); -#endif // TENSORFLOW_USE_ROCM - -} // namespace deepmd diff --git a/source/lib/paddle_src/setup_ins.py b/source/lib/paddle_src/setup_ins.py index f510bd62f4..402b47cbc5 100644 --- a/source/lib/paddle_src/setup_ins.py +++ b/source/lib/paddle_src/setup_ins.py @@ -2,7 +2,7 @@ cpp_extension.setup( name="paddle_deepmd_lib", - ext_modules=cpp_extension.CppExtension( + ext_modules=cpp_extension.CUDAExtension( sources=[ "../src/coord.cc", "../src/env_mat_nvnmd.cc", @@ -14,11 +14,11 @@ "../src/neighbor_list.cc", "../src/pair_tab.cc", "../src/prod_env_mat_nvnmd.cc", - "../src/prod_env_mat.cc", - # "../src/prod_force_grad.cc", - # "../src/prod_force.cc", - # "../src/prod_virial_grad.cc", - # "../src/prod_virial.cc", + # "../src/prod_env_mat.cc", + "../src/prod_force_grad.cc", + "../src/prod_force.cc", + "../src/prod_virial_grad.cc", + "../src/prod_virial.cc", "../src/region.cc", "../src/SimulationRegion.cpp", "../src/soft_min_switch_force_grad.cc", @@ -37,18 +37,22 @@ # "../src/cuda/prod_virial.cu", "../src/cuda/region.cu", "../src/cuda/tabulate.cu", - "./prod_env_mat.cu", - "./prod_virial_grad.cu", - "./prod_virial_grad.cc", - "./prod_virial.cu", - "./prod_force.cu", - # "./prod_force_grad.cc", - "./prod_force_grad.cu", - "./neighbor_stat.cu", + "./paddle_prod_env_mat.cu", + "./paddle_prod_env_mat.cc", + "./paddle_prod_virial_grad.cu", + "./paddle_prod_virial_grad.cc", + "./paddle_prod_virial.cu", + "./paddle_prod_virial.cc", + "./paddle_prod_force.cu", + "./paddle_prod_force.cc", + "./paddle_prod_force_grad.cu", + "./paddle_prod_force_grad.cc", + "./paddle_neighbor_stat.cc", ], include_dirs=[ "/workspace/hesensen/deepmd_backend/deepmd-kit-tf/source/lib/include" ], library_dirs=["/usr/local/cuda-11/lib64"], + # extra_link_args=["-ldeepmd"] ), ) diff --git a/source/lib/src/prod_env_mat.cc b/source/lib/src/prod_env_mat.cc index af82a09c2e..4dc38dd4ef 100644 --- a/source/lib/src/prod_env_mat.cc +++ b/source/lib/src/prod_env_mat.cc @@ -269,7 +269,7 @@ void deepmd::env_mat_nbor_update(InputNlist &inlist, memcpy(&inlist.numneigh, 8 + mesh_host, sizeof(int *)); memcpy(&inlist.firstneigh, 12 + mesh_host, sizeof(int **)); const int ago = mesh_host[0]; - if (ago == 0 || gpu_inlist.inum < inlist.inum) { + if (ago == 0 || gpu_inlist.inum < inlist.inum || !gpu_inlist.ilist) { const int inum = inlist.inum; if (gpu_inlist.inum < inum) { delete_device_memory(gpu_inlist.ilist); @@ -279,6 +279,15 @@ void deepmd::env_mat_nbor_update(InputNlist &inlist, malloc_device_memory(gpu_inlist.numneigh, inum); malloc_device_memory(gpu_inlist.firstneigh, inum); } + if (!gpu_inlist.ilist) { + malloc_device_memory(gpu_inlist.ilist, inum); + } + if (!gpu_inlist.numneigh) { + malloc_device_memory(gpu_inlist.numneigh, inum); + } + if (!gpu_inlist.firstneigh) { + malloc_device_memory(gpu_inlist.firstneigh, inum); + } memcpy_host_to_device(gpu_inlist.ilist, inlist.ilist, inum); memcpy_host_to_device(gpu_inlist.numneigh, inlist.numneigh, inum); int _max_nbor_size = max_numneigh(inlist); diff --git 
a/source/lmp/pair_deepmd.cpp b/source/lmp/pair_deepmd.cpp
index ccf95c0984..8922260e5f 100644
--- a/source/lmp/pair_deepmd.cpp
+++ b/source/lmp/pair_deepmd.cpp
@@ -833,7 +833,8 @@ void PairDeepMD::settings(int narg, char **arg) {
   numb_models = models.size();
   if (numb_models == 1) {
     try {
-      deep_pot.init(arg[0], get_node_rank(), get_file_content(arg[0]));
+      auto ptr = strstr(arg[0], ".pb");
+      deep_pot.init(arg[0], get_node_rank(), (ptr != NULL) ? get_file_content(arg[0]) : "");
     } catch (deepmd_compat::deepmd_exception &e) {
       error->one(FLERR, e.what());
     }
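
Reviewer note: the hunks above split the prod_virial_se_a custom operator across two translation units — the forward kernel plus InferShape/InferDtype are bound with PD_BUILD_OP in paddle_prod_virial.cu, while the gradient kernel is bound separately with PD_BUILD_GRAD_OP in paddle_prod_virial_grad.cu (the duplicate registration in each file is commented out). The following is a minimal, self-contained sketch of that registration pattern for reference only; it is not part of the patch, and the op name "toy_identity" and every ToyIdentity* function are hypothetical. Only Paddle custom-op API calls that already appear in the patch (paddle::empty, PD_DISPATCH_FLOATING_TYPES, mutable_data, PD_BUILD_OP, PD_BUILD_GRAD_OP, PD_KERNEL, PD_INFER_SHAPE, PD_INFER_DTYPE) are used.

// toy_identity.cc -- hypothetical example, not part of this PR
#include <cstdint>
#include <vector>
#include "paddle/extension.h"

// Forward: y = x, dispatched over float/double like the real kernels above.
std::vector<paddle::Tensor> ToyIdentityForward(const paddle::Tensor& x) {
  // Allocate the output with the same shape, dtype and place as the input.
  paddle::Tensor y = paddle::empty(x.shape(), x.dtype(), x.place());
  PD_DISPATCH_FLOATING_TYPES(x.type(), "toy_identity_forward", ([&] {
    const data_t* x_ptr = x.data<data_t>();
    data_t* y_ptr = y.mutable_data<data_t>();
    for (int64_t i = 0; i < x.numel(); ++i) {
      y_ptr[i] = x_ptr[i];
    }
  }));
  return {y};
}

// Backward: d(identity)/dx = 1, so grad_x is a copy of grad_y.
std::vector<paddle::Tensor> ToyIdentityBackward(const paddle::Tensor& x,
                                                const paddle::Tensor& grad_y) {
  paddle::Tensor grad_x = paddle::empty(x.shape(), x.dtype(), x.place());
  PD_DISPATCH_FLOATING_TYPES(grad_y.type(), "toy_identity_backward", ([&] {
    const data_t* g_ptr = grad_y.data<data_t>();
    data_t* gx_ptr = grad_x.mutable_data<data_t>();
    for (int64_t i = 0; i < grad_y.numel(); ++i) {
      gx_ptr[i] = g_ptr[i];
    }
  }));
  return {grad_x};
}

// Static shape/dtype inference, mirroring PdProdVirialSeAInferShape/InferDtype.
std::vector<std::vector<int64_t>> ToyIdentityInferShape(
    std::vector<int64_t> x_shape) {
  return {x_shape};
}

std::vector<paddle::DataType> ToyIdentityInferDtype(paddle::DataType x_dtype) {
  return {x_dtype};
}

// Forward registration: kernel + shape/dtype inference in one PD_BUILD_OP.
PD_BUILD_OP(toy_identity)
    .Inputs({"X"})
    .Outputs({"Y"})
    .SetKernelFn(PD_KERNEL(ToyIdentityForward))
    .SetInferShapeFn(PD_INFER_SHAPE(ToyIdentityInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(ToyIdentityInferDtype));

// Gradient registration: separate binding, as done for prod_virial_se_a above.
PD_BUILD_GRAD_OP(toy_identity)
    .Inputs({"X", paddle::Grad("Y")})
    .Outputs({paddle::Grad("X")})
    .SetKernelFn(PD_KERNEL(ToyIdentityBackward));

Once such a source file is listed in cpp_extension.setup() (as setup_ins.py does for the paddle_* sources in this PR), the operator would presumably be importable from the built extension module and usable from Python; the exact module path depends on the build configuration and is not fixed by this sketch.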