From dd604c238ed458f92c563d5b091d5625cbe5cc46 Mon Sep 17 00:00:00 2001
From: CaRoLZhangxy <ucaszxy@outlook.com>
Date: Sat, 9 Mar 2024 03:40:29 +0000
Subject: [PATCH 1/7] support input mapping for DPA2 model

---
 source/api_cc/src/DeepPotPT.cc | 1 -
 source/lmp/pair_deepmd.cpp     | 9 ++++++++-
 2 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/source/api_cc/src/DeepPotPT.cc b/source/api_cc/src/DeepPotPT.cc
index 919d690bed..2c3fd1d865 100644
--- a/source/api_cc/src/DeepPotPT.cc
+++ b/source/api_cc/src/DeepPotPT.cc
@@ -116,7 +116,6 @@ void DeepPotPT::compute(ENERGYVTYPE& ener,
   select_real_atoms_coord(dcoord, datype, aparam_, nghost_real, fwd_map,
                           bkw_map, nall_real, nloc_real, coord, atype, aparam,
                           nghost, ntypes, 1, daparam, nall, aparam_nall);
-  std::cout << datype.size() << std::endl;
   std::vector<VALUETYPE> coord_wrapped = dcoord;
   at::Tensor coord_wrapped_Tensor =
       torch::from_blob(coord_wrapped.data(), {1, nall_real, 3}, options)
diff --git a/source/lmp/pair_deepmd.cpp b/source/lmp/pair_deepmd.cpp
index 90aa453143..51362c1563 100644
--- a/source/lmp/pair_deepmd.cpp
+++ b/source/lmp/pair_deepmd.cpp
@@ -462,6 +462,7 @@ void PairDeepMD::compute(int eflag, int vflag) {
 
   double **x = atom->x;
   double **f = atom->f;
+  int *tag_array = atom->tag;
   int *type = atom->type;
   int nlocal = atom->nlocal;
   int nghost = 0;
@@ -470,7 +471,11 @@ void PairDeepMD::compute(int eflag, int vflag) {
   }
   int nall = nlocal + nghost;
   int newton_pair = force->newton_pair;
-
+  //make mapping array
+  std::vector<int> mapping(nall);
+  for (int i = 0; i < nall; ++i) {
+    mapping[i] = atom->map(tag_array[i]);
+  }
   vector<double> dspin(nall * 3, 0.);
   vector<double> dfm(nall * 3, 0.);
   double **sp = atom->sp;
@@ -1280,6 +1285,8 @@ void PairDeepMD::coeff(int narg, char **arg) {
 void PairDeepMD::init_style() {
 #if LAMMPS_VERSION_NUMBER >= 20220324
   neighbor->add_request(this, NeighConst::REQ_FULL);
+  atom->map_user=2;
+  atom->map_init(1);
 #else
   int irequest = neighbor->request(this, instance_me);
   neighbor->requests[irequest]->half = 0;

From 91aa4107753050fe0929a833d666cf8e27b02467 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 9 Mar 2024 03:43:36 +0000
Subject: [PATCH 2/7] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 source/lmp/pair_deepmd.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/source/lmp/pair_deepmd.cpp b/source/lmp/pair_deepmd.cpp
index 51362c1563..4a150f4324 100644
--- a/source/lmp/pair_deepmd.cpp
+++ b/source/lmp/pair_deepmd.cpp
@@ -471,7 +471,7 @@ void PairDeepMD::compute(int eflag, int vflag) {
   }
   int nall = nlocal + nghost;
   int newton_pair = force->newton_pair;
-  //make mapping array
+  // make mapping array
   std::vector<int> mapping(nall);
   for (int i = 0; i < nall; ++i) {
     mapping[i] = atom->map(tag_array[i]);
@@ -1285,7 +1285,7 @@ void PairDeepMD::coeff(int narg, char **arg) {
 void PairDeepMD::init_style() {
 #if LAMMPS_VERSION_NUMBER >= 20220324
   neighbor->add_request(this, NeighConst::REQ_FULL);
-  atom->map_user=2;
+  atom->map_user = 2;
   atom->map_init(1);
 #else
   int irequest = neighbor->request(this, instance_me);

From e1e8420bb836dc5e99391ed710bc969d430dad74 Mon Sep 17 00:00:00 2001
From: CaRoLZhangxy <ucaszxy@outlook.com>
Date: Mon, 11 Mar 2024 07:41:32 +0000
Subject: [PATCH 3/7] add initial build scaffolding for the PyTorch comm op (op_pt)

---
 deepmd/pt/utils/env.py      |  3 +++
 source/CMakeLists.txt       |  4 ++++
 source/lmp/pair_deepmd.cpp  | 13 +++++++++----
 source/op_pt/CMakeLists.txt | 23 +++++++++++++++++++++++
 4 files changed, 39 insertions(+), 4 deletions(-)
 create mode 100644 source/op_pt/CMakeLists.txt

diff --git a/deepmd/pt/utils/env.py b/deepmd/pt/utils/env.py
index 0b92953255..c9b54086f6 100644
--- a/deepmd/pt/utils/env.py
+++ b/deepmd/pt/utils/env.py
@@ -80,3 +80,6 @@
     "ENERGY_BIAS_TRAINABLE",
     "LOCAL_RANK",
 ]
+
+def load_op():
+    torch.ops.load_library("/mnt/user/zhangxiangyu/workspace/dpkit/deepmd-kit/dp/lib/")
\ No newline at end of file
diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
index 931013016d..5c8d8b1260 100644
--- a/source/CMakeLists.txt
+++ b/source/CMakeLists.txt
@@ -219,6 +219,7 @@ endif()
 # define names of libs
 set(LIB_DEEPMD "deepmd")
 set(LIB_DEEPMD_OP "deepmd_op")
+set(LIB_DEEPMD_OP_PT "deepmd_op_pt")
 if(BUILD_CPP_IF)
   set(LIB_DEEPMD_CC "deepmd_cc")
   set(LIB_DEEPMD_C "deepmd_c")
@@ -281,6 +282,9 @@ if(NOT DEEPMD_C_ROOT)
   if(ENABLE_TENSORFLOW)
     add_subdirectory(op/)
   endif()
+  if(ENABLE_PYTORCH)
+    add_subdirectory(op_pt/)
+  endif()
   add_subdirectory(lib/)
 endif()
 if(BUILD_PY_IF)
diff --git a/source/lmp/pair_deepmd.cpp b/source/lmp/pair_deepmd.cpp
index 4a150f4324..bf009a948c 100644
--- a/source/lmp/pair_deepmd.cpp
+++ b/source/lmp/pair_deepmd.cpp
@@ -462,7 +462,6 @@ void PairDeepMD::compute(int eflag, int vflag) {
 
   double **x = atom->x;
   double **f = atom->f;
-  int *tag_array = atom->tag;
   int *type = atom->type;
   int nlocal = atom->nlocal;
   int nghost = 0;
@@ -471,10 +470,16 @@ void PairDeepMD::compute(int eflag, int vflag) {
   }
   int nall = nlocal + nghost;
   int newton_pair = force->newton_pair;
-  // make mapping array
+
+  int comm_size;
+  MPI_Comm_size(MPI_COMM_WORLD, &comm_size); 
+  int *tag_array = atom->tag;
   std::vector<int> mapping(nall);
-  for (int i = 0; i < nall; ++i) {
-    mapping[i] = atom->map(tag_array[i]);
+  // make mapping array
+  if(comm_size == 1){
+    for (int i = 0; i < nall; ++i) {
+      mapping[i] = atom->map(tag_array[i]);
+    }
   }
   vector<double> dspin(nall * 3, 0.);
   vector<double> dfm(nall * 3, 0.);
diff --git a/source/op_pt/CMakeLists.txt b/source/op_pt/CMakeLists.txt
new file mode 100644
index 0000000000..b26aa4733e
--- /dev/null
+++ b/source/op_pt/CMakeLists.txt
@@ -0,0 +1,23 @@
+# libop
+
+file(
+  GLOB
+  OP_PT_SRC
+  comm.cc
+)
+add_library(${LIB_DEEPMD_OP_PT} MODULE ${OP_PT_SRC})
+target_link_libraries(${libname} PRIVATE )
+target_link_libraries(${LIB_DEEPMD_OP_PT} PRIVATE "${TORCH_LIBRARIES}")
+target_link_libraries(${LIB_DEEPMD_OP_PT} PRIVATE ${LIB_DEEPMD})
+
+if(CMAKE_TESTING_ENABLED)
+  target_link_libraries(${LIB_DEEPMD_OP_PT} PRIVATE coverage_config)
+endif()
+
+
+
+if(BUILD_PY_IF)
+  install(TARGETS ${LIB_DEEPMD_OP_PT} DESTINATION deepmd/lib/)
+else(BUILD_PY_IF)
+  install(TARGETS ${LIB_DEEPMD_OP_PT} DESTINATION lib/)
+endif(BUILD_PY_IF)

From 225d7629274e08ca5c303826d7642893395b1ebb Mon Sep 17 00:00:00 2001
From: CaRoLZhangxy <ucaszxy@outlook.com>
Date: Tue, 12 Mar 2024 08:51:47 +0000
Subject: [PATCH 4/7] read swap data from the LAMMPS CommBrick in PairDeepMD

---
 source/lmp/pair_deepmd.cpp  | 18 +++++++++++++++++-
 source/lmp/pair_deepmd.h    |  7 ++++++-
 source/op_pt/CMakeLists.txt |  4 +++-
 source/op_pt/custom_op.h    |  2 ++
 4 files changed, 28 insertions(+), 3 deletions(-)
 create mode 100644 source/op_pt/custom_op.h

diff --git a/source/lmp/pair_deepmd.cpp b/source/lmp/pair_deepmd.cpp
index bf009a948c..82ba95232a 100644
--- a/source/lmp/pair_deepmd.cpp
+++ b/source/lmp/pair_deepmd.cpp
@@ -481,6 +481,19 @@ void PairDeepMD::compute(int eflag, int vflag) {
       mapping[i] = atom->map(tag_array[i]);
     }
   }
+  // for dpa2 communication
+  int nswap = cb->nswap;
+  int* sendnum = cb->sendnum; // dim: nswap
+  int* recvnum = cb->recvnum; // dim: nswap
+  int* firstrecv = cb->firstrecv; // dim: nswap
+  int** sendlist = cb->sendlist; // dim: nswap x sendnum[nswap]
+  int* sendproc = cb->sendproc; // dim: nswap
+  int* recvproc = cb->recvproc; // dim: nswap
+  int** pbc = cb->pbc; // dim: nswap x 3
+  int* maxneed = cb->maxneed; // dim: 3
+  auto* recvneed = cb->recvneed; // int array, dim: 3x2
+  auto* sendneed = cb->sendneed; // int array, dim: 3x2
+  double* prd = domain->prd;
   vector<double> dspin(nall * 3, 0.);
   vector<double> dfm(nall * 3, 0.);
   double **sp = atom->sp;
@@ -946,7 +959,7 @@ void PairDeepMD::settings(int narg, char **arg) {
   if (narg <= 0) {
     error->all(FLERR, "Illegal pair_style command");
   }
-
+  
   vector<string> models;
   int iarg = 0;
   while (iarg < narg) {
@@ -1285,6 +1298,9 @@ void PairDeepMD::coeff(int narg, char **arg) {
       }
     }
   }
+
+  //dpa2 communication
+  cb = (CommBrickDeepMD *)comm;
 }
 
 void PairDeepMD::init_style() {
diff --git a/source/lmp/pair_deepmd.h b/source/lmp/pair_deepmd.h
index cd72dc7b2a..55ddb515de 100644
--- a/source/lmp/pair_deepmd.h
+++ b/source/lmp/pair_deepmd.h
@@ -32,10 +32,14 @@ namespace deepmd_compat = deepmd::hpp;
 #include <iostream>
 #include <map>
 
+#include "comm_brick.h"
+
 #define FLOAT_PREC double
 
 namespace LAMMPS_NS {
-
+class CommBrickDeepMD : public CommBrick{
+  friend class PairDeepMD;
+};
 class PairDeepMD : public Pair {
  public:
   PairDeepMD(class LAMMPS *);
@@ -137,6 +141,7 @@ class PairDeepMD : public Pair {
   tagint *tagsend, *tagrecv;
   double *stdfsend, *stdfrecv;
   std::vector<int> type_idx_map;
+  CommBrickDeepMD* cb;
 };
 
 }  // namespace LAMMPS_NS
diff --git a/source/op_pt/CMakeLists.txt b/source/op_pt/CMakeLists.txt
index b26aa4733e..d24cfc763f 100644
--- a/source/op_pt/CMakeLists.txt
+++ b/source/op_pt/CMakeLists.txt
@@ -5,9 +5,11 @@ file(
   OP_PT_SRC
   comm.cc
 )
+find_package(MPI REQUIRED)
+
 add_library(${LIB_DEEPMD_OP_PT} MODULE ${OP_PT_SRC})
-target_link_libraries(${libname} PRIVATE )
 target_link_libraries(${LIB_DEEPMD_OP_PT} PRIVATE "${TORCH_LIBRARIES}")
+target_link_libraries(${LIB_DEEPMD_OP_PT} PRIVATE MPI::MPI_CXX)
 target_link_libraries(${LIB_DEEPMD_OP_PT} PRIVATE ${LIB_DEEPMD})
 
 if(CMAKE_TESTING_ENABLED)
diff --git a/source/op_pt/custom_op.h b/source/op_pt/custom_op.h
new file mode 100644
index 0000000000..e8724181da
--- /dev/null
+++ b/source/op_pt/custom_op.h
@@ -0,0 +1,2 @@
+#include <torch/torch.h>
+#include <mpi.h>
\ No newline at end of file

From b78f4570e78e17e55e4b7110932401449fc10954 Mon Sep 17 00:00:00 2001
From: CaRoLZhangxy <ucaszxy@outlook.com>
Date: Thu, 14 Mar 2024 10:49:39 +0000
Subject: [PATCH 5/7] pass LAMMPS comm data to the C/C++ API via InputNlist

---
 source/api_c/include/c_api.h       | 13 +++++++
 source/api_c/include/deepmd.hpp    | 25 ++++++++++++-
 source/api_c/src/c_api.cc          | 19 +++++++++-
 source/api_cc/include/common.h     |  2 +-
 source/lib/include/neighbor_list.h | 56 ++++++++++++++++++++++++++++--
 source/lmp/pair_deepmd.cpp         | 35 +++++++------------
 6 files changed, 123 insertions(+), 27 deletions(-)

diff --git a/source/api_c/include/c_api.h b/source/api_c/include/c_api.h
index 911813e428..ea8e933f70 100644
--- a/source/api_c/include/c_api.h
+++ b/source/api_c/include/c_api.h
@@ -25,6 +25,19 @@ extern DP_Nlist* DP_NewNlist(int inum_,
                              int* numneigh_,
                              int** firstneigh_);
 
+extern DP_Nlist* DP_NewNlist_comm(int inum_,
+                                  int* ilist_,
+                                  int* numneigh_,
+                                  int** firstneigh_,
+                                  int nswap,
+                                  int* sendnum,
+                                  int* recvnum,
+                                  int* firstrecv,
+                                  int** sendlist,
+                                  int* sendproc,
+                                  int* recvproc,
+                                  long int* world);
+
 /**
  * @brief Delete a neighbor list.
  *
diff --git a/source/api_c/include/deepmd.hpp b/source/api_c/include/deepmd.hpp
index 16b8f08cad..dc03dc393c 100644
--- a/source/api_c/include/deepmd.hpp
+++ b/source/api_c/include/deepmd.hpp
@@ -502,6 +502,20 @@ inline double *_DP_Get_Energy_Pointer(double &vec, const int nframes) {
 
 namespace deepmd {
 namespace hpp {
+    struct CommData {
+    int nswap;
+    int* sendnum;
+    int* recvnum;
+    int* firstrecv;
+    int** sendlist;
+    int* sendproc;
+    int* recvproc;
+    long int* world;
+
+    CommData() : nswap(0), sendnum(nullptr), recvnum(nullptr),
+                 firstrecv(nullptr), sendlist(nullptr),
+                 sendproc(nullptr), recvproc(nullptr),world(nullptr) {}
+};
 /**
  * @brief Neighbor list.
  **/
@@ -522,6 +536,14 @@ struct InputNlist {
         nl(DP_NewNlist(inum_, ilist_, numneigh_, firstneigh_)) {
     DP_CHECK_OK(DP_NlistCheckOK, nl);
   };
+  InputNlist(int inum_, int *ilist_, int *numneigh_, int **firstneigh_, CommData *commdata_)
+      : inum(inum_),
+        ilist(ilist_),
+        numneigh(numneigh_),
+        firstneigh(firstneigh_),
+        nl(DP_NewNlist_comm(inum_, ilist_, numneigh_, firstneigh_,commdata_->nswap,commdata_->sendnum,commdata_->recvnum,commdata_->firstrecv,commdata_->sendlist,commdata_->sendproc,commdata_->recvproc,commdata_->world)) {
+    DP_CHECK_OK(DP_NlistCheckOK, nl);
+  };
   ~InputNlist() { DP_DeleteNlist(nl); };
   /// @brief C API neighbor list.
   DP_Nlist *nl;
@@ -535,6 +557,8 @@ struct InputNlist {
   int **firstneigh;
 };
 
+
+
 /**
  * @brief Convert pbtxt to pb.
  * @param[in] fn_pb_txt Filename of the pb txt file.
@@ -2117,6 +2141,5 @@ void select_map(std::vector<VT> &out,
   out.resize(static_cast<size_t>(nall2) * stride);
   DP_SelectMapInt(&in[0], &fwd_map[0], stride, nall1, nall2, &out[0]);
 };
-
 }  // namespace hpp
 }  // namespace deepmd
diff --git a/source/api_c/src/c_api.cc b/source/api_c/src/c_api.cc
index 79dc486e0d..b74e24a2f4 100644
--- a/source/api_c/src/c_api.cc
+++ b/source/api_c/src/c_api.cc
@@ -24,6 +24,24 @@ DP_Nlist* DP_NewNlist(int inum_,
             deepmd::InputNlist nl(inum_, ilist_, numneigh_, firstneigh_);
             DP_Nlist* new_nl = new DP_Nlist(nl); return new_nl;)
 }
+DP_Nlist* DP_NewNlist_comm(int inum_,
+                           int* ilist_,
+                           int* numneigh_,
+                           int** firstneigh_,
+                           int nswap,
+                           int* sendnum,
+                           int* recvnum,
+                           int* firstrecv,
+                           int** sendlist,
+                           int* sendproc,
+                           int* recvproc,
+                           long int* world) {
+  deepmd::CommData commdata(nswap, sendnum, recvnum, firstrecv, sendlist,
+                            sendproc, recvproc, world);
+  deepmd::InputNlist nl(inum_, ilist_, numneigh_, firstneigh_, &commdata);
+  DP_Nlist* new_nl = new DP_Nlist(nl);
+  return new_nl;
+}
 
 void DP_DeleteNlist(DP_Nlist* nl) { delete nl; }
 
@@ -268,7 +286,6 @@ inline void DP_DeepPotComputeNList_variant(DP_DeepPot* dp,
   }
   std::vector<double> e;
   std::vector<VALUETYPE> f, v, ae, av;
-
   DP_REQUIRES_OK(dp, dp->dp.compute(e, f, v, ae, av, coord_, atype_, cell_,
                                     nghost, nlist->nl, ago, fparam_, aparam_));
   // copy from C++ vectors to C arrays, if not NULL pointer
diff --git a/source/api_cc/include/common.h b/source/api_cc/include/common.h
index 4743336e0c..2901e9aafc 100644
--- a/source/api_cc/include/common.h
+++ b/source/api_cc/include/common.h
@@ -13,7 +13,7 @@
 namespace deepmd {
 
 typedef double ENERGYTYPE;
-// TODO: currently we only implement TF; reserve for future use
+// TODO: currently we only implement TF&PT; reserve for future use
 enum DPBackend { TensorFlow, PyTorch, Paddle, Unknown };
 
 struct NeighborListData {
diff --git a/source/lib/include/neighbor_list.h b/source/lib/include/neighbor_list.h
index eb510eb25b..45101e3fdc 100644
--- a/source/lib/include/neighbor_list.h
+++ b/source/lib/include/neighbor_list.h
@@ -11,6 +11,41 @@
 #include "utilities.h"
 
 namespace deepmd {
+struct CommData {
+  int nswap;
+  int* sendnum;
+  int* recvnum;
+  int* firstrecv;
+  int** sendlist;
+  int* sendproc;
+  int* recvproc;
+  long int* world;
+
+  CommData()
+      : nswap(0),
+        sendnum(nullptr),
+        recvnum(nullptr),
+        firstrecv(nullptr),
+        sendlist(nullptr),
+        sendproc(nullptr),
+        recvproc(nullptr),
+        world(nullptr){};
+  CommData(int nswap,
+           int* sendnum,
+           int* recvnum,
+           int* firstrecv,
+           int** sendlist,
+           int* sendproc,
+           int* recvproc,
+           long int* world)
+      : nswap(nswap),
+        sendnum(sendnum),
+        recvnum(recvnum),
+        firstrecv(firstrecv),
+        sendlist(sendlist),
+        recvproc(recvproc),
+        world(world) {}
+};
 
 /**
  * @brief             Construct InputNlist with the input LAMMPS nbor list info.
@@ -26,12 +61,29 @@ struct InputNlist {
   int* numneigh;
   /// Array stores the core region atom's neighbor index
   int** firstneigh;
-  InputNlist() : inum(0), ilist(NULL), numneigh(NULL), firstneigh(NULL){};
+  CommData* commdata;
+  InputNlist()
+      : inum(0),
+        ilist(NULL),
+        numneigh(NULL),
+        firstneigh(NULL),
+        commdata(NULL){};
   InputNlist(int inum_, int* ilist_, int* numneigh_, int** firstneigh_)
       : inum(inum_),
         ilist(ilist_),
         numneigh(numneigh_),
-        firstneigh(firstneigh_){};
+        firstneigh(firstneigh_),
+        commdata(NULL){};
+  InputNlist(int inum_,
+             int* ilist_,
+             int* numneigh_,
+             int** firstneigh_,
+             CommData* commdata_)
+      : inum(inum_),
+        ilist(ilist_),
+        numneigh(numneigh_),
+        firstneigh(firstneigh_),
+        commdata(commdata_){};
   ~InputNlist(){};
 };
 
diff --git a/source/lmp/pair_deepmd.cpp b/source/lmp/pair_deepmd.cpp
index 82ba95232a..052b9d1b67 100644
--- a/source/lmp/pair_deepmd.cpp
+++ b/source/lmp/pair_deepmd.cpp
@@ -471,28 +471,16 @@ void PairDeepMD::compute(int eflag, int vflag) {
   int nall = nlocal + nghost;
   int newton_pair = force->newton_pair;
 
-  int comm_size;
-  MPI_Comm_size(MPI_COMM_WORLD, &comm_size); 
-  int *tag_array = atom->tag;
-  std::vector<int> mapping(nall);
-  // make mapping array
-  if(comm_size == 1){
-    for (int i = 0; i < nall; ++i) {
-      mapping[i] = atom->map(tag_array[i]);
-    }
-  }
   // for dpa2 communication
-  int nswap = cb->nswap;
-  int* sendnum = cb->sendnum; // dim: nswap
-  int* recvnum = cb->recvnum; // dim: nswap
-  int* firstrecv = cb->firstrecv; // dim: nswap
-  int** sendlist = cb->sendlist; // dim: nswap x sendnum[nswap]
-  int* sendproc = cb->sendproc; // dim: nswap
-  int* recvproc = cb->recvproc; // dim: nswap
-  int** pbc = cb->pbc; // dim: nswap x 3
-  int* maxneed = cb->maxneed; // dim: 3
-  auto* recvneed = cb->recvneed; // int array, dim: 3x2
-  auto* sendneed = cb->sendneed; // int array, dim: 3x2
+  deepmd_compat::CommData* commdata = new deepmd_compat::CommData(); 
+  commdata->nswap = cb->nswap;
+  commdata->sendnum = cb->sendnum; // dim: nswap
+  commdata->recvnum = cb->recvnum; // dim: nswap
+  commdata->firstrecv = cb->firstrecv; // dim: nswap
+  commdata->sendlist = cb->sendlist; // dim: nswap x sendnum[nswap]
+  commdata->sendproc = cb->sendproc; // dim: nswap
+  commdata->recvproc = cb->recvproc; // dim: nswap
+  commdata->world = reinterpret_cast<long*>(world);
   double* prd = domain->prd;
   vector<double> dspin(nall * 3, 0.);
   vector<double> dfm(nall * 3, 0.);
@@ -574,7 +562,10 @@ void PairDeepMD::compute(int eflag, int vflag) {
       (numb_models > 1 && (out_freq > 0 && update->ntimestep % out_freq == 0));
   if (do_ghost) {
     deepmd_compat::InputNlist lmp_list(list->inum, list->ilist, list->numneigh,
-                                       list->firstneigh);
+                                       list->firstneigh,commdata);
+    // else
+    // deepmd_compat::InputNlist lmp_list(list->inum, list->ilist, list->numneigh,
+    //                                    list->firstneigh);
     deepmd_compat::InputNlist extend_lmp_list;
     if (atom->sp_flag) {
       extend(extend_inum, extend_ilist, extend_numneigh, extend_neigh,

From 49f02049ea26813899fea1ffa8ba74a2e83b3b1e Mon Sep 17 00:00:00 2001
From: CaRoLZhangxy <ucaszxy@outlook.com>
Date: Tue, 19 Mar 2024 12:20:11 +0000
Subject: [PATCH 6/7] finish cpu pass

---
 deepmd/pt/entrypoints/main.py                 |   3 +-
 .../model/atomic_model/base_atomic_model.py   |   2 +
 .../pt/model/atomic_model/dp_atomic_model.py  |   2 +
 deepmd/pt/model/descriptor/dpa2.py            |  20 +-
 deepmd/pt/model/descriptor/repformers.py      |  41 ++-
 deepmd/pt/model/descriptor/se_a.py            |   1 +
 deepmd/pt/model/model/ener_model.py           |   6 +-
 deepmd/pt/model/model/make_model.py           |   4 +-
 deepmd/pt/utils/env.py                        |   9 +-
 source/CMakeLists.txt                         |   4 -
 source/api_c/include/c_api.h                  |   2 +-
 source/api_c/include/deepmd.hpp               |  42 ++--
 source/api_c/src/c_api.cc                     |   5 +-
 source/api_cc/include/DeepPotPT.h             |   1 +
 source/api_cc/src/DeepPotPT.cc                |  36 ++-
 source/lib/include/neighbor_list.h            | 115 ++++++---
 source/lmp/builtin.cmake                      |   1 +
 source/lmp/pair_deepmd.cpp                    |  25 +-
 source/lmp/pair_deepmd.h                      |   2 +-
 source/lmp/plugin/CMakeLists.txt              |   1 +
 source/op_pt/CMakeLists.txt                   |  39 ++-
 source/op_pt/comm.cc                          | 233 ++++++++++++++++++
 22 files changed, 473 insertions(+), 121 deletions(-)
 create mode 100644 source/op_pt/comm.cc

diff --git a/deepmd/pt/entrypoints/main.py b/deepmd/pt/entrypoints/main.py
index 46d284a395..3ef91c76d6 100644
--- a/deepmd/pt/entrypoints/main.py
+++ b/deepmd/pt/entrypoints/main.py
@@ -46,6 +46,7 @@
 )
 from deepmd.pt.utils.env import (
     DEVICE,
+    load_op
 )
 from deepmd.pt.utils.finetune import (
     change_finetune_model_params,
@@ -301,7 +302,7 @@ def main(args: Optional[Union[List[str], argparse.Namespace]] = None):
     set_log_handles(FLAGS.log_level, FLAGS.log_path, mpi_log=None)
     log.debug("Log handles were successfully set")
     log.info("DeepMD version: %s", __version__)
-
+    load_op()
     if FLAGS.command == "train":
         train(FLAGS)
     elif FLAGS.command == "freeze":
diff --git a/deepmd/pt/model/atomic_model/base_atomic_model.py b/deepmd/pt/model/atomic_model/base_atomic_model.py
index 8180c48c81..877f0c5d6b 100644
--- a/deepmd/pt/model/atomic_model/base_atomic_model.py
+++ b/deepmd/pt/model/atomic_model/base_atomic_model.py
@@ -91,6 +91,7 @@ def forward_common_atomic(
         mapping: Optional[torch.Tensor] = None,
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
+        comm_dict: Optional[Dict[str, torch.Tensor]] = None
     ) -> Dict[str, torch.Tensor]:
         _, nloc, _ = nlist.shape
         atype = extended_atype[:, :nloc]
@@ -107,6 +108,7 @@ def forward_common_atomic(
             mapping=mapping,
             fparam=fparam,
             aparam=aparam,
+            comm_dict=comm_dict
         )
 
         if self.atom_excl is not None:
diff --git a/deepmd/pt/model/atomic_model/dp_atomic_model.py b/deepmd/pt/model/atomic_model/dp_atomic_model.py
index cad1e1cc88..7e369b7334 100644
--- a/deepmd/pt/model/atomic_model/dp_atomic_model.py
+++ b/deepmd/pt/model/atomic_model/dp_atomic_model.py
@@ -134,6 +134,7 @@ def forward_atomic(
         mapping: Optional[torch.Tensor] = None,
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
+        comm_dict: Optional[Dict[str, torch.Tensor]] = None
     ) -> Dict[str, torch.Tensor]:
         """Return atomic prediction.
 
@@ -167,6 +168,7 @@ def forward_atomic(
             extended_atype,
             nlist,
             mapping=mapping,
+            comm_dict=comm_dict
         )
         assert descriptor is not None
         # energy, force
diff --git a/deepmd/pt/model/descriptor/dpa2.py b/deepmd/pt/model/descriptor/dpa2.py
index fb792a51e2..69ae514afb 100644
--- a/deepmd/pt/model/descriptor/dpa2.py
+++ b/deepmd/pt/model/descriptor/dpa2.py
@@ -5,6 +5,7 @@
     Optional,
     Tuple,
     Union,
+    Dict,
 )
 
 import torch
@@ -21,6 +22,10 @@
 from deepmd.pt.utils.update_sel import (
     UpdateSel,
 )
+
+from deepmd.pt.utils.env import(
+    load_op
+)
 from deepmd.utils.path import (
     DPPath,
 )
@@ -395,6 +400,7 @@ def forward(
         extended_atype: torch.Tensor,
         nlist: torch.Tensor,
         mapping: Optional[torch.Tensor] = None,
+        comm_dict: Optional[Dict[str, torch.Tensor]] = None
     ):
         """Compute the descriptor.
 
@@ -450,11 +456,12 @@ def forward(
         # linear to change shape
         g1 = self.g1_shape_tranform(g1)
         # mapping g1
-        assert mapping is not None
-        mapping_ext = (
-            mapping.view(nframes, nall).unsqueeze(-1).expand(-1, -1, g1.shape[-1])
-        )
-        g1_ext = torch.gather(g1, 1, mapping_ext)
+        if(comm_dict is None):
+            assert mapping is not None
+            # mapping_ext = (
+            #     mapping.view(nframes, nall).unsqueeze(-1).expand(-1, -1, g1.shape[-1])
+            # )
+            # g1_ext = torch.gather(g1, 1, mapping_ext)
         # repformer
         g1, g2, h2, rot_mat, sw = self.repformers(
             nlist_dict[
@@ -464,8 +471,9 @@ def forward(
             ],
             extended_coord,
             extended_atype,
-            g1_ext,
+            g1,
             mapping,
+            comm_dict
         )
         if self.concat_output_tebd:
             g1 = torch.cat([g1, g1_inp], dim=-1)
diff --git a/deepmd/pt/model/descriptor/repformers.py b/deepmd/pt/model/descriptor/repformers.py
index a908d2e057..cc48b4cbc9 100644
--- a/deepmd/pt/model/descriptor/repformers.py
+++ b/deepmd/pt/model/descriptor/repformers.py
@@ -226,7 +226,7 @@ def reinit_exclude(
     ):
         self.exclude_types = exclude_types
         self.emask = PairExcludeMask(self.ntypes, exclude_types=exclude_types)
-
+    @torch.jit.script_method
     def forward(
         self,
         nlist: torch.Tensor,
@@ -234,9 +234,10 @@ def forward(
         extended_atype: torch.Tensor,
         extended_atype_embd: Optional[torch.Tensor] = None,
         mapping: Optional[torch.Tensor] = None,
+        comm_dict: Optional[Dict[str, torch.Tensor]] = None
     ):
-        assert mapping is not None
-        assert extended_atype_embd is not None
+        if comm_dict is None:
+            assert extended_atype_embd is not None
         nframes, nloc, nnei = nlist.shape
         nall = extended_coord.view(nframes, -1).shape[1] // 3
         atype = extended_atype[:, :nloc]
@@ -257,8 +258,12 @@ def forward(
         sw = sw.masked_fill(~nlist_mask, 0.0)
 
         # [nframes, nloc, tebd_dim]
-        atype_embd = extended_atype_embd[:, :nloc, :]
-        assert list(atype_embd.shape) == [nframes, nloc, self.g1_dim]
+        #atype_embd = extended_atype_embd[:, :nloc, :]
+        atype_embd = extended_atype_embd
+        if atype_embd is not None:
+            assert list(atype_embd.shape) == [nframes, nloc, self.g1_dim]
+        else:
+            raise NotImplementedError
 
         g1 = self.act(atype_embd)
         # nb x nloc x nnei x 1,  nb x nloc x nnei x 3
@@ -275,11 +280,33 @@ def forward(
         # if the a neighbor is real or not is indicated by nlist_mask
         nlist[nlist == -1] = 0
         # nb x nall x ng1
-        mapping = mapping.view(nframes, nall).unsqueeze(-1).expand(-1, -1, self.g1_dim)
+        if comm_dict is None:
+            assert mapping is not None
+            mapping = mapping.view(nframes, nall).unsqueeze(-1).expand(-1, -1, self.g1_dim)
         for idx, ll in enumerate(self.layers):
             # g1:     nb x nloc x ng1
             # g1_ext: nb x nall x ng1
-            g1_ext = torch.gather(g1, 1, mapping)
+            if comm_dict is None:
+                assert mapping is not None
+                g1_ext = torch.gather(g1, 1, mapping)
+            else:
+                # padding = torch.zeros(nall-nloc, g1.size(2),device=mydev)
+                # g1 = torch.cat((g1.squeeze(0), padding), dim=0)
+                n_padding = nall -nloc
+                g1 = torch.nn.functional.pad(g1.squeeze(0), (0, 0, 0, n_padding), value=0.0)
+                assert 'send_list' in comm_dict
+                assert 'send_proc' in comm_dict
+                assert 'recv_proc' in comm_dict
+                assert 'send_num'  in comm_dict
+                assert 'recv_num'  in comm_dict
+                assert 'communicator' in comm_dict
+                ret = env.op_module.border_op(comm_dict['send_list'],
+                                comm_dict['send_proc'], comm_dict['recv_proc'],
+                                comm_dict['send_num'], comm_dict['recv_num'],
+                                g1,
+                                comm_dict['communicator'],torch.tensor(nloc),torch.tensor(nall-nloc))
+                g1_ext = ret[0].unsqueeze(0) 
+            
             g1, g2, h2 = ll.forward(
                 g1_ext,
                 g2,
diff --git a/deepmd/pt/model/descriptor/se_a.py b/deepmd/pt/model/descriptor/se_a.py
index e17b7c5d54..a2c505ad81 100644
--- a/deepmd/pt/model/descriptor/se_a.py
+++ b/deepmd/pt/model/descriptor/se_a.py
@@ -191,6 +191,7 @@ def forward(
         atype_ext: torch.Tensor,
         nlist: torch.Tensor,
         mapping: Optional[torch.Tensor] = None,
+        comm_dict: Optional[Dict[str, torch.Tensor]] = None
     ):
         """Compute the descriptor.
 
diff --git a/deepmd/pt/model/model/ener_model.py b/deepmd/pt/model/model/ener_model.py
index cd4f78a2e2..c702f01bf3 100644
--- a/deepmd/pt/model/model/ener_model.py
+++ b/deepmd/pt/model/model/ener_model.py
@@ -9,7 +9,9 @@
 from .dp_model import (
     DPModel,
 )
-
+from deepmd.pt.utils.env import (
+    load_op
+)
 
 class EnergyModel(DPModel):
     model_type = "ener"
@@ -69,6 +71,7 @@ def forward_lower(
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
         do_atomic_virial: bool = False,
+        comm_dict: Optional[Dict[str, torch.Tensor]] = None
     ):
         model_ret = self.forward_common_lower(
             extended_coord,
@@ -78,6 +81,7 @@ def forward_lower(
             fparam=fparam,
             aparam=aparam,
             do_atomic_virial=do_atomic_virial,
+            comm_dict=comm_dict
         )
         if self.fitting_net is not None:
             model_predict = {}
diff --git a/deepmd/pt/model/model/make_model.py b/deepmd/pt/model/model/make_model.py
index f9daa916a8..fd8c601569 100644
--- a/deepmd/pt/model/model/make_model.py
+++ b/deepmd/pt/model/model/make_model.py
@@ -147,7 +147,7 @@ def forward_common(
                 mapping,
                 do_atomic_virial=do_atomic_virial,
                 fparam=fp,
-                aparam=ap,
+                aparam=ap
             )
             model_predict = communicate_extended_output(
                 model_predict_lower,
@@ -167,6 +167,7 @@ def forward_common_lower(
             fparam: Optional[torch.Tensor] = None,
             aparam: Optional[torch.Tensor] = None,
             do_atomic_virial: bool = False,
+            comm_dict: Optional[Dict[str, torch.Tensor]] = None
         ):
             """Return model prediction. Lower interface that takes
             extended atomic coordinates and types, nlist, and mapping
@@ -210,6 +211,7 @@ def forward_common_lower(
                 mapping=mapping,
                 fparam=fp,
                 aparam=ap,
+                comm_dict = comm_dict
             )
             model_predict = fit_output_to_model_output(
                 atomic_ret,
diff --git a/deepmd/pt/utils/env.py b/deepmd/pt/utils/env.py
index c9b54086f6..7f41097856 100644
--- a/deepmd/pt/utils/env.py
+++ b/deepmd/pt/utils/env.py
@@ -1,6 +1,8 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 import os
-
+from typing import (
+   Any,
+)
 import numpy as np
 import torch
 
@@ -80,6 +82,7 @@
     "ENERGY_BIAS_TRAINABLE",
     "LOCAL_RANK",
 ]
-
 def load_op():
-    torch.ops.load_library("/mnt/user/zhangxiangyu/workspace/dpkit/deepmd-kit/dp/lib/")
\ No newline at end of file
+    torch.ops.load_library("/mnt/user/zhangxiangyu/workspace/dpkit/deepmd-kit/source/op_pt/libop_pt.so")
+
+op_module: Any=torch.ops.my_ops
\ No newline at end of file
diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
index 5c8d8b1260..931013016d 100644
--- a/source/CMakeLists.txt
+++ b/source/CMakeLists.txt
@@ -219,7 +219,6 @@ endif()
 # define names of libs
 set(LIB_DEEPMD "deepmd")
 set(LIB_DEEPMD_OP "deepmd_op")
-set(LIB_DEEPMD_OP_PT "deepmd_op_pt")
 if(BUILD_CPP_IF)
   set(LIB_DEEPMD_CC "deepmd_cc")
   set(LIB_DEEPMD_C "deepmd_c")
@@ -282,9 +281,6 @@ if(NOT DEEPMD_C_ROOT)
   if(ENABLE_TENSORFLOW)
     add_subdirectory(op/)
   endif()
-  if(ENABLE_PYTORCH)
-    add_subdirectory(op_pt/)
-  endif()
   add_subdirectory(lib/)
 endif()
 if(BUILD_PY_IF)
diff --git a/source/api_c/include/c_api.h b/source/api_c/include/c_api.h
index ea8e933f70..254890dda8 100644
--- a/source/api_c/include/c_api.h
+++ b/source/api_c/include/c_api.h
@@ -36,7 +36,7 @@ extern DP_Nlist* DP_NewNlist_comm(int inum_,
                                   int** sendlist,
                                   int* sendproc,
                                   int* recvproc,
-                                  long int* world);
+                                  int world);
 
 /**
  * @brief Delete a neighbor list.
diff --git a/source/api_c/include/deepmd.hpp b/source/api_c/include/deepmd.hpp
index dc03dc393c..cca8dd3868 100644
--- a/source/api_c/include/deepmd.hpp
+++ b/source/api_c/include/deepmd.hpp
@@ -502,20 +502,20 @@ inline double *_DP_Get_Energy_Pointer(double &vec, const int nframes) {
 
 namespace deepmd {
 namespace hpp {
-    struct CommData {
-    int nswap;
-    int* sendnum;
-    int* recvnum;
-    int* firstrecv;
-    int** sendlist;
-    int* sendproc;
-    int* recvproc;
-    long int* world;
-
-    CommData() : nswap(0), sendnum(nullptr), recvnum(nullptr),
-                 firstrecv(nullptr), sendlist(nullptr),
-                 sendproc(nullptr), recvproc(nullptr),world(nullptr) {}
-};
+//     struct CommData {
+//     int nswap;
+//     int* sendnum;
+//     int* recvnum;
+//     int* firstrecv;
+//     int** sendlist;
+//     int* sendproc;
+//     int* recvproc;
+//     long int* world;
+
+//     CommData() : nswap(0), sendnum(nullptr), recvnum(nullptr),
+//                  firstrecv(nullptr), sendlist(nullptr),
+//                  sendproc(nullptr), recvproc(nullptr),world(nullptr) {}
+// };
 /**
  * @brief Neighbor list.
  **/
@@ -536,13 +536,20 @@ struct InputNlist {
         nl(DP_NewNlist(inum_, ilist_, numneigh_, firstneigh_)) {
     DP_CHECK_OK(DP_NlistCheckOK, nl);
   };
-  InputNlist(int inum_, int *ilist_, int *numneigh_, int **firstneigh_, CommData *commdata_)
+  InputNlist(int inum_, int *ilist_, int *numneigh_, int **firstneigh_, int nswap,
+                           int* sendnum,
+                           int* recvnum,
+                           int* firstrecv,
+                           int** sendlist,
+                           int* sendproc,
+                           int* recvproc,
+                           int world)
       : inum(inum_),
         ilist(ilist_),
         numneigh(numneigh_),
         firstneigh(firstneigh_),
-        nl(DP_NewNlist_comm(inum_, ilist_, numneigh_, firstneigh_,commdata_->nswap,commdata_->sendnum,commdata_->recvnum,commdata_->firstrecv,commdata_->sendlist,commdata_->sendproc,commdata_->recvproc,commdata_->world)) {
-    DP_CHECK_OK(DP_NlistCheckOK, nl);
+        nl(DP_NewNlist_comm(inum_, ilist_, numneigh_, firstneigh_,nswap,sendnum,recvnum,firstrecv,sendlist,sendproc,recvproc,world)) {
+    //DP_CHECK_OK(DP_NlistCheckOK, nl);
   };
   ~InputNlist() { DP_DeleteNlist(nl); };
   /// @brief C API neighbor list.
@@ -822,7 +829,6 @@ class DeepPot {
                        aparam);
     const VALUETYPE *fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr;
     const VALUETYPE *aparam__ = !aparam_.empty() ? &aparam_[0] : nullptr;
-
     _DP_DeepPotComputeNList<VALUETYPE>(
         dp, nframes, natoms, coord_, atype_, box_, nghost, lmp_list.nl, ago,
         fparam__, aparam__, ener_, force_, virial_, nullptr, nullptr);
diff --git a/source/api_c/src/c_api.cc b/source/api_c/src/c_api.cc
index b74e24a2f4..48690cfd4f 100644
--- a/source/api_c/src/c_api.cc
+++ b/source/api_c/src/c_api.cc
@@ -35,10 +35,9 @@ DP_Nlist* DP_NewNlist_comm(int inum_,
                            int** sendlist,
                            int* sendproc,
                            int* recvproc,
-                           long int* world) {
-  deepmd::CommData commdata(nswap, sendnum, recvnum, firstrecv, sendlist,
+                           int world) {
+  deepmd::InputNlist nl(inum_, ilist_, numneigh_, firstneigh_,nswap, sendnum, recvnum, firstrecv, sendlist,
                             sendproc, recvproc, world);
-  deepmd::InputNlist nl(inum_, ilist_, numneigh_, firstneigh_, &commdata);
   DP_Nlist* new_nl = new DP_Nlist(nl);
   return new_nl;
 }
diff --git a/source/api_cc/include/DeepPotPT.h b/source/api_cc/include/DeepPotPT.h
index a7fc910b46..0b61d1a2a2 100644
--- a/source/api_cc/include/DeepPotPT.h
+++ b/source/api_cc/include/DeepPotPT.h
@@ -327,6 +327,7 @@ class DeepPotPT : public DeepPotBase {
   int gpu_id;
   bool gpu_enabled;
   at::Tensor firstneigh_tensor;
+  torch::Dict<std::string, torch::Tensor> comm_dict;
 };
 
 }  // namespace deepmd
diff --git a/source/api_cc/src/DeepPotPT.cc b/source/api_cc/src/DeepPotPT.cc
index 2c3fd1d865..10ec6a4a58 100644
--- a/source/api_cc/src/DeepPotPT.cc
+++ b/source/api_cc/src/DeepPotPT.cc
@@ -105,8 +105,10 @@ void DeepPotPT::compute(ENERGYVTYPE& ener,
     options = torch::TensorOptions().dtype(torch::kFloat32);
     floatType = torch::kFloat32;
   }
-  auto int_options = torch::TensorOptions().dtype(torch::kInt64);
-  auto int32_options = torch::TensorOptions().dtype(torch::kInt32);
+  auto int32_option =
+      torch::TensorOptions().device(torch::kCPU).dtype(torch::kInt32);
+  auto int_option =
+      torch::TensorOptions().device(torch::kCPU).dtype(torch::kInt64);
 
   // select real atoms
   std::vector<VALUETYPE> dcoord, dforce, aparam_, datom_energy, datom_virial;
@@ -122,11 +124,37 @@ void DeepPotPT::compute(ENERGYVTYPE& ener,
           .to(device);
   std::vector<int64_t> atype_64(datype.begin(), datype.end());
   at::Tensor atype_Tensor =
-      torch::from_blob(atype_64.data(), {1, nall_real}, int_options).to(device);
+      torch::from_blob(atype_64.data(), {1, nall_real}, int_option).to(device);
   if (ago == 0) {
     nlist_data.copy_from_nlist(lmp_list);
     nlist_data.shuffle_exclude_empty(fwd_map);
     nlist_data.padding();
+
+    int nswap = lmp_list.nswap;
+    torch::Tensor sendproc_tensor =
+        torch::from_blob(lmp_list.sendproc, {nswap}, int32_option);
+    torch::Tensor recvproc_tensor =
+        torch::from_blob(lmp_list.recvproc, {nswap}, int32_option);
+    torch::Tensor firstrecv_tensor =
+        torch::from_blob(lmp_list.firstrecv, {nswap}, int32_option);
+    torch::Tensor recvnum_tensor =
+        torch::from_blob(lmp_list.recvnum, {nswap}, int32_option);
+    torch::Tensor sendnum_tensor =
+        torch::from_blob(lmp_list.sendnum, {nswap}, int32_option);
+    // torch::Tensor communicator_tensor =
+    //     torch::from_blob(lmp_list.commdata->world, {1}, int_option);
+    torch::Tensor communicator_tensor = torch::tensor(lmp_list.world,int32_option);
+    torch::Tensor nswap_tensor = torch::tensor(nswap, int32_option);
+    int total_send = std::accumulate(lmp_list.sendnum,lmp_list.sendnum+nswap,0);
+    torch::Tensor sendlist_tensor = torch::from_blob(
+        lmp_list.sendlist, {total_send}, int32_option);
+
+    comm_dict.insert("send_list",sendlist_tensor);
+    comm_dict.insert("send_proc",sendproc_tensor);
+    comm_dict.insert("recv_proc",recvproc_tensor);
+    comm_dict.insert("send_num",sendnum_tensor);
+    comm_dict.insert("recv_num",recvnum_tensor);
+    comm_dict.insert("communicator",communicator_tensor);
   }
   at::Tensor firstneigh = createNlistTensor(nlist_data.jlist);
   firstneigh_tensor = firstneigh.to(torch::kInt64).to(device);
@@ -152,7 +180,7 @@ void DeepPotPT::compute(ENERGYVTYPE& ener,
       module
           .run_method("forward_lower", coord_wrapped_Tensor, atype_Tensor,
                       firstneigh_tensor, optional_tensor, fparam_tensor,
-                      aparam_tensor, do_atom_virial_tensor)
+                      aparam_tensor, do_atom_virial_tensor,comm_dict)
           .toGenericDict();
   c10::IValue energy_ = outputs.at("energy");
   c10::IValue force_ = outputs.at("extended_force");
diff --git a/source/lib/include/neighbor_list.h b/source/lib/include/neighbor_list.h
index 45101e3fdc..39c88c13ef 100644
--- a/source/lib/include/neighbor_list.h
+++ b/source/lib/include/neighbor_list.h
@@ -11,41 +11,42 @@
 #include "utilities.h"
 
 namespace deepmd {
-struct CommData {
-  int nswap;
-  int* sendnum;
-  int* recvnum;
-  int* firstrecv;
-  int** sendlist;
-  int* sendproc;
-  int* recvproc;
-  long int* world;
+// struct CommData {
+//   int nswap;
+//   int* sendnum;
+//   int* recvnum;
+//   int* firstrecv;
+//   int** sendlist;
+//   int* sendproc;
+//   int* recvproc;
+//   long int world;
 
-  CommData()
-      : nswap(0),
-        sendnum(nullptr),
-        recvnum(nullptr),
-        firstrecv(nullptr),
-        sendlist(nullptr),
-        sendproc(nullptr),
-        recvproc(nullptr),
-        world(nullptr){};
-  CommData(int nswap,
-           int* sendnum,
-           int* recvnum,
-           int* firstrecv,
-           int** sendlist,
-           int* sendproc,
-           int* recvproc,
-           long int* world)
-      : nswap(nswap),
-        sendnum(sendnum),
-        recvnum(recvnum),
-        firstrecv(firstrecv),
-        sendlist(sendlist),
-        recvproc(recvproc),
-        world(world) {}
-};
+//   CommData()
+//       : nswap(0),
+//         sendnum(nullptr),
+//         recvnum(nullptr),
+//         firstrecv(nullptr),
+//         sendlist(nullptr),
+//         sendproc(nullptr),
+//         recvproc(nullptr),
+//         world(0){};
+//   CommData(int nswap,
+//            int* sendnum,
+//            int* recvnum,
+//            int* firstrecv,
+//            int** sendlist,
+//            int* sendproc,
+//            int* recvproc,
+//            long int world)
+//       : nswap(nswap),
+//         sendnum(sendnum),
+//         recvnum(recvnum),
+//         firstrecv(firstrecv),
+//         sendlist(sendlist),
+//         sendproc(sendproc),
+//         recvproc(recvproc),
+//         world(world) {}
+// };
 
 /**
  * @brief             Construct InputNlist with the input LAMMPS nbor list info.
@@ -61,29 +62,65 @@ struct InputNlist {
   int* numneigh;
   /// Array stores the core region atom's neighbor index
   int** firstneigh;
-  CommData* commdata;
+
+  int nswap;
+  int* sendnum;
+  int* recvnum;
+  int* firstrecv;
+  int** sendlist;
+  int* sendproc;
+  int* recvproc;
+  int world;
   InputNlist()
       : inum(0),
         ilist(NULL),
         numneigh(NULL),
         firstneigh(NULL),
-        commdata(NULL){};
+        nswap(0),
+        sendnum(nullptr),
+        recvnum(nullptr),
+        firstrecv(nullptr),
+        sendlist(nullptr),
+        sendproc(nullptr),
+        recvproc(nullptr),
+        world(0){};
   InputNlist(int inum_, int* ilist_, int* numneigh_, int** firstneigh_)
       : inum(inum_),
         ilist(ilist_),
         numneigh(numneigh_),
         firstneigh(firstneigh_),
-        commdata(NULL){};
+        nswap(0),
+        sendnum(nullptr),
+        recvnum(nullptr),
+        firstrecv(nullptr),
+        sendlist(nullptr),
+        sendproc(nullptr),
+        recvproc(nullptr),
+        world(0){};
   InputNlist(int inum_,
              int* ilist_,
              int* numneigh_,
              int** firstneigh_,
-             CommData* commdata_)
+             int nswap,
+           int* sendnum,
+           int* recvnum,
+           int* firstrecv,
+           int** sendlist,
+           int* sendproc,
+           int* recvproc,
+           int world)
       : inum(inum_),
         ilist(ilist_),
         numneigh(numneigh_),
         firstneigh(firstneigh_),
-        commdata(commdata_){};
+        nswap(nswap),
+        sendnum(sendnum),
+        recvnum(recvnum),
+        firstrecv(firstrecv),
+        sendlist(sendlist),
+        sendproc(sendproc),
+        recvproc(recvproc),
+        world(world) {};
   ~InputNlist(){};
 };
 
diff --git a/source/lmp/builtin.cmake b/source/lmp/builtin.cmake
index f29e9d3319..05372f9452 100644
--- a/source/lmp/builtin.cmake
+++ b/source/lmp/builtin.cmake
@@ -24,6 +24,7 @@ target_sources(
           ${LAMMPS_SOURCE_DIR}/EXTRA-FIX/fix_ttm.cpp # for ttm
 )
 target_link_libraries(lammps PUBLIC DeePMD::deepmd_c)
+target_link_libraries(lammps PUBLIC -Wl,--no-as-needed "/mnt/user/zhangxiangyu/workspace/dpkit/deepmd-kit/source/op_pt/libop_pt.so" "/home/zhangxiangyu/.conda/envs/dp-cxxabi/lib/libmpi.so")
 target_include_directories(
   lammps PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}
                  ${LAMMPS_SOURCE_DIR}/KSPACE ${LAMMPS_SOURCE_DIR}/EXTRA-FIX)
diff --git a/source/lmp/pair_deepmd.cpp b/source/lmp/pair_deepmd.cpp
index 052b9d1b67..a30e420387 100644
--- a/source/lmp/pair_deepmd.cpp
+++ b/source/lmp/pair_deepmd.cpp
@@ -6,6 +6,7 @@
 #include <limits>
 #include <map>
 #include <sstream>
+#include <cassert>
 
 #include "atom.h"
 #include "citeme.h"
@@ -472,15 +473,17 @@ void PairDeepMD::compute(int eflag, int vflag) {
   int newton_pair = force->newton_pair;
 
   // for dpa2 communication
-  deepmd_compat::CommData* commdata = new deepmd_compat::CommData(); 
-  commdata->nswap = cb->nswap;
-  commdata->sendnum = cb->sendnum; // dim: nswap
-  commdata->recvnum = cb->recvnum; // dim: nswap
-  commdata->firstrecv = cb->firstrecv; // dim: nswap
-  commdata->sendlist = cb->sendlist; // dim: nswap x sendnum[nswap]
-  commdata->sendproc = cb->sendproc; // dim: nswap
-  commdata->recvproc = cb->recvproc; // dim: nswap
-  commdata->world = reinterpret_cast<long*>(world);
+  // deepmd_compat::CommData* commdata = new deepmd_compat::CommData(); 
+  // commdata->nswap = cb->nswap;
+  // commdata->sendnum = cb->sendnum; // dim: nswap
+  // commdata->recvnum = cb->recvnum; // dim: nswap
+  // commdata->firstrecv = cb->firstrecv; // dim: nswap
+  // commdata->sendlist = cb->sendlist; // dim: nswap x sendnum[nswap]
+  // commdata->sendproc = cb->sendproc; // dim: nswap
+  // commdata->recvproc = cb->recvproc; // dim: nswap
+  assert(sizeof(MPI_Comm) == sizeof(int));
+  //std::cout<<"world:"<<world<<std::endl;
+  int world_int = world;
   double* prd = domain->prd;
   vector<double> dspin(nall * 3, 0.);
   vector<double> dfm(nall * 3, 0.);
@@ -562,7 +565,7 @@ void PairDeepMD::compute(int eflag, int vflag) {
       (numb_models > 1 && (out_freq > 0 && update->ntimestep % out_freq == 0));
   if (do_ghost) {
     deepmd_compat::InputNlist lmp_list(list->inum, list->ilist, list->numneigh,
-                                       list->firstneigh,commdata);
+                                       list->firstneigh,commdata_->nswap,commdata_->sendnum,commdata_->recvnum,commdata_->firstrecv,commdata_->sendlist,commdata_->sendproc,commdata_->recvproc,world_int);
     // else
     // deepmd_compat::InputNlist lmp_list(list->inum, list->ilist, list->numneigh,
     //                                    list->firstneigh);
@@ -1291,7 +1294,7 @@ void PairDeepMD::coeff(int narg, char **arg) {
   }
 
   //dpa2 communication
-  cb = (CommBrickDeepMD *)comm;
+  commdata_ = (CommBrickDeepMD *)comm;
 }
 
 void PairDeepMD::init_style() {
diff --git a/source/lmp/pair_deepmd.h b/source/lmp/pair_deepmd.h
index 55ddb515de..9255e36c6d 100644
--- a/source/lmp/pair_deepmd.h
+++ b/source/lmp/pair_deepmd.h
@@ -141,7 +141,7 @@ class PairDeepMD : public Pair {
   tagint *tagsend, *tagrecv;
   double *stdfsend, *stdfrecv;
   std::vector<int> type_idx_map;
-  CommBrickDeepMD* cb;
+  CommBrickDeepMD* commdata_;
 };
 
 }  // namespace LAMMPS_NS
diff --git a/source/lmp/plugin/CMakeLists.txt b/source/lmp/plugin/CMakeLists.txt
index bfc2253412..7cf9c829fa 100644
--- a/source/lmp/plugin/CMakeLists.txt
+++ b/source/lmp/plugin/CMakeLists.txt
@@ -85,6 +85,7 @@ if(DEFINED LAMMPS_SOURCE_ROOT OR DEFINED LAMMPS_VERSION)
     target_compile_definitions(${libname} PUBLIC "DP_USE_CXX_API")
   endif()
   target_link_libraries(${libname} PUBLIC lammps_interface)
+  target_link_libraries(${libname}  PUBLIC -Wl,--no-as-needed "/mnt/user/zhangxiangyu/workspace/dpkit/deepmd-kit/source/op_pt/libop_pt.so" "/home/zhangxiangyu/.conda/envs/dp-cxxabi/lib/libmpi.so")
   target_include_directories(
     ${libname}
     PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/..
diff --git a/source/op_pt/CMakeLists.txt b/source/op_pt/CMakeLists.txt
index d24cfc763f..90c79f1576 100644
--- a/source/op_pt/CMakeLists.txt
+++ b/source/op_pt/CMakeLists.txt
@@ -1,25 +1,22 @@
 # libop
+cmake_minimum_required(VERSION 3.12 FATAL_ERROR)
+project(op)
+set(GLIBCXX_USE_CXX11_ABI 1)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
-file(
-  GLOB
-  OP_PT_SRC
-  comm.cc
-)
-find_package(MPI REQUIRED)
-
-add_library(${LIB_DEEPMD_OP_PT} MODULE ${OP_PT_SRC})
-target_link_libraries(${LIB_DEEPMD_OP_PT} PRIVATE "${TORCH_LIBRARIES}")
-target_link_libraries(${LIB_DEEPMD_OP_PT} PRIVATE MPI::MPI_CXX)
-target_link_libraries(${LIB_DEEPMD_OP_PT} PRIVATE ${LIB_DEEPMD})
-
-if(CMAKE_TESTING_ENABLED)
-  target_link_libraries(${LIB_DEEPMD_OP_PT} PRIVATE coverage_config)
+find_package(Python REQUIRED COMPONENTS Development)
+find_package(Torch REQUIRED)
+find_package(MPI REQUIRED)  # provides the MPI::MPI_CXX imported target
+find_package(CUDA REQUIRED)
+if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+    set(CMAKE_CUDA_ARCHITECTURES 80)
+endif ()
+option(USE_CUDA "GPU_SUPPORT" ON)
+if(USE_CUDA)
+  add_definitions(-DUSE_CUDA)
 endif()
 
-
-
-if(BUILD_PY_IF)
-  install(TARGETS ${LIB_DEEPMD_OP_PT} DESTINATION deepmd/lib/)
-else(BUILD_PY_IF)
-  install(TARGETS ${LIB_DEEPMD_OP_PT} DESTINATION lib/)
-endif(BUILD_PY_IF)
+add_library(op_pt SHARED comm.cc)
+target_include_directories(op_pt PRIVATE ${Python_INCLUDE_DIRS})
+target_link_libraries(op_pt "${TORCH_LIBRARIES}" "${Python_LIBRARIES}" MPI::MPI_CXX)
diff --git a/source/op_pt/comm.cc b/source/op_pt/comm.cc
new file mode 100644
index 0000000000..69be4be331
--- /dev/null
+++ b/source/op_pt/comm.cc
@@ -0,0 +1,233 @@
+#include "custom_op.h"
+#include <cuda_runtime_api.h>
+#include <cuda.h>
+  /// Maps a C++ floating-point type to the MPI datatype used to transfer it.
+  template <typename T>
+  static MPI_Datatype get_mpi_type();
+
+  template <>
+  MPI_Datatype get_mpi_type<float>() {
+    return MPI_FLOAT;
+  }
+  template <>
+  MPI_Datatype get_mpi_type<double>() {
+    return MPI_DOUBLE;
+  }
+
+class Border : public torch::autograd::Function<Border> {
+ public:
+  /// Forward pass: writes received rows into `g1` starting at row `nlocal`,
+  /// exchanging data per swap over MPI (or by a local copy when the peer is
+  /// this rank). `g1` is read as doubles, modified in place and returned.
+  static torch::autograd::variable_list forward(
+      torch::autograd::AutogradContext* ctx,
+      const torch::Tensor& sendlist_tensor,
+      const torch::Tensor& sendproc_tensor,
+      const torch::Tensor& recvproc_tensor,
+      const torch::Tensor& sendnum_tensor,
+      const torch::Tensor& recvnum_tensor,
+      const torch::Tensor& g1,
+      const torch::Tensor& communicator_tensor,
+      const torch::Tensor& nlocal_tensor,
+      const torch::Tensor& nghost_tensor) {
+    using FPTYPE = double;
+    ctx->save_for_backward({sendlist_tensor, sendproc_tensor, recvproc_tensor,
+                            sendnum_tensor, recvnum_tensor, communicator_tensor,
+                            nlocal_tensor, nghost_tensor});
+    int** sendlist = reinterpret_cast<int**>(sendlist_tensor.data_ptr());
+    int* sendproc = sendproc_tensor.data_ptr<int>();
+    int* recvproc = recvproc_tensor.data_ptr<int>();
+    int* sendnum = sendnum_tensor.data_ptr<int>();
+    int* recvnum = recvnum_tensor.data_ptr<int>();
+    int tensor_size = g1.size(1);
+    int nswap = sendproc_tensor.size(0);
+    int nlocal = nlocal_tensor.item<int>();
+    torch::Tensor recv_g1_tensor = g1;
+    // Received rows are stored right after the nlocal local rows.
+    FPTYPE* recv_g1 =
+        recv_g1_tensor.data_ptr<FPTYPE>() + nlocal * tensor_size;
+    // sendproc/recvproc are used as ranks on the unpacked communicator, so
+    // `me` is queried on that communicator instead of MPI_COMM_WORLD.
+    MPI_Comm world;
+    unpack_communicator(communicator_tensor, world);
+    int me;
+    MPI_Comm_rank(world, &me);
+    MPI_Datatype mpi_type = get_mpi_type<FPTYPE>();
+    MPI_Request request;
+    auto int32_options = torch::TensorOptions()
+                             .dtype(torch::kInt32);
+    for (int iswap = 0; iswap < nswap; ++iswap) {
+      int nrecv = recvnum[iswap];
+      int nsend = sendnum[iswap];
+      torch::Tensor isendlist =
+          torch::from_blob(sendlist[iswap], {nsend}, int32_options).to(recv_g1_tensor.device());
+      // Gather the rows to send into one contiguous buffer.
+      torch::Tensor send_g1_tensor =
+          recv_g1_tensor.index_select(0, isendlist);
+      FPTYPE* send_g1 = send_g1_tensor.data_ptr<FPTYPE>();
+      // Post the receive first, then the blocking send, then wait on it.
+      if (sendproc[iswap] != me) {
+        if (nrecv) {
+          MPI_Irecv(recv_g1, nrecv * tensor_size, mpi_type,
+                    recvproc[iswap], 0, world, &request);
+        }
+        if (nsend) {
+          MPI_Send(send_g1, nsend * tensor_size, mpi_type, sendproc[iswap],
+                   0, world);
+        }
+        if (nrecv) {
+          MPI_Wait(&request, MPI_STATUS_IGNORE);
+        }
+      } else {
+        // Peer is this rank: copy the gathered rows straight into place.
+#ifdef USE_CUDA
+        cudaMemcpy(recv_g1, send_g1,
+                         nsend * tensor_size * sizeof(FPTYPE), cudaMemcpyDeviceToDevice);
+#else
+        memcpy(recv_g1, send_g1,
+               nsend * tensor_size * sizeof(FPTYPE));
+#endif
+      }
+      // Advance to where the next swap's received rows go.
+      recv_g1 += nrecv * tensor_size;
+    }
+
+    return {recv_g1_tensor};
+  }
+  /// Backward pass: undoes the exchange of forward() in reverse swap order.
+  /// Gradient rows stored after the local rows are sent back to the rank they
+  /// were received from and accumulated there onto the rows named by the
+  /// send list. Only the `g1` input of forward() gets a gradient.
+  static torch::autograd::variable_list backward(
+      torch::autograd::AutogradContext* ctx,
+      torch::autograd::variable_list grad_output) {
+#ifdef USE_CUDA
+    cudaDeviceSynchronize();
+#endif
+    using FPTYPE = double;
+    torch::autograd::variable_list saved_variables = ctx->get_saved_variables();
+    torch::Tensor sendlist_tensor = saved_variables[0];
+    torch::Tensor sendproc_tensor = saved_variables[1];
+    torch::Tensor recvproc_tensor = saved_variables[2];
+    torch::Tensor sendnum_tensor = saved_variables[3];
+    torch::Tensor recvnum_tensor = saved_variables[4];
+    torch::Tensor communicator_tensor = saved_variables[5];
+    torch::Tensor nlocal_tensor = saved_variables[6];
+    torch::Tensor nghost_tensor = saved_variables[7];
+
+    torch::Tensor d_local_g1_tensor = grad_output[0];
+
+    // Roles are mirrored relative to forward(): whatever was sent there is
+    // received here, so the forward send list serves as the receive list.
+    int** recvlist = reinterpret_cast<int**>(sendlist_tensor.data_ptr());
+    int* recvproc = sendproc_tensor.data_ptr<int>();
+    int* sendproc = recvproc_tensor.data_ptr<int>();
+    int* recvnum = sendnum_tensor.data_ptr<int>();
+    int* sendnum = recvnum_tensor.data_ptr<int>();
+
+    int tensor_size = d_local_g1_tensor.size(1);
+    int nswap = sendproc_tensor.size(0);
+
+    int nlocal = nlocal_tensor.item<int>();
+    int nghost = nghost_tensor.item<int>();
+    int ntotal = nlocal + nghost;
+
+    torch::Tensor send_g1_tensor = d_local_g1_tensor;
+
+    // Scratch buffer sized for the largest receive; max() is not defined on
+    // an empty tensor, so fall back to 0 rows when there are no swaps.
+    int max_recvnum = nswap > 0 ? sendnum_tensor.max().item<int>() : 0;
+    auto options = torch::TensorOptions()
+                       .dtype(torch::kFloat64)
+                       .device(d_local_g1_tensor.device());
+    torch::Tensor recv_g1_tensor =
+        torch::empty({max_recvnum, tensor_size}, options);
+    FPTYPE* recv_g1 = recv_g1_tensor.data_ptr<FPTYPE>();
+    // Starts one row past nlocal + nghost; moved back by nsend rows per swap.
+    FPTYPE* send_g1 =
+        send_g1_tensor.data_ptr<FPTYPE>() + ntotal * tensor_size;
+
+    MPI_Comm world;
+    unpack_communicator(communicator_tensor, world);
+    int me;
+    MPI_Comm_rank(world, &me);
+    MPI_Datatype mpi_type = get_mpi_type<FPTYPE>();
+    MPI_Request request;
+
+    auto int32_options = torch::TensorOptions()
+                             .dtype(torch::kInt32);
+
+    // Walk the swaps in the reverse of the order used by forward().
+    for (int iswap = nswap - 1; iswap >= 0; --iswap) {
+      int nrecv = recvnum[iswap];
+      int nsend = sendnum[iswap];
+
+      torch::Tensor irecvlist;
+      if (nrecv) {
+        irecvlist = torch::from_blob(recvlist[iswap], {nrecv}, int32_options).to(d_local_g1_tensor.device());
+      }
+      if (nsend) {
+        send_g1 -= nsend * tensor_size;
+      }
+      // Post the receive first, then the blocking send, then wait on it.
+      if (sendproc[iswap] != me) {
+        if (nrecv) {
+          MPI_Irecv(recv_g1, nrecv * tensor_size, mpi_type,
+                    recvproc[iswap], 0, world, &request);
+        }
+        if (nsend) {
+          MPI_Send(send_g1, nsend * tensor_size, mpi_type, sendproc[iswap],
+                   0, world);
+        }
+        if (nrecv) {
+          MPI_Wait(&request, MPI_STATUS_IGNORE);
+        }
+      } else {
+        // Peer is this rank: copy the rows into the scratch buffer directly.
+        if (nrecv) {
+#ifdef USE_CUDA
+          cudaMemcpy(recv_g1, send_g1,
+                         nrecv * tensor_size * sizeof(FPTYPE), cudaMemcpyDeviceToDevice);
+#else
+          memcpy(recv_g1, send_g1,
+                 nrecv * tensor_size * sizeof(FPTYPE));
+#endif
+        }
+      }
+      // Accumulate the received gradient rows onto the rows they belong to.
+      if (nrecv) {
+        d_local_g1_tensor.index_add_(
+            0, irecvlist, recv_g1_tensor.slice(0, 0, nrecv));
+      }
+    }
+#ifdef USE_CUDA
+    cudaDeviceSynchronize();
+#endif
+
+    // One slot per forward() input (nine in total); only g1, the sixth
+    // input, has a gradient, every other slot is an undefined tensor.
+    return {torch::Tensor(), torch::Tensor(), torch::Tensor(),
+            torch::Tensor(), torch::Tensor(), d_local_g1_tensor,
+            torch::Tensor(), torch::Tensor(), torch::Tensor()};
+  }
+  static void unpack_communicator(const torch::Tensor& communicator_tensor,
+                                  MPI_Comm& mpi_comm) {
+    int* communicator = communicator_tensor.data_ptr<int>();
+    mpi_comm = reinterpret_cast<MPI_Comm>(*communicator);
+  }
+};
+std::vector<torch::Tensor> border_op(const torch::Tensor& sendlist_tensor,
+                                     const torch::Tensor& sendproc_tensor,
+                                     const torch::Tensor& recvproc_tensor,
+                                     const torch::Tensor& sendnum_tensor,
+                                     const torch::Tensor& recvnum_tensor,
+                                     const torch::Tensor& g1_tensor,
+                                     const torch::Tensor& communicator_tensor,
+                                     const torch::Tensor& nlocal_tensor,
+                                     const torch::Tensor& nghost_tensor) {
+  return Border::apply(sendlist_tensor, sendproc_tensor, recvproc_tensor,
+                       sendnum_tensor, recvnum_tensor, g1_tensor,communicator_tensor, nlocal_tensor,
+                       nghost_tensor);
+}
+
+TORCH_LIBRARY_FRAGMENT(my_ops, m) { m.def("border_op", border_op); }

From c94d818f669488feb27563a815d3f59b7fe6649a Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 21 Mar 2024 07:12:07 +0000
Subject: [PATCH 7/7] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 deepmd/pt/entrypoints/main.py                 |   2 +-
 .../model/atomic_model/base_atomic_model.py   |   4 +-
 .../pt/model/atomic_model/dp_atomic_model.py  |   8 +-
 deepmd/pt/model/descriptor/dpa2.py            |  12 +-
 deepmd/pt/model/descriptor/repformers.py      |  47 +++++---
 deepmd/pt/model/descriptor/se_a.py            |   2 +-
 deepmd/pt/model/model/ener_model.py           |   9 +-
 deepmd/pt/model/model/make_model.py           |   6 +-
 deepmd/pt/utils/env.py                        |  12 +-
 source/api_c/include/deepmd.hpp               |  39 ++++--
 source/api_c/src/c_api.cc                     |   5 +-
 source/api_cc/src/DeepPotPT.cc                |  24 ++--
 source/lib/include/neighbor_list.h            |  16 +--
 source/lmp/builtin.cmake                      |   7 +-
 source/lmp/pair_deepmd.cpp                    |  22 ++--
 source/lmp/pair_deepmd.h                      |   4 +-
 source/lmp/plugin/CMakeLists.txt              |   7 +-
 source/op_pt/CMakeLists.txt                   |  11 +-
 source/op_pt/comm.cc                          | 114 ++++++++----------
 source/op_pt/custom_op.h                      |   3 +-
 20 files changed, 191 insertions(+), 163 deletions(-)

diff --git a/deepmd/pt/entrypoints/main.py b/deepmd/pt/entrypoints/main.py
index 824a5b4e2e..450b878083 100644
--- a/deepmd/pt/entrypoints/main.py
+++ b/deepmd/pt/entrypoints/main.py
@@ -46,7 +46,7 @@
 )
 from deepmd.pt.utils.env import (
     DEVICE,
-    load_op
+    load_op,
 )
 from deepmd.pt.utils.finetune import (
     change_finetune_model_params,
diff --git a/deepmd/pt/model/atomic_model/base_atomic_model.py b/deepmd/pt/model/atomic_model/base_atomic_model.py
index 877c9c2d4f..55a5797dab 100644
--- a/deepmd/pt/model/atomic_model/base_atomic_model.py
+++ b/deepmd/pt/model/atomic_model/base_atomic_model.py
@@ -105,7 +105,7 @@ def forward_common_atomic(
         mapping: Optional[torch.Tensor] = None,
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
-        comm_dict: Optional[Dict[str, torch.Tensor]] = None
+        comm_dict: Optional[Dict[str, torch.Tensor]] = None,
     ) -> Dict[str, torch.Tensor]:
         """Common interface for atomic inference.
 
@@ -154,7 +154,7 @@ def forward_common_atomic(
             mapping=mapping,
             fparam=fparam,
             aparam=aparam,
-            comm_dict=comm_dict
+            comm_dict=comm_dict,
         )
 
         # nf x nloc
diff --git a/deepmd/pt/model/atomic_model/dp_atomic_model.py b/deepmd/pt/model/atomic_model/dp_atomic_model.py
index 6b023217cb..1f6eb146cf 100644
--- a/deepmd/pt/model/atomic_model/dp_atomic_model.py
+++ b/deepmd/pt/model/atomic_model/dp_atomic_model.py
@@ -134,7 +134,7 @@ def forward_atomic(
         mapping: Optional[torch.Tensor] = None,
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
-        comm_dict: Optional[Dict[str, torch.Tensor]] = None
+        comm_dict: Optional[Dict[str, torch.Tensor]] = None,
     ) -> Dict[str, torch.Tensor]:
         """Return atomic prediction.
 
@@ -164,11 +164,7 @@ def forward_atomic(
         if self.do_grad_r() or self.do_grad_c():
             extended_coord.requires_grad_(True)
         descriptor, rot_mat, g2, h2, sw = self.descriptor(
-            extended_coord,
-            extended_atype,
-            nlist,
-            mapping=mapping,
-            comm_dict=comm_dict
+            extended_coord, extended_atype, nlist, mapping=mapping, comm_dict=comm_dict
         )
         assert descriptor is not None
         # energy, force
diff --git a/deepmd/pt/model/descriptor/dpa2.py b/deepmd/pt/model/descriptor/dpa2.py
index 69ae514afb..443f07fcd6 100644
--- a/deepmd/pt/model/descriptor/dpa2.py
+++ b/deepmd/pt/model/descriptor/dpa2.py
@@ -1,11 +1,11 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 from typing import (
     Callable,
+    Dict,
     List,
     Optional,
     Tuple,
     Union,
-    Dict,
 )
 
 import torch
@@ -22,10 +22,6 @@
 from deepmd.pt.utils.update_sel import (
     UpdateSel,
 )
-
-from deepmd.pt.utils.env import(
-    load_op
-)
 from deepmd.utils.path import (
     DPPath,
 )
@@ -400,7 +396,7 @@ def forward(
         extended_atype: torch.Tensor,
         nlist: torch.Tensor,
         mapping: Optional[torch.Tensor] = None,
-        comm_dict: Optional[Dict[str, torch.Tensor]] = None
+        comm_dict: Optional[Dict[str, torch.Tensor]] = None,
     ):
         """Compute the descriptor.
 
@@ -456,7 +452,7 @@ def forward(
         # linear to change shape
         g1 = self.g1_shape_tranform(g1)
         # mapping g1
-        if(comm_dict is None):
+        if comm_dict is None:
             assert mapping is not None
             # mapping_ext = (
             #     mapping.view(nframes, nall).unsqueeze(-1).expand(-1, -1, g1.shape[-1])
@@ -473,7 +469,7 @@ def forward(
             extended_atype,
             g1,
             mapping,
-            comm_dict
+            comm_dict,
         )
         if self.concat_output_tebd:
             g1 = torch.cat([g1, g1_inp], dim=-1)
diff --git a/deepmd/pt/model/descriptor/repformers.py b/deepmd/pt/model/descriptor/repformers.py
index cc48b4cbc9..adaea32b8a 100644
--- a/deepmd/pt/model/descriptor/repformers.py
+++ b/deepmd/pt/model/descriptor/repformers.py
@@ -226,6 +226,7 @@ def reinit_exclude(
     ):
         self.exclude_types = exclude_types
         self.emask = PairExcludeMask(self.ntypes, exclude_types=exclude_types)
+
     @torch.jit.script_method
     def forward(
         self,
@@ -234,7 +235,7 @@ def forward(
         extended_atype: torch.Tensor,
         extended_atype_embd: Optional[torch.Tensor] = None,
         mapping: Optional[torch.Tensor] = None,
-        comm_dict: Optional[Dict[str, torch.Tensor]] = None
+        comm_dict: Optional[Dict[str, torch.Tensor]] = None,
     ):
         if comm_dict is None:
             assert extended_atype_embd is not None
@@ -258,7 +259,7 @@ def forward(
         sw = sw.masked_fill(~nlist_mask, 0.0)
 
         # [nframes, nloc, tebd_dim]
-        #atype_embd = extended_atype_embd[:, :nloc, :]
+        # atype_embd = extended_atype_embd[:, :nloc, :]
         atype_embd = extended_atype_embd
         if atype_embd is not None:
             assert list(atype_embd.shape) == [nframes, nloc, self.g1_dim]
@@ -282,7 +283,9 @@ def forward(
         # nb x nall x ng1
         if comm_dict is None:
             assert mapping is not None
-            mapping = mapping.view(nframes, nall).unsqueeze(-1).expand(-1, -1, self.g1_dim)
+            mapping = (
+                mapping.view(nframes, nall).unsqueeze(-1).expand(-1, -1, self.g1_dim)
+            )
         for idx, ll in enumerate(self.layers):
             # g1:     nb x nloc x ng1
             # g1_ext: nb x nall x ng1
@@ -292,21 +295,29 @@ def forward(
             else:
                 # padding = torch.zeros(nall-nloc, g1.size(2),device=mydev)
                 # g1 = torch.cat((g1.squeeze(0), padding), dim=0)
-                n_padding = nall -nloc
-                g1 = torch.nn.functional.pad(g1.squeeze(0), (0, 0, 0, n_padding), value=0.0)
-                assert 'send_list' in comm_dict
-                assert 'send_proc' in comm_dict
-                assert 'recv_proc' in comm_dict
-                assert 'send_num'  in comm_dict
-                assert 'recv_num'  in comm_dict
-                assert 'communicator' in comm_dict
-                ret = env.op_module.border_op(comm_dict['send_list'],
-                                comm_dict['send_proc'], comm_dict['recv_proc'],
-                                comm_dict['send_num'], comm_dict['recv_num'],
-                                g1,
-                                comm_dict['communicator'],torch.tensor(nloc),torch.tensor(nall-nloc))
-                g1_ext = ret[0].unsqueeze(0) 
-            
+                n_padding = nall - nloc
+                g1 = torch.nn.functional.pad(
+                    g1.squeeze(0), (0, 0, 0, n_padding), value=0.0
+                )
+                assert "send_list" in comm_dict
+                assert "send_proc" in comm_dict
+                assert "recv_proc" in comm_dict
+                assert "send_num" in comm_dict
+                assert "recv_num" in comm_dict
+                assert "communicator" in comm_dict
+                ret = env.op_module.border_op(
+                    comm_dict["send_list"],
+                    comm_dict["send_proc"],
+                    comm_dict["recv_proc"],
+                    comm_dict["send_num"],
+                    comm_dict["recv_num"],
+                    g1,
+                    comm_dict["communicator"],
+                    torch.tensor(nloc),
+                    torch.tensor(nall - nloc),
+                )
+                g1_ext = ret[0].unsqueeze(0)
+
             g1, g2, h2 = ll.forward(
                 g1_ext,
                 g2,
diff --git a/deepmd/pt/model/descriptor/se_a.py b/deepmd/pt/model/descriptor/se_a.py
index a2c505ad81..d77f7c336c 100644
--- a/deepmd/pt/model/descriptor/se_a.py
+++ b/deepmd/pt/model/descriptor/se_a.py
@@ -191,7 +191,7 @@ def forward(
         atype_ext: torch.Tensor,
         nlist: torch.Tensor,
         mapping: Optional[torch.Tensor] = None,
-        comm_dict: Optional[Dict[str, torch.Tensor]] = None
+        comm_dict: Optional[Dict[str, torch.Tensor]] = None,
     ):
         """Compute the descriptor.
 
diff --git a/deepmd/pt/model/model/ener_model.py b/deepmd/pt/model/model/ener_model.py
index db5fff57f6..eba738271b 100644
--- a/deepmd/pt/model/model/ener_model.py
+++ b/deepmd/pt/model/model/ener_model.py
@@ -6,12 +6,11 @@
 
 import torch
 
+
 from .dp_model import (
     DPModel,
 )
-from deepmd.pt.utils.env import (
-    load_op
-)
+
 
 class EnergyModel(DPModel):
     model_type = "ener"
@@ -71,7 +70,7 @@ def forward_lower(
         fparam: Optional[torch.Tensor] = None,
         aparam: Optional[torch.Tensor] = None,
         do_atomic_virial: bool = False,
-        comm_dict: Optional[Dict[str, torch.Tensor]] = None
+        comm_dict: Optional[Dict[str, torch.Tensor]] = None,
     ):
         model_ret = self.forward_common_lower(
             extended_coord,
@@ -81,7 +80,7 @@ def forward_lower(
             fparam=fparam,
             aparam=aparam,
             do_atomic_virial=do_atomic_virial,
-            comm_dict=comm_dict
+            comm_dict=comm_dict,
         )
         if self.get_fitting_net() is not None:
             model_predict = {}
diff --git a/deepmd/pt/model/model/make_model.py b/deepmd/pt/model/model/make_model.py
index b76b98921e..bc5847f00a 100644
--- a/deepmd/pt/model/model/make_model.py
+++ b/deepmd/pt/model/model/make_model.py
@@ -161,7 +161,7 @@ def forward_common(
                 mapping,
                 do_atomic_virial=do_atomic_virial,
                 fparam=fp,
-                aparam=ap
+                aparam=ap,
             )
             model_predict = communicate_extended_output(
                 model_predict_lower,
@@ -181,7 +181,7 @@ def forward_common_lower(
             fparam: Optional[torch.Tensor] = None,
             aparam: Optional[torch.Tensor] = None,
             do_atomic_virial: bool = False,
-            comm_dict: Optional[Dict[str, torch.Tensor]] = None
+            comm_dict: Optional[Dict[str, torch.Tensor]] = None,
         ):
             """Return model prediction. Lower interface that takes
             extended atomic coordinates and types, nlist, and mapping
@@ -225,7 +225,7 @@ def forward_common_lower(
                 mapping=mapping,
                 fparam=fp,
                 aparam=ap,
-                comm_dict = comm_dict
+                comm_dict=comm_dict,
             )
             model_predict = fit_output_to_model_output(
                 atomic_ret,
diff --git a/deepmd/pt/utils/env.py b/deepmd/pt/utils/env.py
index 7f41097856..ab9d546349 100644
--- a/deepmd/pt/utils/env.py
+++ b/deepmd/pt/utils/env.py
@@ -1,8 +1,9 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
 import os
 from typing import (
-   Any,
+    Any,
 )
+
 import numpy as np
 import torch
 
@@ -82,7 +83,12 @@
     "ENERGY_BIAS_TRAINABLE",
     "LOCAL_RANK",
 ]
+
+
 def load_op():
-    torch.ops.load_library("/mnt/user/zhangxiangyu/workspace/dpkit/deepmd-kit/source/op_pt/libop_pt.so")
+    torch.ops.load_library(
+        "/mnt/user/zhangxiangyu/workspace/dpkit/deepmd-kit/source/op_pt/libop_pt.so"
+    )
+
 
-op_module: Any=torch.ops.my_ops
\ No newline at end of file
+op_module: Any = torch.ops.my_ops
diff --git a/source/api_c/include/deepmd.hpp b/source/api_c/include/deepmd.hpp
index cca8dd3868..8d284bf9b8 100644
--- a/source/api_c/include/deepmd.hpp
+++ b/source/api_c/include/deepmd.hpp
@@ -536,21 +536,36 @@ struct InputNlist {
         nl(DP_NewNlist(inum_, ilist_, numneigh_, firstneigh_)) {
     DP_CHECK_OK(DP_NlistCheckOK, nl);
   };
-  InputNlist(int inum_, int *ilist_, int *numneigh_, int **firstneigh_, int nswap,
-                           int* sendnum,
-                           int* recvnum,
-                           int* firstrecv,
-                           int** sendlist,
-                           int* sendproc,
-                           int* recvproc,
-                           int world)
+  InputNlist(int inum_,
+             int *ilist_,
+             int *numneigh_,
+             int **firstneigh_,
+             int nswap,
+             int *sendnum,
+             int *recvnum,
+             int *firstrecv,
+             int **sendlist,
+             int *sendproc,
+             int *recvproc,
+             int world)
       : inum(inum_),
         ilist(ilist_),
         numneigh(numneigh_),
         firstneigh(firstneigh_),
-        nl(DP_NewNlist_comm(inum_, ilist_, numneigh_, firstneigh_,nswap,sendnum,recvnum,firstrecv,sendlist,sendproc,recvproc,world)) {
-    //DP_CHECK_OK(DP_NlistCheckOK, nl);
-  };
+        nl(DP_NewNlist_comm(inum_,
+                            ilist_,
+                            numneigh_,
+                            firstneigh_,
+                            nswap,
+                            sendnum,
+                            recvnum,
+                            firstrecv,
+                            sendlist,
+                            sendproc,
+                            recvproc,
+                            world)){
+            // DP_CHECK_OK(DP_NlistCheckOK, nl);
+        };
   ~InputNlist() { DP_DeleteNlist(nl); };
   /// @brief C API neighbor list.
   DP_Nlist *nl;
@@ -564,8 +579,6 @@ struct InputNlist {
   int **firstneigh;
 };
 
-
-
 /**
  * @brief Convert pbtxt to pb.
  * @param[in] fn_pb_txt Filename of the pb txt file.
diff --git a/source/api_c/src/c_api.cc b/source/api_c/src/c_api.cc
index 48690cfd4f..0d5c3694fc 100644
--- a/source/api_c/src/c_api.cc
+++ b/source/api_c/src/c_api.cc
@@ -36,8 +36,9 @@ DP_Nlist* DP_NewNlist_comm(int inum_,
                            int* sendproc,
                            int* recvproc,
                            int world) {
-  deepmd::InputNlist nl(inum_, ilist_, numneigh_, firstneigh_,nswap, sendnum, recvnum, firstrecv, sendlist,
-                            sendproc, recvproc, world);
+  deepmd::InputNlist nl(inum_, ilist_, numneigh_, firstneigh_, nswap, sendnum,
+                        recvnum, firstrecv, sendlist, sendproc, recvproc,
+                        world);
   DP_Nlist* new_nl = new DP_Nlist(nl);
   return new_nl;
 }
diff --git a/source/api_cc/src/DeepPotPT.cc b/source/api_cc/src/DeepPotPT.cc
index 10ec6a4a58..70f1daaff1 100644
--- a/source/api_cc/src/DeepPotPT.cc
+++ b/source/api_cc/src/DeepPotPT.cc
@@ -143,18 +143,20 @@ void DeepPotPT::compute(ENERGYVTYPE& ener,
         torch::from_blob(lmp_list.sendnum, {nswap}, int32_option);
     // torch::Tensor communicator_tensor =
     //     torch::from_blob(lmp_list.commdata->world, {1}, int_option);
-    torch::Tensor communicator_tensor = torch::tensor(lmp_list.world,int32_option);
+    torch::Tensor communicator_tensor =
+        torch::tensor(lmp_list.world, int32_option);
     torch::Tensor nswap_tensor = torch::tensor(nswap, int32_option);
-    int total_send = std::accumulate(lmp_list.sendnum,lmp_list.sendnum+nswap,0);
-    torch::Tensor sendlist_tensor = torch::from_blob(
-        lmp_list.sendlist, {total_send}, int32_option);
+    int total_send =
+        std::accumulate(lmp_list.sendnum, lmp_list.sendnum + nswap, 0);
+    torch::Tensor sendlist_tensor =
+        torch::from_blob(lmp_list.sendlist, {total_send}, int32_option);
 
-    comm_dict.insert("send_list",sendlist_tensor);
-    comm_dict.insert("send_proc",sendproc_tensor);
-    comm_dict.insert("recv_proc",recvproc_tensor);
-    comm_dict.insert("send_num",sendnum_tensor);
-    comm_dict.insert("recv_num",recvnum_tensor);
-    comm_dict.insert("communicator",communicator_tensor);
+    comm_dict.insert("send_list", sendlist_tensor);
+    comm_dict.insert("send_proc", sendproc_tensor);
+    comm_dict.insert("recv_proc", recvproc_tensor);
+    comm_dict.insert("send_num", sendnum_tensor);
+    comm_dict.insert("recv_num", recvnum_tensor);
+    comm_dict.insert("communicator", communicator_tensor);
   }
   at::Tensor firstneigh = createNlistTensor(nlist_data.jlist);
   firstneigh_tensor = firstneigh.to(torch::kInt64).to(device);
@@ -180,7 +182,7 @@ void DeepPotPT::compute(ENERGYVTYPE& ener,
       module
           .run_method("forward_lower", coord_wrapped_Tensor, atype_Tensor,
                       firstneigh_tensor, optional_tensor, fparam_tensor,
-                      aparam_tensor, do_atom_virial_tensor,comm_dict)
+                      aparam_tensor, do_atom_virial_tensor, comm_dict)
           .toGenericDict();
   c10::IValue energy_ = outputs.at("energy");
   c10::IValue force_ = outputs.at("extended_force");
diff --git a/source/lib/include/neighbor_list.h b/source/lib/include/neighbor_list.h
index 39c88c13ef..67e5ea80b1 100644
--- a/source/lib/include/neighbor_list.h
+++ b/source/lib/include/neighbor_list.h
@@ -102,13 +102,13 @@ struct InputNlist {
              int* numneigh_,
              int** firstneigh_,
              int nswap,
-           int* sendnum,
-           int* recvnum,
-           int* firstrecv,
-           int** sendlist,
-           int* sendproc,
-           int* recvproc,
-           int world)
+             int* sendnum,
+             int* recvnum,
+             int* firstrecv,
+             int** sendlist,
+             int* sendproc,
+             int* recvproc,
+             int world)
       : inum(inum_),
         ilist(ilist_),
         numneigh(numneigh_),
@@ -120,7 +120,7 @@ struct InputNlist {
         sendlist(sendlist),
         sendproc(sendproc),
         recvproc(recvproc),
-        world(world) {};
+        world(world){};
   ~InputNlist(){};
 };
 
diff --git a/source/lmp/builtin.cmake b/source/lmp/builtin.cmake
index 05372f9452..3828a57f4c 100644
--- a/source/lmp/builtin.cmake
+++ b/source/lmp/builtin.cmake
@@ -24,7 +24,12 @@ target_sources(
           ${LAMMPS_SOURCE_DIR}/EXTRA-FIX/fix_ttm.cpp # for ttm
 )
 target_link_libraries(lammps PUBLIC DeePMD::deepmd_c)
-target_link_libraries(lammps PUBLIC -Wl,--no-as-needed "/mnt/user/zhangxiangyu/workspace/dpkit/deepmd-kit/source/op_pt/libop_pt.so" "/home/zhangxiangyu/.conda/envs/dp-cxxabi/lib/libmpi.so")
+target_link_libraries(
+  lammps
+  PUBLIC
+    -Wl,--no-as-needed
+    "/mnt/user/zhangxiangyu/workspace/dpkit/deepmd-kit/source/op_pt/libop_pt.so"
+    "/home/zhangxiangyu/.conda/envs/dp-cxxabi/lib/libmpi.so")
 target_include_directories(
   lammps PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}
                  ${LAMMPS_SOURCE_DIR}/KSPACE ${LAMMPS_SOURCE_DIR}/EXTRA-FIX)
diff --git a/source/lmp/pair_deepmd.cpp b/source/lmp/pair_deepmd.cpp
index a30e420387..7aa1bdbe41 100644
--- a/source/lmp/pair_deepmd.cpp
+++ b/source/lmp/pair_deepmd.cpp
@@ -1,12 +1,12 @@
 // SPDX-License-Identifier: LGPL-3.0-or-later
 #include <string.h>
 
+#include <cassert>
 #include <iomanip>
 #include <iostream>
 #include <limits>
 #include <map>
 #include <sstream>
-#include <cassert>
 
 #include "atom.h"
 #include "citeme.h"
@@ -473,7 +473,7 @@ void PairDeepMD::compute(int eflag, int vflag) {
   int newton_pair = force->newton_pair;
 
   // for dpa2 communication
-  // deepmd_compat::CommData* commdata = new deepmd_compat::CommData(); 
+  // deepmd_compat::CommData* commdata = new deepmd_compat::CommData();
   // commdata->nswap = cb->nswap;
   // commdata->sendnum = cb->sendnum; // dim: nswap
   // commdata->recvnum = cb->recvnum; // dim: nswap
@@ -482,9 +482,9 @@ void PairDeepMD::compute(int eflag, int vflag) {
   // commdata->sendproc = cb->sendproc; // dim: nswap
   // commdata->recvproc = cb->recvproc; // dim: nswap
   assert(sizeof(MPI_Comm) == sizeof(int));
-  //std::cout<<"world:"<<world<<std::endl;
+  // std::cout<<"world:"<<world<<std::endl;
   int world_int = world;
-  double* prd = domain->prd;
+  double *prd = domain->prd;
   vector<double> dspin(nall * 3, 0.);
   vector<double> dfm(nall * 3, 0.);
   double **sp = atom->sp;
@@ -564,10 +564,14 @@ void PairDeepMD::compute(int eflag, int vflag) {
   multi_models_mod_devi =
       (numb_models > 1 && (out_freq > 0 && update->ntimestep % out_freq == 0));
   if (do_ghost) {
-    deepmd_compat::InputNlist lmp_list(list->inum, list->ilist, list->numneigh,
-                                       list->firstneigh,commdata_->nswap,commdata_->sendnum,commdata_->recvnum,commdata_->firstrecv,commdata_->sendlist,commdata_->sendproc,commdata_->recvproc,world_int);
+    deepmd_compat::InputNlist lmp_list(
+        list->inum, list->ilist, list->numneigh, list->firstneigh,
+        commdata_->nswap, commdata_->sendnum, commdata_->recvnum,
+        commdata_->firstrecv, commdata_->sendlist, commdata_->sendproc,
+        commdata_->recvproc, world_int);
     // else
-    // deepmd_compat::InputNlist lmp_list(list->inum, list->ilist, list->numneigh,
+    // deepmd_compat::InputNlist lmp_list(list->inum, list->ilist,
+    // list->numneigh,
     //                                    list->firstneigh);
     deepmd_compat::InputNlist extend_lmp_list;
     if (atom->sp_flag) {
@@ -953,7 +957,7 @@ void PairDeepMD::settings(int narg, char **arg) {
   if (narg <= 0) {
     error->all(FLERR, "Illegal pair_style command");
   }
-  
+
   vector<string> models;
   int iarg = 0;
   while (iarg < narg) {
@@ -1293,7 +1297,7 @@ void PairDeepMD::coeff(int narg, char **arg) {
     }
   }
 
-  //dpa2 communication
+  // dpa2 communication
   commdata_ = (CommBrickDeepMD *)comm;
 }
 
diff --git a/source/lmp/pair_deepmd.h b/source/lmp/pair_deepmd.h
index 9255e36c6d..bb4e48cfa1 100644
--- a/source/lmp/pair_deepmd.h
+++ b/source/lmp/pair_deepmd.h
@@ -37,7 +37,7 @@ namespace deepmd_compat = deepmd::hpp;
 #define FLOAT_PREC double
 
 namespace LAMMPS_NS {
-class CommBrickDeepMD : public CommBrick{
+class CommBrickDeepMD : public CommBrick {
   friend class PairDeepMD;
 };
 class PairDeepMD : public Pair {
@@ -141,7 +141,7 @@ class PairDeepMD : public Pair {
   tagint *tagsend, *tagrecv;
   double *stdfsend, *stdfrecv;
   std::vector<int> type_idx_map;
-  CommBrickDeepMD* commdata_;
+  CommBrickDeepMD *commdata_;
 };
 
 }  // namespace LAMMPS_NS
diff --git a/source/lmp/plugin/CMakeLists.txt b/source/lmp/plugin/CMakeLists.txt
index 4b0ccb5355..cfe29b5867 100644
--- a/source/lmp/plugin/CMakeLists.txt
+++ b/source/lmp/plugin/CMakeLists.txt
@@ -85,7 +85,12 @@ if(DEFINED LAMMPS_SOURCE_ROOT OR DEFINED LAMMPS_VERSION)
     target_compile_definitions(${libname} PUBLIC "DP_USE_CXX_API")
   endif()
   target_link_libraries(${libname} PUBLIC lammps_interface)
-  target_link_libraries(${libname}  PUBLIC -Wl,--no-as-needed "/mnt/user/zhangxiangyu/workspace/dpkit/deepmd-kit/source/op_pt/libop_pt.so" "/home/zhangxiangyu/.conda/envs/dp-cxxabi/lib/libmpi.so")
+  target_link_libraries(
+    ${libname}
+    PUBLIC
+      -Wl,--no-as-needed
+      "/mnt/user/zhangxiangyu/workspace/dpkit/deepmd-kit/source/op_pt/libop_pt.so"
+      "/home/zhangxiangyu/.conda/envs/dp-cxxabi/lib/libmpi.so")
   target_include_directories(
     ${libname}
     PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/..
diff --git a/source/op_pt/CMakeLists.txt b/source/op_pt/CMakeLists.txt
index 90c79f1576..57f566b6e7 100644
--- a/source/op_pt/CMakeLists.txt
+++ b/source/op_pt/CMakeLists.txt
@@ -7,11 +7,11 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
 find_package(Python REQUIRED COMPONENTS Development)
 find_package(Torch REQUIRED)
-#find_package(MPI REQUIRED)
+# find_package(MPI REQUIRED)
 find_package(CUDA REQUIRED)
-if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-    set(CMAKE_CUDA_ARCHITECTURES 80)
-endif ()
+if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+  set(CMAKE_CUDA_ARCHITECTURES 80)
+endif()
 option(USE_CUDA "GPU_SUPPORT" ON)
 if(USE_CUDA)
   add_definitions(-DUSE_CUDA)
@@ -19,4 +19,5 @@ endif()
 
 add_library(op_pt SHARED comm.cc)
 target_include_directories(op_pt PRIVATE ${Python_INCLUDE_DIRS})
-target_link_libraries(op_pt "${TORCH_LIBRARIES}" "${Python_LIBRARIES}" "/home/zhangxiangyu/.conda/envs/dp-cxxabi/lib/libmpi.so")
+target_link_libraries(op_pt "${TORCH_LIBRARIES}" "${Python_LIBRARIES}"
+                      "/home/zhangxiangyu/.conda/envs/dp-cxxabi/lib/libmpi.so")
diff --git a/source/op_pt/comm.cc b/source/op_pt/comm.cc
index 69be4be331..7cfd578ec6 100644
--- a/source/op_pt/comm.cc
+++ b/source/op_pt/comm.cc
@@ -1,18 +1,20 @@
-#include "custom_op.h"
-#include <cuda_runtime_api.h>
+// SPDX-License-Identifier: LGPL-3.0-or-later
 #include <cuda.h>
-  template <typename T>
-  static MPI_Datatype get_mpi_type();
+#include <cuda_runtime_api.h>
 
-  template <>
-  MPI_Datatype get_mpi_type<float>() {
-    return MPI_FLOAT;
-  }
+#include "custom_op.h"
+template <typename T>
+static MPI_Datatype get_mpi_type();
 
-  template <>
-  MPI_Datatype get_mpi_type<double>() {
-    return MPI_DOUBLE;
-  }
+template <>
+MPI_Datatype get_mpi_type<float>() {
+  return MPI_FLOAT;
+}
+
+template <>
+MPI_Datatype get_mpi_type<double>() {
+  return MPI_DOUBLE;
+}
 
 class Border : public torch::autograd::Function<Border> {
  public:
@@ -44,8 +46,7 @@ class Border : public torch::autograd::Function<Border> {
     int ntotal = nlocal + nghost;
     torch::Tensor recv_g1_tensor = g1;
 
-    FPTYPE* recv_g1 =
-        recv_g1_tensor.data_ptr<FPTYPE>() + nlocal * tensor_size;
+    FPTYPE* recv_g1 = recv_g1_tensor.data_ptr<FPTYPE>() + nlocal * tensor_size;
 
     int me;
     MPI_Comm_rank(MPI_COMM_WORLD, &me);
@@ -53,40 +54,38 @@ class Border : public torch::autograd::Function<Border> {
     unpack_communicator(communicator_tensor, world);
     MPI_Datatype mpi_type = get_mpi_type<FPTYPE>();
     MPI_Request request;
-    auto int32_options = torch::TensorOptions()
-                             .dtype(torch::kInt32);
-    std::cout<<"nswap: "<<nswap<<std::endl;
+    auto int32_options = torch::TensorOptions().dtype(torch::kInt32);
+    std::cout << "nswap: " << nswap << std::endl;
     for (int iswap = 0; iswap < nswap; ++iswap) {
-      std::cout<<"num"<<iswap<<std::endl;
+      std::cout << "num" << iswap << std::endl;
       int nrecv = recvnum[iswap];
       int nsend = sendnum[iswap];
       torch::Tensor isendlist =
-          torch::from_blob(sendlist[iswap], {nsend}, int32_options).to(recv_g1_tensor.device());
-      torch::Tensor send_g1_tensor =
-          recv_g1_tensor.index_select(0, isendlist);
+          torch::from_blob(sendlist[iswap], {nsend}, int32_options)
+              .to(recv_g1_tensor.device());
+      torch::Tensor send_g1_tensor = recv_g1_tensor.index_select(0, isendlist);
       FPTYPE* send_g1 = send_g1_tensor.data_ptr<FPTYPE>();
       if (sendproc[iswap] != me) {
         if (nrecv) {
-          std::cout<<"recv"<<std::endl;
-          MPI_Irecv(recv_g1, nrecv * tensor_size, mpi_type,
-                    recvproc[iswap], 0, world, &request);
+          std::cout << "recv" << std::endl;
+          MPI_Irecv(recv_g1, nrecv * tensor_size, mpi_type, recvproc[iswap], 0,
+                    world, &request);
         }
         if (nsend) {
-          std::cout<<"send"<<std::endl;
-          MPI_Send(send_g1, nsend * tensor_size, mpi_type, sendproc[iswap],
-                   0, world);
+          std::cout << "send" << std::endl;
+          MPI_Send(send_g1, nsend * tensor_size, mpi_type, sendproc[iswap], 0,
+                   world);
         }
         if (nrecv) {
-          std::cout<<"wait"<<std::endl;
+          std::cout << "wait" << std::endl;
           MPI_Wait(&request, MPI_STATUS_IGNORE);
         }
       } else {
 #ifdef USE_CUDA
-        cudaMemcpy(recv_g1, send_g1,
-                         nsend * tensor_size * sizeof(FPTYPE), cudaMemcpyDeviceToDevice);
+        cudaMemcpy(recv_g1, send_g1, nsend * tensor_size * sizeof(FPTYPE),
+                   cudaMemcpyDeviceToDevice);
 #else
-        memcpy(recv_g1, send_g1,
-               nsend * tensor_size * sizeof(FPTYPE));
+        memcpy(recv_g1, send_g1, nsend * tensor_size * sizeof(FPTYPE));
 #endif
       }
       recv_g1 += nrecv * tensor_size;
@@ -137,8 +136,7 @@ class Border : public torch::autograd::Function<Border> {
     torch::Tensor recv_g1_tensor =
         torch::empty({max_recvnum, tensor_size}, options);
     FPTYPE* recv_g1 = recv_g1_tensor.data_ptr<FPTYPE>();
-    FPTYPE* send_g1 =
-        send_g1_tensor.data_ptr<FPTYPE>() + ntotal * tensor_size;
+    FPTYPE* send_g1 = send_g1_tensor.data_ptr<FPTYPE>() + ntotal * tensor_size;
 
     MPI_Comm world;
     unpack_communicator(communicator_tensor, world);
@@ -149,66 +147,56 @@ class Border : public torch::autograd::Function<Border> {
 
     std::string msg;
 
-
     int end = ntotal;
-    auto int32_options = torch::TensorOptions()
-                             .dtype(torch::kInt32);
-    std::cout<<"nswap backward"<<nswap<<std::endl;
+    auto int32_options = torch::TensorOptions().dtype(torch::kInt32);
+    std::cout << "nswap backward" << nswap << std::endl;
     for (int iswap = nswap - 1; iswap >= 0; --iswap) {
-
-
       int nrecv = recvnum[iswap];
       int nsend = sendnum[iswap];
 
       torch::Tensor irecvlist;
       if (nrecv) {
-        irecvlist = torch::from_blob(recvlist[iswap], {nrecv}, int32_options).to(d_local_g1_tensor.device());
+        irecvlist = torch::from_blob(recvlist[iswap], {nrecv}, int32_options)
+                        .to(d_local_g1_tensor.device());
       }
       if (nsend) {
         send_g1 -= nsend * tensor_size;
       }
       if (sendproc[iswap] != me) {
         if (nrecv) {
-          MPI_Irecv(recv_g1, nrecv * tensor_size, mpi_type,
-                    recvproc[iswap], 0, world, &request);
+          MPI_Irecv(recv_g1, nrecv * tensor_size, mpi_type, recvproc[iswap], 0,
+                    world, &request);
         }
         if (nsend) {
-
-          MPI_Send(send_g1, nsend * tensor_size, mpi_type, sendproc[iswap],
-                   0, world);
+          MPI_Send(send_g1, nsend * tensor_size, mpi_type, sendproc[iswap], 0,
+                   world);
         }
         if (nrecv) {
-
           MPI_Wait(&request, MPI_STATUS_IGNORE);
-
         }
       } else {
         if (nrecv) {
-
 #ifdef USE_CUDA
-          cudaMemcpy(recv_g1, send_g1,
-                         nrecv * tensor_size * sizeof(FPTYPE), cudaMemcpyDeviceToDevice);
+          cudaMemcpy(recv_g1, send_g1, nrecv * tensor_size * sizeof(FPTYPE),
+                     cudaMemcpyDeviceToDevice);
 #else
-          memcpy(recv_g1, send_g1,
-                 nrecv * tensor_size * sizeof(FPTYPE));
+          memcpy(recv_g1, send_g1, nrecv * tensor_size * sizeof(FPTYPE));
 #endif
-
         }
       }
       if (nrecv) {
-        d_local_g1_tensor.index_add_(
-            0, irecvlist, recv_g1_tensor.slice(0, 0, nrecv));
+        d_local_g1_tensor.index_add_(0, irecvlist,
+                                     recv_g1_tensor.slice(0, 0, nrecv));
       }
-
     }
 #ifdef USE_CUDA
     cudaDeviceSynchronize();
 #endif
 
-    return {torch::Tensor(),       torch::Tensor(), torch::Tensor(),
-            torch::Tensor(),       torch::Tensor(), d_local_g1_tensor,
-            torch::Tensor(), torch::Tensor(),
-            torch::Tensor(),       torch::Tensor()};
+    return {torch::Tensor(), torch::Tensor(), torch::Tensor(),
+            torch::Tensor(), torch::Tensor(), d_local_g1_tensor,
+            torch::Tensor(), torch::Tensor(), torch::Tensor(),
+            torch::Tensor()};
   }
   static void unpack_communicator(const torch::Tensor& communicator_tensor,
                                   MPI_Comm& mpi_comm) {
@@ -226,8 +214,8 @@ std::vector<torch::Tensor> border_op(const torch::Tensor& sendlist_tensor,
                                      const torch::Tensor& nlocal_tensor,
                                      const torch::Tensor& nghost_tensor) {
   return Border::apply(sendlist_tensor, sendproc_tensor, recvproc_tensor,
-                       sendnum_tensor, recvnum_tensor, g1_tensor,communicator_tensor, nlocal_tensor,
-                       nghost_tensor);
+                       sendnum_tensor, recvnum_tensor, g1_tensor,
+                       communicator_tensor, nlocal_tensor, nghost_tensor);
 }
 
 TORCH_LIBRARY_FRAGMENT(my_ops, m) { m.def("border_op", border_op); }
diff --git a/source/op_pt/custom_op.h b/source/op_pt/custom_op.h
index e8724181da..1b4dba62f1 100644
--- a/source/op_pt/custom_op.h
+++ b/source/op_pt/custom_op.h
@@ -1,2 +1,3 @@
+// SPDX-License-Identifier: LGPL-3.0-or-later
+#include <mpi.h>
 #include <torch/torch.h>
-#include <mpi.h>
\ No newline at end of file