From 9fb24a323930b6bc64d69d605641e4ad9977e916 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Sat, 16 Sep 2023 00:48:50 -0400
Subject: [PATCH 1/6] allocate int_temp and uint64_temp out of loop

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 source/op/prod_env_mat_multi_device.cc | 159 +++++++++++--------------
 1 file changed, 69 insertions(+), 90 deletions(-)
diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc
index a8882fb5f4..8ac1ff3cd5 100644
--- a/source/op/prod_env_mat_multi_device.cc
+++ b/source/op/prod_env_mat_multi_device.cc
@@ -556,6 +556,28 @@ class ProdEnvMatAOp : public OpKernel {
     const FPTYPE* std = std_tensor.flat<FPTYPE>().data();
     const int* p_type = type_tensor.flat<int>().data();
 
+    Tensor int_temp;
+    Tensor uint64_temp;
+    if (device == "GPU") {
+      // allocate temp memory only once for multiple frames
+      // allocate temp memory, temp memory must not be used after this
+      // operation!
+
+      // used for format_nbor_list_gpu_cuda
+
+      TensorShape int_shape;
+      int_shape.AddDim(sec_a.size() + int_64(nloc) * sec_a.size() + nloc);
+      OP_REQUIRES_OK(context,
+                     context->allocate_temp(DT_INT32, int_shape, &int_temp));
+
+      TensorShape uint64_shape;
+      uint64_shape.AddDim(int_64(nloc) * max_nbor_size * 2);
+      OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape,
+                                                     &uint64_temp));
+      array_int = int_temp.flat<int>().data();
+      array_longlong = uint64_temp.flat<unsigned long long>().data();
+    }
+
     // loop over samples
     for (int_64 ff = 0; ff < nsamples; ++ff) {
       FPTYPE* em = p_em + ff * nloc * ndescrpt;
@@ -586,21 +608,6 @@ class ProdEnvMatAOp : public OpKernel {
             mesh_tensor.flat<int>().data(), mesh_tensor_size, nloc, nei_mode,
             rcut_r, max_cpy_trial, max_nnei_trial);
 
-        // allocate temp memory, temp memory must not be used after this
-        // operation!
-        Tensor int_temp;
-        TensorShape int_shape;
-        int_shape.AddDim(sec_a.size() + int_64(nloc) * sec_a.size() + nloc);
-        OP_REQUIRES_OK(context,
-                       context->allocate_temp(DT_INT32, int_shape, &int_temp));
-        Tensor uint64_temp;
-        TensorShape uint64_shape;
-        uint64_shape.AddDim(int_64(nloc) * max_nbor_size * 2);
-        OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape,
-                                                       &uint64_temp));
-        array_int = int_temp.flat<int>().data();
-        array_longlong = uint64_temp.flat<unsigned long long>().data();
-
         // launch the gpu(nv) compute function
         deepmd::prod_env_mat_a_gpu_cuda(em, em_deriv, rij, nlist, coord, type,
                                         gpu_inlist, array_int, array_longlong,
@@ -631,21 +638,6 @@ class ProdEnvMatAOp : public OpKernel {
             mesh_tensor.flat<int>().data(), mesh_tensor_size, nloc, nei_mode,
             rcut_r, max_cpy_trial, max_nnei_trial);
 
-        // allocate temp memory, temp memory must not be used after this
-        // operation!
-        Tensor int_temp;
-        TensorShape int_shape;
-        int_shape.AddDim(sec_a.size() + int_64(nloc) * sec_a.size() + nloc);
-        OP_REQUIRES_OK(context,
-                       context->allocate_temp(DT_INT32, int_shape, &int_temp));
-        Tensor uint64_temp;
-        TensorShape uint64_shape;
-        uint64_shape.AddDim(int_64(nloc) * max_nbor_size * 2);
-        OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape,
-                                                       &uint64_temp));
-        array_int = int_temp.flat<int>().data();
-        array_longlong = uint64_temp.flat<unsigned long long>().data();
-
         // launch the gpu(nv) compute function
         deepmd::prod_env_mat_a_gpu_rocm(em, em_deriv, rij, nlist, coord, type,
                                         gpu_inlist, array_int, array_longlong,
@@ -848,6 +840,29 @@ class ProdEnvMatROp : public OpKernel {
     const FPTYPE* std = std_tensor.flat<FPTYPE>().data();
     const int* p_type = type_tensor.flat<int>().data();
 
+    Tensor int_temp;
+    Tensor uint64_temp;
+    if (device == "GPU") {
+      // allocate temp memory only once for multiple frames
+      // allocate temp memory, temp memory must not be used after this
+      // operation!
+
+      // used for format_nbor_list_gpu_cuda
+
+      TensorShape int_shape;
+      int_shape.AddDim(sec.size() + int_64(nloc) * sec.size() + nloc);
+      OP_REQUIRES_OK(context,
+                     context->allocate_temp(DT_INT32, int_shape, &int_temp));
+
+      TensorShape uint64_shape;
+      uint64_shape.AddDim(int_64(nloc) * max_nbor_size * 2);
+      OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape,
+                                                     &uint64_temp));
+
+      array_int = int_temp.flat<int>().data();
+      array_longlong = uint64_temp.flat<unsigned long long>().data();
+    }
+
     // loop over samples
     for (int_64 ff = 0; ff < nsamples; ++ff) {
       FPTYPE* em = p_em + ff * nloc * ndescrpt;
@@ -878,21 +893,6 @@ class ProdEnvMatROp : public OpKernel {
             mesh_tensor.flat<int>().data(), mesh_tensor_size, nloc, nei_mode,
             rcut, max_cpy_trial, max_nnei_trial);
 
-        // allocate temp memory, temp memory must not be used after this
-        // operation!
-        Tensor int_temp;
-        TensorShape int_shape;
-        int_shape.AddDim(sec.size() + int_64(nloc) * sec.size() + nloc);
-        OP_REQUIRES_OK(context,
-                       context->allocate_temp(DT_INT32, int_shape, &int_temp));
-        Tensor uint64_temp;
-        TensorShape uint64_shape;
-        uint64_shape.AddDim(int_64(nloc) * max_nbor_size * 2);
-        OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape,
-                                                       &uint64_temp));
-        array_int = int_temp.flat<int>().data();
-        array_longlong = uint64_temp.flat<unsigned long long>().data();
-
         // launch the gpu(nv) compute function
         deepmd::prod_env_mat_r_gpu_cuda(em, em_deriv, rij, nlist, coord, type,
                                         gpu_inlist, array_int, array_longlong,
@@ -923,21 +923,6 @@ class ProdEnvMatROp : public OpKernel {
             mesh_tensor.flat<int>().data(), mesh_tensor_size, nloc, nei_mode,
             rcut, max_cpy_trial, max_nnei_trial);
 
-        // allocate temp memory, temp memory must not be used after this
-        // operation!
-        Tensor int_temp;
-        TensorShape int_shape;
-        int_shape.AddDim(sec.size() + int_64(nloc) * sec.size() + nloc);
-        OP_REQUIRES_OK(context,
-                       context->allocate_temp(DT_INT32, int_shape, &int_temp));
-        Tensor uint64_temp;
-        TensorShape uint64_shape;
-        uint64_shape.AddDim(int_64(nloc) * max_nbor_size * 2);
-        OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape,
-                                                       &uint64_temp));
-        array_int = int_temp.flat<int>().data();
-        array_longlong = uint64_temp.flat<unsigned long long>().data();
-
         // launch the gpu(nv) compute function
         deepmd::prod_env_mat_r_gpu_rocm(em, em_deriv, rij, nlist, coord, type,
                                         gpu_inlist, array_int, array_longlong,
@@ -1190,6 +1175,30 @@ class ProdEnvMatAMixOp : public OpKernel {
       }
     }
 
+    // must declar out of if, otherwise the memory will be destroyed!
+    Tensor int_temp;
+    Tensor uint64_temp;
+    if (device == "GPU") {
+      // allocate temp memory only once for multiple frames
+      // allocate temp memory, temp memory must not be used after this
+      // operation!
+
+      // used for format_nbor_list_gpu_cuda
+
+      TensorShape int_shape;
+      int_shape.AddDim(sec_a.size() + int_64(nloc) * sec_a.size() + nloc);
+      OP_REQUIRES_OK(context,
+                     context->allocate_temp(DT_INT32, int_shape, &int_temp));
+
+      TensorShape uint64_shape;
+      uint64_shape.AddDim(int_64(nloc) * max_nbor_size * 2);
+      OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape,
+                                                     &uint64_temp));
+
+      array_int = int_temp.flat<int>().data();
+      array_longlong = uint64_temp.flat<unsigned long long>().data();
+    }
+
     // loop over samples
     for (int_64 ff = 0; ff < nsamples; ++ff) {
       FPTYPE* em = p_em + ff * nloc * ndescrpt;
@@ -1223,21 +1232,6 @@ class ProdEnvMatAMixOp : public OpKernel {
             mesh_tensor.flat<int>().data(), mesh_tensor_size, nloc, nei_mode,
             rcut_r, max_cpy_trial, max_nnei_trial);
 
-        // allocate temp memory, temp memory must not be used after this
-        // operation!
-        Tensor int_temp;
-        TensorShape int_shape;
-        int_shape.AddDim(sec_a.size() + int_64(nloc) * sec_a.size() + nloc);
-        OP_REQUIRES_OK(context,
-                       context->allocate_temp(DT_INT32, int_shape, &int_temp));
-        Tensor uint64_temp;
-        TensorShape uint64_shape;
-        uint64_shape.AddDim(int_64(nloc) * max_nbor_size * 2);
-        OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape,
-                                                       &uint64_temp));
-        array_int = int_temp.flat<int>().data();
-        array_longlong = uint64_temp.flat<unsigned long long>().data();
-
         // launch the gpu(nv) compute function
         deepmd::prod_env_mat_a_gpu_cuda(
             em, em_deriv, rij, nlist, coord, type, gpu_inlist, array_int,
@@ -1267,21 +1261,6 @@ class ProdEnvMatAMixOp : public OpKernel {
             mesh_tensor.flat<int>().data(), mesh_tensor_size, nloc, nei_mode,
             rcut_r, max_cpy_trial, max_nnei_trial);
 
-        // allocate temp memory, temp memory must not be used after this
-        // operation!
-        Tensor int_temp;
-        TensorShape int_shape;
-        int_shape.AddDim(sec_a.size() + int_64(nloc) * sec_a.size() + nloc);
-        OP_REQUIRES_OK(context,
-                       context->allocate_temp(DT_INT32, int_shape, &int_temp));
-        Tensor uint64_temp;
-        TensorShape uint64_shape;
-        uint64_shape.AddDim(int_64(nloc) * max_nbor_size * 2);
-        OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape,
-                                                       &uint64_temp));
-        array_int = int_temp.flat<int>().data();
-        array_longlong = uint64_temp.flat<unsigned long long>().data();
-
         // launch the gpu(nv) compute function
         deepmd::prod_env_mat_a_gpu_rocm(
             em, em_deriv, rij, nlist, coord, type, gpu_inlist, array_int,

From 4b832213d510c6f8f288fdc9cad6e81b4c209cd6 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Sat, 16 Sep 2023 01:12:30 -0400
Subject: [PATCH 2/6] allocate tensor_list[0,1,3,4,5,6] out of loop

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 source/op/prod_env_mat_multi_device.cc | 167 ++++++++++++++++++-------
 1 file changed, 122 insertions(+), 45 deletions(-)

diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc
index 8ac1ff3cd5..8c4b9c2840 100644
--- a/source/op/prod_env_mat_multi_device.cc
+++ b/source/op/prod_env_mat_multi_device.cc
@@ -556,12 +556,53 @@ class ProdEnvMatAOp : public OpKernel {
     const FPTYPE* std = std_tensor.flat<FPTYPE>().data();
     const int* p_type = type_tensor.flat<int>().data();
 
+    // must declar out of if, otherwise the memory will be destroyed!
     Tensor int_temp;
     Tensor uint64_temp;
+    std::vector<Tensor> tensor_list(7);
     if (device == "GPU") {
       // allocate temp memory only once for multiple frames
       // allocate temp memory, temp memory must not be used after this
       // operation!
+      if (nei_mode != 3) {
+        if (nei_mode == 1) {
+          // Tensor FPTYPE_temp;
+          TensorShape FPTYPE_shape;
+          FPTYPE_shape.AddDim(nall * 3);
+          OP_REQUIRES_OK(context,
+                         context->allocate_temp(DataTypeToEnum<FPTYPE>::value,
+                                                FPTYPE_shape, &tensor_list[0]));
+
+          // Tensor double_temp;
+          TensorShape double_shape;
+          double_shape.AddDim(18);
+          OP_REQUIRES_OK(context,
+                         context->allocate_temp(DataTypeToEnum<FPTYPE>::value,
+                                                double_shape, &tensor_list[1]));
+          // Tensor cpy_temp;
+          TensorShape cpy_shape;
+          cpy_shape.AddDim(mem_cpy * 3);
+          OP_REQUIRES_OK(context,
+                         context->allocate_temp(DataTypeToEnum<FPTYPE>::value,
+                                                cpy_shape, &tensor_list[3]));
+          // Tensor t_temp;
+          TensorShape t_shape;
+          t_shape.AddDim(mem_cpy * 2);
+          OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, t_shape,
+                                                         &tensor_list[4]));
+        }
+
+        // Tensor nlist_temp;
+        TensorShape nlist_shape;
+        nlist_shape.AddDim(nloc * 2);
+        OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, nlist_shape,
+                                                       &tensor_list[5]));
+
+        TensorShape jlist_shape;
+        jlist_shape.AddDim(3 * int_64(nloc) * mem_nnei);
+        OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, jlist_shape,
+                                                       &tensor_list[6]));
+      }
 
       // used for format_nbor_list_gpu_cuda
 
@@ -599,7 +640,6 @@ class ProdEnvMatAOp : public OpKernel {
         int* type_cpy;
         int frame_nall = nall;
         int mesh_tensor_size = static_cast<int>(mesh_tensor.NumElements());
-        std::vector<Tensor> tensor_list(7);
         // prepare coord and nlist
         _prepare_coord_nlist_gpu<FPTYPE>(
             context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy,
@@ -840,12 +880,53 @@ class ProdEnvMatROp : public OpKernel {
     const FPTYPE* std = std_tensor.flat<FPTYPE>().data();
     const int* p_type = type_tensor.flat<int>().data();
 
+    // must declar out of if, otherwise the memory will be destroyed!
     Tensor int_temp;
     Tensor uint64_temp;
+    std::vector<Tensor> tensor_list(7);
     if (device == "GPU") {
       // allocate temp memory only once for multiple frames
       // allocate temp memory, temp memory must not be used after this
       // operation!
+      if (nei_mode != 3) {
+        if (nei_mode == 1) {
+          // Tensor FPTYPE_temp;
+          TensorShape FPTYPE_shape;
+          FPTYPE_shape.AddDim(nall * 3);
+          OP_REQUIRES_OK(context,
+                         context->allocate_temp(DataTypeToEnum<FPTYPE>::value,
+                                                FPTYPE_shape, &tensor_list[0]));
+
+          // Tensor double_temp;
+          TensorShape double_shape;
+          double_shape.AddDim(18);
+          OP_REQUIRES_OK(context,
+                         context->allocate_temp(DataTypeToEnum<FPTYPE>::value,
+                                                double_shape, &tensor_list[1]));
+          // Tensor cpy_temp;
+          TensorShape cpy_shape;
+          cpy_shape.AddDim(mem_cpy * 3);
+          OP_REQUIRES_OK(context,
+                         context->allocate_temp(DataTypeToEnum<FPTYPE>::value,
+                                                cpy_shape, &tensor_list[3]));
+          // Tensor t_temp;
+          TensorShape t_shape;
+          t_shape.AddDim(mem_cpy * 2);
+          OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, t_shape,
+                                                         &tensor_list[4]));
+        }
+
+        // Tensor nlist_temp;
+        TensorShape nlist_shape;
+        nlist_shape.AddDim(nloc * 2);
+        OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, nlist_shape,
+                                                       &tensor_list[5]));
+
+        TensorShape jlist_shape;
+        jlist_shape.AddDim(3 * int_64(nloc) * mem_nnei);
+        OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, jlist_shape,
+                                                       &tensor_list[6]));
+      }
 
       // used for format_nbor_list_gpu_cuda
 
@@ -884,7 +965,6 @@ class ProdEnvMatROp : public OpKernel {
         int* type_cpy;
         int frame_nall = nall;
         int mesh_tensor_size = static_cast<int>(mesh_tensor.NumElements());
-        std::vector<Tensor> tensor_list(7);
         // prepare coord and nlist
         _prepare_coord_nlist_gpu<FPTYPE>(
             context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy,
@@ -1178,10 +1258,50 @@ class ProdEnvMatAMixOp : public OpKernel {
     // must declar out of if, otherwise the memory will be destroyed!
     Tensor int_temp;
     Tensor uint64_temp;
+    std::vector<Tensor> tensor_list(7);
     if (device == "GPU") {
       // allocate temp memory only once for multiple frames
       // allocate temp memory, temp memory must not be used after this
       // operation!
+      if (nei_mode != 3) {
+        if (nei_mode == 1) {
+          // Tensor FPTYPE_temp;
+          TensorShape FPTYPE_shape;
+          FPTYPE_shape.AddDim(nall * 3);
+          OP_REQUIRES_OK(context,
+                         context->allocate_temp(DataTypeToEnum<FPTYPE>::value,
+                                                FPTYPE_shape, &tensor_list[0]));
+
+          // Tensor double_temp;
+          TensorShape double_shape;
+          double_shape.AddDim(18);
+          OP_REQUIRES_OK(context,
+                         context->allocate_temp(DataTypeToEnum<FPTYPE>::value,
+                                                double_shape, &tensor_list[1]));
+          // Tensor cpy_temp;
+          TensorShape cpy_shape;
+          cpy_shape.AddDim(mem_cpy * 3);
+          OP_REQUIRES_OK(context,
+                         context->allocate_temp(DataTypeToEnum<FPTYPE>::value,
+                                                cpy_shape, &tensor_list[3]));
+          // Tensor t_temp;
+          TensorShape t_shape;
+          t_shape.AddDim(mem_cpy * 2);
+          OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, t_shape,
+                                                         &tensor_list[4]));
+        }
+
+        // Tensor nlist_temp;
+        TensorShape nlist_shape;
+        nlist_shape.AddDim(nloc * 2);
+        OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, nlist_shape,
+                                                       &tensor_list[5]));
+
+        TensorShape jlist_shape;
+        jlist_shape.AddDim(3 * int_64(nloc) * mem_nnei);
+        OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, jlist_shape,
+                                                       &tensor_list[6]));
+      }
 
       // used for format_nbor_list_gpu_cuda
 
@@ -1223,7 +1343,6 @@ class ProdEnvMatAMixOp : public OpKernel {
         int* type_cpy;
         int frame_nall = nall;
         int mesh_tensor_size = static_cast<int>(mesh_tensor.NumElements());
-        std::vector<Tensor> tensor_list(7);
         // prepare coord and nlist
         _prepare_coord_nlist_gpu<FPTYPE>(
             context, &tensor_list[0], &coord, coord_cpy, &f_type, type_cpy,
@@ -1477,14 +1596,6 @@ static int _norm_copy_coord_gpu(OpKernelContext* context,
                                 const int& nloc,
                                 const int& max_cpy_trial,
                                 const float& rcut_r) {
-  // Tensor FPTYPE_temp;
-  TensorShape FPTYPE_shape;
-  FPTYPE_shape.AddDim(nall * 3);
-  tensorflow::Status status = context->allocate_temp(
-      DataTypeToEnum<FPTYPE>::value, FPTYPE_shape, tensor_list);
-  if (!status.ok()) {
-    return false;
-  }
   FPTYPE* tmp_coord = (*tensor_list).flat<FPTYPE>().data();
   DPErrcheck(cudaMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3,
                         cudaMemcpyDeviceToDevice));
@@ -1498,14 +1609,6 @@ static int _norm_copy_coord_gpu(OpKernelContext* context,
   deepmd::compute_cell_info(cell_info, rcut_r, region);
   const int loc_cellnum = cell_info[21];
   const int total_cellnum = cell_info[22];
-  // Tensor double_temp;
-  TensorShape double_shape;
-  double_shape.AddDim(18);
-  status = context->allocate_temp(DataTypeToEnum<FPTYPE>::value, double_shape,
-                                  tensor_list + 1);
-  if (!status.ok()) {
-    return false;
-  }
   // Tensor int_temp;
   TensorShape int_shape;
   int_shape.AddDim(23 + nloc * 3 + loc_cellnum + total_cellnum * 3 +
@@ -1525,18 +1628,6 @@ static int _norm_copy_coord_gpu(OpKernelContext* context,
   deepmd::normalize_coord_gpu(tmp_coord, nall, region_dev);
   int tt;
   for (tt = 0; tt < max_cpy_trial; ++tt) {
-    // Tensor cpy_temp;
-    TensorShape cpy_shape;
-    cpy_shape.AddDim(mem_cpy * 3);
-    status = context->allocate_temp(DataTypeToEnum<FPTYPE>::value, cpy_shape,
-                                    tensor_list + 3);
-    if (!status.ok()) {
-      return false;
-    }
-    // Tensor t_temp;
-    TensorShape t_shape;
-    t_shape.AddDim(mem_cpy * 2);
-    context, context->allocate_temp(DT_INT32, t_shape, tensor_list + 4);
     coord_cpy = (*(tensor_list + 3)).flat<FPTYPE>().data();
     type_cpy = (*(tensor_list + 4)).flat<int>().data();
     idx_mapping = type_cpy + mem_cpy;
@@ -1568,14 +1659,6 @@ static int _build_nlist_gpu(OpKernelContext* context,
                             const int& new_nall,
                             const int& max_nnei_trial,
                             const float& rcut_r) {
-  // Tensor nlist_temp;
-  TensorShape nlist_shape;
-  nlist_shape.AddDim(nloc * 2);
-  tensorflow::Status status =
-      context->allocate_temp(DT_INT32, nlist_shape, tensor_list);
-  if (!status.ok()) {
-    return false;
-  }
   ilist = (*tensor_list).flat<int>().data();
   numneigh = ilist + nloc;
   // Tensor jlist_temp;
@@ -1584,12 +1667,6 @@ static int _build_nlist_gpu(OpKernelContext* context,
   std::vector<int*> firstneigh_host(nloc);
   int tt;
   for (tt = 0; tt < max_nnei_trial; ++tt) {
-    TensorShape jlist_shape;
-    jlist_shape.AddDim(3 * int_64(nloc) * mem_nnei);
-    status = context->allocate_temp(DT_INT32, jlist_shape, tensor_list + 1);
-    if (!status.ok()) {
-      return false;
-    }
     jlist = (*(tensor_list + 1)).flat<int>().data();
     ind_data = jlist + nloc * mem_nnei;
     for (int_64 ii = 0; ii < nloc; ++ii) {

From f373e4bf9cee5558bc6feb06c069264e09c45b8e Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Sat, 16 Sep 2023 01:56:08 -0400
Subject: [PATCH 3/6] *=2

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 source/op/prod_env_mat_multi_device.cc | 44 ++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc
index 8c4b9c2840..a015648ae9 100644
--- a/source/op/prod_env_mat_multi_device.cc
+++ b/source/op/prod_env_mat_multi_device.cc
@@ -1638,6 +1638,21 @@ static int _norm_copy_coord_gpu(OpKernelContext* context,
       break;
     } else {
       mem_cpy *= 2;
+      // Tensor cpy_temp;
+      TensorShape cpy_shape;
+      cpy_shape.AddDim(mem_cpy * 3);
+      tensorflow::Status status = context->allocate_temp(
+          DataTypeToEnum<FPTYPE>::value, cpy_shape, tensor_list + 3);
+      if (!status.ok()) {
+        return false;
+      }
+      // Tensor t_temp;
+      TensorShape t_shape;
+      t_shape.AddDim(mem_cpy * 2);
+      status = context->allocate_temp(DT_INT32, t_shape, tensor_list + 4);
+      if (!status.ok()) {
+        return false;
+      }
     }
   }
   region_dev.boxt = new_boxt;
@@ -1680,6 +1695,13 @@ static int _build_nlist_gpu(OpKernelContext* context,
       break;
     } else {
       mem_nnei *= 2;
+      TensorShape jlist_shape;
+      jlist_shape.AddDim(3 * int_64(nloc) * mem_nnei);
+      tensorflow::Status status =
+          context->allocate_temp(DT_INT32, jlist_shape, tensor_list + 1);
+      if (!status.ok()) {
+        return false;
+      }
     }
   }
   return (tt != max_nnei_trial);
@@ -1857,6 +1879,21 @@ static int _norm_copy_coord_gpu_rocm(OpKernelContext* context,
       break;
     } else {
       mem_cpy *= 2;
+      // Tensor cpy_temp;
+      TensorShape cpy_shape;
+      cpy_shape.AddDim(mem_cpy * 3);
+      tensorflow::Status status = context->allocate_temp(
+          DataTypeToEnum<FPTYPE>::value, cpy_shape, tensor_list + 3);
+      if (!status.ok()) {
+        return false;
+      }
+      // Tensor t_temp;
+      TensorShape t_shape;
+      t_shape.AddDim(mem_cpy * 2);
+      status = context->allocate_temp(DT_INT32, t_shape, tensor_list + 4);
+      if (!status.ok()) {
+        return false;
+      }
     }
   }
   region_dev.boxt = new_boxt;
@@ -1913,6 +1950,13 @@ static int _build_nlist_gpu_rocm(OpKernelContext* context,
       break;
     } else {
       mem_nnei *= 2;
+      TensorShape jlist_shape;
+      jlist_shape.AddDim(3 * int_64(nloc) * mem_nnei);
+      tensorflow::Status status =
+          context->allocate_temp(DT_INT32, jlist_shape, tensor_list + 1);
+      if (!status.ok()) {
+        return false;
+      }
     }
   }
   return (tt != max_nnei_trial);

From c8654cac0bf557e3c58ef033b37ac420de4c73a1 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Sat, 16 Sep 2023 01:59:14 -0400
Subject: [PATCH 4/6] clean rocm

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 source/op/prod_env_mat_multi_device.cc | 42 +++-----------------------
 1 file changed, 5 insertions(+), 37 deletions(-)

diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc
index a015648ae9..a1b111b5e7 100644
--- a/source/op/prod_env_mat_multi_device.cc
+++ b/source/op/prod_env_mat_multi_device.cc
@@ -1815,11 +1815,6 @@ static int _norm_copy_coord_gpu_rocm(OpKernelContext* context,
                                      const int& nloc,
                                      const int& max_cpy_trial,
                                      const float& rcut_r) {
-  // Tensor FPTYPE_temp;
-  TensorShape FPTYPE_shape;
-  FPTYPE_shape.AddDim(nall * 3);
-  context->allocate_temp(DataTypeToEnum<FPTYPE>::value, FPTYPE_shape,
-                         tensor_list);
   FPTYPE* tmp_coord = (*tensor_list).flat<FPTYPE>().data();
   DPErrcheck(hipMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3,
                        hipMemcpyDeviceToDevice));
@@ -1833,20 +1828,16 @@ static int _norm_copy_coord_gpu_rocm(OpKernelContext* context,
   deepmd::compute_cell_info(cell_info, rcut_r, region);
   const int loc_cellnum = cell_info[21];
   const int total_cellnum = cell_info[22];
-  // Tensor double_temp;
-  TensorShape double_shape;
-  double_shape.AddDim(18);
-  tensorflow::Status status = context->allocate_temp(
-      DataTypeToEnum<FPTYPE>::value, double_shape, tensor_list + 1);
-  if (!status.ok()) {
-    return false;
-  }
   // Tensor int_temp;
   TensorShape int_shape;
   int_shape.AddDim(23 + nloc * 3 + loc_cellnum + total_cellnum * 3 +
                    total_cellnum * 3 + loc_cellnum + 1 + total_cellnum + 1 +
                    nloc);
-  context, context->allocate_temp(DT_INT32, int_shape, tensor_list + 2);
+  tensorflow::Status status =
+      context->allocate_temp(DT_INT32, int_shape, tensor_list + 2);
+  if (!status.ok()) {
+    return false;
+  }
   FPTYPE* box_info_dev = (*(tensor_list + 1)).flat<FPTYPE>().data();
   int* cell_info_dev = (*(tensor_list + 2)).flat<int>().data();
   int* int_data_dev = cell_info_dev + 23;
@@ -1860,15 +1851,6 @@ static int _norm_copy_coord_gpu_rocm(OpKernelContext* context,
   deepmd::normalize_coord_gpu_rocm(tmp_coord, nall, region_dev);
   int tt;
   for (tt = 0; tt < max_cpy_trial; ++tt) {
-    // Tensor cpy_temp;
-    TensorShape cpy_shape;
-    cpy_shape.AddDim(mem_cpy * 3);
-    context->allocate_temp(DataTypeToEnum<FPTYPE>::value, cpy_shape,
-                           tensor_list + 3);
-    // Tensor t_temp;
-    TensorShape t_shape;
-    t_shape.AddDim(mem_cpy * 2);
-    context, context->allocate_temp(DT_INT32, t_shape, tensor_list + 4);
     coord_cpy = (*(tensor_list + 3)).flat<FPTYPE>().data();
     type_cpy = (*(tensor_list + 4)).flat<int>().data();
     idx_mapping = type_cpy + mem_cpy;
@@ -1915,14 +1897,6 @@ static int _build_nlist_gpu_rocm(OpKernelContext* context,
                                  const int& new_nall,
                                  const int& max_nnei_trial,
                                  const float& rcut_r) {
-  // Tensor nlist_temp;
-  TensorShape nlist_shape;
-  nlist_shape.AddDim(nloc * 2);
-  tensorflow::Status status =
-      context->allocate_temp(DT_INT32, nlist_shape, tensor_list);
-  if (!status.ok()) {
-    return false;
-  }
   ilist = (*tensor_list).flat<int>().data();
   numneigh = ilist + nloc;
   // Tensor jlist_temp;
@@ -1931,12 +1905,6 @@ static int _build_nlist_gpu_rocm(OpKernelContext* context,
   std::vector<int*> firstneigh_host(nloc);
   int tt;
   for (tt = 0; tt < max_nnei_trial; ++tt) {
-    TensorShape jlist_shape;
-    jlist_shape.AddDim(3 * int_64(nloc) * mem_nnei);
-    status = context->allocate_temp(DT_INT32, jlist_shape, tensor_list + 1);
-    if (!status.ok()) {
-      return false;
-    }
     jlist = (*(tensor_list + 1)).flat<int>().data();
     ind_data = jlist + nloc * mem_nnei;
     for (int_64 ii = 0; ii < nloc; ++ii) {

From 25c25be6a0e1fbc1ad0477f367f9a67e96f98673 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Sat, 16 Sep 2023 02:08:54 -0400
Subject: [PATCH 5/6] fix status

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 source/op/prod_env_mat_multi_device.cc | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc
index a1b111b5e7..1ac2071611 100644
--- a/source/op/prod_env_mat_multi_device.cc
+++ b/source/op/prod_env_mat_multi_device.cc
@@ -1614,7 +1614,11 @@ static int _norm_copy_coord_gpu(OpKernelContext* context,
   int_shape.AddDim(23 + nloc * 3 + loc_cellnum + total_cellnum * 3 +
                    total_cellnum * 3 + loc_cellnum + 1 + total_cellnum + 1 +
                    nloc);
-  context, context->allocate_temp(DT_INT32, int_shape, tensor_list + 2);
+  tensorflow::Status status =
+      context->allocate_temp(DT_INT32, int_shape, tensor_list + 2);
+  if (!status.ok()) {
+    return false;
+  }
   FPTYPE* box_info_dev = (*(tensor_list + 1)).flat<FPTYPE>().data();
   int* cell_info_dev = (*(tensor_list + 2)).flat<int>().data();
   int* int_data_dev = cell_info_dev + 23;
@@ -1641,8 +1645,8 @@ static int _norm_copy_coord_gpu(OpKernelContext* context,
       // Tensor cpy_temp;
       TensorShape cpy_shape;
       cpy_shape.AddDim(mem_cpy * 3);
-      tensorflow::Status status = context->allocate_temp(
-          DataTypeToEnum<FPTYPE>::value, cpy_shape, tensor_list + 3);
+      status = context->allocate_temp(DataTypeToEnum<FPTYPE>::value, cpy_shape,
+                                      tensor_list + 3);
       if (!status.ok()) {
         return false;
       }

From 27d8e5f2296155a8fed44002ffea2355c686f823 Mon Sep 17 00:00:00 2001
From: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
Date: Sat, 16 Sep 2023 02:57:10 -0400
Subject: [PATCH 6/6] clean rocm

Signed-off-by: Jinzhe Zeng <jinzhe.zeng@rutgers.edu>
---
 source/op/prod_env_mat_multi_device.cc | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc
index 1ac2071611..862fb5856f 100644
--- a/source/op/prod_env_mat_multi_device.cc
+++ b/source/op/prod_env_mat_multi_device.cc
@@ -669,7 +669,6 @@ class ProdEnvMatAOp : public OpKernel {
         int* type_cpy;
         int frame_nall = nall;
         int mesh_tensor_size = static_cast<int>(mesh_tensor.NumElements());
-        std::vector<Tensor> tensor_list(7);
         // prepare coord and nlist
         _prepare_coord_nlist_gpu_rocm<FPTYPE>(
             context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy,
@@ -994,7 +993,6 @@ class ProdEnvMatROp : public OpKernel {
         int* type_cpy;
         int frame_nall = nall;
         int mesh_tensor_size = static_cast<int>(mesh_tensor.NumElements());
-        std::vector<Tensor> tensor_list(7);
         // prepare coord and nlist
         _prepare_coord_nlist_gpu_rocm<FPTYPE>(
             context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy,
@@ -1371,7 +1369,6 @@ class ProdEnvMatAMixOp : public OpKernel {
         int* type_cpy;
         int frame_nall = nall;
         int mesh_tensor_size = static_cast<int>(mesh_tensor.NumElements());
-        std::vector<Tensor> tensor_list(7);
         // prepare coord and nlist
         _prepare_coord_nlist_gpu_rocm<FPTYPE>(
             context, &tensor_list[0], &coord, coord_cpy, &f_type, type_cpy,