From 938cea286dff1e54dd5ad9784ffd36799c2543ca Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Tue, 26 Sep 2023 03:11:22 -0400 Subject: [PATCH] merge CUDA and ROCm codes in op (#2847) Signed-off-by: Jinzhe Zeng --- source/lib/include/gpu_cuda.h | 1 + source/lib/include/gpu_rocm.h | 1 + source/op/gelu_multi_device.cc | 24 +- source/op/prod_env_mat_multi_device.cc | 454 +------------------ source/op/prod_env_mat_multi_device_nvnmd.cc | 16 +- source/op/prod_force_grad_multi_device.cc | 18 +- source/op/prod_force_multi_device.cc | 18 +- source/op/prod_virial_grad_multi_device.cc | 18 +- source/op/prod_virial_multi_device.cc | 18 +- source/op/tabulate_multi_device.cc | 102 +---- source/op/unaggregated_grad.cc | 4 +- 11 files changed, 66 insertions(+), 608 deletions(-) diff --git a/source/lib/include/gpu_cuda.h b/source/lib/include/gpu_cuda.h index 73dfed1404..1e750e0ea0 100644 --- a/source/lib/include/gpu_cuda.h +++ b/source/lib/include/gpu_cuda.h @@ -13,6 +13,7 @@ #define gpuMemcpy cudaMemcpy #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemset cudaMemset #define GPU_MAX_NBOR_SIZE 4096 diff --git a/source/lib/include/gpu_rocm.h b/source/lib/include/gpu_rocm.h index 3a65a57b01..bb404720bc 100644 --- a/source/lib/include/gpu_rocm.h +++ b/source/lib/include/gpu_rocm.h @@ -16,6 +16,7 @@ #define gpuMemcpy hipMemcpy #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost #define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemset hipMemset #define DPErrcheck(res) \ diff --git a/source/op/gelu_multi_device.cc b/source/op/gelu_multi_device.cc index ccc95aa0e4..1c76cd25d3 100644 --- a/source/op/gelu_multi_device.cc +++ b/source/op/gelu_multi_device.cc @@ -64,13 +64,9 @@ class GeluOp : public OpKernel { const int_64 size = static_cast(output_tensor->NumElements()); if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::gelu_gpu(out, x, size); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::gelu_gpu(out, x, size); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::gelu_cpu(out, x, size); } @@ -108,13 +104,9 @@ class GeluGradOp : public OpKernel { const int_64 size = static_cast(output_tensor->NumElements()); if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::gelu_grad_gpu(out, x, dy, size); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::gelu_grad_gpu(out, x, dy, size); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::gelu_grad_cpu(out, x, dy, size); } @@ -154,13 +146,9 @@ class GeluGradGradOp : public OpKernel { const int_64 size = static_cast(output_tensor->NumElements()); if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::gelu_grad_grad_gpu(out, x, dy, dy_2, size); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::gelu_grad_grad_gpu(out, x, dy, dy_2, size); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::gelu_grad_grad_cpu(out, x, dy, dy_2, size); } diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc index ee07dc22fe..47541bc69f 100644 --- a/source/op/prod_env_mat_multi_device.cc +++ b/source/op/prod_env_mat_multi_device.cc @@ -1,6 +1,7 @@ 
// SPDX-License-Identifier: LGPL-3.0-or-later #include "coord.h" #include "custom_op.h" +#include "device.h" #include "errors.h" #include "neighbor_list.h" #include "prod_env_mat.h" @@ -243,82 +244,7 @@ static void _prepare_coord_nlist_cpu(OpKernelContext* context, const int& max_cpy_trial, const int& max_nnei_trial); -#if GOOGLE_CUDA -template -static int _norm_copy_coord_gpu(OpKernelContext* context, - Tensor* tensor_list, - FPTYPE*& coord_cpy, - int*& type_cpy, - int*& idx_mapping, - int& nall, - int& mem_cpy, - const FPTYPE* coord, - const FPTYPE* box, - const int* type, - const int& nloc, - const int& max_cpy_trial, - const float& rcut_r); - -template -static int _build_nlist_gpu(OpKernelContext* context, - Tensor* tensor_list, - int*& ilist, - int*& numneigh, - int**& firstneigh, - int*& jlist, - int& max_nnei, - int& mem_nnei, - const FPTYPE* coord, - const int& nloc, - const int& new_nall, - const int& max_nnei_trial, - const float& rcut_r); - -static void _map_nlist_gpu(int* nlist, - const int* idx_mapping, - const int& nloc, - const int& nnei); - -static void _map_nei_info_gpu(int* nlist, - int* ntype, - bool* nmask, - const int* type, - const int* idx_mapping, - const int& nloc, - const int& nnei, - const int& ntypes, - const bool& b_nlist_map); - -template -static void _prepare_coord_nlist_gpu(OpKernelContext* context, - Tensor* tensor_list, - FPTYPE const** coord, - FPTYPE*& coord_cpy, - int const** type, - int*& type_cpy, - int*& idx_mapping, - deepmd::InputNlist& inlist, - int*& ilist, - int*& numneigh, - int**& firstneigh, - int*& jlist, - int*& nbor_list_dev, - int& new_nall, - int& mem_cpy, - int& mem_nnei, - int& max_nbor_size, - const FPTYPE* box, - const int* mesh_tensor_data, - const int mesh_tensor_size, - const int& nloc, - const int& nei_mode, - const float& rcut_r, - const int& max_cpy_trial, - const int& max_nnei_trial); - -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM template static int _norm_copy_coord_gpu(OpKernelContext* context, Tensor* tensor_list, @@ -391,7 +317,7 @@ static void _prepare_coord_nlist_gpu(OpKernelContext* context, const int& max_cpy_trial, const int& max_nnei_trial); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM template class ProdEnvMatAOp : public OpKernel { @@ -633,36 +559,7 @@ class ProdEnvMatAOp : public OpKernel { const int* type = p_type + ff * nall; if (device == "GPU") { -#if GOOGLE_CUDA - int* idx_mapping = NULL; - int *ilist = NULL, *numneigh = NULL; - int** firstneigh = NULL; - deepmd::malloc_device_memory(firstneigh, nloc); - int* jlist = NULL; - FPTYPE* coord_cpy; - int* type_cpy; - int frame_nall = nall; - int mesh_tensor_size = static_cast(mesh_tensor.NumElements()); - // prepare coord and nlist - _prepare_coord_nlist_gpu( - context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy, - idx_mapping, gpu_inlist, ilist, numneigh, firstneigh, jlist, - nbor_list_dev, frame_nall, mem_cpy, mem_nnei, max_nbor_size, box, - mesh_tensor.flat().data(), mesh_tensor_size, nloc, nei_mode, - rcut_r, max_cpy_trial, max_nnei_trial); - - // launch the gpu(nv) compute function - deepmd::prod_env_mat_a_gpu(em, em_deriv, rij, nlist, coord, type, - gpu_inlist, array_int, array_longlong, - max_nbor_size, avg, std, nloc, frame_nall, - rcut_r, rcut_r_smth, sec_a); - if (b_nlist_map) { - _map_nlist_gpu(nlist, idx_mapping, nloc, nnei); - } - deepmd::delete_device_memory(firstneigh); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM int* 
idx_mapping = NULL; int *ilist = NULL, *numneigh = NULL; int** firstneigh = NULL; @@ -689,7 +586,7 @@ class ProdEnvMatAOp : public OpKernel { _map_nlist_gpu(nlist, idx_mapping, nloc, nnei); } deepmd::delete_device_memory(firstneigh); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::InputNlist inlist; // some buffers, be freed after the evaluation of this frame @@ -960,36 +857,7 @@ class ProdEnvMatROp : public OpKernel { const int* type = p_type + ff * nall; if (device == "GPU") { -#if GOOGLE_CUDA - int* idx_mapping = NULL; - int *ilist = NULL, *numneigh = NULL; - int** firstneigh = NULL; - deepmd::malloc_device_memory(firstneigh, nloc); - int* jlist = NULL; - FPTYPE* coord_cpy; - int* type_cpy; - int frame_nall = nall; - int mesh_tensor_size = static_cast(mesh_tensor.NumElements()); - // prepare coord and nlist - _prepare_coord_nlist_gpu( - context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy, - idx_mapping, gpu_inlist, ilist, numneigh, firstneigh, jlist, - nbor_list_dev, frame_nall, mem_cpy, mem_nnei, max_nbor_size, box, - mesh_tensor.flat().data(), mesh_tensor_size, nloc, nei_mode, - rcut, max_cpy_trial, max_nnei_trial); - - // launch the gpu(nv) compute function - deepmd::prod_env_mat_r_gpu(em, em_deriv, rij, nlist, coord, type, - gpu_inlist, array_int, array_longlong, - max_nbor_size, avg, std, nloc, frame_nall, - rcut, rcut_smth, sec); - if (b_nlist_map) { - _map_nlist_gpu(nlist, idx_mapping, nloc, nnei); - } - deepmd::delete_device_memory(firstneigh); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM int* idx_mapping = NULL; int *ilist = NULL, *numneigh = NULL; int** firstneigh = NULL; @@ -1016,7 +884,7 @@ class ProdEnvMatROp : public OpKernel { _map_nlist_gpu(nlist, idx_mapping, nloc, nnei); } deepmd::delete_device_memory(firstneigh); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::InputNlist inlist; // some buffers, be freed after the evaluation of this frame @@ -1248,10 +1116,7 @@ class ProdEnvMatAMixOp : public OpKernel { int* p_f_type = fake_type_tensor.flat().data(); if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::filter_ftype_gpu(p_f_type, p_type, nsamples * nall); -#endif -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::filter_ftype_gpu(p_f_type, p_type, nsamples * nall); #endif } else if (device == "CPU") { @@ -1338,35 +1203,7 @@ class ProdEnvMatAMixOp : public OpKernel { const int* f_type = p_f_type + ff * nall; if (device == "GPU") { -#if GOOGLE_CUDA - int* idx_mapping = NULL; - int *ilist = NULL, *numneigh = NULL; - int** firstneigh = NULL; - deepmd::malloc_device_memory(firstneigh, nloc); - int* jlist = NULL; - FPTYPE* coord_cpy; - int* type_cpy; - int frame_nall = nall; - int mesh_tensor_size = static_cast(mesh_tensor.NumElements()); - // prepare coord and nlist - _prepare_coord_nlist_gpu( - context, &tensor_list[0], &coord, coord_cpy, &f_type, type_cpy, - idx_mapping, gpu_inlist, ilist, numneigh, firstneigh, jlist, - nbor_list_dev, frame_nall, mem_cpy, mem_nnei, max_nbor_size, box, - mesh_tensor.flat().data(), mesh_tensor_size, nloc, nei_mode, - rcut_r, max_cpy_trial, max_nnei_trial); - - // launch the gpu(nv) compute function - deepmd::prod_env_mat_a_gpu(em, em_deriv, rij, nlist, coord, type, - gpu_inlist, array_int, array_longlong, - max_nbor_size, avg, std, nloc, frame_nall, - rcut_r, rcut_r_smth, sec_a, f_type); - _map_nei_info_gpu(nlist, ntype, nmask, type, 
idx_mapping, nloc, nnei, - ntypes, b_nlist_map); - deepmd::delete_device_memory(firstneigh); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM int* idx_mapping = NULL; int *ilist = NULL, *numneigh = NULL; int** firstneigh = NULL; @@ -1392,7 +1229,7 @@ class ProdEnvMatAMixOp : public OpKernel { _map_nei_info_gpu(nlist, ntype, nmask, type, idx_mapping, nloc, nnei, ntypes, b_nlist_map); deepmd::delete_device_memory(firstneigh); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::InputNlist inlist; // some buffers, be freed after the evaluation of this frame @@ -1616,7 +1453,7 @@ static void _prepare_coord_nlist_cpu(OpKernelContext* context, } } -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM template static int _norm_copy_coord_gpu(OpKernelContext* context, Tensor* tensor_list, @@ -1632,8 +1469,8 @@ static int _norm_copy_coord_gpu(OpKernelContext* context, const int& max_cpy_trial, const float& rcut_r) { FPTYPE* tmp_coord = (*tensor_list).flat().data(); - DPErrcheck(cudaMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3, - cudaMemcpyDeviceToDevice)); + DPErrcheck(gpuMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3, + gpuMemcpyDeviceToDevice)); deepmd::Region region; init_region_cpu(region, box); @@ -1877,270 +1714,7 @@ static void _prepare_coord_nlist_gpu(OpKernelContext* context, ", which currently is not supported by deepmd-kit.")); } } -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM -template -static int _norm_copy_coord_gpu(OpKernelContext* context, - Tensor* tensor_list, - FPTYPE*& coord_cpy, - int*& type_cpy, - int*& idx_mapping, - int& nall, - int& mem_cpy, - const FPTYPE* coord, - const FPTYPE* box, - const int* type, - const int& nloc, - const int& max_cpy_trial, - const float& rcut_r) { - FPTYPE* tmp_coord = (*tensor_list).flat().data(); - DPErrcheck(hipMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3, - hipMemcpyDeviceToDevice)); - - deepmd::Region region; - init_region_cpu(region, box); - FPTYPE box_info[18]; - std::copy(region.boxt, region.boxt + 9, box_info); - std::copy(region.rec_boxt, region.rec_boxt + 9, box_info + 9); - int cell_info[23]; - deepmd::compute_cell_info(cell_info, rcut_r, region); - const int loc_cellnum = cell_info[21]; - const int total_cellnum = cell_info[22]; - // Tensor int_temp; - TensorShape int_shape; - int_shape.AddDim(23 + nloc * 3 + loc_cellnum + total_cellnum * 3 + - total_cellnum * 3 + loc_cellnum + 1 + total_cellnum + 1 + - nloc); - tensorflow::Status status = - context->allocate_temp(DT_INT32, int_shape, tensor_list + 2); - if (!status.ok()) { - return false; - } - FPTYPE* box_info_dev = (*(tensor_list + 1)).flat().data(); - int* cell_info_dev = (*(tensor_list + 2)).flat().data(); - int* int_data_dev = cell_info_dev + 23; - deepmd::memcpy_host_to_device(box_info_dev, box_info, 18); - deepmd::memcpy_host_to_device(cell_info_dev, cell_info, 23); - deepmd::Region region_dev; - FPTYPE* new_boxt = region_dev.boxt; - FPTYPE* new_rec_boxt = region_dev.rec_boxt; - region_dev.boxt = box_info_dev; - region_dev.rec_boxt = box_info_dev + 9; - deepmd::normalize_coord_gpu(tmp_coord, nall, region_dev); - int tt; - for (tt = 0; tt < max_cpy_trial; ++tt) { - coord_cpy = (*(tensor_list + 3)).flat().data(); - type_cpy = (*(tensor_list + 4)).flat().data(); - idx_mapping = type_cpy + mem_cpy; - int ret = deepmd::copy_coord_gpu( - coord_cpy, type_cpy, idx_mapping, &nall, int_data_dev, tmp_coord, type, - nloc, mem_cpy, loc_cellnum, total_cellnum, 
cell_info_dev, region_dev); - if (ret == 0) { - break; - } else { - mem_cpy *= 2; - // Tensor cpy_temp; - TensorShape cpy_shape; - cpy_shape.AddDim(mem_cpy * 3); - tensorflow::Status status = context->allocate_temp( - DataTypeToEnum::value, cpy_shape, tensor_list + 3); - if (!status.ok()) { - return false; - } - // Tensor t_temp; - TensorShape t_shape; - t_shape.AddDim(mem_cpy * 2); - status = context->allocate_temp(DT_INT32, t_shape, tensor_list + 4); - if (!status.ok()) { - return false; - } - } - } - region_dev.boxt = new_boxt; - region_dev.rec_boxt = new_rec_boxt; - return (tt != max_cpy_trial); -} - -template -static int _build_nlist_gpu(OpKernelContext* context, - Tensor* tensor_list, - int*& ilist, - int*& numneigh, - int**& firstneigh, - int*& jlist, - int& max_nnei, - int& mem_nnei, - const FPTYPE* coord, - const int& nloc, - const int& new_nall, - const int& max_nnei_trial, - const float& rcut_r) { - ilist = (*tensor_list).flat().data(); - numneigh = ilist + nloc; - // Tensor jlist_temp; - int* ind_data = NULL; - - std::vector firstneigh_host(nloc); - int tt; - for (tt = 0; tt < max_nnei_trial; ++tt) { - jlist = (*(tensor_list + 1)).flat().data(); - ind_data = jlist + nloc * mem_nnei; - for (int_64 ii = 0; ii < nloc; ++ii) { - firstneigh_host[ii] = jlist + ii * mem_nnei; - } - deepmd::memcpy_host_to_device(firstneigh, firstneigh_host); - deepmd::InputNlist inlist(nloc, ilist, numneigh, firstneigh); - int ret = deepmd::build_nlist_gpu(inlist, &max_nnei, ind_data, coord, nloc, - new_nall, mem_nnei, rcut_r); - if (ret == 0) { - break; - } else { - mem_nnei *= 2; - TensorShape jlist_shape; - jlist_shape.AddDim(3 * int_64(nloc) * mem_nnei); - tensorflow::Status status = - context->allocate_temp(DT_INT32, jlist_shape, tensor_list + 1); - if (!status.ok()) { - return false; - } - } - } - return (tt != max_nnei_trial); -} - -static void _map_nlist_gpu(int* nlist, - const int* idx_mapping, - const int& nloc, - const int& nnei) { - deepmd::use_nlist_map(nlist, idx_mapping, nloc, nnei); -} - -static void _map_nei_info_gpu(int* nlist, - int* ntype, - bool* nmask, - const int* type, - const int* idx_mapping, - const int& nloc, - const int& nnei, - const int& ntypes, - const bool& b_nlist_map) { - deepmd::use_nei_info_gpu(nlist, ntype, nmask, type, idx_mapping, nloc, nnei, - ntypes, b_nlist_map); -} - -template -static void _prepare_coord_nlist_gpu(OpKernelContext* context, - Tensor* tensor_list, - FPTYPE const** coord, - FPTYPE*& coord_cpy, - int const** type, - int*& type_cpy, - int*& idx_mapping, - deepmd::InputNlist& inlist, - int*& ilist, - int*& numneigh, - int**& firstneigh, - int*& jlist, - int*& nbor_list_dev, - int& new_nall, - int& mem_cpy, - int& mem_nnei, - int& max_nbor_size, - const FPTYPE* box, - const int* mesh_tensor_data, - const int mesh_tensor_size, - const int& nloc, - const int& nei_mode, - const float& rcut_r, - const int& max_cpy_trial, - const int& max_nnei_trial) { - if (nei_mode != 3 && nei_mode != 4) { - inlist.inum = nloc; - // build nlist by myself - // normalize and copy coord - if (nei_mode == 1) { - int copy_ok = _norm_copy_coord_gpu( - context, tensor_list, coord_cpy, type_cpy, idx_mapping, new_nall, - mem_cpy, *coord, box, *type, nloc, max_cpy_trial, rcut_r); - OP_REQUIRES(context, copy_ok, - errors::Aborted("cannot allocate mem for copied coords")); - *coord = coord_cpy; - *type = type_cpy; - } - // build nlist - int build_ok = - _build_nlist_gpu(context, tensor_list + 5, ilist, numneigh, firstneigh, - jlist, max_nbor_size, mem_nnei, *coord, nloc, new_nall, 
- max_nnei_trial, rcut_r); - OP_REQUIRES(context, build_ok, - errors::Aborted("cannot allocate mem for nlist")); - if (max_nbor_size <= 1024) { - max_nbor_size = 1024; - } else if (max_nbor_size <= 2048) { - max_nbor_size = 2048; - } else { - max_nbor_size = 4096; - } - inlist.ilist = ilist; - inlist.numneigh = numneigh; - inlist.firstneigh = firstneigh; - } else if (nei_mode == 4) { - // TODO: in theory, it will be faster to put everything on GPUs... - std::vector mesh_tensor_data_host(mesh_tensor_size); - std::vector ilist_host(nloc); - std::vector numneigh_host(nloc); - std::vector firstneigh_host(nloc); - std::vector fake_mesh(16); - - // copy from gpu to cpu - deepmd::memcpy_device_to_host(mesh_tensor_data, mesh_tensor_data_host); - std::memcpy(&ilist_host[0], &mesh_tensor_data_host[16], sizeof(int) * nloc); - std::memcpy(&numneigh_host[0], &mesh_tensor_data_host[16 + nloc], - sizeof(int) * nloc); - for (int ii = 0, kk = 0; ii < nloc; ++ii) { - firstneigh_host[ii] = &mesh_tensor_data_host[16 + 2 * nloc + kk]; - kk += numneigh_host[ii]; - } - // make a fake mesh - fake_mesh[0] = 0; - fake_mesh[1] = nloc; - std::memcpy(&fake_mesh[4], &ilist_host, sizeof(int*)); - std::memcpy(&fake_mesh[8], &numneigh_host, sizeof(int*)); - std::memcpy(&fake_mesh[12], &firstneigh_host, sizeof(int**)); - // copy from cpu to gpu - int* fake_mesh_dev = NULL; - deepmd::malloc_device_memory(fake_mesh_dev, 16); - deepmd::memcpy_host_to_device(fake_mesh_dev, fake_mesh); - - deepmd::InputNlist inlist_temp; - inlist_temp.inum = nloc; - // everything should be copied to GPU... - deepmd::env_mat_nbor_update(inlist_temp, inlist, max_nbor_size, - nbor_list_dev, fake_mesh_dev, 16); - OP_REQUIRES(context, (max_numneigh(inlist_temp) <= max_nbor_size), - errors::InvalidArgument( - "Assert failed, max neighbor size of atom(lammps) " + - std::to_string(max_numneigh(inlist_temp)) + - " is larger than " + std::to_string(max_nbor_size) + - ", which currently is not supported by deepmd-kit.")); - deepmd::delete_device_memory(fake_mesh_dev); - } else { - // update nbor list - deepmd::InputNlist inlist_temp; - inlist_temp.inum = nloc; - deepmd::env_mat_nbor_update(inlist_temp, inlist, max_nbor_size, - nbor_list_dev, mesh_tensor_data, - mesh_tensor_size); - OP_REQUIRES(context, (max_numneigh(inlist_temp) <= max_nbor_size), - errors::InvalidArgument( - "Assert failed, max neighbor size of atom(lammps) " + - std::to_string(max_numneigh(inlist_temp)) + - " is larger than " + std::to_string(max_nbor_size) + - ", which currently is not supported by deepmd-kit.")); - } -} -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM // Register the CPU kernels. 
// Compatible with v1.3 diff --git a/source/op/prod_env_mat_multi_device_nvnmd.cc b/source/op/prod_env_mat_multi_device_nvnmd.cc index b5863d1b71..abca947f0a 100644 --- a/source/op/prod_env_mat_multi_device_nvnmd.cc +++ b/source/op/prod_env_mat_multi_device_nvnmd.cc @@ -471,13 +471,9 @@ class ProdEnvMatANvnmdQuantizeOp : public OpKernel { const int* type = p_type + ff * nall; if (device == "GPU") { -#if GOOGLE_CUDA -// UNDEFINE -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM // UNDEFINE -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::InputNlist inlist; // some buffers, be freed after the evaluation of this frame @@ -720,13 +716,9 @@ class ProdEnvMatAMixNvnmdQuantizeOp : public OpKernel { const int* type = p_type + ff * nall; if (device == "GPU") { -#if GOOGLE_CUDA -// UNDEFINE -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM // UNDEFINE -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::InputNlist inlist; // some buffers, be freed after the evaluation of this frame diff --git a/source/op/prod_force_grad_multi_device.cc b/source/op/prod_force_grad_multi_device.cc index 7d8a664a8d..ffcd8f0b8b 100644 --- a/source/op/prod_force_grad_multi_device.cc +++ b/source/op/prod_force_grad_multi_device.cc @@ -121,15 +121,10 @@ class ProdForceSeAGradOp : public OpKernel { const int* p_nlist = nlist_tensor.flat().data(); if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::prod_force_grad_a_gpu(p_grad_net, p_grad, p_in_deriv, p_nlist, - nloc, nnei, nframes); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::prod_force_grad_a_gpu(p_grad_net, p_grad, p_in_deriv, p_nlist, nloc, nnei, nframes); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_force_grad_a_cpu(p_grad_net, p_grad, p_in_deriv, p_nlist, nloc, nnei, nframes); @@ -234,15 +229,10 @@ class ProdForceSeRGradOp : public OpKernel { const int* p_nlist = nlist_tensor.flat().data(); if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::prod_force_grad_r_gpu(p_grad_net, p_grad, p_in_deriv, p_nlist, - nloc, nnei, nframes); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::prod_force_grad_r_gpu(p_grad_net, p_grad, p_in_deriv, p_nlist, nloc, nnei, nframes); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_force_grad_r_cpu(p_grad_net, p_grad, p_in_deriv, p_nlist, nloc, nnei, nframes); diff --git a/source/op/prod_force_multi_device.cc b/source/op/prod_force_multi_device.cc index 9d553b1f0c..935b5b9f2f 100644 --- a/source/op/prod_force_multi_device.cc +++ b/source/op/prod_force_multi_device.cc @@ -142,15 +142,10 @@ class ProdForceSeAOp : public OpKernel { } if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::prod_force_a_gpu(p_force, p_net_deriv, p_in_deriv, p_nlist, nloc, - nall, nnei, nframes); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::prod_force_a_gpu(p_force, p_net_deriv, p_in_deriv, p_nlist, nloc, nall, nnei, nframes); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_force_a_cpu(p_force, p_net_deriv, p_in_deriv, p_nlist, nloc, nall, nnei, nframes, nloc_loc, @@ -228,15 +223,10 @@ class ProdForceSeROp 
: public OpKernel { const int* p_nlist = nlist_tensor.flat().data(); if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::prod_force_r_gpu(p_force, p_net_deriv, p_in_deriv, p_nlist, nloc, - nall, nnei, nframes); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::prod_force_r_gpu(p_force, p_net_deriv, p_in_deriv, p_nlist, nloc, nall, nnei, nframes); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_force_r_cpu(p_force, p_net_deriv, p_in_deriv, p_nlist, nloc, nall, nnei, nframes); diff --git a/source/op/prod_virial_grad_multi_device.cc b/source/op/prod_virial_grad_multi_device.cc index ef7d10b3bd..d3e7025e6e 100644 --- a/source/op/prod_virial_grad_multi_device.cc +++ b/source/op/prod_virial_grad_multi_device.cc @@ -142,15 +142,10 @@ class ProdVirialSeAGradOp : public OpKernel { const FPTYPE* rij = p_rij + kk * nloc * nnei * 3; const int* nlist = p_nlist + kk * nloc * nnei; if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::prod_virial_grad_a_gpu(grad_net, grad, in_deriv, rij, nlist, - nloc, nnei); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::prod_virial_grad_a_gpu(grad_net, grad, in_deriv, rij, nlist, nloc, nnei); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_virial_grad_a_cpu(grad_net, grad, in_deriv, rij, nlist, nloc, nnei); @@ -275,15 +270,10 @@ class ProdVirialSeRGradOp : public OpKernel { const FPTYPE* rij = p_rij + kk * nloc * nnei * 3; const int* nlist = p_nlist + kk * nloc * nnei; if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::prod_virial_grad_r_gpu(grad_net, grad, in_deriv, rij, nlist, - nloc, nnei); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::prod_virial_grad_r_gpu(grad_net, grad, in_deriv, rij, nlist, nloc, nnei); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_virial_grad_r_cpu(grad_net, grad, in_deriv, rij, nlist, nloc, nnei); diff --git a/source/op/prod_virial_multi_device.cc b/source/op/prod_virial_multi_device.cc index e3960fc37d..445770e85a 100644 --- a/source/op/prod_virial_multi_device.cc +++ b/source/op/prod_virial_multi_device.cc @@ -120,15 +120,10 @@ class ProdVirialSeAOp : public OpKernel { const FPTYPE* rij = p_rij + kk * nloc * nnei * 3; const int* nlist = p_nlist + kk * nloc * nnei; if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::prod_virial_a_gpu(virial, atom_virial, net_deriv, in_deriv, rij, - nlist, nloc, nall, nnei); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::prod_virial_a_gpu(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_virial_a_cpu(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei); @@ -224,15 +219,10 @@ class ProdVirialSeROp : public OpKernel { const FPTYPE* rij = p_rij + kk * nloc * nnei * 3; const int* nlist = p_nlist + kk * nloc * nnei; if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::prod_virial_r_gpu(virial, atom_virial, net_deriv, in_deriv, rij, - nlist, nloc, nall, nnei); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::prod_virial_r_gpu(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei); -#endif // 
TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_virial_r_cpu(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei); diff --git a/source/op/tabulate_multi_device.cc b/source/op/tabulate_multi_device.cc index 886b9d9a6d..85ea82803a 100644 --- a/source/op/tabulate_multi_device.cc +++ b/source/op/tabulate_multi_device.cc @@ -196,15 +196,10 @@ class TabulateFusionSeAOp : public OpKernel { const int nnei = em_tensor.shape().dim_size(1); if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::tabulate_fusion_se_a_gpu(descriptor, table, table_info, em_x, em, - two_embed, nloc, nnei, last_layer_size); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::tabulate_fusion_se_a_gpu(descriptor, table, table_info, em_x, em, two_embed, nloc, nnei, last_layer_size); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::tabulate_fusion_se_a_cpu(descriptor, table, table_info, em_x, em, two_embed, nloc, nnei, last_layer_size); @@ -266,17 +261,11 @@ class TabulateFusionSeAGradOp : public OpKernel { const int last_layer_size = descriptor_tensor.shape().dim_size(2); if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::tabulate_fusion_se_a_grad_gpu(dy_dem_x, dy_dem, table, table_info, - em_x, em, two_embed, dy, nloc, nnei, - last_layer_size); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::tabulate_fusion_se_a_grad_gpu(dy_dem_x, dy_dem, table, table_info, em_x, em, two_embed, dy, nloc, nnei, last_layer_size); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::tabulate_fusion_se_a_grad_cpu(dy_dem_x, dy_dem, table, table_info, em_x, em, two_embed, dy, nloc, nnei, @@ -330,16 +319,11 @@ class TabulateFusionSeAGradGradOp : public OpKernel { const int last_layer_size = descriptor_tensor.shape().dim_size(2); if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::tabulate_fusion_se_a_grad_grad_gpu( - dz_dy, table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc, - nnei, last_layer_size, is_sorted); -#endif // GOOGLE_CUDA -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::tabulate_fusion_se_a_grad_grad_gpu( dz_dy, table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc, nnei, last_layer_size, is_sorted); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM OP_REQUIRES(context, (last_layer_size <= 1024), errors::InvalidArgument( "In the process of model compression, the size of the " @@ -408,17 +392,11 @@ class TabulateFusionSeAttenOp : public OpKernel { const int nnei = em_tensor.shape().dim_size(1); if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::tabulate_fusion_se_a_gpu(descriptor, table, table_info, em_x, em, - two_embed, nloc, nnei, last_layer_size, - is_sorted); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::tabulate_fusion_se_a_gpu(descriptor, table, table_info, em_x, em, two_embed, nloc, nnei, last_layer_size, is_sorted); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::tabulate_fusion_se_a_cpu(descriptor, table, table_info, em_x, em, two_embed, nloc, nnei, last_layer_size, @@ -489,17 +467,11 @@ class TabulateFusionSeAttenGradOp : public OpKernel { const int last_layer_size = descriptor_tensor.shape().dim_size(2); if (device == "GPU") { -#if GOOGLE_CUDA - 
deepmd::tabulate_fusion_se_a_grad_gpu(dy_dem_x, dy_dem, table, table_info, - em_x, em, two_embed, dy, nloc, nnei, - last_layer_size, is_sorted); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::tabulate_fusion_se_a_grad_gpu(dy_dem_x, dy_dem, table, table_info, em_x, em, two_embed, dy, nloc, nnei, last_layer_size, is_sorted); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::tabulate_fusion_se_a_grad_cpu(dy_dem_x, dy_dem, table, table_info, em_x, em, two_embed, dy, nloc, nnei, @@ -559,15 +531,10 @@ class TabulateFusionSeTOp : public OpKernel { const int nnei_j = em_tensor.shape().dim_size(2); if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::tabulate_fusion_se_t_gpu(descriptor, table, table_info, em_x, em, - nloc, nnei_i, nnei_j, last_layer_size); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::tabulate_fusion_se_t_gpu(descriptor, table, table_info, em_x, em, nloc, nnei_i, nnei_j, last_layer_size); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::tabulate_fusion_se_t_cpu(descriptor, table, table_info, em_x, em, nloc, nnei_i, nnei_j, last_layer_size); @@ -627,17 +594,11 @@ class TabulateFusionSeTGradOp : public OpKernel { const int last_layer_size = descriptor_tensor.shape().dim_size(1); if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::tabulate_fusion_se_t_grad_gpu(dy_dem_x, dy_dem, table, table_info, - em_x, em, dy, nloc, nnei_i, nnei_j, - last_layer_size); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::tabulate_fusion_se_t_grad_gpu(dy_dem_x, dy_dem, table, table_info, em_x, em, dy, nloc, nnei_i, nnei_j, last_layer_size); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::tabulate_fusion_se_t_grad_cpu(dy_dem_x, dy_dem, table, table_info, em_x, em, dy, nloc, nnei_i, nnei_j, @@ -690,16 +651,11 @@ class TabulateFusionSeTGradGradOp : public OpKernel { const int last_layer_size = descriptor_tensor.shape().dim_size(1); if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::tabulate_fusion_se_t_grad_grad_gpu( - dz_dy, table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc, - nnei_i, nnei_j, last_layer_size); -#endif // GOOGLE_CUDA -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::tabulate_fusion_se_t_grad_grad_gpu( dz_dy, table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc, nnei_i, nnei_j, last_layer_size); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM OP_REQUIRES(context, (last_layer_size <= 1024), errors::InvalidArgument( "In the process of model compression, the size of the " @@ -758,15 +714,10 @@ class TabulateFusionSeROp : public OpKernel { const int nnei = em_tensor.shape().dim_size(1); if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::tabulate_fusion_se_r_gpu(descriptor, table, table_info, em, nloc, - nnei, last_layer_size); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::tabulate_fusion_se_r_gpu(descriptor, table, table_info, em, nloc, nnei, last_layer_size); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::tabulate_fusion_se_r_cpu(descriptor, table, table_info, em, nloc, nnei, last_layer_size); @@ -818,15 +769,10 @@ class TabulateFusionSeRGradOp : public OpKernel { const int 
last_layer_size = descriptor_tensor.shape().dim_size(2); if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::tabulate_fusion_se_r_grad_gpu(dy_dem, table, table_info, em, dy, - nloc, nnei, last_layer_size); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::tabulate_fusion_se_r_grad_gpu(dy_dem, table, table_info, em, dy, nloc, nnei, last_layer_size); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::tabulate_fusion_se_r_grad_cpu(dy_dem, table, table_info, em, dy, nloc, nnei, last_layer_size); @@ -871,14 +817,10 @@ class TabulateFusionSeRGradGradOp : public OpKernel { const int last_layer_size = descriptor_tensor.shape().dim_size(2); if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::tabulate_fusion_se_r_grad_grad_gpu( - dz_dy, table, table_info, em, dz_dy_dem, nloc, nnei, last_layer_size); -#endif // GOOGLE_CUDA -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::tabulate_fusion_se_r_grad_grad_gpu( dz_dy, table, table_info, em, dz_dy_dem, nloc, nnei, last_layer_size); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM OP_REQUIRES(context, (last_layer_size <= 1024), errors::InvalidArgument( "In the process of model compression, the size of the " diff --git a/source/op/unaggregated_grad.cc b/source/op/unaggregated_grad.cc index bc67a9fac9..9a61a3bac9 100644 --- a/source/op/unaggregated_grad.cc +++ b/source/op/unaggregated_grad.cc @@ -490,7 +490,7 @@ REGISTER_CPU(float); REGISTER_CPU(double); // Not required in the current situation // // Register the GPU kernels. -// #if GOOGLE_CUDA +// #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM // #define REGISTER_GPU(T) \ // REGISTER_KERNEL_BUILDER( \ // Name("UnaggregatedDyDxS").Device(DEVICE_GPU).TypeConstraint("T"), \ @@ -500,4 +500,4 @@ REGISTER_CPU(double); // UnaggregatedDyDxOp); // REGISTER_GPU(float); // REGISTER_GPU(double); -// #endif // GOOGLE_CUDA +// #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
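
For readers skimming the diff, the whole change rests on the thin alias layer in gpu_cuda.h / gpu_rocm.h touched at the top of the patch: op code is written once against neutral gpu* names, and the platform header maps those names onto the CUDA or HIP runtime. A condensed sketch of that idea (hypothetical single-header form, for illustration only; in deepmd-kit the two headers are separate files and the op sources pick the right one up via device.h, which this patch now includes in prod_env_mat_multi_device.cc):

#if GOOGLE_CUDA
#define gpuMemcpy cudaMemcpy
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice  // added by this patch
#define gpuMemset cudaMemset
#elif TENSORFLOW_USE_ROCM
#define gpuMemcpy hipMemcpy
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice   // added by this patch
#define gpuMemset hipMemset
#endif

// With the alias in place, the device-to-device copy in _norm_copy_coord_gpu
// can be spelled once for both back ends, which is exactly the rewrite above:
//   DPErrcheck(gpuMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3,
//                        gpuMemcpyDeviceToDevice));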
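
The rest of the diff then collapses each op's duplicated #if GOOGLE_CUDA / #if TENSORFLOW_USE_ROCM blocks, which called identical *_gpu functions, into a single guard. A self-contained sketch of the resulting shape of a compute path; example_gpu / example_cpu are placeholders, not deepmd-kit symbols:

#include <cstdint>
#include <string>

// Stand-ins for a templated GPU/CPU kernel pair such as deepmd::gelu_gpu /
// deepmd::gelu_cpu. The GPU entry point is itself compiled from one source
// for CUDA and ROCm, thanks to the gpu* aliases sketched above.
template <typename FPTYPE>
void example_gpu(FPTYPE *out, const FPTYPE *in, std::int64_t size);
template <typename FPTYPE>
void example_cpu(FPTYPE *out, const FPTYPE *in, std::int64_t size) {
  for (std::int64_t i = 0; i < size; ++i) {
    out[i] = in[i];  // trivial copy, only to keep the sketch compilable
  }
}

template <typename FPTYPE>
void compute(const std::string &device, FPTYPE *out, const FPTYPE *in,
             std::int64_t size) {
  if (device == "GPU") {
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
    example_gpu(out, in, size);  // one call now covers both back ends
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
  } else if (device == "CPU") {
    example_cpu(out, in, size);
  }
}

Behaviour is unchanged; the two branches previously contained the same call, so merging the guards only removes duplication (hence the roughly 540-line net reduction in the diffstat).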
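
The GPU neighbor-list helpers kept by the merge (_norm_copy_coord_gpu, _build_nlist_gpu) share a grow-and-retry idiom: attempt the copy or neighbor-list build into the current temporary tensor, and on overflow double mem_cpy / mem_nnei, reallocate, and try again, up to max_cpy_trial / max_nnei_trial times. A stripped-down sketch of that control flow; build_into is a hypothetical stand-in for deepmd::copy_coord_gpu / deepmd::build_nlist_gpu, and a plain std::vector stands in for the TensorFlow temporaries obtained with context->allocate_temp:

#include <vector>

// Returns 0 on success, nonzero when the provided buffer is too small,
// matching the convention of copy_coord_gpu / build_nlist_gpu.
int build_into(std::vector<int> &buf, int needed) {
  if (static_cast<int>(buf.size()) < needed) {
    return 1;  // overflow: caller must grow the buffer and retry
  }
  // ... fill buf with the actual result ...
  return 0;
}

// Double the buffer after each failed attempt; give up after max_trial tries
// (the ops then raise errors::Aborted("cannot allocate mem for ...")).
// buf is assumed non-empty on entry, as mem_cpy / mem_nnei are positive.
bool build_with_retry(std::vector<int> &buf, int needed, int max_trial) {
  int tt = 0;
  for (; tt < max_trial; ++tt) {
    if (build_into(buf, needed) == 0) {
      break;  // success
    }
    buf.resize(buf.size() * 2);  // plays the role of mem_cpy *= 2 / mem_nnei *= 2
  }
  return tt != max_trial;  // same convention as `return (tt != max_cpy_trial);`
}

On success, _prepare_coord_nlist_gpu additionally rounds max_nbor_size up to 1024, 2048, or 4096 (see the retained GPU branch above) before handing the neighbor list to the prod_env_mat_*_gpu kernels.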