From f8a6c8f20ceb4233a8b5cc70ccee009c5df18f80 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Tue, 19 Sep 2023 20:59:48 -0400 Subject: [PATCH] merge CUDA and ROCm in header files Signed-off-by: Jinzhe Zeng --- source/lib/include/coord.h | 45 +--------- source/lib/include/fmt_nlist.h | 28 +----- source/lib/include/gelu.h | 23 +---- source/lib/include/neighbor_list.h | 46 +--------- source/lib/include/prod_env_mat.h | 52 +---------- source/lib/include/prod_env_mat_nvnmd.h | 8 +- source/lib/include/prod_force.h | 26 +----- source/lib/include/prod_force_grad.h | 23 +---- source/lib/include/prod_virial.h | 28 +----- source/lib/include/prod_virial_grad.h | 24 +---- source/lib/include/region.h | 19 +--- source/lib/include/tabulate.h | 112 +----------------------- 12 files changed, 24 insertions(+), 410 deletions(-) diff --git a/source/lib/include/coord.h b/source/lib/include/coord.h index fb60f6440b..699a90898c 100644 --- a/source/lib/include/coord.h +++ b/source/lib/include/coord.h @@ -44,7 +44,7 @@ void compute_cell_info(int* cell_info, const float& rcut, const deepmd::Region& region); -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM // normalize coords // output: // coord @@ -83,47 +83,6 @@ int copy_coord_gpu(FPTYPE* out_c, const int& total_cellnum, const int* cell_info, const deepmd::Region& region); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM -// normalize coords -// output: -// coord -// input: -// natom, box_info: boxt, rec_boxt -template -void normalize_coord_gpu(FPTYPE* coord, - const int natom, - const deepmd::Region& region); - -// copy coordinates -// outputs: -// out_c, out_t, mapping, nall, -// int_data(temp cuda -// memory):idx_map,idx_map_noshift,temp_idx_order,loc_cellnum_map,total_cellnum_map,mask_cellnum_map, -// cell_map,cell_shift_map,sec_loc_cellnum_map,sec_total_cellnum_map,loc_clist -// inputs: -// in_c, in_t, nloc, mem_nall, loc_cellnum, total_cellnum, cell_info, -// box_info mem_nall is the size of allocated memory for out_c, out_t, -// mapping -// returns -// 0: succssful -// 1: the memory is not large enough to hold all copied coords and types. -// i.e. 
nall > mem_nall -template -int copy_coord_gpu(FPTYPE* out_c, - int* out_t, - int* mapping, - int* nall, - int* int_data, - const FPTYPE* in_c, - const int* in_t, - const int& nloc, - const int& mem_nall, - const int& loc_cellnum, - const int& total_cellnum, - const int* cell_info, - const deepmd::Region& region); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } // namespace deepmd diff --git a/source/lib/include/fmt_nlist.h b/source/lib/include/fmt_nlist.h index 1e7c6574cc..18cb319304 100644 --- a/source/lib/include/fmt_nlist.h +++ b/source/lib/include/fmt_nlist.h @@ -18,7 +18,7 @@ void format_nlist_cpu(int* nlist, const float rcut, const std::vector sec); -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM template void format_nbor_list_gpu(int* nlist, const FPTYPE* coord, @@ -40,31 +40,7 @@ void test_encoding_decoding_nbor_info_gpu(uint_64* key, const FPTYPE* in_dist, const int* in_index, const int size_of_array); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM -template -void format_nbor_list_gpu(int* nlist, - const FPTYPE* coord, - const int* type, - const deepmd::InputNlist& gpu_inlist, - int* array_int, - uint_64* array_longlong, - const int max_nbor_size, - const int nloc, - const int nall, - const float rcut, - const std::vector sec); - -template -void test_encoding_decoding_nbor_info_gpu(uint_64* key, - int* out_type, - int* out_index, - const int* in_type, - const FPTYPE* in_dist, - const int* in_index, - const int size_of_array); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } // namespace deepmd diff --git a/source/lib/include/gelu.h b/source/lib/include/gelu.h index 946c283c8d..013d4ef02b 100644 --- a/source/lib/include/gelu.h +++ b/source/lib/include/gelu.h @@ -20,7 +20,7 @@ void gelu_grad_grad_cpu(FPTYPE* out, const FPTYPE* dy_2, const int_64 size); -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM template void gelu_gpu(FPTYPE* out, const FPTYPE* xx, const int_64 size); @@ -36,24 +36,5 @@ void gelu_grad_grad_gpu(FPTYPE* out, const FPTYPE* dy, const FPTYPE* dy_2, const int_64 size); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM -template -void gelu_gpu(FPTYPE* out, const FPTYPE* xx, const int_64 size); - -template -void gelu_grad_gpu(FPTYPE* out, - const FPTYPE* xx, - const FPTYPE* dy, - const int_64 size); - -template -void gelu_grad_grad_gpu(FPTYPE* out, - const FPTYPE* xx, - const FPTYPE* dy, - const FPTYPE* dy_2, - const int_64 size); - -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } // namespace deepmd diff --git a/source/lib/include/neighbor_list.h b/source/lib/include/neighbor_list.h index 5ed2dd4501..eb510eb25b 100644 --- a/source/lib/include/neighbor_list.h +++ b/source/lib/include/neighbor_list.h @@ -121,7 +121,7 @@ void use_nlist_map(int* nlist, #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM // build neighbor list. // outputs // nlist, max_list_size @@ -162,49 +162,7 @@ void use_nei_info_gpu(int* nlist, const int ntypes, const bool b_nlist_map); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM -// build neighbor list. -// outputs -// nlist, max_list_size -// max_list_size is the maximal size of jlist. -// inputs -// c_cpy, nloc, nall, mem_size, rcut, region -// mem_size is the size of allocated memory for jlist. -// returns -// 0: succssful -// 1: the memory is not large enough to hold all neighbors. -// i.e. 
max_list_size > mem_nall -template -int build_nlist_gpu(InputNlist& nlist, - int* max_list_size, - int* nlist_data, - const FPTYPE* c_cpy, - const int& nloc, - const int& nall, - const int& mem_size, - const float& rcut); -/** - * @brief Filter the fake atom type. - * @details If >=0, set to 0; if <0, set to -1. - * @param ftype_out The output filtered atom type. - * @param ftype_in The input atom type. - * @param nloc The number of atoms. - */ -void filter_ftype_gpu(int* ftype_out, const int* ftype_in, const int nloc); - -void use_nei_info_gpu(int* nlist, - int* ntype, - bool* nmask, - const int* type, - const int* nlist_map, - const int nloc, - const int nnei, - const int ntypes, - const bool b_nlist_map); - -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } // namespace deepmd diff --git a/source/lib/include/prod_env_mat.h b/source/lib/include/prod_env_mat.h index 91f09f74e7..60da638d68 100644 --- a/source/lib/include/prod_env_mat.h +++ b/source/lib/include/prod_env_mat.h @@ -42,7 +42,7 @@ void prod_env_mat_r_cpu(FPTYPE *em, const float rcut_smth, const std::vector sec); -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM template void prod_env_mat_a_gpu(FPTYPE *em, FPTYPE *em_deriv, @@ -88,54 +88,6 @@ void env_mat_nbor_update(InputNlist &inlist, int *&nbor_list_dev, const int *mesh, const int size); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM -template -void prod_env_mat_a_gpu(FPTYPE *em, - FPTYPE *em_deriv, - FPTYPE *rij, - int *nlist, - const FPTYPE *coord, - const int *type, - const InputNlist &gpu_inlist, - int *array_int, - unsigned long long *array_longlong, - const int max_nbor_size, - const FPTYPE *avg, - const FPTYPE *std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec, - const int *f_type = NULL); - -template -void prod_env_mat_r_gpu(FPTYPE *em, - FPTYPE *em_deriv, - FPTYPE *rij, - int *nlist, - const FPTYPE *coord, - const int *type, - const InputNlist &gpu_inlist, - int *array_int, - unsigned long long *array_longlong, - const int max_nbor_size, - const FPTYPE *avg, - const FPTYPE *std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec); - -void env_mat_nbor_update(InputNlist &inlist, - InputNlist &gpu_inlist, - int &max_nbor_size, - int *&nbor_list_dev, - const int *mesh, - const int size); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } // namespace deepmd diff --git a/source/lib/include/prod_env_mat_nvnmd.h b/source/lib/include/prod_env_mat_nvnmd.h index df70423021..c0a7e32cc4 100644 --- a/source/lib/include/prod_env_mat_nvnmd.h +++ b/source/lib/include/prod_env_mat_nvnmd.h @@ -45,12 +45,8 @@ void prod_env_mat_a_nvnmd_quantize_cpu(FPTYPE* em, const std::vector sec, const int* f_type = NULL); -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM // UNDEFINE -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM -// UNDEFINE -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } // namespace deepmd diff --git a/source/lib/include/prod_force.h b/source/lib/include/prod_force.h index 03c72ba661..b5ae68bdce 100644 --- a/source/lib/include/prod_force.h +++ b/source/lib/include/prod_force.h @@ -67,7 +67,7 @@ void prod_force_r_cpu(FPTYPE* force, const int nnei, const int nframes); -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM template void prod_force_a_gpu(FPTYPE* force, const FPTYPE* net_deriv, @@ -87,28 +87,6 @@ void prod_force_r_gpu(FPTYPE* force, const int nall, 
const int nnei, const int nframes); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM -template -void prod_force_a_gpu(FPTYPE* force, - const FPTYPE* net_deriv, - const FPTYPE* in_deriv, - const int* nlist, - const int nloc, - const int nall, - const int nnei, - const int nframes); - -template -void prod_force_r_gpu(FPTYPE* force, - const FPTYPE* net_deriv, - const FPTYPE* in_deriv, - const int* nlist, - const int nloc, - const int nall, - const int nnei, - const int nframes); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } // namespace deepmd diff --git a/source/lib/include/prod_force_grad.h b/source/lib/include/prod_force_grad.h index 5d0ab50b68..737d54001d 100644 --- a/source/lib/include/prod_force_grad.h +++ b/source/lib/include/prod_force_grad.h @@ -21,7 +21,7 @@ void prod_force_grad_r_cpu(FPTYPE* grad_net, const int nnei, const int nframes); -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM template void prod_force_grad_a_gpu(FPTYPE* grad_net, const FPTYPE* grad, @@ -39,25 +39,6 @@ void prod_force_grad_r_gpu(FPTYPE* grad_net, const int nloc, const int nnei, const int nframes); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM -#if TENSORFLOW_USE_ROCM -template -void prod_force_grad_a_gpu(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const int* nlist, - const int nloc, - const int nnei, - const int nframes); - -template -void prod_force_grad_r_gpu(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const int* nlist, - const int nloc, - const int nnei, - const int nframes); -#endif // TENSORFLOW_USE_ROCM } // namespace deepmd diff --git a/source/lib/include/prod_virial.h b/source/lib/include/prod_virial.h index 348188874c..d42b547d32 100644 --- a/source/lib/include/prod_virial.h +++ b/source/lib/include/prod_virial.h @@ -25,7 +25,7 @@ void prod_virial_r_cpu(FPTYPE* virial, const int nall, const int nnei); -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM template void prod_virial_a_gpu(FPTYPE* virial, FPTYPE* atom_virial, @@ -47,30 +47,6 @@ void prod_virial_r_gpu(FPTYPE* virial, const int nloc, const int nall, const int nnei); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM -template -void prod_virial_a_gpu(FPTYPE* virial, - FPTYPE* atom_virial, - const FPTYPE* net_deriv, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei); - -template -void prod_virial_r_gpu(FPTYPE* virial, - FPTYPE* atom_virial, - const FPTYPE* net_deriv, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } // namespace deepmd diff --git a/source/lib/include/prod_virial_grad.h b/source/lib/include/prod_virial_grad.h index 6e0c232f8a..eda98f9634 100644 --- a/source/lib/include/prod_virial_grad.h +++ b/source/lib/include/prod_virial_grad.h @@ -21,7 +21,7 @@ void prod_virial_grad_r_cpu(FPTYPE* grad_net, const int nloc, const int nnei); -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM template void prod_virial_grad_a_gpu(FPTYPE* grad_net, const FPTYPE* grad, @@ -39,26 +39,6 @@ void prod_virial_grad_r_gpu(FPTYPE* grad_net, const int* nlist, const int nloc, const int nnei); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM -template -void prod_virial_grad_a_gpu(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int 
nnei); - -template -void prod_virial_grad_r_gpu(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } // namespace deepmd diff --git a/source/lib/include/region.h b/source/lib/include/region.h index 9db2735462..2f6dbbf4e0 100644 --- a/source/lib/include/region.h +++ b/source/lib/include/region.h @@ -27,7 +27,7 @@ void convert_to_phys_cpu(FPTYPE* rp, const Region& region, const FPTYPE* ri); -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM // only for unittest template void convert_to_inter_gpu(FPTYPE* ri, @@ -41,21 +41,6 @@ void convert_to_phys_gpu(FPTYPE* rp, template void volume_gpu(FPTYPE* volume, const Region& region); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM -#if TENSORFLOW_USE_ROCM -// only for unittest -template -void convert_to_inter_gpu(FPTYPE* ri, - const Region& region, - const FPTYPE* rp); - -template -void convert_to_phys_gpu(FPTYPE* rp, - const Region& region, - const FPTYPE* ri); - -template -void volume_gpu(FPTYPE* volume, const Region& region); -#endif // TENSORFLOW_USE_ROCM } // namespace deepmd diff --git a/source/lib/include/tabulate.h b/source/lib/include/tabulate.h index 96072e6a33..76a46bbe6c 100644 --- a/source/lib/include/tabulate.h +++ b/source/lib/include/tabulate.h @@ -108,7 +108,7 @@ void tabulate_fusion_se_r_grad_grad_cpu(FPTYPE* dz_dy, const int nnei, const int last_layer_size); -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM template void tabulate_fusion_se_a_gpu(FPTYPE* out, const FPTYPE* table, @@ -213,113 +213,5 @@ void tabulate_fusion_se_r_grad_grad_gpu(FPTYPE* dz_dy, const int nloc, const int nnei, const int last_layer_size); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM -template -void tabulate_fusion_se_a_gpu(FPTYPE* out, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em_x, - const FPTYPE* em, - const FPTYPE* two_embed, - const int nloc, - const int nnei, - const int last_layer_size, - const bool is_sorted = true); - -template -void tabulate_fusion_se_a_grad_gpu(FPTYPE* dy_dem_x, - FPTYPE* dy_dem, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em_x, - const FPTYPE* em, - const FPTYPE* two_embed, - const FPTYPE* dy, - const int nloc, - const int nnei, - const int last_layer_size, - const bool is_sorted = true); - -template -void tabulate_fusion_se_a_grad_grad_gpu(FPTYPE* dz_dy, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em_x, - const FPTYPE* em, - const FPTYPE* dz_dy_dem_x, - const FPTYPE* dz_dy_dem, - const int nloc, - const int nnei, - const int last_layer_size, - const bool is_sorted = true); - -template -void tabulate_fusion_se_t_gpu(FPTYPE* out, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em_x, - const FPTYPE* em, - const int nloc, - const int nnei_i, - const int nnei_j, - const int last_layer_size); - -template -void tabulate_fusion_se_t_grad_gpu(FPTYPE* dy_dem_x, - FPTYPE* dy_dem, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em_x, - const FPTYPE* em, - const FPTYPE* dy, - const int nloc, - const int nnei_i, - const int nnei_j, - const int last_layer_size); - -template -void tabulate_fusion_se_t_grad_grad_gpu(FPTYPE* dz_dy, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em_x, - const FPTYPE* em, - const FPTYPE* dz_dy_dem_x, - const FPTYPE* dz_dy_dem, - const int nloc, - const int nnei_i, - const int 
nnei_j, - const int last_layer_size); - -template -void tabulate_fusion_se_r_gpu(FPTYPE* out, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em, - const int nloc, - const int nnei, - const int last_layer_size); - -template -void tabulate_fusion_se_r_grad_gpu(FPTYPE* dy_dem, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em, - const FPTYPE* dy, - const int nloc, - const int nnei, - const int last_layer_size); - -template -void tabulate_fusion_se_r_grad_grad_gpu(FPTYPE* dz_dy, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em, - const FPTYPE* dz_dy_dem, - const int nloc, - const int nnei, - const int last_layer_size); - -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } // namespace deepmd
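
Note (not part of the patch): the change applied in every header above follows a single pattern, sketched below with a hypothetical `example_op_gpu` declaration rather than a function from the library. Previously each GPU declaration appeared twice, once per backend guard; after this patch one declaration is guarded by both macros, so the CUDA and ROCm builds share the same prototypes.

    // Before: the same declaration duplicated for each backend.
    #if GOOGLE_CUDA
    template <typename FPTYPE>
    void example_op_gpu(FPTYPE* out, const FPTYPE* in, const int size);
    #endif  // GOOGLE_CUDA

    #if TENSORFLOW_USE_ROCM
    template <typename FPTYPE>
    void example_op_gpu(FPTYPE* out, const FPTYPE* in, const int size);
    #endif  // TENSORFLOW_USE_ROCM

    // After: one declaration covers both CUDA and ROCm builds.
    #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
    template <typename FPTYPE>
    void example_op_gpu(FPTYPE* out, const FPTYPE* in, const int size);
    #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

Since the declarations in each pair of blocks were identical, only the preprocessor guards change; no function signature is modified, which is why the diff removes roughly 400 lines while adding about 24.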