diff --git a/source/lib/include/coord.h b/source/lib/include/coord.h index 56d90fbb17..fb60f6440b 100644 --- a/source/lib/include/coord.h +++ b/source/lib/include/coord.h @@ -92,9 +92,9 @@ int copy_coord_gpu(FPTYPE* out_c, // input: // natom, box_info: boxt, rec_boxt template -void normalize_coord_gpu_rocm(FPTYPE* coord, - const int natom, - const deepmd::Region& region); +void normalize_coord_gpu(FPTYPE* coord, + const int natom, + const deepmd::Region& region); // copy coordinates // outputs: @@ -111,19 +111,19 @@ void normalize_coord_gpu_rocm(FPTYPE* coord, // 1: the memory is not large enough to hold all copied coords and types. // i.e. nall > mem_nall template -int copy_coord_gpu_rocm(FPTYPE* out_c, - int* out_t, - int* mapping, - int* nall, - int* int_data, - const FPTYPE* in_c, - const int* in_t, - const int& nloc, - const int& mem_nall, - const int& loc_cellnum, - const int& total_cellnum, - const int* cell_info, - const deepmd::Region& region); +int copy_coord_gpu(FPTYPE* out_c, + int* out_t, + int* mapping, + int* nall, + int* int_data, + const FPTYPE* in_c, + const int* in_t, + const int& nloc, + const int& mem_nall, + const int& loc_cellnum, + const int& total_cellnum, + const int* cell_info, + const deepmd::Region& region); #endif // TENSORFLOW_USE_ROCM } // namespace deepmd diff --git a/source/lib/include/fmt_nlist.h b/source/lib/include/fmt_nlist.h index 60e34c7da9..1e7c6574cc 100644 --- a/source/lib/include/fmt_nlist.h +++ b/source/lib/include/fmt_nlist.h @@ -20,50 +20,50 @@ void format_nlist_cpu(int* nlist, #if GOOGLE_CUDA template -void format_nbor_list_gpu_cuda(int* nlist, - const FPTYPE* coord, - const int* type, - const deepmd::InputNlist& gpu_inlist, - int* array_int, - uint_64* array_longlong, - const int max_nbor_size, - const int nloc, - const int nall, - const float rcut, - const std::vector sec); +void format_nbor_list_gpu(int* nlist, + const FPTYPE* coord, + const int* type, + const deepmd::InputNlist& gpu_inlist, + int* array_int, + uint_64* array_longlong, + const int max_nbor_size, + const int nloc, + const int nall, + const float rcut, + const std::vector sec); template -void test_encoding_decoding_nbor_info_gpu_cuda(uint_64* key, - int* out_type, - int* out_index, - const int* in_type, - const FPTYPE* in_dist, - const int* in_index, - const int size_of_array); +void test_encoding_decoding_nbor_info_gpu(uint_64* key, + int* out_type, + int* out_index, + const int* in_type, + const FPTYPE* in_dist, + const int* in_index, + const int size_of_array); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM template -void format_nbor_list_gpu_rocm(int* nlist, - const FPTYPE* coord, - const int* type, - const deepmd::InputNlist& gpu_inlist, - int* array_int, - uint_64* array_longlong, - const int max_nbor_size, - const int nloc, - const int nall, - const float rcut, - const std::vector sec); +void format_nbor_list_gpu(int* nlist, + const FPTYPE* coord, + const int* type, + const deepmd::InputNlist& gpu_inlist, + int* array_int, + uint_64* array_longlong, + const int max_nbor_size, + const int nloc, + const int nall, + const float rcut, + const std::vector sec); template -void test_encoding_decoding_nbor_info_gpu_rocm(uint_64* key, - int* out_type, - int* out_index, - const int* in_type, - const FPTYPE* in_dist, - const int* in_index, - const int size_of_array); +void test_encoding_decoding_nbor_info_gpu(uint_64* key, + int* out_type, + int* out_index, + const int* in_type, + const FPTYPE* in_dist, + const int* in_index, + const int size_of_array); #endif // TENSORFLOW_USE_ROCM } // namespace deepmd diff --git a/source/lib/include/gelu.h b/source/lib/include/gelu.h index a3985ce0cc..946c283c8d 100644 --- a/source/lib/include/gelu.h +++ b/source/lib/include/gelu.h @@ -22,38 +22,38 @@ void gelu_grad_grad_cpu(FPTYPE* out, #if GOOGLE_CUDA template -void gelu_gpu_cuda(FPTYPE* out, const FPTYPE* xx, const int_64 size); +void gelu_gpu(FPTYPE* out, const FPTYPE* xx, const int_64 size); template -void gelu_grad_gpu_cuda(FPTYPE* out, +void gelu_grad_gpu(FPTYPE* out, + const FPTYPE* xx, + const FPTYPE* dy, + const int_64 size); + +template +void gelu_grad_grad_gpu(FPTYPE* out, const FPTYPE* xx, const FPTYPE* dy, + const FPTYPE* dy_2, const int_64 size); - -template -void gelu_grad_grad_gpu_cuda(FPTYPE* out, - const FPTYPE* xx, - const FPTYPE* dy, - const FPTYPE* dy_2, - const int_64 size); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM template -void gelu_gpu_rocm(FPTYPE* out, const FPTYPE* xx, const int_64 size); +void gelu_gpu(FPTYPE* out, const FPTYPE* xx, const int_64 size); template -void gelu_grad_gpu_rocm(FPTYPE* out, +void gelu_grad_gpu(FPTYPE* out, + const FPTYPE* xx, + const FPTYPE* dy, + const int_64 size); + +template +void gelu_grad_grad_gpu(FPTYPE* out, const FPTYPE* xx, const FPTYPE* dy, + const FPTYPE* dy_2, const int_64 size); -template -void gelu_grad_grad_gpu_rocm(FPTYPE* out, - const FPTYPE* xx, - const FPTYPE* dy, - const FPTYPE* dy_2, - const int_64 size); - #endif // TENSORFLOW_USE_ROCM } // namespace deepmd diff --git a/source/lib/include/neighbor_list.h b/source/lib/include/neighbor_list.h index 4e0ce4f2de..5ed2dd4501 100644 --- a/source/lib/include/neighbor_list.h +++ b/source/lib/include/neighbor_list.h @@ -150,7 +150,7 @@ int build_nlist_gpu(InputNlist& nlist, * @param ftype_in The input atom type. * @param nloc The number of atoms. */ -void filter_ftype_gpu_cuda(int* ftype_out, const int* ftype_in, const int nloc); +void filter_ftype_gpu(int* ftype_out, const int* ftype_in, const int nloc); void use_nei_info_gpu(int* nlist, int* ntype, @@ -177,14 +177,14 @@ void use_nei_info_gpu(int* nlist, // 1: the memory is not large enough to hold all neighbors. // i.e. max_list_size > mem_nall template -int build_nlist_gpu_rocm(InputNlist& nlist, - int* max_list_size, - int* nlist_data, - const FPTYPE* c_cpy, - const int& nloc, - const int& nall, - const int& mem_size, - const float& rcut); +int build_nlist_gpu(InputNlist& nlist, + int* max_list_size, + int* nlist_data, + const FPTYPE* c_cpy, + const int& nloc, + const int& nall, + const int& mem_size, + const float& rcut); /** * @brief Filter the fake atom type. * @details If >=0, set to 0; if <0, set to -1. @@ -192,17 +192,17 @@ int build_nlist_gpu_rocm(InputNlist& nlist, * @param ftype_in The input atom type. * @param nloc The number of atoms. */ -void filter_ftype_gpu_rocm(int* ftype_out, const int* ftype_in, const int nloc); +void filter_ftype_gpu(int* ftype_out, const int* ftype_in, const int nloc); -void use_nei_info_gpu_rocm(int* nlist, - int* ntype, - bool* nmask, - const int* type, - const int* nlist_map, - const int nloc, - const int nnei, - const int ntypes, - const bool b_nlist_map); +void use_nei_info_gpu(int* nlist, + int* ntype, + bool* nmask, + const int* type, + const int* nlist_map, + const int nloc, + const int nnei, + const int ntypes, + const bool b_nlist_map); #endif // TENSORFLOW_USE_ROCM diff --git a/source/lib/include/prod_env_mat.h b/source/lib/include/prod_env_mat.h index a1cd27bef0..91f09f74e7 100644 --- a/source/lib/include/prod_env_mat.h +++ b/source/lib/include/prod_env_mat.h @@ -44,43 +44,43 @@ void prod_env_mat_r_cpu(FPTYPE *em, #if GOOGLE_CUDA template -void prod_env_mat_a_gpu_cuda(FPTYPE *em, - FPTYPE *em_deriv, - FPTYPE *rij, - int *nlist, - const FPTYPE *coord, - const int *type, - const InputNlist &gpu_inlist, - int *array_int, - unsigned long long *array_longlong, - const int max_nbor_size, - const FPTYPE *avg, - const FPTYPE *std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec, - const int *f_type = NULL); +void prod_env_mat_a_gpu(FPTYPE *em, + FPTYPE *em_deriv, + FPTYPE *rij, + int *nlist, + const FPTYPE *coord, + const int *type, + const InputNlist &gpu_inlist, + int *array_int, + unsigned long long *array_longlong, + const int max_nbor_size, + const FPTYPE *avg, + const FPTYPE *std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec, + const int *f_type = NULL); template -void prod_env_mat_r_gpu_cuda(FPTYPE *em, - FPTYPE *em_deriv, - FPTYPE *rij, - int *nlist, - const FPTYPE *coord, - const int *type, - const InputNlist &gpu_inlist, - int *array_int, - unsigned long long *array_longlong, - const int max_nbor_size, - const FPTYPE *avg, - const FPTYPE *std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec); +void prod_env_mat_r_gpu(FPTYPE *em, + FPTYPE *em_deriv, + FPTYPE *rij, + int *nlist, + const FPTYPE *coord, + const int *type, + const InputNlist &gpu_inlist, + int *array_int, + unsigned long long *array_longlong, + const int max_nbor_size, + const FPTYPE *avg, + const FPTYPE *std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec); void env_mat_nbor_update(InputNlist &inlist, InputNlist &gpu_inlist, @@ -92,43 +92,43 @@ void env_mat_nbor_update(InputNlist &inlist, #if TENSORFLOW_USE_ROCM template -void prod_env_mat_a_gpu_rocm(FPTYPE *em, - FPTYPE *em_deriv, - FPTYPE *rij, - int *nlist, - const FPTYPE *coord, - const int *type, - const InputNlist &gpu_inlist, - int *array_int, - unsigned long long *array_longlong, - const int max_nbor_size, - const FPTYPE *avg, - const FPTYPE *std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec, - const int *f_type = NULL); +void prod_env_mat_a_gpu(FPTYPE *em, + FPTYPE *em_deriv, + FPTYPE *rij, + int *nlist, + const FPTYPE *coord, + const int *type, + const InputNlist &gpu_inlist, + int *array_int, + unsigned long long *array_longlong, + const int max_nbor_size, + const FPTYPE *avg, + const FPTYPE *std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec, + const int *f_type = NULL); template -void prod_env_mat_r_gpu_rocm(FPTYPE *em, - FPTYPE *em_deriv, - FPTYPE *rij, - int *nlist, - const FPTYPE *coord, - const int *type, - const InputNlist &gpu_inlist, - int *array_int, - unsigned long long *array_longlong, - const int max_nbor_size, - const FPTYPE *avg, - const FPTYPE *std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec); +void prod_env_mat_r_gpu(FPTYPE *em, + FPTYPE *em_deriv, + FPTYPE *rij, + int *nlist, + const FPTYPE *coord, + const int *type, + const InputNlist &gpu_inlist, + int *array_int, + unsigned long long *array_longlong, + const int max_nbor_size, + const FPTYPE *avg, + const FPTYPE *std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec); void env_mat_nbor_update(InputNlist &inlist, InputNlist &gpu_inlist, diff --git a/source/lib/include/prod_force.h b/source/lib/include/prod_force.h index ce3e020a3b..03c72ba661 100644 --- a/source/lib/include/prod_force.h +++ b/source/lib/include/prod_force.h @@ -69,46 +69,46 @@ void prod_force_r_cpu(FPTYPE* force, #if GOOGLE_CUDA template -void prod_force_a_gpu_cuda(FPTYPE* force, - const FPTYPE* net_deriv, - const FPTYPE* in_deriv, - const int* nlist, - const int nloc, - const int nall, - const int nnei, - const int nframes); +void prod_force_a_gpu(FPTYPE* force, + const FPTYPE* net_deriv, + const FPTYPE* in_deriv, + const int* nlist, + const int nloc, + const int nall, + const int nnei, + const int nframes); template -void prod_force_r_gpu_cuda(FPTYPE* force, - const FPTYPE* net_deriv, - const FPTYPE* in_deriv, - const int* nlist, - const int nloc, - const int nall, - const int nnei, - const int nframes); +void prod_force_r_gpu(FPTYPE* force, + const FPTYPE* net_deriv, + const FPTYPE* in_deriv, + const int* nlist, + const int nloc, + const int nall, + const int nnei, + const int nframes); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM template -void prod_force_a_gpu_rocm(FPTYPE* force, - const FPTYPE* net_deriv, - const FPTYPE* in_deriv, - const int* nlist, - const int nloc, - const int nall, - const int nnei, - const int nframes); +void prod_force_a_gpu(FPTYPE* force, + const FPTYPE* net_deriv, + const FPTYPE* in_deriv, + const int* nlist, + const int nloc, + const int nall, + const int nnei, + const int nframes); template -void prod_force_r_gpu_rocm(FPTYPE* force, - const FPTYPE* net_deriv, - const FPTYPE* in_deriv, - const int* nlist, - const int nloc, - const int nall, - const int nnei, - const int nframes); +void prod_force_r_gpu(FPTYPE* force, + const FPTYPE* net_deriv, + const FPTYPE* in_deriv, + const int* nlist, + const int nloc, + const int nall, + const int nnei, + const int nframes); #endif // TENSORFLOW_USE_ROCM } // namespace deepmd diff --git a/source/lib/include/prod_force_grad.h b/source/lib/include/prod_force_grad.h index 4d224ad93f..5d0ab50b68 100644 --- a/source/lib/include/prod_force_grad.h +++ b/source/lib/include/prod_force_grad.h @@ -23,41 +23,41 @@ void prod_force_grad_r_cpu(FPTYPE* grad_net, #if GOOGLE_CUDA template -void prod_force_grad_a_gpu_cuda(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const int* nlist, - const int nloc, - const int nnei, - const int nframes); +void prod_force_grad_a_gpu(FPTYPE* grad_net, + const FPTYPE* grad, + const FPTYPE* env_deriv, + const int* nlist, + const int nloc, + const int nnei, + const int nframes); template -void prod_force_grad_r_gpu_cuda(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const int* nlist, - const int nloc, - const int nnei, - const int nframes); +void prod_force_grad_r_gpu(FPTYPE* grad_net, + const FPTYPE* grad, + const FPTYPE* env_deriv, + const int* nlist, + const int nloc, + const int nnei, + const int nframes); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM template -void prod_force_grad_a_gpu_rocm(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const int* nlist, - const int nloc, - const int nnei, - const int nframes); +void prod_force_grad_a_gpu(FPTYPE* grad_net, + const FPTYPE* grad, + const FPTYPE* env_deriv, + const int* nlist, + const int nloc, + const int nnei, + const int nframes); template -void prod_force_grad_r_gpu_rocm(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const int* nlist, - const int nloc, - const int nnei, - const int nframes); +void prod_force_grad_r_gpu(FPTYPE* grad_net, + const FPTYPE* grad, + const FPTYPE* env_deriv, + const int* nlist, + const int nloc, + const int nnei, + const int nframes); #endif // TENSORFLOW_USE_ROCM } // namespace deepmd diff --git a/source/lib/include/prod_virial.h b/source/lib/include/prod_virial.h index 46e0ef3ab9..348188874c 100644 --- a/source/lib/include/prod_virial.h +++ b/source/lib/include/prod_virial.h @@ -27,50 +27,50 @@ void prod_virial_r_cpu(FPTYPE* virial, #if GOOGLE_CUDA template -void prod_virial_a_gpu_cuda(FPTYPE* virial, - FPTYPE* atom_virial, - const FPTYPE* net_deriv, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei); +void prod_virial_a_gpu(FPTYPE* virial, + FPTYPE* atom_virial, + const FPTYPE* net_deriv, + const FPTYPE* env_deriv, + const FPTYPE* rij, + const int* nlist, + const int nloc, + const int nall, + const int nnei); template -void prod_virial_r_gpu_cuda(FPTYPE* virial, - FPTYPE* atom_virial, - const FPTYPE* net_deriv, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei); +void prod_virial_r_gpu(FPTYPE* virial, + FPTYPE* atom_virial, + const FPTYPE* net_deriv, + const FPTYPE* env_deriv, + const FPTYPE* rij, + const int* nlist, + const int nloc, + const int nall, + const int nnei); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM template -void prod_virial_a_gpu_rocm(FPTYPE* virial, - FPTYPE* atom_virial, - const FPTYPE* net_deriv, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei); +void prod_virial_a_gpu(FPTYPE* virial, + FPTYPE* atom_virial, + const FPTYPE* net_deriv, + const FPTYPE* env_deriv, + const FPTYPE* rij, + const int* nlist, + const int nloc, + const int nall, + const int nnei); template -void prod_virial_r_gpu_rocm(FPTYPE* virial, - FPTYPE* atom_virial, - const FPTYPE* net_deriv, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei); +void prod_virial_r_gpu(FPTYPE* virial, + FPTYPE* atom_virial, + const FPTYPE* net_deriv, + const FPTYPE* env_deriv, + const FPTYPE* rij, + const int* nlist, + const int nloc, + const int nall, + const int nnei); #endif // TENSORFLOW_USE_ROCM } // namespace deepmd diff --git a/source/lib/include/prod_virial_grad.h b/source/lib/include/prod_virial_grad.h index d840e6b718..6e0c232f8a 100644 --- a/source/lib/include/prod_virial_grad.h +++ b/source/lib/include/prod_virial_grad.h @@ -23,42 +23,42 @@ void prod_virial_grad_r_cpu(FPTYPE* grad_net, #if GOOGLE_CUDA template -void prod_virial_grad_a_gpu_cuda(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei); +void prod_virial_grad_a_gpu(FPTYPE* grad_net, + const FPTYPE* grad, + const FPTYPE* env_deriv, + const FPTYPE* rij, + const int* nlist, + const int nloc, + const int nnei); template -void prod_virial_grad_r_gpu_cuda(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei); +void prod_virial_grad_r_gpu(FPTYPE* grad_net, + const FPTYPE* grad, + const FPTYPE* env_deriv, + const FPTYPE* rij, + const int* nlist, + const int nloc, + const int nnei); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM template -void prod_virial_grad_a_gpu_rocm(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei); +void prod_virial_grad_a_gpu(FPTYPE* grad_net, + const FPTYPE* grad, + const FPTYPE* env_deriv, + const FPTYPE* rij, + const int* nlist, + const int nloc, + const int nnei); template -void prod_virial_grad_r_gpu_rocm(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei); +void prod_virial_grad_r_gpu(FPTYPE* grad_net, + const FPTYPE* grad, + const FPTYPE* env_deriv, + const FPTYPE* rij, + const int* nlist, + const int nloc, + const int nnei); #endif // TENSORFLOW_USE_ROCM } // namespace deepmd diff --git a/source/lib/include/region.h b/source/lib/include/region.h index 6aaf805ccd..9db2735462 100644 --- a/source/lib/include/region.h +++ b/source/lib/include/region.h @@ -46,16 +46,16 @@ void volume_gpu(FPTYPE* volume, const Region& region); #if TENSORFLOW_USE_ROCM // only for unittest template -void convert_to_inter_gpu_rocm(FPTYPE* ri, - const Region& region, - const FPTYPE* rp); +void convert_to_inter_gpu(FPTYPE* ri, + const Region& region, + const FPTYPE* rp); template -void convert_to_phys_gpu_rocm(FPTYPE* rp, - const Region& region, - const FPTYPE* ri); +void convert_to_phys_gpu(FPTYPE* rp, + const Region& region, + const FPTYPE* ri); template -void volume_gpu_rocm(FPTYPE* volume, const Region& region); +void volume_gpu(FPTYPE* volume, const Region& region); #endif // TENSORFLOW_USE_ROCM } // namespace deepmd diff --git a/source/lib/include/tabulate.h b/source/lib/include/tabulate.h index 2e2c021d9c..96072e6a33 100644 --- a/source/lib/include/tabulate.h +++ b/source/lib/include/tabulate.h @@ -110,216 +110,216 @@ void tabulate_fusion_se_r_grad_grad_cpu(FPTYPE* dz_dy, #if GOOGLE_CUDA template -void tabulate_fusion_se_a_gpu_cuda(FPTYPE* out, +void tabulate_fusion_se_a_gpu(FPTYPE* out, + const FPTYPE* table, + const FPTYPE* table_info, + const FPTYPE* em_x, + const FPTYPE* em, + const FPTYPE* two_embed, + const int nloc, + const int nnei, + const int last_layer_size, + const bool is_sorted = true); + +template +void tabulate_fusion_se_a_grad_gpu(FPTYPE* dy_dem_x, + FPTYPE* dy_dem, const FPTYPE* table, const FPTYPE* table_info, const FPTYPE* em_x, const FPTYPE* em, const FPTYPE* two_embed, + const FPTYPE* dy, const int nloc, const int nnei, const int last_layer_size, const bool is_sorted = true); template -void tabulate_fusion_se_a_grad_gpu_cuda(FPTYPE* dy_dem_x, - FPTYPE* dy_dem, +void tabulate_fusion_se_a_grad_grad_gpu(FPTYPE* dz_dy, const FPTYPE* table, const FPTYPE* table_info, const FPTYPE* em_x, const FPTYPE* em, - const FPTYPE* two_embed, - const FPTYPE* dy, + const FPTYPE* dz_dy_dem_x, + const FPTYPE* dz_dy_dem, const int nloc, const int nnei, const int last_layer_size, const bool is_sorted = true); template -void tabulate_fusion_se_a_grad_grad_gpu_cuda(FPTYPE* dz_dy, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em_x, - const FPTYPE* em, - const FPTYPE* dz_dy_dem_x, - const FPTYPE* dz_dy_dem, - const int nloc, - const int nnei, - const int last_layer_size, - const bool is_sorted = true); +void tabulate_fusion_se_t_gpu(FPTYPE* out, + const FPTYPE* table, + const FPTYPE* table_info, + const FPTYPE* em_x, + const FPTYPE* em, + const int nloc, + const int nnei_i, + const int nnei_j, + const int last_layer_size); template -void tabulate_fusion_se_t_gpu_cuda(FPTYPE* out, +void tabulate_fusion_se_t_grad_gpu(FPTYPE* dy_dem_x, + FPTYPE* dy_dem, const FPTYPE* table, const FPTYPE* table_info, const FPTYPE* em_x, const FPTYPE* em, + const FPTYPE* dy, const int nloc, const int nnei_i, const int nnei_j, const int last_layer_size); template -void tabulate_fusion_se_t_grad_gpu_cuda(FPTYPE* dy_dem_x, - FPTYPE* dy_dem, +void tabulate_fusion_se_t_grad_grad_gpu(FPTYPE* dz_dy, const FPTYPE* table, const FPTYPE* table_info, const FPTYPE* em_x, const FPTYPE* em, - const FPTYPE* dy, + const FPTYPE* dz_dy_dem_x, + const FPTYPE* dz_dy_dem, const int nloc, const int nnei_i, const int nnei_j, const int last_layer_size); template -void tabulate_fusion_se_t_grad_grad_gpu_cuda(FPTYPE* dz_dy, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em_x, - const FPTYPE* em, - const FPTYPE* dz_dy_dem_x, - const FPTYPE* dz_dy_dem, - const int nloc, - const int nnei_i, - const int nnei_j, - const int last_layer_size); +void tabulate_fusion_se_r_gpu(FPTYPE* out, + const FPTYPE* table, + const FPTYPE* table_info, + const FPTYPE* em, + const int nloc, + const int nnei, + const int last_layer_size); template -void tabulate_fusion_se_r_gpu_cuda(FPTYPE* out, +void tabulate_fusion_se_r_grad_gpu(FPTYPE* dy_dem, const FPTYPE* table, const FPTYPE* table_info, const FPTYPE* em, + const FPTYPE* dy, const int nloc, const int nnei, const int last_layer_size); template -void tabulate_fusion_se_r_grad_gpu_cuda(FPTYPE* dy_dem, +void tabulate_fusion_se_r_grad_grad_gpu(FPTYPE* dz_dy, const FPTYPE* table, const FPTYPE* table_info, const FPTYPE* em, - const FPTYPE* dy, + const FPTYPE* dz_dy_dem, const int nloc, const int nnei, const int last_layer_size); - -template -void tabulate_fusion_se_r_grad_grad_gpu_cuda(FPTYPE* dz_dy, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em, - const FPTYPE* dz_dy_dem, - const int nloc, - const int nnei, - const int last_layer_size); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM template -void tabulate_fusion_se_a_gpu_rocm(FPTYPE* out, +void tabulate_fusion_se_a_gpu(FPTYPE* out, + const FPTYPE* table, + const FPTYPE* table_info, + const FPTYPE* em_x, + const FPTYPE* em, + const FPTYPE* two_embed, + const int nloc, + const int nnei, + const int last_layer_size, + const bool is_sorted = true); + +template +void tabulate_fusion_se_a_grad_gpu(FPTYPE* dy_dem_x, + FPTYPE* dy_dem, const FPTYPE* table, const FPTYPE* table_info, const FPTYPE* em_x, const FPTYPE* em, const FPTYPE* two_embed, + const FPTYPE* dy, const int nloc, const int nnei, const int last_layer_size, const bool is_sorted = true); template -void tabulate_fusion_se_a_grad_gpu_rocm(FPTYPE* dy_dem_x, - FPTYPE* dy_dem, +void tabulate_fusion_se_a_grad_grad_gpu(FPTYPE* dz_dy, const FPTYPE* table, const FPTYPE* table_info, const FPTYPE* em_x, const FPTYPE* em, - const FPTYPE* two_embed, - const FPTYPE* dy, + const FPTYPE* dz_dy_dem_x, + const FPTYPE* dz_dy_dem, const int nloc, const int nnei, const int last_layer_size, const bool is_sorted = true); template -void tabulate_fusion_se_a_grad_grad_gpu_rocm(FPTYPE* dz_dy, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em_x, - const FPTYPE* em, - const FPTYPE* dz_dy_dem_x, - const FPTYPE* dz_dy_dem, - const int nloc, - const int nnei, - const int last_layer_size, - const bool is_sorted = true); +void tabulate_fusion_se_t_gpu(FPTYPE* out, + const FPTYPE* table, + const FPTYPE* table_info, + const FPTYPE* em_x, + const FPTYPE* em, + const int nloc, + const int nnei_i, + const int nnei_j, + const int last_layer_size); template -void tabulate_fusion_se_t_gpu_rocm(FPTYPE* out, +void tabulate_fusion_se_t_grad_gpu(FPTYPE* dy_dem_x, + FPTYPE* dy_dem, const FPTYPE* table, const FPTYPE* table_info, const FPTYPE* em_x, const FPTYPE* em, + const FPTYPE* dy, const int nloc, const int nnei_i, const int nnei_j, const int last_layer_size); template -void tabulate_fusion_se_t_grad_gpu_rocm(FPTYPE* dy_dem_x, - FPTYPE* dy_dem, +void tabulate_fusion_se_t_grad_grad_gpu(FPTYPE* dz_dy, const FPTYPE* table, const FPTYPE* table_info, const FPTYPE* em_x, const FPTYPE* em, - const FPTYPE* dy, + const FPTYPE* dz_dy_dem_x, + const FPTYPE* dz_dy_dem, const int nloc, const int nnei_i, const int nnei_j, const int last_layer_size); template -void tabulate_fusion_se_t_grad_grad_gpu_rocm(FPTYPE* dz_dy, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em_x, - const FPTYPE* em, - const FPTYPE* dz_dy_dem_x, - const FPTYPE* dz_dy_dem, - const int nloc, - const int nnei_i, - const int nnei_j, - const int last_layer_size); +void tabulate_fusion_se_r_gpu(FPTYPE* out, + const FPTYPE* table, + const FPTYPE* table_info, + const FPTYPE* em, + const int nloc, + const int nnei, + const int last_layer_size); template -void tabulate_fusion_se_r_gpu_rocm(FPTYPE* out, +void tabulate_fusion_se_r_grad_gpu(FPTYPE* dy_dem, const FPTYPE* table, const FPTYPE* table_info, const FPTYPE* em, + const FPTYPE* dy, const int nloc, const int nnei, const int last_layer_size); template -void tabulate_fusion_se_r_grad_gpu_rocm(FPTYPE* dy_dem, +void tabulate_fusion_se_r_grad_grad_gpu(FPTYPE* dz_dy, const FPTYPE* table, const FPTYPE* table_info, const FPTYPE* em, - const FPTYPE* dy, + const FPTYPE* dz_dy_dem, const int nloc, const int nnei, const int last_layer_size); -template -void tabulate_fusion_se_r_grad_grad_gpu_rocm(FPTYPE* dz_dy, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em, - const FPTYPE* dz_dy_dem, - const int nloc, - const int nnei, - const int last_layer_size); - #endif // TENSORFLOW_USE_ROCM } // namespace deepmd diff --git a/source/lib/src/cuda/gelu.cu b/source/lib/src/cuda/gelu.cu index af78043cca..823a843b2a 100644 --- a/source/lib/src/cuda/gelu.cu +++ b/source/lib/src/cuda/gelu.cu @@ -63,7 +63,7 @@ __global__ void gelu_grad_grad(FPTYPE* out, namespace deepmd { template -void gelu_gpu_cuda(FPTYPE* out, const FPTYPE* xx, const int_64 size) { +void gelu_gpu(FPTYPE* out, const FPTYPE* xx, const int_64 size) { if (size <= 0) { return; } @@ -78,10 +78,10 @@ void gelu_gpu_cuda(FPTYPE* out, const FPTYPE* xx, const int_64 size) { } template -void gelu_grad_gpu_cuda(FPTYPE* out, - const FPTYPE* xx, - const FPTYPE* dy, - const int_64 size) { +void gelu_grad_gpu(FPTYPE* out, + const FPTYPE* xx, + const FPTYPE* dy, + const int_64 size) { if (size <= 0) { return; } @@ -96,11 +96,11 @@ void gelu_grad_gpu_cuda(FPTYPE* out, } template -void gelu_grad_grad_gpu_cuda(FPTYPE* out, - const FPTYPE* xx, - const FPTYPE* dy, - const FPTYPE* dy_2, - const int_64 size) { +void gelu_grad_grad_gpu(FPTYPE* out, + const FPTYPE* xx, + const FPTYPE* dy, + const FPTYPE* dy_2, + const int_64 size) { if (size <= 0) { return; } @@ -114,28 +114,24 @@ void gelu_grad_grad_gpu_cuda(FPTYPE* out, DPErrcheck(cudaDeviceSynchronize()); } -template void gelu_gpu_cuda(float* out, +template void gelu_gpu(float* out, const float* x, const int_64 size); +template void gelu_gpu(double* out, const double* x, const int_64 size); +template void gelu_grad_gpu(float* out, const float* x, + const float* dy, const int_64 size); -template void gelu_gpu_cuda(double* out, +template void gelu_grad_gpu(double* out, const double* x, + const double* dy, const int_64 size); -template void gelu_grad_gpu_cuda(float* out, +template void gelu_grad_grad_gpu(float* out, const float* x, const float* dy, + const float* dy_2, const int_64 size); -template void gelu_grad_gpu_cuda(double* out, +template void gelu_grad_grad_gpu(double* out, const double* x, const double* dy, + const double* dy_2, const int_64 size); -template void gelu_grad_grad_gpu_cuda(float* out, - const float* x, - const float* dy, - const float* dy_2, - const int_64 size); -template void gelu_grad_grad_gpu_cuda(double* out, - const double* x, - const double* dy, - const double* dy_2, - const int_64 size); } // namespace deepmd diff --git a/source/lib/src/cuda/neighbor_list.cu b/source/lib/src/cuda/neighbor_list.cu index 4fae6f3874..7cac07690b 100644 --- a/source/lib/src/cuda/neighbor_list.cu +++ b/source/lib/src/cuda/neighbor_list.cu @@ -294,9 +294,7 @@ __global__ void map_filter_ftype(int *ftype_out, } } -void filter_ftype_gpu_cuda(int *ftype_out, - const int *ftype_in, - const int nloc) { +void filter_ftype_gpu(int *ftype_out, const int *ftype_in, const int nloc) { DPErrcheck(cudaGetLastError()); DPErrcheck(cudaDeviceSynchronize()); int nblock = (nloc + TPB - 1) / TPB; diff --git a/source/lib/src/cuda/prod_env_mat.cu b/source/lib/src/cuda/prod_env_mat.cu index 8a085a47b5..e603b25db7 100644 --- a/source/lib/src/cuda/prod_env_mat.cu +++ b/source/lib/src/cuda/prod_env_mat.cu @@ -558,17 +558,17 @@ __global__ void compute_env_mat_r(FPTYPE* em, namespace deepmd { template -void format_nbor_list_gpu_cuda(int* nlist, - const FPTYPE* coord, - const int* type, - const deepmd::InputNlist& gpu_inlist, - int* array_int, - uint_64* array_longlong, - const int max_nbor_size, - const int nloc, - const int nall, - const float rcut, - const std::vector sec) { +void format_nbor_list_gpu(int* nlist, + const FPTYPE* coord, + const int* type, + const deepmd::InputNlist& gpu_inlist, + int* array_int, + uint_64* array_longlong, + const int max_nbor_size, + const int nloc, + const int nall, + const float rcut, + const std::vector sec) { DPErrcheck(cudaGetLastError()); DPErrcheck(cudaDeviceSynchronize()); const int LEN = 256; @@ -613,24 +613,24 @@ void format_nbor_list_gpu_cuda(int* nlist, } template -void prod_env_mat_a_gpu_cuda(FPTYPE* em, - FPTYPE* em_deriv, - FPTYPE* rij, - int* nlist, - const FPTYPE* coord, - const int* type, - const InputNlist& gpu_inlist, - int* array_int, - uint_64* array_longlong, - const int max_nbor_size, - const FPTYPE* avg, - const FPTYPE* std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec, - const int* f_type) { +void prod_env_mat_a_gpu(FPTYPE* em, + FPTYPE* em_deriv, + FPTYPE* rij, + int* nlist, + const FPTYPE* coord, + const int* type, + const InputNlist& gpu_inlist, + int* array_int, + uint_64* array_longlong, + const int max_nbor_size, + const FPTYPE* avg, + const FPTYPE* std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec, + const int* f_type) { DPErrcheck(cudaGetLastError()); DPErrcheck(cudaDeviceSynchronize()); if (f_type == NULL) { @@ -643,9 +643,8 @@ void prod_env_mat_a_gpu_cuda(FPTYPE* em, cudaMemset(em_deriv, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt * 3)); DPErrcheck(cudaMemset(rij, 0, sizeof(FPTYPE) * int_64(nloc) * nnei * 3)); - format_nbor_list_gpu_cuda(nlist, coord, f_type, gpu_inlist, array_int, - array_longlong, max_nbor_size, nloc, nall, rcut, - sec); + format_nbor_list_gpu(nlist, coord, f_type, gpu_inlist, array_int, + array_longlong, max_nbor_size, nloc, nall, rcut, sec); nborErrcheck(cudaGetLastError()); nborErrcheck(cudaDeviceSynchronize()); @@ -656,23 +655,23 @@ void prod_env_mat_a_gpu_cuda(FPTYPE* em, } template -void prod_env_mat_r_gpu_cuda(FPTYPE* em, - FPTYPE* em_deriv, - FPTYPE* rij, - int* nlist, - const FPTYPE* coord, - const int* type, - const InputNlist& gpu_inlist, - int* array_int, - uint_64* array_longlong, - const int max_nbor_size, - const FPTYPE* avg, - const FPTYPE* std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec) { +void prod_env_mat_r_gpu(FPTYPE* em, + FPTYPE* em_deriv, + FPTYPE* rij, + int* nlist, + const FPTYPE* coord, + const int* type, + const InputNlist& gpu_inlist, + int* array_int, + uint_64* array_longlong, + const int max_nbor_size, + const FPTYPE* avg, + const FPTYPE* std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec) { DPErrcheck(cudaGetLastError()); DPErrcheck(cudaDeviceSynchronize()); const int nnei = sec.back(); @@ -682,9 +681,8 @@ void prod_env_mat_r_gpu_cuda(FPTYPE* em, cudaMemset(em_deriv, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt * 3)); DPErrcheck(cudaMemset(rij, 0, sizeof(FPTYPE) * int_64(nloc) * nnei * 3)); - format_nbor_list_gpu_cuda(nlist, coord, type, gpu_inlist, array_int, - array_longlong, max_nbor_size, nloc, nall, rcut, - sec); + format_nbor_list_gpu(nlist, coord, type, gpu_inlist, array_int, + array_longlong, max_nbor_size, nloc, nall, rcut, sec); nborErrcheck(cudaGetLastError()); nborErrcheck(cudaDeviceSynchronize()); @@ -695,13 +693,13 @@ void prod_env_mat_r_gpu_cuda(FPTYPE* em, } template -void test_encoding_decoding_nbor_info_gpu_cuda(uint_64* key, - int* out_type, - int* out_index, - const int* in_type, - const FPTYPE* in_dist, - const int* in_index, - const int size_of_array) { +void test_encoding_decoding_nbor_info_gpu(uint_64* key, + int* out_type, + int* out_index, + const int* in_type, + const FPTYPE* in_dist, + const int* in_index, + const int size_of_array) { const int nblock = (size_of_array + TPB - 1) / TPB; encoding_decoding_nbor_info<<>>( key, out_type, out_index, in_type, in_dist, in_index, size_of_array); @@ -709,116 +707,110 @@ void test_encoding_decoding_nbor_info_gpu_cuda(uint_64* key, DPErrcheck(cudaDeviceSynchronize()); } -template void prod_env_mat_a_gpu_cuda(float* em, - float* em_deriv, - float* rij, - int* nlist, - const float* coord, - const int* type, - const InputNlist& gpu_inlist, - int* array_int, - unsigned long long* array_longlong, - const int max_nbor_size, - const float* avg, - const float* std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec, - const int* f_type); -template void prod_env_mat_a_gpu_cuda( - double* em, - double* em_deriv, - double* rij, - int* nlist, - const double* coord, - const int* type, - const InputNlist& gpu_inlist, - int* array_int, - unsigned long long* array_longlong, - const int max_nbor_size, - const double* avg, - const double* std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec, - const int* f_type); -template void prod_env_mat_r_gpu_cuda(float* em, - float* em_deriv, - float* rij, - int* nlist, - const float* coord, - const int* type, - const InputNlist& gpu_inlist, - int* array_int, - unsigned long long* array_longlong, - const int max_nbor_size, - const float* avg, - const float* std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec); -template void prod_env_mat_r_gpu_cuda( - double* em, - double* em_deriv, - double* rij, - int* nlist, - const double* coord, - const int* type, - const InputNlist& gpu_inlist, - int* array_int, - unsigned long long* array_longlong, - const int max_nbor_size, - const double* avg, - const double* std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec); -template void format_nbor_list_gpu_cuda( - int* nlist, - const float* coord, - const int* type, - const deepmd::InputNlist& gpu_inlist, - int* array_int, - uint_64* array_longlong, - const int max_nbor_size, - const int nloc, - const int nall, - const float rcut, - const std::vector sec); -template void format_nbor_list_gpu_cuda( - int* nlist, - const double* coord, - const int* type, - const deepmd::InputNlist& gpu_inlist, - int* array_int, - uint_64* array_longlong, - const int max_nbor_size, - const int nloc, - const int nall, - const float rcut, - const std::vector sec); -template void test_encoding_decoding_nbor_info_gpu_cuda( - uint_64* key, - int* out_type, - int* out_index, - const int* in_type, - const float* in_dist, - const int* in_index, - const int size_of_array); -template void test_encoding_decoding_nbor_info_gpu_cuda( - uint_64* key, - int* out_type, - int* out_index, - const int* in_type, - const double* in_dist, - const int* in_index, - const int size_of_array); +template void prod_env_mat_a_gpu(float* em, + float* em_deriv, + float* rij, + int* nlist, + const float* coord, + const int* type, + const InputNlist& gpu_inlist, + int* array_int, + unsigned long long* array_longlong, + const int max_nbor_size, + const float* avg, + const float* std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec, + const int* f_type); +template void prod_env_mat_a_gpu(double* em, + double* em_deriv, + double* rij, + int* nlist, + const double* coord, + const int* type, + const InputNlist& gpu_inlist, + int* array_int, + unsigned long long* array_longlong, + const int max_nbor_size, + const double* avg, + const double* std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec, + const int* f_type); +template void prod_env_mat_r_gpu(float* em, + float* em_deriv, + float* rij, + int* nlist, + const float* coord, + const int* type, + const InputNlist& gpu_inlist, + int* array_int, + unsigned long long* array_longlong, + const int max_nbor_size, + const float* avg, + const float* std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec); +template void prod_env_mat_r_gpu(double* em, + double* em_deriv, + double* rij, + int* nlist, + const double* coord, + const int* type, + const InputNlist& gpu_inlist, + int* array_int, + unsigned long long* array_longlong, + const int max_nbor_size, + const double* avg, + const double* std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec); +template void format_nbor_list_gpu(int* nlist, + const float* coord, + const int* type, + const deepmd::InputNlist& gpu_inlist, + int* array_int, + uint_64* array_longlong, + const int max_nbor_size, + const int nloc, + const int nall, + const float rcut, + const std::vector sec); +template void format_nbor_list_gpu(int* nlist, + const double* coord, + const int* type, + const deepmd::InputNlist& gpu_inlist, + int* array_int, + uint_64* array_longlong, + const int max_nbor_size, + const int nloc, + const int nall, + const float rcut, + const std::vector sec); +template void test_encoding_decoding_nbor_info_gpu(uint_64* key, + int* out_type, + int* out_index, + const int* in_type, + const float* in_dist, + const int* in_index, + const int size_of_array); +template void test_encoding_decoding_nbor_info_gpu(uint_64* key, + int* out_type, + int* out_index, + const int* in_type, + const double* in_dist, + const int* in_index, + const int size_of_array); } // namespace deepmd diff --git a/source/lib/src/cuda/prod_force.cu b/source/lib/src/cuda/prod_force.cu index 04f5b84dcd..d85de26394 100644 --- a/source/lib/src/cuda/prod_force.cu +++ b/source/lib/src/cuda/prod_force.cu @@ -102,14 +102,14 @@ __global__ void force_deriv_wrt_neighbors_r(FPTYPE* force, namespace deepmd { template -void prod_force_a_gpu_cuda(FPTYPE* force, - const FPTYPE* net_deriv, - const FPTYPE* in_deriv, - const int* nlist, - const int nloc, - const int nall, - const int nnei, - const int nframes) { +void prod_force_a_gpu(FPTYPE* force, + const FPTYPE* net_deriv, + const FPTYPE* in_deriv, + const int* nlist, + const int nloc, + const int nall, + const int nnei, + const int nframes) { DPErrcheck(cudaGetLastError()); DPErrcheck(cudaDeviceSynchronize()); const int ndescrpt = nnei * 4; @@ -131,14 +131,14 @@ void prod_force_a_gpu_cuda(FPTYPE* force, } template -void prod_force_r_gpu_cuda(FPTYPE* force, - const FPTYPE* net_deriv, - const FPTYPE* in_deriv, - const int* nlist, - const int nloc, - const int nall, - const int nnei, - const int nframes) { +void prod_force_r_gpu(FPTYPE* force, + const FPTYPE* net_deriv, + const FPTYPE* in_deriv, + const int* nlist, + const int nloc, + const int nall, + const int nnei, + const int nframes) { DPErrcheck(cudaGetLastError()); DPErrcheck(cudaDeviceSynchronize()); const int ndescrpt = nnei * 1; @@ -159,36 +159,36 @@ void prod_force_r_gpu_cuda(FPTYPE* force, DPErrcheck(cudaDeviceSynchronize()); } -template void prod_force_a_gpu_cuda(float* force, - const float* net_deriv, - const float* in_deriv, - const int* nlist, - const int nloc, - const int nall, - const int nnei, - const int nframes); -template void prod_force_a_gpu_cuda(double* force, - const double* net_deriv, - const double* in_deriv, - const int* nlist, - const int nloc, - const int nall, - const int nnei, - const int nframes); -template void prod_force_r_gpu_cuda(float* force, - const float* net_deriv, - const float* in_deriv, - const int* nlist, - const int nloc, - const int nall, - const int nnei, - const int nframes); -template void prod_force_r_gpu_cuda(double* force, - const double* net_deriv, - const double* in_deriv, - const int* nlist, - const int nloc, - const int nall, - const int nnei, - const int nframes); +template void prod_force_a_gpu(float* force, + const float* net_deriv, + const float* in_deriv, + const int* nlist, + const int nloc, + const int nall, + const int nnei, + const int nframes); +template void prod_force_a_gpu(double* force, + const double* net_deriv, + const double* in_deriv, + const int* nlist, + const int nloc, + const int nall, + const int nnei, + const int nframes); +template void prod_force_r_gpu(float* force, + const float* net_deriv, + const float* in_deriv, + const int* nlist, + const int nloc, + const int nall, + const int nnei, + const int nframes); +template void prod_force_r_gpu(double* force, + const double* net_deriv, + const double* in_deriv, + const int* nlist, + const int nloc, + const int nall, + const int nnei, + const int nframes); } // namespace deepmd diff --git a/source/lib/src/cuda/prod_force_grad.cu b/source/lib/src/cuda/prod_force_grad.cu index e72ba2ea48..b54676586c 100644 --- a/source/lib/src/cuda/prod_force_grad.cu +++ b/source/lib/src/cuda/prod_force_grad.cu @@ -81,13 +81,13 @@ __global__ void force_grad_wrt_neighbors_r(FPTYPE* grad_net, namespace deepmd { template -void prod_force_grad_a_gpu_cuda(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const int* nlist, - const int nloc, - const int nnei, - const int nframes) { +void prod_force_grad_a_gpu(FPTYPE* grad_net, + const FPTYPE* grad, + const FPTYPE* env_deriv, + const int* nlist, + const int nloc, + const int nnei, + const int nframes) { DPErrcheck(cudaGetLastError()); DPErrcheck(cudaDeviceSynchronize()); const int ndescrpt = nnei * 4; @@ -112,13 +112,13 @@ void prod_force_grad_a_gpu_cuda(FPTYPE* grad_net, } template -void prod_force_grad_r_gpu_cuda(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const int* nlist, - const int nloc, - const int nnei, - const int nframes) { +void prod_force_grad_r_gpu(FPTYPE* grad_net, + const FPTYPE* grad, + const FPTYPE* env_deriv, + const int* nlist, + const int nloc, + const int nnei, + const int nframes) { DPErrcheck(cudaGetLastError()); DPErrcheck(cudaDeviceSynchronize()); const int ndescrpt = nnei * 1; @@ -142,32 +142,32 @@ void prod_force_grad_r_gpu_cuda(FPTYPE* grad_net, DPErrcheck(cudaDeviceSynchronize()); } -template void prod_force_grad_a_gpu_cuda(float* grad_net, - const float* grad, - const float* env_deriv, - const int* nlist, - const int nloc, - const int nnei, - const int nframes); -template void prod_force_grad_a_gpu_cuda(double* grad_net, - const double* grad, - const double* env_deriv, - const int* nlist, - const int nloc, - const int nnei, - const int nframes); -template void prod_force_grad_r_gpu_cuda(float* grad_net, - const float* grad, - const float* env_deriv, - const int* nlist, - const int nloc, - const int nnei, - const int nframes); -template void prod_force_grad_r_gpu_cuda(double* grad_net, - const double* grad, - const double* env_deriv, - const int* nlist, - const int nloc, - const int nnei, - const int nframes); +template void prod_force_grad_a_gpu(float* grad_net, + const float* grad, + const float* env_deriv, + const int* nlist, + const int nloc, + const int nnei, + const int nframes); +template void prod_force_grad_a_gpu(double* grad_net, + const double* grad, + const double* env_deriv, + const int* nlist, + const int nloc, + const int nnei, + const int nframes); +template void prod_force_grad_r_gpu(float* grad_net, + const float* grad, + const float* env_deriv, + const int* nlist, + const int nloc, + const int nnei, + const int nframes); +template void prod_force_grad_r_gpu(double* grad_net, + const double* grad, + const double* env_deriv, + const int* nlist, + const int nloc, + const int nnei, + const int nframes); } // namespace deepmd diff --git a/source/lib/src/cuda/prod_virial.cu b/source/lib/src/cuda/prod_virial.cu index 618f82625d..e96bacf1d3 100644 --- a/source/lib/src/cuda/prod_virial.cu +++ b/source/lib/src/cuda/prod_virial.cu @@ -104,15 +104,15 @@ __global__ void virial_deriv_wrt_neighbors_r(FPTYPE* virial, namespace deepmd { template -void prod_virial_a_gpu_cuda(FPTYPE* virial, - FPTYPE* atom_virial, - const FPTYPE* net_deriv, - const FPTYPE* in_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei) { +void prod_virial_a_gpu(FPTYPE* virial, + FPTYPE* atom_virial, + const FPTYPE* net_deriv, + const FPTYPE* in_deriv, + const FPTYPE* rij, + const int* nlist, + const int nloc, + const int nall, + const int nnei) { DPErrcheck(cudaGetLastError()); DPErrcheck(cudaDeviceSynchronize()); DPErrcheck(cudaMemset(virial, 0, sizeof(FPTYPE) * 9)); @@ -134,15 +134,15 @@ void prod_virial_a_gpu_cuda(FPTYPE* virial, } template -void prod_virial_r_gpu_cuda(FPTYPE* virial, - FPTYPE* atom_virial, - const FPTYPE* net_deriv, - const FPTYPE* in_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei) { +void prod_virial_r_gpu(FPTYPE* virial, + FPTYPE* atom_virial, + const FPTYPE* net_deriv, + const FPTYPE* in_deriv, + const FPTYPE* rij, + const int* nlist, + const int nloc, + const int nall, + const int nnei) { DPErrcheck(cudaGetLastError()); DPErrcheck(cudaDeviceSynchronize()); DPErrcheck(cudaMemset(virial, 0, sizeof(FPTYPE) * 9)); @@ -163,40 +163,40 @@ void prod_virial_r_gpu_cuda(FPTYPE* virial, DPErrcheck(cudaDeviceSynchronize()); } -template void prod_virial_a_gpu_cuda(float* virial, - float* atom_virial, - const float* net_deriv, - const float* in_deriv, - const float* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei); -template void prod_virial_a_gpu_cuda(double* virial, - double* atom_virial, - const double* net_deriv, - const double* in_deriv, - const double* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei); -template void prod_virial_r_gpu_cuda(float* virial, - float* atom_virial, - const float* net_deriv, - const float* in_deriv, - const float* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei); -template void prod_virial_r_gpu_cuda(double* virial, - double* atom_virial, - const double* net_deriv, - const double* in_deriv, - const double* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei); +template void prod_virial_a_gpu(float* virial, + float* atom_virial, + const float* net_deriv, + const float* in_deriv, + const float* rij, + const int* nlist, + const int nloc, + const int nall, + const int nnei); +template void prod_virial_a_gpu(double* virial, + double* atom_virial, + const double* net_deriv, + const double* in_deriv, + const double* rij, + const int* nlist, + const int nloc, + const int nall, + const int nnei); +template void prod_virial_r_gpu(float* virial, + float* atom_virial, + const float* net_deriv, + const float* in_deriv, + const float* rij, + const int* nlist, + const int nloc, + const int nall, + const int nnei); +template void prod_virial_r_gpu(double* virial, + double* atom_virial, + const double* net_deriv, + const double* in_deriv, + const double* rij, + const int* nlist, + const int nloc, + const int nall, + const int nnei); } // namespace deepmd diff --git a/source/lib/src/cuda/prod_virial_grad.cu b/source/lib/src/cuda/prod_virial_grad.cu index aae7676d3c..047d8ae17f 100644 --- a/source/lib/src/cuda/prod_virial_grad.cu +++ b/source/lib/src/cuda/prod_virial_grad.cu @@ -85,13 +85,13 @@ __global__ void virial_grad_wrt_neighbors_r(FPTYPE* grad_net, namespace deepmd { template -void prod_virial_grad_a_gpu_cuda(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei) { +void prod_virial_grad_a_gpu(FPTYPE* grad_net, + const FPTYPE* grad, + const FPTYPE* env_deriv, + const FPTYPE* rij, + const int* nlist, + const int nloc, + const int nnei) { DPErrcheck(cudaGetLastError()); DPErrcheck(cudaDeviceSynchronize()); const int ndescrpt = nnei * 4; @@ -107,13 +107,13 @@ void prod_virial_grad_a_gpu_cuda(FPTYPE* grad_net, } template -void prod_virial_grad_r_gpu_cuda(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei) { +void prod_virial_grad_r_gpu(FPTYPE* grad_net, + const FPTYPE* grad, + const FPTYPE* env_deriv, + const FPTYPE* rij, + const int* nlist, + const int nloc, + const int nnei) { DPErrcheck(cudaGetLastError()); DPErrcheck(cudaDeviceSynchronize()); const int ndescrpt = nnei; @@ -128,32 +128,32 @@ void prod_virial_grad_r_gpu_cuda(FPTYPE* grad_net, DPErrcheck(cudaDeviceSynchronize()); } -template void prod_virial_grad_a_gpu_cuda(float* grad_net, - const float* grad, - const float* env_deriv, - const float* rij, - const int* nlist, - const int nloc, - const int nnei); -template void prod_virial_grad_a_gpu_cuda(double* grad_net, - const double* grad, - const double* env_deriv, - const double* rij, - const int* nlist, - const int nloc, - const int nnei); -template void prod_virial_grad_r_gpu_cuda(float* grad_net, - const float* grad, - const float* env_deriv, - const float* rij, - const int* nlist, - const int nloc, - const int nnei); -template void prod_virial_grad_r_gpu_cuda(double* grad_net, - const double* grad, - const double* env_deriv, - const double* rij, - const int* nlist, - const int nloc, - const int nnei); +template void prod_virial_grad_a_gpu(float* grad_net, + const float* grad, + const float* env_deriv, + const float* rij, + const int* nlist, + const int nloc, + const int nnei); +template void prod_virial_grad_a_gpu(double* grad_net, + const double* grad, + const double* env_deriv, + const double* rij, + const int* nlist, + const int nloc, + const int nnei); +template void prod_virial_grad_r_gpu(float* grad_net, + const float* grad, + const float* env_deriv, + const float* rij, + const int* nlist, + const int nloc, + const int nnei); +template void prod_virial_grad_r_gpu(double* grad_net, + const double* grad, + const double* env_deriv, + const double* rij, + const int* nlist, + const int nloc, + const int nnei); } // namespace deepmd diff --git a/source/lib/src/cuda/tabulate.cu b/source/lib/src/cuda/tabulate.cu index 92f77ed63b..30695a6e05 100644 --- a/source/lib/src/cuda/tabulate.cu +++ b/source/lib/src/cuda/tabulate.cu @@ -621,16 +621,16 @@ __global__ void tabulate_fusion_se_r_grad_grad_fifth_order_polynomial( namespace deepmd { template -void tabulate_fusion_se_a_gpu_cuda(FPTYPE* out, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em_x, - const FPTYPE* em, - const FPTYPE* two_embed, - const int nloc, - const int nnei, - const int last_layer_size, - const bool is_sorted) { +void tabulate_fusion_se_a_gpu(FPTYPE* out, + const FPTYPE* table, + const FPTYPE* table_info, + const FPTYPE* em_x, + const FPTYPE* em, + const FPTYPE* two_embed, + const int nloc, + const int nnei, + const int last_layer_size, + const bool is_sorted) { if (nloc <= 0) { return; } @@ -646,18 +646,18 @@ void tabulate_fusion_se_a_gpu_cuda(FPTYPE* out, } template -void tabulate_fusion_se_a_grad_gpu_cuda(FPTYPE* dy_dem_x, - FPTYPE* dy_dem, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em_x, - const FPTYPE* em, - const FPTYPE* two_embed, - const FPTYPE* dy, - const int nloc, - const int nnei, - const int last_layer_size, - const bool is_sorted) { +void tabulate_fusion_se_a_grad_gpu(FPTYPE* dy_dem_x, + FPTYPE* dy_dem, + const FPTYPE* table, + const FPTYPE* table_info, + const FPTYPE* em_x, + const FPTYPE* em, + const FPTYPE* two_embed, + const FPTYPE* dy, + const int nloc, + const int nnei, + const int last_layer_size, + const bool is_sorted) { if (nloc <= 0) { return; } @@ -676,17 +676,17 @@ void tabulate_fusion_se_a_grad_gpu_cuda(FPTYPE* dy_dem_x, } template -void tabulate_fusion_se_a_grad_grad_gpu_cuda(FPTYPE* dz_dy, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em_x, - const FPTYPE* em, - const FPTYPE* dz_dy_dem_x, - const FPTYPE* dz_dy_dem, - const int nloc, - const int nnei, - const int last_layer_size, - const bool is_sorted) { +void tabulate_fusion_se_a_grad_grad_gpu(FPTYPE* dz_dy, + const FPTYPE* table, + const FPTYPE* table_info, + const FPTYPE* em_x, + const FPTYPE* em, + const FPTYPE* dz_dy_dem_x, + const FPTYPE* dz_dy_dem, + const int nloc, + const int nnei, + const int last_layer_size, + const bool is_sorted) { if (nloc <= 0) { return; } @@ -703,15 +703,15 @@ void tabulate_fusion_se_a_grad_grad_gpu_cuda(FPTYPE* dz_dy, } template -void tabulate_fusion_se_t_gpu_cuda(FPTYPE* out, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em_x, - const FPTYPE* em, - const int nloc, - const int nnei_i, - const int nnei_j, - const int last_layer_size) { +void tabulate_fusion_se_t_gpu(FPTYPE* out, + const FPTYPE* table, + const FPTYPE* table_info, + const FPTYPE* em_x, + const FPTYPE* em, + const int nloc, + const int nnei_i, + const int nnei_j, + const int last_layer_size) { if (nloc <= 0) { return; } @@ -726,17 +726,17 @@ void tabulate_fusion_se_t_gpu_cuda(FPTYPE* out, } template -void tabulate_fusion_se_t_grad_gpu_cuda(FPTYPE* dy_dem_x, - FPTYPE* dy_dem, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em_x, - const FPTYPE* em, - const FPTYPE* dy, - const int nloc, - const int nnei_i, - const int nnei_j, - const int last_layer_size) { +void tabulate_fusion_se_t_grad_gpu(FPTYPE* dy_dem_x, + FPTYPE* dy_dem, + const FPTYPE* table, + const FPTYPE* table_info, + const FPTYPE* em_x, + const FPTYPE* em, + const FPTYPE* dy, + const int nloc, + const int nnei_i, + const int nnei_j, + const int last_layer_size) { if (nloc <= 0) { return; } @@ -755,17 +755,17 @@ void tabulate_fusion_se_t_grad_gpu_cuda(FPTYPE* dy_dem_x, } template -void tabulate_fusion_se_t_grad_grad_gpu_cuda(FPTYPE* dz_dy, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em_x, - const FPTYPE* em, - const FPTYPE* dz_dy_dem_x, - const FPTYPE* dz_dy_dem, - const int nloc, - const int nnei_i, - const int nnei_j, - const int last_layer_size) { +void tabulate_fusion_se_t_grad_grad_gpu(FPTYPE* dz_dy, + const FPTYPE* table, + const FPTYPE* table_info, + const FPTYPE* em_x, + const FPTYPE* em, + const FPTYPE* dz_dy_dem_x, + const FPTYPE* dz_dy_dem, + const int nloc, + const int nnei_i, + const int nnei_j, + const int last_layer_size) { if (nloc <= 0) { return; } @@ -783,13 +783,13 @@ void tabulate_fusion_se_t_grad_grad_gpu_cuda(FPTYPE* dz_dy, } template -void tabulate_fusion_se_r_gpu_cuda(FPTYPE* out, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em, - const int nloc, - const int nnei, - const int last_layer_size) { +void tabulate_fusion_se_r_gpu(FPTYPE* out, + const FPTYPE* table, + const FPTYPE* table_info, + const FPTYPE* em, + const int nloc, + const int nnei, + const int last_layer_size) { if (nloc <= 0) { return; } @@ -804,14 +804,14 @@ void tabulate_fusion_se_r_gpu_cuda(FPTYPE* out, } template -void tabulate_fusion_se_r_grad_gpu_cuda(FPTYPE* dy_dem, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em, - const FPTYPE* dy, - const int nloc, - const int nnei, - const int last_layer_size) { +void tabulate_fusion_se_r_grad_gpu(FPTYPE* dy_dem, + const FPTYPE* table, + const FPTYPE* table_info, + const FPTYPE* em, + const FPTYPE* dy, + const int nloc, + const int nnei, + const int last_layer_size) { if (nloc <= 0) { return; } @@ -828,14 +828,14 @@ void tabulate_fusion_se_r_grad_gpu_cuda(FPTYPE* dy_dem, } template -void tabulate_fusion_se_r_grad_grad_gpu_cuda(FPTYPE* dz_dy, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em, - const FPTYPE* dz_dy_dem, - const int nloc, - const int nnei, - const int last_layer_size) { +void tabulate_fusion_se_r_grad_grad_gpu(FPTYPE* dz_dy, + const FPTYPE* table, + const FPTYPE* table_info, + const FPTYPE* em, + const FPTYPE* dz_dy_dem, + const int nloc, + const int nnei, + const int last_layer_size) { if (nloc <= 0) { return; } @@ -851,53 +851,51 @@ void tabulate_fusion_se_r_grad_grad_gpu_cuda(FPTYPE* dz_dy, DPErrcheck(cudaDeviceSynchronize()); } -template void tabulate_fusion_se_a_gpu_cuda(float* out, +template void tabulate_fusion_se_a_gpu(float* out, + const float* table, + const float* table_info, + const float* em_x, + const float* em, + const float* two_embed, + const int nloc, + const int nnei, + const int last_layer_size, + const bool is_sorted); +template void tabulate_fusion_se_a_gpu(double* out, + const double* table, + const double* table_info, + const double* em_x, + const double* em, + const double* two_embed, + const int nloc, + const int nnei, + const int last_layer_size, + const bool is_sorted); +template void tabulate_fusion_se_a_grad_gpu(float* dy_dem_x, + float* dy_dem, const float* table, const float* table_info, const float* em_x, const float* em, const float* two_embed, + const float* dy, const int nloc, const int nnei, const int last_layer_size, const bool is_sorted); -template void tabulate_fusion_se_a_gpu_cuda(double* out, +template void tabulate_fusion_se_a_grad_gpu(double* dy_dem_x, + double* dy_dem, const double* table, const double* table_info, const double* em_x, const double* em, const double* two_embed, + const double* dy, const int nloc, const int nnei, const int last_layer_size, const bool is_sorted); -template void tabulate_fusion_se_a_grad_gpu_cuda( - float* dy_dem_x, - float* dy_dem, - const float* table, - const float* table_info, - const float* em_x, - const float* em, - const float* two_embed, - const float* dy, - const int nloc, - const int nnei, - const int last_layer_size, - const bool is_sorted); -template void tabulate_fusion_se_a_grad_gpu_cuda( - double* dy_dem_x, - double* dy_dem, - const double* table, - const double* table_info, - const double* em_x, - const double* em, - const double* two_embed, - const double* dy, - const int nloc, - const int nnei, - const int last_layer_size, - const bool is_sorted); -template void tabulate_fusion_se_a_grad_grad_gpu_cuda( +template void tabulate_fusion_se_a_grad_grad_gpu( float* dz_dy, const float* table, const float* table_info, @@ -909,7 +907,7 @@ template void tabulate_fusion_se_a_grad_grad_gpu_cuda( const int nnei, const int last_layer_size, const bool is_sorted); -template void tabulate_fusion_se_a_grad_grad_gpu_cuda( +template void tabulate_fusion_se_a_grad_grad_gpu( double* dz_dy, const double* table, const double* table_info, @@ -922,49 +920,47 @@ template void tabulate_fusion_se_a_grad_grad_gpu_cuda( const int last_layer_size, const bool is_sorted); -template void tabulate_fusion_se_t_gpu_cuda(float* out, +template void tabulate_fusion_se_t_gpu(float* out, + const float* table, + const float* table_info, + const float* em_x, + const float* em, + const int nloc, + const int nnei_i, + const int nnei_j, + const int last_layer_size); +template void tabulate_fusion_se_t_gpu(double* out, + const double* table, + const double* table_info, + const double* em_x, + const double* em, + const int nloc, + const int nnei_i, + const int nnei_j, + const int last_layer_size); +template void tabulate_fusion_se_t_grad_gpu(float* dy_dem_x, + float* dy_dem, const float* table, const float* table_info, const float* em_x, const float* em, + const float* dy, const int nloc, const int nnei_i, const int nnei_j, const int last_layer_size); -template void tabulate_fusion_se_t_gpu_cuda(double* out, +template void tabulate_fusion_se_t_grad_gpu(double* dy_dem_x, + double* dy_dem, const double* table, const double* table_info, const double* em_x, const double* em, + const double* dy, const int nloc, const int nnei_i, const int nnei_j, const int last_layer_size); -template void tabulate_fusion_se_t_grad_gpu_cuda( - float* dy_dem_x, - float* dy_dem, - const float* table, - const float* table_info, - const float* em_x, - const float* em, - const float* dy, - const int nloc, - const int nnei_i, - const int nnei_j, - const int last_layer_size); -template void tabulate_fusion_se_t_grad_gpu_cuda( - double* dy_dem_x, - double* dy_dem, - const double* table, - const double* table_info, - const double* em_x, - const double* em, - const double* dy, - const int nloc, - const int nnei_i, - const int nnei_j, - const int last_layer_size); -template void tabulate_fusion_se_t_grad_grad_gpu_cuda( +template void tabulate_fusion_se_t_grad_grad_gpu( float* dz_dy, const float* table, const float* table_info, @@ -976,7 +972,7 @@ template void tabulate_fusion_se_t_grad_grad_gpu_cuda( const int nnei_i, const int nnei_j, const int last_layer_size); -template void tabulate_fusion_se_t_grad_grad_gpu_cuda( +template void tabulate_fusion_se_t_grad_grad_gpu( double* dz_dy, const double* table, const double* table_info, @@ -989,39 +985,37 @@ template void tabulate_fusion_se_t_grad_grad_gpu_cuda( const int nnei_j, const int last_layer_size); -template void tabulate_fusion_se_r_gpu_cuda(float* out, +template void tabulate_fusion_se_r_gpu(float* out, + const float* table, + const float* table_info, + const float* em, + const int nloc, + const int nnei, + const int last_layer_size); +template void tabulate_fusion_se_r_gpu(double* out, + const double* table, + const double* table_info, + const double* em, + const int nloc, + const int nnei, + const int last_layer_size); +template void tabulate_fusion_se_r_grad_gpu(float* dy_dem, const float* table, const float* table_info, const float* em, + const float* dy, const int nloc, const int nnei, const int last_layer_size); -template void tabulate_fusion_se_r_gpu_cuda(double* out, +template void tabulate_fusion_se_r_grad_gpu(double* dy_dem, const double* table, const double* table_info, const double* em, + const double* dy, const int nloc, const int nnei, const int last_layer_size); -template void tabulate_fusion_se_r_grad_gpu_cuda( - float* dy_dem, - const float* table, - const float* table_info, - const float* em, - const float* dy, - const int nloc, - const int nnei, - const int last_layer_size); -template void tabulate_fusion_se_r_grad_gpu_cuda( - double* dy_dem, - const double* table, - const double* table_info, - const double* em, - const double* dy, - const int nloc, - const int nnei, - const int last_layer_size); -template void tabulate_fusion_se_r_grad_grad_gpu_cuda( +template void tabulate_fusion_se_r_grad_grad_gpu( float* dz_dy, const float* table, const float* table_info, @@ -1030,7 +1024,7 @@ template void tabulate_fusion_se_r_grad_grad_gpu_cuda( const int nloc, const int nnei, const int last_layer_size); -template void tabulate_fusion_se_r_grad_grad_gpu_cuda( +template void tabulate_fusion_se_r_grad_grad_gpu( double* dz_dy, const double* table, const double* table_info, diff --git a/source/lib/src/rocm/coord.hip.cu b/source/lib/src/rocm/coord.hip.cu index 198ef87311..5416022575 100644 --- a/source/lib/src/rocm/coord.hip.cu +++ b/source/lib/src/rocm/coord.hip.cu @@ -335,9 +335,9 @@ void copy_coord(FPTYPE *out_c, namespace deepmd { template -void normalize_coord_gpu_rocm(FPTYPE *coord, - const int natom, - const Region ®ion) { +void normalize_coord_gpu(FPTYPE *coord, + const int natom, + const Region ®ion) { const FPTYPE *boxt = region.boxt; const FPTYPE *rec_boxt = region.rec_boxt; const int nblock = (natom + TPB - 1) / TPB; @@ -348,19 +348,19 @@ void normalize_coord_gpu_rocm(FPTYPE *coord, } template -int copy_coord_gpu_rocm(FPTYPE *out_c, - int *out_t, - int *mapping, - int *nall, - int *int_data, - const FPTYPE *in_c, - const int *in_t, - const int &nloc, - const int &mem_nall, - const int &loc_cellnum, - const int &total_cellnum, - const int *cell_info, - const Region ®ion) { +int copy_coord_gpu(FPTYPE *out_c, + int *out_t, + int *mapping, + int *nall, + int *int_data, + const FPTYPE *in_c, + const int *in_t, + const int &nloc, + const int &mem_nall, + const int &loc_cellnum, + const int &total_cellnum, + const int *cell_info, + const Region ®ion) { compute_int_data(int_data, in_c, cell_info, region, nloc, loc_cellnum, total_cellnum); int *int_data_cpu = new int @@ -409,36 +409,36 @@ int copy_coord_gpu_rocm(FPTYPE *out_c, return 0; } -template void normalize_coord_gpu_rocm(float *coord, - const int natom, - const Region ®ion); -template void normalize_coord_gpu_rocm(double *coord, - const int natom, - const Region ®ion); -template int copy_coord_gpu_rocm(float *out_c, - int *out_t, - int *mapping, - int *nall, - int *int_data, - const float *in_c, - const int *in_t, - const int &nloc, - const int &mem_nall, - const int &loc_cellnum, - const int &total_cellnum, - const int *cell_info, - const Region ®ion); -template int copy_coord_gpu_rocm(double *out_c, - int *out_t, - int *mapping, - int *nall, - int *int_data, - const double *in_c, - const int *in_t, - const int &nloc, - const int &mem_nall, - const int &loc_cellnum, - const int &total_cellnum, - const int *cell_info, - const Region ®ion); +template void normalize_coord_gpu(float *coord, + const int natom, + const Region ®ion); +template void normalize_coord_gpu(double *coord, + const int natom, + const Region ®ion); +template int copy_coord_gpu(float *out_c, + int *out_t, + int *mapping, + int *nall, + int *int_data, + const float *in_c, + const int *in_t, + const int &nloc, + const int &mem_nall, + const int &loc_cellnum, + const int &total_cellnum, + const int *cell_info, + const Region ®ion); +template int copy_coord_gpu(double *out_c, + int *out_t, + int *mapping, + int *nall, + int *int_data, + const double *in_c, + const int *in_t, + const int &nloc, + const int &mem_nall, + const int &loc_cellnum, + const int &total_cellnum, + const int *cell_info, + const Region ®ion); } // namespace deepmd diff --git a/source/lib/src/rocm/gelu.hip.cu b/source/lib/src/rocm/gelu.hip.cu index 7dfcb45870..76657eea52 100644 --- a/source/lib/src/rocm/gelu.hip.cu +++ b/source/lib/src/rocm/gelu.hip.cu @@ -64,7 +64,7 @@ __global__ void gelu_grad_grad(FPTYPE* out, namespace deepmd { template -void gelu_gpu_rocm(FPTYPE* out, const FPTYPE* xx, const int_64 size) { +void gelu_gpu(FPTYPE* out, const FPTYPE* xx, const int_64 size) { if (size <= 0) { return; } @@ -77,10 +77,10 @@ void gelu_gpu_rocm(FPTYPE* out, const FPTYPE* xx, const int_64 size) { } template -void gelu_grad_gpu_rocm(FPTYPE* out, - const FPTYPE* xx, - const FPTYPE* dy, - const int_64 size) { +void gelu_grad_gpu(FPTYPE* out, + const FPTYPE* xx, + const FPTYPE* dy, + const int_64 size) { if (size <= 0) { return; } @@ -94,11 +94,11 @@ void gelu_grad_gpu_rocm(FPTYPE* out, } template -void gelu_grad_grad_gpu_rocm(FPTYPE* out, - const FPTYPE* xx, - const FPTYPE* dy, - const FPTYPE* dy_2, - const int_64 size) { +void gelu_grad_grad_gpu(FPTYPE* out, + const FPTYPE* xx, + const FPTYPE* dy, + const FPTYPE* dy_2, + const int_64 size) { if (size <= 0) { return; } @@ -111,28 +111,24 @@ void gelu_grad_grad_gpu_rocm(FPTYPE* out, DPErrcheck(hipDeviceSynchronize()); } -template void gelu_gpu_rocm(float* out, +template void gelu_gpu(float* out, const float* x, const int_64 size); +template void gelu_gpu(double* out, const double* x, const int_64 size); +template void gelu_grad_gpu(float* out, const float* x, + const float* dy, const int_64 size); -template void gelu_gpu_rocm(double* out, +template void gelu_grad_gpu(double* out, const double* x, + const double* dy, const int_64 size); -template void gelu_grad_gpu_rocm(float* out, +template void gelu_grad_grad_gpu(float* out, const float* x, const float* dy, + const float* dy_2, const int_64 size); -template void gelu_grad_gpu_rocm(double* out, +template void gelu_grad_grad_gpu(double* out, const double* x, const double* dy, + const double* dy_2, const int_64 size); -template void gelu_grad_grad_gpu_rocm(float* out, - const float* x, - const float* dy, - const float* dy_2, - const int_64 size); -template void gelu_grad_grad_gpu_rocm(double* out, - const double* x, - const double* dy, - const double* dy_2, - const int_64 size); } // namespace deepmd diff --git a/source/lib/src/rocm/neighbor_list.hip.cu b/source/lib/src/rocm/neighbor_list.hip.cu index 34043233ab..736f2f9e9a 100644 --- a/source/lib/src/rocm/neighbor_list.hip.cu +++ b/source/lib/src/rocm/neighbor_list.hip.cu @@ -175,14 +175,14 @@ __global__ void map_nei_info_noconvert(int *nlist, namespace deepmd { template -int build_nlist_gpu_rocm(InputNlist &nlist, - int *max_list_size, - int *nlist_data, - const FPTYPE *c_cpy, - const int &nloc, - const int &nall, - const int &mem_size, - const float &rcut) { +int build_nlist_gpu(InputNlist &nlist, + int *max_list_size, + int *nlist_data, + const FPTYPE *c_cpy, + const int &nloc, + const int &nall, + const int &mem_size, + const float &rcut) { if (mem_size < nall) { return 1; } @@ -237,15 +237,15 @@ void use_nlist_map(int *nlist, DPErrcheck(hipDeviceSynchronize()); } -void use_nei_info_gpu_rocm(int *nlist, - int *ntype, - bool *nmask, - const int *type, - const int *nlist_map, - const int nloc, - const int nnei, - const int ntypes, - const bool b_nlist_map) { +void use_nei_info_gpu(int *nlist, + int *ntype, + bool *nmask, + const int *type, + const int *nlist_map, + const int nloc, + const int nnei, + const int ntypes, + const bool b_nlist_map) { int nblock = (nnei + TPB - 1) / TPB; dim3 block_grid(nloc, nblock); dim3 thread_grid(1, TPB); @@ -262,22 +262,22 @@ void use_nei_info_gpu_rocm(int *nlist, DPErrcheck(hipDeviceSynchronize()); } -template int build_nlist_gpu_rocm(InputNlist &nlist, - int *max_list_size, - int *nlist_data, - const float *c_cpy, - const int &nloc, - const int &nall, - const int &mem_size, - const float &rcut); -template int build_nlist_gpu_rocm(InputNlist &nlist, - int *max_list_size, - int *nlist_data, - const double *c_cpy, - const int &nloc, - const int &nall, - const int &mem_size, - const float &rcut); +template int build_nlist_gpu(InputNlist &nlist, + int *max_list_size, + int *nlist_data, + const float *c_cpy, + const int &nloc, + const int &nall, + const int &mem_size, + const float &rcut); +template int build_nlist_gpu(InputNlist &nlist, + int *max_list_size, + int *nlist_data, + const double *c_cpy, + const int &nloc, + const int &nall, + const int &mem_size, + const float &rcut); __global__ void map_filter_ftype(int *ftype_out, const int *ftype_in, const int nloc) { @@ -287,9 +287,7 @@ __global__ void map_filter_ftype(int *ftype_out, } } -void filter_ftype_gpu_rocm(int *ftype_out, - const int *ftype_in, - const int nloc) { +void filter_ftype_gpu(int *ftype_out, const int *ftype_in, const int nloc) { int nblock = (nloc + TPB - 1) / TPB; map_filter_ftype<<>>(ftype_out, ftype_in, nloc); DPErrcheck(hipGetLastError()); diff --git a/source/lib/src/rocm/prod_env_mat.hip.cu b/source/lib/src/rocm/prod_env_mat.hip.cu index c2bfbd3cac..23e8ce1d0e 100644 --- a/source/lib/src/rocm/prod_env_mat.hip.cu +++ b/source/lib/src/rocm/prod_env_mat.hip.cu @@ -565,17 +565,17 @@ __global__ void compute_env_mat_r(FPTYPE* em, namespace deepmd { template -void format_nbor_list_gpu_rocm(int* nlist, - const FPTYPE* coord, - const int* type, - const deepmd::InputNlist& gpu_inlist, - int* array_int, - uint_64* array_longlong, - const int max_nbor_size, - const int nloc, - const int nall, - const float rcut, - const std::vector sec) { +void format_nbor_list_gpu(int* nlist, + const FPTYPE* coord, + const int* type, + const deepmd::InputNlist& gpu_inlist, + int* array_int, + uint_64* array_longlong, + const int max_nbor_size, + const int nloc, + const int nall, + const float rcut, + const std::vector sec) { const int LEN = 256; const int nnei = sec.back(); const int nblock = (nloc + LEN - 1) / LEN; @@ -619,24 +619,24 @@ void format_nbor_list_gpu_rocm(int* nlist, } template -void prod_env_mat_a_gpu_rocm(FPTYPE* em, - FPTYPE* em_deriv, - FPTYPE* rij, - int* nlist, - const FPTYPE* coord, - const int* type, - const InputNlist& gpu_inlist, - int* array_int, - uint_64* array_longlong, - const int max_nbor_size, - const FPTYPE* avg, - const FPTYPE* std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec, - const int* f_type) { +void prod_env_mat_a_gpu(FPTYPE* em, + FPTYPE* em_deriv, + FPTYPE* rij, + int* nlist, + const FPTYPE* coord, + const int* type, + const InputNlist& gpu_inlist, + int* array_int, + uint_64* array_longlong, + const int max_nbor_size, + const FPTYPE* avg, + const FPTYPE* std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec, + const int* f_type) { if (f_type == NULL) { f_type = type; } @@ -647,9 +647,8 @@ void prod_env_mat_a_gpu_rocm(FPTYPE* em, hipMemset(em_deriv, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt * 3)); DPErrcheck(hipMemset(rij, 0, sizeof(FPTYPE) * int_64(nloc) * nnei * 3)); - format_nbor_list_gpu_rocm(nlist, coord, f_type, gpu_inlist, array_int, - array_longlong, max_nbor_size, nloc, nall, rcut, - sec); + format_nbor_list_gpu(nlist, coord, f_type, gpu_inlist, array_int, + array_longlong, max_nbor_size, nloc, nall, rcut, sec); nborErrcheck(hipGetLastError()); nborErrcheck(hipDeviceSynchronize()); @@ -661,23 +660,23 @@ void prod_env_mat_a_gpu_rocm(FPTYPE* em, } template -void prod_env_mat_r_gpu_rocm(FPTYPE* em, - FPTYPE* em_deriv, - FPTYPE* rij, - int* nlist, - const FPTYPE* coord, - const int* type, - const InputNlist& gpu_inlist, - int* array_int, - uint_64* array_longlong, - const int max_nbor_size, - const FPTYPE* avg, - const FPTYPE* std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec) { +void prod_env_mat_r_gpu(FPTYPE* em, + FPTYPE* em_deriv, + FPTYPE* rij, + int* nlist, + const FPTYPE* coord, + const int* type, + const InputNlist& gpu_inlist, + int* array_int, + uint_64* array_longlong, + const int max_nbor_size, + const FPTYPE* avg, + const FPTYPE* std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec) { const int nnei = sec.back(); const int ndescrpt = nnei * 1; DPErrcheck(hipMemset(em, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt)); @@ -685,9 +684,8 @@ void prod_env_mat_r_gpu_rocm(FPTYPE* em, hipMemset(em_deriv, 0, sizeof(FPTYPE) * int_64(nloc) * ndescrpt * 3)); DPErrcheck(hipMemset(rij, 0, sizeof(FPTYPE) * int_64(nloc) * nnei * 3)); - format_nbor_list_gpu_rocm(nlist, coord, type, gpu_inlist, array_int, - array_longlong, max_nbor_size, nloc, nall, rcut, - sec); + format_nbor_list_gpu(nlist, coord, type, gpu_inlist, array_int, + array_longlong, max_nbor_size, nloc, nall, rcut, sec); nborErrcheck(hipGetLastError()); nborErrcheck(hipDeviceSynchronize()); @@ -699,13 +697,13 @@ void prod_env_mat_r_gpu_rocm(FPTYPE* em, } template -void test_encoding_decoding_nbor_info_gpu_rocm(uint_64* key, - int* out_type, - int* out_index, - const int* in_type, - const FPTYPE* in_dist, - const int* in_index, - const int size_of_array) { +void test_encoding_decoding_nbor_info_gpu(uint_64* key, + int* out_type, + int* out_index, + const int* in_type, + const FPTYPE* in_dist, + const int* in_index, + const int size_of_array) { const int nblock = (size_of_array + TPB - 1) / TPB; hipLaunchKernelGGL(encoding_decoding_nbor_info, nblock, TPB, 0, 0, key, out_type, out_index, in_type, in_dist, in_index, @@ -714,116 +712,110 @@ void test_encoding_decoding_nbor_info_gpu_rocm(uint_64* key, DPErrcheck(hipDeviceSynchronize()); } -template void prod_env_mat_a_gpu_rocm(float* em, - float* em_deriv, - float* rij, - int* nlist, - const float* coord, - const int* type, - const InputNlist& gpu_inlist, - int* array_int, - unsigned long long* array_longlong, - const int max_nbor_size, - const float* avg, - const float* std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec, - const int* f_type); -template void prod_env_mat_a_gpu_rocm( - double* em, - double* em_deriv, - double* rij, - int* nlist, - const double* coord, - const int* type, - const InputNlist& gpu_inlist, - int* array_int, - unsigned long long* array_longlong, - const int max_nbor_size, - const double* avg, - const double* std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec, - const int* f_type); -template void prod_env_mat_r_gpu_rocm(float* em, - float* em_deriv, - float* rij, - int* nlist, - const float* coord, - const int* type, - const InputNlist& gpu_inlist, - int* array_int, - unsigned long long* array_longlong, - const int max_nbor_size, - const float* avg, - const float* std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec); -template void prod_env_mat_r_gpu_rocm( - double* em, - double* em_deriv, - double* rij, - int* nlist, - const double* coord, - const int* type, - const InputNlist& gpu_inlist, - int* array_int, - unsigned long long* array_longlong, - const int max_nbor_size, - const double* avg, - const double* std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec); -template void format_nbor_list_gpu_rocm( - int* nlist, - const float* coord, - const int* type, - const deepmd::InputNlist& gpu_inlist, - int* array_int, - uint_64* array_longlong, - const int max_nbor_size, - const int nloc, - const int nall, - const float rcut, - const std::vector sec); -template void format_nbor_list_gpu_rocm( - int* nlist, - const double* coord, - const int* type, - const deepmd::InputNlist& gpu_inlist, - int* array_int, - uint_64* array_longlong, - const int max_nbor_size, - const int nloc, - const int nall, - const float rcut, - const std::vector sec); -template void test_encoding_decoding_nbor_info_gpu_rocm( - uint_64* key, - int* out_type, - int* out_index, - const int* in_type, - const float* in_dist, - const int* in_index, - const int size_of_array); -template void test_encoding_decoding_nbor_info_gpu_rocm( - uint_64* key, - int* out_type, - int* out_index, - const int* in_type, - const double* in_dist, - const int* in_index, - const int size_of_array); +template void prod_env_mat_a_gpu(float* em, + float* em_deriv, + float* rij, + int* nlist, + const float* coord, + const int* type, + const InputNlist& gpu_inlist, + int* array_int, + unsigned long long* array_longlong, + const int max_nbor_size, + const float* avg, + const float* std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec, + const int* f_type); +template void prod_env_mat_a_gpu(double* em, + double* em_deriv, + double* rij, + int* nlist, + const double* coord, + const int* type, + const InputNlist& gpu_inlist, + int* array_int, + unsigned long long* array_longlong, + const int max_nbor_size, + const double* avg, + const double* std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec, + const int* f_type); +template void prod_env_mat_r_gpu(float* em, + float* em_deriv, + float* rij, + int* nlist, + const float* coord, + const int* type, + const InputNlist& gpu_inlist, + int* array_int, + unsigned long long* array_longlong, + const int max_nbor_size, + const float* avg, + const float* std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec); +template void prod_env_mat_r_gpu(double* em, + double* em_deriv, + double* rij, + int* nlist, + const double* coord, + const int* type, + const InputNlist& gpu_inlist, + int* array_int, + unsigned long long* array_longlong, + const int max_nbor_size, + const double* avg, + const double* std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec); +template void format_nbor_list_gpu(int* nlist, + const float* coord, + const int* type, + const deepmd::InputNlist& gpu_inlist, + int* array_int, + uint_64* array_longlong, + const int max_nbor_size, + const int nloc, + const int nall, + const float rcut, + const std::vector sec); +template void format_nbor_list_gpu(int* nlist, + const double* coord, + const int* type, + const deepmd::InputNlist& gpu_inlist, + int* array_int, + uint_64* array_longlong, + const int max_nbor_size, + const int nloc, + const int nall, + const float rcut, + const std::vector sec); +template void test_encoding_decoding_nbor_info_gpu(uint_64* key, + int* out_type, + int* out_index, + const int* in_type, + const float* in_dist, + const int* in_index, + const int size_of_array); +template void test_encoding_decoding_nbor_info_gpu(uint_64* key, + int* out_type, + int* out_index, + const int* in_type, + const double* in_dist, + const int* in_index, + const int size_of_array); } // namespace deepmd diff --git a/source/lib/src/rocm/prod_force.hip.cu b/source/lib/src/rocm/prod_force.hip.cu index bc4fa15078..5b1f91dd49 100644 --- a/source/lib/src/rocm/prod_force.hip.cu +++ b/source/lib/src/rocm/prod_force.hip.cu @@ -102,14 +102,14 @@ __global__ void force_deriv_wrt_neighbors_r(FPTYPE* force, namespace deepmd { template -void prod_force_a_gpu_rocm(FPTYPE* force, - const FPTYPE* net_deriv, - const FPTYPE* in_deriv, - const int* nlist, - const int nloc, - const int nall, - const int nnei, - const int nframes) { +void prod_force_a_gpu(FPTYPE* force, + const FPTYPE* net_deriv, + const FPTYPE* in_deriv, + const int* nlist, + const int nloc, + const int nall, + const int nnei, + const int nframes) { const int ndescrpt = nnei * 4; DPErrcheck(hipMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3)); @@ -130,14 +130,14 @@ void prod_force_a_gpu_rocm(FPTYPE* force, } template -void prod_force_r_gpu_rocm(FPTYPE* force, - const FPTYPE* net_deriv, - const FPTYPE* in_deriv, - const int* nlist, - const int nloc, - const int nall, - const int nnei, - const int nframes) { +void prod_force_r_gpu(FPTYPE* force, + const FPTYPE* net_deriv, + const FPTYPE* in_deriv, + const int* nlist, + const int nloc, + const int nall, + const int nnei, + const int nframes) { const int ndescrpt = nnei * 1; DPErrcheck(hipMemset(force, 0, sizeof(FPTYPE) * nframes * nall * 3)); @@ -157,37 +157,37 @@ void prod_force_r_gpu_rocm(FPTYPE* force, DPErrcheck(hipDeviceSynchronize()); } -template void prod_force_a_gpu_rocm(float* force, - const float* net_deriv, - const float* in_deriv, - const int* nlist, - const int nloc, - const int nall, - const int nnei, - const int nframes); -template void prod_force_a_gpu_rocm(double* force, - const double* net_deriv, - const double* in_deriv, - const int* nlist, - const int nloc, - const int nall, - const int nnei, - const int nframes); -template void prod_force_r_gpu_rocm(float* force, - const float* net_deriv, - const float* in_deriv, - const int* nlist, - const int nloc, - const int nall, - const int nnei, - const int nframes); -template void prod_force_r_gpu_rocm(double* force, - const double* net_deriv, - const double* in_deriv, - const int* nlist, - const int nloc, - const int nall, - const int nnei, - const int nframes); +template void prod_force_a_gpu(float* force, + const float* net_deriv, + const float* in_deriv, + const int* nlist, + const int nloc, + const int nall, + const int nnei, + const int nframes); +template void prod_force_a_gpu(double* force, + const double* net_deriv, + const double* in_deriv, + const int* nlist, + const int nloc, + const int nall, + const int nnei, + const int nframes); +template void prod_force_r_gpu(float* force, + const float* net_deriv, + const float* in_deriv, + const int* nlist, + const int nloc, + const int nall, + const int nnei, + const int nframes); +template void prod_force_r_gpu(double* force, + const double* net_deriv, + const double* in_deriv, + const int* nlist, + const int nloc, + const int nall, + const int nnei, + const int nframes); } // namespace deepmd diff --git a/source/lib/src/rocm/prod_force_grad.hip.cu b/source/lib/src/rocm/prod_force_grad.hip.cu index e43ce37af6..2cb7c4f1d6 100644 --- a/source/lib/src/rocm/prod_force_grad.hip.cu +++ b/source/lib/src/rocm/prod_force_grad.hip.cu @@ -81,13 +81,13 @@ __global__ void force_grad_wrt_neighbors_r(FPTYPE* grad_net, namespace deepmd { template -void prod_force_grad_a_gpu_rocm(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const int* nlist, - const int nloc, - const int nnei, - const int nframes) { +void prod_force_grad_a_gpu(FPTYPE* grad_net, + const FPTYPE* grad, + const FPTYPE* env_deriv, + const int* nlist, + const int nloc, + const int nnei, + const int nframes) { const int ndescrpt = nnei * 4; DPErrcheck( hipMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt)); @@ -109,13 +109,13 @@ void prod_force_grad_a_gpu_rocm(FPTYPE* grad_net, } template -void prod_force_grad_r_gpu_rocm(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const int* nlist, - const int nloc, - const int nnei, - const int nframes) { +void prod_force_grad_r_gpu(FPTYPE* grad_net, + const FPTYPE* grad, + const FPTYPE* env_deriv, + const int* nlist, + const int nloc, + const int nnei, + const int nframes) { const int ndescrpt = nnei * 1; DPErrcheck( hipMemset(grad_net, 0, sizeof(FPTYPE) * nframes * nloc * ndescrpt)); @@ -137,32 +137,32 @@ void prod_force_grad_r_gpu_rocm(FPTYPE* grad_net, DPErrcheck(hipDeviceSynchronize()); } -template void prod_force_grad_a_gpu_rocm(float* grad_net, - const float* grad, - const float* env_deriv, - const int* nlist, - const int nloc, - const int nnei, - const int nframes); -template void prod_force_grad_a_gpu_rocm(double* grad_net, - const double* grad, - const double* env_deriv, - const int* nlist, - const int nloc, - const int nnei, - const int nframes); -template void prod_force_grad_r_gpu_rocm(float* grad_net, - const float* grad, - const float* env_deriv, - const int* nlist, - const int nloc, - const int nnei, - const int nframes); -template void prod_force_grad_r_gpu_rocm(double* grad_net, - const double* grad, - const double* env_deriv, - const int* nlist, - const int nloc, - const int nnei, - const int nframes); +template void prod_force_grad_a_gpu(float* grad_net, + const float* grad, + const float* env_deriv, + const int* nlist, + const int nloc, + const int nnei, + const int nframes); +template void prod_force_grad_a_gpu(double* grad_net, + const double* grad, + const double* env_deriv, + const int* nlist, + const int nloc, + const int nnei, + const int nframes); +template void prod_force_grad_r_gpu(float* grad_net, + const float* grad, + const float* env_deriv, + const int* nlist, + const int nloc, + const int nnei, + const int nframes); +template void prod_force_grad_r_gpu(double* grad_net, + const double* grad, + const double* env_deriv, + const int* nlist, + const int nloc, + const int nnei, + const int nframes); } // namespace deepmd diff --git a/source/lib/src/rocm/prod_virial.hip.cu b/source/lib/src/rocm/prod_virial.hip.cu index dccd721df6..ff29c07ffb 100644 --- a/source/lib/src/rocm/prod_virial.hip.cu +++ b/source/lib/src/rocm/prod_virial.hip.cu @@ -99,15 +99,15 @@ __global__ void virial_deriv_wrt_neighbors_r(FPTYPE* virial, namespace deepmd { template -void prod_virial_a_gpu_rocm(FPTYPE* virial, - FPTYPE* atom_virial, - const FPTYPE* net_deriv, - const FPTYPE* in_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei) { +void prod_virial_a_gpu(FPTYPE* virial, + FPTYPE* atom_virial, + const FPTYPE* net_deriv, + const FPTYPE* in_deriv, + const FPTYPE* rij, + const int* nlist, + const int nloc, + const int nall, + const int nnei) { DPErrcheck(hipMemset(virial, 0, sizeof(FPTYPE) * 9)); DPErrcheck(hipMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall)); @@ -129,15 +129,15 @@ void prod_virial_a_gpu_rocm(FPTYPE* virial, } template -void prod_virial_r_gpu_rocm(FPTYPE* virial, - FPTYPE* atom_virial, - const FPTYPE* net_deriv, - const FPTYPE* in_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei) { +void prod_virial_r_gpu(FPTYPE* virial, + FPTYPE* atom_virial, + const FPTYPE* net_deriv, + const FPTYPE* in_deriv, + const FPTYPE* rij, + const int* nlist, + const int nloc, + const int nall, + const int nnei) { DPErrcheck(hipMemset(virial, 0, sizeof(FPTYPE) * 9)); DPErrcheck(hipMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall)); @@ -158,40 +158,40 @@ void prod_virial_r_gpu_rocm(FPTYPE* virial, DPErrcheck(hipDeviceSynchronize()); } -template void prod_virial_a_gpu_rocm(float* virial, - float* atom_virial, - const float* net_deriv, - const float* in_deriv, - const float* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei); -template void prod_virial_a_gpu_rocm(double* virial, - double* atom_virial, - const double* net_deriv, - const double* in_deriv, - const double* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei); -template void prod_virial_r_gpu_rocm(float* virial, - float* atom_virial, - const float* net_deriv, - const float* in_deriv, - const float* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei); -template void prod_virial_r_gpu_rocm(double* virial, - double* atom_virial, - const double* net_deriv, - const double* in_deriv, - const double* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei); +template void prod_virial_a_gpu(float* virial, + float* atom_virial, + const float* net_deriv, + const float* in_deriv, + const float* rij, + const int* nlist, + const int nloc, + const int nall, + const int nnei); +template void prod_virial_a_gpu(double* virial, + double* atom_virial, + const double* net_deriv, + const double* in_deriv, + const double* rij, + const int* nlist, + const int nloc, + const int nall, + const int nnei); +template void prod_virial_r_gpu(float* virial, + float* atom_virial, + const float* net_deriv, + const float* in_deriv, + const float* rij, + const int* nlist, + const int nloc, + const int nall, + const int nnei); +template void prod_virial_r_gpu(double* virial, + double* atom_virial, + const double* net_deriv, + const double* in_deriv, + const double* rij, + const int* nlist, + const int nloc, + const int nall, + const int nnei); } // namespace deepmd diff --git a/source/lib/src/rocm/prod_virial_grad.hip.cu b/source/lib/src/rocm/prod_virial_grad.hip.cu index 81fb9f4bad..d41a1689ce 100644 --- a/source/lib/src/rocm/prod_virial_grad.hip.cu +++ b/source/lib/src/rocm/prod_virial_grad.hip.cu @@ -84,13 +84,13 @@ __global__ void virial_grad_wrt_neighbors_r(FPTYPE* grad_net, namespace deepmd { template -void prod_virial_grad_a_gpu_rocm(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei) { +void prod_virial_grad_a_gpu(FPTYPE* grad_net, + const FPTYPE* grad, + const FPTYPE* env_deriv, + const FPTYPE* rij, + const int* nlist, + const int nloc, + const int nnei) { const int ndescrpt = nnei * 4; DPErrcheck(hipMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt)); const int LEN = 128; @@ -104,13 +104,13 @@ void prod_virial_grad_a_gpu_rocm(FPTYPE* grad_net, } template -void prod_virial_grad_r_gpu_rocm(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei) { +void prod_virial_grad_r_gpu(FPTYPE* grad_net, + const FPTYPE* grad, + const FPTYPE* env_deriv, + const FPTYPE* rij, + const int* nlist, + const int nloc, + const int nnei) { const int ndescrpt = nnei; DPErrcheck(hipMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt)); const int LEN = 128; @@ -123,32 +123,32 @@ void prod_virial_grad_r_gpu_rocm(FPTYPE* grad_net, DPErrcheck(hipDeviceSynchronize()); } -template void prod_virial_grad_a_gpu_rocm(float* grad_net, - const float* grad, - const float* env_deriv, - const float* rij, - const int* nlist, - const int nloc, - const int nnei); -template void prod_virial_grad_a_gpu_rocm(double* grad_net, - const double* grad, - const double* env_deriv, - const double* rij, - const int* nlist, - const int nloc, - const int nnei); -template void prod_virial_grad_r_gpu_rocm(float* grad_net, - const float* grad, - const float* env_deriv, - const float* rij, - const int* nlist, - const int nloc, - const int nnei); -template void prod_virial_grad_r_gpu_rocm(double* grad_net, - const double* grad, - const double* env_deriv, - const double* rij, - const int* nlist, - const int nloc, - const int nnei); +template void prod_virial_grad_a_gpu(float* grad_net, + const float* grad, + const float* env_deriv, + const float* rij, + const int* nlist, + const int nloc, + const int nnei); +template void prod_virial_grad_a_gpu(double* grad_net, + const double* grad, + const double* env_deriv, + const double* rij, + const int* nlist, + const int nloc, + const int nnei); +template void prod_virial_grad_r_gpu(float* grad_net, + const float* grad, + const float* env_deriv, + const float* rij, + const int* nlist, + const int nloc, + const int nnei); +template void prod_virial_grad_r_gpu(double* grad_net, + const double* grad, + const double* env_deriv, + const double* rij, + const int* nlist, + const int nloc, + const int nnei); } // namespace deepmd diff --git a/source/lib/src/rocm/region.hip.cu b/source/lib/src/rocm/region.hip.cu index f4ee5517cc..de67ef648c 100644 --- a/source/lib/src/rocm/region.hip.cu +++ b/source/lib/src/rocm/region.hip.cu @@ -24,44 +24,42 @@ __global__ void _compute_volume(FPTYPE *volume, const FPTYPE *boxt) { namespace deepmd { // only for unittest template -void convert_to_inter_gpu_rocm(FPTYPE *ri, - const Region ®ion, - const FPTYPE *rp) { +void convert_to_inter_gpu(FPTYPE *ri, + const Region ®ion, + const FPTYPE *rp) { hipLaunchKernelGGL(_phys2Inter, 1, 1, 0, 0, ri, rp, region.rec_boxt); DPErrcheck(hipGetLastError()); DPErrcheck(hipDeviceSynchronize()); } template -void convert_to_phys_gpu_rocm(FPTYPE *rp, - const Region ®ion, - const FPTYPE *ri) { +void convert_to_phys_gpu(FPTYPE *rp, + const Region ®ion, + const FPTYPE *ri) { hipLaunchKernelGGL(_inter2Phys, 1, 1, 0, 0, rp, ri, region.boxt); DPErrcheck(hipGetLastError()); DPErrcheck(hipDeviceSynchronize()); } template -void volume_gpu_rocm(FPTYPE *volume, const Region ®ion) { +void volume_gpu(FPTYPE *volume, const Region ®ion) { hipLaunchKernelGGL(_compute_volume, 1, 1, 0, 0, volume, region.boxt); DPErrcheck(hipGetLastError()); DPErrcheck(hipDeviceSynchronize()); } -template void convert_to_inter_gpu_rocm(float *ri, - const Region ®ion, - const float *rp); -template void convert_to_inter_gpu_rocm(double *ri, - const Region ®ion, - const double *rp); -template void convert_to_phys_gpu_rocm(float *rp, - const Region ®ion, - const float *ri); -template void convert_to_phys_gpu_rocm(double *rp, - const Region ®ion, - const double *ri); -template void volume_gpu_rocm(float *volume, - const Region ®ion); -template void volume_gpu_rocm(double *volume, - const Region ®ion); +template void convert_to_inter_gpu(float *ri, + const Region ®ion, + const float *rp); +template void convert_to_inter_gpu(double *ri, + const Region ®ion, + const double *rp); +template void convert_to_phys_gpu(float *rp, + const Region ®ion, + const float *ri); +template void convert_to_phys_gpu(double *rp, + const Region ®ion, + const double *ri); +template void volume_gpu(float *volume, const Region ®ion); +template void volume_gpu(double *volume, const Region ®ion); } // namespace deepmd diff --git a/source/lib/src/rocm/tabulate.hip.cu b/source/lib/src/rocm/tabulate.hip.cu index f88ae6ec4a..88a1cbb574 100644 --- a/source/lib/src/rocm/tabulate.hip.cu +++ b/source/lib/src/rocm/tabulate.hip.cu @@ -621,16 +621,16 @@ __global__ void tabulate_fusion_se_r_grad_grad_fifth_order_polynomial( namespace deepmd { template -void tabulate_fusion_se_a_gpu_rocm(FPTYPE* out, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em_x, - const FPTYPE* em, - const FPTYPE* two_embed, - const int nloc, - const int nnei, - const int last_layer_size, - const bool is_sorted) { +void tabulate_fusion_se_a_gpu(FPTYPE* out, + const FPTYPE* table, + const FPTYPE* table_info, + const FPTYPE* em_x, + const FPTYPE* em, + const FPTYPE* two_embed, + const int nloc, + const int nnei, + const int last_layer_size, + const bool is_sorted) { if (nloc <= 0) { return; } @@ -645,18 +645,18 @@ void tabulate_fusion_se_a_gpu_rocm(FPTYPE* out, } template -void tabulate_fusion_se_a_grad_gpu_rocm(FPTYPE* dy_dem_x, - FPTYPE* dy_dem, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em_x, - const FPTYPE* em, - const FPTYPE* two_embed, - const FPTYPE* dy, - const int nloc, - const int nnei, - const int last_layer_size, - const bool is_sorted) { +void tabulate_fusion_se_a_grad_gpu(FPTYPE* dy_dem_x, + FPTYPE* dy_dem, + const FPTYPE* table, + const FPTYPE* table_info, + const FPTYPE* em_x, + const FPTYPE* em, + const FPTYPE* two_embed, + const FPTYPE* dy, + const int nloc, + const int nnei, + const int last_layer_size, + const bool is_sorted) { if (nloc <= 0) { return; } @@ -675,17 +675,17 @@ void tabulate_fusion_se_a_grad_gpu_rocm(FPTYPE* dy_dem_x, } template -void tabulate_fusion_se_a_grad_grad_gpu_rocm(FPTYPE* dz_dy, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em_x, - const FPTYPE* em, - const FPTYPE* dz_dy_dem_x, - const FPTYPE* dz_dy_dem, - const int nloc, - const int nnei, - const int last_layer_size, - const bool is_sorted) { +void tabulate_fusion_se_a_grad_grad_gpu(FPTYPE* dz_dy, + const FPTYPE* table, + const FPTYPE* table_info, + const FPTYPE* em_x, + const FPTYPE* em, + const FPTYPE* dz_dy_dem_x, + const FPTYPE* dz_dy_dem, + const int nloc, + const int nnei, + const int last_layer_size, + const bool is_sorted) { if (nloc <= 0) { return; } @@ -703,15 +703,15 @@ void tabulate_fusion_se_a_grad_grad_gpu_rocm(FPTYPE* dz_dy, } template -void tabulate_fusion_se_t_gpu_rocm(FPTYPE* out, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em_x, - const FPTYPE* em, - const int nloc, - const int nnei_i, - const int nnei_j, - const int last_layer_size) { +void tabulate_fusion_se_t_gpu(FPTYPE* out, + const FPTYPE* table, + const FPTYPE* table_info, + const FPTYPE* em_x, + const FPTYPE* em, + const int nloc, + const int nnei_i, + const int nnei_j, + const int last_layer_size) { if (nloc <= 0) { return; } @@ -726,17 +726,17 @@ void tabulate_fusion_se_t_gpu_rocm(FPTYPE* out, } template -void tabulate_fusion_se_t_grad_gpu_rocm(FPTYPE* dy_dem_x, - FPTYPE* dy_dem, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em_x, - const FPTYPE* em, - const FPTYPE* dy, - const int nloc, - const int nnei_i, - const int nnei_j, - const int last_layer_size) { +void tabulate_fusion_se_t_grad_gpu(FPTYPE* dy_dem_x, + FPTYPE* dy_dem, + const FPTYPE* table, + const FPTYPE* table_info, + const FPTYPE* em_x, + const FPTYPE* em, + const FPTYPE* dy, + const int nloc, + const int nnei_i, + const int nnei_j, + const int last_layer_size) { if (nloc <= 0) { return; } @@ -754,17 +754,17 @@ void tabulate_fusion_se_t_grad_gpu_rocm(FPTYPE* dy_dem_x, } template -void tabulate_fusion_se_t_grad_grad_gpu_rocm(FPTYPE* dz_dy, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em_x, - const FPTYPE* em, - const FPTYPE* dz_dy_dem_x, - const FPTYPE* dz_dy_dem, - const int nloc, - const int nnei_i, - const int nnei_j, - const int last_layer_size) { +void tabulate_fusion_se_t_grad_grad_gpu(FPTYPE* dz_dy, + const FPTYPE* table, + const FPTYPE* table_info, + const FPTYPE* em_x, + const FPTYPE* em, + const FPTYPE* dz_dy_dem_x, + const FPTYPE* dz_dy_dem, + const int nloc, + const int nnei_i, + const int nnei_j, + const int last_layer_size) { if (nloc <= 0) { return; } @@ -781,13 +781,13 @@ void tabulate_fusion_se_t_grad_grad_gpu_rocm(FPTYPE* dz_dy, } template -void tabulate_fusion_se_r_gpu_rocm(FPTYPE* out, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em, - const int nloc, - const int nnei, - const int last_layer_size) { +void tabulate_fusion_se_r_gpu(FPTYPE* out, + const FPTYPE* table, + const FPTYPE* table_info, + const FPTYPE* em, + const int nloc, + const int nnei, + const int last_layer_size) { if (nloc <= 0) { return; } @@ -802,14 +802,14 @@ void tabulate_fusion_se_r_gpu_rocm(FPTYPE* out, } template -void tabulate_fusion_se_r_grad_gpu_rocm(FPTYPE* dy_dem, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em, - const FPTYPE* dy, - const int nloc, - const int nnei, - const int last_layer_size) { +void tabulate_fusion_se_r_grad_gpu(FPTYPE* dy_dem, + const FPTYPE* table, + const FPTYPE* table_info, + const FPTYPE* em, + const FPTYPE* dy, + const int nloc, + const int nnei, + const int last_layer_size) { if (nloc <= 0) { return; } @@ -826,14 +826,14 @@ void tabulate_fusion_se_r_grad_gpu_rocm(FPTYPE* dy_dem, } template -void tabulate_fusion_se_r_grad_grad_gpu_rocm(FPTYPE* dz_dy, - const FPTYPE* table, - const FPTYPE* table_info, - const FPTYPE* em, - const FPTYPE* dz_dy_dem, - const int nloc, - const int nnei, - const int last_layer_size) { +void tabulate_fusion_se_r_grad_grad_gpu(FPTYPE* dz_dy, + const FPTYPE* table, + const FPTYPE* table_info, + const FPTYPE* em, + const FPTYPE* dz_dy_dem, + const int nloc, + const int nnei, + const int last_layer_size) { if (nloc <= 0) { return; } @@ -850,53 +850,51 @@ void tabulate_fusion_se_r_grad_grad_gpu_rocm(FPTYPE* dz_dy, DPErrcheck(hipDeviceSynchronize()); } -template void tabulate_fusion_se_a_gpu_rocm(float* out, +template void tabulate_fusion_se_a_gpu(float* out, + const float* table, + const float* table_info, + const float* em_x, + const float* em, + const float* two_embed, + const int nloc, + const int nnei, + const int last_layer_size, + const bool is_sorted); +template void tabulate_fusion_se_a_gpu(double* out, + const double* table, + const double* table_info, + const double* em_x, + const double* em, + const double* two_embed, + const int nloc, + const int nnei, + const int last_layer_size, + const bool is_sorted); +template void tabulate_fusion_se_a_grad_gpu(float* dy_dem_x, + float* dy_dem, const float* table, const float* table_info, const float* em_x, const float* em, const float* two_embed, + const float* dy, const int nloc, const int nnei, const int last_layer_size, const bool is_sorted); -template void tabulate_fusion_se_a_gpu_rocm(double* out, +template void tabulate_fusion_se_a_grad_gpu(double* dy_dem_x, + double* dy_dem, const double* table, const double* table_info, const double* em_x, const double* em, const double* two_embed, + const double* dy, const int nloc, const int nnei, const int last_layer_size, const bool is_sorted); -template void tabulate_fusion_se_a_grad_gpu_rocm( - float* dy_dem_x, - float* dy_dem, - const float* table, - const float* table_info, - const float* em_x, - const float* em, - const float* two_embed, - const float* dy, - const int nloc, - const int nnei, - const int last_layer_size, - const bool is_sorted); -template void tabulate_fusion_se_a_grad_gpu_rocm( - double* dy_dem_x, - double* dy_dem, - const double* table, - const double* table_info, - const double* em_x, - const double* em, - const double* two_embed, - const double* dy, - const int nloc, - const int nnei, - const int last_layer_size, - const bool is_sorted); -template void tabulate_fusion_se_a_grad_grad_gpu_rocm( +template void tabulate_fusion_se_a_grad_grad_gpu( float* dz_dy, const float* table, const float* table_info, @@ -908,7 +906,7 @@ template void tabulate_fusion_se_a_grad_grad_gpu_rocm( const int nnei, const int last_layer_size, const bool is_sorted); -template void tabulate_fusion_se_a_grad_grad_gpu_rocm( +template void tabulate_fusion_se_a_grad_grad_gpu( double* dz_dy, const double* table, const double* table_info, @@ -921,49 +919,47 @@ template void tabulate_fusion_se_a_grad_grad_gpu_rocm( const int last_layer_size, const bool is_sorted); -template void tabulate_fusion_se_t_gpu_rocm(float* out, +template void tabulate_fusion_se_t_gpu(float* out, + const float* table, + const float* table_info, + const float* em_x, + const float* em, + const int nloc, + const int nnei_i, + const int nnei_j, + const int last_layer_size); +template void tabulate_fusion_se_t_gpu(double* out, + const double* table, + const double* table_info, + const double* em_x, + const double* em, + const int nloc, + const int nnei_i, + const int nnei_j, + const int last_layer_size); +template void tabulate_fusion_se_t_grad_gpu(float* dy_dem_x, + float* dy_dem, const float* table, const float* table_info, const float* em_x, const float* em, + const float* dy, const int nloc, const int nnei_i, const int nnei_j, const int last_layer_size); -template void tabulate_fusion_se_t_gpu_rocm(double* out, +template void tabulate_fusion_se_t_grad_gpu(double* dy_dem_x, + double* dy_dem, const double* table, const double* table_info, const double* em_x, const double* em, + const double* dy, const int nloc, const int nnei_i, const int nnei_j, const int last_layer_size); -template void tabulate_fusion_se_t_grad_gpu_rocm( - float* dy_dem_x, - float* dy_dem, - const float* table, - const float* table_info, - const float* em_x, - const float* em, - const float* dy, - const int nloc, - const int nnei_i, - const int nnei_j, - const int last_layer_size); -template void tabulate_fusion_se_t_grad_gpu_rocm( - double* dy_dem_x, - double* dy_dem, - const double* table, - const double* table_info, - const double* em_x, - const double* em, - const double* dy, - const int nloc, - const int nnei_i, - const int nnei_j, - const int last_layer_size); -template void tabulate_fusion_se_t_grad_grad_gpu_rocm( +template void tabulate_fusion_se_t_grad_grad_gpu( float* dz_dy, const float* table, const float* table_info, @@ -975,7 +971,7 @@ template void tabulate_fusion_se_t_grad_grad_gpu_rocm( const int nnei_i, const int nnei_j, const int last_layer_size); -template void tabulate_fusion_se_t_grad_grad_gpu_rocm( +template void tabulate_fusion_se_t_grad_grad_gpu( double* dz_dy, const double* table, const double* table_info, @@ -988,39 +984,37 @@ template void tabulate_fusion_se_t_grad_grad_gpu_rocm( const int nnei_j, const int last_layer_size); -template void tabulate_fusion_se_r_gpu_rocm(float* out, +template void tabulate_fusion_se_r_gpu(float* out, + const float* table, + const float* table_info, + const float* em, + const int nloc, + const int nnei, + const int last_layer_size); +template void tabulate_fusion_se_r_gpu(double* out, + const double* table, + const double* table_info, + const double* em, + const int nloc, + const int nnei, + const int last_layer_size); +template void tabulate_fusion_se_r_grad_gpu(float* dy_dem, const float* table, const float* table_info, const float* em, + const float* dy, const int nloc, const int nnei, const int last_layer_size); -template void tabulate_fusion_se_r_gpu_rocm(double* out, +template void tabulate_fusion_se_r_grad_gpu(double* dy_dem, const double* table, const double* table_info, const double* em, + const double* dy, const int nloc, const int nnei, const int last_layer_size); -template void tabulate_fusion_se_r_grad_gpu_rocm( - float* dy_dem, - const float* table, - const float* table_info, - const float* em, - const float* dy, - const int nloc, - const int nnei, - const int last_layer_size); -template void tabulate_fusion_se_r_grad_gpu_rocm( - double* dy_dem, - const double* table, - const double* table_info, - const double* em, - const double* dy, - const int nloc, - const int nnei, - const int last_layer_size); -template void tabulate_fusion_se_r_grad_grad_gpu_rocm( +template void tabulate_fusion_se_r_grad_grad_gpu( float* dz_dy, const float* table, const float* table_info, @@ -1029,7 +1023,7 @@ template void tabulate_fusion_se_r_grad_grad_gpu_rocm( const int nloc, const int nnei, const int last_layer_size); -template void tabulate_fusion_se_r_grad_grad_gpu_rocm( +template void tabulate_fusion_se_r_grad_grad_gpu( double* dz_dy, const double* table, const double* table_info, diff --git a/source/lib/tests/test_coord.cc b/source/lib/tests/test_coord.cc index 705c5d57bd..581301b6a7 100644 --- a/source/lib/tests/test_coord.cc +++ b/source/lib/tests/test_coord.cc @@ -164,7 +164,7 @@ TEST_F(TestNormCoord, gpu_case0) { deepmd::malloc_device_memory_sync(out_c_dev, out_c); region_dev.boxt = box_info_dev; region_dev.rec_boxt = box_info_dev + 9; - deepmd::normalize_coord_gpu_rocm(out_c_dev, natoms, region_dev); + deepmd::normalize_coord_gpu(out_c_dev, natoms, region_dev); region_dev.boxt = new_boxt; region_dev.rec_boxt = new_rec_boxt; deepmd::memcpy_device_to_host(out_c_dev, out_c); @@ -192,7 +192,7 @@ TEST_F(TestNormCoord, gpu_case1) { deepmd::malloc_device_memory_sync(out_c_dev, out_c); region_dev.boxt = box_info_dev; region_dev.rec_boxt = box_info_dev + 9; - deepmd::normalize_coord_gpu_rocm(out_c_dev, natoms, region_dev); + deepmd::normalize_coord_gpu(out_c_dev, natoms, region_dev); region_dev.boxt = new_boxt; region_dev.rec_boxt = new_rec_boxt; deepmd::memcpy_device_to_host(out_c_dev, out_c); @@ -220,7 +220,7 @@ TEST_F(TestNormCoord, gpu_case2) { deepmd::malloc_device_memory_sync(out_c_dev, out_c); region_dev.boxt = box_info_dev; region_dev.rec_boxt = box_info_dev + 9; - deepmd::normalize_coord_gpu_rocm(out_c_dev, natoms, region_dev); + deepmd::normalize_coord_gpu(out_c_dev, natoms, region_dev); region_dev.boxt = new_boxt; region_dev.rec_boxt = new_rec_boxt; deepmd::memcpy_device_to_host(out_c_dev, out_c); @@ -553,10 +553,10 @@ TEST_F(TestCopyCoord, gpu) { 1 + nloc); region_dev.boxt = box_info_dev; region_dev.rec_boxt = box_info_dev + 9; - int ret = deepmd::copy_coord_gpu_rocm( - out_c_dev, out_t_dev, mapping_dev, &nall, int_data_dev, in_c_dev, - in_t_dev, nloc, mem_size, loc_cellnum, total_cellnum, cell_info_dev, - region_dev); + int ret = deepmd::copy_coord_gpu(out_c_dev, out_t_dev, mapping_dev, &nall, + int_data_dev, in_c_dev, in_t_dev, nloc, + mem_size, loc_cellnum, total_cellnum, + cell_info_dev, region_dev); region_dev.boxt = new_boxt; region_dev.rec_boxt = new_rec_boxt; deepmd::memcpy_device_to_host(out_c_dev, out_c); @@ -628,10 +628,10 @@ TEST_F(TestCopyCoord, gpu_lessmem) { 1 + nloc); region_dev.boxt = box_info_dev; region_dev.rec_boxt = box_info_dev + 9; - int ret = deepmd::copy_coord_gpu_rocm( - out_c_dev, out_t_dev, mapping_dev, &nall, int_data_dev, in_c_dev, - in_t_dev, nloc, mem_size, loc_cellnum, total_cellnum, cell_info_dev, - region_dev); + int ret = deepmd::copy_coord_gpu(out_c_dev, out_t_dev, mapping_dev, &nall, + int_data_dev, in_c_dev, in_t_dev, nloc, + mem_size, loc_cellnum, total_cellnum, + cell_info_dev, region_dev); region_dev.boxt = new_boxt; region_dev.rec_boxt = new_rec_boxt; deepmd::memcpy_device_to_host(out_c_dev, out_c); @@ -938,10 +938,10 @@ TEST_F(TestCopyCoordMoreCell, gpu) { 1 + nloc); region_dev.boxt = box_info_dev; region_dev.rec_boxt = box_info_dev + 9; - int ret = deepmd::copy_coord_gpu_rocm( - out_c_dev, out_t_dev, mapping_dev, &nall, int_data_dev, in_c_dev, - in_t_dev, nloc, mem_size, loc_cellnum, total_cellnum, cell_info_dev, - region_dev); + int ret = deepmd::copy_coord_gpu(out_c_dev, out_t_dev, mapping_dev, &nall, + int_data_dev, in_c_dev, in_t_dev, nloc, + mem_size, loc_cellnum, total_cellnum, + cell_info_dev, region_dev); region_dev.boxt = new_boxt; region_dev.rec_boxt = new_rec_boxt; deepmd::memcpy_device_to_host(out_c_dev, out_c); @@ -1013,10 +1013,10 @@ TEST_F(TestCopyCoordMoreCell, gpu_lessmem) { 1 + nloc); region_dev.boxt = box_info_dev; region_dev.rec_boxt = box_info_dev + 9; - int ret = deepmd::copy_coord_gpu_rocm( - out_c_dev, out_t_dev, mapping_dev, &nall, int_data_dev, in_c_dev, - in_t_dev, nloc, mem_size, loc_cellnum, total_cellnum, cell_info_dev, - region_dev); + int ret = deepmd::copy_coord_gpu(out_c_dev, out_t_dev, mapping_dev, &nall, + int_data_dev, in_c_dev, in_t_dev, nloc, + mem_size, loc_cellnum, total_cellnum, + cell_info_dev, region_dev); region_dev.boxt = new_boxt; region_dev.rec_boxt = new_rec_boxt; deepmd::memcpy_device_to_host(out_c_dev, out_c); diff --git a/source/lib/tests/test_env_mat_a.cc b/source/lib/tests/test_env_mat_a.cc index 594e8a3601..639f99414d 100644 --- a/source/lib/tests/test_env_mat_a.cc +++ b/source/lib/tests/test_env_mat_a.cc @@ -591,7 +591,7 @@ TEST_F(TestEnvMatA, prod_cpu_equal_cpu) { } #if GOOGLE_CUDA -TEST_F(TestEnvMatA, prod_gpu_cuda) { +TEST_F(TestEnvMatA, prod_gpu) { EXPECT_EQ(nlist_r_cpy.size(), nloc); int tot_nnei = 0; int max_nbor_size = 0; @@ -641,10 +641,10 @@ TEST_F(TestEnvMatA, prod_gpu_cuda) { deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size); - deepmd::prod_env_mat_a_gpu_cuda( - em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_cpy_dev, - gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev, - std_dev, nloc, nall, rc, rc_smth, sec_a); + deepmd::prod_env_mat_a_gpu(em_dev, em_deriv_dev, rij_dev, nlist_dev, + posi_cpy_dev, atype_cpy_dev, gpu_inlist, + array_int_dev, array_longlong_dev, max_nbor_size, + avg_dev, std_dev, nloc, nall, rc, rc_smth, sec_a); deepmd::memcpy_device_to_host(em_dev, em); deepmd::delete_device_memory(em_dev); deepmd::delete_device_memory(em_deriv_dev); @@ -669,7 +669,7 @@ TEST_F(TestEnvMatA, prod_gpu_cuda) { } } -TEST_F(TestEnvMatA, prod_gpu_cuda_equal_cpu) { +TEST_F(TestEnvMatA, prod_gpu_equal_cpu) { EXPECT_EQ(nlist_r_cpy.size(), nloc); int tot_nnei = 0; int max_nbor_size = 0; @@ -720,10 +720,10 @@ TEST_F(TestEnvMatA, prod_gpu_cuda_equal_cpu) { deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size); - deepmd::prod_env_mat_a_gpu_cuda( - em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_cpy_dev, - gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev, - std_dev, nloc, nall, rc, rc_smth, sec_a); + deepmd::prod_env_mat_a_gpu(em_dev, em_deriv_dev, rij_dev, nlist_dev, + posi_cpy_dev, atype_cpy_dev, gpu_inlist, + array_int_dev, array_longlong_dev, max_nbor_size, + avg_dev, std_dev, nloc, nall, rc, rc_smth, sec_a); deepmd::memcpy_device_to_host(em_dev, em); deepmd::memcpy_device_to_host(em_deriv_dev, em_deriv); deepmd::memcpy_device_to_host(rij_dev, rij); @@ -785,7 +785,7 @@ TEST_F(TestEnvMatA, prod_gpu_cuda_equal_cpu) { #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM -TEST_F(TestEnvMatA, prod_gpu_rocm) { +TEST_F(TestEnvMatA, prod_gpu) { EXPECT_EQ(nlist_r_cpy.size(), nloc); int tot_nnei = 0; int max_nbor_size = 0; @@ -835,10 +835,10 @@ TEST_F(TestEnvMatA, prod_gpu_rocm) { deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size); - deepmd::prod_env_mat_a_gpu_rocm( - em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_cpy_dev, - gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev, - std_dev, nloc, nall, rc, rc_smth, sec_a); + deepmd::prod_env_mat_a_gpu(em_dev, em_deriv_dev, rij_dev, nlist_dev, + posi_cpy_dev, atype_cpy_dev, gpu_inlist, + array_int_dev, array_longlong_dev, max_nbor_size, + avg_dev, std_dev, nloc, nall, rc, rc_smth, sec_a); deepmd::memcpy_device_to_host(em_dev, em); deepmd::delete_device_memory(em_dev); deepmd::delete_device_memory(em_deriv_dev); @@ -863,7 +863,7 @@ TEST_F(TestEnvMatA, prod_gpu_rocm) { } } -TEST_F(TestEnvMatA, prod_gpu_rocm_equal_cpu) { +TEST_F(TestEnvMatA, prod_gpu_equal_cpu) { EXPECT_EQ(nlist_r_cpy.size(), nloc); int tot_nnei = 0; int max_nbor_size = 0; @@ -914,10 +914,10 @@ TEST_F(TestEnvMatA, prod_gpu_rocm_equal_cpu) { deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size); - deepmd::prod_env_mat_a_gpu_rocm( - em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_cpy_dev, - gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev, - std_dev, nloc, nall, rc, rc_smth, sec_a); + deepmd::prod_env_mat_a_gpu(em_dev, em_deriv_dev, rij_dev, nlist_dev, + posi_cpy_dev, atype_cpy_dev, gpu_inlist, + array_int_dev, array_longlong_dev, max_nbor_size, + avg_dev, std_dev, nloc, nall, rc, rc_smth, sec_a); deepmd::memcpy_device_to_host(em_dev, em); deepmd::memcpy_device_to_host(em_deriv_dev, em_deriv); deepmd::memcpy_device_to_host(rij_dev, rij); diff --git a/source/lib/tests/test_env_mat_a_mix.cc b/source/lib/tests/test_env_mat_a_mix.cc index 19d4ea1fd8..f415317929 100644 --- a/source/lib/tests/test_env_mat_a_mix.cc +++ b/source/lib/tests/test_env_mat_a_mix.cc @@ -629,7 +629,7 @@ TEST_F(TestEnvMatAMix, prod_cpu_equal_cpu) { } #if GOOGLE_CUDA -TEST_F(TestEnvMatAMix, prod_gpu_cuda) { +TEST_F(TestEnvMatAMix, prod_gpu) { EXPECT_EQ(nlist_r_cpy.size(), nloc); int tot_nnei = 0; int max_nbor_size = 0; @@ -688,7 +688,7 @@ TEST_F(TestEnvMatAMix, prod_gpu_cuda) { deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size); - deepmd::prod_env_mat_a_gpu_cuda( + deepmd::prod_env_mat_a_gpu( em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_dev, gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev, std_dev, nloc, nall, rc, rc_smth, sec_a, f_atype_cpy_dev); @@ -729,7 +729,7 @@ TEST_F(TestEnvMatAMix, prod_gpu_cuda) { delete[] nmask; } -TEST_F(TestEnvMatAMix, prod_gpu_cuda_equal_cpu) { +TEST_F(TestEnvMatAMix, prod_gpu_equal_cpu) { EXPECT_EQ(nlist_r_cpy.size(), nloc); int tot_nnei = 0; int max_nbor_size = 0; @@ -781,7 +781,7 @@ TEST_F(TestEnvMatAMix, prod_gpu_cuda_equal_cpu) { deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size); - deepmd::prod_env_mat_a_gpu_cuda( + deepmd::prod_env_mat_a_gpu( em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_dev, gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev, std_dev, nloc, nall, rc, rc_smth, sec_a, f_atype_cpy_dev); @@ -847,7 +847,7 @@ TEST_F(TestEnvMatAMix, prod_gpu_cuda_equal_cpu) { #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM -TEST_F(TestEnvMatAMix, prod_gpu_rocm) { +TEST_F(TestEnvMatAMix, prod_gpu) { EXPECT_EQ(nlist_r_cpy.size(), nloc); int tot_nnei = 0; int max_nbor_size = 0; @@ -906,13 +906,13 @@ TEST_F(TestEnvMatAMix, prod_gpu_rocm) { deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size); - deepmd::prod_env_mat_a_gpu_rocm( + deepmd::prod_env_mat_a_gpu( em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_dev, gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev, std_dev, nloc, nall, rc, rc_smth, sec_a, f_atype_cpy_dev); - deepmd::use_nei_info_gpu_rocm(nlist_dev, ntype_dev, nmask_dev, atype_dev, - mapping_dev, nloc, nnei, ntypes, true); + deepmd::use_nei_info_gpu(nlist_dev, ntype_dev, nmask_dev, atype_dev, + mapping_dev, nloc, nnei, ntypes, true); deepmd::memcpy_device_to_host(em_dev, em); deepmd::memcpy_device_to_host(ntype_dev, ntype); deepmd::memcpy_device_to_host(nmask_dev, nmask, nloc * nnei); @@ -947,7 +947,7 @@ TEST_F(TestEnvMatAMix, prod_gpu_rocm) { delete[] nmask; } -TEST_F(TestEnvMatAMix, prod_gpu_rocm_equal_cpu) { +TEST_F(TestEnvMatAMix, prod_gpu_equal_cpu) { EXPECT_EQ(nlist_r_cpy.size(), nloc); int tot_nnei = 0; int max_nbor_size = 0; @@ -999,7 +999,7 @@ TEST_F(TestEnvMatAMix, prod_gpu_rocm_equal_cpu) { deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size); - deepmd::prod_env_mat_a_gpu_rocm( + deepmd::prod_env_mat_a_gpu( em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_dev, gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev, std_dev, nloc, nall, rc, rc_smth, sec_a, f_atype_cpy_dev); diff --git a/source/lib/tests/test_env_mat_r.cc b/source/lib/tests/test_env_mat_r.cc index 258aa1000d..f20a8cbbc3 100644 --- a/source/lib/tests/test_env_mat_r.cc +++ b/source/lib/tests/test_env_mat_r.cc @@ -359,7 +359,7 @@ TEST_F(TestEnvMatR, prod_cpu_equal_cpu) { } #if GOOGLE_CUDA -TEST_F(TestEnvMatR, prod_gpu_cuda) { +TEST_F(TestEnvMatR, prod_gpu) { EXPECT_EQ(nlist_r_cpy.size(), nloc); int tot_nnei = 0; int max_nbor_size = 0; @@ -410,10 +410,10 @@ TEST_F(TestEnvMatR, prod_gpu_cuda) { deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size); - deepmd::prod_env_mat_r_gpu_cuda( - em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_cpy_dev, - gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev, - std_dev, nloc, nall, rc, rc_smth, sec_a); + deepmd::prod_env_mat_r_gpu(em_dev, em_deriv_dev, rij_dev, nlist_dev, + posi_cpy_dev, atype_cpy_dev, gpu_inlist, + array_int_dev, array_longlong_dev, max_nbor_size, + avg_dev, std_dev, nloc, nall, rc, rc_smth, sec_a); deepmd::memcpy_device_to_host(em_dev, em); deepmd::delete_device_memory(em_dev); deepmd::delete_device_memory(em_deriv_dev); @@ -438,7 +438,7 @@ TEST_F(TestEnvMatR, prod_gpu_cuda) { } } -TEST_F(TestEnvMatR, prod_gpu_cuda_equal_cpu) { +TEST_F(TestEnvMatR, prod_gpu_equal_cpu) { EXPECT_EQ(nlist_r_cpy.size(), nloc); int tot_nnei = 0; int max_nbor_size = 0; @@ -489,10 +489,10 @@ TEST_F(TestEnvMatR, prod_gpu_cuda_equal_cpu) { deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size); - deepmd::prod_env_mat_r_gpu_cuda( - em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_cpy_dev, - gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev, - std_dev, nloc, nall, rc, rc_smth, sec_a); + deepmd::prod_env_mat_r_gpu(em_dev, em_deriv_dev, rij_dev, nlist_dev, + posi_cpy_dev, atype_cpy_dev, gpu_inlist, + array_int_dev, array_longlong_dev, max_nbor_size, + avg_dev, std_dev, nloc, nall, rc, rc_smth, sec_a); deepmd::memcpy_device_to_host(em_dev, em); deepmd::memcpy_device_to_host(em_deriv_dev, em_deriv); deepmd::memcpy_device_to_host(rij_dev, rij); @@ -544,7 +544,7 @@ TEST_F(TestEnvMatR, prod_gpu_cuda_equal_cpu) { #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM -TEST_F(TestEnvMatR, prod_gpu_rocm) { +TEST_F(TestEnvMatR, prod_gpu) { EXPECT_EQ(nlist_r_cpy.size(), nloc); int tot_nnei = 0; int max_nbor_size = 0; @@ -595,10 +595,10 @@ TEST_F(TestEnvMatR, prod_gpu_rocm) { deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size); - deepmd::prod_env_mat_r_gpu_rocm( - em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_cpy_dev, - gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev, - std_dev, nloc, nall, rc, rc_smth, sec_a); + deepmd::prod_env_mat_r_gpu(em_dev, em_deriv_dev, rij_dev, nlist_dev, + posi_cpy_dev, atype_cpy_dev, gpu_inlist, + array_int_dev, array_longlong_dev, max_nbor_size, + avg_dev, std_dev, nloc, nall, rc, rc_smth, sec_a); deepmd::memcpy_device_to_host(em_dev, em); deepmd::delete_device_memory(em_dev); deepmd::delete_device_memory(em_deriv_dev); @@ -623,7 +623,7 @@ TEST_F(TestEnvMatR, prod_gpu_rocm) { } } -TEST_F(TestEnvMatR, prod_gpu_rocm_equal_cpu) { +TEST_F(TestEnvMatR, prod_gpu_equal_cpu) { EXPECT_EQ(nlist_r_cpy.size(), nloc); int tot_nnei = 0; int max_nbor_size = 0; @@ -674,10 +674,10 @@ TEST_F(TestEnvMatR, prod_gpu_rocm_equal_cpu) { deepmd::convert_nlist_gpu_device(gpu_inlist, inlist, memory_dev, max_nbor_size); - deepmd::prod_env_mat_r_gpu_rocm( - em_dev, em_deriv_dev, rij_dev, nlist_dev, posi_cpy_dev, atype_cpy_dev, - gpu_inlist, array_int_dev, array_longlong_dev, max_nbor_size, avg_dev, - std_dev, nloc, nall, rc, rc_smth, sec_a); + deepmd::prod_env_mat_r_gpu(em_dev, em_deriv_dev, rij_dev, nlist_dev, + posi_cpy_dev, atype_cpy_dev, gpu_inlist, + array_int_dev, array_longlong_dev, max_nbor_size, + avg_dev, std_dev, nloc, nall, rc, rc_smth, sec_a); deepmd::memcpy_device_to_host(em_dev, em); deepmd::memcpy_device_to_host(em_deriv_dev, em_deriv); deepmd::memcpy_device_to_host(rij_dev, rij); diff --git a/source/lib/tests/test_fmt_nlist.cc b/source/lib/tests/test_fmt_nlist.cc index 6d9a59cd36..1d995f8fce 100644 --- a/source/lib/tests/test_fmt_nlist.cc +++ b/source/lib/tests/test_fmt_nlist.cc @@ -314,7 +314,7 @@ TEST_F(TestFormatNlistShortSel, cpu) { } #if GOOGLE_CUDA -TEST_F(TestFormatNlist, gpu_cuda) { +TEST_F(TestFormatNlist, gpu) { std::vector> nlist_a_0, nlist_r_0; build_nlist(nlist_a_0, nlist_r_0, posi_cpy, nloc, rc, rc, nat_stt, ncell, ext_stt, ext_end, region, ncell); @@ -357,9 +357,9 @@ TEST_F(TestFormatNlist, gpu_cuda) { deepmd::convert_nlist_gpu_device(gpu_inlist, in_nlist, memory_dev, max_nbor_size); // format nlist - format_nbor_list_gpu_cuda(nlist_dev, posi_cpy_dev, atype_cpy_dev, gpu_inlist, - array_int_dev, array_longlong_dev, max_nbor_size, - nloc, nall, rc, sec_a); + format_nbor_list_gpu(nlist_dev, posi_cpy_dev, atype_cpy_dev, gpu_inlist, + array_int_dev, array_longlong_dev, max_nbor_size, nloc, + nall, rc, sec_a); deepmd::memcpy_device_to_host(nlist_dev, nlist); deepmd::delete_device_memory(nlist_dev); deepmd::delete_device_memory(posi_cpy_dev); @@ -374,7 +374,7 @@ TEST_F(TestFormatNlist, gpu_cuda) { } } -TEST_F(TestFormatNlistShortSel, gpu_cuda) { +TEST_F(TestFormatNlistShortSel, gpu) { std::vector> nlist_a_0, nlist_r_0; build_nlist(nlist_a_0, nlist_r_0, posi_cpy, nloc, rc, rc, nat_stt, ncell, ext_stt, ext_end, region, ncell); @@ -417,9 +417,9 @@ TEST_F(TestFormatNlistShortSel, gpu_cuda) { deepmd::convert_nlist_gpu_device(gpu_inlist, in_nlist, memory_dev, max_nbor_size); // format nlist - format_nbor_list_gpu_cuda(nlist_dev, posi_cpy_dev, atype_cpy_dev, gpu_inlist, - array_int_dev, array_longlong_dev, max_nbor_size, - nloc, nall, rc, sec_a); + format_nbor_list_gpu(nlist_dev, posi_cpy_dev, atype_cpy_dev, gpu_inlist, + array_int_dev, array_longlong_dev, max_nbor_size, nloc, + nall, rc, sec_a); deepmd::memcpy_device_to_host(nlist_dev, nlist); deepmd::delete_device_memory(nlist_dev); deepmd::delete_device_memory(posi_cpy_dev); @@ -434,7 +434,7 @@ TEST_F(TestFormatNlistShortSel, gpu_cuda) { } } -TEST_F(TestEncodingDecodingNborInfo, valid_nbor_info_gpu_cuda) { +TEST_F(TestEncodingDecodingNborInfo, valid_nbor_info_gpu) { int *valid_type_dev = NULL, *valid_index_dev = NULL, *out_type_dev = NULL, *out_index_dev = NULL; double* valid_dist_dev = NULL; @@ -449,7 +449,7 @@ TEST_F(TestEncodingDecodingNborInfo, valid_nbor_info_gpu_cuda) { deepmd::malloc_device_memory_sync(out_index_dev, out_index); deepmd::malloc_device_memory_sync(key_dev, key); - deepmd::test_encoding_decoding_nbor_info_gpu_cuda( + deepmd::test_encoding_decoding_nbor_info_gpu( key_dev, out_type_dev, out_index_dev, valid_type_dev, valid_dist_dev, valid_index_dev, size_of_array); @@ -470,7 +470,7 @@ TEST_F(TestEncodingDecodingNborInfo, valid_nbor_info_gpu_cuda) { } } -// TEST_F(TestEncodingDecodingNborInfo, invalid_nbor_info_gpu_cuda) +// TEST_F(TestEncodingDecodingNborInfo, invalid_nbor_info_gpu) // { // int * invalid_type_dev = NULL, * invalid_index_dev = NULL, * out_type_dev = // NULL, * out_index_dev = NULL; double * invalid_dist_dev = NULL; uint_64 * @@ -485,7 +485,7 @@ TEST_F(TestEncodingDecodingNborInfo, valid_nbor_info_gpu_cuda) { // deepmd::malloc_device_memory_sync(key_dev, key); // EXPECT_EQ(cudaGetLastError() == cudaSuccess && cudaDeviceSynchronize() == -// cudaSuccess, true); deepmd::test_encoding_decoding_nbor_info_gpu_cuda( +// cudaSuccess, true); deepmd::test_encoding_decoding_nbor_info_gpu( // key_dev, out_type_dev, out_index_dev, // invalid_type_dev, invalid_dist_dev, invalid_index_dev, size_of_array // ); @@ -504,7 +504,7 @@ TEST_F(TestEncodingDecodingNborInfo, valid_nbor_info_gpu_cuda) { #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM -TEST_F(TestFormatNlist, gpu_rocm) { +TEST_F(TestFormatNlist, gpu) { std::vector> nlist_a_0, nlist_r_0; build_nlist(nlist_a_0, nlist_r_0, posi_cpy, nloc, rc, rc, nat_stt, ncell, ext_stt, ext_end, region, ncell); @@ -547,9 +547,9 @@ TEST_F(TestFormatNlist, gpu_rocm) { deepmd::convert_nlist_gpu_device(gpu_inlist, in_nlist, memory_dev, max_nbor_size); // format nlist - format_nbor_list_gpu_rocm(nlist_dev, posi_cpy_dev, atype_cpy_dev, gpu_inlist, - array_int_dev, array_longlong_dev, max_nbor_size, - nloc, nall, rc, sec_a); + format_nbor_list_gpu(nlist_dev, posi_cpy_dev, atype_cpy_dev, gpu_inlist, + array_int_dev, array_longlong_dev, max_nbor_size, nloc, + nall, rc, sec_a); deepmd::memcpy_device_to_host(nlist_dev, nlist); deepmd::delete_device_memory(nlist_dev); deepmd::delete_device_memory(posi_cpy_dev); @@ -564,7 +564,7 @@ TEST_F(TestFormatNlist, gpu_rocm) { } } -TEST_F(TestFormatNlistShortSel, gpu_rocm) { +TEST_F(TestFormatNlistShortSel, gpu) { std::vector> nlist_a_0, nlist_r_0; build_nlist(nlist_a_0, nlist_r_0, posi_cpy, nloc, rc, rc, nat_stt, ncell, ext_stt, ext_end, region, ncell); @@ -607,9 +607,9 @@ TEST_F(TestFormatNlistShortSel, gpu_rocm) { deepmd::convert_nlist_gpu_device(gpu_inlist, in_nlist, memory_dev, max_nbor_size); // format nlist - format_nbor_list_gpu_rocm(nlist_dev, posi_cpy_dev, atype_cpy_dev, gpu_inlist, - array_int_dev, array_longlong_dev, max_nbor_size, - nloc, nall, rc, sec_a); + format_nbor_list_gpu(nlist_dev, posi_cpy_dev, atype_cpy_dev, gpu_inlist, + array_int_dev, array_longlong_dev, max_nbor_size, nloc, + nall, rc, sec_a); deepmd::memcpy_device_to_host(nlist_dev, nlist); deepmd::delete_device_memory(nlist_dev); deepmd::delete_device_memory(posi_cpy_dev); @@ -624,7 +624,7 @@ TEST_F(TestFormatNlistShortSel, gpu_rocm) { } } -TEST_F(TestEncodingDecodingNborInfo, valid_nbor_info_gpu_rocm) { +TEST_F(TestEncodingDecodingNborInfo, valid_nbor_info_gpu) { int *valid_type_dev = NULL, *valid_index_dev = NULL, *out_type_dev = NULL, *out_index_dev = NULL; double* valid_dist_dev = NULL; @@ -639,7 +639,7 @@ TEST_F(TestEncodingDecodingNborInfo, valid_nbor_info_gpu_rocm) { deepmd::malloc_device_memory_sync(out_index_dev, out_index); deepmd::malloc_device_memory_sync(key_dev, key); - deepmd::test_encoding_decoding_nbor_info_gpu_rocm( + deepmd::test_encoding_decoding_nbor_info_gpu( key_dev, out_type_dev, out_index_dev, valid_type_dev, valid_dist_dev, valid_index_dev, size_of_array); diff --git a/source/lib/tests/test_gelu.cc b/source/lib/tests/test_gelu.cc index cdfe227809..e680567b9c 100644 --- a/source/lib/tests/test_gelu.cc +++ b/source/lib/tests/test_gelu.cc @@ -146,13 +146,13 @@ TEST_F(TestGelu, gelu_grad_grad_cpu) { } #if GOOGLE_CUDA -TEST_F(TestGelu, gelu_gpu_cuda) { +TEST_F(TestGelu, gelu_gpu) { std::vector gelu(nloc, 0.0); double *gelu_dev = NULL, *xx_dev = NULL; deepmd::malloc_device_memory_sync(gelu_dev, gelu); deepmd::malloc_device_memory_sync(xx_dev, xx); - deepmd::gelu_gpu_cuda(gelu_dev, xx_dev, nloc); + deepmd::gelu_gpu(gelu_dev, xx_dev, nloc); deepmd::memcpy_device_to_host(gelu_dev, gelu); deepmd::delete_device_memory(gelu_dev); deepmd::delete_device_memory(xx_dev); @@ -164,7 +164,7 @@ TEST_F(TestGelu, gelu_gpu_cuda) { } } -TEST_F(TestGelu, gelu_grad_gpu_cuda) { +TEST_F(TestGelu, gelu_grad_gpu) { std::vector dy(100, 1.0); std::vector gelu_grad(nloc, 0.0); @@ -172,7 +172,7 @@ TEST_F(TestGelu, gelu_grad_gpu_cuda) { deepmd::malloc_device_memory_sync(gelu_grad_dev, gelu_grad); deepmd::malloc_device_memory_sync(xx_dev, xx); deepmd::malloc_device_memory_sync(dy_dev, dy); - deepmd::gelu_grad_gpu_cuda(gelu_grad_dev, xx_dev, dy_dev, nloc); + deepmd::gelu_grad_gpu(gelu_grad_dev, xx_dev, dy_dev, nloc); deepmd::memcpy_device_to_host(gelu_grad_dev, gelu_grad); deepmd::delete_device_memory(gelu_grad_dev); deepmd::delete_device_memory(xx_dev); @@ -185,7 +185,7 @@ TEST_F(TestGelu, gelu_grad_gpu_cuda) { } } -TEST_F(TestGelu, gelu_grad_grad_gpu_cuda) { +TEST_F(TestGelu, gelu_grad_grad_gpu) { std::vector dy(100, 1.0); std::vector dy_2(100, 1.0); std::vector gelu_grad_grad(nloc, 0.0); @@ -196,8 +196,8 @@ TEST_F(TestGelu, gelu_grad_grad_gpu_cuda) { deepmd::malloc_device_memory_sync(xx_dev, xx); deepmd::malloc_device_memory_sync(dy_dev, dy); deepmd::malloc_device_memory_sync(dy_2_dev, dy_2); - deepmd::gelu_grad_grad_gpu_cuda(gelu_grad_grad_dev, xx_dev, dy_dev, - dy_2_dev, nloc); + deepmd::gelu_grad_grad_gpu(gelu_grad_grad_dev, xx_dev, dy_dev, + dy_2_dev, nloc); deepmd::memcpy_device_to_host(gelu_grad_grad_dev, gelu_grad_grad); deepmd::delete_device_memory(gelu_grad_grad_dev); deepmd::delete_device_memory(xx_dev); @@ -213,13 +213,13 @@ TEST_F(TestGelu, gelu_grad_grad_gpu_cuda) { #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM -TEST_F(TestGelu, gelu_gpu_rocm) { +TEST_F(TestGelu, gelu_gpu) { std::vector gelu(nloc, 0.0); double *gelu_dev = NULL, *xx_dev = NULL; deepmd::malloc_device_memory_sync(gelu_dev, gelu); deepmd::malloc_device_memory_sync(xx_dev, xx); - deepmd::gelu_gpu_rocm(gelu_dev, xx_dev, nloc); + deepmd::gelu_gpu(gelu_dev, xx_dev, nloc); deepmd::memcpy_device_to_host(gelu_dev, gelu); deepmd::delete_device_memory(gelu_dev); deepmd::delete_device_memory(xx_dev); @@ -231,7 +231,7 @@ TEST_F(TestGelu, gelu_gpu_rocm) { } } -TEST_F(TestGelu, gelu_grad_gpu_rocm) { +TEST_F(TestGelu, gelu_grad_gpu) { std::vector dy(100, 1.0); std::vector gelu_grad(nloc, 0.0); @@ -239,7 +239,7 @@ TEST_F(TestGelu, gelu_grad_gpu_rocm) { deepmd::malloc_device_memory_sync(gelu_grad_dev, gelu_grad); deepmd::malloc_device_memory_sync(xx_dev, xx); deepmd::malloc_device_memory_sync(dy_dev, dy); - deepmd::gelu_grad_gpu_rocm(gelu_grad_dev, xx_dev, dy_dev, nloc); + deepmd::gelu_grad_gpu(gelu_grad_dev, xx_dev, dy_dev, nloc); deepmd::memcpy_device_to_host(gelu_grad_dev, gelu_grad); deepmd::delete_device_memory(gelu_grad_dev); deepmd::delete_device_memory(xx_dev); @@ -252,7 +252,7 @@ TEST_F(TestGelu, gelu_grad_gpu_rocm) { } } -TEST_F(TestGelu, gelu_grad_grad_gpu_rocm) { +TEST_F(TestGelu, gelu_grad_grad_gpu) { std::vector dy(100, 1.0); std::vector dy_2(100, 1.0); std::vector gelu_grad_grad(nloc, 0.0); @@ -263,8 +263,8 @@ TEST_F(TestGelu, gelu_grad_grad_gpu_rocm) { deepmd::malloc_device_memory_sync(xx_dev, xx); deepmd::malloc_device_memory_sync(dy_dev, dy); deepmd::malloc_device_memory_sync(dy_2_dev, dy_2); - deepmd::gelu_grad_grad_gpu_rocm(gelu_grad_grad_dev, xx_dev, dy_dev, - dy_2_dev, nloc); + deepmd::gelu_grad_grad_gpu(gelu_grad_grad_dev, xx_dev, dy_dev, + dy_2_dev, nloc); deepmd::memcpy_device_to_host(gelu_grad_grad_dev, gelu_grad_grad); deepmd::delete_device_memory(gelu_grad_grad_dev); deepmd::delete_device_memory(xx_dev); diff --git a/source/lib/tests/test_neighbor_list.cc b/source/lib/tests/test_neighbor_list.cc index 3e85b0ee73..985f69b3f4 100644 --- a/source/lib/tests/test_neighbor_list.cc +++ b/source/lib/tests/test_neighbor_list.cc @@ -253,9 +253,8 @@ TEST_F(TestNeighborList, gpu) { deepmd::InputNlist nlist_dev(nloc, ilist_dev, numneigh_dev, firstneigh_dev); int max_list_size; - int ret = - deepmd::build_nlist_gpu_rocm(nlist_dev, &max_list_size, nlist_data_dev, - c_cpy_dev, nloc, nall, mem_size, rc); + int ret = deepmd::build_nlist_gpu(nlist_dev, &max_list_size, nlist_data_dev, + c_cpy_dev, nloc, nall, mem_size, rc); EXPECT_EQ(ret, 0); int* ilist = new int[nloc]; @@ -314,9 +313,8 @@ TEST_F(TestNeighborList, gpu_lessmem) { deepmd::InputNlist nlist_dev(nloc, ilist_dev, numneigh_dev, firstneigh_dev); int max_list_size; - int ret = - deepmd::build_nlist_gpu_rocm(nlist_dev, &max_list_size, nlist_data_dev, - c_cpy_dev, nloc, nall, mem_size, rc); + int ret = deepmd::build_nlist_gpu(nlist_dev, &max_list_size, nlist_data_dev, + c_cpy_dev, nloc, nall, mem_size, rc); EXPECT_EQ(ret, 1); deepmd::delete_device_memory(nlist_data_dev); diff --git a/source/lib/tests/test_prod_force_a.cc b/source/lib/tests/test_prod_force_a.cc index 2c3483f845..b51c97e421 100644 --- a/source/lib/tests/test_prod_force_a.cc +++ b/source/lib/tests/test_prod_force_a.cc @@ -134,7 +134,7 @@ TEST_F(TestProdForceA, cpu) { } #if GOOGLE_CUDA -TEST_F(TestProdForceA, gpu_cuda) { +TEST_F(TestProdForceA, gpu) { std::vector force(nframes * nall * 3, 0.0); int n_a_sel = nnei; @@ -146,8 +146,8 @@ TEST_F(TestProdForceA, gpu_cuda) { deepmd::malloc_device_memory_sync(net_deriv_dev, net_deriv); deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); - deepmd::prod_force_a_gpu_cuda(force_dev, net_deriv_dev, env_deriv_dev, - nlist_dev, nloc, nall, nnei, nframes); + deepmd::prod_force_a_gpu(force_dev, net_deriv_dev, env_deriv_dev, + nlist_dev, nloc, nall, nnei, nframes); deepmd::memcpy_device_to_host(force_dev, force); deepmd::delete_device_memory(nlist_dev); @@ -164,7 +164,7 @@ TEST_F(TestProdForceA, gpu_cuda) { #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM -TEST_F(TestProdForceA, gpu_rocm) { +TEST_F(TestProdForceA, gpu) { std::vector force(nframes * nall * 3, 0.0); int n_a_sel = nnei; @@ -176,8 +176,8 @@ TEST_F(TestProdForceA, gpu_rocm) { deepmd::malloc_device_memory_sync(net_deriv_dev, net_deriv); deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); - deepmd::prod_force_a_gpu_rocm(force_dev, net_deriv_dev, env_deriv_dev, - nlist_dev, nloc, nall, nnei, nframes); + deepmd::prod_force_a_gpu(force_dev, net_deriv_dev, env_deriv_dev, + nlist_dev, nloc, nall, nnei, nframes); deepmd::memcpy_device_to_host(force_dev, force); deepmd::delete_device_memory(nlist_dev); diff --git a/source/lib/tests/test_prod_force_grad_a.cc b/source/lib/tests/test_prod_force_grad_a.cc index 29cac24d1d..4694c4ac3b 100644 --- a/source/lib/tests/test_prod_force_grad_a.cc +++ b/source/lib/tests/test_prod_force_grad_a.cc @@ -153,8 +153,8 @@ TEST_F(TestProdForceGradA, gpu) { deepmd::malloc_device_memory_sync(grad_dev, grad); deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); deepmd::malloc_device_memory(grad_net_dev, nframes * nloc * ndescrpt); - deepmd::prod_force_grad_a_gpu_cuda( - grad_net_dev, grad_dev, env_deriv_dev, nlist_dev, nloc, nnei, nframes); + deepmd::prod_force_grad_a_gpu(grad_net_dev, grad_dev, env_deriv_dev, + nlist_dev, nloc, nnei, nframes); deepmd::memcpy_device_to_host(grad_net_dev, grad_net); deepmd::delete_device_memory(nlist_dev); deepmd::delete_device_memory(grad_dev); @@ -183,8 +183,8 @@ TEST_F(TestProdForceGradA, gpu) { deepmd::malloc_device_memory_sync(grad_dev, grad); deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); deepmd::malloc_device_memory(grad_net_dev, nframes * nloc * ndescrpt); - deepmd::prod_force_grad_a_gpu_rocm( - grad_net_dev, grad_dev, env_deriv_dev, nlist_dev, nloc, nnei, nframes); + deepmd::prod_force_grad_a_gpu(grad_net_dev, grad_dev, env_deriv_dev, + nlist_dev, nloc, nnei, nframes); deepmd::memcpy_device_to_host(grad_net_dev, grad_net); deepmd::delete_device_memory(nlist_dev); deepmd::delete_device_memory(grad_dev); diff --git a/source/lib/tests/test_prod_force_grad_r.cc b/source/lib/tests/test_prod_force_grad_r.cc index 2674e1607e..31f8b64982 100644 --- a/source/lib/tests/test_prod_force_grad_r.cc +++ b/source/lib/tests/test_prod_force_grad_r.cc @@ -127,8 +127,8 @@ TEST_F(TestProdForceGradR, gpu) { deepmd::malloc_device_memory_sync(grad_dev, grad); deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); deepmd::malloc_device_memory(grad_net_dev, nframes * nloc * ndescrpt); - deepmd::prod_force_grad_r_gpu_cuda( - grad_net_dev, grad_dev, env_deriv_dev, nlist_dev, nloc, nnei, nframes); + deepmd::prod_force_grad_r_gpu(grad_net_dev, grad_dev, env_deriv_dev, + nlist_dev, nloc, nnei, nframes); deepmd::memcpy_device_to_host(grad_net_dev, grad_net); deepmd::delete_device_memory(nlist_dev); deepmd::delete_device_memory(grad_dev); @@ -157,8 +157,8 @@ TEST_F(TestProdForceGradR, gpu) { deepmd::malloc_device_memory_sync(grad_dev, grad); deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); deepmd::malloc_device_memory(grad_net_dev, nframes * nloc * ndescrpt); - deepmd::prod_force_grad_r_gpu_rocm( - grad_net_dev, grad_dev, env_deriv_dev, nlist_dev, nloc, nnei, nframes); + deepmd::prod_force_grad_r_gpu(grad_net_dev, grad_dev, env_deriv_dev, + nlist_dev, nloc, nnei, nframes); deepmd::memcpy_device_to_host(grad_net_dev, grad_net); deepmd::delete_device_memory(nlist_dev); deepmd::delete_device_memory(grad_dev); diff --git a/source/lib/tests/test_prod_force_r.cc b/source/lib/tests/test_prod_force_r.cc index 8920e76b12..7f46aa3244 100644 --- a/source/lib/tests/test_prod_force_r.cc +++ b/source/lib/tests/test_prod_force_r.cc @@ -131,7 +131,7 @@ TEST_F(TestProdForceR, cpu) { } #if GOOGLE_CUDA -TEST_F(TestProdForceR, gpu_cuda) { +TEST_F(TestProdForceR, gpu) { std::vector force(nframes * nall * 3, 0.0); int n_a_sel = nnei; @@ -143,8 +143,8 @@ TEST_F(TestProdForceR, gpu_cuda) { deepmd::malloc_device_memory_sync(net_deriv_dev, net_deriv); deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); - deepmd::prod_force_r_gpu_cuda(force_dev, net_deriv_dev, env_deriv_dev, - nlist_dev, nloc, nall, nnei, nframes); + deepmd::prod_force_r_gpu(force_dev, net_deriv_dev, env_deriv_dev, + nlist_dev, nloc, nall, nnei, nframes); deepmd::memcpy_device_to_host(force_dev, force); deepmd::delete_device_memory(nlist_dev); @@ -161,7 +161,7 @@ TEST_F(TestProdForceR, gpu_cuda) { #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM -TEST_F(TestProdForceR, gpu_rocm) { +TEST_F(TestProdForceR, gpu) { std::vector force(nframes * nall * 3, 0.0); int n_a_sel = nnei; @@ -173,8 +173,8 @@ TEST_F(TestProdForceR, gpu_rocm) { deepmd::malloc_device_memory_sync(net_deriv_dev, net_deriv); deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); - deepmd::prod_force_r_gpu_rocm(force_dev, net_deriv_dev, env_deriv_dev, - nlist_dev, nloc, nall, nnei, nframes); + deepmd::prod_force_r_gpu(force_dev, net_deriv_dev, env_deriv_dev, + nlist_dev, nloc, nall, nnei, nframes); deepmd::memcpy_device_to_host(force_dev, force); deepmd::delete_device_memory(nlist_dev); diff --git a/source/lib/tests/test_prod_virial_a.cc b/source/lib/tests/test_prod_virial_a.cc index 43244460e6..054a152869 100644 --- a/source/lib/tests/test_prod_virial_a.cc +++ b/source/lib/tests/test_prod_virial_a.cc @@ -179,7 +179,7 @@ TEST_F(TestProdVirialA, cpu) { } #if GOOGLE_CUDA -TEST_F(TestProdVirialA, gpu_cuda) { +TEST_F(TestProdVirialA, gpu) { std::vector virial(9, 0.0); std::vector atom_virial(nall * 9, 0.0); int n_a_sel = nnei; @@ -195,9 +195,9 @@ TEST_F(TestProdVirialA, gpu_cuda) { deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); deepmd::malloc_device_memory_sync(rij_dev, rij); - deepmd::prod_virial_a_gpu_cuda(virial_dev, atom_virial_dev, - net_deriv_dev, env_deriv_dev, rij_dev, - nlist_dev, nloc, nall, nnei); + deepmd::prod_virial_a_gpu(virial_dev, atom_virial_dev, net_deriv_dev, + env_deriv_dev, rij_dev, nlist_dev, nloc, + nall, nnei); deepmd::memcpy_device_to_host(virial_dev, virial); deepmd::memcpy_device_to_host(atom_virial_dev, atom_virial); @@ -228,7 +228,7 @@ TEST_F(TestProdVirialA, gpu_cuda) { #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM -TEST_F(TestProdVirialA, gpu_rocm) { +TEST_F(TestProdVirialA, gpu) { std::vector virial(9, 0.0); std::vector atom_virial(nall * 9, 0.0); int n_a_sel = nnei; @@ -244,9 +244,9 @@ TEST_F(TestProdVirialA, gpu_rocm) { deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); deepmd::malloc_device_memory_sync(rij_dev, rij); - deepmd::prod_virial_a_gpu_rocm(virial_dev, atom_virial_dev, - net_deriv_dev, env_deriv_dev, rij_dev, - nlist_dev, nloc, nall, nnei); + deepmd::prod_virial_a_gpu(virial_dev, atom_virial_dev, net_deriv_dev, + env_deriv_dev, rij_dev, nlist_dev, nloc, + nall, nnei); deepmd::memcpy_device_to_host(virial_dev, virial); deepmd::memcpy_device_to_host(atom_virial_dev, atom_virial); diff --git a/source/lib/tests/test_prod_virial_grad_a.cc b/source/lib/tests/test_prod_virial_grad_a.cc index 044d5a07d6..98a08ce5c3 100644 --- a/source/lib/tests/test_prod_virial_grad_a.cc +++ b/source/lib/tests/test_prod_virial_grad_a.cc @@ -150,8 +150,8 @@ TEST_F(TestProdVirialGradA, gpu) { deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); deepmd::malloc_device_memory_sync(rij_dev, rij); deepmd::malloc_device_memory(grad_net_dev, nloc * ndescrpt); - deepmd::prod_virial_grad_a_gpu_cuda( - grad_net_dev, grad_dev, env_deriv_dev, rij_dev, nlist_dev, nloc, nnei); + deepmd::prod_virial_grad_a_gpu(grad_net_dev, grad_dev, env_deriv_dev, + rij_dev, nlist_dev, nloc, nnei); deepmd::memcpy_device_to_host(grad_net_dev, grad_net); deepmd::delete_device_memory(nlist_dev); deepmd::delete_device_memory(grad_dev); @@ -184,8 +184,8 @@ TEST_F(TestProdVirialGradA, gpu) { deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); deepmd::malloc_device_memory_sync(rij_dev, rij); deepmd::malloc_device_memory(grad_net_dev, nloc * ndescrpt); - deepmd::prod_virial_grad_a_gpu_rocm( - grad_net_dev, grad_dev, env_deriv_dev, rij_dev, nlist_dev, nloc, nnei); + deepmd::prod_virial_grad_a_gpu(grad_net_dev, grad_dev, env_deriv_dev, + rij_dev, nlist_dev, nloc, nnei); deepmd::memcpy_device_to_host(grad_net_dev, grad_net); deepmd::delete_device_memory(nlist_dev); deepmd::delete_device_memory(grad_dev); diff --git a/source/lib/tests/test_prod_virial_grad_r.cc b/source/lib/tests/test_prod_virial_grad_r.cc index 34e940c73c..a0c7dad0db 100644 --- a/source/lib/tests/test_prod_virial_grad_r.cc +++ b/source/lib/tests/test_prod_virial_grad_r.cc @@ -124,8 +124,8 @@ TEST_F(TestProdVirialGradR, gpu) { deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); deepmd::malloc_device_memory_sync(rij_dev, rij); deepmd::malloc_device_memory(grad_net_dev, nloc * ndescrpt); - deepmd::prod_virial_grad_r_gpu_cuda( - grad_net_dev, grad_dev, env_deriv_dev, rij_dev, nlist_dev, nloc, nnei); + deepmd::prod_virial_grad_r_gpu(grad_net_dev, grad_dev, env_deriv_dev, + rij_dev, nlist_dev, nloc, nnei); deepmd::memcpy_device_to_host(grad_net_dev, grad_net); deepmd::delete_device_memory(nlist_dev); deepmd::delete_device_memory(grad_dev); @@ -158,8 +158,8 @@ TEST_F(TestProdVirialGradR, gpu) { deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); deepmd::malloc_device_memory_sync(rij_dev, rij); deepmd::malloc_device_memory(grad_net_dev, nloc * ndescrpt); - deepmd::prod_virial_grad_r_gpu_rocm( - grad_net_dev, grad_dev, env_deriv_dev, rij_dev, nlist_dev, nloc, nnei); + deepmd::prod_virial_grad_r_gpu(grad_net_dev, grad_dev, env_deriv_dev, + rij_dev, nlist_dev, nloc, nnei); deepmd::memcpy_device_to_host(grad_net_dev, grad_net); deepmd::delete_device_memory(nlist_dev); deepmd::delete_device_memory(grad_dev); diff --git a/source/lib/tests/test_prod_virial_r.cc b/source/lib/tests/test_prod_virial_r.cc index e38ed1da7e..f1077b6dbc 100644 --- a/source/lib/tests/test_prod_virial_r.cc +++ b/source/lib/tests/test_prod_virial_r.cc @@ -179,7 +179,7 @@ TEST_F(TestProdVirialR, cpu) { } #if GOOGLE_CUDA -TEST_F(TestProdVirialR, gpu_cuda) { +TEST_F(TestProdVirialR, gpu) { std::vector virial(9, 0.0); std::vector atom_virial(nall * 9, 0.0); int n_a_sel = nnei; @@ -195,9 +195,9 @@ TEST_F(TestProdVirialR, gpu_cuda) { deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); deepmd::malloc_device_memory_sync(rij_dev, rij); - deepmd::prod_virial_r_gpu_cuda(virial_dev, atom_virial_dev, - net_deriv_dev, env_deriv_dev, rij_dev, - nlist_dev, nloc, nall, nnei); + deepmd::prod_virial_r_gpu(virial_dev, atom_virial_dev, net_deriv_dev, + env_deriv_dev, rij_dev, nlist_dev, nloc, + nall, nnei); deepmd::memcpy_device_to_host(virial_dev, virial); deepmd::memcpy_device_to_host(atom_virial_dev, atom_virial); @@ -228,7 +228,7 @@ TEST_F(TestProdVirialR, gpu_cuda) { #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM -TEST_F(TestProdVirialR, gpu_rocm) { +TEST_F(TestProdVirialR, gpu) { std::vector virial(9, 0.0); std::vector atom_virial(nall * 9, 0.0); int n_a_sel = nnei; @@ -244,9 +244,9 @@ TEST_F(TestProdVirialR, gpu_rocm) { deepmd::malloc_device_memory_sync(env_deriv_dev, env_deriv); deepmd::malloc_device_memory_sync(rij_dev, rij); - deepmd::prod_virial_r_gpu_rocm(virial_dev, atom_virial_dev, - net_deriv_dev, env_deriv_dev, rij_dev, - nlist_dev, nloc, nall, nnei); + deepmd::prod_virial_r_gpu(virial_dev, atom_virial_dev, net_deriv_dev, + env_deriv_dev, rij_dev, nlist_dev, nloc, + nall, nnei); deepmd::memcpy_device_to_host(virial_dev, virial); deepmd::memcpy_device_to_host(atom_virial_dev, atom_virial); diff --git a/source/lib/tests/test_simulation_region.cc b/source/lib/tests/test_simulation_region.cc index 467564d44b..6f1db46bb0 100644 --- a/source/lib/tests/test_simulation_region.cc +++ b/source/lib/tests/test_simulation_region.cc @@ -180,14 +180,14 @@ TEST_F(TestRegion, gpu) { double vol[1]; double* vol_dev = NULL; deepmd::malloc_device_memory(vol_dev, 1); - deepmd::volume_gpu_rocm(vol_dev, region_dev); + deepmd::volume_gpu(vol_dev, region_dev); deepmd::memcpy_device_to_host(vol_dev, vol, 1); EXPECT_LT(fabs(vol[0] - expected_vol), 1e-10); // check conversion between phys and inter coords. double ri[3]; double* ri_dev = NULL; deepmd::malloc_device_memory(ri_dev, 3); - deepmd::convert_to_inter_gpu_rocm(ri_dev, region_dev, ref_rp_dev); + deepmd::convert_to_inter_gpu(ri_dev, region_dev, ref_rp_dev); deepmd::memcpy_device_to_host(ri_dev, ri, 3); for (int ii = 0; ii < 3; ++ii) { EXPECT_LT(fabs(ri[ii] - ref_ri[ii]), 1e-10); @@ -195,7 +195,7 @@ TEST_F(TestRegion, gpu) { double rp2[3]; double* rp2_dev = NULL; deepmd::malloc_device_memory(rp2_dev, 3); - deepmd::convert_to_phys_gpu_rocm(rp2_dev, region_dev, ri_dev); + deepmd::convert_to_phys_gpu(rp2_dev, region_dev, ri_dev); deepmd::memcpy_device_to_host(rp2_dev, rp2, 3); for (int ii = 0; ii < 3; ++ii) { EXPECT_LT(fabs(rp2[ii] - ref_rp[ii]), 1e-10); @@ -203,7 +203,7 @@ TEST_F(TestRegion, gpu) { double rp[3]; double* rp_dev = NULL; deepmd::malloc_device_memory(rp_dev, 3); - deepmd::convert_to_phys_gpu_rocm(rp_dev, region_dev, ref_ri_dev); + deepmd::convert_to_phys_gpu(rp_dev, region_dev, ref_ri_dev); deepmd::memcpy_device_to_host(rp_dev, rp, 3); for (int ii = 0; ii < 3; ++ii) { EXPECT_LT(fabs(rp[ii] - ref_rp[ii]), 1e-10); @@ -211,7 +211,7 @@ TEST_F(TestRegion, gpu) { double ri2[3]; double* ri2_dev = NULL; deepmd::malloc_device_memory(ri2_dev, 3); - deepmd::convert_to_inter_gpu_rocm(ri2_dev, region_dev, rp_dev); + deepmd::convert_to_inter_gpu(ri2_dev, region_dev, rp_dev); deepmd::memcpy_device_to_host(ri2_dev, ri2, 3); for (int ii = 0; ii < 3; ++ii) { EXPECT_LT(fabs(ri2[ii] - ref_ri[ii]), 1e-10); diff --git a/source/lib/tests/test_tabulate_se_a.cc b/source/lib/tests/test_tabulate_se_a.cc index 4c87a24566..6f76f9c2ee 100644 --- a/source/lib/tests/test_tabulate_se_a.cc +++ b/source/lib/tests/test_tabulate_se_a.cc @@ -756,7 +756,7 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_cpu) { } #if GOOGLE_CUDA -TEST_F(TestTabulateSeA, tabulate_fusion_se_a_gpu_cuda) { +TEST_F(TestTabulateSeA, tabulate_fusion_se_a_gpu) { std::vector xyz_scatter(nloc * nnei * last_layer_size, 0.0); double *xyz_scatter_dev = NULL, *table_dev = NULL, *em_x_dev = NULL, @@ -765,9 +765,9 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_gpu_cuda) { deepmd::malloc_device_memory_sync(table_dev, table); deepmd::malloc_device_memory_sync(em_x_dev, em_x); deepmd::malloc_device_memory_sync(em_dev, em); - deepmd::tabulate_fusion_se_a_gpu_cuda( - xyz_scatter_dev, table_dev, &info[0], em_x_dev, em_dev, nullptr, nloc, - nnei, last_layer_size); + deepmd::tabulate_fusion_se_a_gpu(xyz_scatter_dev, table_dev, &info[0], + em_x_dev, em_dev, nullptr, nloc, + nnei, last_layer_size); deepmd::memcpy_device_to_host(xyz_scatter_dev, xyz_scatter); EXPECT_EQ(xyz_scatter.size(), nloc * nnei * last_layer_size); @@ -779,9 +779,9 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_gpu_cuda) { double *two_embed_dev = nullptr; deepmd::malloc_device_memory_sync(two_embed_dev, two_embed); deepmd::malloc_device_memory_sync(xyz_scatter_dev, xyz_scatter); - deepmd::tabulate_fusion_se_a_gpu_cuda( - xyz_scatter_dev, table_dev, &info[0], em_x_dev, em_dev, two_embed_dev, - nloc, nnei, last_layer_size); + deepmd::tabulate_fusion_se_a_gpu(xyz_scatter_dev, table_dev, &info[0], + em_x_dev, em_dev, two_embed_dev, + nloc, nnei, last_layer_size); deepmd::memcpy_device_to_host(xyz_scatter_dev, xyz_scatter); EXPECT_EQ(xyz_scatter.size(), nloc * nnei * last_layer_size); @@ -798,7 +798,7 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_gpu_cuda) { deepmd::delete_device_memory(two_embed_dev); } -TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_gpu_cuda) { +TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_gpu) { std::vector dy_dem_x(em_x.size(), 0.0); std::vector dy_dem(em.size(), 0.0); std::vector dy(nloc * nnei * last_layer_size, 1.0); @@ -811,7 +811,7 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_gpu_cuda) { deepmd::malloc_device_memory_sync(em_x_dev, em_x); deepmd::malloc_device_memory_sync(em_dev, em); deepmd::malloc_device_memory_sync(dy_dev, dy); - deepmd::tabulate_fusion_se_a_grad_gpu_cuda( + deepmd::tabulate_fusion_se_a_grad_gpu( dy_dem_x_dev, dy_dem_dev, table_dev, &info[0], em_x_dev, em_dev, nullptr, dy_dev, nloc, nnei, last_layer_size); deepmd::memcpy_device_to_host(dy_dem_x_dev, dy_dem_x); @@ -832,7 +832,7 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_gpu_cuda) { deepmd::malloc_device_memory_sync(two_embed_dev, two_embed); deepmd::malloc_device_memory_sync(dy_dem_x_dev, dy_dem_x); deepmd::malloc_device_memory_sync(dy_dem_dev, dy_dem); - deepmd::tabulate_fusion_se_a_grad_gpu_cuda( + deepmd::tabulate_fusion_se_a_grad_gpu( dy_dem_x_dev, dy_dem_dev, table_dev, &info[0], em_x_dev, em_dev, two_embed_dev, dy_dev, nloc, nnei, last_layer_size); deepmd::memcpy_device_to_host(dy_dem_x_dev, dy_dem_x); @@ -855,7 +855,7 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_gpu_cuda) { #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM -TEST_F(TestTabulateSeA, tabulate_fusion_se_a_gpu_rocm) { +TEST_F(TestTabulateSeA, tabulate_fusion_se_a_gpu) { std::vector xyz_scatter(nloc * nnei * last_layer_size, 0.0); double *xyz_scatter_dev = NULL, *table_dev = NULL, *em_x_dev = NULL, @@ -864,9 +864,9 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_gpu_rocm) { deepmd::malloc_device_memory_sync(table_dev, table); deepmd::malloc_device_memory_sync(em_x_dev, em_x); deepmd::malloc_device_memory_sync(em_dev, em); - deepmd::tabulate_fusion_se_a_gpu_rocm( - xyz_scatter_dev, table_dev, &info[0], em_x_dev, em_dev, nullptr, nloc, - nnei, last_layer_size); + deepmd::tabulate_fusion_se_a_gpu(xyz_scatter_dev, table_dev, &info[0], + em_x_dev, em_dev, nullptr, nloc, + nnei, last_layer_size); deepmd::memcpy_device_to_host(xyz_scatter_dev, xyz_scatter); EXPECT_EQ(xyz_scatter.size(), nloc * nnei * last_layer_size); @@ -878,9 +878,9 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_gpu_rocm) { double *two_embed_dev = nullptr; deepmd::malloc_device_memory_sync(two_embed_dev, two_embed); deepmd::malloc_device_memory_sync(xyz_scatter_dev, xyz_scatter); - deepmd::tabulate_fusion_se_a_gpu_rocm( - xyz_scatter_dev, table_dev, &info[0], em_x_dev, em_dev, two_embed_dev, - nloc, nnei, last_layer_size); + deepmd::tabulate_fusion_se_a_gpu(xyz_scatter_dev, table_dev, &info[0], + em_x_dev, em_dev, two_embed_dev, + nloc, nnei, last_layer_size); deepmd::memcpy_device_to_host(xyz_scatter_dev, xyz_scatter); EXPECT_EQ(xyz_scatter.size(), nloc * nnei * last_layer_size); @@ -897,7 +897,7 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_gpu_rocm) { deepmd::delete_device_memory(two_embed_dev); } -TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_gpu_rocm) { +TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_gpu) { std::vector dy_dem_x(em_x.size(), 0.0); std::vector dy_dem(em.size(), 0.0); std::vector dy(nloc * nnei * last_layer_size, 1.0); @@ -910,7 +910,7 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_gpu_rocm) { deepmd::malloc_device_memory_sync(em_x_dev, em_x); deepmd::malloc_device_memory_sync(em_dev, em); deepmd::malloc_device_memory_sync(dy_dev, dy); - deepmd::tabulate_fusion_se_a_grad_gpu_rocm( + deepmd::tabulate_fusion_se_a_grad_gpu( dy_dem_x_dev, dy_dem_dev, table_dev, &info[0], em_x_dev, em_dev, nullptr, dy_dev, nloc, nnei, last_layer_size); deepmd::memcpy_device_to_host(dy_dem_x_dev, dy_dem_x); @@ -931,7 +931,7 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_gpu_rocm) { deepmd::malloc_device_memory_sync(two_embed_dev, two_embed); deepmd::malloc_device_memory_sync(dy_dem_x_dev, dy_dem_x); deepmd::malloc_device_memory_sync(dy_dem_dev, dy_dem); - deepmd::tabulate_fusion_se_a_grad_gpu_rocm( + deepmd::tabulate_fusion_se_a_grad_gpu( dy_dem_x_dev, dy_dem_dev, table_dev, &info[0], em_x_dev, em_dev, two_embed_dev, dy_dev, nloc, nnei, last_layer_size); deepmd::memcpy_device_to_host(dy_dem_x_dev, dy_dem_x); diff --git a/source/lib/tests/test_tabulate_se_r.cc b/source/lib/tests/test_tabulate_se_r.cc index 95ccfdf59e..5097451aab 100644 --- a/source/lib/tests/test_tabulate_se_r.cc +++ b/source/lib/tests/test_tabulate_se_r.cc @@ -607,16 +607,15 @@ TEST_F(TestTabulateSeR, tabulate_fusion_se_r_grad_cpu) { } #if GOOGLE_CUDA -TEST_F(TestTabulateSeR, tabulate_fusion_se_r_gpu_cuda) { +TEST_F(TestTabulateSeR, tabulate_fusion_se_r_gpu) { std::vector xyz_scatter(nloc * nnei * last_layer_size, 0.0); double *xyz_scatter_dev = NULL, *table_dev = NULL, *em_dev = NULL; deepmd::malloc_device_memory_sync(xyz_scatter_dev, xyz_scatter); deepmd::malloc_device_memory_sync(table_dev, table); deepmd::malloc_device_memory_sync(em_dev, em); - deepmd::tabulate_fusion_se_r_gpu_cuda(xyz_scatter_dev, table_dev, - &info[0], em_dev, nloc, nnei, - last_layer_size); + deepmd::tabulate_fusion_se_r_gpu(xyz_scatter_dev, table_dev, &info[0], + em_dev, nloc, nnei, last_layer_size); deepmd::memcpy_device_to_host(xyz_scatter_dev, xyz_scatter); deepmd::delete_device_memory(xyz_scatter_dev); deepmd::delete_device_memory(table_dev); @@ -629,7 +628,7 @@ TEST_F(TestTabulateSeR, tabulate_fusion_se_r_gpu_cuda) { } } -TEST_F(TestTabulateSeR, tabulate_fusion_se_r_grad_gpu_cuda) { +TEST_F(TestTabulateSeR, tabulate_fusion_se_r_grad_gpu) { std::vector dy_dem(em.size(), 0.0); std::vector dy(nloc * nnei * last_layer_size, 1.0); @@ -638,9 +637,9 @@ TEST_F(TestTabulateSeR, tabulate_fusion_se_r_grad_gpu_cuda) { deepmd::malloc_device_memory_sync(table_dev, table); deepmd::malloc_device_memory_sync(em_dev, em); deepmd::malloc_device_memory_sync(dy_dev, dy); - deepmd::tabulate_fusion_se_r_grad_gpu_cuda( - dy_dem_dev, table_dev, &info[0], em_dev, dy_dev, nloc, nnei, - last_layer_size); + deepmd::tabulate_fusion_se_r_grad_gpu(dy_dem_dev, table_dev, &info[0], + em_dev, dy_dev, nloc, nnei, + last_layer_size); deepmd::memcpy_device_to_host(dy_dem_dev, dy_dem); deepmd::delete_device_memory(dy_dem_dev); deepmd::delete_device_memory(table_dev); @@ -657,16 +656,15 @@ TEST_F(TestTabulateSeR, tabulate_fusion_se_r_grad_gpu_cuda) { #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM -TEST_F(TestTabulateSeR, tabulate_fusion_se_r_gpu_rocm) { +TEST_F(TestTabulateSeR, tabulate_fusion_se_r_gpu) { std::vector xyz_scatter(nloc * nnei * last_layer_size, 0.0); double *xyz_scatter_dev = NULL, *table_dev = NULL, *em_dev = NULL; deepmd::malloc_device_memory_sync(xyz_scatter_dev, xyz_scatter); deepmd::malloc_device_memory_sync(table_dev, table); deepmd::malloc_device_memory_sync(em_dev, em); - deepmd::tabulate_fusion_se_r_gpu_rocm(xyz_scatter_dev, table_dev, - &info[0], em_dev, nloc, nnei, - last_layer_size); + deepmd::tabulate_fusion_se_r_gpu(xyz_scatter_dev, table_dev, &info[0], + em_dev, nloc, nnei, last_layer_size); deepmd::memcpy_device_to_host(xyz_scatter_dev, xyz_scatter); deepmd::delete_device_memory(xyz_scatter_dev); deepmd::delete_device_memory(table_dev); @@ -679,7 +677,7 @@ TEST_F(TestTabulateSeR, tabulate_fusion_se_r_gpu_rocm) { } } -TEST_F(TestTabulateSeR, tabulate_fusion_se_r_grad_gpu_rocm) { +TEST_F(TestTabulateSeR, tabulate_fusion_se_r_grad_gpu) { std::vector dy_dem(em.size(), 0.0); std::vector dy(nloc * nnei * last_layer_size, 1.0); @@ -688,9 +686,9 @@ TEST_F(TestTabulateSeR, tabulate_fusion_se_r_grad_gpu_rocm) { deepmd::malloc_device_memory_sync(table_dev, table); deepmd::malloc_device_memory_sync(em_dev, em); deepmd::malloc_device_memory_sync(dy_dev, dy); - deepmd::tabulate_fusion_se_r_grad_gpu_rocm( - dy_dem_dev, table_dev, &info[0], em_dev, dy_dev, nloc, nnei, - last_layer_size); + deepmd::tabulate_fusion_se_r_grad_gpu(dy_dem_dev, table_dev, &info[0], + em_dev, dy_dev, nloc, nnei, + last_layer_size); deepmd::memcpy_device_to_host(dy_dem_dev, dy_dem); deepmd::delete_device_memory(dy_dem_dev); deepmd::delete_device_memory(table_dev); diff --git a/source/lib/tests/test_tabulate_se_t.cc b/source/lib/tests/test_tabulate_se_t.cc index 522eef48cd..ffb1b41220 100644 --- a/source/lib/tests/test_tabulate_se_t.cc +++ b/source/lib/tests/test_tabulate_se_t.cc @@ -5261,7 +5261,7 @@ TEST_F(TestTabulateSeT, tabulate_fusion_se_t_grad_cpu) { } #if GOOGLE_CUDA -TEST_F(TestTabulateSeT, tabulate_fusion_se_t_gpu_cuda) { +TEST_F(TestTabulateSeT, tabulate_fusion_se_t_gpu) { std::vector xyz_scatter(nloc * last_layer_size, 0.0); double *xyz_scatter_dev = NULL, *table_dev = NULL, *em_x_dev = NULL, *em_dev = NULL; @@ -5269,9 +5269,9 @@ TEST_F(TestTabulateSeT, tabulate_fusion_se_t_gpu_cuda) { deepmd::malloc_device_memory_sync(table_dev, table); deepmd::malloc_device_memory_sync(em_x_dev, em_x); deepmd::malloc_device_memory_sync(em_dev, em); - deepmd::tabulate_fusion_se_t_gpu_cuda( - xyz_scatter_dev, table_dev, &info[0], em_x_dev, em_dev, nloc, nnei_i, - nnei_j, last_layer_size); + deepmd::tabulate_fusion_se_t_gpu(xyz_scatter_dev, table_dev, &info[0], + em_x_dev, em_dev, nloc, nnei_i, + nnei_j, last_layer_size); // deepmd::tabulate_fusion_se_t_cpu(&xyz_scatter[0], &table[0], // &info[0], &em_x[0], &em[0], nloc, nnei_i, nnei_j, last_layer_size); deepmd::memcpy_device_to_host(xyz_scatter_dev, xyz_scatter); @@ -5287,7 +5287,7 @@ TEST_F(TestTabulateSeT, tabulate_fusion_se_t_gpu_cuda) { } } -TEST_F(TestTabulateSeT, tabulate_fusion_se_a_grad_gpu_cuda) { +TEST_F(TestTabulateSeT, tabulate_fusion_se_a_grad_gpu) { std::vector dy_dem_x(em_x.size(), 0.0); std::vector dy_dem(em.size(), 0.0); @@ -5299,7 +5299,7 @@ TEST_F(TestTabulateSeT, tabulate_fusion_se_a_grad_gpu_cuda) { deepmd::malloc_device_memory_sync(em_x_dev, em_x); deepmd::malloc_device_memory_sync(em_dev, em); deepmd::malloc_device_memory_sync(dy_dev, dy); - deepmd::tabulate_fusion_se_t_grad_gpu_cuda( + deepmd::tabulate_fusion_se_t_grad_gpu( dy_dem_x_dev, dy_dem_dev, table_dev, &info[0], em_x_dev, em_dev, dy_dev, nloc, nnei_i, nnei_j, last_layer_size); deepmd::memcpy_device_to_host(dy_dem_x_dev, dy_dem_x); @@ -5325,7 +5325,7 @@ TEST_F(TestTabulateSeT, tabulate_fusion_se_a_grad_gpu_cuda) { #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM -TEST_F(TestTabulateSeT, tabulate_fusion_se_t_gpu_rocm) { +TEST_F(TestTabulateSeT, tabulate_fusion_se_t_gpu) { std::vector xyz_scatter(nloc * last_layer_size, 0.0); double *xyz_scatter_dev = NULL, *table_dev = NULL, *em_x_dev = NULL, *em_dev = NULL; @@ -5333,9 +5333,9 @@ TEST_F(TestTabulateSeT, tabulate_fusion_se_t_gpu_rocm) { deepmd::malloc_device_memory_sync(table_dev, table); deepmd::malloc_device_memory_sync(em_x_dev, em_x); deepmd::malloc_device_memory_sync(em_dev, em); - deepmd::tabulate_fusion_se_t_gpu_rocm( - xyz_scatter_dev, table_dev, &info[0], em_x_dev, em_dev, nloc, nnei_i, - nnei_j, last_layer_size); + deepmd::tabulate_fusion_se_t_gpu(xyz_scatter_dev, table_dev, &info[0], + em_x_dev, em_dev, nloc, nnei_i, + nnei_j, last_layer_size); deepmd::memcpy_device_to_host(xyz_scatter_dev, xyz_scatter); deepmd::delete_device_memory(xyz_scatter_dev); deepmd::delete_device_memory(table_dev); @@ -5349,7 +5349,7 @@ TEST_F(TestTabulateSeT, tabulate_fusion_se_t_gpu_rocm) { } } -TEST_F(TestTabulateSeT, tabulate_fusion_se_t_grad_gpu_rocm) { +TEST_F(TestTabulateSeT, tabulate_fusion_se_t_grad_gpu) { std::vector dy_dem_x(em_x.size(), 0.0); std::vector dy_dem(em.size(), 0.0); @@ -5361,7 +5361,7 @@ TEST_F(TestTabulateSeT, tabulate_fusion_se_t_grad_gpu_rocm) { deepmd::malloc_device_memory_sync(em_x_dev, em_x); deepmd::malloc_device_memory_sync(em_dev, em); deepmd::malloc_device_memory_sync(dy_dev, dy); - deepmd::tabulate_fusion_se_t_grad_gpu_rocm( + deepmd::tabulate_fusion_se_t_grad_gpu( dy_dem_x_dev, dy_dem_dev, table_dev, &info[0], em_x_dev, em_dev, dy_dev, nloc, nnei_i, nnei_j, last_layer_size); deepmd::memcpy_device_to_host(dy_dem_x_dev, dy_dem_x); diff --git a/source/op/gelu_multi_device.cc b/source/op/gelu_multi_device.cc index 92c3968b9c..ccc95aa0e4 100644 --- a/source/op/gelu_multi_device.cc +++ b/source/op/gelu_multi_device.cc @@ -65,11 +65,11 @@ class GeluOp : public OpKernel { if (device == "GPU") { #if GOOGLE_CUDA - deepmd::gelu_gpu_cuda(out, x, size); + deepmd::gelu_gpu(out, x, size); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM - deepmd::gelu_gpu_rocm(out, x, size); + deepmd::gelu_gpu(out, x, size); #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::gelu_cpu(out, x, size); @@ -109,11 +109,11 @@ class GeluGradOp : public OpKernel { if (device == "GPU") { #if GOOGLE_CUDA - deepmd::gelu_grad_gpu_cuda(out, x, dy, size); + deepmd::gelu_grad_gpu(out, x, dy, size); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM - deepmd::gelu_grad_gpu_rocm(out, x, dy, size); + deepmd::gelu_grad_gpu(out, x, dy, size); #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::gelu_grad_cpu(out, x, dy, size); @@ -155,11 +155,11 @@ class GeluGradGradOp : public OpKernel { if (device == "GPU") { #if GOOGLE_CUDA - deepmd::gelu_grad_grad_gpu_cuda(out, x, dy, dy_2, size); + deepmd::gelu_grad_grad_gpu(out, x, dy, dy_2, size); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM - deepmd::gelu_grad_grad_gpu_rocm(out, x, dy, dy_2, size); + deepmd::gelu_grad_grad_gpu(out, x, dy, dy_2, size); #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::gelu_grad_grad_cpu(out, x, dy, dy_2, size); diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc index 73a0d3c4c1..a90f81d079 100644 --- a/source/op/prod_env_mat_multi_device.cc +++ b/source/op/prod_env_mat_multi_device.cc @@ -320,76 +320,76 @@ static void _prepare_coord_nlist_gpu(OpKernelContext* context, #if TENSORFLOW_USE_ROCM template -static int _norm_copy_coord_gpu_rocm(OpKernelContext* context, +static int _norm_copy_coord_gpu(OpKernelContext* context, + Tensor* tensor_list, + FPTYPE*& coord_cpy, + int*& type_cpy, + int*& idx_mapping, + int& nall, + int& mem_cpy, + const FPTYPE* coord, + const FPTYPE* box, + const int* type, + const int& nloc, + const int& max_cpy_trial, + const float& rcut_r); + +template +static int _build_nlist_gpu(OpKernelContext* context, + Tensor* tensor_list, + int*& ilist, + int*& numneigh, + int**& firstneigh, + int*& jlist, + int& max_nnei, + int& mem_nnei, + const FPTYPE* coord, + const int& nloc, + const int& new_nall, + const int& max_nnei_trial, + const float& rcut_r); + +static void _map_nlist_gpu(int* nlist, + const int* idx_mapping, + const int& nloc, + const int& nnei); + +static void _map_nei_info_gpu(int* nlist, + int* ntype, + bool* nmask, + const int* type, + const int* idx_mapping, + const int& nloc, + const int& nnei, + const int& ntypes, + const bool& b_nlist_map); + +template +static void _prepare_coord_nlist_gpu(OpKernelContext* context, Tensor* tensor_list, + FPTYPE const** coord, FPTYPE*& coord_cpy, + int const** type, int*& type_cpy, int*& idx_mapping, - int& nall, + deepmd::InputNlist& inlist, + int*& ilist, + int*& numneigh, + int**& firstneigh, + int*& jlist, + int*& nbor_list_dev, + int& new_nall, int& mem_cpy, - const FPTYPE* coord, + int& mem_nnei, + int& max_nbor_size, const FPTYPE* box, - const int* type, + const int* mesh_tensor_data, + const int mesh_tensor_size, const int& nloc, + const int& nei_mode, + const float& rcut_r, const int& max_cpy_trial, - const float& rcut_r); - -template -static int _build_nlist_gpu_rocm(OpKernelContext* context, - Tensor* tensor_list, - int*& ilist, - int*& numneigh, - int**& firstneigh, - int*& jlist, - int& max_nnei, - int& mem_nnei, - const FPTYPE* coord, - const int& nloc, - const int& new_nall, - const int& max_nnei_trial, - const float& rcut_r); - -static void _map_nlist_gpu_rocm(int* nlist, - const int* idx_mapping, - const int& nloc, - const int& nnei); - -static void _map_nei_info_gpu_rocm(int* nlist, - int* ntype, - bool* nmask, - const int* type, - const int* idx_mapping, - const int& nloc, - const int& nnei, - const int& ntypes, - const bool& b_nlist_map); - -template -static void _prepare_coord_nlist_gpu_rocm(OpKernelContext* context, - Tensor* tensor_list, - FPTYPE const** coord, - FPTYPE*& coord_cpy, - int const** type, - int*& type_cpy, - int*& idx_mapping, - deepmd::InputNlist& inlist, - int*& ilist, - int*& numneigh, - int**& firstneigh, - int*& jlist, - int*& nbor_list_dev, - int& new_nall, - int& mem_cpy, - int& mem_nnei, - int& max_nbor_size, - const FPTYPE* box, - const int* mesh_tensor_data, - const int mesh_tensor_size, - const int& nloc, - const int& nei_mode, - const float& rcut_r, - const int& max_cpy_trial, - const int& max_nnei_trial); + const int& max_nnei_trial); #endif // TENSORFLOW_USE_ROCM @@ -605,10 +605,10 @@ class ProdEnvMatAOp : public OpKernel { array_longlong = uint64_temp.flat().data(); // launch the gpu(nv) compute function - deepmd::prod_env_mat_a_gpu_cuda(em, em_deriv, rij, nlist, coord, type, - gpu_inlist, array_int, array_longlong, - max_nbor_size, avg, std, nloc, - frame_nall, rcut_r, rcut_r_smth, sec_a); + deepmd::prod_env_mat_a_gpu(em, em_deriv, rij, nlist, coord, type, + gpu_inlist, array_int, array_longlong, + max_nbor_size, avg, std, nloc, frame_nall, + rcut_r, rcut_r_smth, sec_a); if (b_nlist_map) { _map_nlist_gpu(nlist, idx_mapping, nloc, nnei); } @@ -627,7 +627,7 @@ class ProdEnvMatAOp : public OpKernel { int mesh_tensor_size = static_cast(mesh_tensor.NumElements()); std::vector tensor_list(7); // prepare coord and nlist - _prepare_coord_nlist_gpu_rocm( + _prepare_coord_nlist_gpu( context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy, idx_mapping, gpu_inlist, ilist, numneigh, firstneigh, jlist, nbor_list_dev, frame_nall, mem_cpy, mem_nnei, max_nbor_size, box, @@ -650,12 +650,12 @@ class ProdEnvMatAOp : public OpKernel { array_longlong = uint64_temp.flat().data(); // launch the gpu(nv) compute function - deepmd::prod_env_mat_a_gpu_rocm(em, em_deriv, rij, nlist, coord, type, - gpu_inlist, array_int, array_longlong, - max_nbor_size, avg, std, nloc, - frame_nall, rcut_r, rcut_r_smth, sec_a); + deepmd::prod_env_mat_a_gpu(em, em_deriv, rij, nlist, coord, type, + gpu_inlist, array_int, array_longlong, + max_nbor_size, avg, std, nloc, frame_nall, + rcut_r, rcut_r_smth, sec_a); if (b_nlist_map) { - _map_nlist_gpu_rocm(nlist, idx_mapping, nloc, nnei); + _map_nlist_gpu(nlist, idx_mapping, nloc, nnei); } deepmd::delete_device_memory(firstneigh); #endif // TENSORFLOW_USE_ROCM @@ -900,10 +900,10 @@ class ProdEnvMatROp : public OpKernel { array_longlong = uint64_temp.flat().data(); // launch the gpu(nv) compute function - deepmd::prod_env_mat_r_gpu_cuda(em, em_deriv, rij, nlist, coord, type, - gpu_inlist, array_int, array_longlong, - max_nbor_size, avg, std, nloc, - frame_nall, rcut, rcut_smth, sec); + deepmd::prod_env_mat_r_gpu(em, em_deriv, rij, nlist, coord, type, + gpu_inlist, array_int, array_longlong, + max_nbor_size, avg, std, nloc, frame_nall, + rcut, rcut_smth, sec); if (b_nlist_map) { _map_nlist_gpu(nlist, idx_mapping, nloc, nnei); } @@ -922,7 +922,7 @@ class ProdEnvMatROp : public OpKernel { int mesh_tensor_size = static_cast(mesh_tensor.NumElements()); std::vector tensor_list(7); // prepare coord and nlist - _prepare_coord_nlist_gpu_rocm( + _prepare_coord_nlist_gpu( context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy, idx_mapping, gpu_inlist, ilist, numneigh, firstneigh, jlist, nbor_list_dev, frame_nall, mem_cpy, mem_nnei, max_nbor_size, box, @@ -945,12 +945,12 @@ class ProdEnvMatROp : public OpKernel { array_longlong = uint64_temp.flat().data(); // launch the gpu(nv) compute function - deepmd::prod_env_mat_r_gpu_rocm(em, em_deriv, rij, nlist, coord, type, - gpu_inlist, array_int, array_longlong, - max_nbor_size, avg, std, nloc, - frame_nall, rcut, rcut_smth, sec); + deepmd::prod_env_mat_r_gpu(em, em_deriv, rij, nlist, coord, type, + gpu_inlist, array_int, array_longlong, + max_nbor_size, avg, std, nloc, frame_nall, + rcut, rcut_smth, sec); if (b_nlist_map) { - _map_nlist_gpu_rocm(nlist, idx_mapping, nloc, nnei); + _map_nlist_gpu(nlist, idx_mapping, nloc, nnei); } deepmd::delete_device_memory(firstneigh); #endif // TENSORFLOW_USE_ROCM @@ -1186,10 +1186,10 @@ class ProdEnvMatAMixOp : public OpKernel { if (device == "GPU") { #if GOOGLE_CUDA - deepmd::filter_ftype_gpu_cuda(p_f_type, p_type, nsamples * nall); + deepmd::filter_ftype_gpu(p_f_type, p_type, nsamples * nall); #endif #if TENSORFLOW_USE_ROCM - deepmd::filter_ftype_gpu_rocm(p_f_type, p_type, nsamples * nall); + deepmd::filter_ftype_gpu(p_f_type, p_type, nsamples * nall); #endif } else if (device == "CPU") { for (int ii = 0; ii < nsamples * nall; ii++) { @@ -1246,10 +1246,10 @@ class ProdEnvMatAMixOp : public OpKernel { array_longlong = uint64_temp.flat().data(); // launch the gpu(nv) compute function - deepmd::prod_env_mat_a_gpu_cuda( - em, em_deriv, rij, nlist, coord, type, gpu_inlist, array_int, - array_longlong, max_nbor_size, avg, std, nloc, frame_nall, rcut_r, - rcut_r_smth, sec_a, f_type); + deepmd::prod_env_mat_a_gpu(em, em_deriv, rij, nlist, coord, type, + gpu_inlist, array_int, array_longlong, + max_nbor_size, avg, std, nloc, frame_nall, + rcut_r, rcut_r_smth, sec_a, f_type); _map_nei_info_gpu(nlist, ntype, nmask, type, idx_mapping, nloc, nnei, ntypes, b_nlist_map); deepmd::delete_device_memory(firstneigh); @@ -1267,7 +1267,7 @@ class ProdEnvMatAMixOp : public OpKernel { int mesh_tensor_size = static_cast(mesh_tensor.NumElements()); std::vector tensor_list(7); // prepare coord and nlist - _prepare_coord_nlist_gpu_rocm( + _prepare_coord_nlist_gpu( context, &tensor_list[0], &coord, coord_cpy, &f_type, type_cpy, idx_mapping, gpu_inlist, ilist, numneigh, firstneigh, jlist, nbor_list_dev, frame_nall, mem_cpy, mem_nnei, max_nbor_size, box, @@ -1290,12 +1290,12 @@ class ProdEnvMatAMixOp : public OpKernel { array_longlong = uint64_temp.flat().data(); // launch the gpu(nv) compute function - deepmd::prod_env_mat_a_gpu_rocm( - em, em_deriv, rij, nlist, coord, type, gpu_inlist, array_int, - array_longlong, max_nbor_size, avg, std, nloc, frame_nall, rcut_r, - rcut_r_smth, sec_a, f_type); - _map_nei_info_gpu_rocm(nlist, ntype, nmask, type, idx_mapping, nloc, - nnei, ntypes, b_nlist_map); + deepmd::prod_env_mat_a_gpu(em, em_deriv, rij, nlist, coord, type, + gpu_inlist, array_int, array_longlong, + max_nbor_size, avg, std, nloc, frame_nall, + rcut_r, rcut_r_smth, sec_a, f_type); + _map_nei_info_gpu(nlist, ntype, nmask, type, idx_mapping, nloc, nnei, + ntypes, b_nlist_map); deepmd::delete_device_memory(firstneigh); #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { @@ -1802,19 +1802,19 @@ static void _prepare_coord_nlist_gpu(OpKernelContext* context, #if TENSORFLOW_USE_ROCM template -static int _norm_copy_coord_gpu_rocm(OpKernelContext* context, - Tensor* tensor_list, - FPTYPE*& coord_cpy, - int*& type_cpy, - int*& idx_mapping, - int& nall, - int& mem_cpy, - const FPTYPE* coord, - const FPTYPE* box, - const int* type, - const int& nloc, - const int& max_cpy_trial, - const float& rcut_r) { +static int _norm_copy_coord_gpu(OpKernelContext* context, + Tensor* tensor_list, + FPTYPE*& coord_cpy, + int*& type_cpy, + int*& idx_mapping, + int& nall, + int& mem_cpy, + const FPTYPE* coord, + const FPTYPE* box, + const int* type, + const int& nloc, + const int& max_cpy_trial, + const float& rcut_r) { // Tensor FPTYPE_temp; TensorShape FPTYPE_shape; FPTYPE_shape.AddDim(nall * 3); @@ -1857,7 +1857,7 @@ static int _norm_copy_coord_gpu_rocm(OpKernelContext* context, FPTYPE* new_rec_boxt = region_dev.rec_boxt; region_dev.boxt = box_info_dev; region_dev.rec_boxt = box_info_dev + 9; - deepmd::normalize_coord_gpu_rocm(tmp_coord, nall, region_dev); + deepmd::normalize_coord_gpu(tmp_coord, nall, region_dev); int tt; for (tt = 0; tt < max_cpy_trial; ++tt) { // Tensor cpy_temp; @@ -1872,7 +1872,7 @@ static int _norm_copy_coord_gpu_rocm(OpKernelContext* context, coord_cpy = (*(tensor_list + 3)).flat().data(); type_cpy = (*(tensor_list + 4)).flat().data(); idx_mapping = type_cpy + mem_cpy; - int ret = deepmd::copy_coord_gpu_rocm( + int ret = deepmd::copy_coord_gpu( coord_cpy, type_cpy, idx_mapping, &nall, int_data_dev, tmp_coord, type, nloc, mem_cpy, loc_cellnum, total_cellnum, cell_info_dev, region_dev); if (ret == 0) { @@ -1887,19 +1887,19 @@ static int _norm_copy_coord_gpu_rocm(OpKernelContext* context, } template -static int _build_nlist_gpu_rocm(OpKernelContext* context, - Tensor* tensor_list, - int*& ilist, - int*& numneigh, - int**& firstneigh, - int*& jlist, - int& max_nnei, - int& mem_nnei, - const FPTYPE* coord, - const int& nloc, - const int& new_nall, - const int& max_nnei_trial, - const float& rcut_r) { +static int _build_nlist_gpu(OpKernelContext* context, + Tensor* tensor_list, + int*& ilist, + int*& numneigh, + int**& firstneigh, + int*& jlist, + int& max_nnei, + int& mem_nnei, + const FPTYPE* coord, + const int& nloc, + const int& new_nall, + const int& max_nnei_trial, + const float& rcut_r) { // Tensor nlist_temp; TensorShape nlist_shape; nlist_shape.AddDim(nloc * 2); @@ -1929,8 +1929,8 @@ static int _build_nlist_gpu_rocm(OpKernelContext* context, } deepmd::memcpy_host_to_device(firstneigh, firstneigh_host); deepmd::InputNlist inlist(nloc, ilist, numneigh, firstneigh); - int ret = deepmd::build_nlist_gpu_rocm(inlist, &max_nnei, ind_data, coord, - nloc, new_nall, mem_nnei, rcut_r); + int ret = deepmd::build_nlist_gpu(inlist, &max_nnei, ind_data, coord, nloc, + new_nall, mem_nnei, rcut_r); if (ret == 0) { break; } else { @@ -1940,58 +1940,58 @@ static int _build_nlist_gpu_rocm(OpKernelContext* context, return (tt != max_nnei_trial); } -static void _map_nlist_gpu_rocm(int* nlist, - const int* idx_mapping, - const int& nloc, - const int& nnei) { +static void _map_nlist_gpu(int* nlist, + const int* idx_mapping, + const int& nloc, + const int& nnei) { deepmd::use_nlist_map(nlist, idx_mapping, nloc, nnei); } -static void _map_nei_info_gpu_rocm(int* nlist, - int* ntype, - bool* nmask, - const int* type, - const int* idx_mapping, - const int& nloc, - const int& nnei, - const int& ntypes, - const bool& b_nlist_map) { - deepmd::use_nei_info_gpu_rocm(nlist, ntype, nmask, type, idx_mapping, nloc, - nnei, ntypes, b_nlist_map); +static void _map_nei_info_gpu(int* nlist, + int* ntype, + bool* nmask, + const int* type, + const int* idx_mapping, + const int& nloc, + const int& nnei, + const int& ntypes, + const bool& b_nlist_map) { + deepmd::use_nei_info_gpu(nlist, ntype, nmask, type, idx_mapping, nloc, nnei, + ntypes, b_nlist_map); } template -static void _prepare_coord_nlist_gpu_rocm(OpKernelContext* context, - Tensor* tensor_list, - FPTYPE const** coord, - FPTYPE*& coord_cpy, - int const** type, - int*& type_cpy, - int*& idx_mapping, - deepmd::InputNlist& inlist, - int*& ilist, - int*& numneigh, - int**& firstneigh, - int*& jlist, - int*& nbor_list_dev, - int& new_nall, - int& mem_cpy, - int& mem_nnei, - int& max_nbor_size, - const FPTYPE* box, - const int* mesh_tensor_data, - const int mesh_tensor_size, - const int& nloc, - const int& nei_mode, - const float& rcut_r, - const int& max_cpy_trial, - const int& max_nnei_trial) { +static void _prepare_coord_nlist_gpu(OpKernelContext* context, + Tensor* tensor_list, + FPTYPE const** coord, + FPTYPE*& coord_cpy, + int const** type, + int*& type_cpy, + int*& idx_mapping, + deepmd::InputNlist& inlist, + int*& ilist, + int*& numneigh, + int**& firstneigh, + int*& jlist, + int*& nbor_list_dev, + int& new_nall, + int& mem_cpy, + int& mem_nnei, + int& max_nbor_size, + const FPTYPE* box, + const int* mesh_tensor_data, + const int mesh_tensor_size, + const int& nloc, + const int& nei_mode, + const float& rcut_r, + const int& max_cpy_trial, + const int& max_nnei_trial) { if (nei_mode != 3 && nei_mode != 4) { inlist.inum = nloc; // build nlist by myself // normalize and copy coord if (nei_mode == 1) { - int copy_ok = _norm_copy_coord_gpu_rocm( + int copy_ok = _norm_copy_coord_gpu( context, tensor_list, coord_cpy, type_cpy, idx_mapping, new_nall, mem_cpy, *coord, box, *type, nloc, max_cpy_trial, rcut_r); OP_REQUIRES(context, copy_ok, @@ -2001,9 +2001,9 @@ static void _prepare_coord_nlist_gpu_rocm(OpKernelContext* context, } // build nlist int build_ok = - _build_nlist_gpu_rocm(context, tensor_list + 5, ilist, numneigh, - firstneigh, jlist, max_nbor_size, mem_nnei, - *coord, nloc, new_nall, max_nnei_trial, rcut_r); + _build_nlist_gpu(context, tensor_list + 5, ilist, numneigh, firstneigh, + jlist, max_nbor_size, mem_nnei, *coord, nloc, new_nall, + max_nnei_trial, rcut_r); OP_REQUIRES(context, build_ok, errors::Aborted("cannot allocate mem for nlist")); if (max_nbor_size <= 1024) { diff --git a/source/op/prod_force_grad_multi_device.cc b/source/op/prod_force_grad_multi_device.cc index 228b76e962..7d8a664a8d 100644 --- a/source/op/prod_force_grad_multi_device.cc +++ b/source/op/prod_force_grad_multi_device.cc @@ -122,13 +122,13 @@ class ProdForceSeAGradOp : public OpKernel { if (device == "GPU") { #if GOOGLE_CUDA - deepmd::prod_force_grad_a_gpu_cuda(p_grad_net, p_grad, p_in_deriv, - p_nlist, nloc, nnei, nframes); + deepmd::prod_force_grad_a_gpu(p_grad_net, p_grad, p_in_deriv, p_nlist, + nloc, nnei, nframes); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM - deepmd::prod_force_grad_a_gpu_rocm(p_grad_net, p_grad, p_in_deriv, - p_nlist, nloc, nnei, nframes); + deepmd::prod_force_grad_a_gpu(p_grad_net, p_grad, p_in_deriv, p_nlist, + nloc, nnei, nframes); #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_force_grad_a_cpu(p_grad_net, p_grad, p_in_deriv, p_nlist, @@ -235,13 +235,13 @@ class ProdForceSeRGradOp : public OpKernel { if (device == "GPU") { #if GOOGLE_CUDA - deepmd::prod_force_grad_r_gpu_cuda(p_grad_net, p_grad, p_in_deriv, - p_nlist, nloc, nnei, nframes); + deepmd::prod_force_grad_r_gpu(p_grad_net, p_grad, p_in_deriv, p_nlist, + nloc, nnei, nframes); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM - deepmd::prod_force_grad_r_gpu_rocm(p_grad_net, p_grad, p_in_deriv, - p_nlist, nloc, nnei, nframes); + deepmd::prod_force_grad_r_gpu(p_grad_net, p_grad, p_in_deriv, p_nlist, + nloc, nnei, nframes); #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_force_grad_r_cpu(p_grad_net, p_grad, p_in_deriv, p_nlist, diff --git a/source/op/prod_force_multi_device.cc b/source/op/prod_force_multi_device.cc index 036064b02d..9d553b1f0c 100644 --- a/source/op/prod_force_multi_device.cc +++ b/source/op/prod_force_multi_device.cc @@ -143,13 +143,13 @@ class ProdForceSeAOp : public OpKernel { if (device == "GPU") { #if GOOGLE_CUDA - deepmd::prod_force_a_gpu_cuda(p_force, p_net_deriv, p_in_deriv, p_nlist, - nloc, nall, nnei, nframes); + deepmd::prod_force_a_gpu(p_force, p_net_deriv, p_in_deriv, p_nlist, nloc, + nall, nnei, nframes); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM - deepmd::prod_force_a_gpu_rocm(p_force, p_net_deriv, p_in_deriv, p_nlist, - nloc, nall, nnei, nframes); + deepmd::prod_force_a_gpu(p_force, p_net_deriv, p_in_deriv, p_nlist, nloc, + nall, nnei, nframes); #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_force_a_cpu(p_force, p_net_deriv, p_in_deriv, p_nlist, nloc, @@ -229,13 +229,13 @@ class ProdForceSeROp : public OpKernel { if (device == "GPU") { #if GOOGLE_CUDA - deepmd::prod_force_r_gpu_cuda(p_force, p_net_deriv, p_in_deriv, p_nlist, - nloc, nall, nnei, nframes); + deepmd::prod_force_r_gpu(p_force, p_net_deriv, p_in_deriv, p_nlist, nloc, + nall, nnei, nframes); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM - deepmd::prod_force_r_gpu_rocm(p_force, p_net_deriv, p_in_deriv, p_nlist, - nloc, nall, nnei, nframes); + deepmd::prod_force_r_gpu(p_force, p_net_deriv, p_in_deriv, p_nlist, nloc, + nall, nnei, nframes); #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_force_r_cpu(p_force, p_net_deriv, p_in_deriv, p_nlist, nloc, diff --git a/source/op/prod_virial_grad_multi_device.cc b/source/op/prod_virial_grad_multi_device.cc index 1c035f53ca..ef7d10b3bd 100644 --- a/source/op/prod_virial_grad_multi_device.cc +++ b/source/op/prod_virial_grad_multi_device.cc @@ -143,13 +143,13 @@ class ProdVirialSeAGradOp : public OpKernel { const int* nlist = p_nlist + kk * nloc * nnei; if (device == "GPU") { #if GOOGLE_CUDA - deepmd::prod_virial_grad_a_gpu_cuda(grad_net, grad, in_deriv, rij, - nlist, nloc, nnei); + deepmd::prod_virial_grad_a_gpu(grad_net, grad, in_deriv, rij, nlist, + nloc, nnei); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM - deepmd::prod_virial_grad_a_gpu_rocm(grad_net, grad, in_deriv, rij, - nlist, nloc, nnei); + deepmd::prod_virial_grad_a_gpu(grad_net, grad, in_deriv, rij, nlist, + nloc, nnei); #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_virial_grad_a_cpu(grad_net, grad, in_deriv, rij, nlist, @@ -276,13 +276,13 @@ class ProdVirialSeRGradOp : public OpKernel { const int* nlist = p_nlist + kk * nloc * nnei; if (device == "GPU") { #if GOOGLE_CUDA - deepmd::prod_virial_grad_r_gpu_cuda(grad_net, grad, in_deriv, rij, - nlist, nloc, nnei); + deepmd::prod_virial_grad_r_gpu(grad_net, grad, in_deriv, rij, nlist, + nloc, nnei); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM - deepmd::prod_virial_grad_r_gpu_rocm(grad_net, grad, in_deriv, rij, - nlist, nloc, nnei); + deepmd::prod_virial_grad_r_gpu(grad_net, grad, in_deriv, rij, nlist, + nloc, nnei); #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_virial_grad_r_cpu(grad_net, grad, in_deriv, rij, nlist, diff --git a/source/op/prod_virial_multi_device.cc b/source/op/prod_virial_multi_device.cc index db13617362..e3960fc37d 100644 --- a/source/op/prod_virial_multi_device.cc +++ b/source/op/prod_virial_multi_device.cc @@ -121,13 +121,13 @@ class ProdVirialSeAOp : public OpKernel { const int* nlist = p_nlist + kk * nloc * nnei; if (device == "GPU") { #if GOOGLE_CUDA - deepmd::prod_virial_a_gpu_cuda(virial, atom_virial, net_deriv, in_deriv, - rij, nlist, nloc, nall, nnei); + deepmd::prod_virial_a_gpu(virial, atom_virial, net_deriv, in_deriv, rij, + nlist, nloc, nall, nnei); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM - deepmd::prod_virial_a_gpu_rocm(virial, atom_virial, net_deriv, in_deriv, - rij, nlist, nloc, nall, nnei); + deepmd::prod_virial_a_gpu(virial, atom_virial, net_deriv, in_deriv, rij, + nlist, nloc, nall, nnei); #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_virial_a_cpu(virial, atom_virial, net_deriv, in_deriv, rij, @@ -225,13 +225,13 @@ class ProdVirialSeROp : public OpKernel { const int* nlist = p_nlist + kk * nloc * nnei; if (device == "GPU") { #if GOOGLE_CUDA - deepmd::prod_virial_r_gpu_cuda(virial, atom_virial, net_deriv, in_deriv, - rij, nlist, nloc, nall, nnei); + deepmd::prod_virial_r_gpu(virial, atom_virial, net_deriv, in_deriv, rij, + nlist, nloc, nall, nnei); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM - deepmd::prod_virial_r_gpu_rocm(virial, atom_virial, net_deriv, in_deriv, - rij, nlist, nloc, nall, nnei); + deepmd::prod_virial_r_gpu(virial, atom_virial, net_deriv, in_deriv, rij, + nlist, nloc, nall, nnei); #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_virial_r_cpu(virial, atom_virial, net_deriv, in_deriv, rij, diff --git a/source/op/tabulate_multi_device.cc b/source/op/tabulate_multi_device.cc index 0ac8745f64..886b9d9a6d 100644 --- a/source/op/tabulate_multi_device.cc +++ b/source/op/tabulate_multi_device.cc @@ -197,15 +197,13 @@ class TabulateFusionSeAOp : public OpKernel { if (device == "GPU") { #if GOOGLE_CUDA - deepmd::tabulate_fusion_se_a_gpu_cuda(descriptor, table, table_info, em_x, - em, two_embed, nloc, nnei, - last_layer_size); + deepmd::tabulate_fusion_se_a_gpu(descriptor, table, table_info, em_x, em, + two_embed, nloc, nnei, last_layer_size); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM - deepmd::tabulate_fusion_se_a_gpu_rocm(descriptor, table, table_info, em_x, - em, two_embed, nloc, nnei, - last_layer_size); + deepmd::tabulate_fusion_se_a_gpu(descriptor, table, table_info, em_x, em, + two_embed, nloc, nnei, last_layer_size); #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::tabulate_fusion_se_a_cpu(descriptor, table, table_info, em_x, em, @@ -269,15 +267,15 @@ class TabulateFusionSeAGradOp : public OpKernel { if (device == "GPU") { #if GOOGLE_CUDA - deepmd::tabulate_fusion_se_a_grad_gpu_cuda( - dy_dem_x, dy_dem, table, table_info, em_x, em, two_embed, dy, nloc, - nnei, last_layer_size); + deepmd::tabulate_fusion_se_a_grad_gpu(dy_dem_x, dy_dem, table, table_info, + em_x, em, two_embed, dy, nloc, nnei, + last_layer_size); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM - deepmd::tabulate_fusion_se_a_grad_gpu_rocm( - dy_dem_x, dy_dem, table, table_info, em_x, em, two_embed, dy, nloc, - nnei, last_layer_size); + deepmd::tabulate_fusion_se_a_grad_gpu(dy_dem_x, dy_dem, table, table_info, + em_x, em, two_embed, dy, nloc, nnei, + last_layer_size); #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::tabulate_fusion_se_a_grad_cpu(dy_dem_x, dy_dem, table, table_info, @@ -333,12 +331,12 @@ class TabulateFusionSeAGradGradOp : public OpKernel { if (device == "GPU") { #if GOOGLE_CUDA - deepmd::tabulate_fusion_se_a_grad_grad_gpu_cuda( + deepmd::tabulate_fusion_se_a_grad_grad_gpu( dz_dy, table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc, nnei, last_layer_size, is_sorted); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM - deepmd::tabulate_fusion_se_a_grad_grad_gpu_rocm( + deepmd::tabulate_fusion_se_a_grad_grad_gpu( dz_dy, table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc, nnei, last_layer_size, is_sorted); #endif // TENSORFLOW_USE_ROCM @@ -411,15 +409,15 @@ class TabulateFusionSeAttenOp : public OpKernel { if (device == "GPU") { #if GOOGLE_CUDA - deepmd::tabulate_fusion_se_a_gpu_cuda(descriptor, table, table_info, em_x, - em, two_embed, nloc, nnei, - last_layer_size, is_sorted); + deepmd::tabulate_fusion_se_a_gpu(descriptor, table, table_info, em_x, em, + two_embed, nloc, nnei, last_layer_size, + is_sorted); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM - deepmd::tabulate_fusion_se_a_gpu_rocm(descriptor, table, table_info, em_x, - em, two_embed, nloc, nnei, - last_layer_size, is_sorted); + deepmd::tabulate_fusion_se_a_gpu(descriptor, table, table_info, em_x, em, + two_embed, nloc, nnei, last_layer_size, + is_sorted); #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::tabulate_fusion_se_a_cpu(descriptor, table, table_info, em_x, em, @@ -492,15 +490,15 @@ class TabulateFusionSeAttenGradOp : public OpKernel { if (device == "GPU") { #if GOOGLE_CUDA - deepmd::tabulate_fusion_se_a_grad_gpu_cuda( - dy_dem_x, dy_dem, table, table_info, em_x, em, two_embed, dy, nloc, - nnei, last_layer_size, is_sorted); + deepmd::tabulate_fusion_se_a_grad_gpu(dy_dem_x, dy_dem, table, table_info, + em_x, em, two_embed, dy, nloc, nnei, + last_layer_size, is_sorted); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM - deepmd::tabulate_fusion_se_a_grad_gpu_rocm( - dy_dem_x, dy_dem, table, table_info, em_x, em, two_embed, dy, nloc, - nnei, last_layer_size, is_sorted); + deepmd::tabulate_fusion_se_a_grad_gpu(dy_dem_x, dy_dem, table, table_info, + em_x, em, two_embed, dy, nloc, nnei, + last_layer_size, is_sorted); #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::tabulate_fusion_se_a_grad_cpu(dy_dem_x, dy_dem, table, table_info, @@ -562,15 +560,13 @@ class TabulateFusionSeTOp : public OpKernel { if (device == "GPU") { #if GOOGLE_CUDA - deepmd::tabulate_fusion_se_t_gpu_cuda(descriptor, table, table_info, em_x, - em, nloc, nnei_i, nnei_j, - last_layer_size); + deepmd::tabulate_fusion_se_t_gpu(descriptor, table, table_info, em_x, em, + nloc, nnei_i, nnei_j, last_layer_size); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM - deepmd::tabulate_fusion_se_t_gpu_rocm(descriptor, table, table_info, em_x, - em, nloc, nnei_i, nnei_j, - last_layer_size); + deepmd::tabulate_fusion_se_t_gpu(descriptor, table, table_info, em_x, em, + nloc, nnei_i, nnei_j, last_layer_size); #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::tabulate_fusion_se_t_cpu(descriptor, table, table_info, em_x, em, @@ -632,15 +628,15 @@ class TabulateFusionSeTGradOp : public OpKernel { if (device == "GPU") { #if GOOGLE_CUDA - deepmd::tabulate_fusion_se_t_grad_gpu_cuda( - dy_dem_x, dy_dem, table, table_info, em_x, em, dy, nloc, nnei_i, - nnei_j, last_layer_size); + deepmd::tabulate_fusion_se_t_grad_gpu(dy_dem_x, dy_dem, table, table_info, + em_x, em, dy, nloc, nnei_i, nnei_j, + last_layer_size); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM - deepmd::tabulate_fusion_se_t_grad_gpu_rocm( - dy_dem_x, dy_dem, table, table_info, em_x, em, dy, nloc, nnei_i, - nnei_j, last_layer_size); + deepmd::tabulate_fusion_se_t_grad_gpu(dy_dem_x, dy_dem, table, table_info, + em_x, em, dy, nloc, nnei_i, nnei_j, + last_layer_size); #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::tabulate_fusion_se_t_grad_cpu(dy_dem_x, dy_dem, table, table_info, @@ -695,12 +691,12 @@ class TabulateFusionSeTGradGradOp : public OpKernel { if (device == "GPU") { #if GOOGLE_CUDA - deepmd::tabulate_fusion_se_t_grad_grad_gpu_cuda( + deepmd::tabulate_fusion_se_t_grad_grad_gpu( dz_dy, table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc, nnei_i, nnei_j, last_layer_size); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM - deepmd::tabulate_fusion_se_t_grad_grad_gpu_rocm( + deepmd::tabulate_fusion_se_t_grad_grad_gpu( dz_dy, table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc, nnei_i, nnei_j, last_layer_size); #endif // TENSORFLOW_USE_ROCM @@ -763,13 +759,13 @@ class TabulateFusionSeROp : public OpKernel { if (device == "GPU") { #if GOOGLE_CUDA - deepmd::tabulate_fusion_se_r_gpu_cuda(descriptor, table, table_info, em, - nloc, nnei, last_layer_size); + deepmd::tabulate_fusion_se_r_gpu(descriptor, table, table_info, em, nloc, + nnei, last_layer_size); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM - deepmd::tabulate_fusion_se_r_gpu_rocm(descriptor, table, table_info, em, - nloc, nnei, last_layer_size); + deepmd::tabulate_fusion_se_r_gpu(descriptor, table, table_info, em, nloc, + nnei, last_layer_size); #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::tabulate_fusion_se_r_cpu(descriptor, table, table_info, em, nloc, @@ -823,13 +819,13 @@ class TabulateFusionSeRGradOp : public OpKernel { if (device == "GPU") { #if GOOGLE_CUDA - deepmd::tabulate_fusion_se_r_grad_gpu_cuda( - dy_dem, table, table_info, em, dy, nloc, nnei, last_layer_size); + deepmd::tabulate_fusion_se_r_grad_gpu(dy_dem, table, table_info, em, dy, + nloc, nnei, last_layer_size); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM - deepmd::tabulate_fusion_se_r_grad_gpu_rocm( - dy_dem, table, table_info, em, dy, nloc, nnei, last_layer_size); + deepmd::tabulate_fusion_se_r_grad_gpu(dy_dem, table, table_info, em, dy, + nloc, nnei, last_layer_size); #endif // TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::tabulate_fusion_se_r_grad_cpu(dy_dem, table, table_info, em, dy, @@ -876,11 +872,11 @@ class TabulateFusionSeRGradGradOp : public OpKernel { if (device == "GPU") { #if GOOGLE_CUDA - deepmd::tabulate_fusion_se_r_grad_grad_gpu_cuda( + deepmd::tabulate_fusion_se_r_grad_grad_gpu( dz_dy, table, table_info, em, dz_dy_dem, nloc, nnei, last_layer_size); #endif // GOOGLE_CUDA #if TENSORFLOW_USE_ROCM - deepmd::tabulate_fusion_se_r_grad_grad_gpu_rocm( + deepmd::tabulate_fusion_se_r_grad_grad_gpu( dz_dy, table, table_info, em, dz_dy_dem, nloc, nnei, last_layer_size); #endif // TENSORFLOW_USE_ROCM OP_REQUIRES(context, (last_layer_size <= 1024),