From 938cea286dff1e54dd5ad9784ffd36799c2543ca Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Tue, 26 Sep 2023 03:11:22 -0400 Subject: [PATCH] merge CUDA and ROCm codes in op (#2847) Signed-off-by: Jinzhe Zeng --- source/lib/include/gpu_cuda.h | 1 + source/lib/include/gpu_rocm.h | 1 + source/op/gelu_multi_device.cc | 24 +- source/op/prod_env_mat_multi_device.cc | 454 +------------------ source/op/prod_env_mat_multi_device_nvnmd.cc | 16 +- source/op/prod_force_grad_multi_device.cc | 18 +- source/op/prod_force_multi_device.cc | 18 +- source/op/prod_virial_grad_multi_device.cc | 18 +- source/op/prod_virial_multi_device.cc | 18 +- source/op/tabulate_multi_device.cc | 102 +---- source/op/unaggregated_grad.cc | 4 +- 11 files changed, 66 insertions(+), 608 deletions(-) diff --git a/source/lib/include/gpu_cuda.h b/source/lib/include/gpu_cuda.h index 73dfed1404..1e750e0ea0 100644 --- a/source/lib/include/gpu_cuda.h +++ b/source/lib/include/gpu_cuda.h @@ -13,6 +13,7 @@ #define gpuMemcpy cudaMemcpy #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice #define gpuMemset cudaMemset #define GPU_MAX_NBOR_SIZE 4096 diff --git a/source/lib/include/gpu_rocm.h b/source/lib/include/gpu_rocm.h index 3a65a57b01..bb404720bc 100644 --- a/source/lib/include/gpu_rocm.h +++ b/source/lib/include/gpu_rocm.h @@ -16,6 +16,7 @@ #define gpuMemcpy hipMemcpy #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost #define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice #define gpuMemset hipMemset #define DPErrcheck(res) \ diff --git a/source/op/gelu_multi_device.cc b/source/op/gelu_multi_device.cc index ccc95aa0e4..1c76cd25d3 100644 --- a/source/op/gelu_multi_device.cc +++ b/source/op/gelu_multi_device.cc @@ -64,13 +64,9 @@ class GeluOp : public OpKernel { const int_64 size = static_cast(output_tensor->NumElements()); if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::gelu_gpu(out, x, size); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::gelu_gpu(out, x, size); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::gelu_cpu(out, x, size); } @@ -108,13 +104,9 @@ class GeluGradOp : public OpKernel { const int_64 size = static_cast(output_tensor->NumElements()); if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::gelu_grad_gpu(out, x, dy, size); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::gelu_grad_gpu(out, x, dy, size); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::gelu_grad_cpu(out, x, dy, size); } @@ -154,13 +146,9 @@ class GeluGradGradOp : public OpKernel { const int_64 size = static_cast(output_tensor->NumElements()); if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::gelu_grad_grad_gpu(out, x, dy, dy_2, size); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::gelu_grad_grad_gpu(out, x, dy, dy_2, size); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::gelu_grad_grad_cpu(out, x, dy, dy_2, size); } diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc index ee07dc22fe..47541bc69f 100644 --- a/source/op/prod_env_mat_multi_device.cc +++ b/source/op/prod_env_mat_multi_device.cc @@ -1,6 +1,7 @@ 
// SPDX-License-Identifier: LGPL-3.0-or-later #include "coord.h" #include "custom_op.h" +#include "device.h" #include "errors.h" #include "neighbor_list.h" #include "prod_env_mat.h" @@ -243,82 +244,7 @@ static void _prepare_coord_nlist_cpu(OpKernelContext* context, const int& max_cpy_trial, const int& max_nnei_trial); -#if GOOGLE_CUDA -template -static int _norm_copy_coord_gpu(OpKernelContext* context, - Tensor* tensor_list, - FPTYPE*& coord_cpy, - int*& type_cpy, - int*& idx_mapping, - int& nall, - int& mem_cpy, - const FPTYPE* coord, - const FPTYPE* box, - const int* type, - const int& nloc, - const int& max_cpy_trial, - const float& rcut_r); - -template -static int _build_nlist_gpu(OpKernelContext* context, - Tensor* tensor_list, - int*& ilist, - int*& numneigh, - int**& firstneigh, - int*& jlist, - int& max_nnei, - int& mem_nnei, - const FPTYPE* coord, - const int& nloc, - const int& new_nall, - const int& max_nnei_trial, - const float& rcut_r); - -static void _map_nlist_gpu(int* nlist, - const int* idx_mapping, - const int& nloc, - const int& nnei); - -static void _map_nei_info_gpu(int* nlist, - int* ntype, - bool* nmask, - const int* type, - const int* idx_mapping, - const int& nloc, - const int& nnei, - const int& ntypes, - const bool& b_nlist_map); - -template -static void _prepare_coord_nlist_gpu(OpKernelContext* context, - Tensor* tensor_list, - FPTYPE const** coord, - FPTYPE*& coord_cpy, - int const** type, - int*& type_cpy, - int*& idx_mapping, - deepmd::InputNlist& inlist, - int*& ilist, - int*& numneigh, - int**& firstneigh, - int*& jlist, - int*& nbor_list_dev, - int& new_nall, - int& mem_cpy, - int& mem_nnei, - int& max_nbor_size, - const FPTYPE* box, - const int* mesh_tensor_data, - const int mesh_tensor_size, - const int& nloc, - const int& nei_mode, - const float& rcut_r, - const int& max_cpy_trial, - const int& max_nnei_trial); - -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM template static int _norm_copy_coord_gpu(OpKernelContext* context, Tensor* tensor_list, @@ -391,7 +317,7 @@ static void _prepare_coord_nlist_gpu(OpKernelContext* context, const int& max_cpy_trial, const int& max_nnei_trial); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM template class ProdEnvMatAOp : public OpKernel { @@ -633,36 +559,7 @@ class ProdEnvMatAOp : public OpKernel { const int* type = p_type + ff * nall; if (device == "GPU") { -#if GOOGLE_CUDA - int* idx_mapping = NULL; - int *ilist = NULL, *numneigh = NULL; - int** firstneigh = NULL; - deepmd::malloc_device_memory(firstneigh, nloc); - int* jlist = NULL; - FPTYPE* coord_cpy; - int* type_cpy; - int frame_nall = nall; - int mesh_tensor_size = static_cast(mesh_tensor.NumElements()); - // prepare coord and nlist - _prepare_coord_nlist_gpu( - context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy, - idx_mapping, gpu_inlist, ilist, numneigh, firstneigh, jlist, - nbor_list_dev, frame_nall, mem_cpy, mem_nnei, max_nbor_size, box, - mesh_tensor.flat().data(), mesh_tensor_size, nloc, nei_mode, - rcut_r, max_cpy_trial, max_nnei_trial); - - // launch the gpu(nv) compute function - deepmd::prod_env_mat_a_gpu(em, em_deriv, rij, nlist, coord, type, - gpu_inlist, array_int, array_longlong, - max_nbor_size, avg, std, nloc, frame_nall, - rcut_r, rcut_r_smth, sec_a); - if (b_nlist_map) { - _map_nlist_gpu(nlist, idx_mapping, nloc, nnei); - } - deepmd::delete_device_memory(firstneigh); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM int* 
idx_mapping = NULL; int *ilist = NULL, *numneigh = NULL; int** firstneigh = NULL; @@ -689,7 +586,7 @@ class ProdEnvMatAOp : public OpKernel { _map_nlist_gpu(nlist, idx_mapping, nloc, nnei); } deepmd::delete_device_memory(firstneigh); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::InputNlist inlist; // some buffers, be freed after the evaluation of this frame @@ -960,36 +857,7 @@ class ProdEnvMatROp : public OpKernel { const int* type = p_type + ff * nall; if (device == "GPU") { -#if GOOGLE_CUDA - int* idx_mapping = NULL; - int *ilist = NULL, *numneigh = NULL; - int** firstneigh = NULL; - deepmd::malloc_device_memory(firstneigh, nloc); - int* jlist = NULL; - FPTYPE* coord_cpy; - int* type_cpy; - int frame_nall = nall; - int mesh_tensor_size = static_cast(mesh_tensor.NumElements()); - // prepare coord and nlist - _prepare_coord_nlist_gpu( - context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy, - idx_mapping, gpu_inlist, ilist, numneigh, firstneigh, jlist, - nbor_list_dev, frame_nall, mem_cpy, mem_nnei, max_nbor_size, box, - mesh_tensor.flat().data(), mesh_tensor_size, nloc, nei_mode, - rcut, max_cpy_trial, max_nnei_trial); - - // launch the gpu(nv) compute function - deepmd::prod_env_mat_r_gpu(em, em_deriv, rij, nlist, coord, type, - gpu_inlist, array_int, array_longlong, - max_nbor_size, avg, std, nloc, frame_nall, - rcut, rcut_smth, sec); - if (b_nlist_map) { - _map_nlist_gpu(nlist, idx_mapping, nloc, nnei); - } - deepmd::delete_device_memory(firstneigh); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM int* idx_mapping = NULL; int *ilist = NULL, *numneigh = NULL; int** firstneigh = NULL; @@ -1016,7 +884,7 @@ class ProdEnvMatROp : public OpKernel { _map_nlist_gpu(nlist, idx_mapping, nloc, nnei); } deepmd::delete_device_memory(firstneigh); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::InputNlist inlist; // some buffers, be freed after the evaluation of this frame @@ -1248,10 +1116,7 @@ class ProdEnvMatAMixOp : public OpKernel { int* p_f_type = fake_type_tensor.flat().data(); if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::filter_ftype_gpu(p_f_type, p_type, nsamples * nall); -#endif -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::filter_ftype_gpu(p_f_type, p_type, nsamples * nall); #endif } else if (device == "CPU") { @@ -1338,35 +1203,7 @@ class ProdEnvMatAMixOp : public OpKernel { const int* f_type = p_f_type + ff * nall; if (device == "GPU") { -#if GOOGLE_CUDA - int* idx_mapping = NULL; - int *ilist = NULL, *numneigh = NULL; - int** firstneigh = NULL; - deepmd::malloc_device_memory(firstneigh, nloc); - int* jlist = NULL; - FPTYPE* coord_cpy; - int* type_cpy; - int frame_nall = nall; - int mesh_tensor_size = static_cast(mesh_tensor.NumElements()); - // prepare coord and nlist - _prepare_coord_nlist_gpu( - context, &tensor_list[0], &coord, coord_cpy, &f_type, type_cpy, - idx_mapping, gpu_inlist, ilist, numneigh, firstneigh, jlist, - nbor_list_dev, frame_nall, mem_cpy, mem_nnei, max_nbor_size, box, - mesh_tensor.flat().data(), mesh_tensor_size, nloc, nei_mode, - rcut_r, max_cpy_trial, max_nnei_trial); - - // launch the gpu(nv) compute function - deepmd::prod_env_mat_a_gpu(em, em_deriv, rij, nlist, coord, type, - gpu_inlist, array_int, array_longlong, - max_nbor_size, avg, std, nloc, frame_nall, - rcut_r, rcut_r_smth, sec_a, f_type); - _map_nei_info_gpu(nlist, ntype, nmask, type, 
idx_mapping, nloc, nnei, - ntypes, b_nlist_map); - deepmd::delete_device_memory(firstneigh); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM int* idx_mapping = NULL; int *ilist = NULL, *numneigh = NULL; int** firstneigh = NULL; @@ -1392,7 +1229,7 @@ class ProdEnvMatAMixOp : public OpKernel { _map_nei_info_gpu(nlist, ntype, nmask, type, idx_mapping, nloc, nnei, ntypes, b_nlist_map); deepmd::delete_device_memory(firstneigh); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::InputNlist inlist; // some buffers, be freed after the evaluation of this frame @@ -1616,7 +1453,7 @@ static void _prepare_coord_nlist_cpu(OpKernelContext* context, } } -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM template static int _norm_copy_coord_gpu(OpKernelContext* context, Tensor* tensor_list, @@ -1632,8 +1469,8 @@ static int _norm_copy_coord_gpu(OpKernelContext* context, const int& max_cpy_trial, const float& rcut_r) { FPTYPE* tmp_coord = (*tensor_list).flat().data(); - DPErrcheck(cudaMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3, - cudaMemcpyDeviceToDevice)); + DPErrcheck(gpuMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3, + gpuMemcpyDeviceToDevice)); deepmd::Region region; init_region_cpu(region, box); @@ -1877,270 +1714,7 @@ static void _prepare_coord_nlist_gpu(OpKernelContext* context, ", which currently is not supported by deepmd-kit.")); } } -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM -template -static int _norm_copy_coord_gpu(OpKernelContext* context, - Tensor* tensor_list, - FPTYPE*& coord_cpy, - int*& type_cpy, - int*& idx_mapping, - int& nall, - int& mem_cpy, - const FPTYPE* coord, - const FPTYPE* box, - const int* type, - const int& nloc, - const int& max_cpy_trial, - const float& rcut_r) { - FPTYPE* tmp_coord = (*tensor_list).flat().data(); - DPErrcheck(hipMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3, - hipMemcpyDeviceToDevice)); - - deepmd::Region region; - init_region_cpu(region, box); - FPTYPE box_info[18]; - std::copy(region.boxt, region.boxt + 9, box_info); - std::copy(region.rec_boxt, region.rec_boxt + 9, box_info + 9); - int cell_info[23]; - deepmd::compute_cell_info(cell_info, rcut_r, region); - const int loc_cellnum = cell_info[21]; - const int total_cellnum = cell_info[22]; - // Tensor int_temp; - TensorShape int_shape; - int_shape.AddDim(23 + nloc * 3 + loc_cellnum + total_cellnum * 3 + - total_cellnum * 3 + loc_cellnum + 1 + total_cellnum + 1 + - nloc); - tensorflow::Status status = - context->allocate_temp(DT_INT32, int_shape, tensor_list + 2); - if (!status.ok()) { - return false; - } - FPTYPE* box_info_dev = (*(tensor_list + 1)).flat().data(); - int* cell_info_dev = (*(tensor_list + 2)).flat().data(); - int* int_data_dev = cell_info_dev + 23; - deepmd::memcpy_host_to_device(box_info_dev, box_info, 18); - deepmd::memcpy_host_to_device(cell_info_dev, cell_info, 23); - deepmd::Region region_dev; - FPTYPE* new_boxt = region_dev.boxt; - FPTYPE* new_rec_boxt = region_dev.rec_boxt; - region_dev.boxt = box_info_dev; - region_dev.rec_boxt = box_info_dev + 9; - deepmd::normalize_coord_gpu(tmp_coord, nall, region_dev); - int tt; - for (tt = 0; tt < max_cpy_trial; ++tt) { - coord_cpy = (*(tensor_list + 3)).flat().data(); - type_cpy = (*(tensor_list + 4)).flat().data(); - idx_mapping = type_cpy + mem_cpy; - int ret = deepmd::copy_coord_gpu( - coord_cpy, type_cpy, idx_mapping, &nall, int_data_dev, tmp_coord, type, - nloc, mem_cpy, loc_cellnum, total_cellnum, 
cell_info_dev, region_dev); - if (ret == 0) { - break; - } else { - mem_cpy *= 2; - // Tensor cpy_temp; - TensorShape cpy_shape; - cpy_shape.AddDim(mem_cpy * 3); - tensorflow::Status status = context->allocate_temp( - DataTypeToEnum::value, cpy_shape, tensor_list + 3); - if (!status.ok()) { - return false; - } - // Tensor t_temp; - TensorShape t_shape; - t_shape.AddDim(mem_cpy * 2); - status = context->allocate_temp(DT_INT32, t_shape, tensor_list + 4); - if (!status.ok()) { - return false; - } - } - } - region_dev.boxt = new_boxt; - region_dev.rec_boxt = new_rec_boxt; - return (tt != max_cpy_trial); -} - -template -static int _build_nlist_gpu(OpKernelContext* context, - Tensor* tensor_list, - int*& ilist, - int*& numneigh, - int**& firstneigh, - int*& jlist, - int& max_nnei, - int& mem_nnei, - const FPTYPE* coord, - const int& nloc, - const int& new_nall, - const int& max_nnei_trial, - const float& rcut_r) { - ilist = (*tensor_list).flat().data(); - numneigh = ilist + nloc; - // Tensor jlist_temp; - int* ind_data = NULL; - - std::vector firstneigh_host(nloc); - int tt; - for (tt = 0; tt < max_nnei_trial; ++tt) { - jlist = (*(tensor_list + 1)).flat().data(); - ind_data = jlist + nloc * mem_nnei; - for (int_64 ii = 0; ii < nloc; ++ii) { - firstneigh_host[ii] = jlist + ii * mem_nnei; - } - deepmd::memcpy_host_to_device(firstneigh, firstneigh_host); - deepmd::InputNlist inlist(nloc, ilist, numneigh, firstneigh); - int ret = deepmd::build_nlist_gpu(inlist, &max_nnei, ind_data, coord, nloc, - new_nall, mem_nnei, rcut_r); - if (ret == 0) { - break; - } else { - mem_nnei *= 2; - TensorShape jlist_shape; - jlist_shape.AddDim(3 * int_64(nloc) * mem_nnei); - tensorflow::Status status = - context->allocate_temp(DT_INT32, jlist_shape, tensor_list + 1); - if (!status.ok()) { - return false; - } - } - } - return (tt != max_nnei_trial); -} - -static void _map_nlist_gpu(int* nlist, - const int* idx_mapping, - const int& nloc, - const int& nnei) { - deepmd::use_nlist_map(nlist, idx_mapping, nloc, nnei); -} - -static void _map_nei_info_gpu(int* nlist, - int* ntype, - bool* nmask, - const int* type, - const int* idx_mapping, - const int& nloc, - const int& nnei, - const int& ntypes, - const bool& b_nlist_map) { - deepmd::use_nei_info_gpu(nlist, ntype, nmask, type, idx_mapping, nloc, nnei, - ntypes, b_nlist_map); -} - -template -static void _prepare_coord_nlist_gpu(OpKernelContext* context, - Tensor* tensor_list, - FPTYPE const** coord, - FPTYPE*& coord_cpy, - int const** type, - int*& type_cpy, - int*& idx_mapping, - deepmd::InputNlist& inlist, - int*& ilist, - int*& numneigh, - int**& firstneigh, - int*& jlist, - int*& nbor_list_dev, - int& new_nall, - int& mem_cpy, - int& mem_nnei, - int& max_nbor_size, - const FPTYPE* box, - const int* mesh_tensor_data, - const int mesh_tensor_size, - const int& nloc, - const int& nei_mode, - const float& rcut_r, - const int& max_cpy_trial, - const int& max_nnei_trial) { - if (nei_mode != 3 && nei_mode != 4) { - inlist.inum = nloc; - // build nlist by myself - // normalize and copy coord - if (nei_mode == 1) { - int copy_ok = _norm_copy_coord_gpu( - context, tensor_list, coord_cpy, type_cpy, idx_mapping, new_nall, - mem_cpy, *coord, box, *type, nloc, max_cpy_trial, rcut_r); - OP_REQUIRES(context, copy_ok, - errors::Aborted("cannot allocate mem for copied coords")); - *coord = coord_cpy; - *type = type_cpy; - } - // build nlist - int build_ok = - _build_nlist_gpu(context, tensor_list + 5, ilist, numneigh, firstneigh, - jlist, max_nbor_size, mem_nnei, *coord, nloc, new_nall, 
- max_nnei_trial, rcut_r); - OP_REQUIRES(context, build_ok, - errors::Aborted("cannot allocate mem for nlist")); - if (max_nbor_size <= 1024) { - max_nbor_size = 1024; - } else if (max_nbor_size <= 2048) { - max_nbor_size = 2048; - } else { - max_nbor_size = 4096; - } - inlist.ilist = ilist; - inlist.numneigh = numneigh; - inlist.firstneigh = firstneigh; - } else if (nei_mode == 4) { - // TODO: in theory, it will be faster to put everything on GPUs... - std::vector mesh_tensor_data_host(mesh_tensor_size); - std::vector ilist_host(nloc); - std::vector numneigh_host(nloc); - std::vector firstneigh_host(nloc); - std::vector fake_mesh(16); - - // copy from gpu to cpu - deepmd::memcpy_device_to_host(mesh_tensor_data, mesh_tensor_data_host); - std::memcpy(&ilist_host[0], &mesh_tensor_data_host[16], sizeof(int) * nloc); - std::memcpy(&numneigh_host[0], &mesh_tensor_data_host[16 + nloc], - sizeof(int) * nloc); - for (int ii = 0, kk = 0; ii < nloc; ++ii) { - firstneigh_host[ii] = &mesh_tensor_data_host[16 + 2 * nloc + kk]; - kk += numneigh_host[ii]; - } - // make a fake mesh - fake_mesh[0] = 0; - fake_mesh[1] = nloc; - std::memcpy(&fake_mesh[4], &ilist_host, sizeof(int*)); - std::memcpy(&fake_mesh[8], &numneigh_host, sizeof(int*)); - std::memcpy(&fake_mesh[12], &firstneigh_host, sizeof(int**)); - // copy from cpu to gpu - int* fake_mesh_dev = NULL; - deepmd::malloc_device_memory(fake_mesh_dev, 16); - deepmd::memcpy_host_to_device(fake_mesh_dev, fake_mesh); - - deepmd::InputNlist inlist_temp; - inlist_temp.inum = nloc; - // everything should be copied to GPU... - deepmd::env_mat_nbor_update(inlist_temp, inlist, max_nbor_size, - nbor_list_dev, fake_mesh_dev, 16); - OP_REQUIRES(context, (max_numneigh(inlist_temp) <= max_nbor_size), - errors::InvalidArgument( - "Assert failed, max neighbor size of atom(lammps) " + - std::to_string(max_numneigh(inlist_temp)) + - " is larger than " + std::to_string(max_nbor_size) + - ", which currently is not supported by deepmd-kit.")); - deepmd::delete_device_memory(fake_mesh_dev); - } else { - // update nbor list - deepmd::InputNlist inlist_temp; - inlist_temp.inum = nloc; - deepmd::env_mat_nbor_update(inlist_temp, inlist, max_nbor_size, - nbor_list_dev, mesh_tensor_data, - mesh_tensor_size); - OP_REQUIRES(context, (max_numneigh(inlist_temp) <= max_nbor_size), - errors::InvalidArgument( - "Assert failed, max neighbor size of atom(lammps) " + - std::to_string(max_numneigh(inlist_temp)) + - " is larger than " + std::to_string(max_nbor_size) + - ", which currently is not supported by deepmd-kit.")); - } -} -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM // Register the CPU kernels. 
// Compatible with v1.3 diff --git a/source/op/prod_env_mat_multi_device_nvnmd.cc b/source/op/prod_env_mat_multi_device_nvnmd.cc index b5863d1b71..abca947f0a 100644 --- a/source/op/prod_env_mat_multi_device_nvnmd.cc +++ b/source/op/prod_env_mat_multi_device_nvnmd.cc @@ -471,13 +471,9 @@ class ProdEnvMatANvnmdQuantizeOp : public OpKernel { const int* type = p_type + ff * nall; if (device == "GPU") { -#if GOOGLE_CUDA -// UNDEFINE -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM // UNDEFINE -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::InputNlist inlist; // some buffers, be freed after the evaluation of this frame @@ -720,13 +716,9 @@ class ProdEnvMatAMixNvnmdQuantizeOp : public OpKernel { const int* type = p_type + ff * nall; if (device == "GPU") { -#if GOOGLE_CUDA -// UNDEFINE -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM // UNDEFINE -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::InputNlist inlist; // some buffers, be freed after the evaluation of this frame diff --git a/source/op/prod_force_grad_multi_device.cc b/source/op/prod_force_grad_multi_device.cc index 7d8a664a8d..ffcd8f0b8b 100644 --- a/source/op/prod_force_grad_multi_device.cc +++ b/source/op/prod_force_grad_multi_device.cc @@ -121,15 +121,10 @@ class ProdForceSeAGradOp : public OpKernel { const int* p_nlist = nlist_tensor.flat().data(); if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::prod_force_grad_a_gpu(p_grad_net, p_grad, p_in_deriv, p_nlist, - nloc, nnei, nframes); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::prod_force_grad_a_gpu(p_grad_net, p_grad, p_in_deriv, p_nlist, nloc, nnei, nframes); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_force_grad_a_cpu(p_grad_net, p_grad, p_in_deriv, p_nlist, nloc, nnei, nframes); @@ -234,15 +229,10 @@ class ProdForceSeRGradOp : public OpKernel { const int* p_nlist = nlist_tensor.flat().data(); if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::prod_force_grad_r_gpu(p_grad_net, p_grad, p_in_deriv, p_nlist, - nloc, nnei, nframes); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::prod_force_grad_r_gpu(p_grad_net, p_grad, p_in_deriv, p_nlist, nloc, nnei, nframes); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_force_grad_r_cpu(p_grad_net, p_grad, p_in_deriv, p_nlist, nloc, nnei, nframes); diff --git a/source/op/prod_force_multi_device.cc b/source/op/prod_force_multi_device.cc index 9d553b1f0c..935b5b9f2f 100644 --- a/source/op/prod_force_multi_device.cc +++ b/source/op/prod_force_multi_device.cc @@ -142,15 +142,10 @@ class ProdForceSeAOp : public OpKernel { } if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::prod_force_a_gpu(p_force, p_net_deriv, p_in_deriv, p_nlist, nloc, - nall, nnei, nframes); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::prod_force_a_gpu(p_force, p_net_deriv, p_in_deriv, p_nlist, nloc, nall, nnei, nframes); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_force_a_cpu(p_force, p_net_deriv, p_in_deriv, p_nlist, nloc, nall, nnei, nframes, nloc_loc, @@ -228,15 +223,10 @@ class ProdForceSeROp 
: public OpKernel { const int* p_nlist = nlist_tensor.flat().data(); if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::prod_force_r_gpu(p_force, p_net_deriv, p_in_deriv, p_nlist, nloc, - nall, nnei, nframes); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::prod_force_r_gpu(p_force, p_net_deriv, p_in_deriv, p_nlist, nloc, nall, nnei, nframes); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_force_r_cpu(p_force, p_net_deriv, p_in_deriv, p_nlist, nloc, nall, nnei, nframes); diff --git a/source/op/prod_virial_grad_multi_device.cc b/source/op/prod_virial_grad_multi_device.cc index ef7d10b3bd..d3e7025e6e 100644 --- a/source/op/prod_virial_grad_multi_device.cc +++ b/source/op/prod_virial_grad_multi_device.cc @@ -142,15 +142,10 @@ class ProdVirialSeAGradOp : public OpKernel { const FPTYPE* rij = p_rij + kk * nloc * nnei * 3; const int* nlist = p_nlist + kk * nloc * nnei; if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::prod_virial_grad_a_gpu(grad_net, grad, in_deriv, rij, nlist, - nloc, nnei); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::prod_virial_grad_a_gpu(grad_net, grad, in_deriv, rij, nlist, nloc, nnei); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_virial_grad_a_cpu(grad_net, grad, in_deriv, rij, nlist, nloc, nnei); @@ -275,15 +270,10 @@ class ProdVirialSeRGradOp : public OpKernel { const FPTYPE* rij = p_rij + kk * nloc * nnei * 3; const int* nlist = p_nlist + kk * nloc * nnei; if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::prod_virial_grad_r_gpu(grad_net, grad, in_deriv, rij, nlist, - nloc, nnei); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::prod_virial_grad_r_gpu(grad_net, grad, in_deriv, rij, nlist, nloc, nnei); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_virial_grad_r_cpu(grad_net, grad, in_deriv, rij, nlist, nloc, nnei); diff --git a/source/op/prod_virial_multi_device.cc b/source/op/prod_virial_multi_device.cc index e3960fc37d..445770e85a 100644 --- a/source/op/prod_virial_multi_device.cc +++ b/source/op/prod_virial_multi_device.cc @@ -120,15 +120,10 @@ class ProdVirialSeAOp : public OpKernel { const FPTYPE* rij = p_rij + kk * nloc * nnei * 3; const int* nlist = p_nlist + kk * nloc * nnei; if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::prod_virial_a_gpu(virial, atom_virial, net_deriv, in_deriv, rij, - nlist, nloc, nall, nnei); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::prod_virial_a_gpu(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_virial_a_cpu(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei); @@ -224,15 +219,10 @@ class ProdVirialSeROp : public OpKernel { const FPTYPE* rij = p_rij + kk * nloc * nnei * 3; const int* nlist = p_nlist + kk * nloc * nnei; if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::prod_virial_r_gpu(virial, atom_virial, net_deriv, in_deriv, rij, - nlist, nloc, nall, nnei); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::prod_virial_r_gpu(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei); -#endif // 
TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::prod_virial_r_cpu(virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nall, nnei); diff --git a/source/op/tabulate_multi_device.cc b/source/op/tabulate_multi_device.cc index 886b9d9a6d..85ea82803a 100644 --- a/source/op/tabulate_multi_device.cc +++ b/source/op/tabulate_multi_device.cc @@ -196,15 +196,10 @@ class TabulateFusionSeAOp : public OpKernel { const int nnei = em_tensor.shape().dim_size(1); if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::tabulate_fusion_se_a_gpu(descriptor, table, table_info, em_x, em, - two_embed, nloc, nnei, last_layer_size); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::tabulate_fusion_se_a_gpu(descriptor, table, table_info, em_x, em, two_embed, nloc, nnei, last_layer_size); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::tabulate_fusion_se_a_cpu(descriptor, table, table_info, em_x, em, two_embed, nloc, nnei, last_layer_size); @@ -266,17 +261,11 @@ class TabulateFusionSeAGradOp : public OpKernel { const int last_layer_size = descriptor_tensor.shape().dim_size(2); if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::tabulate_fusion_se_a_grad_gpu(dy_dem_x, dy_dem, table, table_info, - em_x, em, two_embed, dy, nloc, nnei, - last_layer_size); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::tabulate_fusion_se_a_grad_gpu(dy_dem_x, dy_dem, table, table_info, em_x, em, two_embed, dy, nloc, nnei, last_layer_size); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::tabulate_fusion_se_a_grad_cpu(dy_dem_x, dy_dem, table, table_info, em_x, em, two_embed, dy, nloc, nnei, @@ -330,16 +319,11 @@ class TabulateFusionSeAGradGradOp : public OpKernel { const int last_layer_size = descriptor_tensor.shape().dim_size(2); if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::tabulate_fusion_se_a_grad_grad_gpu( - dz_dy, table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc, - nnei, last_layer_size, is_sorted); -#endif // GOOGLE_CUDA -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::tabulate_fusion_se_a_grad_grad_gpu( dz_dy, table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc, nnei, last_layer_size, is_sorted); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM OP_REQUIRES(context, (last_layer_size <= 1024), errors::InvalidArgument( "In the process of model compression, the size of the " @@ -408,17 +392,11 @@ class TabulateFusionSeAttenOp : public OpKernel { const int nnei = em_tensor.shape().dim_size(1); if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::tabulate_fusion_se_a_gpu(descriptor, table, table_info, em_x, em, - two_embed, nloc, nnei, last_layer_size, - is_sorted); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::tabulate_fusion_se_a_gpu(descriptor, table, table_info, em_x, em, two_embed, nloc, nnei, last_layer_size, is_sorted); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::tabulate_fusion_se_a_cpu(descriptor, table, table_info, em_x, em, two_embed, nloc, nnei, last_layer_size, @@ -489,17 +467,11 @@ class TabulateFusionSeAttenGradOp : public OpKernel { const int last_layer_size = descriptor_tensor.shape().dim_size(2); if (device == "GPU") { -#if GOOGLE_CUDA - 
deepmd::tabulate_fusion_se_a_grad_gpu(dy_dem_x, dy_dem, table, table_info, - em_x, em, two_embed, dy, nloc, nnei, - last_layer_size, is_sorted); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::tabulate_fusion_se_a_grad_gpu(dy_dem_x, dy_dem, table, table_info, em_x, em, two_embed, dy, nloc, nnei, last_layer_size, is_sorted); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::tabulate_fusion_se_a_grad_cpu(dy_dem_x, dy_dem, table, table_info, em_x, em, two_embed, dy, nloc, nnei, @@ -559,15 +531,10 @@ class TabulateFusionSeTOp : public OpKernel { const int nnei_j = em_tensor.shape().dim_size(2); if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::tabulate_fusion_se_t_gpu(descriptor, table, table_info, em_x, em, - nloc, nnei_i, nnei_j, last_layer_size); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::tabulate_fusion_se_t_gpu(descriptor, table, table_info, em_x, em, nloc, nnei_i, nnei_j, last_layer_size); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::tabulate_fusion_se_t_cpu(descriptor, table, table_info, em_x, em, nloc, nnei_i, nnei_j, last_layer_size); @@ -627,17 +594,11 @@ class TabulateFusionSeTGradOp : public OpKernel { const int last_layer_size = descriptor_tensor.shape().dim_size(1); if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::tabulate_fusion_se_t_grad_gpu(dy_dem_x, dy_dem, table, table_info, - em_x, em, dy, nloc, nnei_i, nnei_j, - last_layer_size); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::tabulate_fusion_se_t_grad_gpu(dy_dem_x, dy_dem, table, table_info, em_x, em, dy, nloc, nnei_i, nnei_j, last_layer_size); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::tabulate_fusion_se_t_grad_cpu(dy_dem_x, dy_dem, table, table_info, em_x, em, dy, nloc, nnei_i, nnei_j, @@ -690,16 +651,11 @@ class TabulateFusionSeTGradGradOp : public OpKernel { const int last_layer_size = descriptor_tensor.shape().dim_size(1); if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::tabulate_fusion_se_t_grad_grad_gpu( - dz_dy, table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc, - nnei_i, nnei_j, last_layer_size); -#endif // GOOGLE_CUDA -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::tabulate_fusion_se_t_grad_grad_gpu( dz_dy, table, table_info, em_x, em, dz_dy_dem_x, dz_dy_dem, nloc, nnei_i, nnei_j, last_layer_size); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM OP_REQUIRES(context, (last_layer_size <= 1024), errors::InvalidArgument( "In the process of model compression, the size of the " @@ -758,15 +714,10 @@ class TabulateFusionSeROp : public OpKernel { const int nnei = em_tensor.shape().dim_size(1); if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::tabulate_fusion_se_r_gpu(descriptor, table, table_info, em, nloc, - nnei, last_layer_size); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::tabulate_fusion_se_r_gpu(descriptor, table, table_info, em, nloc, nnei, last_layer_size); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::tabulate_fusion_se_r_cpu(descriptor, table, table_info, em, nloc, nnei, last_layer_size); @@ -818,15 +769,10 @@ class TabulateFusionSeRGradOp : public OpKernel { const int 
last_layer_size = descriptor_tensor.shape().dim_size(2); if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::tabulate_fusion_se_r_grad_gpu(dy_dem, table, table_info, em, dy, - nloc, nnei, last_layer_size); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::tabulate_fusion_se_r_grad_gpu(dy_dem, table, table_info, em, dy, nloc, nnei, last_layer_size); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } else if (device == "CPU") { deepmd::tabulate_fusion_se_r_grad_cpu(dy_dem, table, table_info, em, dy, nloc, nnei, last_layer_size); @@ -871,14 +817,10 @@ class TabulateFusionSeRGradGradOp : public OpKernel { const int last_layer_size = descriptor_tensor.shape().dim_size(2); if (device == "GPU") { -#if GOOGLE_CUDA - deepmd::tabulate_fusion_se_r_grad_grad_gpu( - dz_dy, table, table_info, em, dz_dy_dem, nloc, nnei, last_layer_size); -#endif // GOOGLE_CUDA -#if TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM deepmd::tabulate_fusion_se_r_grad_grad_gpu( dz_dy, table, table_info, em, dz_dy_dem, nloc, nnei, last_layer_size); -#endif // TENSORFLOW_USE_ROCM +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM OP_REQUIRES(context, (last_layer_size <= 1024), errors::InvalidArgument( "In the process of model compression, the size of the " diff --git a/source/op/unaggregated_grad.cc b/source/op/unaggregated_grad.cc index bc67a9fac9..9a61a3bac9 100644 --- a/source/op/unaggregated_grad.cc +++ b/source/op/unaggregated_grad.cc @@ -490,7 +490,7 @@ REGISTER_CPU(float); REGISTER_CPU(double); // Not required in the current situation // // Register the GPU kernels. -// #if GOOGLE_CUDA +// #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM // #define REGISTER_GPU(T) \ // REGISTER_KERNEL_BUILDER( \ // Name("UnaggregatedDyDxS").Device(DEVICE_GPU).TypeConstraint("T"), \ @@ -500,4 +500,4 @@ REGISTER_CPU(double); // UnaggregatedDyDxOp); // REGISTER_GPU(float); // REGISTER_GPU(double); -// #endif // GOOGLE_CUDA +// #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
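
For readers skimming the diff, the whole change rests on the thin alias layer in gpu_cuda.h / gpu_rocm.h touched at the top of the patch: op code is written once against neutral gpu* names, and the platform header maps those names onto the CUDA or HIP runtime. A condensed sketch of that idea (hypothetical single-header form, for illustration only; in deepmd-kit the two headers are separate files and the op sources pick the right one up via device.h, which this patch now includes in prod_env_mat_multi_device.cc):

#if GOOGLE_CUDA
#define gpuMemcpy cudaMemcpy
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice  // added by this patch
#define gpuMemset cudaMemset
#elif TENSORFLOW_USE_ROCM
#define gpuMemcpy hipMemcpy
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice   // added by this patch
#define gpuMemset hipMemset
#endif

// With the alias in place, the device-to-device copy in _norm_copy_coord_gpu
// can be spelled once for both back ends, which is exactly the rewrite above:
//   DPErrcheck(gpuMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3,
//                        gpuMemcpyDeviceToDevice));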
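
The rest of the diff then collapses each op's duplicated #if GOOGLE_CUDA / #if TENSORFLOW_USE_ROCM blocks, which called identical *_gpu functions, into a single guard. A self-contained sketch of the resulting shape of a compute path; example_gpu / example_cpu are placeholders, not deepmd-kit symbols:

#include <cstdint>
#include <string>

// Stand-ins for a templated GPU/CPU kernel pair such as deepmd::gelu_gpu /
// deepmd::gelu_cpu. The GPU entry point is itself compiled from one source
// for CUDA and ROCm, thanks to the gpu* aliases sketched above.
template <typename FPTYPE>
void example_gpu(FPTYPE *out, const FPTYPE *in, std::int64_t size);
template <typename FPTYPE>
void example_cpu(FPTYPE *out, const FPTYPE *in, std::int64_t size) {
  for (std::int64_t i = 0; i < size; ++i) {
    out[i] = in[i];  // trivial copy, only to keep the sketch compilable
  }
}

template <typename FPTYPE>
void compute(const std::string &device, FPTYPE *out, const FPTYPE *in,
             std::int64_t size) {
  if (device == "GPU") {
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
    example_gpu(out, in, size);  // one call now covers both back ends
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
  } else if (device == "CPU") {
    example_cpu(out, in, size);
  }
}

Behaviour is unchanged; the two branches previously contained the same call, so merging the guards only removes duplication (hence the roughly 540-line net reduction in the diffstat).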
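
The GPU neighbor-list helpers kept by the merge (_norm_copy_coord_gpu, _build_nlist_gpu) share a grow-and-retry idiom: attempt the copy or neighbor-list build into the current temporary tensor, and on overflow double mem_cpy / mem_nnei, reallocate, and try again, up to max_cpy_trial / max_nnei_trial times. A stripped-down sketch of that control flow; build_into is a hypothetical stand-in for deepmd::copy_coord_gpu / deepmd::build_nlist_gpu, and a plain std::vector stands in for the TensorFlow temporaries obtained with context->allocate_temp:

#include <vector>

// Returns 0 on success, nonzero when the provided buffer is too small,
// matching the convention of copy_coord_gpu / build_nlist_gpu.
int build_into(std::vector<int> &buf, int needed) {
  if (static_cast<int>(buf.size()) < needed) {
    return 1;  // overflow: caller must grow the buffer and retry
  }
  // ... fill buf with the actual result ...
  return 0;
}

// Double the buffer after each failed attempt; give up after max_trial tries
// (the ops then raise errors::Aborted("cannot allocate mem for ...")).
// buf is assumed non-empty on entry, as mem_cpy / mem_nnei are positive.
bool build_with_retry(std::vector<int> &buf, int needed, int max_trial) {
  int tt = 0;
  for (; tt < max_trial; ++tt) {
    if (build_into(buf, needed) == 0) {
      break;  // success
    }
    buf.resize(buf.size() * 2);  // plays the role of mem_cpy *= 2 / mem_nnei *= 2
  }
  return tt != max_trial;  // same convention as `return (tt != max_cpy_trial);`
}

On success, _prepare_coord_nlist_gpu additionally rounds max_nbor_size up to 1024, 2048, or 4096 (see the retained GPU branch above) before handing the neighbor list to the prod_env_mat_*_gpu kernels.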