From 9fb24a323930b6bc64d69d605641e4ad9977e916 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sat, 16 Sep 2023 00:48:50 -0400 Subject: [PATCH 1/6] allocate int_temp and uint64_temp out of loop Signed-off-by: Jinzhe Zeng --- source/op/prod_env_mat_multi_device.cc | 159 +++++++++++-------------- 1 file changed, 69 insertions(+), 90 deletions(-) diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc index a8882fb5f4..8ac1ff3cd5 100644 --- a/source/op/prod_env_mat_multi_device.cc +++ b/source/op/prod_env_mat_multi_device.cc @@ -556,6 +556,28 @@ class ProdEnvMatAOp : public OpKernel { const FPTYPE* std = std_tensor.flat().data(); const int* p_type = type_tensor.flat().data(); + Tensor int_temp; + Tensor uint64_temp; + if (device == "GPU") { + // allocate temp memory only once for multiple frames + // allocate temp memory, temp memory must not be used after this + // operation! + + // used for format_nbor_list_gpu_cuda + + TensorShape int_shape; + int_shape.AddDim(sec_a.size() + int_64(nloc) * sec_a.size() + nloc); + OP_REQUIRES_OK(context, + context->allocate_temp(DT_INT32, int_shape, &int_temp)); + + TensorShape uint64_shape; + uint64_shape.AddDim(int_64(nloc) * max_nbor_size * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, + &uint64_temp)); + array_int = int_temp.flat().data(); + array_longlong = uint64_temp.flat().data(); + } + // loop over samples for (int_64 ff = 0; ff < nsamples; ++ff) { FPTYPE* em = p_em + ff * nloc * ndescrpt; @@ -586,21 +608,6 @@ class ProdEnvMatAOp : public OpKernel { mesh_tensor.flat().data(), mesh_tensor_size, nloc, nei_mode, rcut_r, max_cpy_trial, max_nnei_trial); - // allocate temp memory, temp memory must not be used after this - // operation! - Tensor int_temp; - TensorShape int_shape; - int_shape.AddDim(sec_a.size() + int_64(nloc) * sec_a.size() + nloc); - OP_REQUIRES_OK(context, - context->allocate_temp(DT_INT32, int_shape, &int_temp)); - Tensor uint64_temp; - TensorShape uint64_shape; - uint64_shape.AddDim(int_64(nloc) * max_nbor_size * 2); - OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, - &uint64_temp)); - array_int = int_temp.flat().data(); - array_longlong = uint64_temp.flat().data(); - // launch the gpu(nv) compute function deepmd::prod_env_mat_a_gpu_cuda(em, em_deriv, rij, nlist, coord, type, gpu_inlist, array_int, array_longlong, @@ -631,21 +638,6 @@ class ProdEnvMatAOp : public OpKernel { mesh_tensor.flat().data(), mesh_tensor_size, nloc, nei_mode, rcut_r, max_cpy_trial, max_nnei_trial); - // allocate temp memory, temp memory must not be used after this - // operation! - Tensor int_temp; - TensorShape int_shape; - int_shape.AddDim(sec_a.size() + int_64(nloc) * sec_a.size() + nloc); - OP_REQUIRES_OK(context, - context->allocate_temp(DT_INT32, int_shape, &int_temp)); - Tensor uint64_temp; - TensorShape uint64_shape; - uint64_shape.AddDim(int_64(nloc) * max_nbor_size * 2); - OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, - &uint64_temp)); - array_int = int_temp.flat().data(); - array_longlong = uint64_temp.flat().data(); - // launch the gpu(nv) compute function deepmd::prod_env_mat_a_gpu_rocm(em, em_deriv, rij, nlist, coord, type, gpu_inlist, array_int, array_longlong, @@ -848,6 +840,29 @@ class ProdEnvMatROp : public OpKernel { const FPTYPE* std = std_tensor.flat().data(); const int* p_type = type_tensor.flat().data(); + Tensor int_temp; + Tensor uint64_temp; + if (device == "GPU") { + // allocate temp memory only once for multiple frames + // allocate temp memory, temp memory must not be used after this + // operation! + + // used for format_nbor_list_gpu_cuda + + TensorShape int_shape; + int_shape.AddDim(sec.size() + int_64(nloc) * sec.size() + nloc); + OP_REQUIRES_OK(context, + context->allocate_temp(DT_INT32, int_shape, &int_temp)); + + TensorShape uint64_shape; + uint64_shape.AddDim(int_64(nloc) * max_nbor_size * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, + &uint64_temp)); + + array_int = int_temp.flat().data(); + array_longlong = uint64_temp.flat().data(); + } + // loop over samples for (int_64 ff = 0; ff < nsamples; ++ff) { FPTYPE* em = p_em + ff * nloc * ndescrpt; @@ -878,21 +893,6 @@ class ProdEnvMatROp : public OpKernel { mesh_tensor.flat().data(), mesh_tensor_size, nloc, nei_mode, rcut, max_cpy_trial, max_nnei_trial); - // allocate temp memory, temp memory must not be used after this - // operation! - Tensor int_temp; - TensorShape int_shape; - int_shape.AddDim(sec.size() + int_64(nloc) * sec.size() + nloc); - OP_REQUIRES_OK(context, - context->allocate_temp(DT_INT32, int_shape, &int_temp)); - Tensor uint64_temp; - TensorShape uint64_shape; - uint64_shape.AddDim(int_64(nloc) * max_nbor_size * 2); - OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, - &uint64_temp)); - array_int = int_temp.flat().data(); - array_longlong = uint64_temp.flat().data(); - // launch the gpu(nv) compute function deepmd::prod_env_mat_r_gpu_cuda(em, em_deriv, rij, nlist, coord, type, gpu_inlist, array_int, array_longlong, @@ -923,21 +923,6 @@ class ProdEnvMatROp : public OpKernel { mesh_tensor.flat().data(), mesh_tensor_size, nloc, nei_mode, rcut, max_cpy_trial, max_nnei_trial); - // allocate temp memory, temp memory must not be used after this - // operation! - Tensor int_temp; - TensorShape int_shape; - int_shape.AddDim(sec.size() + int_64(nloc) * sec.size() + nloc); - OP_REQUIRES_OK(context, - context->allocate_temp(DT_INT32, int_shape, &int_temp)); - Tensor uint64_temp; - TensorShape uint64_shape; - uint64_shape.AddDim(int_64(nloc) * max_nbor_size * 2); - OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, - &uint64_temp)); - array_int = int_temp.flat().data(); - array_longlong = uint64_temp.flat().data(); - // launch the gpu(nv) compute function deepmd::prod_env_mat_r_gpu_rocm(em, em_deriv, rij, nlist, coord, type, gpu_inlist, array_int, array_longlong, @@ -1190,6 +1175,30 @@ class ProdEnvMatAMixOp : public OpKernel { } } + // must declar out of if, otherwise the memory will be destroyed! + Tensor int_temp; + Tensor uint64_temp; + if (device == "GPU") { + // allocate temp memory only once for multiple frames + // allocate temp memory, temp memory must not be used after this + // operation! + + // used for format_nbor_list_gpu_cuda + + TensorShape int_shape; + int_shape.AddDim(sec_a.size() + int_64(nloc) * sec_a.size() + nloc); + OP_REQUIRES_OK(context, + context->allocate_temp(DT_INT32, int_shape, &int_temp)); + + TensorShape uint64_shape; + uint64_shape.AddDim(int_64(nloc) * max_nbor_size * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, + &uint64_temp)); + + array_int = int_temp.flat().data(); + array_longlong = uint64_temp.flat().data(); + } + // loop over samples for (int_64 ff = 0; ff < nsamples; ++ff) { FPTYPE* em = p_em + ff * nloc * ndescrpt; @@ -1223,21 +1232,6 @@ class ProdEnvMatAMixOp : public OpKernel { mesh_tensor.flat().data(), mesh_tensor_size, nloc, nei_mode, rcut_r, max_cpy_trial, max_nnei_trial); - // allocate temp memory, temp memory must not be used after this - // operation! - Tensor int_temp; - TensorShape int_shape; - int_shape.AddDim(sec_a.size() + int_64(nloc) * sec_a.size() + nloc); - OP_REQUIRES_OK(context, - context->allocate_temp(DT_INT32, int_shape, &int_temp)); - Tensor uint64_temp; - TensorShape uint64_shape; - uint64_shape.AddDim(int_64(nloc) * max_nbor_size * 2); - OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, - &uint64_temp)); - array_int = int_temp.flat().data(); - array_longlong = uint64_temp.flat().data(); - // launch the gpu(nv) compute function deepmd::prod_env_mat_a_gpu_cuda( em, em_deriv, rij, nlist, coord, type, gpu_inlist, array_int, @@ -1267,21 +1261,6 @@ class ProdEnvMatAMixOp : public OpKernel { mesh_tensor.flat().data(), mesh_tensor_size, nloc, nei_mode, rcut_r, max_cpy_trial, max_nnei_trial); - // allocate temp memory, temp memory must not be used after this - // operation! - Tensor int_temp; - TensorShape int_shape; - int_shape.AddDim(sec_a.size() + int_64(nloc) * sec_a.size() + nloc); - OP_REQUIRES_OK(context, - context->allocate_temp(DT_INT32, int_shape, &int_temp)); - Tensor uint64_temp; - TensorShape uint64_shape; - uint64_shape.AddDim(int_64(nloc) * max_nbor_size * 2); - OP_REQUIRES_OK(context, context->allocate_temp(DT_UINT64, uint64_shape, - &uint64_temp)); - array_int = int_temp.flat().data(); - array_longlong = uint64_temp.flat().data(); - // launch the gpu(nv) compute function deepmd::prod_env_mat_a_gpu_rocm( em, em_deriv, rij, nlist, coord, type, gpu_inlist, array_int, From 4b832213d510c6f8f288fdc9cad6e81b4c209cd6 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sat, 16 Sep 2023 01:12:30 -0400 Subject: [PATCH 2/6] allocate tensor_list[0,1,3,4,5,6] out of loop Signed-off-by: Jinzhe Zeng --- source/op/prod_env_mat_multi_device.cc | 167 ++++++++++++++++++------- 1 file changed, 122 insertions(+), 45 deletions(-) diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc index 8ac1ff3cd5..8c4b9c2840 100644 --- a/source/op/prod_env_mat_multi_device.cc +++ b/source/op/prod_env_mat_multi_device.cc @@ -556,12 +556,53 @@ class ProdEnvMatAOp : public OpKernel { const FPTYPE* std = std_tensor.flat().data(); const int* p_type = type_tensor.flat().data(); + // must declar out of if, otherwise the memory will be destroyed! Tensor int_temp; Tensor uint64_temp; + std::vector tensor_list(7); if (device == "GPU") { // allocate temp memory only once for multiple frames // allocate temp memory, temp memory must not be used after this // operation! + if (nei_mode != 3) { + if (nei_mode == 1) { + // Tensor FPTYPE_temp; + TensorShape FPTYPE_shape; + FPTYPE_shape.AddDim(nall * 3); + OP_REQUIRES_OK(context, + context->allocate_temp(DataTypeToEnum::value, + FPTYPE_shape, &tensor_list[0])); + + // Tensor double_temp; + TensorShape double_shape; + double_shape.AddDim(18); + OP_REQUIRES_OK(context, + context->allocate_temp(DataTypeToEnum::value, + double_shape, &tensor_list[1])); + // Tensor cpy_temp; + TensorShape cpy_shape; + cpy_shape.AddDim(mem_cpy * 3); + OP_REQUIRES_OK(context, + context->allocate_temp(DataTypeToEnum::value, + cpy_shape, &tensor_list[3])); + // Tensor t_temp; + TensorShape t_shape; + t_shape.AddDim(mem_cpy * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, t_shape, + &tensor_list[4])); + } + + // Tensor nlist_temp; + TensorShape nlist_shape; + nlist_shape.AddDim(nloc * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, nlist_shape, + &tensor_list[5])); + + TensorShape jlist_shape; + jlist_shape.AddDim(3 * int_64(nloc) * mem_nnei); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, jlist_shape, + &tensor_list[6])); + } // used for format_nbor_list_gpu_cuda @@ -599,7 +640,6 @@ class ProdEnvMatAOp : public OpKernel { int* type_cpy; int frame_nall = nall; int mesh_tensor_size = static_cast(mesh_tensor.NumElements()); - std::vector tensor_list(7); // prepare coord and nlist _prepare_coord_nlist_gpu( context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy, @@ -840,12 +880,53 @@ class ProdEnvMatROp : public OpKernel { const FPTYPE* std = std_tensor.flat().data(); const int* p_type = type_tensor.flat().data(); + // must declar out of if, otherwise the memory will be destroyed! Tensor int_temp; Tensor uint64_temp; + std::vector tensor_list(7); if (device == "GPU") { // allocate temp memory only once for multiple frames // allocate temp memory, temp memory must not be used after this // operation! + if (nei_mode != 3) { + if (nei_mode == 1) { + // Tensor FPTYPE_temp; + TensorShape FPTYPE_shape; + FPTYPE_shape.AddDim(nall * 3); + OP_REQUIRES_OK(context, + context->allocate_temp(DataTypeToEnum::value, + FPTYPE_shape, &tensor_list[0])); + + // Tensor double_temp; + TensorShape double_shape; + double_shape.AddDim(18); + OP_REQUIRES_OK(context, + context->allocate_temp(DataTypeToEnum::value, + double_shape, &tensor_list[1])); + // Tensor cpy_temp; + TensorShape cpy_shape; + cpy_shape.AddDim(mem_cpy * 3); + OP_REQUIRES_OK(context, + context->allocate_temp(DataTypeToEnum::value, + cpy_shape, &tensor_list[3])); + // Tensor t_temp; + TensorShape t_shape; + t_shape.AddDim(mem_cpy * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, t_shape, + &tensor_list[4])); + } + + // Tensor nlist_temp; + TensorShape nlist_shape; + nlist_shape.AddDim(nloc * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, nlist_shape, + &tensor_list[5])); + + TensorShape jlist_shape; + jlist_shape.AddDim(3 * int_64(nloc) * mem_nnei); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, jlist_shape, + &tensor_list[6])); + } // used for format_nbor_list_gpu_cuda @@ -884,7 +965,6 @@ class ProdEnvMatROp : public OpKernel { int* type_cpy; int frame_nall = nall; int mesh_tensor_size = static_cast(mesh_tensor.NumElements()); - std::vector tensor_list(7); // prepare coord and nlist _prepare_coord_nlist_gpu( context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy, @@ -1178,10 +1258,50 @@ class ProdEnvMatAMixOp : public OpKernel { // must declar out of if, otherwise the memory will be destroyed! Tensor int_temp; Tensor uint64_temp; + std::vector tensor_list(7); if (device == "GPU") { // allocate temp memory only once for multiple frames // allocate temp memory, temp memory must not be used after this // operation! + if (nei_mode != 3) { + if (nei_mode == 1) { + // Tensor FPTYPE_temp; + TensorShape FPTYPE_shape; + FPTYPE_shape.AddDim(nall * 3); + OP_REQUIRES_OK(context, + context->allocate_temp(DataTypeToEnum::value, + FPTYPE_shape, &tensor_list[0])); + + // Tensor double_temp; + TensorShape double_shape; + double_shape.AddDim(18); + OP_REQUIRES_OK(context, + context->allocate_temp(DataTypeToEnum::value, + double_shape, &tensor_list[1])); + // Tensor cpy_temp; + TensorShape cpy_shape; + cpy_shape.AddDim(mem_cpy * 3); + OP_REQUIRES_OK(context, + context->allocate_temp(DataTypeToEnum::value, + cpy_shape, &tensor_list[3])); + // Tensor t_temp; + TensorShape t_shape; + t_shape.AddDim(mem_cpy * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, t_shape, + &tensor_list[4])); + } + + // Tensor nlist_temp; + TensorShape nlist_shape; + nlist_shape.AddDim(nloc * 2); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, nlist_shape, + &tensor_list[5])); + + TensorShape jlist_shape; + jlist_shape.AddDim(3 * int_64(nloc) * mem_nnei); + OP_REQUIRES_OK(context, context->allocate_temp(DT_INT32, jlist_shape, + &tensor_list[6])); + } // used for format_nbor_list_gpu_cuda @@ -1223,7 +1343,6 @@ class ProdEnvMatAMixOp : public OpKernel { int* type_cpy; int frame_nall = nall; int mesh_tensor_size = static_cast(mesh_tensor.NumElements()); - std::vector tensor_list(7); // prepare coord and nlist _prepare_coord_nlist_gpu( context, &tensor_list[0], &coord, coord_cpy, &f_type, type_cpy, @@ -1477,14 +1596,6 @@ static int _norm_copy_coord_gpu(OpKernelContext* context, const int& nloc, const int& max_cpy_trial, const float& rcut_r) { - // Tensor FPTYPE_temp; - TensorShape FPTYPE_shape; - FPTYPE_shape.AddDim(nall * 3); - tensorflow::Status status = context->allocate_temp( - DataTypeToEnum::value, FPTYPE_shape, tensor_list); - if (!status.ok()) { - return false; - } FPTYPE* tmp_coord = (*tensor_list).flat().data(); DPErrcheck(cudaMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3, cudaMemcpyDeviceToDevice)); @@ -1498,14 +1609,6 @@ static int _norm_copy_coord_gpu(OpKernelContext* context, deepmd::compute_cell_info(cell_info, rcut_r, region); const int loc_cellnum = cell_info[21]; const int total_cellnum = cell_info[22]; - // Tensor double_temp; - TensorShape double_shape; - double_shape.AddDim(18); - status = context->allocate_temp(DataTypeToEnum::value, double_shape, - tensor_list + 1); - if (!status.ok()) { - return false; - } // Tensor int_temp; TensorShape int_shape; int_shape.AddDim(23 + nloc * 3 + loc_cellnum + total_cellnum * 3 + @@ -1525,18 +1628,6 @@ static int _norm_copy_coord_gpu(OpKernelContext* context, deepmd::normalize_coord_gpu(tmp_coord, nall, region_dev); int tt; for (tt = 0; tt < max_cpy_trial; ++tt) { - // Tensor cpy_temp; - TensorShape cpy_shape; - cpy_shape.AddDim(mem_cpy * 3); - status = context->allocate_temp(DataTypeToEnum::value, cpy_shape, - tensor_list + 3); - if (!status.ok()) { - return false; - } - // Tensor t_temp; - TensorShape t_shape; - t_shape.AddDim(mem_cpy * 2); - context, context->allocate_temp(DT_INT32, t_shape, tensor_list + 4); coord_cpy = (*(tensor_list + 3)).flat().data(); type_cpy = (*(tensor_list + 4)).flat().data(); idx_mapping = type_cpy + mem_cpy; @@ -1568,14 +1659,6 @@ static int _build_nlist_gpu(OpKernelContext* context, const int& new_nall, const int& max_nnei_trial, const float& rcut_r) { - // Tensor nlist_temp; - TensorShape nlist_shape; - nlist_shape.AddDim(nloc * 2); - tensorflow::Status status = - context->allocate_temp(DT_INT32, nlist_shape, tensor_list); - if (!status.ok()) { - return false; - } ilist = (*tensor_list).flat().data(); numneigh = ilist + nloc; // Tensor jlist_temp; @@ -1584,12 +1667,6 @@ static int _build_nlist_gpu(OpKernelContext* context, std::vector firstneigh_host(nloc); int tt; for (tt = 0; tt < max_nnei_trial; ++tt) { - TensorShape jlist_shape; - jlist_shape.AddDim(3 * int_64(nloc) * mem_nnei); - status = context->allocate_temp(DT_INT32, jlist_shape, tensor_list + 1); - if (!status.ok()) { - return false; - } jlist = (*(tensor_list + 1)).flat().data(); ind_data = jlist + nloc * mem_nnei; for (int_64 ii = 0; ii < nloc; ++ii) { From f373e4bf9cee5558bc6feb06c069264e09c45b8e Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sat, 16 Sep 2023 01:56:08 -0400 Subject: [PATCH 3/6] *=2 Signed-off-by: Jinzhe Zeng --- source/op/prod_env_mat_multi_device.cc | 44 ++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc index 8c4b9c2840..a015648ae9 100644 --- a/source/op/prod_env_mat_multi_device.cc +++ b/source/op/prod_env_mat_multi_device.cc @@ -1638,6 +1638,21 @@ static int _norm_copy_coord_gpu(OpKernelContext* context, break; } else { mem_cpy *= 2; + // Tensor cpy_temp; + TensorShape cpy_shape; + cpy_shape.AddDim(mem_cpy * 3); + tensorflow::Status status = context->allocate_temp( + DataTypeToEnum::value, cpy_shape, tensor_list + 3); + if (!status.ok()) { + return false; + } + // Tensor t_temp; + TensorShape t_shape; + t_shape.AddDim(mem_cpy * 2); + status = context->allocate_temp(DT_INT32, t_shape, tensor_list + 4); + if (!status.ok()) { + return false; + } } } region_dev.boxt = new_boxt; @@ -1680,6 +1695,13 @@ static int _build_nlist_gpu(OpKernelContext* context, break; } else { mem_nnei *= 2; + TensorShape jlist_shape; + jlist_shape.AddDim(3 * int_64(nloc) * mem_nnei); + tensorflow::Status status = + context->allocate_temp(DT_INT32, jlist_shape, tensor_list + 1); + if (!status.ok()) { + return false; + } } } return (tt != max_nnei_trial); @@ -1857,6 +1879,21 @@ static int _norm_copy_coord_gpu_rocm(OpKernelContext* context, break; } else { mem_cpy *= 2; + // Tensor cpy_temp; + TensorShape cpy_shape; + cpy_shape.AddDim(mem_cpy * 3); + tensorflow::Status status = context->allocate_temp( + DataTypeToEnum::value, cpy_shape, tensor_list + 3); + if (!status.ok()) { + return false; + } + // Tensor t_temp; + TensorShape t_shape; + t_shape.AddDim(mem_cpy * 2); + status = context->allocate_temp(DT_INT32, t_shape, tensor_list + 4); + if (!status.ok()) { + return false; + } } } region_dev.boxt = new_boxt; @@ -1913,6 +1950,13 @@ static int _build_nlist_gpu_rocm(OpKernelContext* context, break; } else { mem_nnei *= 2; + TensorShape jlist_shape; + jlist_shape.AddDim(3 * int_64(nloc) * mem_nnei); + tensorflow::Status status = + context->allocate_temp(DT_INT32, jlist_shape, tensor_list + 1); + if (!status.ok()) { + return false; + } } } return (tt != max_nnei_trial); From c8654cac0bf557e3c58ef033b37ac420de4c73a1 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sat, 16 Sep 2023 01:59:14 -0400 Subject: [PATCH 4/6] clean rocm Signed-off-by: Jinzhe Zeng --- source/op/prod_env_mat_multi_device.cc | 42 +++----------------------- 1 file changed, 5 insertions(+), 37 deletions(-) diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc index a015648ae9..a1b111b5e7 100644 --- a/source/op/prod_env_mat_multi_device.cc +++ b/source/op/prod_env_mat_multi_device.cc @@ -1815,11 +1815,6 @@ static int _norm_copy_coord_gpu_rocm(OpKernelContext* context, const int& nloc, const int& max_cpy_trial, const float& rcut_r) { - // Tensor FPTYPE_temp; - TensorShape FPTYPE_shape; - FPTYPE_shape.AddDim(nall * 3); - context->allocate_temp(DataTypeToEnum::value, FPTYPE_shape, - tensor_list); FPTYPE* tmp_coord = (*tensor_list).flat().data(); DPErrcheck(hipMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3, hipMemcpyDeviceToDevice)); @@ -1833,20 +1828,16 @@ static int _norm_copy_coord_gpu_rocm(OpKernelContext* context, deepmd::compute_cell_info(cell_info, rcut_r, region); const int loc_cellnum = cell_info[21]; const int total_cellnum = cell_info[22]; - // Tensor double_temp; - TensorShape double_shape; - double_shape.AddDim(18); - tensorflow::Status status = context->allocate_temp( - DataTypeToEnum::value, double_shape, tensor_list + 1); - if (!status.ok()) { - return false; - } // Tensor int_temp; TensorShape int_shape; int_shape.AddDim(23 + nloc * 3 + loc_cellnum + total_cellnum * 3 + total_cellnum * 3 + loc_cellnum + 1 + total_cellnum + 1 + nloc); - context, context->allocate_temp(DT_INT32, int_shape, tensor_list + 2); + tensorflow::Status status = + context->allocate_temp(DT_INT32, int_shape, tensor_list + 2); + if (!status.ok()) { + return false; + } FPTYPE* box_info_dev = (*(tensor_list + 1)).flat().data(); int* cell_info_dev = (*(tensor_list + 2)).flat().data(); int* int_data_dev = cell_info_dev + 23; @@ -1860,15 +1851,6 @@ static int _norm_copy_coord_gpu_rocm(OpKernelContext* context, deepmd::normalize_coord_gpu_rocm(tmp_coord, nall, region_dev); int tt; for (tt = 0; tt < max_cpy_trial; ++tt) { - // Tensor cpy_temp; - TensorShape cpy_shape; - cpy_shape.AddDim(mem_cpy * 3); - context->allocate_temp(DataTypeToEnum::value, cpy_shape, - tensor_list + 3); - // Tensor t_temp; - TensorShape t_shape; - t_shape.AddDim(mem_cpy * 2); - context, context->allocate_temp(DT_INT32, t_shape, tensor_list + 4); coord_cpy = (*(tensor_list + 3)).flat().data(); type_cpy = (*(tensor_list + 4)).flat().data(); idx_mapping = type_cpy + mem_cpy; @@ -1915,14 +1897,6 @@ static int _build_nlist_gpu_rocm(OpKernelContext* context, const int& new_nall, const int& max_nnei_trial, const float& rcut_r) { - // Tensor nlist_temp; - TensorShape nlist_shape; - nlist_shape.AddDim(nloc * 2); - tensorflow::Status status = - context->allocate_temp(DT_INT32, nlist_shape, tensor_list); - if (!status.ok()) { - return false; - } ilist = (*tensor_list).flat().data(); numneigh = ilist + nloc; // Tensor jlist_temp; @@ -1931,12 +1905,6 @@ static int _build_nlist_gpu_rocm(OpKernelContext* context, std::vector firstneigh_host(nloc); int tt; for (tt = 0; tt < max_nnei_trial; ++tt) { - TensorShape jlist_shape; - jlist_shape.AddDim(3 * int_64(nloc) * mem_nnei); - status = context->allocate_temp(DT_INT32, jlist_shape, tensor_list + 1); - if (!status.ok()) { - return false; - } jlist = (*(tensor_list + 1)).flat().data(); ind_data = jlist + nloc * mem_nnei; for (int_64 ii = 0; ii < nloc; ++ii) { From 25c25be6a0e1fbc1ad0477f367f9a67e96f98673 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sat, 16 Sep 2023 02:08:54 -0400 Subject: [PATCH 5/6] fix status Signed-off-by: Jinzhe Zeng --- source/op/prod_env_mat_multi_device.cc | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc index a1b111b5e7..1ac2071611 100644 --- a/source/op/prod_env_mat_multi_device.cc +++ b/source/op/prod_env_mat_multi_device.cc @@ -1614,7 +1614,11 @@ static int _norm_copy_coord_gpu(OpKernelContext* context, int_shape.AddDim(23 + nloc * 3 + loc_cellnum + total_cellnum * 3 + total_cellnum * 3 + loc_cellnum + 1 + total_cellnum + 1 + nloc); - context, context->allocate_temp(DT_INT32, int_shape, tensor_list + 2); + tensorflow::Status status = + context->allocate_temp(DT_INT32, int_shape, tensor_list + 2); + if (!status.ok()) { + return false; + } FPTYPE* box_info_dev = (*(tensor_list + 1)).flat().data(); int* cell_info_dev = (*(tensor_list + 2)).flat().data(); int* int_data_dev = cell_info_dev + 23; @@ -1641,8 +1645,8 @@ static int _norm_copy_coord_gpu(OpKernelContext* context, // Tensor cpy_temp; TensorShape cpy_shape; cpy_shape.AddDim(mem_cpy * 3); - tensorflow::Status status = context->allocate_temp( - DataTypeToEnum::value, cpy_shape, tensor_list + 3); + status = context->allocate_temp(DataTypeToEnum::value, cpy_shape, + tensor_list + 3); if (!status.ok()) { return false; } From 27d8e5f2296155a8fed44002ffea2355c686f823 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Sat, 16 Sep 2023 02:57:10 -0400 Subject: [PATCH 6/6] clean rocm Signed-off-by: Jinzhe Zeng --- source/op/prod_env_mat_multi_device.cc | 3 --- 1 file changed, 3 deletions(-) diff --git a/source/op/prod_env_mat_multi_device.cc b/source/op/prod_env_mat_multi_device.cc index 1ac2071611..862fb5856f 100644 --- a/source/op/prod_env_mat_multi_device.cc +++ b/source/op/prod_env_mat_multi_device.cc @@ -669,7 +669,6 @@ class ProdEnvMatAOp : public OpKernel { int* type_cpy; int frame_nall = nall; int mesh_tensor_size = static_cast(mesh_tensor.NumElements()); - std::vector tensor_list(7); // prepare coord and nlist _prepare_coord_nlist_gpu_rocm( context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy, @@ -994,7 +993,6 @@ class ProdEnvMatROp : public OpKernel { int* type_cpy; int frame_nall = nall; int mesh_tensor_size = static_cast(mesh_tensor.NumElements()); - std::vector tensor_list(7); // prepare coord and nlist _prepare_coord_nlist_gpu_rocm( context, &tensor_list[0], &coord, coord_cpy, &type, type_cpy, @@ -1371,7 +1369,6 @@ class ProdEnvMatAMixOp : public OpKernel { int* type_cpy; int frame_nall = nall; int mesh_tensor_size = static_cast(mesh_tensor.NumElements()); - std::vector tensor_list(7); // prepare coord and nlist _prepare_coord_nlist_gpu_rocm( context, &tensor_list[0], &coord, coord_cpy, &f_type, type_cpy,