diff --git a/deepmd/descriptor/se_a.py b/deepmd/descriptor/se_a.py index c29b667143..d22db2d41b 100644 --- a/deepmd/descriptor/se_a.py +++ b/deepmd/descriptor/se_a.py @@ -147,6 +147,7 @@ def __init__( ) self.sel_a = sel self.rcut_r = rcut + self.register_buffer("buffer_rcut", paddle.to_tensor(rcut, dtype="float64")) self.rcut_r_smth = rcut_smth self.filter_neuron = neuron self.n_axis_neuron = axis_neuron @@ -175,10 +176,12 @@ def __init__( self.sel_a.extend(self.sel_a_spin) else: self.ntypes_spin = 0 + self.register_buffer("buffer_ntypes_spin", paddle.to_tensor(self.ntypes_spin)) # descrpt config self.sel_r = [0 for ii in range(len(self.sel_a))] self.ntypes = len(self.sel_a) + self.register_buffer("buffer_ntypes", paddle.to_tensor(self.ntypes)) assert self.ntypes == len(self.sel_r) self.rcut_a = -1 # numb of neighbors and numb of descrptors diff --git a/deepmd/entrypoints/freeze.py b/deepmd/entrypoints/freeze.py index 121b6c77a6..3582c3eb96 100755 --- a/deepmd/entrypoints/freeze.py +++ b/deepmd/entrypoints/freeze.py @@ -424,7 +424,20 @@ def freeze_graph( False, ], ) - paddle.jit.save(st_model, output) + print(f"st_model.descrpt.buffer_rcut.name = {st_model.descrpt.buffer_rcut.name}") + print( + f"st_model.descrpt.buffer_ntypes.name = {st_model.descrpt.buffer_ntypes.name}" + ) + print( + f"st_model.fitting.buffer_dfparam.name = {st_model.fitting.buffer_dfparam.name}" + ) + print( + f"st_model.fitting.buffer_daparam.name = {st_model.fitting.buffer_daparam.name}" + ) + # Skip pruning of the program so that parameters not used in the forward pass, such as rcut and ntypes, are preserved and can be retrieved on the C++ side + skip_prune_program = True + print(f"==>> skip_prune_program = {skip_prune_program}") + paddle.jit.save(st_model, output, skip_prune_program=skip_prune_program) print(f"Saved to path: {output}") diff --git a/deepmd/fit/ener.py b/deepmd/fit/ener.py index 036bdc54f9..a13ddec13d 100644 --- a/deepmd/fit/ener.py +++ b/deepmd/fit/ener.py @@ -140,7 +140,9 @@ def __init__( # .add("precision", str, default = "default")\ # .add("trainable", [list, bool], default = True) self.numb_fparam = numb_fparam + self.register_buffer("buffer_dfparam", paddle.to_tensor(self.numb_fparam)) self.numb_aparam = numb_aparam + self.register_buffer("buffer_daparam", paddle.to_tensor(self.numb_aparam)) self.n_neuron = neuron self.resnet_dt = resnet_dt self.rcond = rcond diff --git a/deepmd/infer/deep_eval.py b/deepmd/infer/deep_eval.py index 6d41b91506..7b0b0d5536 100644 --- a/deepmd/infer/deep_eval.py +++ b/deepmd/infer/deep_eval.py @@ -94,17 +94,23 @@ def __init__( for k, v in load_state_dict.items(): if k in self.model.state_dict(): if load_state_dict[k].dtype != self.model.state_dict()[k].dtype: - # print(f"convert dtype from {load_state_dict[k].dtype} to {self.model.state_dict()[k].dtype}") + print( + f"convert {k}'s dtype from {load_state_dict[k].dtype} to {self.model.state_dict()[k].dtype}" + ) load_state_dict[k] = load_state_dict[k].astype( self.model.state_dict()[k].dtype ) if list(load_state_dict[k].shape) != list( self.model.state_dict()[k].shape ): - # print(f"convert shape from {load_state_dict[k].shape} to {self.model.state_dict()[k].shape}") + print( + f"convert {k}'s shape from {load_state_dict[k].shape} to {self.model.state_dict()[k].shape}" + ) load_state_dict[k] = load_state_dict[k].reshape( self.model.state_dict()[k].shape ) + # print(f"==>> Load pretrained model successfully from: {str(model_file)}") + # exit() self.model.set_state_dict(load_state_dict) self.load_prefix = load_prefix diff --git a/deepmd/infer/deep_pot.py b/deepmd/infer/deep_pot.py index 909a9d23ac..b0ade1fc1a 
100644 --- a/deepmd/infer/deep_pot.py +++ b/deepmd/infer/deep_pot.py @@ -636,6 +636,17 @@ def _eval_inner( eval_inputs["box"], # [45] paddle.float64 eval_inputs["default_mesh"], # [6] paddle.int32 ) + # print(eval_inputs["coord"].shape) + # print(eval_inputs["type"].shape) + # print(eval_inputs["natoms_vec"].shape) + # print(eval_inputs["box"].shape) + # print(eval_inputs["default_mesh"].shape) + # np.save("/workspace/hesensen/deepmd_backend/python_infer_data/coord.npy", eval_inputs["coord"].numpy()) + # np.save("/workspace/hesensen/deepmd_backend/python_infer_data/type.npy", eval_inputs["type"].numpy()) + # np.save("/workspace/hesensen/deepmd_backend/python_infer_data/natoms_vec.npy", eval_inputs["natoms_vec"].numpy()) + # np.save("/workspace/hesensen/deepmd_backend/python_infer_data/box.npy", eval_inputs["box"].numpy()) + # np.save("/workspace/hesensen/deepmd_backend/python_infer_data/default_mesh.npy", eval_inputs["default_mesh"].numpy()) + # exit() eval_outputs = { "atom_ener": eval_outputs[0], "atom_virial": eval_outputs[1], @@ -656,6 +667,12 @@ def _eval_inner( # "xx1": eval_outputs[9], # "hidden1": eval_outputs[10], } + + # for k, v in eval_outputs.items(): + # print(k, v.shape) + # np.save(f"/workspace/hesensen/deepmd_backend/python_infer_data/st_model_{k}.npy", v.numpy()) + # print(f"finished save {k}") + # exit() else: eval_outputs = self.model( eval_inputs["coord"], # [2880] paddle.float64 diff --git a/examples/water/lmp/Model_1000000_with_buffer.pdiparams b/examples/water/lmp/Model_1000000_with_buffer.pdiparams new file mode 100644 index 0000000000..3b4a25e6ba Binary files /dev/null and b/examples/water/lmp/Model_1000000_with_buffer.pdiparams differ diff --git a/examples/water/lmp/Model_1000000_with_buffer.pdmodel b/examples/water/lmp/Model_1000000_with_buffer.pdmodel new file mode 100644 index 0000000000..fbc0689402 Binary files /dev/null and b/examples/water/lmp/Model_1000000_with_buffer.pdmodel differ diff --git a/examples/water/lmp/in.lammps b/examples/water/lmp/in.lammps index ea3b5d52cd..5883016634 100644 --- a/examples/water/lmp/in.lammps +++ b/examples/water/lmp/in.lammps @@ -12,7 +12,7 @@ mass 1 16 mass 2 2 # See https://deepmd.rtfd.io/lammps/ for usage -pair_style deepmd frozen_model.pb +pair_style deepmd Model_1000000_with_buffer # If atom names (O H in this example) are not set in the pair_coeff command, the type_map defined by the training parameter will be used by default. 
pair_coeff * * O H diff --git a/examples/water/lmp/model.pb b/examples/water/lmp/model.pb new file mode 100644 index 0000000000..fa246dffba Binary files /dev/null and b/examples/water/lmp/model.pb differ diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index ccf9641795..7a617d4c02 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -2,6 +2,98 @@ cmake_minimum_required(VERSION 3.16) project(DeePMD) +macro(safe_set_static_flag) + foreach(flag_var + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) + if(${flag_var} MATCHES "/MD") + string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") + endif(${flag_var} MATCHES "/MD") + endforeach(flag_var) +endmacro() + +if(NOT DEFINED PADDLE_LIB) + message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib") +endif() +set(PADDLE_LIB ${PADDLE_LIB} CACHE PATH "/path/paddle/lib") + +include_directories("${PADDLE_LIB}/") +set(PADDLE_LIB_THIRD_PARTY_PATH "${PADDLE_LIB}/third_party/install/") + +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/include") +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/include") +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/include") +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/include") + +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib") +link_directories("${PADDLE_LIB}/paddle/lib") + +# add custom operators +option(USE_TENSORRT "Compile demo with TensorRT." OFF) + +if(WITH_GPU) + if(NOT WIN32) + set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library") + else() + if(CUDA_LIB STREQUAL "") + set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\lib\\x64") + endif() + endif(NOT WIN32) +endif() + +if (NOT WIN32) + if (USE_TENSORRT AND WITH_GPU) + include_directories("${TENSORRT_INCLUDE_DIR}") + link_directories("${TENSORRT_LIB_DIR}") + endif() +endif(NOT WIN32) + + +if(WITH_STATIC_LIB) + set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX}) +else() + if(WIN32) + set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_STATIC_LIBRARY_SUFFIX}) + else() + set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX}) + endif() +endif() + +if (NOT WIN32) + set(EXTERNAL_LIB "-lrt -ldl -lpthread") + set(DEPS ${DEPS} + ${MATH_LIB} ${MKLDNN_LIB} + glog gflags protobuf xxhash + ${EXTERNAL_LIB}) +else() + set(DEPS ${DEPS} + ${MATH_LIB} ${MKLDNN_LIB} + glog gflags_static libprotobuf xxhash ${EXTERNAL_LIB}) + set(DEPS ${DEPS} shlwapi.lib) +endif(NOT WIN32) + +if(WITH_GPU) + if(NOT WIN32) + if (USE_TENSORRT) + set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX}) + endif() + set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX}) + else() + if(USE_TENSORRT) + set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX}) + endif() + set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX} ) + set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX} ) + set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX} ) + endif() +endif() + + 
option(BUILD_TESTING "Build test and enable converage" OFF) set(DEEPMD_C_ROOT "" @@ -175,6 +267,7 @@ if(BUILD_CPP_IF) set(LIB_DEEPMD_CC "deepmd_cc") set(LIB_DEEPMD_C "deepmd_c") if(USE_CUDA_TOOLKIT) + set(LIB_DEEPMD_OP_DEVICE "deepmd_paddle_op_cuda") set(LIB_DEEPMD_OP_DEVICE "deepmd_op_cuda") elseif(USE_ROCM_TOOLKIT) set(LIB_DEEPMD_OP_DEVICE "deepmd_op_rocm") @@ -260,6 +353,33 @@ if(BUILD_CPP_IF) endif() endif(BUILD_CPP_IF) +# if(WIN32) +# if(USE_TENSORRT) +# add_custom_command(TARGET ${DEMO_NAME} POST_BUILD +# COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_SHARED_LIBRARY_SUFFIX} +# ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} +# COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX} +# ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} +# ) +# endif() +# if(WITH_MKL) +# add_custom_command(TARGET ${DEMO_NAME} POST_BUILD +# COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/mklml.dll ${CMAKE_BINARY_DIR}/Release +# COMMAND ${CMAKE_COMMAND} -E copy ${MATH_LIB_PATH}/lib/libiomp5md.dll ${CMAKE_BINARY_DIR}/Release +# COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_PATH}/lib/mkldnn.dll ${CMAKE_BINARY_DIR}/Release +# ) +# else() +# add_custom_command(TARGET ${DEMO_NAME} POST_BUILD +# COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_LIB_PATH}/lib/openblas.dll ${CMAKE_BINARY_DIR}/Release +# ) +# endif() +# if(NOT WITH_STATIC_LIB) +# add_custom_command(TARGET ${DEMO_NAME} POST_BUILD +# COMMAND ${CMAKE_COMMAND} -E copy "${PADDLE_LIB}/paddle/lib/paddle_fluid.dll" ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} +# ) +# endif() +# endif() + # uninstall target configure_file( "${CMAKE_CURRENT_SOURCE_DIR}/cmake/cmake_uninstall.cmake.in" diff --git a/source/api_c/src/c_api.cc b/source/api_c/src/c_api.cc index 613af8c276..8f890bc5b3 100644 --- a/source/api_c/src/c_api.cc +++ b/source/api_c/src/c_api.cc @@ -54,11 +54,25 @@ DP_DeepPot* DP_NewDeepPotWithParam2(const char* c_model, const char* c_file_content, const int size_file_content) { std::string model(c_model); + printf("==>> [DP_NewDeepPotWithParam2]\n"); std::string file_content(c_file_content, c_file_content + size_file_content); DP_NEW_OK(DP_DeepPot, deepmd::DeepPot dp(model, gpu_rank, file_content); DP_DeepPot* new_dp = new DP_DeepPot(dp); return new_dp;) } +// DP_DeepPot* DP_NewDeepPotWithParam3(const char* c_pdmodel_path, +// const char* c_pdiparams_path, +// const int gpu_rank, +// const char* c_file_content, +// const int size_file_content) { +// std::string pdmodel(c_pdmodel_path); +// std::string pdiparams(c_pdiparams_path); +// printf("==>> [DP_NewDeepPotWithParam3 Paddle ver]\n"); +// std::string file_content(c_file_content, c_file_content + size_file_content); +// DP_NEW_OK(DP_DeepPot, deepmd::DeepPot dp(pdmodel, pdiparams, gpu_rank, file_content); +// DP_DeepPot* new_dp = new DP_DeepPot(dp); return new_dp;) +// } + DP_DeepPotModelDevi::DP_DeepPotModelDevi() {} DP_DeepPotModelDevi::DP_DeepPotModelDevi(deepmd::DeepPotModelDevi& dp) : dp(dp) { diff --git a/source/api_cc/include/DeepPot.h b/source/api_cc/include/DeepPot.h index 1c440e2668..a5bc0d8aa8 100644 --- a/source/api_cc/include/DeepPot.h +++ b/source/api_cc/include/DeepPot.h @@ -291,12 +291,18 @@ class DeepPot { void get_type_map(std::string& type_map); private: + std::shared_ptr predictor = nullptr; + paddle_infer::Config config; + int math_lib_num_threads; tensorflow::Session* session; int num_intra_nthreads, num_inter_nthreads; tensorflow::GraphDef* graph_def; bool inited; template VT get_scalar(const std::string& name) const; + template + VT 
paddle_get_scalar(const std::string& name) const; + // VALUETYPE get_rcut () const; // int get_ntypes () const; double rcut; diff --git a/source/api_cc/include/common.h b/source/api_cc/include/common.h index bed8e97e82..04621e05af 100644 --- a/source/api_cc/include/common.h +++ b/source/api_cc/include/common.h @@ -14,6 +14,7 @@ #else #include "tf_public.h" #endif +#include "paddle/include/paddle_inference_api.h" namespace deepmd { @@ -191,6 +192,16 @@ VT session_get_scalar(tensorflow::Session* session, const std::string name, const std::string scope = ""); +/** + * @brief Get the value of a tensor. + * @param[in] predictor Paddle inference predictor. + * @param[in] name The name of the tensor. + * @return The value of the tensor. + **/ +template +VT predictor_get_scalar(const std::shared_ptr& predictor, + const std::string name_); + /** * @brief Get the vector of a tensor. * @param[out] o_vec The output vector. @@ -215,6 +226,16 @@ int session_get_dtype(tensorflow::Session* session, const std::string name, const std::string scope = ""); +/** + * @brief Get the type of a tensor. + * @param[in] predictor Paddle inference predictor. + * @param[in] name The name of the tensor. + * @return The type of the tensor. + **/ +paddle_infer::DataType predictor_get_dtype( + const std::shared_ptr& predictor, + const std::string& name_); + /** * @brief Get input tensors. * @param[out] input_tensors Input tensors. @@ -270,6 +291,35 @@ int session_input_tensors( const int ago, const std::string scope = ""); +/** + * @brief Send input data into paddle tensor handles. + * @param[in] predictor The paddle predictor pointer. + * @param[in] dcoord_ Coordinates of atoms. + * @param[in] ntypes Number of atom types. + * @param[in] datype_ Atom types. + * @param[in] dlist Neighbor list. + * @param[in] fparam_ Frame parameters. + * @param[in] aparam_ Atom parameters. + * @param[in] atommap Atom map. + * @param[in] nghost Number of ghost atoms. + * @param[in] ago Update the internal neighbour list if ago is 0. + * @param[in] scope The scope of the tensors. + */ +template +int predictor_input_tensors( + const std::shared_ptr& predictor, + const std::vector& dcoord_, + const int& ntypes, + const std::vector& datype_, + const std::vector& dbox, + InputNlist& dlist, + const std::vector& fparam_, + const std::vector& aparam_, + const deepmd::AtomMap& atommap, + const int nghost, + const int ago, + const std::string scope = ""); + /** * @brief Get input tensors for mixed type. * @param[out] input_tensors Input tensors. 
diff --git a/source/api_cc/src/DeepPot.cc b/source/api_cc/src/DeepPot.cc index d8f0d8a8fe..2ee6217c84 100644 --- a/source/api_cc/src/DeepPot.cc +++ b/source/api_cc/src/DeepPot.cc @@ -4,6 +4,8 @@ #include "AtomMap.h" #include "device.h" +#include "paddle/include/paddle_inference_api.h" +// #include "glog/logging.h" using namespace tensorflow; using namespace deepmd; @@ -115,6 +117,7 @@ template void run_model( const int nframes, const int nghost); +/* This function receives the forwarded arguments and performs the actual computation */ template static void run_model( std::vector& dener, @@ -215,6 +218,166 @@ static void run_model( nframes, nall); } +// start of paddle_run_model +template +static void paddle_run_model( + std::vector& dener, + std::vector& dforce_, + std::vector& dvirial, + std::vector& datom_energy_, + std::vector& datom_virial_, + const std::shared_ptr& predictor, + const deepmd::AtomMap& atommap, + const int& nframes, + const int& nghost = 0) { + unsigned nloc = atommap.get_type().size(); + unsigned nall = nloc + nghost; + dener.resize(nframes); + if (nloc == 0) { + // no backward map needed + // dforce of size nall * 3 + dforce_.resize(nframes * nall * 3); + fill(dforce_.begin(), dforce_.end(), (VALUETYPE)0.0); + // dvirial of size 9 + dvirial.resize(nframes * 9); + fill(dvirial.begin(), dvirial.end(), (VALUETYPE)0.0); + // datom_energy_ of size nall + datom_energy_.resize(nframes * nall); + fill(datom_energy_.begin(), datom_energy_.end(), (VALUETYPE)0.0); + // datom_virial_ of size nall * 9 + datom_virial_.resize(nframes * nall * 9); + fill(datom_virial_.begin(), datom_virial_.end(), (VALUETYPE)0.0); + return; + } + + /* Running inference */ + if (!predictor->Run()) { + throw deepmd::deepmd_exception("Paddle inference failed"); + } + + /* Get output handles*/ + auto output_names = predictor->GetOutputNames(); + auto output_atom_ener_tensor = predictor->GetOutputHandle(output_names[0]); + auto output_atom_virial_tensor = predictor->GetOutputHandle(output_names[1]); + auto output_atype_tensor = predictor->GetOutputHandle(output_names[2]); + auto output_coord_tensor = predictor->GetOutputHandle(output_names[3]); + auto output_energy_tensor = predictor->GetOutputHandle(output_names[4]); + auto output_force_tensor = predictor->GetOutputHandle(output_names[5]); + auto output_virial_tensor = predictor->GetOutputHandle(output_names[6]); + + // get the shape information of the output tensors + std::vector output_atom_ener_shape = output_atom_ener_tensor->shape(); + int output_atom_ener_size = + std::accumulate(output_atom_ener_shape.begin(), + output_atom_ener_shape.end(), 1, std::multiplies()); + std::vector output_atom_virial_shape = + output_atom_virial_tensor->shape(); + int output_atom_virial_size = std::accumulate( + output_atom_virial_shape.begin(), output_atom_virial_shape.end(), 1, + std::multiplies()); + std::vector output_atype_shape = output_atype_tensor->shape(); + int output_atype_size = + std::accumulate(output_atype_shape.begin(), output_atype_shape.end(), 1, + std::multiplies()); + std::vector output_coord_shape = output_coord_tensor->shape(); + int output_coord_size = + std::accumulate(output_coord_shape.begin(), output_coord_shape.end(), 1, + std::multiplies()); + std::vector output_energy_shape = output_energy_tensor->shape(); + int output_energy_size = + std::accumulate(output_energy_shape.begin(), output_energy_shape.end(), 1, + std::multiplies()); + std::vector output_force_shape = output_force_tensor->shape(); + int output_force_size = + std::accumulate(output_force_shape.begin(), output_force_shape.end(), 1, + std::multiplies()); + std::vector 
output_virial_shape = output_virial_tensor->shape(); + int output_virial_size = + std::accumulate(output_virial_shape.begin(), output_virial_shape.end(), 1, + std::multiplies()); + + // get data of output_atom_ener + std::vector output_atom_ener_data; + output_atom_ener_data.resize(output_atom_ener_size); + output_atom_ener_tensor->CopyToCpu(output_atom_ener_data.data()); + // get data of output_atom_virial + std::vector output_atom_virial_data; + output_atom_virial_data.resize(output_atom_virial_size); + output_atom_virial_tensor->CopyToCpu(output_atom_virial_data.data()); + // get data of output_atype + // std::vector output_atype_data; + // output_atype_data.resize(output_atype_size); + // output_atype_tensor->CopyToCpu(output_atype_data.data()); + // get data of output_coord + std::vector output_coord_data; + output_coord_data.resize(output_coord_size); + output_coord_tensor->CopyToCpu(output_coord_data.data()); + // get data of output_energy + std::vector output_energy_data; + output_energy_data.resize(output_energy_size); + output_energy_tensor->CopyToCpu(output_energy_data.data()); + // get data of output_force + std::vector output_force_data; + output_force_data.resize(output_force_size); + output_force_tensor->CopyToCpu(output_force_data.data()); + // get data of output_virial + // std::vector output_virial_data; + // output_virial_data.resize(output_virial_size); + // output_virial_tensor->CopyToCpu(output_virial_data.data()); + + std::vector dforce(nframes * 3 * nall); + std::vector datom_energy(nframes * nall, 0); + std::vector datom_virial(nframes * 9 * nall); + dvirial.resize(nframes * 9); + for (int ii = 0; ii < nframes; ++ii) { + dener[ii] = output_energy_data[ii]; + } + for (int ii = 0; ii < nframes * nall * 3; ++ii) { + dforce[ii] = output_force_data[ii]; + } + for (int ii = 0; ii < nframes; ++ii) { + for (int jj = 0; jj < nloc; ++jj) { + datom_energy[ii * nall + jj] = output_atom_ener_data[ii * nloc + jj]; + } + } + for (int ii = 0; ii < nframes * nall * 9; ++ii) { + datom_virial[ii] = output_atom_virial_data[ii]; + } + // set dvirial to zero, prevent input vector is not zero (#1123) + std::fill(dvirial.begin(), dvirial.end(), (VALUETYPE)0.); + for (int kk = 0; kk < nframes; ++kk) { + for (int ii = 0; ii < nall; ++ii) { + dvirial[kk * 9 + 0] += + (VALUETYPE)1.0 * datom_virial[kk * nall * 9 + 9 * ii + 0]; + dvirial[kk * 9 + 1] += + (VALUETYPE)1.0 * datom_virial[kk * nall * 9 + 9 * ii + 1]; + dvirial[kk * 9 + 2] += + (VALUETYPE)1.0 * datom_virial[kk * nall * 9 + 9 * ii + 2]; + dvirial[kk * 9 + 3] += + (VALUETYPE)1.0 * datom_virial[kk * nall * 9 + 9 * ii + 3]; + dvirial[kk * 9 + 4] += + (VALUETYPE)1.0 * datom_virial[kk * nall * 9 + 9 * ii + 4]; + dvirial[kk * 9 + 5] += + (VALUETYPE)1.0 * datom_virial[kk * nall * 9 + 9 * ii + 5]; + dvirial[kk * 9 + 6] += + (VALUETYPE)1.0 * datom_virial[kk * nall * 9 + 9 * ii + 6]; + dvirial[kk * 9 + 7] += + (VALUETYPE)1.0 * datom_virial[kk * nall * 9 + 9 * ii + 7]; + dvirial[kk * 9 + 8] += + (VALUETYPE)1.0 * datom_virial[kk * nall * 9 + 9 * ii + 8]; + } + } + dforce_ = dforce; + datom_energy_ = datom_energy; + datom_virial_ = datom_virial; + atommap.backward(dforce_.begin(), dforce.begin(), 3, nframes, + nall); + atommap.backward(datom_energy_.begin(), datom_energy.begin(), 1, + nframes, nall); + atommap.backward(datom_virial_.begin(), datom_virial.begin(), 9, + nframes, nall); +} + template void run_model( std::vector& dener, std::vector& dforce_, @@ -263,6 +426,51 @@ template void run_model( const int& nframes, const int& nghost); +/*start 
paddle run_model*/ +template void paddle_run_model( + std::vector& dener, + std::vector& dforce_, + std::vector& dvirial, + std::vector& datom_energy_, + std::vector& datom_virial_, + const std::shared_ptr& predictor, + const deepmd::AtomMap& atommap, + const int& nframes, + const int& nghost); + +template void paddle_run_model( + std::vector& dener, + std::vector& dforce_, + std::vector& dvirial, + std::vector& datom_energy_, + std::vector& datom_virial_, + const std::shared_ptr& predictor, + const deepmd::AtomMap& atommap, + const int& nframes, + const int& nghost); + +template void paddle_run_model( + std::vector& dener, + std::vector& dforce_, + std::vector& dvirial, + std::vector& datom_energy_, + std::vector& datom_virial_, + const std::shared_ptr& predictor, + const deepmd::AtomMap& atommap, + const int& nframes, + const int& nghost); + +template void paddle_run_model( + std::vector& dener, + std::vector& dforce_, + std::vector& dvirial, + std::vector& datom_energy_, + std::vector& datom_virial_, + const std::shared_ptr& predictor, + const deepmd::AtomMap& atommap, + const int& nframes, + const int& nghost); + // end multiple frames // start single frame @@ -325,6 +533,7 @@ template void run_model( const int nframes, const int nghost); +/*Forwarding function of tensorflow*/ template static void run_model( ENERGYTYPE& dener, @@ -346,6 +555,27 @@ static void run_model( dener = dener_[0]; } +/*Forwarding function of paddle*/ +template +static void paddle_run_model( + ENERGYTYPE& dener, + std::vector& dforce_, + std::vector& dvirial, + std::vector& datom_energy_, + std::vector& datom_virial_, + const std::shared_ptr& predictor, + const deepmd::AtomMap& atommap, + const int& nframes = 1, + const int& nghost = 0) { + assert(nframes == 1); + std::vector dener_(1); + // call multi-frame version + paddle_run_model(dener_, dforce_, dvirial, + datom_energy_, datom_virial_, + predictor, atommap, nframes, nghost); + dener = dener_[0]; +} + template void run_model( ENERGYTYPE& dener, std::vector& dforce_, @@ -394,6 +624,51 @@ template void run_model( const int& nframes, const int& nghost); +/*start paddle */ +template void paddle_run_model( + ENERGYTYPE& dener, + std::vector& dforce_, + std::vector& dvirial, + std::vector& datom_energy_, + std::vector& datom_virial_, + const std::shared_ptr& predictor, + const deepmd::AtomMap& atommap, + const int& nframes, + const int& nghost); + +template void paddle_run_model( + ENERGYTYPE& dener, + std::vector& dforce_, + std::vector& dvirial, + std::vector& datom_energy_, + std::vector& datom_virial_, + const std::shared_ptr& predictor, + const deepmd::AtomMap& atommap, + const int& nframes, + const int& nghost); + +template void paddle_run_model( + ENERGYTYPE& dener, + std::vector& dforce_, + std::vector& dvirial, + std::vector& datom_energy_, + std::vector& datom_virial_, + const std::shared_ptr& predictor, + const deepmd::AtomMap& atommap, + const int& nframes, + const int& nghost); + +template void paddle_run_model( + ENERGYTYPE& dener, + std::vector& dforce_, + std::vector& dvirial, + std::vector& datom_energy_, + std::vector& datom_virial_, + const std::shared_ptr& predictor, + const deepmd::AtomMap& atommap, + const int& nframes, + const int& nghost); + // end single frame DeepPot::DeepPot() @@ -417,6 +692,28 @@ void DeepPot::init(const std::string& model, << std::endl; return; } + std::string pdmodel_path = ""; + std::string pdiparams_path = ""; + bool use_paddle_inference = false; + std::string tf_model = model; + if (model.find(".pb") == 
std::string::npos) { + pdmodel_path = model + ".pdmodel"; + pdiparams_path = model + ".pdiparams"; + use_paddle_inference = true; + tf_model = "model.pb"; + } + math_lib_num_threads = 1; + + if (use_paddle_inference) { + config.SetModel(pdmodel_path, pdiparams_path); + config.SwitchIrOptim(true); + config.EnableUseGpu(8192, 0); + // std::cout << "IR Optim is: " << config.ir_optim() << std::endl; + // config.EnableMKLDNN(); + config.EnableMemoryOptim(); + // config.EnableProfile(); + predictor = paddle_infer::CreatePredictor(config); + } SessionOptions options; get_env_nthreads(num_intra_nthreads, num_inter_nthreads); options.config.set_inter_op_parallelism_threads(num_inter_nthreads); @@ -424,7 +721,7 @@ void DeepPot::init(const std::string& model, deepmd::load_op_library(); if (file_content.size() == 0) - check_status(ReadBinaryProto(Env::Default(), model, graph_def)); + check_status(ReadBinaryProto(Env::Default(), tf_model, graph_def)); else (*graph_def).ParseFromString(file_content); int gpu_num = -1; @@ -443,19 +740,47 @@ void DeepPot::init(const std::string& model, #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM check_status(NewSession(options, &session)); check_status(session->Create(*graph_def)); + if (use_paddle_inference) { + /* + tensorflow::DT_DOUBLE = 2 + tensorflow::DT_FLOAT = 1 + paddle_infer::DataType::FLOAT64 = 7 + paddle_infer::DataType::FLOAT32 = 0 + * st_model.descrpt.buffer_rcut.name = generated_tensor_0 + * st_model.descrpt.buffer_ntypes.name = generated_tensor_2 + * st_model.fitting.buffer_dfparam.name = generated_tensor_9 + * st_model.fitting.buffer_daparam.name = generated_tensor_10 + **/ + model_version = "0.0"; + dtype = predictor_get_dtype(predictor, "generated_tensor_0"); + if (dtype == paddle_infer::DataType::FLOAT64) { + rcut = paddle_get_scalar("generated_tensor_0"); + dtype = tensorflow::DT_DOUBLE; + } else { + rcut = paddle_get_scalar("generated_tensor_0"); + dtype = tensorflow::DT_FLOAT; + } + ntypes = paddle_get_scalar("generated_tensor_2"); + // ntypes_spin = paddle_get_scalar("buffer_ntypes_spin"); + ntypes_spin = 0; + dfparam = paddle_get_scalar("generated_tensor_9"); + daparam = paddle_get_scalar("generated_tensor_10"); + model_type = "ener"; + return ; + } try { model_version = get_scalar("model_attr/model_version"); } catch (deepmd::tf_exception& e) { // no model version defined in old models model_version = "0.0"; } - if (!model_compatable(model_version)) { - throw deepmd::deepmd_exception( - "incompatable model: version " + model_version + - " in graph, but version " + global_model_version + - " supported " - "See https://deepmd.rtfd.io/compatability/ for details."); - } + // if (!model_compatable(model_version)) { + // throw deepmd::deepmd_exception( + // "incompatable model: version " + model_version + + // " in graph, but version " + global_model_version + + // " supported " + // "See https://deepmd.rtfd.io/compatability/ for details."); + // } dtype = session_get_dtype(session, "descrpt_attr/rcut"); if (dtype == tensorflow::DT_DOUBLE) { rcut = get_scalar("descrpt_attr/rcut"); @@ -488,6 +813,11 @@ VT DeepPot::get_scalar(const std::string& name) const { return session_get_scalar(session, name); } +template +VT DeepPot::paddle_get_scalar(const std::string& name) const { + return predictor_get_scalar(predictor, name); +} + template void DeepPot::validate_fparam_aparam( const int& nframes, @@ -937,12 +1267,26 @@ void DeepPot::compute(ENERGYVTYPE& dener, } if (dtype == tensorflow::DT_DOUBLE) { - int ret = session_input_tensors(input_tensors, dcoord, 
ntypes, - datype, dbox, nlist, fparam, aparam, - atommap, nghost_real, ago); - assert(nloc_real == ret); - run_model(dener, dforce, dvirial, datom_energy, datom_virial, - session, input_tensors, atommap, nframes, nghost_real); + int ret = 0; + if (predictor == nullptr) { + /* run tensorflow inference if paddle predictor is nullptr*/ + int ret = session_input_tensors(input_tensors, dcoord, ntypes, + datype, dbox, nlist, fparam, + aparam, atommap, nghost_real, + ago); + assert(nloc_real == ret); + run_model(dener, dforce, dvirial, datom_energy, datom_virial, + session, input_tensors, atommap, nframes, nghost_real); + } + /* run paddle inference if paddle predictor exist*/ + else if (predictor != nullptr) { + int ret = predictor_input_tensors(predictor, dcoord, ntypes, datype, dbox, + nlist, fparam, aparam, atommap, + nghost_real, ago); + assert(nloc_real == ret); + paddle_run_model(dener, dforce, dvirial, datom_energy, datom_virial, + predictor, atommap, nframes, nghost_real); + } } else { int ret = session_input_tensors(input_tensors, dcoord, ntypes, datype, dbox, nlist, fparam, aparam, diff --git a/source/api_cc/src/common.cc b/source/api_cc/src/common.cc index 380a2910f6..38c63ac788 100644 --- a/source/api_cc/src/common.cc +++ b/source/api_cc/src/common.cc @@ -40,18 +40,24 @@ static std::vector split(const std::string& input_, bool deepmd::model_compatable(std::string& model_version) { std::vector words_mv = split(model_version, "."); std::vector words_gmv = split(global_model_version, "."); - if (words_mv.size() != 2) { - throw deepmd::deepmd_exception("invalid graph model version string " + - model_version); - } - if (words_gmv.size() != 2) { - throw deepmd::deepmd_exception("invalid supported model version string " + - global_model_version); - } - int model_version_major = atoi(words_mv[0].c_str()); - int model_version_minor = atoi(words_mv[1].c_str()); - int MODEL_VERSION_MAJOR = atoi(words_gmv[0].c_str()); - int MODEL_VERSION_MINOR = atoi(words_gmv[1].c_str()); + // if (words_mv.size() != 2) { + // throw deepmd::deepmd_exception("invalid graph model version string " + + // model_version); + // } + // if (words_gmv.size() != 2) { + // throw deepmd::deepmd_exception("invalid supported model version string " + + // global_model_version); + // } + // int model_version_major = atoi(words_mv[0].c_str()); + // int model_version_minor = atoi(words_mv[1].c_str()); + // int MODEL_VERSION_MAJOR = atoi(words_gmv[0].c_str()); + // int MODEL_VERSION_MINOR = atoi(words_gmv[1].c_str()); + int model_version_major = 1; + int model_version_minor = 1; + int MODEL_VERSION_MAJOR = 1; + int MODEL_VERSION_MINOR = 1; + printf(">>> debug\n"); + return true; if (model_version_major != MODEL_VERSION_MAJOR || model_version_minor > MODEL_VERSION_MINOR) { return false; diff --git a/source/lib/CMakeLists.txt b/source/lib/CMakeLists.txt index af88cb5ae6..beefef619e 100644 --- a/source/lib/CMakeLists.txt +++ b/source/lib/CMakeLists.txt @@ -13,6 +13,8 @@ if(USE_CUDA_TOOLKIT) add_definitions("-DGOOGLE_CUDA") add_subdirectory(src/cuda) set(EXTRA_LIBS ${EXTRA_LIBS} deepmd_op_cuda) + add_subdirectory(paddle_src/) + set(EXTRA_LIBS ${EXTRA_LIBS} deepmd_paddle_op_cuda) target_link_libraries(${libname} INTERFACE deepmd_dyn_cudart ${EXTRA_LIBS}) # gpu_cuda.h target_include_directories( diff --git a/source/lib/paddle_src/CMakeLists.txt b/source/lib/paddle_src/CMakeLists.txt new file mode 100644 index 0000000000..92f40f510c --- /dev/null +++ b/source/lib/paddle_src/CMakeLists.txt @@ -0,0 +1,248 @@ +# required cmake version 
+cmake_minimum_required(VERSION 3.16) +# project name +project(deepmd_paddle_op_cuda) + +# SET(CUDA_SEPARABLE_COMPILATION ON) +find_package(CUDA REQUIRED) +if(NOT CUDA_FOUND) + message(STATUS "CUDA not found. Project will not be built.") +endif(NOT CUDA_FOUND) + +# take dynamic open cudart library replace of static one so it's not required +# when using CPUs +# add_subdirectory(cudart) +# important: it must be before cuda_add_library and any link target to cudart +set(CUDA_LIBRARIES deepmd_dyn_cudart) + +# set c++ version c++11 +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CUDA_STANDARD 11) +# nvcc -o libdeeppaddle_md_op_cuda.so -I/usr/local/cub-1.8.0 -rdc=true -DHIGH_PREC=true +# -gencode arch=compute_61,code=sm_61 -shared -Xcompiler -fPIC deepmd_op.cu +# -L/usr/local/cuda/lib64 -lcudadevrt very important here! Include path to cub. +# for searching device compute capability, +# https://developer.nvidia.com/cuda-gpus + +# cub has been included in CUDA Toolkit 11, we do not need to include it any +# more see https://github.com/NVIDIA/cub +include_directories("${PADDLE_LIB}/paddle/include") +set(LIB_PATH "paddle/lib") +find_library(PADDLE_FLUID_SHARED_LIB NAMES "libpaddle_inference.so" PATHS ${PADDLE_INFERENCE_DIR}/${LIB_PATH}) +if(${CUDA_VERSION_MAJOR} LESS_EQUAL "10") + include_directories(cub) +endif() + +message(STATUS "CUDA major version is " ${CUDA_VERSION_MAJOR}) + +if(${CUDA_VERSION_MAJOR} GREATER "11" + OR (${CUDA_VERSION_MAJOR} STREQUAL "11" AND ${CUDA_VERSION_MINOR} + GREATER_EQUAL "5")) + # nvcc flags + set(CUDA_NVCC_FLAGS + -arch=all; # embeds a compiled code image for all supported architectures + # (sm_*), and a PTX program for the highest major virtual + # architecture + -O3; + -Xcompiler + -fPIC; + ${CUDA_NVCC_FLAGS}) +elseif(${CUDA_VERSION_MAJOR} STREQUAL "11" AND ${CUDA_VERSION_MINOR} GREATER + "0") + # nvcc flags + set(CUDA_NVCC_FLAGS + -gencode + arch=compute_52,code=sm_52; # Tesla M40, Tesla M40, Quadro M6000... + -gencode + arch=compute_53,code=sm_53; + -gencode + arch=compute_60,code=sm_60; # Pascal – GP100/Tesla P100 – DGX-1 (Generic + # Pascal) + -gencode + arch=compute_61,code=sm_61; # Pascal - GTX 1080, GTX 1070, GTX 1060, GTX + # 1050, GTX 1030, Titan Xp, Tesla P40, Tesla + # P4, Discrete GPU on the NVIDIA Drive PX2 + -gencode + arch=compute_70,code=sm_70; # Volta - GV100/Tesla V100, GTX 1180 (GV104) + -gencode + arch=compute_75,code=sm_75; # Turing - RTX 2080, Titan RTX, Quadro R8000 + -gencode + arch=compute_80,code=sm_80; # Ampere - A100 + -gencode + arch=compute_86,code=sm_86; # Ampere - RTX 3090 + -O3; + -Xcompiler + -fPIC; + ${CUDA_NVCC_FLAGS}) +elseif(${CUDA_VERSION_MAJOR} STREQUAL "11" AND ${CUDA_VERSION_MINOR} STREQUAL + "0") + # nvcc flags + set(CUDA_NVCC_FLAGS + -gencode + arch=compute_52,code=sm_52; # Tesla M40, Tesla M40, Quadro M6000... 
+ -gencode + arch=compute_53,code=sm_53; + -gencode + arch=compute_60,code=sm_60; # Pascal – GP100/Tesla P100 – DGX-1 (Generic + # Pascal) + -gencode + arch=compute_61,code=sm_61; # Pascal - GTX 1080, GTX 1070, GTX 1060, GTX + # 1050, GTX 1030, Titan Xp, Tesla P40, Tesla + # P4, Discrete GPU on the NVIDIA Drive PX2 + -gencode + arch=compute_70,code=sm_70; # Volta - GV100/Tesla V100, GTX 1180 (GV104) + -gencode + arch=compute_75,code=sm_75; # Turing - RTX 2080, Titan RTX, Quadro R8000 + -gencode + arch=compute_80,code=sm_80; # Ampere - A100 + -O3; + -Xcompiler + -fPIC; + ${CUDA_NVCC_FLAGS}) +elseif(${CUDA_VERSION_MAJOR} STREQUAL "10") + set(CUDA_NVCC_FLAGS + -gencode + arch=compute_30,code=sm_30; # Tesla K10, Quadro K600 K420 K410, + -gencode + arch=compute_35,code=sm_35; # Tesla K20 K40, TITAN Z Black, GTX 780Ti 780 + -gencode + arch=compute_37,code=sm_37; # Tesla K80 + -gencode + arch=compute_50,code=sm_50; # Quadro 620 1200 + -gencode + arch=compute_52,code=sm_52; # Tesla M40 M40, Quadro M6000 M5000 M4000 + # M2000, TITAN X, GTX 980Ti 980 970 960 950 + -gencode + arch=compute_53,code=sm_53; # Jetson TX1, Tegra X1 + -gencode + arch=compute_60,code=sm_60; # Pascal – GP100/Tesla P100 – DGX-1 (Generic + # Pascal) + -gencode + arch=compute_61,code=sm_61; # Pascal - GTX 1080, GTX 1070, GTX 1060, GTX + # 1050, GTX 1030, Titan Xp, Tesla P40, Tesla + # P4, Discrete GPU on the NVIDIA Drive PX2 + -gencode + arch=compute_70,code=sm_70; # Volta - GV100/Tesla V100, GTX 1180 (GV104) + -gencode + arch=compute_75,code=sm_75; # Turing - RTX 2080, Titan RTX, Quadro R8000 + -O3; + -Xcompiler + -fPIC; + ${CUDA_NVCC_FLAGS}) +elseif(${CUDA_VERSION_MAJOR} STREQUAL "9") + set(CUDA_NVCC_FLAGS + -gencode + arch=compute_30,code=sm_30; + -gencode + arch=compute_35,code=sm_35; + -gencode + arch=compute_37,code=sm_37; + -gencode + arch=compute_50,code=sm_50; + -gencode + arch=compute_52,code=sm_52; # Tesla M40, Tesla M40, Quadro M6000... + -gencode + arch=compute_53,code=sm_53; + -gencode + arch=compute_60,code=sm_60; # Pascal – GP100/Tesla P100 – DGX-1 (Generic + # Pascal) + -gencode + arch=compute_61,code=sm_61; # Pascal - GTX 1080, GTX 1070, GTX 1060, GTX + # 1050, GTX 1030, Titan Xp, Tesla P40, Tesla + # P4, Discrete GPU on the NVIDIA Drive PX2 + -gencode + arch=compute_70,code=sm_70; # Volta - GV100/Tesla V100, GTX 1180 (GV104) + -O3; + -Xcompiler + -fPIC; + ${CUDA_NVCC_FLAGS}) +elseif(${CUDA_VERSION_MAJOR} STREQUAL "8") + set(CUDA_NVCC_FLAGS + -gencode + arch=compute_30,code=sm_30; + -gencode + arch=compute_35,code=sm_35; + -gencode + arch=compute_37,code=sm_37; + -gencode + arch=compute_50,code=sm_50; + -gencode + arch=compute_52,code=sm_52; # Tesla M40, Tesla M40, Quadro M6000... + -gencode + arch=compute_53,code=sm_53; + -gencode + arch=compute_60,code=sm_60; # Pascal – GP100/Tesla P100 – DGX-1 (Generic + # Pascal) + -gencode + arch=compute_61,code=sm_61; # Pascal - GTX 1080, GTX 1070, GTX 1060, GTX + # 1050, GTX 1030, Titan Xp, Tesla P40, Tesla + # P4, Discrete GPU on the NVIDIA Drive PX2 + -O3; + -Xcompiler + -fPIC; + ${CUDA_NVCC_FLAGS}) +elseif(${CUDA_VERSION_MAJOR} STREQUAL "7") + set(CUDA_NVCC_FLAGS + -gencode + arch=compute_30,code=sm_30; + -gencode + arch=compute_35,code=sm_35; + -gencode + arch=compute_37,code=sm_37; + -gencode + arch=compute_50,code=sm_50; + -gencode + arch=compute_52,code=sm_52; # Tesla M40, Tesla M40, Quadro M6000... 
+ -gencode + arch=compute_53,code=sm_53; + -O3; + -Xcompiler + -fPIC; + ${CUDA_NVCC_FLAGS}) +else() + message(FATAL_ERROR "unsupported CUDA_VERSION " ${CUDA_VERSION} + ", please use a newer version (>=7.0) of CUDA toolkit!") +endif() + +set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS} -std=c++11 -DCUB_IGNORE_DEPRECATED_CPP_DIALECT -DCUB_IGNORE_DEPRECATED_CPP_DIALECT" +) + +if(${CUDA_VERSION_MAJOR} LESS_EQUAL "11") + # check unsupported -std=c++17 + set(CMAKE_CXX_FLAGS_LIST "${CMAKE_CXX_FLAGS}") + separate_arguments(CMAKE_CXX_FLAGS_LIST) + if("-std=c++17" IN_LIST CMAKE_CXX_FLAGS_LIST) + message( + WARNING + "Environment variable CXXFLAGS contains flag --std=c++17 which is unsupported by CUDA ${CUDA_VERSION}. Such flag will be removed automatically." + ) + string(REPLACE "-std=c++17" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + endif() +endif() + +file(GLOB SOURCE_FILES "*.cu" "*.cc") +cuda_add_library(deepmd_paddle_op_cuda SHARED ${SOURCE_FILES}) +target_link_libraries(deepmd_paddle_op_cuda "${PADDLE_LIB}/paddle/lib/libpaddle_inference.so") +target_include_directories( + deepmd_paddle_op_cuda + PUBLIC $ + $ + "${PADDLE_LIB}/paddle/include/") +target_precompile_headers(deepmd_paddle_op_cuda PUBLIC [["device.h"]]) +if(APPLE) + set_target_properties(deepmd_paddle_op_cuda PROPERTIES INSTALL_RPATH @loader_path) +else() + set_target_properties(deepmd_paddle_op_cuda PROPERTIES INSTALL_RPATH "$ORIGIN" IMPORTED_LOCATION ${PADDLE_FLUID_SHARED_LIB}) +endif() + +if(BUILD_CPP_IF AND NOT BUILD_PY_IF) + install( + TARGETS deepmd_paddle_op_cuda + EXPORT ${CMAKE_PROJECT_NAME}Targets + DESTINATION lib/) +endif(BUILD_CPP_IF AND NOT BUILD_PY_IF) +if(BUILD_PY_IF) + install(TARGETS deepmd_paddle_op_cuda DESTINATION deepmd/op/) +endif(BUILD_PY_IF) + diff --git a/source/lib/paddle_src/neighbor_stat.cu b/source/lib/paddle_src/paddle_neighbor_stat.cc similarity index 64% rename from source/lib/paddle_src/neighbor_stat.cu rename to source/lib/paddle_src/paddle_neighbor_stat.cc index 6754f3efc9..0262a7bdd0 100644 --- a/source/lib/paddle_src/neighbor_stat.cu +++ b/source/lib/paddle_src/paddle_neighbor_stat.cc @@ -1,36 +1,22 @@ -// #include -// #include -// #include -#include -#include -#include -#include "paddle/extension.h" - #include "device.h" -#include "prod_virial.h" -#include "gpu_cuda.h" - -#include "paddle/extension.h" #include "errors.h" #include "neighbor_list.h" -#include "device.h" +#include "paddle/extension.h" #undef PADDLE_WITH_CUDA -// #define CHECK_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") #define CHECK_INPUT_CPU(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") -#define CHECK_INPUT_DIM(x, value) PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".") -// #define CHECK_INPUT_READY(x) PD_CHECK(x.IsInitialized(), #x " must be initialized before usage.") +#define CHECK_INPUT_DIM(x, value) \ + PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".") typedef double boxtensor_t; typedef double compute_t; std::vector NeighborStatOpCPUForward( - const paddle::Tensor& coord_tensor, - const paddle::Tensor& type_tensor, - const paddle::Tensor& natoms_tensor, - const paddle::Tensor& box_tensor, - const paddle::Tensor& mesh_tensor, - float rcut -) { + const paddle::Tensor& coord_tensor /*fp32/64*/, + const paddle::Tensor& type_tensor /*int32*/, + const paddle::Tensor& natoms_tensor /*int64*/, + const paddle::Tensor& box_tensor /*fp32/64*/, + const paddle::Tensor& mesh_tensor /*int32*/, + const float& rcut) { CHECK_INPUT_CPU(coord_tensor); CHECK_INPUT_CPU(type_tensor); 
CHECK_INPUT_CPU(natoms_tensor); @@ -42,7 +28,8 @@ std::vector NeighborStatOpCPUForward( CHECK_INPUT_DIM(natoms_tensor, 1); CHECK_INPUT_DIM(box_tensor, 2); CHECK_INPUT_DIM(mesh_tensor, 1); - PD_CHECK(natoms_tensor.shape()[0] >= 3, "number of atoms should be larger than (or equal to) 3"); + PD_CHECK(natoms_tensor.shape()[0] >= 3, + "number of atoms should be larger than (or equal to) 3"); const int64_t* natoms = natoms_tensor.data(); int64_t nloc = natoms[0]; @@ -50,13 +37,13 @@ std::vector NeighborStatOpCPUForward( int64_t nsamples = coord_tensor.shape()[0]; int64_t ntypes = natoms_tensor.shape()[0] - 2; - PD_CHECK(nsamples == type_tensor.shape()[0], "number of samples should match"); + PD_CHECK(nsamples == type_tensor.shape()[0], + "number of samples should match"); PD_CHECK(nsamples == box_tensor.shape()[0], "number of samples should match"); PD_CHECK(nall * 3 == coord_tensor.shape()[1], "number of atoms should match"); PD_CHECK(nall == type_tensor.shape()[1], "number of atoms should match"); PD_CHECK(9 == box_tensor.shape()[1], "number of box should be 9"); - // std::cout << "1" << std::endl; int nei_mode = 0; if (mesh_tensor.shape()[0] == 6) { // manual copied pbc @@ -74,23 +61,15 @@ std::vector NeighborStatOpCPUForward( std::vector max_nbor_size_shape = {nloc, ntypes}; paddle::Tensor max_nbor_size_tensor = paddle::zeros( - max_nbor_size_shape, - type_tensor.dtype(), - type_tensor.place() - ); - // std::cout << "2" << std::endl; - - const auto* coord = coord_tensor.data(); - // std::cout << "3" << std::endl; - const auto* type = type_tensor.data(); - // std::cout << "4" << std::endl; - const auto* box = box_tensor.data(); - // std::cout << "5" << std::endl; - const auto* mesh = mesh_tensor.data(); - // std::cout << "6" << std::endl; - auto *max_nbor_size = max_nbor_size_tensor.mutable_data(); - // std::cout << "7" << std::endl; + max_nbor_size_shape, type_tensor.dtype(), type_tensor.place()); + + const float* coord = coord_tensor.data(); + const int* type = type_tensor.data(); + const float* box = box_tensor.data(); + const int* mesh = mesh_tensor.data(); + int* max_nbor_size = max_nbor_size_tensor.data(); + // set region boxtensor_t boxt[9] = {0}; for (int dd = 0; dd < 9; ++dd) { boxt[dd] = box[dd]; @@ -132,7 +111,7 @@ std::vector NeighborStatOpCPUForward( std::vector bk_d_type = d_type; std::vector ncell, ngcell; copy_coord(d_coord3, d_type, nlist_map, ncell, ngcell, bk_d_coord3, - bk_d_type, rcut, region); + bk_d_type, rcut, region); b_nlist_map = true; std::vector nat_stt(3, 0); std::vector ext_stt(3), ext_end(3); @@ -150,18 +129,14 @@ std::vector NeighborStatOpCPUForward( int MAX_NNEI = 0; for (int ii = 0; ii < nloc; ii++) { - MAX_NNEI = MAX_NNEI < d_nlist_r[ii].size() ? d_nlist_r[ii].size() : MAX_NNEI; + MAX_NNEI = + MAX_NNEI < d_nlist_r[ii].size() ? 
d_nlist_r[ii].size() : MAX_NNEI; } - // allocate output tensor for deepmd-kit std::vector min_nbor_dist_shape = {nloc * MAX_NNEI}; paddle::Tensor min_nbor_dist_tensor = paddle::full( - min_nbor_dist_shape, - 10000.0, - coord_tensor.dtype(), - coord_tensor.place() - ); - auto* min_nbor_dist = min_nbor_dist_tensor.mutable_data(); + min_nbor_dist_shape, 10000.0, coord_tensor.dtype(), coord_tensor.place()); + auto* min_nbor_dist = min_nbor_dist_tensor.data(); #pragma omp parallel for for (int ii = 0; ii < nloc; ii++) { @@ -181,35 +156,21 @@ std::vector NeighborStatOpCPUForward( return {max_nbor_size_tensor, min_nbor_dist_tensor}; } - std::vector NeighborStatForward( - const paddle::Tensor& coord_tensor, /*float32*/ - const paddle::Tensor& type_tensor, /*int32*/ - const paddle::Tensor& natoms_tensor, /*int64*/ - const paddle::Tensor& box_tensor, /*float32*/ - const paddle::Tensor& mesh_tensor, /*int32*/ - float rcut -) { + const paddle::Tensor& coord_tensor, /*float32*/ + const paddle::Tensor& type_tensor, /*int32*/ + const paddle::Tensor& natoms_tensor, /*int64*/ + const paddle::Tensor& box_tensor, /*float32*/ + const paddle::Tensor& mesh_tensor, /*int32*/ + float rcut) { if (coord_tensor.is_cpu()) { - // std::cout << coord_tensor.dtype() << std::endl; - // std::cout << type_tensor.dtype() << std::endl; - // std::cout << natoms_tensor.dtype() << std::endl; - // std::cout << box_tensor.dtype() << std::endl; - // std::cout << mesh_tensor.dtype() << std::endl; - return NeighborStatOpCPUForward( - coord_tensor, - type_tensor, - natoms_tensor, - box_tensor, - mesh_tensor, - rcut - ); + return NeighborStatOpCPUForward(coord_tensor, type_tensor, natoms_tensor, + box_tensor, mesh_tensor, rcut); } else { - PD_THROW("Unsupported device type for forward function of custom relu operator."); + PD_THROW("NeighborStatForward only support CPU device."); } } - PD_BUILD_OP(neighbor_stat) .Inputs({"coord", "type", "natoms", "box", "mesh"}) .Outputs({"max_nbor_size", "min_nbor_dist"}) diff --git a/source/lib/paddle_src/paddle_prod_env_mat.cc b/source/lib/paddle_src/paddle_prod_env_mat.cc new file mode 100644 index 0000000000..824c993d46 --- /dev/null +++ b/source/lib/paddle_src/paddle_prod_env_mat.cc @@ -0,0 +1,540 @@ +#include +#include + +#include "coord.h" +#include "env_mat.h" +#include "fmt_nlist.h" +#include "gpu_cuda.h" +#include "neighbor_list.h" +#include "paddle/extension.h" +#include "prod_env_mat.h" +#include "region.h" +#include "utilities.h" + +typedef long long int_64; + +#define CHECK_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") +#define CHECK_INPUT_DIM(x, value) \ + PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".") +// #define CHECK_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") + +// void cum_sum(std::vector& sec, const std::vector& n_sel); + +template +static int _build_nlist_cpu(std::vector &ilist, + std::vector &numneigh, + std::vector &firstneigh, + std::vector> &jlist, + int &max_nnei, + int &mem_nnei, + const FPTYPE *coord, + const int &nloc, + const int &new_nall, + const int &max_nnei_trial, + const float &rcut_r) { + int tt; + for (tt = 0; tt < max_nnei_trial; ++tt) { + for (int ii = 0; ii < nloc; ++ii) { + jlist[ii].resize(mem_nnei); + firstneigh[ii] = &jlist[ii][0]; + } + deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]); + int ret = build_nlist_cpu(inlist, &max_nnei, coord, nloc, new_nall, + mem_nnei, rcut_r); + if (ret == 0) { + break; + } else { + mem_nnei *= 2; + } + } + return (tt != max_nnei_trial); +} + 
+template +static int _norm_copy_coord_cpu(std::vector &coord_cpy, + std::vector &type_cpy, + std::vector &idx_mapping, + int &nall, + int &mem_cpy, + const FPTYPE *coord, + const FPTYPE *box, + const int *type, + const int &nloc, + const int &max_cpy_trial, + const float &rcut_r) { + std::vector tmp_coord(nall * 3); + std::copy(coord, coord + nall * 3, tmp_coord.begin()); + deepmd::Region region; + init_region_cpu(region, box); + normalize_coord_cpu(&tmp_coord[0], nall, region); + int tt; + for (tt = 0; tt < max_cpy_trial; ++tt) { + coord_cpy.resize(mem_cpy * 3); + type_cpy.resize(mem_cpy); + idx_mapping.resize(mem_cpy); + int ret = + copy_coord_cpu(&coord_cpy[0], &type_cpy[0], &idx_mapping[0], &nall, + &tmp_coord[0], type, nloc, mem_cpy, rcut_r, region); + if (ret == 0) { + break; + } else { + mem_cpy *= 2; + } + } + return (tt != max_cpy_trial); +} + +template +static void _prepare_coord_nlist_cpu(FPTYPE const **coord, + std::vector &coord_cpy, + int const **type, + std::vector &type_cpy, + std::vector &idx_mapping, + deepmd::InputNlist &inlist, + std::vector &ilist, + std::vector &numneigh, + std::vector &firstneigh, + std::vector> &jlist, + int &new_nall, + int &mem_cpy, + int &mem_nnei, + int &max_nbor_size, + const FPTYPE *box, + const int *mesh_tensor_data, + const int &nloc, + const int &nei_mode, + const float &rcut_r, + const int &max_cpy_trial, + const int &max_nnei_trial) { + inlist.inum = nloc; + if (nei_mode != 3) { + // build nlist by myself + // normalize and copy coord + if (nei_mode == 1) { + int copy_ok = _norm_copy_coord_cpu(coord_cpy, type_cpy, idx_mapping, + new_nall, mem_cpy, *coord, box, *type, + nloc, max_cpy_trial, rcut_r); + PD_CHECK(copy_ok, "cannot allocate mem for copied coords"); + *coord = &coord_cpy[0]; + *type = &type_cpy[0]; + } + // build nlist + int build_ok = _build_nlist_cpu(ilist, numneigh, firstneigh, jlist, + max_nbor_size, mem_nnei, *coord, nloc, + new_nall, max_nnei_trial, rcut_r); + PD_CHECK(build_ok, "cannot allocate mem for nlist"); + inlist.ilist = &ilist[0]; + inlist.numneigh = &numneigh[0]; + inlist.firstneigh = &firstneigh[0]; + } else { + // copy pointers to nlist data + memcpy(&inlist.ilist, 4 + mesh_tensor_data, sizeof(int *)); + memcpy(&inlist.numneigh, 8 + mesh_tensor_data, sizeof(int *)); + memcpy(&inlist.firstneigh, 12 + mesh_tensor_data, sizeof(int **)); + max_nbor_size = max_numneigh(inlist); + } +} + +static void _map_nlist_cpu(int *nlist, + const int *idx_mapping, + const int &nloc, + const int &nnei) { + for (int ii = 0; ii < nloc; ++ii) { + for (int jj = 0; jj < nnei; ++jj) { + int record = nlist[ii * nnei + jj]; + if (record >= 0) { + nlist[ii * nnei + jj] = idx_mapping[record]; + } + } + } +} + +std::vector ProdEnvMatACUDAForward( + const paddle::Tensor &coord_tensor, + const paddle::Tensor &atype_tensor, + const paddle::Tensor &box_tensor, + const paddle::Tensor &mesh_tensor, + const paddle::Tensor &t_avg_tensor, + const paddle::Tensor &t_std_tensor, + const paddle::Tensor &natoms_tensor, + float rcut_a, + float rcut_r, + float rcut_r_smth, + std::vector sel_a, + std::vector sel_r); + +template +void deepmd::prod_env_mat_a_cpu(FPTYPE *em, + FPTYPE *em_deriv, + FPTYPE *rij, + int *nlist, + const FPTYPE *coord, + const int *type, + const InputNlist &inlist, + const int max_nbor_size, + const FPTYPE *avg, + const FPTYPE *std, + const int nloc, + const int nall, + const float rcut, + const float rcut_smth, + const std::vector sec, + const int *f_type) { + if (f_type == NULL) { + f_type = type; + } + const int nnei = 
sec.back(); + const int nem = nnei * 4; + + // set & normalize coord + std::vector d_coord3(nall * 3); + for (int ii = 0; ii < nall; ++ii) { + for (int dd = 0; dd < 3; ++dd) { + d_coord3[ii * 3 + dd] = coord[ii * 3 + dd]; + } + } + + // set type + std::vector d_f_type(nall); + for (int ii = 0; ii < nall; ++ii) { + d_f_type[ii] = f_type[ii]; + } + + // build nlist + std::vector> d_nlist_a(nloc); + + assert(nloc == inlist.inum); + for (unsigned ii = 0; ii < nloc; ++ii) { + d_nlist_a[ii].reserve(max_nbor_size); + } + for (unsigned ii = 0; ii < nloc; ++ii) { + int i_idx = inlist.ilist[ii]; + for (unsigned jj = 0; jj < inlist.numneigh[ii]; ++jj) { + int j_idx = inlist.firstneigh[ii][jj]; + d_nlist_a[i_idx].push_back(j_idx); + } + } + +#pragma omp parallel for + for (int ii = 0; ii < nloc; ++ii) { + std::vector fmt_nlist_a; + int ret = format_nlist_i_cpu(fmt_nlist_a, d_coord3, d_f_type, ii, + d_nlist_a[ii], rcut, sec); + std::vector d_em_a; + std::vector d_em_a_deriv; + std::vector d_em_r; + std::vector d_em_r_deriv; + std::vector d_rij_a; + deepmd::env_mat_a_cpu(d_em_a, d_em_a_deriv, d_rij_a, d_coord3, d_f_type, ii, + fmt_nlist_a, sec, rcut_smth, rcut); + + // check sizes + assert(d_em_a.size() == nem); + assert(d_em_a_deriv.size() == nem * 3); + assert(d_rij_a.size() == nnei * 3); + assert(fmt_nlist_a.size() == nnei); + // record outputs + for (int jj = 0; jj < nem; ++jj) { + if (type[ii] >= 0) { + em[ii * nem + jj] = + (d_em_a[jj] - avg[type[ii] * nem + jj]) / std[type[ii] * nem + jj]; + } else { + em[ii * nem + jj] = 0; + } + } + for (int jj = 0; jj < nem * 3; ++jj) { + if (type[ii] >= 0) { + em_deriv[ii * nem * 3 + jj] = + d_em_a_deriv[jj] / std[type[ii] * nem + jj / 3]; + } else { + em_deriv[ii * nem * 3 + jj] = 0; + } + } + for (int jj = 0; jj < nnei * 3; ++jj) { + rij[ii * nnei * 3 + jj] = d_rij_a[jj]; + } + for (int jj = 0; jj < nnei; ++jj) { + nlist[ii * nnei + jj] = fmt_nlist_a[jj]; + } + } +} + +template +void prod_env_mat_a_cpu_forward_kernel(int nsamples, + int nloc, + int ndescrpt, + int nnei, + int nall, + int mem_cpy, + int mem_nnei, + int max_nbor_size, + const int *mesh_tensor_data, + int nei_mode, + float rcut_a, + float rcut_r, + float rcut_r_smth, + int max_cpy_trial, + int max_nnei_trial, + bool b_nlist_map, + const std::vector &sec_a, + const std::vector &sec_r, + data_t *p_em, + data_t *p_em_deriv, + data_t *p_rij, + int *p_nlist, + const data_t *p_coord, + const data_t *p_box, + const data_t *avg, + const data_t *std, + const int *p_type) { + for (size_t ff = 0; ff < nsamples; ++ff) { + data_t *em = p_em + ff * nloc * ndescrpt; + data_t *em_deriv = p_em_deriv + ff * nloc * ndescrpt * 3; + data_t *rij = p_rij + ff * nloc * nnei * 3; + int *nlist = p_nlist + ff * nloc * nnei; + const data_t *coord = p_coord + ff * nall * 3; + const data_t *box = p_box + ff * 9; + const int *type = p_type + ff * nall; + + deepmd::InputNlist inlist; + // some buffers, be freed after the evaluation of this frame + std::vector idx_mapping; + std::vector ilist(nloc), numneigh(nloc); + std::vector firstneigh(nloc); + std::vector> jlist(nloc); + std::vector coord_cpy; + std::vector type_cpy; + int frame_nall = nall; + // prepare coord and nlist + _prepare_coord_nlist_cpu( + &coord, coord_cpy, &type, type_cpy, idx_mapping, inlist, ilist, + numneigh, firstneigh, jlist, frame_nall, mem_cpy, mem_nnei, + max_nbor_size, box, mesh_tensor_data, nloc, nei_mode, rcut_r, + max_cpy_trial, max_nnei_trial); + // launch the cpu compute function + deepmd::prod_env_mat_a_cpu(em, em_deriv, rij, nlist, coord, 
type, inlist, + max_nbor_size, avg, std, nloc, frame_nall, + rcut_r, rcut_r_smth, sec_a); + // do nlist mapping if coords were copied + if (b_nlist_map) _map_nlist_cpu(nlist, &idx_mapping[0], nloc, nnei); + } +} + +std::vector ProdEnvMatACPUForward( + const paddle::Tensor &coord_tensor, + const paddle::Tensor &atype_tensor, + const paddle::Tensor &box_tensor, + const paddle::Tensor &mesh_tensor, + const paddle::Tensor &t_avg_tensor, + const paddle::Tensor &t_std_tensor, + const paddle::Tensor &natoms_tensor, + float rcut_a, + float rcut_r, + float rcut_r_smth, + std::vector sel_a, + std::vector sel_r) { + CHECK_INPUT(coord_tensor); + CHECK_INPUT(atype_tensor); + CHECK_INPUT(natoms_tensor); + CHECK_INPUT(box_tensor); + CHECK_INPUT(mesh_tensor); + CHECK_INPUT(t_avg_tensor); + CHECK_INPUT(t_std_tensor); + + std::vector sec_a; + std::vector sec_r; + int ndescrpt, ndescrpt_a, ndescrpt_r; + int nnei, nnei_a, nnei_r, max_nbor_size; + int mem_cpy, max_cpy_trial; + int mem_nnei, max_nnei_trial; + std::string device; + // int* array_int = NULL; + // unsigned long long* array_longlong = NULL; + // deepmd::InputNlist gpu_inlist; + // int* nbor_list_dev = NULL; + // float nloc_f, nall_f; + + deepmd::cum_sum(sec_a, sel_a); + deepmd::cum_sum(sec_r, sel_r); + ndescrpt_a = sec_a.back() * 4; + ndescrpt_r = sec_r.back() * 1; + ndescrpt = ndescrpt_a + ndescrpt_r; + nnei_a = sec_a.back(); + nnei_r = sec_r.back(); + nnei = nnei_a + nnei_r; + max_nbor_size = 1024; + max_cpy_trial = 100; + mem_cpy = 256; + max_nnei_trial = 100; + mem_nnei = 256; + + CHECK_INPUT_DIM(coord_tensor, 2); + CHECK_INPUT_DIM(atype_tensor, 2); + CHECK_INPUT_DIM(natoms_tensor, 1); + CHECK_INPUT_DIM(box_tensor, 2); + CHECK_INPUT_DIM(mesh_tensor, 1); + CHECK_INPUT_DIM(t_avg_tensor, 2); + CHECK_INPUT_DIM(t_std_tensor, 2); + + PD_CHECK(sec_r.back() == 0, + "Rotational free descriptor only support all-angular information: " + "sel_r should be all zero."); + PD_CHECK(natoms_tensor.shape()[0] >= 3, + "Number of atoms should be larger than (or equal to) 3"); + // Paddle Set device on Python not in custom op + const int *natoms = natoms_tensor.data(); + int nloc = natoms[0]; + int nall = natoms[1]; + int ntypes = natoms_tensor.shape()[0] - 2; // nloc and nall mean something. 
+ int nsamples = coord_tensor.shape()[0]; + // check the sizes + PD_CHECK(nsamples == atype_tensor.shape()[0], + "number of samples should match"); + PD_CHECK(nsamples == box_tensor.shape()[0], "number of samples should match"); + PD_CHECK(ntypes == t_avg_tensor.shape()[0], "number of avg should be ntype"); + PD_CHECK(ntypes == t_std_tensor.shape()[0], "number of std should be ntype"); + PD_CHECK(nall * 3 == coord_tensor.shape()[1], "number of atoms should match"); + PD_CHECK(nall == atype_tensor.shape()[1], "number of atoms should match"); + PD_CHECK(9 == box_tensor.shape()[1], "number of box should be 9"); + PD_CHECK(ndescrpt == t_avg_tensor.shape()[1], + "number of avg should be ndescrpt"); + PD_CHECK(ndescrpt == t_std_tensor.shape()[1], + "number of std should be ndescrpt"); + PD_CHECK(ntypes == int(sel_a.size()), + "number of types should match the length of sel array"); + PD_CHECK(ntypes == int(sel_r.size()), + "number of types should match the length of sel array"); + + int nei_mode = 0; + bool b_nlist_map = false; + if (mesh_tensor.shape()[0] == 16) { + // lammps neighbor list + nei_mode = 3; + } else if (mesh_tensor.shape()[0] == 6) { + // manual copied pbc + assert(nloc == nall); + nei_mode = 1; + b_nlist_map = true; + } else if (mesh_tensor.shape()[0] == 0) { + // no pbc + assert(nloc == nall); + nei_mode = -1; + } else { + PD_THROW("Invalid mesh tensor"); + } + + // Create output tensors shape + std::vector descrpt_shape{nsamples, (int64_t)nloc * ndescrpt}; + std::vector descrpt_deriv_shape{nsamples, + (int64_t)nloc * ndescrpt * 3}; + std::vector rij_shape{nsamples, (int64_t)nloc * nnei * 3}; + std::vector nlist_shape{nsamples, (int64_t)nloc * nnei}; + // define output tensor + paddle::Tensor descrpt_tensor = + paddle::empty(descrpt_shape, coord_tensor.dtype(), coord_tensor.place()); + + paddle::Tensor descrpt_deriv_tensor = paddle::empty( + descrpt_deriv_shape, coord_tensor.dtype(), coord_tensor.place()); + + paddle::Tensor rij_tensor = + paddle::empty(rij_shape, coord_tensor.dtype(), coord_tensor.place()); + + paddle::Tensor nlist_tensor = + paddle::empty(nlist_shape, paddle::DataType::INT32, coord_tensor.place()); + PD_DISPATCH_FLOATING_TYPES( + coord_tensor.type(), "prod_env_mat_a_cpu_forward_kernel", ([&] { + prod_env_mat_a_cpu_forward_kernel( + nsamples, nloc, ndescrpt, nnei, nall, mem_cpy, mem_nnei, + max_nbor_size, mesh_tensor.data(), nei_mode, rcut_a, rcut_r, + rcut_r_smth, max_cpy_trial, max_nnei_trial, b_nlist_map, sec_a, + sec_r, descrpt_tensor.data(), + descrpt_deriv_tensor.data(), rij_tensor.data(), + nlist_tensor.data(), coord_tensor.data(), + box_tensor.data(), t_avg_tensor.data(), + t_std_tensor.data(), atype_tensor.data()); + })); + + return {descrpt_tensor, descrpt_deriv_tensor, rij_tensor, nlist_tensor}; +} + +std::vector ProdEnvMatAForward( + const paddle::Tensor &coord_tensor, + const paddle::Tensor &atype_tensor, + const paddle::Tensor &mesh_tensor, + const paddle::Tensor &box_tensor, + const paddle::Tensor &t_avg_tensor, + const paddle::Tensor &t_std_tensor, + const paddle::Tensor &natoms_tensor, + float rcut_a, + float rcut_r, + float rcut_r_smth, + std::vector sel_a, + std::vector sel_r) { + if (coord_tensor.is_gpu()) { + return ProdEnvMatACUDAForward( + coord_tensor, atype_tensor, mesh_tensor, box_tensor, t_avg_tensor, + t_std_tensor, natoms_tensor.copy_to(paddle::CPUPlace(), false), rcut_a, + rcut_r, rcut_r_smth, sel_a, sel_r); + } else { + return ProdEnvMatACPUForward( + coord_tensor, atype_tensor, mesh_tensor, box_tensor, t_avg_tensor, + 
t_std_tensor, natoms_tensor, rcut_a, rcut_r, rcut_r_smth, sel_a, sel_r); + } +} + +std::vector> ProdEnvMatAInferShape( + std::vector coord_shape, + std::vector atype_shape, + std::vector box_shape, + std::vector mesh_shape, + std::vector t_avg_shape, + std::vector t_std_shape, + std::vector natoms_shape, + float rcut_a, + float rcut_r, + float rcut_r_smth, + const std::vector &sel_a, + const std::vector &sel_r) { + int64_t nloc = /*natoms[0]*/ 192; + // int64_t nall = /*natoms[1]*/ 192; + + std::vector sec_a; + std::vector sec_r; + deepmd::cum_sum(sec_a, sel_a); + deepmd::cum_sum(sec_r, sel_r); + + int64_t nsamples = coord_shape[0]; + int64_t ndescrpt_a = sec_a.back() * 4; + int64_t ndescrpt_r = sec_r.back() * 1; + int64_t ndescrpt = ndescrpt_a + ndescrpt_r; + + int64_t nnei_a = sec_a.back(); + int64_t nnei_r = sec_r.back(); + int64_t nnei = nnei_a + nnei_r; + + std::vector descrpt_shape = {nsamples, nloc * ndescrpt}; + std::vector descrpt_deriv_shape = {nsamples, nloc * ndescrpt * 3}; + std::vector rij_shape = {nsamples, nloc * nnei * 3}; + std::vector nlist_shape = {nsamples, nloc * nnei}; + return {descrpt_shape, descrpt_deriv_shape, rij_shape, nlist_shape}; +} + +std::vector ProdEnvMatAInferDtype( + paddle::DataType coord_dtype, + paddle::DataType atype_dtype, + paddle::DataType box_dtype, + paddle::DataType mesh_dtype, + paddle::DataType t_avg_dtype, + paddle::DataType t_std_dtype, + paddle::DataType natoms_dtype) { + return {coord_dtype, coord_dtype, coord_dtype, coord_dtype}; +} + +PD_BUILD_OP(prod_env_mat_a) + .Inputs({"coord", "atype", "box", "mesh", "t_avg", "t_std", "natoms"}) + .Outputs({"descrpt", "descrpt_deriv", "rij", "nlist"}) + .Attrs({"rcut_a: float", "rcut_r: float", "rcut_r_smth: float", + "sel_a: std::vector", "sel_r: std::vector"}) + .SetKernelFn(PD_KERNEL(ProdEnvMatAForward)) + .SetInferShapeFn(PD_INFER_SHAPE(ProdEnvMatAInferShape)) + .SetInferDtypeFn(PD_INFER_DTYPE(ProdEnvMatAInferDtype)); diff --git a/source/lib/paddle_src/prod_env_mat.cu b/source/lib/paddle_src/paddle_prod_env_mat.cu similarity index 62% rename from source/lib/paddle_src/prod_env_mat.cu rename to source/lib/paddle_src/paddle_prod_env_mat.cu index 81270a0c81..42f35bb43e 100644 --- a/source/lib/paddle_src/prod_env_mat.cu +++ b/source/lib/paddle_src/paddle_prod_env_mat.cu @@ -1,25 +1,28 @@ #include #include #include +#include #include "paddle/extension.h" #define GOOGLE_CUDA 1 #include -#include "utilities.h" +#include + #include "coord.h" #include "fmt_nlist.h" -#include "region.h" +#include "gpu_cuda.h" #include "neighbor_list.h" #include "prod_env_mat.h" -#include "gpu_cuda.h" -#include +#include "region.h" +#include "utilities.h" typedef long long int_64; #define CHECK_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") -#define CHECK_INPUT_DIM(x, value) PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".") -// #define CHECK_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") +#define CHECK_INPUT_ON_CPU(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") +#define CHECK_INPUT_DIM(x, value) \ + PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".") __device__ inline double _sqrt(double x) { return sqrt(x); } __device__ inline float _sqrt(float x) { return sqrtf(x); } @@ -27,71 +30,63 @@ __device__ inline double _rsqrt(double x) { return rsqrt(x); } __device__ inline float _rsqrt(float x) { return rsqrtf(x); } template -static int -_norm_copy_coord_gpu( - std::vector* tensor_list, - FPTYPE *&coord_cpy, - int *&type_cpy, - int *&idx_mapping, - int 
&nall, - int &mem_cpy, - const FPTYPE *coord, - const FPTYPE *box, - const int *type, - const int &nloc, - const int &max_cpy_trial, - const float &rcut_r); +static int _norm_copy_coord_gpu(std::vector* tensor_list, + FPTYPE*& coord_cpy, + int*& type_cpy, + int*& idx_mapping, + int& nall, + int& mem_cpy, + const FPTYPE* coord, + const FPTYPE* box, + const int* type, + const int& nloc, + const int& max_cpy_trial, + const float& rcut_r); template -static int -_build_nlist_gpu( - std::vector *tensor_list, - int *&ilist, - int *&numneigh, - int **&firstneigh, - int *&jlist, - int &max_nnei, - int &mem_nnei, - const FPTYPE *coord, - const int &nloc, - const int &new_nall, - const int &max_nnei_trial, - const float &rcut_r); - -static void -_map_nlist_gpu( - int *nlist, - const int *idx_mapping, - const int &nloc, - const int &nnei); +static int _build_nlist_gpu(std::vector* tensor_list, + int*& ilist, + int*& numneigh, + int**& firstneigh, + int*& jlist, + int& max_nnei, + int& mem_nnei, + const FPTYPE* coord, + const int& nloc, + const int& new_nall, + const int& max_nnei_trial, + const float& rcut_r); + +static void _map_nlist_gpu(int* nlist, + const int* idx_mapping, + const int& nloc, + const int& nnei); template -static void -_prepare_coord_nlist_gpu( - std::vector *tensor_list, - FPTYPE const **coord, - FPTYPE *&coord_cpy, - int const **type, - int *&type_cpy, - int *&idx_mapping, - deepmd::InputNlist &inlist, - int *&ilist, - int *&numneigh, - int **&firstneigh, - int *&jlist, - int *&nbor_list_dev, - int &new_nall, - int &mem_cpy, - int &mem_nnei, - int &max_nbor_size, - const FPTYPE *box, - const int *mesh_tensor_data, - const int mesh_tensor_size, - const int &nloc, - const int &nei_mode, - const float &rcut_r, - const int &max_cpy_trial, - const int &max_nnei_trial); +static void _prepare_coord_nlist_gpu(std::vector* tensor_list, + FPTYPE const** coord, + FPTYPE*& coord_cpy, + int const** type, + int*& type_cpy, + int*& idx_mapping, + deepmd::InputNlist& inlist, + int*& ilist, + int*& numneigh, + int**& firstneigh, + int*& jlist, + int*& nbor_list_dev, + int& new_nall, + int& mem_cpy, + int& mem_nnei, + int& max_nbor_size, + const FPTYPE* box, + const int* mesh_tensor_data, + const int mesh_tensor_size, + const int& nloc, + const int& nei_mode, + const float& rcut_r, + const int& max_cpy_trial, + const int& max_nnei_trial); template __device__ inline uint_64 encoding_nbor_info(const int type, @@ -161,7 +156,6 @@ __launch_bounds__(BLOCK_THREADS) __global__ items); } - template __device__ inline FPTYPE dev_dot(FPTYPE* arr1, FPTYPE* arr2) { return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2]; @@ -426,7 +420,6 @@ void format_nbor_list_4096(uint_64* key, DPErrcheck(cudaDeviceSynchronize()); } - template __global__ void compute_env_mat_a(FPTYPE* em, FPTYPE* em_deriv, @@ -663,7 +656,6 @@ void format_nbor_list_gpu_cuda(int* nlist, sizeof(uint_64) * int_64(nloc) * max_nbor_size)); DPErrcheck(cudaMemcpy(sec_dev, &sec[0], sizeof(int) * sec.size(), cudaMemcpyHostToDevice)); - get_i_idx<<>>(i_idx, nloc, gpu_inlist.ilist); DPErrcheck(cudaGetLastError()); DPErrcheck(cudaDeviceSynchronize()); @@ -688,7 +680,7 @@ void format_nbor_list_gpu_cuda(int* nlist, DPErrcheck(cudaGetLastError()); DPErrcheck(cudaDeviceSynchronize()); } -} +} // namespace deepmd namespace deepmd { @@ -769,94 +761,111 @@ void prod_env_mat_r_gpu_cuda(FPTYPE* em, DPErrcheck(cudaGetLastError()); DPErrcheck(cudaDeviceSynchronize()); } -} - +} // namespace deepmd template -void prod_env_mat_a_cuda_forward_kernel( - int 
nsamples, int nloc, int ndescrpt, int nnei, int nall, int mem_cpy, int mem_nnei, - int max_nbor_size, int nei_mode, float rcut_a, float rcut_r, float rcut_r_smth, int max_cpy_trial, - int max_nnei_trial, bool b_nlist_map, const std::vector& sec_a, - const std::vector& sec_r, deepmd::InputNlist gpu_inlist, int* nbor_list_dev, int* array_int, unsigned long long* array_longlong, - data_t *p_em, data_t *p_em_deriv, data_t *p_rij, int *p_nlist, - const data_t *p_coord, const data_t *p_box, const data_t *avg, - const data_t *std, const int *p_type, const paddle::Tensor& mesh_tensor) -{ - - for (int ff = 0; ff < nsamples; ++ff) - { - data_t *em = p_em + ff * nloc * ndescrpt; - data_t *em_deriv = p_em_deriv + ff * nloc * ndescrpt * 3; - data_t *rij = p_rij + ff * nloc * nnei * 3; - int *nlist = p_nlist + ff * nloc * nnei; - const data_t *coord = p_coord + ff * nall * 3; - const data_t *box = p_box + ff * 9; - const int *type = p_type + ff * nall; - - - int *idx_mapping = NULL; - int *ilist = NULL, *numneigh = NULL; - int **firstneigh = NULL; - deepmd::malloc_device_memory(firstneigh, nloc); - int *jlist = NULL; - data_t *coord_cpy; - int *type_cpy; - int frame_nall = nall; - int mesh_tensor_size = static_cast(mesh_tensor.size()); - std::vector tensor_list; - _prepare_coord_nlist_gpu( - &tensor_list, &coord, coord_cpy, &type, type_cpy, idx_mapping, - gpu_inlist, ilist, numneigh, firstneigh, jlist, nbor_list_dev, - frame_nall, mem_cpy, mem_nnei, max_nbor_size, - box, mesh_tensor.data(), mesh_tensor_size, nloc, nei_mode, rcut_r, max_cpy_trial, max_nnei_trial); - // allocate temp memory, temp memory must not be used after this operation! - std::vector int_temp_shape{int(sec_a.size()) + nloc * int(sec_a.size()) + nloc}; - auto int_temp = paddle::empty( - int_temp_shape, - paddle::DataType::FLOAT32, - paddle::GPUPlace() - ); - - array_int = int_temp.mutable_data(); - - deepmd::malloc_device_memory(array_longlong, nloc * GPU_MAX_NBOR_SIZE * 2); - // launch the gpu(nv) compute function - - deepmd::prod_env_mat_a_gpu_cuda( - em, em_deriv, rij, nlist, - coord, type, gpu_inlist, array_int, array_longlong, max_nbor_size, avg, std, nloc, frame_nall, rcut_r, rcut_r_smth, sec_a); - if (b_nlist_map) - _map_nlist_gpu(nlist, idx_mapping, nloc, nnei); - deepmd::delete_device_memory(firstneigh); - deepmd::delete_device_memory(array_longlong); - array_longlong = NULL; - } -} - -void cum_sum(std::vector& sec, const std::vector& n_sel) { - sec.resize(n_sel.size() + 1); - sec[0] = 0; - for (int ii = 1; ii < sec.size(); ++ii) { - sec[ii] = sec[ii - 1] + n_sel[ii - 1]; +void prod_env_mat_a_cuda_forward_kernel(int nsamples, + int nloc, + int ndescrpt, + int nnei, + int nall, + int mem_cpy, + int mem_nnei, + int max_nbor_size, + int nei_mode, + float rcut_a, + float rcut_r, + float rcut_r_smth, + int max_cpy_trial, + int max_nnei_trial, + bool b_nlist_map, + const std::vector& sec_a, + const std::vector& sec_r, + deepmd::InputNlist gpu_inlist, + int* nbor_list_dev, + int* array_int, + unsigned long long* array_longlong, + data_t* p_em, + data_t* p_em_deriv, + data_t* p_rij, + int* p_nlist, + const data_t* p_coord, + const data_t* p_box, + const data_t* avg, + const data_t* std, + const int* p_type, + const paddle::Tensor& mesh_tensor) { + for (int ff = 0; ff < nsamples; ++ff) { + data_t* em = p_em + ff * nloc * ndescrpt; + data_t* em_deriv = p_em_deriv + ff * nloc * ndescrpt * 3; + data_t* rij = p_rij + ff * nloc * nnei * 3; + int* nlist = p_nlist + ff * nloc * nnei; + const data_t* coord = p_coord + ff * nall * 3; + const 
data_t* box = p_box + ff * 9; + const int* type = p_type + ff * nall; + + int* idx_mapping = NULL; + int *ilist = NULL, *numneigh = NULL; + int** firstneigh = NULL; + deepmd::malloc_device_memory(firstneigh, nloc); + int* jlist = NULL; + data_t* coord_cpy; + int* type_cpy; + int frame_nall = nall; + int mesh_tensor_size = static_cast(mesh_tensor.size()); + std::vector tensor_list; + // std::vector tensor_list(7); // >>> + // 参照deepmd-kit-tf/source/op/prod_env_mat_multi_device.cc修改 + _prepare_coord_nlist_gpu( + &tensor_list, &coord, coord_cpy, &type, type_cpy, idx_mapping, + gpu_inlist, ilist, numneigh, firstneigh, jlist, nbor_list_dev, + frame_nall, mem_cpy, mem_nnei, max_nbor_size, box, + mesh_tensor.data(), mesh_tensor_size, nloc, nei_mode, rcut_r, + max_cpy_trial, max_nnei_trial); + + // allocate temp memory, temp memory must not be used after this operation! + std::vector int_temp_shape{int(sec_a.size()) + + nloc * int(sec_a.size()) + nloc}; + auto int_temp = paddle::empty(int_temp_shape, paddle::DataType::INT32, + paddle::GPUPlace()); + + array_int = int_temp.data(); + + deepmd::malloc_device_memory(array_longlong, nloc * GPU_MAX_NBOR_SIZE * 2); + // launch the gpu(nv) compute function + + deepmd::prod_env_mat_a_gpu_cuda(em, em_deriv, rij, nlist, coord, type, + gpu_inlist, array_int, array_longlong, + max_nbor_size, avg, std, nloc, frame_nall, + rcut_r, rcut_r_smth, sec_a); + if (b_nlist_map) _map_nlist_gpu(nlist, idx_mapping, nloc, nnei); + deepmd::delete_device_memory(firstneigh); + deepmd::delete_device_memory(array_longlong); + array_longlong = NULL; } } - -std::vector prod_env_mat_a_cuda_forward( - const paddle::Tensor& coord_tensor, - const paddle::Tensor& atype_tensor, - const paddle::Tensor& box_tensor, - const paddle::Tensor& mesh_tensor, - const paddle::Tensor& t_avg_tensor, - const paddle::Tensor& t_std_tensor, - const paddle::Tensor& natoms_tensor, - float rcut_a, - float rcut_r, - float rcut_r_smth, - std::vector sel_a, - std::vector sel_r -) -{ +// void cum_sum(std::vector& sec, const std::vector& n_sel) { +// sec.resize(n_sel.size() + 1); +// sec[0] = 0; +// for (int ii = 1; ii < sec.size(); ++ii) { +// sec[ii] = sec[ii - 1] + n_sel[ii - 1]; +// } +// } + +std::vector ProdEnvMatACUDAForward( + const paddle::Tensor& coord_tensor, + const paddle::Tensor& atype_tensor, + const paddle::Tensor& box_tensor, + const paddle::Tensor& mesh_tensor, + const paddle::Tensor& t_avg_tensor, + const paddle::Tensor& t_std_tensor, + const paddle::Tensor& natoms_tensor, + float rcut_a, + float rcut_r, + float rcut_r_smth, + std::vector sel_a, + std::vector sel_r) { std::vector sec_a; std::vector sec_r; int ndescrpt, ndescrpt_a, ndescrpt_r; @@ -864,18 +873,17 @@ std::vector prod_env_mat_a_cuda_forward( int mem_cpy, max_cpy_trial; int mem_nnei, max_nnei_trial; std::string device; - int *array_int = NULL; - unsigned long long *array_longlong = NULL; + int* array_int = NULL; + unsigned long long* array_longlong = NULL; deepmd::InputNlist gpu_inlist; - int *nbor_list_dev = NULL; - float nloc_f, nall_f; + int* nbor_list_dev = NULL; + // float nloc_f, nall_f; - cum_sum(sec_a, sel_a); - cum_sum(sec_r, sel_r); + deepmd::cum_sum(sec_a, sel_a); + deepmd::cum_sum(sec_r, sel_r); ndescrpt_a = sec_a.back() * 4; ndescrpt_r = sec_r.back() * 1; ndescrpt = ndescrpt_a + ndescrpt_r; - // std::cout << "ndescrpt = " << ndescrpt << std::endl; nnei_a = sec_a.back(); nnei_r = sec_r.back(); nnei = nnei_a + nnei_r; @@ -884,32 +892,11 @@ std::vector prod_env_mat_a_cuda_forward( mem_cpy = 256; max_nnei_trial = 100; 
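+  // Note (descriptive comment): mem_cpy and mem_nnei are only initial buffer sizes; _norm_copy_coord_gpu and _build_nlist_gpu below double them on failure and retry up to max_cpy_trial / max_nnei_trial attempts before reporting an allocation error.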
mem_nnei = 256; - // std::cout << "natoms.dtype = " << natoms.dtype() << std::endl; - // std::cout << "natoms.shape = "; - // for (auto &x: natoms) - // { - // std::cout << x << std::endl; - // } - // std::cout << std::endl; - - // std::cout << << std::endl; - // std::cout << "natoms.numel = " << natoms.numel() << std::endl; - // std::cout << "ckpt 1===============" << std::endl; - // auto* natoms = natoms.data(); - // std::cout << "natoms.numel() = " << natoms.numel() << std::endl; - // std::cout << "ckpt 2===============" << std::endl; - // std::cout << natoms[0] << std::endl; + CHECK_INPUT_ON_CPU(natoms_tensor); auto natoms = natoms_tensor.data(); - int nloc = natoms[0]; // TODO: 使用natoms[0] 会段错误 - // std::cout << "nloc = " << nloc << std::endl; - // std::cout << "ckpt 3===============" << std::endl; - int nall = natoms[1]; // TODO: 使用natoms[1] 会段错误 - // std::cout << "nall = " << nloc << std::endl; - // std::cout << "ckpt 4===============" << std::endl; - // int ntypes = natoms.shape()[0] - 2; - // std::cout << "ckpt 5===============" << std::endl; + int nloc = natoms[0]; + int nall = natoms[1]; int nsamples = coord_tensor.shape()[0]; - // std::cout << "ckpt 6===============" << std::endl; int nei_mode = 0; bool b_nlist_map = false; @@ -931,394 +918,261 @@ std::vector prod_env_mat_a_cuda_forward( // create output tensors auto descrpt_tensor = paddle::empty( - {nsamples, nloc * ndescrpt}, - coord_tensor.dtype(), - coord_tensor.place() - ); - // std::cout << "descrpt_tensor.shape = "; - // for (auto &x: descrpt_tensor.shape()) - // std::cout << x << " "; - // std::cout << std::endl; - - auto descrpt_deriv_tensor = paddle::empty( - {nsamples, nloc * ndescrpt * 3}, - coord_tensor.dtype(), - coord_tensor.place() - ); - // std::cout << "descrpt_deriv_tensor.shape = "; - // for (auto &x: descrpt_deriv_tensor.shape()) - // std::cout << x << " "; - // std::cout << std::endl; - - auto rij_tensor = paddle::empty( - {nsamples, nloc * nnei * 3}, - coord_tensor.dtype(), - coord_tensor.place() - ); - // std::cout << "rij_tensor.shape = "; - // for (auto &x: rij_tensor.shape()) - // std::cout << x << " "; - // std::cout << std::endl; + {nsamples, nloc * ndescrpt}, coord_tensor.dtype(), coord_tensor.place()); + + auto descrpt_deriv_tensor = + paddle::empty({nsamples, nloc * ndescrpt * 3}, coord_tensor.dtype(), + coord_tensor.place()); + + auto rij_tensor = paddle::empty({nsamples, nloc * nnei * 3}, + coord_tensor.dtype(), coord_tensor.place()); auto nlist_tensor = paddle::empty( - {nsamples, nloc * nnei}, - coord_tensor.dtype(), - coord_tensor.place() - ); - // std::cout << "nlist_tensor.shape = "; - // for (auto &x: nlist_tensor.shape()) - // std::cout << x << " "; - // std::cout << std::endl; + {nsamples, nloc * nnei}, paddle::DataType::INT32, coord_tensor.place()); // loop over samples PD_DISPATCH_FLOATING_TYPES( - coord_tensor.type(), "prod_env_mat_a_cuda_forward_kernel", ([&] { + coord_tensor.type(), "prod_env_mat_a_cuda_forward_kernel", ([&] { prod_env_mat_a_cuda_forward_kernel( - nsamples, nloc, ndescrpt, nnei, nall, mem_cpy, mem_nnei, max_nbor_size, - nei_mode, rcut_a, rcut_r, rcut_r_smth, max_cpy_trial, max_nnei_trial, b_nlist_map, sec_a, sec_r, - gpu_inlist, nbor_list_dev, array_int, array_longlong, - descrpt_tensor.mutable_data(), - descrpt_deriv_tensor.mutable_data(), - rij_tensor.mutable_data(), - nlist_tensor.mutable_data(), + nsamples, nloc, ndescrpt, nnei, nall, mem_cpy, mem_nnei, + max_nbor_size, nei_mode, rcut_a, rcut_r, rcut_r_smth, max_cpy_trial, + max_nnei_trial, b_nlist_map, sec_a, 
sec_r, gpu_inlist, + nbor_list_dev, array_int, array_longlong, + descrpt_tensor.data(), descrpt_deriv_tensor.data(), + rij_tensor.data(), nlist_tensor.data(), coord_tensor.data(), box_tensor.copy_to(paddle::CPUPlace(), false).data(), - t_avg_tensor.data(), - t_std_tensor.data(), - atype_tensor.data(), - mesh_tensor); - })); + // box_tensor.data(), + t_avg_tensor.data(), t_std_tensor.data(), + atype_tensor.data(), mesh_tensor); + })); return {descrpt_tensor, descrpt_deriv_tensor, rij_tensor, nlist_tensor}; } template -static int -_norm_copy_coord_gpu( - std::vector* tensor_list, - FPTYPE *&coord_cpy, - int *&type_cpy, - int *&idx_mapping, - int &nall, - int &mem_cpy, - const FPTYPE *coord, - const FPTYPE *box, - const int *type, - const int &nloc, - const int &max_cpy_trial, - const float &rcut_r) -{ - // Tensor FPTYPE_temp; - std::vector FPTYPE_temp_shape{nall*3}; - paddle::Tensor tmp_coord_tensor = paddle::Tensor(paddle::PlaceType::kGPU, FPTYPE_temp_shape); - FPTYPE *tmp_coord = tmp_coord_tensor.mutable_data(paddle::PlaceType::kGPU); - tensor_list->push_back(tmp_coord_tensor); - cudaMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3, cudaMemcpyDeviceToDevice); - - deepmd::Region region; - deepmd::init_region_cpu(region, box); - FPTYPE box_info[18]; - std::copy(region.boxt, region.boxt + 9, box_info); - std::copy(region.rec_boxt, region.rec_boxt + 9, box_info + 9); - int cell_info[23]; - deepmd::compute_cell_info(cell_info, rcut_r, region); - const int loc_cellnum = cell_info[21]; - const int total_cellnum = cell_info[22]; - - //Tensor double_temp; - std::vector double_temp_shape {18}; - paddle::Tensor double_temp_tensor = paddle::Tensor(paddle::PlaceType::kGPU, double_temp_shape); - FPTYPE *box_info_dev = double_temp_tensor.mutable_data(paddle::PlaceType::kGPU); - tensor_list->push_back(double_temp_tensor); - - //Tensor int_temp; - std::vector int_temp_shape {23+nloc*3+loc_cellnum+total_cellnum*3+total_cellnum*3+loc_cellnum+1+total_cellnum+1+nloc}; - paddle::Tensor int_temp_tensor = paddle::Tensor(paddle::PlaceType::kGPU, int_temp_shape); - int *cell_info_dev = int_temp_tensor.mutable_data(paddle::PlaceType::kGPU); - int *int_data_dev = cell_info_dev + 23; - tensor_list->push_back(int_temp_tensor); - - deepmd::memcpy_host_to_device(box_info_dev, box_info, 18); - deepmd::memcpy_host_to_device(cell_info_dev, cell_info, 23); - - deepmd::Region region_dev; - FPTYPE *new_boxt = region_dev.boxt; - FPTYPE *new_rec_boxt = region_dev.rec_boxt; - region_dev.boxt = box_info_dev; - region_dev.rec_boxt = box_info_dev + 9; - - deepmd::normalize_coord_gpu(tmp_coord, nall, region_dev); - - - int tt; - paddle::Tensor cpy_temp_tensor = paddle::Tensor(paddle::PlaceType::kGPU); - paddle::Tensor t_temp_tensor = paddle::Tensor(paddle::PlaceType::kGPU); - for (tt = 0; tt < max_cpy_trial; ++tt) - { - std::vector cpy_temp_shape {mem_cpy * 3}; - std::vector t_temp_shape {mem_cpy * 2}; - cpy_temp_tensor.reshape(cpy_temp_shape); - coord_cpy = cpy_temp_tensor.mutable_data(paddle::PlaceType::kGPU); - t_temp_tensor.reshape(t_temp_shape); - type_cpy = t_temp_tensor.mutable_data(paddle::PlaceType::kGPU); - - idx_mapping = type_cpy + mem_cpy; - int ret = deepmd::copy_coord_gpu( - coord_cpy, type_cpy, idx_mapping, &nall, int_data_dev, - tmp_coord, type, nloc, mem_cpy, loc_cellnum, total_cellnum, cell_info_dev, region_dev); - if (ret == 0) - { - break; - } - else - { - mem_cpy *= 2; - } +static int _norm_copy_coord_gpu(std::vector* tensor_list, + FPTYPE*& coord_cpy, + int*& type_cpy, + int*& idx_mapping, + int& nall, + int& 
mem_cpy, + const FPTYPE* coord, + const FPTYPE* box, + const int* type, + const int& nloc, + const int& max_cpy_trial, + const float& rcut_r) { + // Tensor FPTYPE_temp; + std::vector FPTYPE_temp_shape{nall * 3}; + + + // use type trait to determine the data type + paddle::Tensor tmp_coord_tensor; + if (std::is_same::value) { + tmp_coord_tensor = paddle::empty( + FPTYPE_temp_shape, paddle::DataType::FLOAT32, paddle::GPUPlace() + ); + } else if (std::is_same::value) { + tmp_coord_tensor = paddle::empty( + FPTYPE_temp_shape, paddle::DataType::FLOAT64, paddle::GPUPlace() + ); + } else { + PD_THROW("invalid data type"); + } + FPTYPE* tmp_coord = tmp_coord_tensor.data(); + tensor_list->push_back(tmp_coord_tensor); + cudaMemcpy(tmp_coord, coord, sizeof(FPTYPE) * nall * 3, + cudaMemcpyDeviceToDevice); + + deepmd::Region region; + deepmd::init_region_cpu(region, box); + FPTYPE box_info[18]; + std::copy(region.boxt, region.boxt + 9, box_info); + std::copy(region.rec_boxt, region.rec_boxt + 9, box_info + 9); + int cell_info[23]; + deepmd::compute_cell_info(cell_info, rcut_r, region); + const int loc_cellnum = cell_info[21]; + const int total_cellnum = cell_info[22]; + + // Tensor double_temp; + std::vector double_temp_shape{18}; + paddle::Tensor double_temp_tensor = paddle::empty( + double_temp_shape, tmp_coord_tensor.dtype(), paddle::GPUPlace()); + FPTYPE* box_info_dev = double_temp_tensor.data(); + tensor_list->push_back(double_temp_tensor); + + // Tensor int_temp; + std::vector int_temp_shape{ + 23 + nloc * 3 + loc_cellnum + total_cellnum * 3 + total_cellnum * 3 + + loc_cellnum + 1 + total_cellnum + 1 + nloc}; + paddle::Tensor int_temp_tensor = paddle::empty( + int_temp_shape, paddle::DataType::INT32, paddle::GPUPlace()); + int* cell_info_dev = int_temp_tensor.data(); + int* int_data_dev = cell_info_dev + 23; + tensor_list->push_back(int_temp_tensor); + + deepmd::memcpy_host_to_device(box_info_dev, box_info, 18); + deepmd::memcpy_host_to_device(cell_info_dev, cell_info, 23); + + deepmd::Region region_dev; + FPTYPE* new_boxt = region_dev.boxt; + FPTYPE* new_rec_boxt = region_dev.rec_boxt; + region_dev.boxt = box_info_dev; + region_dev.rec_boxt = box_info_dev + 9; + + deepmd::normalize_coord_gpu(tmp_coord, nall, region_dev); + + int tt; + paddle::Tensor cpy_temp_tensor = paddle::Tensor(paddle::PlaceType::kGPU); + paddle::Tensor t_temp_tensor = paddle::Tensor(paddle::PlaceType::kGPU); + for (tt = 0; tt < max_cpy_trial; ++tt) { + std::vector cpy_temp_shape{mem_cpy * 3}; + std::vector t_temp_shape{mem_cpy * 2}; + cpy_temp_tensor.reshape(cpy_temp_shape); + coord_cpy = cpy_temp_tensor.mutable_data(paddle::PlaceType::kGPU); + t_temp_tensor.reshape(t_temp_shape); + type_cpy = t_temp_tensor.mutable_data(paddle::PlaceType::kGPU); + + idx_mapping = type_cpy + mem_cpy; + int ret = deepmd::copy_coord_gpu( + coord_cpy, type_cpy, idx_mapping, &nall, int_data_dev, tmp_coord, type, + nloc, mem_cpy, loc_cellnum, total_cellnum, cell_info_dev, region_dev); + if (ret == 0) { + break; + } else { + mem_cpy *= 2; } - tensor_list->push_back(cpy_temp_tensor); - tensor_list->push_back(t_temp_tensor); - region_dev.boxt = new_boxt; - region_dev.rec_boxt = new_rec_boxt; + } + tensor_list->push_back(cpy_temp_tensor); + tensor_list->push_back(t_temp_tensor); + region_dev.boxt = new_boxt; + region_dev.rec_boxt = new_rec_boxt; - return (tt != max_cpy_trial); + return (tt != max_cpy_trial); } template -static int -_build_nlist_gpu( - std::vector *tensor_list, - int *&ilist, - int *&numneigh, - int **&firstneigh, - int *&jlist, - int 
&max_nnei, - int &mem_nnei, - const FPTYPE *coord, - const int &nloc, - const int &new_nall, - const int &max_nnei_trial, - const float &rcut_r) -{ - //Tensor nlist_temp; - std::vector nlist_temp_shape {nloc * 2}; - paddle::Tensor nlist_temp_tensor = paddle::Tensor(paddle::PlaceType::kGPU, nlist_temp_shape); - ilist = nlist_temp_tensor.mutable_data(paddle::PlaceType::kGPU); - tensor_list->push_back(nlist_temp_tensor); - numneigh = ilist + nloc; - //Tensor jlist_temp; - int *ind_data = NULL; - - std::vector firstneigh_host(nloc); - int tt; - paddle::Tensor jlist_temp_tensor = paddle::Tensor(paddle::PlaceType::kGPU); - for (tt = 0; tt < max_nnei_trial; ++tt) - { - std::vector jlist_temp_shape {3 * nloc * mem_nnei}; - jlist_temp_tensor.reshape(jlist_temp_shape); - jlist = jlist_temp_tensor.mutable_data(paddle::PlaceType::kGPU); - ind_data = jlist + nloc * mem_nnei; - for (int ii = 0; ii < nloc; ++ii) - { - firstneigh_host[ii] = jlist + ii * mem_nnei; - } - deepmd::memcpy_host_to_device(firstneigh, firstneigh_host); - deepmd::InputNlist inlist(nloc, ilist, numneigh, firstneigh); - int ret = deepmd::build_nlist_gpu( - inlist, &max_nnei, ind_data, - coord, nloc, new_nall, mem_nnei, rcut_r); - if (ret == 0) - { - break; - } - else - { - mem_nnei *= 2; - } +static int _build_nlist_gpu(std::vector* tensor_list, + int*& ilist, + int*& numneigh, + int**& firstneigh, + int*& jlist, + int& max_nnei, + int& mem_nnei, + const FPTYPE* coord, + const int& nloc, + const int& new_nall, + const int& max_nnei_trial, + const float& rcut_r) { + // Tensor nlist_temp; + std::vector nlist_temp_shape{nloc * 2}; + paddle::Tensor nlist_temp_tensor = paddle::empty( + nlist_temp_shape, paddle::DataType::INT32, paddle::GPUPlace()); + ilist = nlist_temp_tensor.data(); + tensor_list->push_back(nlist_temp_tensor); + numneigh = ilist + nloc; + // Tensor jlist_temp; + int* ind_data = NULL; + + std::vector firstneigh_host(nloc); + int tt; + paddle::Tensor jlist_temp_tensor = paddle::Tensor(paddle::PlaceType::kGPU); + for (tt = 0; tt < max_nnei_trial; ++tt) { + std::vector jlist_temp_shape{3 * nloc * mem_nnei}; + jlist_temp_tensor.reshape(jlist_temp_shape); + jlist = jlist_temp_tensor.mutable_data(paddle::PlaceType::kGPU); + ind_data = jlist + nloc * mem_nnei; + for (int ii = 0; ii < nloc; ++ii) { + firstneigh_host[ii] = jlist + ii * mem_nnei; + } + deepmd::memcpy_host_to_device(firstneigh, firstneigh_host); + deepmd::InputNlist inlist(nloc, ilist, numneigh, firstneigh); + int ret = deepmd::build_nlist_gpu(inlist, &max_nnei, ind_data, coord, nloc, + new_nall, mem_nnei, rcut_r); + if (ret == 0) { + break; + } else { + mem_nnei *= 2; } - tensor_list->push_back(jlist_temp_tensor); - return (tt != max_nnei_trial); + } + tensor_list->push_back(jlist_temp_tensor); + return (tt != max_nnei_trial); } -static void -_map_nlist_gpu( - int *nlist, - const int *idx_mapping, - const int &nloc, - const int &nnei) -{ - deepmd::use_nlist_map(nlist, idx_mapping, nloc, nnei); +static void _map_nlist_gpu(int* nlist, + const int* idx_mapping, + const int& nloc, + const int& nnei) { + deepmd::use_nlist_map(nlist, idx_mapping, nloc, nnei); } template -static void -_prepare_coord_nlist_gpu( - std::vector *tensor_list, - FPTYPE const **coord, - FPTYPE *&coord_cpy, - int const **type, - int *&type_cpy, - int *&idx_mapping, - deepmd::InputNlist &inlist, - int *&ilist, - int *&numneigh, - int **&firstneigh, - int *&jlist, - int *&nbor_list_dev, - int &new_nall, - int &mem_cpy, - int &mem_nnei, - int &max_nbor_size, - const FPTYPE *box, - const int 
*mesh_tensor_data, - const int mesh_tensor_size, - const int &nloc, - const int &nei_mode, - const float &rcut_r, - const int &max_cpy_trial, - const int &max_nnei_trial) -{ - inlist.inum = nloc; - if (nei_mode != 3) - { - // build nlist by myself - // normalize and copy coord - if (nei_mode == 1) - { - int copy_ok = _norm_copy_coord_gpu( - tensor_list, coord_cpy, type_cpy, idx_mapping, new_nall, mem_cpy, - *coord, box, *type, nloc, max_cpy_trial, rcut_r); - PD_CHECK(copy_ok, "cannot allocate mem for copied coords"); - *coord = coord_cpy; - *type = type_cpy; - - } - - //build nlist - int build_ok = _build_nlist_gpu( - tensor_list, ilist, numneigh, firstneigh, jlist, max_nbor_size, mem_nnei, - *coord, nloc, new_nall, max_nnei_trial, rcut_r); - PD_CHECK(build_ok, "cannot allocate mem for nlist"); - if (max_nbor_size <= 1024) - { - max_nbor_size = 1024; - } - else if (max_nbor_size <= 2048) - { - max_nbor_size = 2048; - } - else - { - max_nbor_size = 4096; - } - inlist.ilist = ilist; - inlist.numneigh = numneigh; - inlist.firstneigh = firstneigh; +static void _prepare_coord_nlist_gpu(std::vector* tensor_list, + FPTYPE const** coord, + FPTYPE*& coord_cpy, + int const** type, + int*& type_cpy, + int*& idx_mapping, + deepmd::InputNlist& inlist, + int*& ilist, + int*& numneigh, + int**& firstneigh, + int*& jlist, + int*& nbor_list_dev, + int& new_nall, + int& mem_cpy, + int& mem_nnei, + int& max_nbor_size, + const FPTYPE* box, + const int* mesh_tensor_data, + const int mesh_tensor_size, + const int& nloc, + const int& nei_mode, + const float& rcut_r, + const int& max_cpy_trial, + const int& max_nnei_trial) { + inlist.inum = nloc; + if (nei_mode != 3) { + // build nlist by myself + // normalize and copy coord + if (nei_mode == 1) { + int copy_ok = _norm_copy_coord_gpu( + tensor_list, coord_cpy, type_cpy, idx_mapping, new_nall, mem_cpy, + *coord, box, *type, nloc, max_cpy_trial, rcut_r); + PD_CHECK(copy_ok, "cannot allocate mem for copied coords"); + *coord = coord_cpy; + *type = type_cpy; } - else - { - // update nbor list - deepmd::InputNlist inlist_temp; - inlist_temp.inum = nloc; - deepmd::env_mat_nbor_update( - inlist_temp, inlist, max_nbor_size, nbor_list_dev, - mesh_tensor_data, mesh_tensor_size); - // env_mat_nbor_update( - // inlist_temp, inlist, max_nbor_size, nbor_list_dev, - // mesh_tensor_data, mesh_tensor_size); - PD_CHECK((max_numneigh(inlist_temp) <= GPU_MAX_NBOR_SIZE), "Assert failed, max neighbor size of atom(lammps) " + std::to_string(max_numneigh(inlist_temp)) + " is larger than " + std::to_string(GPU_MAX_NBOR_SIZE) + ", which currently is not supported by deepmd-kit."); - } -} - -std::vector ProdEnvMatAForward( - const paddle::Tensor& coord_tensor, - const paddle::Tensor& atype_tensor, - const paddle::Tensor& mesh_tensor, - const paddle::Tensor& box_tensor, - const paddle::Tensor& t_avg_tensor, - const paddle::Tensor& t_std_tensor, - const paddle::Tensor& natoms_tensor, - float rcut_a, - float rcut_r, - float rcut_r_smth, - std::vector sel_a, - std::vector sel_r -) { - if (coord_tensor.is_gpu()) { - return prod_env_mat_a_cuda_forward( - coord_tensor, - atype_tensor, - mesh_tensor, - box_tensor, - t_avg_tensor, - t_std_tensor, - natoms_tensor, - rcut_a, - rcut_r, - rcut_r_smth, - sel_a, - sel_r - ); + // build nlist + int build_ok = _build_nlist_gpu(tensor_list, ilist, numneigh, firstneigh, + jlist, max_nbor_size, mem_nnei, *coord, + nloc, new_nall, max_nnei_trial, rcut_r); + PD_CHECK(build_ok, "cannot allocate mem for nlist"); + if (max_nbor_size <= 1024) { + max_nbor_size = 
1024; + } else if (max_nbor_size <= 2048) { + max_nbor_size = 2048; + } else { + max_nbor_size = 4096; + } + inlist.ilist = ilist; + inlist.numneigh = numneigh; + inlist.firstneigh = firstneigh; } else { - PD_THROW("Unsupported device type for forward function of custom relu operator."); + // update nbor list + deepmd::InputNlist inlist_temp; + inlist_temp.inum = nloc; + deepmd::env_mat_nbor_update(inlist_temp, inlist, max_nbor_size, + nbor_list_dev, mesh_tensor_data, + mesh_tensor_size); + // env_mat_nbor_update( + // inlist_temp, inlist, max_nbor_size, nbor_list_dev, + // mesh_tensor_data, mesh_tensor_size); + PD_CHECK((max_numneigh(inlist_temp) <= GPU_MAX_NBOR_SIZE), + "Assert failed, max neighbor size of atom(lammps) " + + std::to_string(max_numneigh(inlist_temp)) + + " is larger than " + std::to_string(GPU_MAX_NBOR_SIZE) + + ", which currently is not supported by deepmd-kit."); } } - - -std::vector> ProdEnvMatAInferShape( - std::vector coord_shape, - std::vector atype_shape, - std::vector box_shape, - std::vector mesh_shape, - std::vector t_avg_shape, - std::vector t_std_shape, - std::vector natoms_shape, - float rcut_a, - float rcut_r, - float rcut_r_smth, - const std::vector& sel_a, - const std::vector& sel_r -) { - int64_t nloc = /*natoms[0]*/ 192; - int64_t nall = /*natoms[1]*/ 192; - - std::vector sec_a; - std::vector sec_r; - cum_sum(sec_a, sel_a); - cum_sum(sec_r, sel_r); - - int64_t nsamples = coord_shape[0]; - int64_t ndescrpt_a = sec_a.back() * 4; - int64_t ndescrpt_r = sec_r.back() * 1; - int64_t ndescrpt = ndescrpt_a + ndescrpt_r; - - int64_t nnei_a = sec_a.back(); - int64_t nnei_r = sec_r.back(); - int64_t nnei = nnei_a + nnei_r; - - std::vector descrpt_shape = {nsamples, nloc * ndescrpt}; - std::vector descrpt_deriv_shape = {nsamples, nloc * ndescrpt * 3}; - std::vector rij_shape = {nsamples, nloc * nnei * 3}; - std::vector nlist_shape = {nsamples, nloc * nnei}; - return {descrpt_shape, descrpt_deriv_shape, rij_shape, nlist_shape}; -} - -std::vector ProdEnvMatAInferDtype( - paddle::DataType coord_dtype, - paddle::DataType atype_dtype, - paddle::DataType box_dtype, - paddle::DataType mesh_dtype, - paddle::DataType t_avg_dtype, - paddle::DataType t_std_dtype, - paddle::DataType natoms_dtype -) { - return {coord_dtype, coord_dtype, coord_dtype, coord_dtype}; -} - - -PD_BUILD_OP(prod_env_mat_a) - .Inputs({"coord", "atype", "box", "mesh", "t_avg", "t_std", "natoms"}) - .Outputs({"descrpt", "descrpt_deriv", "rij", "nlist"}) - .Attrs({"rcut_a: float", "rcut_r: float", "rcut_r_smth: float", "sel_a: std::vector", "sel_r: std::vector"}) - .SetKernelFn(PD_KERNEL(ProdEnvMatAForward)) - .SetInferShapeFn(PD_INFER_SHAPE(ProdEnvMatAInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(ProdEnvMatAInferDtype)); \ No newline at end of file diff --git a/source/lib/paddle_src/paddle_prod_force.cc b/source/lib/paddle_src/paddle_prod_force.cc new file mode 100644 index 0000000000..95c6671984 --- /dev/null +++ b/source/lib/paddle_src/paddle_prod_force.cc @@ -0,0 +1,142 @@ +#include "paddle/extension.h" +#include "prod_force.h" + +#define CHECK_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") +#define CHECK_INPUT_DIM(x, value) \ + PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".") + +template +void ProdForceSeAOpForwardCPUKernel(int nloc, + int nall, + int nframes, + int ndescrpt, + int nnei, + data_t* p_force, + const data_t* p_net_deriv, + const data_t* p_in_deriv, + const int* p_nlist) { + for (int kk = 0; kk < nframes; ++kk) { + data_t* force = p_force + kk * nall * 3; 
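+    // Note (descriptive comment): inputs and outputs are packed frame by frame; frame kk owns nall*3 force values, nloc*ndescrpt net_deriv values, nloc*ndescrpt*3 in_deriv values and nloc*nnei nlist entries, so each pointer below is offset by kk times the corresponding stride.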
+ const data_t* net_deriv = p_net_deriv + kk * nloc * ndescrpt; + const data_t* in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3; + const int* nlist = p_nlist + kk * nloc * nnei; + deepmd::prod_force_a_cpu(force, net_deriv, in_deriv, nlist, nloc, nall, + nnei, 0); + } +} + +std::vector ProdForceSeAOpCPUForward( + const paddle::Tensor& net_deriv_tensor, + const paddle::Tensor& in_deriv_tensor, + const paddle::Tensor& nlist_tensor, + const paddle::Tensor& natoms_tensor, + int n_a_sel, + int n_r_sel) { + CHECK_INPUT(net_deriv_tensor); + CHECK_INPUT(in_deriv_tensor); + CHECK_INPUT(nlist_tensor); + CHECK_INPUT(natoms_tensor); + + CHECK_INPUT_DIM(net_deriv_tensor, 2); + CHECK_INPUT_DIM(in_deriv_tensor, 2); + CHECK_INPUT_DIM(nlist_tensor, 2); + CHECK_INPUT_DIM(natoms_tensor, 1); + + PD_CHECK(natoms_tensor.shape()[0] >= 3, + "number of atoms should be larger than (or equal to) 3"); + const int* natoms = natoms_tensor.data(); + int nloc = natoms[0]; + int nall = natoms[1]; + int nframes = net_deriv_tensor.shape()[0]; + int ndescrpt = net_deriv_tensor.shape()[1] / nloc; + int nnei = nlist_tensor.shape()[1] / nloc; + + PD_CHECK(nframes == in_deriv_tensor.shape()[0], + "number of samples should match"); + PD_CHECK(nframes == nlist_tensor.shape()[0], + "number of samples should match"); + PD_CHECK(nloc * ndescrpt * 3 == in_deriv_tensor.shape()[1], + "number of descriptors should match"); + + std::vector force_shape{nframes, 3 * nall}; + paddle::Tensor force_tensor = paddle::empty( + force_shape, net_deriv_tensor.dtype(), net_deriv_tensor.place()); + + assert(nframes == force_shape[0]); + assert(nframes == net_deriv_tensor.shape()[0]); + assert(nframes == in_deriv_tensor.shape()[0]); + assert(nframes == nlist_tensor.shape()[0]); + assert(nall * 3 == force_shape[1]); + assert(nloc * ndescrpt == net_deriv_tensor.shape()[1]); + assert(nloc * ndescrpt * 3 == in_deriv_tensor.shape()[1]); + assert(nloc * nnei == nlist_tensor.shape()[1]); + assert(nnei * 4 == ndescrpt); + + PD_DISPATCH_FLOATING_TYPES( + net_deriv_tensor.type(), "prod_force_se_a_cpu_forward_kernel", ([&] { + ProdForceSeAOpForwardCPUKernel( + nloc, nall, nframes, ndescrpt, nnei, + force_tensor.mutable_data(), + net_deriv_tensor.data(), in_deriv_tensor.data(), + nlist_tensor.data()); + })); + + return {force_tensor}; +} + +std::vector ProdForceSeAOpCUDAForward( + const paddle::Tensor& net_deriv_tensor, + const paddle::Tensor& in_deriv_tensor, + const paddle::Tensor& nlist_tensor, + const paddle::Tensor& natoms_tensor, + int n_a_sel, + int n_r_sel); + +std::vector ProdForceSeAForward( + const paddle::Tensor& net_deriv_tensor, + const paddle::Tensor& in_deriv_tensor, + const paddle::Tensor& nlist_tensor, + const paddle::Tensor& natoms_tensor, + int n_a_sel, + int n_r_sel) { + if (net_deriv_tensor.place() == paddle::GPUPlace()) { + return ProdForceSeAOpCUDAForward(net_deriv_tensor, in_deriv_tensor, + nlist_tensor, natoms_tensor.copy_to(paddle::CPUPlace(), false), n_a_sel, + n_r_sel); + } else if (net_deriv_tensor.place() == paddle::CPUPlace()) { + return ProdForceSeAOpCPUForward(net_deriv_tensor, in_deriv_tensor, + nlist_tensor, natoms_tensor, n_a_sel, + n_r_sel); + } else { + PD_THROW("No Such kernel for ProdForceSeAForward."); + } +} + +std::vector> ProdForceSeAInferShape( + std::vector net_deriv_shape, + std::vector in_deriv_shape, + std::vector nlist_shape, + std::vector natoms_shape, + const int& n_a_sel, + const int& n_r_sel) { + int64_t nall = /*natoms[1]*/ 192; + int64_t nframes = net_deriv_shape[0]; + std::vector force_shape = {nframes, 3 * 
nall}; + return {force_shape}; +} + +std::vector ProdForceSeAInferDtype( + paddle::DataType net_deriv_dtype, + paddle::DataType in_deriv_dtype, + paddle::DataType nlist_dtype, + paddle::DataType natoms_dtype) { + return {net_deriv_dtype}; +} + +PD_BUILD_OP(prod_force_se_a) + .Inputs({"net_deriv", "in_deriv", "nlist", "natoms"}) + .Outputs({"force"}) + .Attrs({"n_a_sel: int", "n_r_sel: int"}) + .SetKernelFn(PD_KERNEL(ProdForceSeAForward)) + .SetInferShapeFn(PD_INFER_SHAPE(ProdForceSeAInferShape)) + .SetInferDtypeFn(PD_INFER_DTYPE(ProdForceSeAInferDtype)); diff --git a/source/lib/paddle_src/prod_force.cu b/source/lib/paddle_src/paddle_prod_force.cu similarity index 65% rename from source/lib/paddle_src/prod_force.cu rename to source/lib/paddle_src/paddle_prod_force.cu index 4416cef082..a767ce6f3f 100644 --- a/source/lib/paddle_src/prod_force.cu +++ b/source/lib/paddle_src/paddle_prod_force.cu @@ -1,11 +1,12 @@ -#include "paddle/extension.h" - #include "device.h" -#include "prod_force.h" #include "gpu_cuda.h" +#include "paddle/extension.h" +#include "prod_force.h" #define CHECK_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") -#define CHECK_INPUT_DIM(x, value) PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".") +#define CHECK_INPUT_ON_CPU(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") +#define CHECK_INPUT_DIM(x, value) \ + PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".") template __global__ void force_deriv_wrt_center_atom(FPTYPE* force, @@ -180,33 +181,33 @@ template void prod_force_r_gpu_cuda(double* force, const int nnei); } // namespace deepmd - template -void PdProdForceSeAOpForwardCUDAKernel( - int nloc, int nall, int nframes, int ndescrpt, int nnei, - data_t* p_force, const data_t* p_net_deriv, const data_t* p_in_deriv, const int* p_nlist -) { - for(int kk = 0; kk < nframes; ++kk){ - data_t * force = p_force + kk * nall * 3; - const data_t * net_deriv = p_net_deriv + kk * nloc * ndescrpt; - const data_t * in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3; - const int * nlist = p_nlist + kk * nloc * nnei; - deepmd::prod_force_a_gpu_cuda( - force, - net_deriv, in_deriv, nlist, nloc, nall, nnei - ); +void ProdForceSeAOpForwardCUDAKernel(int nloc, + int nall, + int nframes, + int ndescrpt, + int nnei, + data_t* p_force, + const data_t* p_net_deriv, + const data_t* p_in_deriv, + const int* p_nlist) { + for (int kk = 0; kk < nframes; ++kk) { + data_t* force = p_force + kk * nall * 3; + const data_t* net_deriv = p_net_deriv + kk * nloc * ndescrpt; + const data_t* in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3; + const int* nlist = p_nlist + kk * nloc * nnei; + deepmd::prod_force_a_gpu_cuda(force, net_deriv, in_deriv, nlist, nloc, nall, + nnei); } } - -std::vector PdProdForceSeAOpCUDAForward( - const paddle::Tensor& net_deriv_tensor, - const paddle::Tensor& in_deriv_tensor, - const paddle::Tensor& nlist_tensor, - const paddle::Tensor& natoms_tensor, - int n_a_sel, - int n_r_sel -) { +std::vector ProdForceSeAOpCUDAForward( + const paddle::Tensor& net_deriv_tensor, + const paddle::Tensor& in_deriv_tensor, + const paddle::Tensor& nlist_tensor, + const paddle::Tensor& natoms_tensor, + int n_a_sel, + int n_r_sel) { CHECK_INPUT(net_deriv_tensor); CHECK_INPUT(in_deriv_tensor); CHECK_INPUT(nlist_tensor); @@ -215,7 +216,10 @@ std::vector PdProdForceSeAOpCUDAForward( CHECK_INPUT_DIM(in_deriv_tensor, 2); CHECK_INPUT_DIM(natoms_tensor, 1); - PD_CHECK(natoms_tensor.shape()[0] >= 3, "number of atoms should be larger than (or equal to) 3"); + 
CHECK_INPUT_ON_CPU(natoms_tensor); + PD_CHECK(natoms_tensor.shape()[0] >= 3, + "number of atoms should be larger than (or equal to) 3"); + const int* natoms = natoms_tensor.data(); int nloc = natoms[0]; int nall = natoms[1]; @@ -223,81 +227,34 @@ std::vector PdProdForceSeAOpCUDAForward( int ndescrpt = net_deriv_tensor.shape()[1] / nloc; int nnei = nlist_tensor.shape()[1] / nloc; - PD_CHECK(nframes == in_deriv_tensor.shape()[0], "number of samples should match"); - PD_CHECK(nframes == nlist_tensor.shape()[0],"number of samples should match"); - PD_CHECK(nloc * ndescrpt * 3 == in_deriv_tensor.shape()[1], "number of descriptors should match"); + PD_CHECK(nframes == in_deriv_tensor.shape()[0], + "number of samples should match"); + PD_CHECK(nframes == nlist_tensor.shape()[0], + "number of samples should match"); + PD_CHECK(nloc * ndescrpt * 3 == in_deriv_tensor.shape()[1], + "number of descriptors should match"); - std::vector force_shape {nframes, 3 * nall}; - paddle::Tensor force_tensor = paddle::Tensor(paddle::PlaceType::kGPU, force_shape); + std::vector force_shape{nframes, 3 * nall}; + paddle::Tensor force_tensor = paddle::empty( + force_shape, net_deriv_tensor.dtype(), net_deriv_tensor.place()); - assert (nframes == force_shape[0]); - assert (nframes == net_deriv_tensor.shape()[0]); - assert (nframes == in_deriv_tensor.shape()[0]); - assert (nframes == nlist_tensor.shape()[0]); - assert (nall * 3 == force_shape[1]); - assert (nloc * ndescrpt == net_deriv_tensor.shape()[1]); - assert (nloc * ndescrpt * 3 == in_deriv_tensor.shape()[1]); - assert (nloc * nnei == nlist_tensor.shape()[1]); - assert (nnei * 4 == ndescrpt); + assert(nframes == force_shape[0]); + assert(nframes == net_deriv_tensor.shape()[0]); + assert(nframes == in_deriv_tensor.shape()[0]); + assert(nframes == nlist_tensor.shape()[0]); + assert(nall * 3 == force_shape[1]); + assert(nloc * ndescrpt == net_deriv_tensor.shape()[1]); + assert(nloc * ndescrpt * 3 == in_deriv_tensor.shape()[1]); + assert(nloc * nnei == nlist_tensor.shape()[1]); + assert(nnei * 4 == ndescrpt); PD_DISPATCH_FLOATING_TYPES( - net_deriv_tensor.type(), "pd_prod_force_se_a_cpu_forward_kernel", ([&] { - PdProdForceSeAOpForwardCUDAKernel( - nloc, nall, nframes, ndescrpt, nnei, - force_tensor.mutable_data(), net_deriv_tensor.data(), - in_deriv_tensor.data(), nlist_tensor.data()); - })); + net_deriv_tensor.type(), "prod_force_se_a_gpu_forward_kernel", ([&] { + ProdForceSeAOpForwardCUDAKernel( + nloc, nall, nframes, ndescrpt, nnei, force_tensor.data(), + net_deriv_tensor.data(), in_deriv_tensor.data(), + nlist_tensor.data()); + })); return {force_tensor}; } - - -std::vector PdProdForceSeAForward( - const paddle::Tensor& net_deriv_tensor, - const paddle::Tensor& in_deriv_tensor, - const paddle::Tensor& nlist_tensor, - const paddle::Tensor& natoms_tensor, - int n_a_sel, - int n_r_sel -) { - // if(net_deriv_tensor.place() == paddle::PlaceType::kCPU){ - // return PdProdForceSeAOpCPUForward(net_deriv_tensor, in_deriv_tensor, nlist_tensor, natoms_tensor, n_a_sel, n_r_sel); - // }else if(net_deriv_tensor.place() == paddle::PlaceType::kGPU){ - return PdProdForceSeAOpCUDAForward(net_deriv_tensor, in_deriv_tensor, nlist_tensor, natoms_tensor, n_a_sel, n_r_sel); - // }else{ - // PD_THROW("No Such kernel for PdFrodForceSeAForward!"); - // } -} - -std::vector> PdProdForceSeAInferShape( - std::vector net_deriv_shape, - std::vector in_deriv_shape, - std::vector nlist_shape, - std::vector natoms_shape, - const int &n_a_sel, - const int &n_r_sel -) { - // int64_t nloc = 
/*natoms[0]*/ 192; - int64_t nall = /*natoms[1]*/ 192; - int64_t nframes = net_deriv_shape[0]; - std::vector force_shape = {nframes, 3 * nall}; - return {force_shape}; -} - -std::vector PdProdForceSeAInferDtype( - paddle::DataType net_deriv_dtype, - paddle::DataType in_deriv_dtype, - paddle::DataType nlist_dtype, - paddle::DataType natoms_dtype -) { - return {net_deriv_dtype}; -} - - -PD_BUILD_OP(prod_force_se_a) - .Inputs({"net_deriv", "in_deriv", "nlist", "natoms"}) - .Outputs({"force"}) - .Attrs({"n_a_sel: int", "n_r_sel: int"}) - .SetKernelFn(PD_KERNEL(PdProdForceSeAForward)) - .SetInferShapeFn(PD_INFER_SHAPE(PdProdForceSeAInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(PdProdForceSeAInferDtype)); diff --git a/source/lib/paddle_src/paddle_prod_force_grad.cc b/source/lib/paddle_src/paddle_prod_force_grad.cc new file mode 100644 index 0000000000..f12c2c425d --- /dev/null +++ b/source/lib/paddle_src/paddle_prod_force_grad.cc @@ -0,0 +1,141 @@ +#include "paddle/extension.h" +#include "prod_force_grad.h" + +#define CHECK_INPUT_READY(x) \ + PD_CHECK(x.initialized(), #x " must be initialized before usage.") +#define CHECK_INPUT_DIM(x, value) \ + PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".") +#define CHECK_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") + +template +void ProdForceSeAOpCPUBackwardKernel(int nloc, + int nframes, + int ndescrpt, + int nnei, + const data_t* grad, + const data_t* net_deriv, + const data_t* in_deriv, + const int* nlist, + data_t* grad_net) { + // #pragma omp parallel for + // for (int kk = 0; kk < nframes; ++kk){ + // int grad_iter = kk * nloc * 3; + // int in_iter = kk * nloc * ndescrpt * 3; + // int nlist_iter = kk * nloc * nnei; + // int grad_net_iter = kk * nloc * ndescrpt; + + // deepmd::prod_force_grad_a_cpu( + // &grad_net[grad_net_iter], + // &grad[grad_iter], + // &in_deriv[in_iter], + // &nlist[nlist_iter], + // nloc, + // nnei + // ); + // } + + for (int kk = 0; kk < nframes; ++kk) { + data_t* p_grad_net = grad_net + kk * nloc * ndescrpt; + const data_t* p_grad = grad + kk * nloc * 3; + const data_t* p_in_deriv = in_deriv + kk * nloc * ndescrpt * 3; + const int* p_nlist = nlist + kk * nloc * nnei; + + deepmd::prod_force_grad_a_cpu(p_grad_net, p_grad, p_in_deriv, p_nlist, nloc, + nnei); + } +} + +std::vector ProdForceSeAOpCPUBackward( + const paddle::Tensor& grad_tensor, + const paddle::Tensor& net_deriv_tensor, + const paddle::Tensor& in_deriv_tensor, + const paddle::Tensor& nlist_tensor, + const paddle::Tensor& natoms_tensor, + int n_a_sel, + int n_r_sel) { + CHECK_INPUT_READY(grad_tensor); + CHECK_INPUT_READY(net_deriv_tensor); + CHECK_INPUT_READY(in_deriv_tensor); + CHECK_INPUT_READY(nlist_tensor); + CHECK_INPUT_READY(natoms_tensor); + + auto grad_shape = grad_tensor.shape(); + auto net_deriv_shape = net_deriv_tensor.shape(); + auto in_deriv_shape = in_deriv_tensor.shape(); + auto nlist_shape = nlist_tensor.shape(); + auto natoms_shape = natoms_tensor.shape(); + + CHECK_INPUT_DIM(grad_tensor, 2); + CHECK_INPUT_DIM(net_deriv_tensor, 2); + CHECK_INPUT_DIM(in_deriv_tensor, 2); + CHECK_INPUT_DIM(nlist_tensor, 2); + CHECK_INPUT_DIM(natoms_tensor, 1); + + PD_CHECK(natoms_shape[0] >= 3, + "number of atoms should be larger than (or equal to) 3"); + const int* natoms = natoms_tensor.data(); + + int nframes = net_deriv_shape[0]; + int nloc = natoms[0]; + int ndescrpt = net_deriv_shape[1] / nloc; + int nnei = nlist_shape[1] / nloc; + + PD_CHECK(nframes == grad_shape[0], "number of frames should match"); + PD_CHECK(nframes == 
in_deriv_shape[0], "number of samples should match"); + PD_CHECK(nframes == nlist_shape[0], "number of samples should match"); + PD_CHECK((nloc * 3) == grad_shape[1], + "input grad shape should be 3 x natoms"); + PD_CHECK(nloc * ndescrpt * 3 == in_deriv_shape[1], + "number of descriptors should match"); + PD_CHECK(nnei == (n_a_sel + n_r_sel), "number of neighbors should match"); + + std::vector grad_net_shape{nframes, (int64_t)nloc * ndescrpt}; + + paddle::Tensor grad_net_tensor = + paddle::empty(grad_net_shape, grad_tensor.dtype(), grad_tensor.place()); + + PD_DISPATCH_FLOATING_TYPES( + grad_tensor.type(), "prod_force_se_a_cpu_backward_kernel", ([&] { + ProdForceSeAOpCPUBackwardKernel( + nloc, nframes, ndescrpt, nnei, grad_tensor.data(), + net_deriv_tensor.data(), in_deriv_tensor.data(), + nlist_tensor.data(), grad_net_tensor.data()); + })); + return {grad_net_tensor}; +} + +std::vector ProdForceSeAOpCUDABackward( + const paddle::Tensor& force_grad_tensor, + const paddle::Tensor& net_deriv_tensor, + const paddle::Tensor& in_deriv_tensor, + const paddle::Tensor& nlist_tensor, + const paddle::Tensor& natoms_tensor, + int n_a_sel, + int n_r_sel); + +std::vector ProdForceSeABackward( + const paddle::Tensor& force_grad_tensor, + const paddle::Tensor& net_deriv_tensor, + const paddle::Tensor& in_deriv_tensor, + const paddle::Tensor& nlist_tensor, + const paddle::Tensor& natoms_tensor, + int n_a_sel, + int n_r_sel) { + if (net_deriv_tensor.place() == paddle::GPUPlace()) { + return ProdForceSeAOpCUDABackward(force_grad_tensor, net_deriv_tensor, + in_deriv_tensor, nlist_tensor, + natoms_tensor.copy_to(paddle::CPUPlace(), false), n_a_sel, n_r_sel); + } else if (net_deriv_tensor.place() == paddle::CPUPlace()) { + return ProdForceSeAOpCPUBackward(force_grad_tensor, net_deriv_tensor, + in_deriv_tensor, nlist_tensor, + natoms_tensor, n_a_sel, n_r_sel); + } else { + PD_THROW("No Such kernel for ProdForceSeABackward."); + } +} + +PD_BUILD_GRAD_OP(prod_force_se_a) + .Inputs({paddle::Grad("force"), "net_deriv", "in_deriv", "nlist", "natoms"}) + .Outputs({paddle::Grad("net_deriv")}) + .Attrs({"n_a_sel: int", "n_r_sel: int"}) + .SetKernelFn(PD_KERNEL(ProdForceSeABackward)); diff --git a/source/lib/paddle_src/prod_force_grad.cu b/source/lib/paddle_src/paddle_prod_force_grad.cu similarity index 59% rename from source/lib/paddle_src/prod_force_grad.cu rename to source/lib/paddle_src/paddle_prod_force_grad.cu index a1dad3dc3c..0504f115e6 100644 --- a/source/lib/paddle_src/prod_force_grad.cu +++ b/source/lib/paddle_src/paddle_prod_force_grad.cu @@ -1,11 +1,12 @@ -#include "paddle/extension.h" - #include "device.h" -#include "prod_force_grad.h" #include "gpu_cuda.h" +#include "paddle/extension.h" +#include "prod_force_grad.h" -#define CHECK_INPUT_DIM(x, value) PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".") +#define CHECK_INPUT_DIM(x, value) \ + PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".") #define CHECK_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") +#define CHECK_INPUT_ON_CPU(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") template __device__ inline FPTYPE dev_dot(const FPTYPE* arr1, const FPTYPE* arr2) { @@ -158,118 +159,100 @@ template void prod_force_grad_r_gpu_cuda(double* grad_net, } // namespace deepmd template -void PdProdForceSeAOpCUDABackwardKernel( - int nloc, int nframes, int ndescrpt, int nnei, - const data_t* p_grad, const data_t* p_net_deriv, const data_t* p_in_deriv, - const int* p_nlist, data_t* p_grad_net -) { - for (int_64 kk = 0; 
kk < nframes; ++kk) { - data_t* grad_net = p_grad_net + kk * nloc * ndescrpt; - const data_t* grad = p_grad + kk * nloc * 3; - const data_t* in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3; - const int* nlist = p_nlist + kk * nloc * nnei; - deepmd::prod_force_grad_a_gpu_cuda( - grad_net, grad, in_deriv, nlist, nloc, nnei - ); - } +void ProdForceSeAOpCUDABackwardKernel(int nloc, + int nframes, + int ndescrpt, + int nnei, + const data_t* p_grad, + const data_t* p_net_deriv, + const data_t* p_in_deriv, + const int* p_nlist, + data_t* p_grad_net) { + for (int_64 kk = 0; kk < nframes; ++kk) { + data_t* grad_net = p_grad_net + kk * nloc * ndescrpt; + const data_t* grad = p_grad + kk * nloc * 3; + const data_t* in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3; + const int* nlist = p_nlist + kk * nloc * nnei; + deepmd::prod_force_grad_a_gpu_cuda(grad_net, grad, in_deriv, nlist, nloc, + nnei); + } } +std::vector ProdForceSeAOpCUDABackward( + const paddle::Tensor& force_grad_tensor, + const paddle::Tensor& net_deriv_tensor, + const paddle::Tensor& in_deriv_tensor, + const paddle::Tensor& nlist_tensor, + const paddle::Tensor& natoms_tensor, + int n_a_sel, + int n_r_sel) { + auto grad_shape = force_grad_tensor.shape(); + auto net_deriv_shape = net_deriv_tensor.shape(); + auto in_deriv_shape = in_deriv_tensor.shape(); + auto nlist_shape = nlist_tensor.shape(); + auto natoms_shape = natoms_tensor.shape(); -std::vector PdProdForceSeAOpCUDABackward( - const paddle::Tensor& force_grad_tensor, - const paddle::Tensor& net_deriv_tensor, - const paddle::Tensor& in_deriv_tensor, - const paddle::Tensor& nlist_tensor, - const paddle::Tensor& natoms_tensor, - int n_a_sel, - int n_r_sel -) { - auto grad_shape = force_grad_tensor.shape(); - auto net_deriv_shape = net_deriv_tensor.shape(); - auto in_deriv_shape = in_deriv_tensor.shape(); - auto nlist_shape = nlist_tensor.shape(); - auto natoms_shape = natoms_tensor.shape(); + CHECK_INPUT_DIM(force_grad_tensor, 2); + CHECK_INPUT_DIM(net_deriv_tensor, 2); + CHECK_INPUT_DIM(in_deriv_tensor, 2); + CHECK_INPUT_DIM(nlist_tensor, 2); + CHECK_INPUT_DIM(natoms_tensor, 1); - CHECK_INPUT_DIM(force_grad_tensor, 2); - CHECK_INPUT_DIM(net_deriv_tensor, 2); - CHECK_INPUT_DIM(in_deriv_tensor, 2); - CHECK_INPUT_DIM(nlist_tensor, 2); - CHECK_INPUT_DIM(natoms_tensor, 1); + PD_CHECK(natoms_shape[0] >= 3, + "number of atoms should be larger than (or equal to) 3"); - PD_CHECK(natoms_shape[0] >= 3, "number of atoms should be larger than (or equal to) 3"); + CHECK_INPUT_ON_CPU(natoms_tensor); + const int* natoms = natoms_tensor.data(); + int nframes = net_deriv_shape[0]; + int nloc = natoms[0]; + int ndescrpt = net_deriv_shape[1] / nloc; + int nnei = nlist_shape[1] / nloc; - const int* natoms = nullptr; - // if(natoms_tensor.place() != paddle::PlaceType::kCPU){ - // natoms = natoms_tensor.copy_to(paddle::PlaceType::kCPU).data(); - // }else{ - natoms = natoms_tensor.data(); - // } - int nframes = net_deriv_shape[0]; - int nloc = natoms[0]; - int ndescrpt = net_deriv_shape[1] / nloc; - int nnei = nlist_shape[1] / nloc; + PD_CHECK(nframes == grad_shape[0], "number of frames should match"); + PD_CHECK(nframes == in_deriv_shape[0], "number of samples should match"); + PD_CHECK(nframes == nlist_shape[0], "number of samples should match"); + PD_CHECK(nloc * 3 == grad_shape[1], "input grad shape should be 3 x natoms"); + PD_CHECK(nloc * ndescrpt * 3 == in_deriv_shape[1], + "number of descriptors should match"); + PD_CHECK(nnei == (n_a_sel + n_r_sel), "number of neighbors should match"); - 
PD_CHECK(nframes == grad_shape[0], "number of frames should match"); - PD_CHECK(nframes == in_deriv_shape[0], "number of samples should match"); - PD_CHECK(nframes == nlist_shape[0],"number of samples should match"); - PD_CHECK(nloc * 3 == grad_shape[1], "input grad shape should be 3 x natoms"); - PD_CHECK(nloc * ndescrpt * 3 == in_deriv_shape[1], "number of descriptors should match"); - PD_CHECK(nnei == (n_a_sel + n_r_sel), "number of neighbors should match"); + std::vector grad_net_shape{nframes, nloc * ndescrpt}; + paddle::Tensor grad_net_tensor = paddle::empty( + grad_net_shape, force_grad_tensor.dtype(), force_grad_tensor.place()); - std::vector grad_net_shape {nframes, nloc * ndescrpt}; - // paddle::Tensor grad_net_tensor = paddle::Tensor(paddle::PlaceType::kCPU, grad_net_shape); - paddle::Tensor grad_net_tensor = paddle::empty( - grad_net_shape, - force_grad_tensor.dtype(), - force_grad_tensor.place() - ); - - // if(force_grad_tensor.place() == paddle::PlaceType::kCPU){ - // PD_DISPATCH_FLOATING_TYPES( - // force_grad_tensor.type(), "pd_prod_force_se_a_cpu_backward_kernel", ([&] { - // PdProdForceSeAOpCPUBackwardKernel( - // nloc, nframes, ndescrpt, nnei, - // force_grad_tensor.data(), - // net_deriv_tensor.data(), - // in_deriv_tensor.data(), - // nlist_tensor.data(), - // grad_net_tensor.mutable_data()); - // })); - // }else{ - PD_DISPATCH_FLOATING_TYPES( - force_grad_tensor.type(), "pd_prod_force_se_a_cuda_backward_kernel", ([&] { - PdProdForceSeAOpCUDABackwardKernel( - nloc, nframes, ndescrpt, nnei, - force_grad_tensor.data(), - net_deriv_tensor.data(), - in_deriv_tensor.data(), - nlist_tensor.data(), - grad_net_tensor.mutable_data()); + PD_DISPATCH_FLOATING_TYPES( + force_grad_tensor.type(), "prod_force_se_a_cuda_backward_kernel", ([&] { + ProdForceSeAOpCUDABackwardKernel( + nloc, nframes, ndescrpt, nnei, force_grad_tensor.data(), + net_deriv_tensor.data(), in_deriv_tensor.data(), + nlist_tensor.data(), grad_net_tensor.data()); })); - // } - return {grad_net_tensor}; + return {grad_net_tensor}; } +// std::vector ProdForceSeABackward( +// const paddle::Tensor& force_grad_tensor, +// const paddle::Tensor& net_deriv_tensor, +// const paddle::Tensor& in_deriv_tensor, +// const paddle::Tensor& nlist_tensor, +// const paddle::Tensor& natoms_tensor, +// int n_a_sel, +// int n_r_sel) { +// if (net_deriv_tensor.place() == paddle::GPUPlace()) { +// return ProdForceSeAOpCUDABackward(force_grad_tensor, net_deriv_tensor, +// in_deriv_tensor, nlist_tensor, +// natoms_tensor, n_a_sel, n_r_sel); +// } +// else if (net_deriv_tensor.place() == paddle::CPUPlace()) { +// return ProdForceSeAOpCPUBackward(force_grad_tensor, net_deriv_tensor, +// in_deriv_tensor, nlist_tensor, +// natoms_tensor, n_a_sel, n_r_sel); +// } else { +// PD_THROW("No Such kernel for ProdForceSeABackward."); +// } +// } -std::vector PdProdForceSeABackward( - const paddle::Tensor& force_grad_tensor, - const paddle::Tensor& net_deriv_tensor, - const paddle::Tensor& in_deriv_tensor, - const paddle::Tensor& nlist_tensor, - const paddle::Tensor& natoms_tensor, - int n_a_sel, - int n_r_sel -) { - return PdProdForceSeAOpCUDABackward( - force_grad_tensor, net_deriv_tensor, in_deriv_tensor, - nlist_tensor, natoms_tensor, n_a_sel, n_r_sel - ); -} - - -PD_BUILD_GRAD_OP(prod_force_se_a) - .Inputs({paddle::Grad("force"), "net_deriv", "in_deriv", "nlist", "natoms"}) - .Outputs({paddle::Grad("net_deriv")}) - .Attrs({"n_a_sel: int", "n_r_sel: int"}) - .SetKernelFn(PD_KERNEL(PdProdForceSeABackward)); - +// PD_BUILD_GRAD_OP(prod_force_se_a) 
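Note: the commented-out PD_BUILD_GRAD_OP block kept here is for reference only; the backward op of prod_force_se_a is registered once in paddle_prod_force_grad.cc, which dispatches either to the CPU kernel or to ProdForceSeAOpCUDABackward defined in this .cu file. The same PD_BUILD_OP / PD_BUILD_GRAD_OP pattern is used for prod_virial_se_a further below, so the following is a minimal, hypothetical sketch of how such custom ops could be JIT-compiled and exercised from Python. The module name, the dummy shapes, and the assumption that the deepmd kernel headers and library are already on the compiler's include/link path are illustrative only and not part of this change; the generated Python wrapper is named after the registered op.

# Minimal sketch (assumed names and shapes; not part of this PR).
import paddle
from paddle.utils.cpp_extension import load

# JIT-build the virial forward/backward ops from this PR's sources
# (assumes deepmd's kernel headers/libraries are visible to the compiler).
ops = load(
    name="deepmd_paddle_ops",  # hypothetical module name
    sources=[
        "source/lib/paddle_src/paddle_prod_virial.cc",
        "source/lib/paddle_src/paddle_prod_virial.cu",
        "source/lib/paddle_src/paddle_prod_virial_grad.cc",
        "source/lib/paddle_src/paddle_prod_virial_grad.cu",
    ],
)

# Dummy inputs whose shapes satisfy the PD_CHECKs in the op:
#   net_deriv [nframes, nloc*ndescrpt], in_deriv [nframes, nloc*ndescrpt*3],
#   rij [nframes, nloc*nnei*3], nlist [nframes, nloc*nnei] (int32),
#   natoms (int32, length >= 3, must live on CPU per CHECK_INPUT_ON_CPU).
nframes, nloc, nall, nnei = 1, 192, 192, 138
ndescrpt = nnei * 4  # se_a uses 4 descriptor entries per neighbor
net_deriv = paddle.rand([nframes, nloc * ndescrpt], dtype="float64")
in_deriv = paddle.rand([nframes, nloc * ndescrpt * 3], dtype="float64")
rij = paddle.rand([nframes, nloc * nnei * 3], dtype="float64")
nlist = paddle.zeros([nframes, nloc * nnei], dtype="int32")
natoms = paddle.to_tensor([nloc, nall, nloc], dtype="int32",
                          place=paddle.CPUPlace())

# n_a_sel = nnei, n_r_sel = 0 for the se_a path.
virial, atom_virial = ops.prod_virial_se_a(
    net_deriv, in_deriv, rij, nlist, natoms, nnei, 0)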
+// .Inputs({paddle::Grad("force"), "net_deriv", "in_deriv", "nlist", +// "natoms"}) .Outputs({paddle::Grad("net_deriv")}) .Attrs({"n_a_sel: int", +// "n_r_sel: int"}) .SetKernelFn(PD_KERNEL(ProdForceSeABackward)); diff --git a/source/lib/paddle_src/paddle_prod_virial.cc b/source/lib/paddle_src/paddle_prod_virial.cc new file mode 100644 index 0000000000..299f042694 --- /dev/null +++ b/source/lib/paddle_src/paddle_prod_virial.cc @@ -0,0 +1,155 @@ +#include "paddle/extension.h" +#include "prod_virial.h" + +#define CHECK_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") +#define CHECK_INPUT_DIM(x, value) \ + PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".") +#define CHECK_INPUT_READY(x) \ + PD_CHECK(x.initialized(), #x " must be initialized before usage.") + +template +void ProdVirialSeAOpForwardCPUKernel(int nloc, + int nall, + int ndescrpt, + int nnei, + int nframes, + data_t* p_virial, + data_t* p_atom_virial, + const data_t* p_net_deriv, + const data_t* p_in_deriv, + const data_t* p_rij, + const int* p_nlist) { + for (int kk = 0; kk < nframes; ++kk) { + data_t* virial = p_virial + kk * 9; + data_t* atom_virial = p_atom_virial + kk * nall * 9; + const data_t* net_deriv = p_net_deriv + kk * nloc * ndescrpt; + const data_t* in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3; + const data_t* rij = p_rij + kk * nloc * nnei * 3; + const int* nlist = p_nlist + kk * nloc * nnei; + deepmd::prod_virial_a_cpu(virial, atom_virial, net_deriv, in_deriv, rij, + nlist, nloc, nall, nnei); + } +} + +std::vector ProdVirialSeAOpCPUForward( + const paddle::Tensor& net_deriv_tensor, + const paddle::Tensor& in_deriv_tensor, + const paddle::Tensor& rij_tensor, + const paddle::Tensor& nlist_tensor, + const paddle::Tensor& natoms_tensor, + int n_a_sel, + int n_r_sel) { + CHECK_INPUT(net_deriv_tensor); + CHECK_INPUT(in_deriv_tensor); + CHECK_INPUT(rij_tensor); + CHECK_INPUT(nlist_tensor); + CHECK_INPUT(natoms_tensor); + + CHECK_INPUT_DIM(net_deriv_tensor, 2); + CHECK_INPUT_DIM(in_deriv_tensor, 2); + CHECK_INPUT_DIM(rij_tensor, 2); + CHECK_INPUT_DIM(nlist_tensor, 2); + CHECK_INPUT_DIM(natoms_tensor, 1); + + PD_CHECK(natoms_tensor.shape()[0] >= 3, + "number of atoms should be larger than (or equal to) 3"); + const int* natoms = natoms_tensor.data(); + int nloc = natoms[0]; + int nall = natoms[1]; + int nnei = nlist_tensor.shape()[1] / nloc; + int nframes = net_deriv_tensor.shape()[0]; + int ndescrpt = net_deriv_tensor.shape()[1] / nloc; + + PD_CHECK(nframes == in_deriv_tensor.shape()[0], + "number of samples should match"); + PD_CHECK(nframes == rij_tensor.shape()[0], "number of samples should match"); + PD_CHECK(nframes == nlist_tensor.shape()[0], + "number of samples should match"); + PD_CHECK(nloc * ndescrpt * 3 == in_deriv_tensor.shape()[1], + "number of descriptors should match"); + PD_CHECK((nloc * nnei * 3) == rij_tensor.shape()[1], + "dim of rij should be nnei * 3"); + + std::vector virial_shape{nframes, 9}; + std::vector atom_virial_shape{nframes, 9 * nall}; + paddle::Tensor virial_tensor = paddle::empty( + virial_shape, net_deriv_tensor.dtype(), net_deriv_tensor.place()); + paddle::Tensor atom_virial_tensor = paddle::empty( + atom_virial_shape, net_deriv_tensor.dtype(), net_deriv_tensor.place()); + + PD_DISPATCH_FLOATING_TYPES( + net_deriv_tensor.type(), "prod_virial_se_a_cpu_forward_kernel", ([&] { + ProdVirialSeAOpForwardCPUKernel( + nloc, nall, ndescrpt, nnei, nframes, virial_tensor.data(), + atom_virial_tensor.data(), net_deriv_tensor.data(), + in_deriv_tensor.data(), 
rij_tensor.data(), + nlist_tensor.data()); + })); + + return {virial_tensor, atom_virial_tensor}; +} + +std::vector ProdVirialSeAOpCUDAForward( + const paddle::Tensor& net_deriv_tensor, + const paddle::Tensor& in_deriv_tensor, + const paddle::Tensor& rij_tensor, + const paddle::Tensor& nlist_tensor, + const paddle::Tensor& natoms_tensor, + int n_a_sel, + int n_r_sel); + +std::vector ProdVirialSeAForward( + const paddle::Tensor& net_deriv_tensor, + const paddle::Tensor& in_deriv_tensor, + const paddle::Tensor& rij_tensor, + const paddle::Tensor& nlist_tensor, + const paddle::Tensor& natoms_tensor, + int n_a_sel, + int n_r_sel) { + if (net_deriv_tensor.is_gpu()) { + return ProdVirialSeAOpCUDAForward( + net_deriv_tensor, in_deriv_tensor, rij_tensor, nlist_tensor, + natoms_tensor.copy_to(paddle::CPUPlace(), false), n_a_sel, n_r_sel); + } else if (net_deriv_tensor.is_cpu()) { + return ProdVirialSeAOpCPUForward(net_deriv_tensor, in_deriv_tensor, + rij_tensor, nlist_tensor, natoms_tensor.copy_to(paddle::CPUPlace(), false), + n_a_sel, n_r_sel); + } else { + PD_THROW("Unsupported device type for ProdVirialSeAForward"); + } +} + +std::vector> ProdVirialSeAInferShape( + std::vector net_deriv_shape, + std::vector in_deriv_shape, + std::vector rij_shape, + std::vector nlist_shape, + std::vector natoms_shape, + const int& n_a_sel, + const int& n_r_sel) { + // int64_t nloc = /*natoms[0]*/ 192; + int64_t nall = /*natoms[1]*/ 192; + int64_t nframes = net_deriv_shape[0]; + + std::vector virial_shape = {nframes, 9}; + std::vector atom_virial_shape = {nframes, 9 * nall}; + + return {virial_shape, atom_virial_shape}; +} + +std::vector ProdVirialSeAInferDtype( + paddle::DataType net_deriv_dtype, + paddle::DataType in_deriv_dtype, + paddle::DataType rij_dtype, + paddle::DataType nlist_dtype, + paddle::DataType natoms_dtype) { + return {net_deriv_dtype, net_deriv_dtype}; +} + +PD_BUILD_OP(prod_virial_se_a) + .Inputs({"net_deriv", "in_deriv", "rij", "nlist", "natoms"}) + .Outputs({"virial", "atom_virial"}) + .Attrs({"n_a_sel: int", "n_r_sel: int"}) + .SetKernelFn(PD_KERNEL(ProdVirialSeAForward)) + .SetInferShapeFn(PD_INFER_SHAPE(ProdVirialSeAInferShape)) + .SetInferDtypeFn(PD_INFER_DTYPE(ProdVirialSeAInferDtype)); diff --git a/source/lib/paddle_src/paddle_prod_virial.cu b/source/lib/paddle_src/paddle_prod_virial.cu new file mode 100644 index 0000000000..23af9bbd75 --- /dev/null +++ b/source/lib/paddle_src/paddle_prod_virial.cu @@ -0,0 +1,255 @@ +#include +#include +#include + +#include "device.h" +#include "gpu_cuda.h" +#include "paddle/extension.h" +#include "prod_virial.h" + +#define CHECK_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") +#define CHECK_INPUT_ON_CPU(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") +#define CHECK_INPUT_DIM(x, value) \ + PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".") +#define CHECK_INPUT_READY(x) \ + PD_CHECK(x.initialized(), #x " must be initialized before usage.") + +template +__global__ void atom_virial_reduction(FPTYPE* virial, + const FPTYPE* atom_virial, + const int nall) { + unsigned int bid = blockIdx.x; + unsigned int tid = threadIdx.x; + __shared__ FPTYPE data[THREADS_PER_BLOCK]; + data[tid] = (FPTYPE)0.; + for (int ii = tid; ii < nall; ii += THREADS_PER_BLOCK) { + data[tid] += atom_virial[ii * 9 + bid]; + } + __syncthreads(); + // do reduction in shared memory + for (int ii = THREADS_PER_BLOCK >> 1; ii > 0; ii >>= 1) { + if (tid < ii) { + data[tid] += data[tid + ii]; + } + __syncthreads(); + } + // write result for this block to 
global memory + if (tid == 0) virial[bid] = data[0]; +} + +template +__global__ void virial_deriv_wrt_neighbors_a(FPTYPE* virial, + FPTYPE* atom_virial, + const FPTYPE* net_deriv, + const FPTYPE* in_deriv, + const FPTYPE* rij, + const int* nlist, + const int nloc, + const int nnei) { + // idx -> nloc + // idy -> nnei + // idz = dd0 * 3 + dd1 + // dd0 = idz / 3 + // dd1 = idz % 3 + const int_64 idx = blockIdx.x; + const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x; + const unsigned int idz = threadIdx.y; + const int ndescrpt = nnei * 4; + if (idy >= nnei) { + return; + } + int j_idx = nlist[idx * nnei + idy]; + if (j_idx < 0) { + return; + } + // atomicAdd( + // virial + idz, + // net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + // + idz / 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz % + // 3]); + FPTYPE virial_tmp = (FPTYPE)0.; + for (int idw = 0; idw < 4; ++idw) { + virial_tmp += net_deriv[idx * ndescrpt + idy * 4 + idw] * + rij[idx * nnei * 3 + idy * 3 + idz % 3] * + in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz / 3]; + } + atomicAdd(atom_virial + j_idx * 9 + idz, virial_tmp); +} + +template +__global__ void virial_deriv_wrt_neighbors_r(FPTYPE* virial, + FPTYPE* atom_virial, + const FPTYPE* net_deriv, + const FPTYPE* in_deriv, + const FPTYPE* rij, + const int* nlist, + const int nloc, + const int nnei) { + // idx -> nloc + // idy -> nnei + // idz = dd0 * 3 + dd1 + // dd0 = idz / 3 + // dd1 = idz % 3 + const int_64 idx = blockIdx.x; + const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x; + const unsigned int idz = threadIdx.y; + const int ndescrpt = nnei * 1; + + if (idy >= nnei) { + return; + } + int j_idx = nlist[idx * nnei + idy]; + if (j_idx < 0) { + return; + } + // atomicAdd( + // virial + idz, + // net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 + // + idz / 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz % + // 3]); + atomicAdd(atom_virial + j_idx * 9 + idz, + net_deriv[idx * ndescrpt + idy] * + rij[idx * nnei * 3 + idy * 3 + idz % 3] * + in_deriv[idx * ndescrpt * 3 + idy * 3 + idz / 3]); +} + +namespace deepmd { +template +void prod_virial_a_gpu_cuda(FPTYPE* virial, + FPTYPE* atom_virial, + const FPTYPE* net_deriv, + const FPTYPE* in_deriv, + const FPTYPE* rij, + const int* nlist, + const int nloc, + const int nall, + const int nnei) { + DPErrcheck(cudaMemset(virial, 0, sizeof(FPTYPE) * 9)); + DPErrcheck(cudaMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall)); + + const int LEN = 16; + int nblock = (nnei + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(LEN, 9); + // compute virial of a frame + virial_deriv_wrt_neighbors_a<<>>( + virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nnei); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); + // reduction atom_virial to virial + atom_virial_reduction<<<9, TPB>>>(virial, atom_virial, nall); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); +} + +template +void prod_virial_r_gpu_cuda(FPTYPE* virial, + FPTYPE* atom_virial, + const FPTYPE* net_deriv, + const FPTYPE* in_deriv, + const FPTYPE* rij, + const int* nlist, + const int nloc, + const int nall, + const int nnei) { + DPErrcheck(cudaMemset(virial, 0, sizeof(FPTYPE) * 9)); + DPErrcheck(cudaMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall)); + + const int LEN = 16; + int nblock = (nnei + LEN - 1) / LEN; + dim3 block_grid(nloc, nblock); + dim3 thread_grid(LEN, 9); + // compute virial of a frame + 
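+  // Launch layout: blockIdx.x indexes the local atom, blockIdx.y tiles the
+  // neighbor list in chunks of LEN, and threadIdx.y selects one of the 9
+  // virial components accumulated for that neighbor.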
virial_deriv_wrt_neighbors_r<FPTYPE><<<block_grid, thread_grid>>>(
+      virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nnei);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
+  // reduction atom_virial to virial
+  atom_virial_reduction<FPTYPE, TPB><<<9, TPB>>>(virial, atom_virial, nall);
+  DPErrcheck(cudaGetLastError());
+  DPErrcheck(cudaDeviceSynchronize());
+}
+}  // namespace deepmd
+
+template <typename data_t>
+void ProdVirialSeAOpForwardCUDAKernel(int nloc,
+                                      int nall,
+                                      int ndescrpt,
+                                      int nnei,
+                                      int nframes,
+                                      data_t* p_virial,
+                                      data_t* p_atom_virial,
+                                      const data_t* p_net_deriv,
+                                      const data_t* p_in_deriv,
+                                      const data_t* p_rij,
+                                      const int* p_nlist) {
+  for (int kk = 0; kk < nframes; ++kk) {
+    data_t* virial = p_virial + kk * 9;
+    data_t* atom_virial = p_atom_virial + kk * nall * 9;
+    const data_t* net_deriv = p_net_deriv + kk * nloc * ndescrpt;
+    const data_t* in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3;
+    const data_t* rij = p_rij + kk * nloc * nnei * 3;
+    const int* nlist = p_nlist + kk * nloc * nnei;
+    deepmd::prod_virial_a_gpu_cuda(virial, atom_virial, net_deriv, in_deriv,
+                                   rij, nlist, nloc, nall, nnei);
+  }
+}
+
+std::vector<paddle::Tensor> ProdVirialSeAOpCUDAForward(
+    const paddle::Tensor& net_deriv_tensor,
+    const paddle::Tensor& in_deriv_tensor,
+    const paddle::Tensor& rij_tensor,
+    const paddle::Tensor& nlist_tensor,
+    const paddle::Tensor& natoms_tensor,
+    int n_a_sel,
+    int n_r_sel) {
+  CHECK_INPUT(net_deriv_tensor);
+  CHECK_INPUT(in_deriv_tensor);
+  CHECK_INPUT(rij_tensor);
+  CHECK_INPUT(nlist_tensor);
+  CHECK_INPUT_ON_CPU(natoms_tensor);  // TODO: require a CPU tensor from the
+  // Python side for now; copy_to from GPU returns a pointer with wrong data.
+
+  CHECK_INPUT_DIM(net_deriv_tensor, 2);
+  CHECK_INPUT_DIM(in_deriv_tensor, 2);
+  CHECK_INPUT_DIM(rij_tensor, 2);
+  CHECK_INPUT_DIM(nlist_tensor, 2);
+  CHECK_INPUT_DIM(natoms_tensor, 1);
+
+  PD_CHECK(natoms_tensor.shape()[0] >= 3,
+           "number of atoms should be larger than (or equal to) 3");
+  const int* natoms = natoms_tensor.data<int>();
+  int nloc = natoms[0];
+  int nall = natoms[1];
+  int nnei = nlist_tensor.shape()[1] / nloc;
+  int nframes = net_deriv_tensor.shape()[0];
+  int ndescrpt = net_deriv_tensor.shape()[1] / nloc;
+  PD_CHECK(nframes == in_deriv_tensor.shape()[0],
+           "number of samples should match");
+  PD_CHECK(nframes == rij_tensor.shape()[0], "number of samples should match");
+  PD_CHECK(nframes == nlist_tensor.shape()[0],
+           "number of samples should match");
+  PD_CHECK(nloc * ndescrpt * 3 == in_deriv_tensor.shape()[1],
+           "number of descriptors should match");
+  PD_CHECK((nloc * nnei * 3) == rij_tensor.shape()[1],
+           "dim of rij should be nnei * 3");
+
+  std::vector<int64_t> virial_shape{nframes, 9};
+  std::vector<int64_t> atom_virial_shape{nframes, 9 * nall};
+  paddle::Tensor virial_tensor =
+      paddle::Tensor(paddle::PlaceType::kGPU, virial_shape);
+  paddle::Tensor atom_virial_tensor =
+      paddle::Tensor(paddle::PlaceType::kGPU, atom_virial_shape);
+
+  PD_DISPATCH_FLOATING_TYPES(
+      net_deriv_tensor.type(), "prod_virial_se_a_cuda_forward_kernel", ([&] {
+        ProdVirialSeAOpForwardCUDAKernel<data_t>(
+            nloc, nall, ndescrpt, nnei, nframes,
+            virial_tensor.mutable_data<data_t>(),
+            atom_virial_tensor.mutable_data<data_t>(),
+            net_deriv_tensor.data<data_t>(), in_deriv_tensor.data<data_t>(),
+            rij_tensor.data<data_t>(), nlist_tensor.data<int>());
+      }));
+
+  return {virial_tensor, atom_virial_tensor};
+}
diff --git a/source/lib/paddle_src/paddle_prod_virial_grad.cc b/source/lib/paddle_src/paddle_prod_virial_grad.cc
new file mode 100644
index 0000000000..7bb442b694
--- /dev/null
+++ b/source/lib/paddle_src/paddle_prod_virial_grad.cc
@@ -0,0 +1,147 @@
+#include "paddle/extension.h"
+#include "prod_virial_grad.h"
+
+#define CHECK_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.")
+#define CHECK_INPUT_DIM(x, value) \
+  PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".")
+#define CHECK_INPUT_READY(x) \
+  PD_CHECK(x.initialized(), #x " must be initialized before usage.")
+
+template <typename data_t>
+void ProdVirialSeAOpCPUBackwardKernel(int nloc,
+                                      int nframes,
+                                      int ndescrpt,
+                                      int nnei,
+                                      const data_t* p_grad,
+                                      const data_t* p_net_deriv,
+                                      const data_t* p_in_deriv,
+                                      const data_t* p_rij,
+                                      const int* p_nlist,
+                                      data_t* p_grad_net) {
+  // #pragma omp parallel for
+  // for (int kk = 0; kk < nframes; ++kk) {
+  //   int grad_iter = kk * 9;
+  //   int in_iter = kk * nloc * ndescrpt * 3;
+  //   int rij_iter = kk * nloc * nnei * 3;
+  //   int nlist_iter = kk * nloc * nnei;
+  //   int grad_net_iter = kk * nloc * ndescrpt;
+
+  //   deepmd::prod_virial_grad_a_cpu(&grad_net[grad_net_iter],
+  //                                  &grad[grad_iter],
+  //                                  &in_deriv[in_iter], &rij[rij_iter],
+  //                                  &nlist[nlist_iter], nloc, nnei);
+  // }
+
+  for (int kk = 0; kk < nframes; ++kk) {
+    data_t* grad_net = p_grad_net + kk * nloc * ndescrpt;
+    const data_t* grad = p_grad + kk * 9;
+    const data_t* in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3;
+    const data_t* rij = p_rij + kk * nloc * nnei * 3;
+    const int* nlist = p_nlist + kk * nloc * nnei;
+    deepmd::prod_virial_grad_a_cpu(grad_net, grad, in_deriv, rij, nlist, nloc,
+                                   nnei);
+  }
+}
+
+std::vector<paddle::Tensor> ProdVirialSeAOpCPUBackward(
+    const paddle::Tensor& grad_tensor,
+    const paddle::Tensor& net_deriv_tensor,
+    const paddle::Tensor& in_deriv_tensor,
+    const paddle::Tensor& rij_tensor,
+    const paddle::Tensor& nlist_tensor,
+    const paddle::Tensor& natoms_tensor,
+    int n_a_sel,
+    int n_r_sel) {
+  CHECK_INPUT_READY(grad_tensor);
+  CHECK_INPUT_READY(net_deriv_tensor);
+  CHECK_INPUT_READY(in_deriv_tensor);
+  CHECK_INPUT_READY(rij_tensor);
+  CHECK_INPUT_READY(nlist_tensor);
+  CHECK_INPUT_READY(natoms_tensor);
+
+  auto grad_shape = grad_tensor.shape();
+  auto net_deriv_shape = net_deriv_tensor.shape();
+  auto in_deriv_shape = in_deriv_tensor.shape();
+  auto rij_shape = rij_tensor.shape();
+  auto nlist_shape = nlist_tensor.shape();
+
+  CHECK_INPUT_DIM(grad_tensor, 2);
+  CHECK_INPUT_DIM(net_deriv_tensor, 2);
+  CHECK_INPUT_DIM(in_deriv_tensor, 2);
+  CHECK_INPUT_DIM(rij_tensor, 2);
+  CHECK_INPUT_DIM(nlist_tensor, 2);
+  CHECK_INPUT_DIM(natoms_tensor, 1);
+
+  PD_CHECK(natoms_tensor.shape()[0] >= 3,
+           "number of atoms should be larger than (or equal to) 3");
+
+  const int* natoms = natoms_tensor.data<int>();
+
+  int nframes = net_deriv_shape[0];
+  int nloc = natoms[0];
+  int ndescrpt = net_deriv_shape[1] / nloc;
+  int nnei = nlist_shape[1] / nloc;
+
+  PD_CHECK(nframes == grad_shape[0], "number of frames should match");
+  PD_CHECK(nframes == in_deriv_shape[0], "number of samples should match");
+  PD_CHECK(nframes == rij_shape[0], "number of frames should match");
+  PD_CHECK(nframes == nlist_shape[0], "number of samples should match");
+  PD_CHECK(9 == grad_shape[1], "input grad shape should be nframes x 9");
+  PD_CHECK(nloc * ndescrpt * 3 == in_deriv_shape[1],
+           "number of descriptors should match");
+  PD_CHECK(nloc * nnei * 3 == rij_shape[1], "dim of rij should be nnei * 3");
+  PD_CHECK(nnei == (n_a_sel + n_r_sel), "number of neighbors should match");
+
+  std::vector<int64_t> grad_net_shape{nframes, (int64_t)nloc * ndescrpt};
+  paddle::Tensor grad_net_tensor =
+      paddle::empty(grad_net_shape, grad_tensor.dtype(), grad_tensor.place());
+
+  PD_DISPATCH_FLOATING_TYPES(
+      grad_tensor.type(),
"prod_force_se_a_cpu_backward_kernel", ([&] { + ProdVirialSeAOpCPUBackwardKernel( + nloc, nframes, ndescrpt, nnei, grad_tensor.data(), + net_deriv_tensor.data(), in_deriv_tensor.data(), + rij_tensor.data(), nlist_tensor.data(), + grad_net_tensor.data()); + })); + return {grad_net_tensor}; +} + +std::vector ProdVirialSeAOpCUDABackward( + const paddle::Tensor& virial_grad_tensor, + const paddle::Tensor& net_deriv_tensor, + const paddle::Tensor& in_deriv_tensor, + const paddle::Tensor& rij_tensor, + const paddle::Tensor& nlist_tensor, + const paddle::Tensor& natoms_tensor, + int n_a_sel, + int n_r_sel); + +std::vector ProdVirialSeABackward( + const paddle::Tensor& virial_grad_tensor, + const paddle::Tensor& net_deriv_tensor, + const paddle::Tensor& in_deriv_tensor, + const paddle::Tensor& rij_tensor, + const paddle::Tensor& nlist_tensor, + const paddle::Tensor& natoms_tensor, + int n_a_sel, + int n_r_sel) { + if (virial_grad_tensor.is_gpu()) { + return ProdVirialSeAOpCUDABackward( + virial_grad_tensor, net_deriv_tensor, in_deriv_tensor, rij_tensor, + nlist_tensor, natoms_tensor.copy_to(paddle::CPUPlace(), false), n_a_sel, n_r_sel); + } else if (virial_grad_tensor.is_cpu()) { + return ProdVirialSeAOpCPUBackward(virial_grad_tensor, net_deriv_tensor, + in_deriv_tensor, rij_tensor, nlist_tensor, + natoms_tensor, n_a_sel, n_r_sel); + } else { + PD_THROW("Unsupported device type for ProdVirialSeAForward"); + } +} + +PD_BUILD_GRAD_OP(prod_virial_se_a) + .Inputs({paddle::Grad("virial"), "net_deriv", "in_deriv", "rij", "nlist", + "natoms"}) + .Outputs({paddle::Grad("net_deriv")}) + .Attrs({"n_a_sel: int", "n_r_sel: int"}) + .SetKernelFn(PD_KERNEL(ProdVirialSeABackward)); diff --git a/source/lib/paddle_src/paddle_prod_virial_grad.cu b/source/lib/paddle_src/paddle_prod_virial_grad.cu new file mode 100644 index 0000000000..be0602e30e --- /dev/null +++ b/source/lib/paddle_src/paddle_prod_virial_grad.cu @@ -0,0 +1,191 @@ +#include +#include +#include + +#include "device.h" +#include "gpu_cuda.h" +#include "paddle/extension.h" +#include "prod_virial.h" + +#define CHECK_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") +#define CHECK_INPUT_CPU(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") +#define CHECK_INPUT_DIM(x, value) \ + PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".") +#define CHECK_INPUT_READY(x) \ + PD_CHECK(x.initialized(), #x " must be initialized before usage.") + +template +__device__ inline FPTYPE dev_dot9(const FPTYPE* arr1, const FPTYPE* arr2) { + FPTYPE result = (FPTYPE)0.0; + for (int ii = 0; ii < 9; ii++) { + result += arr1[ii] * arr2[ii]; + } + return result; +} + +template +__global__ void virial_grad_wrt_neighbors_a(FPTYPE* grad_net, + const FPTYPE* grad, + const FPTYPE* env_deriv, + const FPTYPE* rij, + const int* nlist, + const int nloc, + const int nnei) { + // idy -> nnei + const unsigned int tid = threadIdx.x; + const int_64 idx = blockIdx.x * blockDim.x + tid; + const unsigned int idy = blockIdx.y; + const unsigned int idw = threadIdx.y; + const int ndescrpt = nnei * 4; + __shared__ FPTYPE grad_one[9]; + if (tid < 9) { + grad_one[tid] = grad[tid]; + } + __syncthreads(); + if (idx >= nloc) { + return; + } + int j_idx = nlist[idx * nnei + idy]; + if (j_idx < 0) { + return; + } + FPTYPE tmp[9]; + for (int dd0 = 0; dd0 < 3; ++dd0) { + for (int dd1 = 0; dd1 < 3; ++dd1) { + tmp[dd0 * 3 + dd1] = + rij[idx * nnei * 3 + idy * 3 + dd1] * + env_deriv[idx * ndescrpt * 3 + idy * 4 * 3 + idw * 3 + dd0]; + } + } + grad_net[idx * ndescrpt + idy * 4 + idw] -= 
+ (FPTYPE)-1.0 * dev_dot9(grad_one, tmp); +} + +namespace deepmd { +template +void prod_virial_grad_a_gpu_cuda(FPTYPE* grad_net, + const FPTYPE* grad, + const FPTYPE* env_deriv, + const FPTYPE* rij, + const int* nlist, + const int nloc, + const int nnei) { + const int ndescrpt = nnei * 4; + DPErrcheck(cudaMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt)); + const int LEN = 128; + const int nblock = (nloc + LEN - 1) / LEN; + dim3 block_grid(nblock, nnei); + dim3 thread_grid(LEN, 4); + virial_grad_wrt_neighbors_a<<>>( + grad_net, grad, env_deriv, rij, nlist, nloc, nnei); + DPErrcheck(cudaGetLastError()); + DPErrcheck(cudaDeviceSynchronize()); +} + +template void prod_virial_grad_a_gpu_cuda(float* grad_net, + const float* grad, + const float* env_deriv, + const float* rij, + const int* nlist, + const int nloc, + const int nnei); +template void prod_virial_grad_a_gpu_cuda(double* grad_net, + const double* grad, + const double* env_deriv, + const double* rij, + const int* nlist, + const int nloc, + const int nnei); +} // namespace deepmd + +template +void ProdForceSeAOpGPUBackwardKernel(int nloc, + int nframes, + int ndescrpt, + int nnei, + const data_t* virial_grad, + const data_t* net_deriv, + const data_t* in_deriv, + const data_t* rij, + const int* nlist, + data_t* grad_net) { + data_t* p_grad_net = grad_net; + const data_t* p_grad = virial_grad; + const data_t* p_in_deriv = in_deriv; + const data_t* p_rij = rij; + const int* p_nlist = nlist; + for (int_64 kk = 0; kk < nframes; ++kk) { + data_t* grad_net = p_grad_net + kk * nloc * ndescrpt; + const data_t* virial_grad = p_grad + kk * 9; + const data_t* in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3; + const data_t* rij = p_rij + kk * nloc * nnei * 3; + const int* nlist = p_nlist + kk * nloc * nnei; + deepmd::prod_virial_grad_a_gpu_cuda(grad_net, virial_grad, in_deriv, rij, + nlist, nloc, nnei); + } +} + +std::vector ProdVirialSeAOpCUDABackward( + const paddle::Tensor& virial_grad_tensor, + const paddle::Tensor& net_deriv_tensor, + const paddle::Tensor& in_deriv_tensor, + const paddle::Tensor& rij_tensor, + const paddle::Tensor& nlist_tensor, + const paddle::Tensor& natoms_tensor, + int n_a_sel, + int n_r_sel) { + CHECK_INPUT_READY(virial_grad_tensor); + CHECK_INPUT_READY(net_deriv_tensor); + CHECK_INPUT_READY(in_deriv_tensor); + CHECK_INPUT_READY(rij_tensor); + CHECK_INPUT_READY(nlist_tensor); + CHECK_INPUT_READY(natoms_tensor); + + auto grad_shape = virial_grad_tensor.shape(); + auto net_deriv_shape = net_deriv_tensor.shape(); + auto in_deriv_shape = in_deriv_tensor.shape(); + auto rij_shape = rij_tensor.shape(); + auto nlist_shape = nlist_tensor.shape(); + auto natoms_shape = natoms_tensor.shape(); + + CHECK_INPUT_DIM(virial_grad_tensor, 2); + CHECK_INPUT_DIM(net_deriv_tensor, 2); + CHECK_INPUT_DIM(in_deriv_tensor, 2); + CHECK_INPUT_DIM(rij_tensor, 2); + CHECK_INPUT_DIM(nlist_tensor, 2); + CHECK_INPUT_DIM(natoms_tensor, 1); + + PD_CHECK(natoms_shape[0] >= 3, + "number of atoms should be larger than (or equal to) 3"); + + CHECK_INPUT_CPU(natoms_tensor); + const int* natoms = natoms_tensor.data(); + int nframes = net_deriv_shape[0]; + int nloc = natoms[0]; + int ndescrpt = net_deriv_shape[1] / nloc; + int nnei = nlist_shape[1] / nloc; + + PD_CHECK(nframes == grad_shape[0], "number of frames should match"); + PD_CHECK(nframes == in_deriv_shape[0], "number of samples should match"); + PD_CHECK(nframes == rij_shape[0], "number of frames should match"); + PD_CHECK(nframes == nlist_shape[0], "number of samples should match"); + PD_CHECK(9 
== grad_shape[1], "input grad shape should be 3 x natoms"); + PD_CHECK(nloc * ndescrpt * 3 == in_deriv_shape[1], + "number of descriptors should match"); + PD_CHECK(nloc * nnei * 3 == rij_shape[1], "dim of rij should be nnei * 3"); + PD_CHECK(nnei == (n_a_sel + n_r_sel), "number of neighbors should match"); + + std::vector grad_net_shape{nframes, nloc * ndescrpt}; + paddle::Tensor grad_net_tensor = paddle::empty( + grad_net_shape, virial_grad_tensor.dtype(), virial_grad_tensor.place()); + + PD_DISPATCH_FLOATING_TYPES( + virial_grad_tensor.type(), "prod_force_se_a_cuda_backward_kernel", ([&] { + ProdForceSeAOpGPUBackwardKernel( + nloc, nframes, ndescrpt, nnei, virial_grad_tensor.data(), + net_deriv_tensor.data(), in_deriv_tensor.data(), + rij_tensor.data(), nlist_tensor.data(), + grad_net_tensor.data()); + })); + return {grad_net_tensor}; +} diff --git a/source/lib/paddle_src/prod_env_mat.cc b/source/lib/paddle_src/prod_env_mat.cc deleted file mode 100644 index 7ebfd6cdc7..0000000000 --- a/source/lib/paddle_src/prod_env_mat.cc +++ /dev/null @@ -1,321 +0,0 @@ -#include "prod_env_mat.h" - -#include - -#include -#include - -#include "env_mat.h" -#include "fmt_nlist.h" - -using namespace deepmd; - -template -void deepmd::prod_env_mat_a_cpu(FPTYPE *em, - FPTYPE *em_deriv, - FPTYPE *rij, - int *nlist, - const FPTYPE *coord, - const int *type, - const InputNlist &inlist, - const int max_nbor_size, - const FPTYPE *avg, - const FPTYPE *std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec, - const int *f_type) { - if (f_type == NULL) { - f_type = type; - } - const int nnei = sec.back(); - const int nem = nnei * 4; - - // set & normalize coord - std::vector d_coord3(nall * 3); - for (int ii = 0; ii < nall; ++ii) { - for (int dd = 0; dd < 3; ++dd) { - d_coord3[ii * 3 + dd] = coord[ii * 3 + dd]; - } - } - - // set type - std::vector d_f_type(nall); - for (int ii = 0; ii < nall; ++ii) { - d_f_type[ii] = f_type[ii]; - } - - // build nlist - std::vector > d_nlist_a(nloc); - - assert(nloc == inlist.inum); - for (unsigned ii = 0; ii < nloc; ++ii) { - d_nlist_a[ii].reserve(max_nbor_size); - } - for (unsigned ii = 0; ii < nloc; ++ii) { - int i_idx = inlist.ilist[ii]; - for (unsigned jj = 0; jj < inlist.numneigh[ii]; ++jj) { - int j_idx = inlist.firstneigh[ii][jj]; - d_nlist_a[i_idx].push_back(j_idx); - } - } - -#pragma omp parallel for - for (int ii = 0; ii < nloc; ++ii) { - std::vector fmt_nlist_a; - int ret = format_nlist_i_cpu(fmt_nlist_a, d_coord3, d_f_type, ii, - d_nlist_a[ii], rcut, sec); - std::vector d_em_a; - std::vector d_em_a_deriv; - std::vector d_em_r; - std::vector d_em_r_deriv; - std::vector d_rij_a; - env_mat_a_cpu(d_em_a, d_em_a_deriv, d_rij_a, d_coord3, d_f_type, ii, - fmt_nlist_a, sec, rcut_smth, rcut); - - // check sizes - assert(d_em_a.size() == nem); - assert(d_em_a_deriv.size() == nem * 3); - assert(d_rij_a.size() == nnei * 3); - assert(fmt_nlist_a.size() == nnei); - // record outputs - for (int jj = 0; jj < nem; ++jj) { - if (type[ii] >= 0) { - em[ii * nem + jj] = - (d_em_a[jj] - avg[type[ii] * nem + jj]) / std[type[ii] * nem + jj]; - } else { - em[ii * nem + jj] = 0; - } - } - for (int jj = 0; jj < nem * 3; ++jj) { - if (type[ii] >= 0) { - em_deriv[ii * nem * 3 + jj] = - d_em_a_deriv[jj] / std[type[ii] * nem + jj / 3]; - } else { - em_deriv[ii * nem * 3 + jj] = 0; - } - } - for (int jj = 0; jj < nnei * 3; ++jj) { - rij[ii * nnei * 3 + jj] = d_rij_a[jj]; - } - for (int jj = 0; jj < nnei; ++jj) { - nlist[ii * nnei + jj] = 
fmt_nlist_a[jj]; - } - } -} - -template -void deepmd::prod_env_mat_r_cpu(FPTYPE *em, - FPTYPE *em_deriv, - FPTYPE *rij, - int *nlist, - const FPTYPE *coord, - const int *type, - const InputNlist &inlist, - const int max_nbor_size, - const FPTYPE *avg, - const FPTYPE *std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec) { - const int nnei = sec.back(); - const int nem = nnei * 1; - - // set & normalize coord - std::vector d_coord3(nall * 3); - for (int ii = 0; ii < nall; ++ii) { - for (int dd = 0; dd < 3; ++dd) { - d_coord3[ii * 3 + dd] = coord[ii * 3 + dd]; - } - } - - // set type - std::vector d_type(nall); - for (int ii = 0; ii < nall; ++ii) { - d_type[ii] = type[ii]; - } - - // build nlist - std::vector > d_nlist_a(nloc); - - assert(nloc == inlist.inum); - for (unsigned ii = 0; ii < nloc; ++ii) { - d_nlist_a[ii].reserve(max_nbor_size); - } - for (unsigned ii = 0; ii < nloc; ++ii) { - int i_idx = inlist.ilist[ii]; - for (unsigned jj = 0; jj < inlist.numneigh[ii]; ++jj) { - int j_idx = inlist.firstneigh[ii][jj]; - d_nlist_a[i_idx].push_back(j_idx); - } - } - -#pragma omp parallel for - for (int ii = 0; ii < nloc; ++ii) { - std::vector fmt_nlist_a; - int ret = format_nlist_i_cpu(fmt_nlist_a, d_coord3, d_type, ii, - d_nlist_a[ii], rcut, sec); - std::vector d_em_a; - std::vector d_em_a_deriv; - std::vector d_em_r; - std::vector d_em_r_deriv; - std::vector d_rij_a; - env_mat_r_cpu(d_em_a, d_em_a_deriv, d_rij_a, d_coord3, d_type, ii, - fmt_nlist_a, sec, rcut_smth, rcut); - - // check sizes - assert(d_em_a.size() == nem); - assert(d_em_a_deriv.size() == nem * 3); - assert(d_rij_a.size() == nnei * 3); - assert(fmt_nlist_a.size() == nnei); - // record outputs - for (int jj = 0; jj < nem; ++jj) { - em[ii * nem + jj] = (d_em_a[jj] - avg[d_type[ii] * nem + jj]) / - std[d_type[ii] * nem + jj]; - } - for (int jj = 0; jj < nem * 3; ++jj) { - em_deriv[ii * nem * 3 + jj] = - d_em_a_deriv[jj] / std[d_type[ii] * nem + jj / 3]; - } - for (int jj = 0; jj < nnei * 3; ++jj) { - rij[ii * nnei * 3 + jj] = d_rij_a[jj]; - } - for (int jj = 0; jj < nnei; ++jj) { - nlist[ii * nnei + jj] = fmt_nlist_a[jj]; - } - } -} - -template void deepmd::prod_env_mat_a_cpu(double *em, - double *em_deriv, - double *rij, - int *nlist, - const double *coord, - const int *type, - const InputNlist &inlist, - const int max_nbor_size, - const double *avg, - const double *std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec, - const int *f_type); - -template void deepmd::prod_env_mat_a_cpu(float *em, - float *em_deriv, - float *rij, - int *nlist, - const float *coord, - const int *type, - const InputNlist &inlist, - const int max_nbor_size, - const float *avg, - const float *std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec, - const int *f_type); - -template void deepmd::prod_env_mat_r_cpu(double *em, - double *em_deriv, - double *rij, - int *nlist, - const double *coord, - const int *type, - const InputNlist &inlist, - const int max_nbor_size, - const double *avg, - const double *std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec); - -template void deepmd::prod_env_mat_r_cpu(float *em, - float *em_deriv, - float *rij, - int *nlist, - const float *coord, - const int *type, - const InputNlist &inlist, - const int max_nbor_size, - const float *avg, - const float *std, - const int nloc, - const int nall, - 
const float rcut, - const float rcut_smth, - const std::vector sec); - -// #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM -void deepmd::env_mat_nbor_update(InputNlist &inlist, - InputNlist &gpu_inlist, - int &max_nbor_size, - int *&nbor_list_dev, - const int *mesh, - const int size) { - int *mesh_host = new int[size]; - memcpy_device_to_host(mesh, mesh_host, size); - memcpy(&inlist.ilist, 4 + mesh_host, sizeof(int *)); - memcpy(&inlist.numneigh, 8 + mesh_host, sizeof(int *)); - memcpy(&inlist.firstneigh, 12 + mesh_host, sizeof(int **)); - const int ago = mesh_host[0]; - if (ago == 0 || gpu_inlist.inum < inlist.inum) { - const int inum = inlist.inum; - if (gpu_inlist.inum < inum) { - delete_device_memory(gpu_inlist.ilist); - delete_device_memory(gpu_inlist.numneigh); - delete_device_memory(gpu_inlist.firstneigh); - malloc_device_memory(gpu_inlist.ilist, inum); - malloc_device_memory(gpu_inlist.numneigh, inum); - malloc_device_memory(gpu_inlist.firstneigh, inum); - } - memcpy_host_to_device(gpu_inlist.ilist, inlist.ilist, inum); - memcpy_host_to_device(gpu_inlist.numneigh, inlist.numneigh, inum); - int _max_nbor_size = max_numneigh(inlist); - if (_max_nbor_size <= 256) { - _max_nbor_size = 256; - } else if (_max_nbor_size <= 512) { - _max_nbor_size = 512; - } else if (_max_nbor_size <= 1024) { - _max_nbor_size = 1024; - } else if (_max_nbor_size <= 2048) { - _max_nbor_size = 2048; - } else { - _max_nbor_size = 4096; - } - if (nbor_list_dev == NULL || _max_nbor_size > max_nbor_size || - inum > gpu_inlist.inum) { - delete_device_memory(nbor_list_dev); - malloc_device_memory(nbor_list_dev, inum * _max_nbor_size); - } - // update info - gpu_inlist.inum = inum; - max_nbor_size = _max_nbor_size; - - // copy nbor list from host to the device - std::vector nbor_list_host(inum * max_nbor_size, 0); - int **_firstneigh = (int **)malloc(sizeof(int *) * inum); - for (int ii = 0; ii < inum; ii++) { - _firstneigh[ii] = nbor_list_dev + ii * max_nbor_size; - for (int jj = 0; jj < inlist.numneigh[ii]; jj++) { - nbor_list_host[ii * max_nbor_size + jj] = inlist.firstneigh[ii][jj]; - } - } - memcpy_host_to_device(nbor_list_dev, &nbor_list_host[0], - inum * max_nbor_size); - memcpy_host_to_device(gpu_inlist.firstneigh, _firstneigh, inum); - free(_firstneigh); - } - delete[] mesh_host; -} -// #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/source/lib/paddle_src/prod_env_mat.h b/source/lib/paddle_src/prod_env_mat.h deleted file mode 100644 index 3052dd2230..0000000000 --- a/source/lib/paddle_src/prod_env_mat.h +++ /dev/null @@ -1,140 +0,0 @@ -#pragma once -#include - -#include "device.h" -#include "neighbor_list.h" - -namespace deepmd { - -template -void prod_env_mat_a_cpu(FPTYPE *em, - FPTYPE *em_deriv, - FPTYPE *rij, - int *nlist, - const FPTYPE *coord, - const int *type, - const InputNlist &inlist, - const int max_nbor_size, - const FPTYPE *avg, - const FPTYPE *std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec, - const int *f_type = NULL); - -template -void prod_env_mat_r_cpu(FPTYPE *em, - FPTYPE *em_deriv, - FPTYPE *rij, - int *nlist, - const FPTYPE *coord, - const int *type, - const InputNlist &inlist, - const int max_nbor_size, - const FPTYPE *avg, - const FPTYPE *std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec); - -#if GOOGLE_CUDA -template -void prod_env_mat_a_gpu_cuda(FPTYPE *em, - FPTYPE *em_deriv, - FPTYPE *rij, - int *nlist, - const FPTYPE *coord, - const int *type, - const 
InputNlist &gpu_inlist, - int *array_int, - unsigned long long *array_longlong, - const int max_nbor_size, - const FPTYPE *avg, - const FPTYPE *std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec, - const int *f_type = NULL); - -template -void prod_env_mat_r_gpu_cuda(FPTYPE *em, - FPTYPE *em_deriv, - FPTYPE *rij, - int *nlist, - const FPTYPE *coord, - const int *type, - const InputNlist &gpu_inlist, - int *array_int, - unsigned long long *array_longlong, - const int max_nbor_size, - const FPTYPE *avg, - const FPTYPE *std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec); - -void env_mat_nbor_update(InputNlist &inlist, - InputNlist &gpu_inlist, - int &max_nbor_size, - int *&nbor_list_dev, - const int *mesh, - const int size); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM -template -void prod_env_mat_a_gpu_rocm(FPTYPE *em, - FPTYPE *em_deriv, - FPTYPE *rij, - int *nlist, - const FPTYPE *coord, - const int *type, - const InputNlist &gpu_inlist, - int *array_int, - unsigned long long *array_longlong, - const int max_nbor_size, - const FPTYPE *avg, - const FPTYPE *std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec, - const int *f_type = NULL); - -template -void prod_env_mat_r_gpu_rocm(FPTYPE *em, - FPTYPE *em_deriv, - FPTYPE *rij, - int *nlist, - const FPTYPE *coord, - const int *type, - const InputNlist &gpu_inlist, - int *array_int, - unsigned long long *array_longlong, - const int max_nbor_size, - const FPTYPE *avg, - const FPTYPE *std, - const int nloc, - const int nall, - const float rcut, - const float rcut_smth, - const std::vector sec); - -void env_mat_nbor_update(InputNlist &inlist, - InputNlist &gpu_inlist, - int &max_nbor_size, - int *&nbor_list_dev, - const int *mesh, - const int size); -#endif // TENSORFLOW_USE_ROCM - -} // namespace deepmd diff --git a/source/lib/paddle_src/prod_virial.cc b/source/lib/paddle_src/prod_virial.cc deleted file mode 100644 index 8769ccf8f1..0000000000 --- a/source/lib/paddle_src/prod_virial.cc +++ /dev/null @@ -1,219 +0,0 @@ -#include "custom_op.h" - -REGISTER_OP("ProdVirial") - .Attr("T: {float, double} = DT_DOUBLE") - .Input("net_deriv: T") - .Input("in_deriv: T") - .Input("rij: T") - .Input("nlist: int32") - .Input("axis: int32") - .Input("natoms: int32") - .Attr("n_a_sel: int") - .Attr("n_r_sel: int") - .Output("virial: T") - .Output("atom_virial: T"); - -using namespace tensorflow; - -using CPUDevice = Eigen::ThreadPoolDevice; - -template -class ProdVirialOp : public OpKernel { - public: - explicit ProdVirialOp(OpKernelConstruction* context) : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("n_a_sel", &n_a_sel)); - OP_REQUIRES_OK(context, context->GetAttr("n_r_sel", &n_r_sel)); - n_a_shift = n_a_sel * 4; - } - - void Compute(OpKernelContext* context) override { - deepmd::safe_compute( - context, [this](OpKernelContext* context) { this->_Compute(context); }); - } - - void _Compute(OpKernelContext* context) { - // Grab the input tensor - const Tensor& net_deriv_tensor = context->input(0); - const Tensor& in_deriv_tensor = context->input(1); - const Tensor& rij_tensor = context->input(2); - const Tensor& nlist_tensor = context->input(3); - const Tensor& axis_tensor = context->input(4); - const Tensor& natoms_tensor = context->input(5); - - // set size of the sample - OP_REQUIRES(context, (net_deriv_tensor.shape().dims() == 2), - errors::InvalidArgument("Dim of 
net deriv should be 2")); - OP_REQUIRES(context, (in_deriv_tensor.shape().dims() == 2), - errors::InvalidArgument("Dim of input deriv should be 2")); - OP_REQUIRES(context, (rij_tensor.shape().dims() == 2), - errors::InvalidArgument("Dim of rij should be 2")); - OP_REQUIRES(context, (nlist_tensor.shape().dims() == 2), - errors::InvalidArgument("Dim of nlist should be 2")); - OP_REQUIRES(context, (axis_tensor.shape().dims() == 2), - errors::InvalidArgument("Dim of axis should be 2")); - OP_REQUIRES(context, (natoms_tensor.shape().dims() == 1), - errors::InvalidArgument("Dim of natoms should be 1")); - - OP_REQUIRES(context, (natoms_tensor.shape().dim_size(0) >= 3), - errors::InvalidArgument( - "number of atoms should be larger than (or equal to) 3")); - auto natoms = natoms_tensor.flat(); - - int nframes = net_deriv_tensor.shape().dim_size(0); - int nloc = natoms(0); - int nall = natoms(1); - int ndescrpt = net_deriv_tensor.shape().dim_size(1) / nloc; - int nnei = nlist_tensor.shape().dim_size(1) / nloc; - - // check the sizes - OP_REQUIRES(context, (nframes == in_deriv_tensor.shape().dim_size(0)), - errors::InvalidArgument("number of samples should match")); - OP_REQUIRES(context, (nframes == rij_tensor.shape().dim_size(0)), - errors::InvalidArgument("number of samples should match")); - OP_REQUIRES(context, (nframes == nlist_tensor.shape().dim_size(0)), - errors::InvalidArgument("number of samples should match")); - OP_REQUIRES(context, (nframes == axis_tensor.shape().dim_size(0)), - errors::InvalidArgument("number of samples should match")); - - OP_REQUIRES(context, - (nloc * ndescrpt * 12 == in_deriv_tensor.shape().dim_size(1)), - errors::InvalidArgument("number of descriptors should match")); - OP_REQUIRES(context, (nloc * nnei * 3 == rij_tensor.shape().dim_size(1)), - errors::InvalidArgument("dim of rij should be nnei * 3")); - OP_REQUIRES(context, (nnei == n_a_sel + n_r_sel), - errors::InvalidArgument("number of neighbors should match")); - OP_REQUIRES( - context, (nloc * 4 == axis_tensor.shape().dim_size(1)), - errors::InvalidArgument("number of axis type+id should be 2+2")); - - // Create an output tensor - TensorShape virial_shape; - virial_shape.AddDim(nframes); - virial_shape.AddDim(9); - Tensor* virial_tensor = NULL; - OP_REQUIRES_OK(context, - context->allocate_output(0, virial_shape, &virial_tensor)); - TensorShape atom_virial_shape; - atom_virial_shape.AddDim(nframes); - atom_virial_shape.AddDim(9 * nall); - Tensor* atom_virial_tensor = NULL; - OP_REQUIRES_OK(context, context->allocate_output(1, atom_virial_shape, - &atom_virial_tensor)); - - // flat the tensors - auto net_deriv = net_deriv_tensor.flat(); - auto in_deriv = in_deriv_tensor.flat(); - auto rij = rij_tensor.flat(); - auto nlist = nlist_tensor.flat(); - auto axis = axis_tensor.flat(); - auto virial = virial_tensor->flat(); - auto atom_virial = atom_virial_tensor->flat(); - - // loop over samples -#pragma omp parallel for - for (int kk = 0; kk < nframes; ++kk) { - int net_iter = kk * nloc * ndescrpt; - int in_iter = kk * nloc * ndescrpt * 12; - int rij_iter = kk * nloc * nnei * 3; - int nlist_iter = kk * nloc * nnei; - int axis_iter = kk * nloc * 4; - int virial_iter = kk * 9; - int atom_virial_iter = kk * nall * 9; - - for (int ii = 0; ii < 9; ++ii) { - virial(virial_iter + ii) = 0.; - } - for (int ii = 0; ii < 9 * nall; ++ii) { - atom_virial(atom_virial_iter + ii) = 0.; - } - - // compute virial of a frame - for (int ii = 0; ii < nloc; ++ii) { - int i_idx = ii; - - // set axes - int axis0_type = axis(axis_iter + 
i_idx * 4 + 0); - int axis1_type = axis(axis_iter + i_idx * 4 + 2); - int axis_0 = axis(axis_iter + i_idx * 4 + 1); - int axis_1 = axis(axis_iter + i_idx * 4 + 3); - if (axis0_type == 1) axis_0 += n_a_sel; - if (axis1_type == 1) axis_1 += n_a_sel; - - // deriv wrt neighbors - for (int jj = 0; jj < nnei; ++jj) { - int j_idx = nlist(nlist_iter + i_idx * nnei + jj); - if (j_idx < 0) continue; - if (jj == axis_0) { - for (int aa = 0; aa < ndescrpt; ++aa) { - FPTYPE pref = -1.0 * net_deriv(net_iter + i_idx * ndescrpt + aa); - for (int dd0 = 0; dd0 < 3; ++dd0) { - for (int dd1 = 0; dd1 < 3; ++dd1) { - FPTYPE tmp_v = - pref * rij(rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * - in_deriv(in_iter + i_idx * ndescrpt * 12 + aa * 12 + 3 + - dd0); - virial(virial_iter + dd0 * 3 + dd1) += tmp_v; - atom_virial(atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) += - tmp_v; - } - } - } - } else if (jj == axis_1) { - for (int aa = 0; aa < ndescrpt; ++aa) { - FPTYPE pref = -1.0 * net_deriv(net_iter + i_idx * ndescrpt + aa); - for (int dd0 = 0; dd0 < 3; ++dd0) { - for (int dd1 = 0; dd1 < 3; ++dd1) { - FPTYPE tmp_v = - pref * rij(rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * - in_deriv(in_iter + i_idx * ndescrpt * 12 + aa * 12 + 6 + - dd0); - virial(virial_iter + dd0 * 3 + dd1) += tmp_v; - atom_virial(atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) += - tmp_v; - } - } - } - } else { - int aa_start, aa_end; - make_descript_range(aa_start, aa_end, jj); - for (int aa = aa_start; aa < aa_end; ++aa) { - FPTYPE pref = -1.0 * net_deriv(net_iter + i_idx * ndescrpt + aa); - for (int dd0 = 0; dd0 < 3; ++dd0) { - for (int dd1 = 0; dd1 < 3; ++dd1) { - FPTYPE tmp_v = - pref * rij(rij_iter + i_idx * nnei * 3 + jj * 3 + dd1) * - in_deriv(in_iter + i_idx * ndescrpt * 12 + aa * 12 + 9 + - dd0); - virial(virial_iter + dd0 * 3 + dd1) += tmp_v; - atom_virial(atom_virial_iter + j_idx * 9 + dd0 * 3 + dd1) += - tmp_v; - } - } - } - } - } - } - } - } - - private: - int n_r_sel, n_a_sel, n_a_shift; - inline void make_descript_range(int& idx_start, - int& idx_end, - const int& nei_idx) { - if (nei_idx < n_a_sel) { - idx_start = nei_idx * 4; - idx_end = nei_idx * 4 + 4; - } else { - idx_start = n_a_shift + (nei_idx - n_a_sel); - idx_end = n_a_shift + (nei_idx - n_a_sel) + 1; - } - } -}; - -#define REGISTER_CPU(T) \ - REGISTER_KERNEL_BUILDER( \ - Name("ProdVirial").Device(DEVICE_CPU).TypeConstraint("T"), \ - ProdVirialOp); -REGISTER_CPU(float); -REGISTER_CPU(double); diff --git a/source/lib/paddle_src/prod_virial.cu b/source/lib/paddle_src/prod_virial.cu deleted file mode 100644 index fe7abee63b..0000000000 --- a/source/lib/paddle_src/prod_virial.cu +++ /dev/null @@ -1,496 +0,0 @@ -#include -#include -#include -#include "paddle/extension.h" - -#include "device.h" -#include "prod_virial.h" -#include "gpu_cuda.h" - -#define CHECK_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") -#define CHECK_INPUT_DIM(x, value) PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".") -#define CHECK_INPUT_READY(x) PD_CHECK(x.IsInitialized(), #x " must be initialized before usage.") - -template -__global__ void atom_virial_reduction(FPTYPE* virial, - const FPTYPE* atom_virial, - const int nall) { - unsigned int bid = blockIdx.x; - unsigned int tid = threadIdx.x; - __shared__ FPTYPE data[THREADS_PER_BLOCK]; - data[tid] = (FPTYPE)0.; - for (int ii = tid; ii < nall; ii += THREADS_PER_BLOCK) { - data[tid] += atom_virial[ii * 9 + bid]; - } - __syncthreads(); - // do reduction in shared memory - for (int ii = THREADS_PER_BLOCK >> 1; ii > 0; ii >>= 
1) { - if (tid < ii) { - data[tid] += data[tid + ii]; - } - __syncthreads(); - } - // write result for this block to global memory - if (tid == 0) virial[bid] = data[0]; -} - -template -__global__ void virial_deriv_wrt_neighbors_a(FPTYPE* virial, - FPTYPE* atom_virial, - const FPTYPE* net_deriv, - const FPTYPE* in_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei) { - // idx -> nloc - // idy -> nnei - // idz = dd0 * 3 + dd1 - // dd0 = idz / 3 - // dd1 = idz % 3 - const int_64 idx = blockIdx.x; - const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x; - const unsigned int idz = threadIdx.y; - const int ndescrpt = nnei * 4; - if (idy >= nnei) { - return; - } - int j_idx = nlist[idx * nnei + idy]; - if (j_idx < 0) { - return; - } - // atomicAdd( - // virial + idz, - // net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 - // + idz / 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz % - // 3]); - FPTYPE virial_tmp = (FPTYPE)0.; - for (int idw = 0; idw < 4; ++idw) { - virial_tmp += net_deriv[idx * ndescrpt + idy * 4 + idw] * - rij[idx * nnei * 3 + idy * 3 + idz % 3] * - in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz / 3]; - } - atomicAdd(atom_virial + j_idx * 9 + idz, virial_tmp); -} - -template -__global__ void virial_deriv_wrt_neighbors_r(FPTYPE* virial, - FPTYPE* atom_virial, - const FPTYPE* net_deriv, - const FPTYPE* in_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei) { - // idx -> nloc - // idy -> nnei - // idz = dd0 * 3 + dd1 - // dd0 = idz / 3 - // dd1 = idz % 3 - const int_64 idx = blockIdx.x; - const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x; - const unsigned int idz = threadIdx.y; - const int ndescrpt = nnei * 1; - - if (idy >= nnei) { - return; - } - int j_idx = nlist[idx * nnei + idy]; - if (j_idx < 0) { - return; - } - // atomicAdd( - // virial + idz, - // net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 - // + idz / 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz % - // 3]); - atomicAdd(atom_virial + j_idx * 9 + idz, - net_deriv[idx * ndescrpt + idy] * - rij[idx * nnei * 3 + idy * 3 + idz % 3] * - in_deriv[idx * ndescrpt * 3 + idy * 3 + idz / 3]); -} - -namespace deepmd { -template -void prod_virial_a_gpu_cuda(FPTYPE* virial, - FPTYPE* atom_virial, - const FPTYPE* net_deriv, - const FPTYPE* in_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei) { - DPErrcheck(cudaMemset(virial, 0, sizeof(FPTYPE) * 9)); - DPErrcheck(cudaMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall)); - - const int LEN = 16; - int nblock = (nnei + LEN - 1) / LEN; - dim3 block_grid(nloc, nblock); - dim3 thread_grid(LEN, 9); - // compute virial of a frame - virial_deriv_wrt_neighbors_a<<>>( - virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nnei); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); - // reduction atom_virial to virial - atom_virial_reduction<<<9, TPB>>>(virial, atom_virial, nall); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); -} - -template -void prod_virial_r_gpu_cuda(FPTYPE* virial, - FPTYPE* atom_virial, - const FPTYPE* net_deriv, - const FPTYPE* in_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei) { - DPErrcheck(cudaMemset(virial, 0, sizeof(FPTYPE) * 9)); - DPErrcheck(cudaMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall)); - - const int LEN = 16; - int nblock = (nnei + 
LEN - 1) / LEN; - dim3 block_grid(nloc, nblock); - dim3 thread_grid(LEN, 9); - // compute virial of a frame - virial_deriv_wrt_neighbors_r<<>>( - virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nnei); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); - // reduction atom_virial to virial - atom_virial_reduction<<<9, TPB>>>(virial, atom_virial, nall); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); -} -} // namespace deepmd - -template -void PdProdVirialSeAOpForwardCUDAKernel( - int nloc, int nall, int ndescrpt, int nnei, int nframes, - data_t* p_virial, data_t* p_atom_virial, const data_t* p_net_deriv, - const data_t* p_in_deriv, const data_t* p_rij, const int* p_nlist){ - - for(int kk = 0; kk < nframes; ++kk){ - data_t * virial = p_virial + kk * 9; - data_t * atom_virial = p_atom_virial + kk * nall * 9; - const data_t * net_deriv = p_net_deriv + kk * nloc * ndescrpt; - const data_t * in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3; - const data_t * rij = p_rij + kk * nloc * nnei * 3; - const int * nlist = p_nlist + kk * nloc * nnei; - deepmd::prod_virial_a_gpu_cuda( - virial, atom_virial, - net_deriv, in_deriv, rij, nlist, nloc, nall, nnei); - } -} - - -std::vector PdProdVirialSeAOpCUDAForward( -const paddle::Tensor& net_deriv_tensor, -const paddle::Tensor& in_deriv_tensor, -const paddle::Tensor& rij_tensor, -const paddle::Tensor& nlist_tensor, -const paddle::Tensor& natoms_tensor, -int n_a_sel, -int n_r_sel -){ - CHECK_INPUT(net_deriv_tensor); - CHECK_INPUT(in_deriv_tensor); - CHECK_INPUT(rij_tensor); - CHECK_INPUT(nlist_tensor); - // CHECK_INPUT(natoms_tensor); // TODO: 暂时指定python端必须为cpu,gpu的copy_to会导致返回的指针数据不对 - - CHECK_INPUT_DIM(net_deriv_tensor, 2); - CHECK_INPUT_DIM(in_deriv_tensor, 2); - CHECK_INPUT_DIM(rij_tensor, 2); - CHECK_INPUT_DIM(nlist_tensor, 2); - CHECK_INPUT_DIM(natoms_tensor, 1); - - PD_CHECK(natoms_tensor.shape()[0] >= 3, "number of atoms should be larger than (or equal to) 3"); - const int* natoms = natoms_tensor.data(); - // printf("natoms_tensor.numel() = %d\n", natoms_tensor.numel()); - int nloc = natoms[0]; - // printf("nloc = %d\n", nloc); - int nall = natoms[1]; - // printf("nall = %d\n", nall); - int nnei = nlist_tensor.shape()[1] / nloc; - int nframes = net_deriv_tensor.shape()[0]; - int ndescrpt = net_deriv_tensor.shape()[1] / nloc; - PD_CHECK(nframes == in_deriv_tensor.shape()[0], "number of samples should match"); - PD_CHECK(nframes == rij_tensor.shape()[0], "number of samples should match"); - PD_CHECK(nframes == nlist_tensor.shape()[0],"number of samples should match"); - PD_CHECK(nloc * ndescrpt * 3 == in_deriv_tensor.shape()[1], "number of descriptors should match"); - PD_CHECK((nloc * nnei * 3) == rij_tensor.shape()[1], "dim of rij should be nnei * 3"); - - std::vector virial_shape {nframes, 9}; - std::vector atom_virial_shape {nframes, 9 * nall}; - paddle::Tensor virial_tensor = paddle::Tensor(paddle::PlaceType::kGPU, virial_shape); - paddle::Tensor atom_virial_tensor = paddle::Tensor(paddle::PlaceType::kGPU, atom_virial_shape); - - PD_DISPATCH_FLOATING_TYPES( - net_deriv_tensor.type(), "pd_prod_virial_se_a_cpu_forward_kernel", ([&] { - PdProdVirialSeAOpForwardCUDAKernel( - nloc, nall, ndescrpt, nnei, nframes, - virial_tensor.mutable_data(), atom_virial_tensor.mutable_data(), - net_deriv_tensor.data(), in_deriv_tensor.data(), - rij_tensor.data(), nlist_tensor.data()); - })); - - return {virial_tensor, atom_virial_tensor}; -} - -std::vector PdProdVirialSeAForward( - const paddle::Tensor& 
net_deriv_tensor, - const paddle::Tensor& in_deriv_tensor, - const paddle::Tensor& rij_tensor, - const paddle::Tensor& nlist_tensor, - const paddle::Tensor& natoms_tensor, - int n_a_sel, - int n_r_sel -) { - if (net_deriv_tensor.is_gpu()) { - // std::cout << natoms_tensor.dtype() << std::endl; - // const int* natoms = natoms_tensor.data(); - // printf("%d\n", natoms[0]); - return PdProdVirialSeAOpCUDAForward( - net_deriv_tensor, - in_deriv_tensor, - rij_tensor, - nlist_tensor, - natoms_tensor, - n_a_sel, - n_r_sel - ); - } else { - PD_THROW("Unsupported device type for forward function of custom relu operator."); - } -} - - -/*以下是反向代码*/ - -// template -// void PdProdForceSeAOpCPUBackwardKernel( -// int nloc, int nframes, int ndescrpt, int nnei, -// const data_t* grad, const data_t* net_deriv, -// const data_t* in_deriv, const data_t* rij, const int* nlist, -// data_t* grad_net){ - -// #pragma omp parallel for -// for (int kk = 0; kk < nframes; ++kk){ - -// int grad_iter = kk * 9; -// int in_iter = kk * nloc * ndescrpt * 3; -// int rij_iter = kk * nloc * nnei * 3; -// int nlist_iter = kk * nloc * nnei; -// int grad_net_iter = kk * nloc * ndescrpt; -// // int n_a_shift = n_a_sel * 4; - -// deepmd::prod_virial_grad_a_cpu( -// &grad_net[grad_net_iter], -// &grad[grad_iter], -// &in_deriv[in_iter], -// &rij[rij_iter], -// &nlist[nlist_iter], -// nloc, -// nnei); -// } -// } - - -// template -// void PdProdForceSeAOpGPUBackwardKernel( -// int nloc, int nframes, int ndescrpt, int nnei, -// const data_t* virial_grad, const data_t* net_deriv, -// const data_t* in_deriv, const data_t* rij, const int* nlist, -// data_t* grad_net -// ) { -// for (int_64 kk = 0; kk < nframes; ++kk) { -// FPTYPE* grad_net = p_grad_net + kk * nloc * ndescrpt; -// const FPTYPE* virial_grad = p_grad + kk * 9; -// const FPTYPE* in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3; -// const FPTYPE* rij = p_rij + kk * nloc * nnei * 3; -// const int* nlist = p_nlist + kk * nloc * nnei; -// if (device == "GPU") { -// deepmd::prod_virial_grad_a_gpu_cuda( -// grad_net, virial_grad, in_deriv, rij, nlist, nloc, nnei -// ); -// } -// } -// } - - -// std::vector PdProdVirialSeAOpCPUBackward( -// const paddle::Tensor& virial_grad_tensor, -// const paddle::Tensor& net_deriv_tensor, -// const paddle::Tensor& in_deriv_tensor, -// const paddle::Tensor& rij_tensor, -// const paddle::Tensor& nlist_tensor, -// const paddle::Tensor& natoms_tensor, -// int n_a_sel, -// int n_r_sel -// ){ -// // CHECK_INPUT_READY(virial_grad_tensor); -// // CHECK_INPUT_READY(net_deriv_tensor); -// // CHECK_INPUT_READY(in_deriv_tensor); -// // CHECK_INPUT_READY(rij_tensor); -// // CHECK_INPUT_READY(nlist_tensor); -// // CHECK_INPUT_READY(natoms_tensor); - -// auto grad_shape = virial_grad_tensor.shape(); -// auto net_deriv_shape = net_deriv_tensor.shape(); -// auto in_deriv_shape = in_deriv_tensor.shape(); -// auto rij_shape = rij_tensor.shape(); -// auto nlist_shape = nlist_tensor.shape(); -// auto natoms_shape = natoms_tensor.shape(); - -// CHECK_INPUT_DIM(virial_grad_tensor, 2); -// CHECK_INPUT_DIM(net_deriv_tensor, 2); -// CHECK_INPUT_DIM(in_deriv_tensor, 2); -// CHECK_INPUT_DIM(rij_tensor, 2); -// CHECK_INPUT_DIM(nlist_tensor, 2); -// CHECK_INPUT_DIM(natoms_tensor, 1); - -// PD_CHECK(natoms_shape[0] >= 3, "number of atoms should be larger than (or equal to) 3"); - -// const int* natoms = nullptr; -// // if(natoms_tensor.place() != paddle::PlaceType::kCPU){ -// // natoms = natoms_tensor.copy_to(paddle::PlaceType::kCPU).data(); -// // }else{ -// natoms = 
natoms_tensor.data(); -// // } -// int nframes = net_deriv_shape[0]; -// int nloc = natoms[0]; -// int ndescrpt = net_deriv_shape[1] / nloc; -// int nnei = nlist_shape[1] / nloc; - -// PD_CHECK(nframes == grad_shape[0], "number of frames should match"); -// PD_CHECK(nframes == in_deriv_shape[0], "number of samples should match"); -// PD_CHECK(nframes == rij_shape[0], "number of frames should match"); -// PD_CHECK(nframes == nlist_shape[0],"number of samples should match"); -// PD_CHECK(9 == grad_shape[1], "input grad shape should be 3 x natoms"); -// PD_CHECK(nloc * ndescrpt * 3 == in_deriv_shape[1], "number of descriptors should match"); -// PD_CHECK(nloc * nnei * 3 == rij_shape[1], "dim of rij should be nnei * 3"); -// PD_CHECK(nnei == (n_a_sel + n_r_sel), "number of neighbors should match"); - -// std::vector grad_net_shape {nframes, nloc * ndescrpt}; -// // paddle::Tensor grad_net_tensor = paddle::Tensor(paddle::PlaceType::kCPU, grad_net_shape); -// paddle::Tensor grad_net_tensor = paddle::empty( -// grad_net_shape, -// virial_grad_tensor.dtype(), -// virial_grad_tensor.place() -// ); - -// if(virial_grad_tensor.place() == paddle::PlaceType::kCPU){ -// PD_DISPATCH_FLOATING_TYPES( -// virial_grad_tensor.type(), "pd_prod_force_se_a_cpu_backward_kernel", ([&] { -// PdProdForceSeAOpCPUBackwardKernel( -// nloc, nframes, ndescrpt, nnei, -// virial_grad_tensor.data(), -// net_deriv_tensor.data(), -// in_deriv_tensor.data(), -// rij_tensor.data(), nlist_tensor.data(), -// grad_net_tensor.mutable_data()); -// })); -// }else{ -// PD_DISPATCH_FLOATING_TYPES( -// virial_grad_tensor.type(), "pd_prod_force_se_a_cpu_backward_kernel", ([&] { -// PdProdForceSeAOpGPUBackwardKernel( -// nloc, nframes, ndescrpt, nnei, -// virial_grad_tensor.data(), -// net_deriv_tensor.data(), -// in_deriv_tensor.data(), -// rij_tensor.data(), -// nlist_tensor.data(), -// grad_net_tensor.mutable_data()); -// })); -// } -// return {grad_net_tensor}; -// } - - -// std::vector PdProdVirialSeABackward( -// const paddle::Tensor& virial_grad_tensor, -// const paddle::Tensor& net_deriv_tensor, -// const paddle::Tensor& in_deriv_tensor, -// const paddle::Tensor& rij_tensor, -// const paddle::Tensor& nlist_tensor, -// const paddle::Tensor& natoms_tensor, -// int n_a_sel, -// int n_r_sel -// ) { -// return PdProdVirialSeAOpCPUBackward( -// virial_grad_tensor, net_deriv_tensor, in_deriv_tensor, -// rij_tensor, -// nlist_tensor, natoms_tensor, n_a_sel, n_r_sel -// ); -// } - -// std::vector> PdProdVirialSeAInferShape( -// // std::vector x_shape -// std::vector net_deriv_shape, -// std::vector in_deriv_shape, -// std::vector rij_shape, -// std::vector nlist_shape, -// std::vector natoms_shape, -// int n_a_sel, -// int n_r_sel -// ) { -// auto virial_shape {nframes, 9}; -// auto atom_virial_shape {nframes, 9 * nall}; -// return {virial_shape, atom_virial_shape}; -// } - -// std::vector PdProdVirialSeAInferDtype(paddle::DataType x_dtype, ...) 
{ -// return {x_dtype, ...}; -// } - -std::vector> PdProdVirialSeAInferShape( - std::vector net_deriv_shape, - std::vector in_deriv_shape, - std::vector rij_shape, - std::vector nlist_shape, - std::vector natoms_shape, - const int &n_a_sel, - const int &n_r_sel -) { - // int64_t nloc = /*natoms[0]*/ 192; - int64_t nall = /*natoms[1]*/ 192; - int64_t nframes = net_deriv_shape[0]; - - std::vector virial_shape = {nframes, 9}; - std::vector atom_virial_shape = {nframes, 9 * nall}; - - return {virial_shape, atom_virial_shape}; -} - -std::vector PdProdVirialSeAInferDtype( - paddle::DataType net_deriv_dtype, - paddle::DataType in_deriv_dtype, - paddle::DataType rij_dtype, - paddle::DataType nlist_dtype, - paddle::DataType natoms_dtype -) { - return {net_deriv_dtype, net_deriv_dtype}; -} - - -PD_BUILD_OP(prod_virial_se_a) - .Inputs({"net_deriv", "in_deriv", "rij", "nlist", "natoms"}) - .Outputs({"virial", "atom_virial"}) - .Attrs({"n_a_sel: int", "n_r_sel: int"}) - .SetKernelFn(PD_KERNEL(PdProdVirialSeAForward)) - .SetInferShapeFn(PD_INFER_SHAPE(PdProdVirialSeAInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(PdProdVirialSeAInferDtype)); - - -// PD_BUILD_GRAD_OP(prod_virial_se_a) -// .Inputs({paddle::Grad("virial"), "net_deriv", "in_deriv", "rij", "nlist", "natoms"}) -// .Outputs({paddle::Grad("net_deriv")}) -// .Attrs({"n_a_sel: int", "n_r_sel: int"}) -// .SetKernelFn(PD_KERNEL(PdProdVirialSeABackward)); - - diff --git a/source/lib/paddle_src/prod_virial.h b/source/lib/paddle_src/prod_virial.h deleted file mode 100644 index c51e333a47..0000000000 --- a/source/lib/paddle_src/prod_virial.h +++ /dev/null @@ -1,75 +0,0 @@ -#pragma once - -namespace deepmd { - -template -void prod_virial_a_cpu(FPTYPE* virial, - FPTYPE* atom_virial, - const FPTYPE* net_deriv, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei); - -template -void prod_virial_r_cpu(FPTYPE* virial, - FPTYPE* atom_virial, - const FPTYPE* net_deriv, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei); - -#if GOOGLE_CUDA -template -void prod_virial_a_gpu_cuda(FPTYPE* virial, - FPTYPE* atom_virial, - const FPTYPE* net_deriv, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei); - -template -void prod_virial_r_gpu_cuda(FPTYPE* virial, - FPTYPE* atom_virial, - const FPTYPE* net_deriv, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM -template -void prod_virial_a_gpu_rocm(FPTYPE* virial, - FPTYPE* atom_virial, - const FPTYPE* net_deriv, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei); - -template -void prod_virial_r_gpu_rocm(FPTYPE* virial, - FPTYPE* atom_virial, - const FPTYPE* net_deriv, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nall, - const int nnei); -#endif // TENSORFLOW_USE_ROCM - -} // namespace deepmd diff --git a/source/lib/paddle_src/prod_virial_grad.cc b/source/lib/paddle_src/prod_virial_grad.cc deleted file mode 100644 index 14ba158cc1..0000000000 --- a/source/lib/paddle_src/prod_virial_grad.cc +++ /dev/null @@ -1,138 +0,0 @@ -#include "prod_virial_grad.h" - -#include -#include - -#include "errors.h" - -inline void make_index_range(int& idx_start, - int& idx_end, - const int& 
nei_idx, - const int& nnei) { - if (nei_idx < nnei) { - idx_start = nei_idx * 4; - idx_end = nei_idx * 4 + 4; - } else { - throw deepmd::deepmd_exception("should no reach here"); - } -} - -template -void deepmd::prod_virial_grad_a_cpu(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei) { - const int ndescrpt = nnei * 4; - - // reset the frame to 0 - for (int ii = 0; ii < nloc; ++ii) { - for (int aa = 0; aa < ndescrpt; ++aa) { - grad_net[ii * ndescrpt + aa] = 0; - } - } - -// compute grad of one frame -#pragma omp parallel for - for (int ii = 0; ii < nloc; ++ii) { - int i_idx = ii; - - // loop over neighbors - for (int jj = 0; jj < nnei; ++jj) { - int j_idx = nlist[i_idx * nnei + jj]; - if (j_idx < 0) continue; - int aa_start, aa_end; - make_index_range(aa_start, aa_end, jj, nnei); - for (int aa = aa_start; aa < aa_end; ++aa) { - for (int dd0 = 0; dd0 < 3; ++dd0) { - for (int dd1 = 0; dd1 < 3; ++dd1) { - grad_net[i_idx * ndescrpt + aa] -= - -1.0 * grad[dd0 * 3 + dd1] * - rij[i_idx * nnei * 3 + jj * 3 + dd1] * - env_deriv[i_idx * ndescrpt * 3 + aa * 3 + dd0]; - } - } - } - } - } -} - -template void deepmd::prod_virial_grad_a_cpu(double* grad_net, - const double* grad, - const double* env_deriv, - const double* rij, - const int* nlist, - const int nloc, - const int nnei); - -template void deepmd::prod_virial_grad_a_cpu(float* grad_net, - const float* grad, - const float* env_deriv, - const float* rij, - const int* nlist, - const int nloc, - const int nnei); - -template -void deepmd::prod_virial_grad_r_cpu(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei) -// -// grad_net: nloc x ndescrpt -// grad: 9 -// env_deriv: nloc x ndescrpt x 3 -// rij: nloc x nnei x 3 -// nlist: nloc x nnei -// -{ - const int ndescrpt = nnei * 1; - - // reset the frame to 0 - for (int ii = 0; ii < nloc; ++ii) { - for (int aa = 0; aa < ndescrpt; ++aa) { - grad_net[ii * ndescrpt + aa] = 0; - } - } - -// compute grad of one frame -#pragma omp parallel for - for (int ii = 0; ii < nloc; ++ii) { - int i_idx = ii; - - // loop over neighbors - for (int jj = 0; jj < nnei; ++jj) { - int j_idx = nlist[i_idx * nnei + jj]; - if (j_idx < 0) continue; - for (int dd0 = 0; dd0 < 3; ++dd0) { - for (int dd1 = 0; dd1 < 3; ++dd1) { - grad_net[i_idx * ndescrpt + jj] -= - -1.0 * grad[dd0 * 3 + dd1] * - rij[i_idx * nnei * 3 + jj * 3 + dd1] * - env_deriv[i_idx * ndescrpt * 3 + jj * 3 + dd0]; - } - } - } - } -} - -template void deepmd::prod_virial_grad_r_cpu(double* grad_net, - const double* grad, - const double* env_deriv, - const double* rij, - const int* nlist, - const int nloc, - const int nnei); - -template void deepmd::prod_virial_grad_r_cpu(float* grad_net, - const float* grad, - const float* env_deriv, - const float* rij, - const int* nlist, - const int nloc, - const int nnei); diff --git a/source/lib/paddle_src/prod_virial_grad.cu b/source/lib/paddle_src/prod_virial_grad.cu deleted file mode 100644 index 6205c4bdf8..0000000000 --- a/source/lib/paddle_src/prod_virial_grad.cu +++ /dev/null @@ -1,536 +0,0 @@ -#include -#include -#include -#include "paddle/extension.h" - -#include "device.h" -#include "prod_virial.h" -#include "gpu_cuda.h" - -#define CHECK_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") -#define CHECK_INPUT_DIM(x, value) PD_CHECK(x.shape().size() == value, #x "'s dim should be " #value ".") -#define CHECK_INPUT_READY(x) 
PD_CHECK(x.IsInitialized(), #x " must be initialized before usage.") - -// template -// __global__ void atom_virial_reduction(FPTYPE* virial, -// const FPTYPE* atom_virial, -// const int nall) { -// unsigned int bid = blockIdx.x; -// unsigned int tid = threadIdx.x; -// __shared__ FPTYPE data[THREADS_PER_BLOCK]; -// data[tid] = (FPTYPE)0.; -// for (int ii = tid; ii < nall; ii += THREADS_PER_BLOCK) { -// data[tid] += atom_virial[ii * 9 + bid]; -// } -// __syncthreads(); -// // do reduction in shared memory -// for (int ii = THREADS_PER_BLOCK >> 1; ii > 0; ii >>= 1) { -// if (tid < ii) { -// data[tid] += data[tid + ii]; -// } -// __syncthreads(); -// } -// // write result for this block to global memory -// if (tid == 0) virial[bid] = data[0]; -// } - -// template -// __global__ void virial_deriv_wrt_neighbors_a(FPTYPE* virial, -// FPTYPE* atom_virial, -// const FPTYPE* net_deriv, -// const FPTYPE* in_deriv, -// const FPTYPE* rij, -// const int* nlist, -// const int nloc, -// const int nnei) { -// // idx -> nloc -// // idy -> nnei -// // idz = dd0 * 3 + dd1 -// // dd0 = idz / 3 -// // dd1 = idz % 3 -// const int_64 idx = blockIdx.x; -// const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x; -// const unsigned int idz = threadIdx.y; -// const int ndescrpt = nnei * 4; -// if (idy >= nnei) { -// return; -// } -// int j_idx = nlist[idx * nnei + idy]; -// if (j_idx < 0) { -// return; -// } -// // atomicAdd( -// // virial + idz, -// // net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 -// // + idz / 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz % -// // 3]); -// FPTYPE virial_tmp = (FPTYPE)0.; -// for (int idw = 0; idw < 4; ++idw) { -// virial_tmp += net_deriv[idx * ndescrpt + idy * 4 + idw] * -// rij[idx * nnei * 3 + idy * 3 + idz % 3] * -// in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz / 3]; -// } -// atomicAdd(atom_virial + j_idx * 9 + idz, virial_tmp); -// } - -// template -// __global__ void virial_deriv_wrt_neighbors_r(FPTYPE* virial, -// FPTYPE* atom_virial, -// const FPTYPE* net_deriv, -// const FPTYPE* in_deriv, -// const FPTYPE* rij, -// const int* nlist, -// const int nloc, -// const int nnei) { -// // idx -> nloc -// // idy -> nnei -// // idz = dd0 * 3 + dd1 -// // dd0 = idz / 3 -// // dd1 = idz % 3 -// const int_64 idx = blockIdx.x; -// const unsigned int idy = blockIdx.y * blockDim.x + threadIdx.x; -// const unsigned int idz = threadIdx.y; -// const int ndescrpt = nnei * 1; - -// if (idy >= nnei) { -// return; -// } -// int j_idx = nlist[idx * nnei + idy]; -// if (j_idx < 0) { -// return; -// } -// // atomicAdd( -// // virial + idz, -// // net_deriv[idx * ndescrpt + idy * 4 + idw] * rij[idx * nnei * 3 + idy * 3 -// // + idz / 3] * in_deriv[idx * ndescrpt * 3 + (idy * 4 + idw) * 3 + idz % -// // 3]); -// atomicAdd(atom_virial + j_idx * 9 + idz, -// net_deriv[idx * ndescrpt + idy] * -// rij[idx * nnei * 3 + idy * 3 + idz % 3] * -// in_deriv[idx * ndescrpt * 3 + idy * 3 + idz / 3]); -// } - -// namespace deepmd { -// template -// void prod_virial_a_gpu_cuda(FPTYPE* virial, -// FPTYPE* atom_virial, -// const FPTYPE* net_deriv, -// const FPTYPE* in_deriv, -// const FPTYPE* rij, -// const int* nlist, -// const int nloc, -// const int nall, -// const int nnei) { -// DPErrcheck(cudaMemset(virial, 0, sizeof(FPTYPE) * 9)); -// DPErrcheck(cudaMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall)); - -// const int LEN = 16; -// int nblock = (nnei + LEN - 1) / LEN; -// dim3 block_grid(nloc, nblock); -// dim3 thread_grid(LEN, 9); -// // compute 
virial of a frame -// virial_deriv_wrt_neighbors_a<<>>( -// virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nnei); -// DPErrcheck(cudaGetLastError()); -// DPErrcheck(cudaDeviceSynchronize()); -// // reduction atom_virial to virial -// atom_virial_reduction<<<9, TPB>>>(virial, atom_virial, nall); -// DPErrcheck(cudaGetLastError()); -// DPErrcheck(cudaDeviceSynchronize()); -// } - -// template -// void prod_virial_r_gpu_cuda(FPTYPE* virial, -// FPTYPE* atom_virial, -// const FPTYPE* net_deriv, -// const FPTYPE* in_deriv, -// const FPTYPE* rij, -// const int* nlist, -// const int nloc, -// const int nall, -// const int nnei) { -// DPErrcheck(cudaMemset(virial, 0, sizeof(FPTYPE) * 9)); -// DPErrcheck(cudaMemset(atom_virial, 0, sizeof(FPTYPE) * 9 * nall)); - -// const int LEN = 16; -// int nblock = (nnei + LEN - 1) / LEN; -// dim3 block_grid(nloc, nblock); -// dim3 thread_grid(LEN, 9); -// // compute virial of a frame -// virial_deriv_wrt_neighbors_r<<>>( -// virial, atom_virial, net_deriv, in_deriv, rij, nlist, nloc, nnei); -// DPErrcheck(cudaGetLastError()); -// DPErrcheck(cudaDeviceSynchronize()); -// // reduction atom_virial to virial -// atom_virial_reduction<<<9, TPB>>>(virial, atom_virial, nall); -// DPErrcheck(cudaGetLastError()); -// DPErrcheck(cudaDeviceSynchronize()); -// } -// } // namespace deepmd - -// template -// void PdProdVirialSeAOpForwardCUDAKernel( -// int nloc, int nall, int ndescrpt, int nnei, int nframes, -// data_t* p_virial, data_t* p_atom_virial, const data_t* p_net_deriv, -// const data_t* p_in_deriv, const data_t* p_rij, const int* p_nlist){ - -// for(int kk = 0; kk < nframes; ++kk){ -// data_t * virial = p_virial + kk * 9; -// data_t * atom_virial = p_atom_virial + kk * nall * 9; -// const data_t * net_deriv = p_net_deriv + kk * nloc * ndescrpt; -// const data_t * in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3; -// const data_t * rij = p_rij + kk * nloc * nnei * 3; -// const int * nlist = p_nlist + kk * nloc * nnei; -// deepmd::prod_virial_a_gpu_cuda( -// virial, atom_virial, -// net_deriv, in_deriv, rij, nlist, nloc, nall, nnei); -// } -// } - -// std::vector PdProdVirialSeAOpCUDAForward( -// const paddle::Tensor& net_deriv_tensor, -// const paddle::Tensor& in_deriv_tensor, -// const paddle::Tensor& rij_tensor, -// const paddle::Tensor& nlist_tensor, -// const paddle::Tensor& natoms_tensor, -// int n_a_sel, -// int n_r_sel -// ){ -// CHECK_INPUT(net_deriv_tensor); -// CHECK_INPUT(in_deriv_tensor); -// CHECK_INPUT(rij_tensor); -// CHECK_INPUT(nlist_tensor); -// // CHECK_INPUT(natoms_tensor); // TODO: 暂时指定python端必须为cpu,gpu的copy_to会导致返回的指针数据不对 - -// CHECK_INPUT_DIM(net_deriv_tensor, 2); -// CHECK_INPUT_DIM(in_deriv_tensor, 2); -// CHECK_INPUT_DIM(rij_tensor, 2); -// CHECK_INPUT_DIM(nlist_tensor, 2); -// CHECK_INPUT_DIM(natoms_tensor, 1); - -// PD_CHECK(natoms_tensor.shape()[0] >= 3, "number of atoms should be larger than (or equal to) 3"); -// const int* natoms = natoms_tensor.data(); -// // printf("natoms_tensor.numel() = %d\n", natoms_tensor.numel()); -// int nloc = natoms[0]; -// // printf("nloc = %d\n", nloc); -// int nall = natoms[1]; -// // printf("nall = %d\n", nall); -// int nnei = nlist_tensor.shape()[1] / nloc; -// int nframes = net_deriv_tensor.shape()[0]; -// int ndescrpt = net_deriv_tensor.shape()[1] / nloc; -// PD_CHECK(nframes == in_deriv_tensor.shape()[0], "number of samples should match"); -// PD_CHECK(nframes == rij_tensor.shape()[0], "number of samples should match"); -// PD_CHECK(nframes == nlist_tensor.shape()[0],"number of samples 
should match"); -// PD_CHECK(nloc * ndescrpt * 3 == in_deriv_tensor.shape()[1], "number of descriptors should match"); -// PD_CHECK((nloc * nnei * 3) == rij_tensor.shape()[1], "dim of rij should be nnei * 3"); - -// std::vector virial_shape {nframes, 9}; -// std::vector atom_virial_shape {nframes, 9 * nall}; -// paddle::Tensor virial_tensor = paddle::Tensor(paddle::PlaceType::kGPU, virial_shape); -// paddle::Tensor atom_virial_tensor = paddle::Tensor(paddle::PlaceType::kGPU, atom_virial_shape); - -// PD_DISPATCH_FLOATING_TYPES( -// net_deriv_tensor.type(), "pd_prod_virial_se_a_cpu_forward_kernel", ([&] { -// PdProdVirialSeAOpForwardCUDAKernel( -// nloc, nall, ndescrpt, nnei, nframes, -// virial_tensor.mutable_data(), atom_virial_tensor.mutable_data(), -// net_deriv_tensor.data(), in_deriv_tensor.data(), -// rij_tensor.data(), nlist_tensor.data()); -// })); - -// return {virial_tensor, atom_virial_tensor}; -// } - -// std::vector PdProdVirialSeAForward( -// const paddle::Tensor& net_deriv_tensor, -// const paddle::Tensor& in_deriv_tensor, -// const paddle::Tensor& rij_tensor, -// const paddle::Tensor& nlist_tensor, -// const paddle::Tensor& natoms_tensor, -// int n_a_sel, -// int n_r_sel -// ) { -// if (net_deriv_tensor.is_gpu()) { -// // std::cout << natoms_tensor.dtype() << std::endl; -// // const int* natoms = natoms_tensor.data(); -// // printf("%d\n", natoms[0]); -// return PdProdVirialSeAOpCUDAForward( -// net_deriv_tensor, -// in_deriv_tensor, -// rij_tensor, -// nlist_tensor, -// natoms_tensor, -// n_a_sel, -// n_r_sel -// ); -// } else { -// PD_THROW("Unsupported device type for forward function of custom relu operator."); -// } -// } - - -/*以下是反向代码*/ - -// template -// void PdProdForceSeAOpCPUBackwardKernel( -// int nloc, int nframes, int ndescrpt, int nnei, -// const data_t* grad, const data_t* net_deriv, -// const data_t* in_deriv, const data_t* rij, const int* nlist, -// data_t* grad_net){ - -// #pragma omp parallel for -// for (int kk = 0; kk < nframes; ++kk){ - -// int grad_iter = kk * 9; -// int in_iter = kk * nloc * ndescrpt * 3; -// int rij_iter = kk * nloc * nnei * 3; -// int nlist_iter = kk * nloc * nnei; -// int grad_net_iter = kk * nloc * ndescrpt; -// // int n_a_shift = n_a_sel * 4; - -// deepmd::prod_virial_grad_a_cpu( -// &grad_net[grad_net_iter], -// &grad[grad_iter], -// &in_deriv[in_iter], -// &rij[rij_iter], -// &nlist[nlist_iter], -// nloc, -// nnei); -// } -// } - - -template -__device__ inline FPTYPE dev_dot9(const FPTYPE* arr1, const FPTYPE* arr2) { - FPTYPE result = (FPTYPE)0.0; - for (int ii = 0; ii < 9; ii++) { - result += arr1[ii] * arr2[ii]; - } - return result; -} - - -template -__global__ void virial_grad_wrt_neighbors_a(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei) { - // idy -> nnei - const unsigned int tid = threadIdx.x; - const int_64 idx = blockIdx.x * blockDim.x + tid; - const unsigned int idy = blockIdx.y; - const unsigned int idw = threadIdx.y; - const int ndescrpt = nnei * 4; - __shared__ FPTYPE grad_one[9]; - if (tid < 9) { - grad_one[tid] = grad[tid]; - } - __syncthreads(); - if (idx >= nloc) { - return; - } - int j_idx = nlist[idx * nnei + idy]; - if (j_idx < 0) { - return; - } - FPTYPE tmp[9]; - for (int dd0 = 0; dd0 < 3; ++dd0) { - for (int dd1 = 0; dd1 < 3; ++dd1) { - tmp[dd0 * 3 + dd1] = - rij[idx * nnei * 3 + idy * 3 + dd1] * - env_deriv[idx * ndescrpt * 3 + idy * 4 * 3 + idw * 3 + dd0]; - } - } - grad_net[idx * ndescrpt + idy * 4 + idw] 
-= - (FPTYPE)-1.0 * dev_dot9(grad_one, tmp); -} - - -namespace deepmd { -template -void prod_virial_grad_a_gpu_cuda(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei) { - const int ndescrpt = nnei * 4; - DPErrcheck(cudaMemset(grad_net, 0, sizeof(FPTYPE) * nloc * ndescrpt)); - const int LEN = 128; - const int nblock = (nloc + LEN - 1) / LEN; - dim3 block_grid(nblock, nnei); - dim3 thread_grid(LEN, 4); - virial_grad_wrt_neighbors_a<<>>( - grad_net, grad, env_deriv, rij, nlist, nloc, nnei); - DPErrcheck(cudaGetLastError()); - DPErrcheck(cudaDeviceSynchronize()); -} - - -template void prod_virial_grad_a_gpu_cuda(float* grad_net, - const float* grad, - const float* env_deriv, - const float* rij, - const int* nlist, - const int nloc, - const int nnei); -template void prod_virial_grad_a_gpu_cuda(double* grad_net, - const double* grad, - const double* env_deriv, - const double* rij, - const int* nlist, - const int nloc, - const int nnei); -} // namespace deepmd - - -template -void PdProdForceSeAOpGPUBackwardKernel( - int nloc, int nframes, int ndescrpt, int nnei, - const data_t* virial_grad, const data_t* net_deriv, - const data_t* in_deriv, const data_t* rij, const int* nlist, - data_t* grad_net -) { - data_t* p_grad_net = grad_net; - const data_t* p_grad = virial_grad; - const data_t* p_in_deriv = in_deriv; - const data_t* p_rij = rij; - const int* p_nlist = nlist; - for (int_64 kk = 0; kk < nframes; ++kk) { - data_t* grad_net = p_grad_net + kk * nloc * ndescrpt; - const data_t* virial_grad = p_grad + kk * 9; - const data_t* in_deriv = p_in_deriv + kk * nloc * ndescrpt * 3; - const data_t* rij = p_rij + kk * nloc * nnei * 3; - const int* nlist = p_nlist + kk * nloc * nnei; - // if (device == "GPU") { - deepmd::prod_virial_grad_a_gpu_cuda( - grad_net, virial_grad, in_deriv, rij, nlist, nloc, nnei - ); - // } - } -} - - -std::vector PdProdVirialSeAOpCUDABackward( - const paddle::Tensor& virial_grad_tensor, - const paddle::Tensor& net_deriv_tensor, - const paddle::Tensor& in_deriv_tensor, - const paddle::Tensor& rij_tensor, - const paddle::Tensor& nlist_tensor, - const paddle::Tensor& natoms_tensor, - int n_a_sel, - int n_r_sel -){ - // CHECK_INPUT_READY(virial_grad_tensor); - // CHECK_INPUT_READY(net_deriv_tensor); - // CHECK_INPUT_READY(in_deriv_tensor); - // CHECK_INPUT_READY(rij_tensor); - // CHECK_INPUT_READY(nlist_tensor); - // CHECK_INPUT_READY(natoms_tensor); - - auto grad_shape = virial_grad_tensor.shape(); - auto net_deriv_shape = net_deriv_tensor.shape(); - auto in_deriv_shape = in_deriv_tensor.shape(); - auto rij_shape = rij_tensor.shape(); - auto nlist_shape = nlist_tensor.shape(); - auto natoms_shape = natoms_tensor.shape(); - - CHECK_INPUT_DIM(virial_grad_tensor, 2); - CHECK_INPUT_DIM(net_deriv_tensor, 2); - CHECK_INPUT_DIM(in_deriv_tensor, 2); - CHECK_INPUT_DIM(rij_tensor, 2); - CHECK_INPUT_DIM(nlist_tensor, 2); - CHECK_INPUT_DIM(natoms_tensor, 1); - - PD_CHECK(natoms_shape[0] >= 3, "number of atoms should be larger than (or equal to) 3"); - - const int* natoms = nullptr; - // if(natoms_tensor.place() != paddle::PlaceType::kCPU){ - // natoms = natoms_tensor.copy_to(paddle::PlaceType::kCPU).data(); - // }else{ - natoms = natoms_tensor.data(); - // } - int nframes = net_deriv_shape[0]; - int nloc = natoms[0]; - int ndescrpt = net_deriv_shape[1] / nloc; - int nnei = nlist_shape[1] / nloc; - - PD_CHECK(nframes == grad_shape[0], "number of frames should match"); - PD_CHECK(nframes == 
in_deriv_shape[0], "number of samples should match"); - PD_CHECK(nframes == rij_shape[0], "number of frames should match"); - PD_CHECK(nframes == nlist_shape[0],"number of samples should match"); - PD_CHECK(9 == grad_shape[1], "input grad shape should be 3 x natoms"); - PD_CHECK(nloc * ndescrpt * 3 == in_deriv_shape[1], "number of descriptors should match"); - PD_CHECK(nloc * nnei * 3 == rij_shape[1], "dim of rij should be nnei * 3"); - PD_CHECK(nnei == (n_a_sel + n_r_sel), "number of neighbors should match"); - - std::vector grad_net_shape {nframes, nloc * ndescrpt}; - // paddle::Tensor grad_net_tensor = paddle::Tensor(paddle::PlaceType::kCPU, grad_net_shape); - paddle::Tensor grad_net_tensor = paddle::empty( - grad_net_shape, - virial_grad_tensor.dtype(), - virial_grad_tensor.place() - ); - - // if(virial_grad_tensor.place() == paddle::PlaceType::kCPU){ - // PD_DISPATCH_FLOATING_TYPES( - // virial_grad_tensor.type(), "pd_prod_force_se_a_cpu_backward_kernel", ([&] { - // PdProdForceSeAOpCPUBackwardKernel( - // nloc, nframes, ndescrpt, nnei, - // virial_grad_tensor.data(), - // net_deriv_tensor.data(), - // in_deriv_tensor.data(), - // rij_tensor.data(), nlist_tensor.data(), - // grad_net_tensor.mutable_data()); - // })); - // }else{ - PD_DISPATCH_FLOATING_TYPES( - virial_grad_tensor.type(), "pd_prod_force_se_a_cuda_backward_kernel", ([&] { - PdProdForceSeAOpGPUBackwardKernel( - nloc, nframes, ndescrpt, nnei, - virial_grad_tensor.data(), - net_deriv_tensor.data(), - in_deriv_tensor.data(), - rij_tensor.data(), - nlist_tensor.data(), - grad_net_tensor.mutable_data()); - })); - // } - return {grad_net_tensor}; -} - - -std::vector PdProdVirialSeABackward( - const paddle::Tensor& virial_grad_tensor, - const paddle::Tensor& net_deriv_tensor, - const paddle::Tensor& in_deriv_tensor, - const paddle::Tensor& rij_tensor, - const paddle::Tensor& nlist_tensor, - const paddle::Tensor& natoms_tensor, - int n_a_sel, - int n_r_sel -) { - return PdProdVirialSeAOpCUDABackward( - virial_grad_tensor, net_deriv_tensor, in_deriv_tensor, - rij_tensor, - nlist_tensor, natoms_tensor, n_a_sel, n_r_sel - ); -} - - -// PD_BUILD_OP(prod_virial_se_a) -// .Inputs({"net_deriv", "in_deriv", "rij", "nlist", "natoms"}) -// .Outputs({"virial", "atom_virial"}) -// .Attrs({"n_a_sel: int", "n_r_sel: int"}) -// .SetKernelFn(PD_KERNEL(PdProdVirialSeAForward)); - - -PD_BUILD_GRAD_OP(prod_virial_se_a) - .Inputs({paddle::Grad("virial"), "net_deriv", "in_deriv", "rij", "nlist", "natoms"}) - .Outputs({paddle::Grad("net_deriv")}) - .Attrs({"n_a_sel: int", "n_r_sel: int"}) - .SetKernelFn(PD_KERNEL(PdProdVirialSeABackward)); diff --git a/source/lib/paddle_src/prod_virial_grad.h b/source/lib/paddle_src/prod_virial_grad.h deleted file mode 100644 index 0e2cc46baa..0000000000 --- a/source/lib/paddle_src/prod_virial_grad.h +++ /dev/null @@ -1,63 +0,0 @@ -#pragma once - -namespace deepmd { - -template -void prod_virial_grad_a_cpu(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei); - -template -void prod_virial_grad_r_cpu(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei); - -#if GOOGLE_CUDA -template -void prod_virial_grad_a_gpu_cuda(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei); - -template -void prod_virial_grad_r_gpu_cuda(FPTYPE* grad_net, - const FPTYPE* 
grad, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei); -#endif // GOOGLE_CUDA - -#if TENSORFLOW_USE_ROCM -template -void prod_virial_grad_a_gpu_rocm(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei); - -template -void prod_virial_grad_r_gpu_rocm(FPTYPE* grad_net, - const FPTYPE* grad, - const FPTYPE* env_deriv, - const FPTYPE* rij, - const int* nlist, - const int nloc, - const int nnei); -#endif // TENSORFLOW_USE_ROCM - -} // namespace deepmd diff --git a/source/lib/paddle_src/setup_ins.py b/source/lib/paddle_src/setup_ins.py index f510bd62f4..402b47cbc5 100644 --- a/source/lib/paddle_src/setup_ins.py +++ b/source/lib/paddle_src/setup_ins.py @@ -2,7 +2,7 @@ cpp_extension.setup( name="paddle_deepmd_lib", - ext_modules=cpp_extension.CppExtension( + ext_modules=cpp_extension.CUDAExtension( sources=[ "../src/coord.cc", "../src/env_mat_nvnmd.cc", @@ -14,11 +14,11 @@ "../src/neighbor_list.cc", "../src/pair_tab.cc", "../src/prod_env_mat_nvnmd.cc", - "../src/prod_env_mat.cc", - # "../src/prod_force_grad.cc", - # "../src/prod_force.cc", - # "../src/prod_virial_grad.cc", - # "../src/prod_virial.cc", + # "../src/prod_env_mat.cc", + "../src/prod_force_grad.cc", + "../src/prod_force.cc", + "../src/prod_virial_grad.cc", + "../src/prod_virial.cc", "../src/region.cc", "../src/SimulationRegion.cpp", "../src/soft_min_switch_force_grad.cc", @@ -37,18 +37,22 @@ # "../src/cuda/prod_virial.cu", "../src/cuda/region.cu", "../src/cuda/tabulate.cu", - "./prod_env_mat.cu", - "./prod_virial_grad.cu", - "./prod_virial_grad.cc", - "./prod_virial.cu", - "./prod_force.cu", - # "./prod_force_grad.cc", - "./prod_force_grad.cu", - "./neighbor_stat.cu", + "./paddle_prod_env_mat.cu", + "./paddle_prod_env_mat.cc", + "./paddle_prod_virial_grad.cu", + "./paddle_prod_virial_grad.cc", + "./paddle_prod_virial.cu", + "./paddle_prod_virial.cc", + "./paddle_prod_force.cu", + "./paddle_prod_force.cc", + "./paddle_prod_force_grad.cu", + "./paddle_prod_force_grad.cc", + "./paddle_neighbor_stat.cc", ], include_dirs=[ "/workspace/hesensen/deepmd_backend/deepmd-kit-tf/source/lib/include" ], library_dirs=["/usr/local/cuda-11/lib64"], + # extra_link_args=["-ldeepmd"] ), ) diff --git a/source/lib/src/prod_env_mat.cc b/source/lib/src/prod_env_mat.cc index af82a09c2e..4dc38dd4ef 100644 --- a/source/lib/src/prod_env_mat.cc +++ b/source/lib/src/prod_env_mat.cc @@ -269,7 +269,7 @@ void deepmd::env_mat_nbor_update(InputNlist &inlist, memcpy(&inlist.numneigh, 8 + mesh_host, sizeof(int *)); memcpy(&inlist.firstneigh, 12 + mesh_host, sizeof(int **)); const int ago = mesh_host[0]; - if (ago == 0 || gpu_inlist.inum < inlist.inum) { + if (ago == 0 || gpu_inlist.inum < inlist.inum || !gpu_inlist.ilist) { const int inum = inlist.inum; if (gpu_inlist.inum < inum) { delete_device_memory(gpu_inlist.ilist); @@ -279,6 +279,15 @@ void deepmd::env_mat_nbor_update(InputNlist &inlist, malloc_device_memory(gpu_inlist.numneigh, inum); malloc_device_memory(gpu_inlist.firstneigh, inum); } + if (!gpu_inlist.ilist) { + malloc_device_memory(gpu_inlist.ilist, inum); + } + if (!gpu_inlist.numneigh) { + malloc_device_memory(gpu_inlist.numneigh, inum); + } + if (!gpu_inlist.firstneigh) { + malloc_device_memory(gpu_inlist.firstneigh, inum); + } memcpy_host_to_device(gpu_inlist.ilist, inlist.ilist, inum); memcpy_host_to_device(gpu_inlist.numneigh, inlist.numneigh, inum); int _max_nbor_size = max_numneigh(inlist); diff --git 
a/source/lmp/pair_deepmd.cpp b/source/lmp/pair_deepmd.cpp
index ccf95c0984..8922260e5f 100644
--- a/source/lmp/pair_deepmd.cpp
+++ b/source/lmp/pair_deepmd.cpp
@@ -833,7 +833,8 @@ void PairDeepMD::settings(int narg, char **arg) {
   numb_models = models.size();
   if (numb_models == 1) {
     try {
-      deep_pot.init(arg[0], get_node_rank(), get_file_content(arg[0]));
+      auto ptr = strstr(arg[0], ".pb");
+      deep_pot.init(arg[0], get_node_rank(), (ptr != NULL) ? get_file_content(arg[0]) : "");
     } catch (deepmd_compat::deepmd_exception &e) {
       error->one(FLERR, e.what());
     }
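
Reviewer note: the hunks above split the prod_virial_se_a custom operator across two translation units — the forward kernel plus InferShape/InferDtype are bound with PD_BUILD_OP in paddle_prod_virial.cu, while the gradient kernel is bound separately with PD_BUILD_GRAD_OP in paddle_prod_virial_grad.cu (the duplicate registration in each file is commented out). The following is a minimal, self-contained sketch of that registration pattern for reference only; it is not part of the patch, and the op name "toy_identity" and every ToyIdentity* function are hypothetical. Only Paddle custom-op API calls that already appear in the patch (paddle::empty, PD_DISPATCH_FLOATING_TYPES, mutable_data, PD_BUILD_OP, PD_BUILD_GRAD_OP, PD_KERNEL, PD_INFER_SHAPE, PD_INFER_DTYPE) are used.

// toy_identity.cc -- hypothetical example, not part of this PR
#include <cstdint>
#include <vector>
#include "paddle/extension.h"

// Forward: y = x, dispatched over float/double like the real kernels above.
std::vector<paddle::Tensor> ToyIdentityForward(const paddle::Tensor& x) {
  // Allocate the output with the same shape, dtype and place as the input.
  paddle::Tensor y = paddle::empty(x.shape(), x.dtype(), x.place());
  PD_DISPATCH_FLOATING_TYPES(x.type(), "toy_identity_forward", ([&] {
    const data_t* x_ptr = x.data<data_t>();
    data_t* y_ptr = y.mutable_data<data_t>();
    for (int64_t i = 0; i < x.numel(); ++i) {
      y_ptr[i] = x_ptr[i];
    }
  }));
  return {y};
}

// Backward: d(identity)/dx = 1, so grad_x is a copy of grad_y.
std::vector<paddle::Tensor> ToyIdentityBackward(const paddle::Tensor& x,
                                                const paddle::Tensor& grad_y) {
  paddle::Tensor grad_x = paddle::empty(x.shape(), x.dtype(), x.place());
  PD_DISPATCH_FLOATING_TYPES(grad_y.type(), "toy_identity_backward", ([&] {
    const data_t* g_ptr = grad_y.data<data_t>();
    data_t* gx_ptr = grad_x.mutable_data<data_t>();
    for (int64_t i = 0; i < grad_y.numel(); ++i) {
      gx_ptr[i] = g_ptr[i];
    }
  }));
  return {grad_x};
}

// Static shape/dtype inference, mirroring PdProdVirialSeAInferShape/InferDtype.
std::vector<std::vector<int64_t>> ToyIdentityInferShape(
    std::vector<int64_t> x_shape) {
  return {x_shape};
}

std::vector<paddle::DataType> ToyIdentityInferDtype(paddle::DataType x_dtype) {
  return {x_dtype};
}

// Forward registration: kernel + shape/dtype inference in one PD_BUILD_OP.
PD_BUILD_OP(toy_identity)
    .Inputs({"X"})
    .Outputs({"Y"})
    .SetKernelFn(PD_KERNEL(ToyIdentityForward))
    .SetInferShapeFn(PD_INFER_SHAPE(ToyIdentityInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(ToyIdentityInferDtype));

// Gradient registration: separate binding, as done for prod_virial_se_a above.
PD_BUILD_GRAD_OP(toy_identity)
    .Inputs({"X", paddle::Grad("Y")})
    .Outputs({paddle::Grad("X")})
    .SetKernelFn(PD_KERNEL(ToyIdentityBackward));

Once such a source file is listed in cpp_extension.setup() (as setup_ins.py does for the paddle_* sources in this PR), the operator would presumably be importable from the built extension module and usable from Python; the exact module path depends on the build configuration and is not fixed by this sketch.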