Merge pull request #768 from brucefan1983/hip

From CUDA only to CUDA+HIP
brucefan1983 · Oct 30, 2024 · aad40ec · aad40ec
2 parents 49bda9b + 96e2b59
commit aad40ec
Show file tree

Hide file tree

Showing 113 changed files with 1,002 additions and 570 deletions.
diff --git a/src/force/dftd3.cu b/src/force/dftd3.cu
@@ -32,6 +32,7 @@ J. Comput. Chem., 32, 1456 (2011).
 #include "model/box.cuh"
 #include "neighbor.cuh"
 #include "utilities/common.cuh"
+#include "utilities/gpu_macro.cuh"
 #include <algorithm>
 #include <cctype>
 #include <iostream>
@@ -947,7 +948,7 @@ void DFTD3::compute_small_box(
     r12.data() + size_x12 * 3,
     r12.data() + size_x12 * 4,
     r12.data() + size_x12 * 5);
-  CUDA_CHECK_KERNEL
+  GPU_CHECK_KERNEL
 
   find_dftd3_coordination_number_small_box<<<(N - 1) / 64 + 1, 64>>>(
     dftd3_para,
@@ -959,7 +960,7 @@ void DFTD3::compute_small_box(
     r12.data() + size_x12 * 4,
     r12.data() + size_x12 * 5,
     cn.data());
-  CUDA_CHECK_KERNEL
+  GPU_CHECK_KERNEL
 
   add_dftd3_force_small_box<<<(N - 1) / 64 + 1, 64>>>(
     dftd3_para,
@@ -979,7 +980,7 @@ void DFTD3::compute_small_box(
     virial_per_atom.data(),
     dc6_sum.data(),
     dc8_sum.data());
-  CUDA_CHECK_KERNEL
+  GPU_CHECK_KERNEL
 
   add_dftd3_force_extra_small_box<<<(N - 1) / 64 + 1, 64>>>(
     dftd3_para,
@@ -996,7 +997,7 @@ void DFTD3::compute_small_box(
     force_per_atom.data() + N,
     force_per_atom.data() + N * 2,
     virial_per_atom.data());
-  CUDA_CHECK_KERNEL
+  GPU_CHECK_KERNEL
 }
 
 void DFTD3::compute_large_box(
@@ -1058,7 +1059,7 @@ void DFTD3::compute_large_box(
     position_per_atom.data() + N,
     position_per_atom.data() + N * 2,
     cn.data());
-  CUDA_CHECK_KERNEL
+  GPU_CHECK_KERNEL
 
   find_dftd3_force_large_box<<<(N - 1) / 64 + 1, 64>>>(
     dftd3_para,
@@ -1084,7 +1085,7 @@ void DFTD3::compute_large_box(
     virial_per_atom.data(),
     dc6_sum.data(),
     dc8_sum.data());
-  CUDA_CHECK_KERNEL
+  GPU_CHECK_KERNEL
 
   find_dftd3_force_extra_large_box<<<(N - 1) / 64 + 1, 64>>>(
     dftd3_para,
@@ -1107,7 +1108,7 @@ void DFTD3::compute_large_box(
     force_per_atom.data() + N,
     force_per_atom.data() + N * 2,
     virial_per_atom.data());
-  CUDA_CHECK_KERNEL
+  GPU_CHECK_KERNEL
 }
 
 void DFTD3::compute(

diff --git a/src/force/dftd3para.cuh b/src/force/dftd3para.cuh
@@ -15,6 +15,8 @@
 
 #pragma once
 
+#include "utilities/gpu_macro.cuh"
+
 namespace
 {
 #define Bohr 0.5291772575069165f

diff --git a/src/force/eam.cu b/src/force/eam.cu
@@ -22,6 +22,7 @@ The EAM potential. Currently two analytical versions:
 #include "eam.cuh"
 #include "neighbor.cuh"
 #include "utilities/error.cuh"
+#include "utilities/gpu_macro.cuh"
 #include <cstring>
 #define BLOCK_SIZE_FORCE 64
 
@@ -514,7 +515,7 @@ void EAM::compute(
       position_per_atom.data() + number_of_atoms * 2,
       eam_data.Fp.data(),
       potential_per_atom.data());
-    CUDA_CHECK_KERNEL
+    GPU_CHECK_KERNEL
 
     find_force_eam_step2<0><<<grid_size, BLOCK_SIZE_FORCE>>>(
       eam2004zhou,
@@ -535,7 +536,7 @@ void EAM::compute(
       force_per_atom.data() + 2 * number_of_atoms,
       virial_per_atom.data(),
       potential_per_atom.data());
-    CUDA_CHECK_KERNEL
+    GPU_CHECK_KERNEL
   }
 
   if (potential_model == 1) {
@@ -554,7 +555,7 @@ void EAM::compute(
       position_per_atom.data() + number_of_atoms * 2,
       eam_data.Fp.data(),
       potential_per_atom.data());
-    CUDA_CHECK_KERNEL
+    GPU_CHECK_KERNEL
 
     find_force_eam_step2<1><<<grid_size, BLOCK_SIZE_FORCE>>>(
       eam2004zhou,
@@ -575,6 +576,6 @@ void EAM::compute(
       force_per_atom.data() + 2 * number_of_atoms,
       virial_per_atom.data(),
       potential_per_atom.data());
-    CUDA_CHECK_KERNEL
+    GPU_CHECK_KERNEL
   }
 }
diff --git a/src/force/fcp.cu b/src/force/fcp.cu
@@ -19,6 +19,7 @@ The force constant potential (FCP)
 
 #include "fcp.cuh"
 #include "utilities/error.cuh"
+#include "utilities/gpu_macro.cuh"
 #include <cstring>
 #include <vector>
 
@@ -1038,7 +1039,7 @@ void FCP::compute(
     position_per_atom.data() + number_of_atoms * 2,
     fcp_data.r0.data(),
     fcp_data.u.data());
-  CUDA_CHECK_KERNEL
+  GPU_CHECK_KERNEL
 
   fcp_data.pfv.fill(0.0f);
 
@@ -1125,5 +1126,5 @@ void FCP::compute(
     force_per_atom.data() + 2 * number_of_atoms,
     virial_per_atom.data());
 
-  CUDA_CHECK_KERNEL
+  GPU_CHECK_KERNEL
 }
diff --git a/src/force/force.cu b/src/force/force.cu
@@ -28,6 +28,7 @@ The driver class calculating force and related quantities.
 #include "ilp_tmd_sw.cuh"
 #include "utilities/common.cuh"
 #include "utilities/error.cuh"
+#include "utilities/gpu_macro.cuh"
 #include "utilities/read_file.cuh"
 #include <cstring>
 #include <iostream>
@@ -106,7 +107,7 @@ void Force::parse_potential(
     strcmp(potential_name, "nep4_temperature") == 0 ||
     strcmp(potential_name, "nep4_zbl_temperature") == 0) {
     int num_gpus;
-    CHECK(cudaGetDeviceCount(&num_gpus));
+    CHECK(gpuGetDeviceCount(&num_gpus));
 #ifdef ZHEYONG
     num_gpus = 3;
 #endif
@@ -226,7 +227,7 @@ static __global__ void gpu_sum_force(int N, double* g_fx, double* g_fy, double*
   s_f[tid] = f;
   __syncthreads();
 
-#pragma unroll
+
   for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) {
     if (tid < offset) {
       s_f[tid] += s_f[tid + offset];
@@ -466,7 +467,7 @@ void Force::compute(
     force_per_atom.data() + number_of_atoms * 2,
     potential_per_atom.data(),
     virial_per_atom.data());
-  CUDA_CHECK_KERNEL
+  GPU_CHECK_KERNEL
 
   if (multiple_potentials_mode_.compare("observe") == 0) {
     // If observing, calculate using main potential only
@@ -516,7 +517,7 @@ void Force::compute(
       force_per_atom.data(),
       virial_per_atom.data(),
       (double)potentials.size());
-    CUDA_CHECK_KERNEL
+    GPU_CHECK_KERNEL
   } else {
     PRINT_INPUT_ERROR("Invalid mode for multiple potentials.\n");
   }
@@ -552,7 +553,7 @@ void Force::compute(
       force_per_atom.data() + number_of_atoms,
       force_per_atom.data() + 2 * number_of_atoms,
       ftot.data());
-    CUDA_CHECK_KERNEL
+    GPU_CHECK_KERNEL
 
     gpu_correct_force<<<(number_of_atoms - 1) / 128 + 1, 128>>>(
       number_of_atoms,
@@ -561,7 +562,7 @@ void Force::compute(
       force_per_atom.data() + number_of_atoms,
       force_per_atom.data() + 2 * number_of_atoms,
       ftot.data());
-    CUDA_CHECK_KERNEL
+    GPU_CHECK_KERNEL
   }
 
   // always correct the force when using the FCP potential
@@ -574,7 +575,7 @@ void Force::compute(
         force_per_atom.data() + number_of_atoms,
         force_per_atom.data() + 2 * number_of_atoms,
         ftot.data());
-      CUDA_CHECK_KERNEL
+      GPU_CHECK_KERNEL
 
       gpu_correct_force<<<(number_of_atoms - 1) / 128 + 1, 128>>>(
         number_of_atoms,
@@ -583,7 +584,7 @@ void Force::compute(
         force_per_atom.data() + number_of_atoms,
         force_per_atom.data() + 2 * number_of_atoms,
         ftot.data());
-      CUDA_CHECK_KERNEL
+      GPU_CHECK_KERNEL
     }
   }
 }
@@ -647,7 +648,7 @@ static __global__ void gpu_sum_tensor(int N, double* g_tensor, double* g_sum_ten
   s_t[tid] = t;
   __syncthreads();
 
-#pragma unroll
+
   for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1) {
     if (tid < offset) {
       s_t[tid] += s_t[tid + offset];
@@ -754,7 +755,7 @@ void Force::compute(
     force_per_atom.data() + number_of_atoms * 2,
     potential_per_atom.data(),
     virial_per_atom.data());
-  CUDA_CHECK_KERNEL
+  GPU_CHECK_KERNEL
 
   temperature += delta_T;
   if (multiple_potentials_mode_.compare("observe") == 0) {
@@ -805,7 +806,7 @@ void Force::compute(
       force_per_atom.data(),
       virial_per_atom.data(),
       (double)potentials.size());
-    CUDA_CHECK_KERNEL
+    GPU_CHECK_KERNEL
   } else {
     PRINT_INPUT_ERROR("Invalid mode for multiple potentials.\n");
   }
@@ -841,7 +842,7 @@ void Force::compute(
       force_per_atom.data() + number_of_atoms,
       force_per_atom.data() + 2 * number_of_atoms,
       ftot.data());
-    CUDA_CHECK_KERNEL
+    GPU_CHECK_KERNEL
 
     gpu_correct_force<<<(number_of_atoms - 1) / 128 + 1, 128>>>(
       number_of_atoms,
@@ -850,7 +851,7 @@ void Force::compute(
       force_per_atom.data() + number_of_atoms,
       force_per_atom.data() + 2 * number_of_atoms,
       ftot.data());
-    CUDA_CHECK_KERNEL
+    GPU_CHECK_KERNEL
   } else if (compute_hnemdec_ == 0) {
     // the tensor:
     // xx xy xz    0 3 4
@@ -876,10 +877,10 @@ void Force::compute(
       virial_per_atom.data() + 8 * number_of_atoms,
       virial_per_atom.data() + 2 * number_of_atoms,
       tensor_per_atom.data());
-    CUDA_CHECK_KERNEL
+    GPU_CHECK_KERNEL
 
     gpu_sum_tensor<<<9, 1024>>>(number_of_atoms, tensor_per_atom.data(), tensor_tot.data());
-    CUDA_CHECK_KERNEL
+    GPU_CHECK_KERNEL
 
     gpu_add_driving_force<<<(number_of_atoms - 1) / 128 + 1, 128>>>(
       number_of_atoms,
@@ -901,7 +902,7 @@ void Force::compute(
       force_per_atom.data(),
       force_per_atom.data() + number_of_atoms,
       force_per_atom.data() + 2 * number_of_atoms);
-    CUDA_CHECK_KERNEL
+    GPU_CHECK_KERNEL
 
   } else if (compute_hnemdec_ != -1) {
     gpu_add_driving_force<<<(number_of_atoms - 1) / 128 + 1, 128>>>(
@@ -926,7 +927,7 @@ void Force::compute(
         force_per_atom.data() + number_of_atoms,
         force_per_atom.data() + 2 * number_of_atoms,
         ftot.data());
-      CUDA_CHECK_KERNEL
+      GPU_CHECK_KERNEL
 
       gpu_correct_force<<<(number_of_atoms - 1) / 128 + 1, 128>>>(
         number_of_atoms,
@@ -935,7 +936,7 @@ void Force::compute(
         force_per_atom.data() + number_of_atoms,
         force_per_atom.data() + 2 * number_of_atoms,
         ftot.data());
-      CUDA_CHECK_KERNEL
+      GPU_CHECK_KERNEL
     }
   }
 }
diff --git a/src/force/force_constant.cu b/src/force/force_constant.cu
@@ -23,6 +23,7 @@ Use finite difference to calculate the second order force constants：
 #include "model/box.cuh"
 #include "model/group.cuh"
 #include "utilities/error.cuh"
+#include "utilities/gpu_macro.cuh"
 #include <vector>
 
 static __global__ void gpu_shift_atom(const double dx, double* x) { x[0] += dx; }
@@ -34,13 +35,13 @@ static void shift_atom(
 
   if (beta == 0) {
     gpu_shift_atom<<<1, 1>>>(dx, position_per_atom.data() + n2);
-    CUDA_CHECK_KERNEL
+    GPU_CHECK_KERNEL
   } else if (beta == 1) {
     gpu_shift_atom<<<1, 1>>>(dx, position_per_atom.data() + number_of_atoms + n2);
-    CUDA_CHECK_KERNEL
+    GPU_CHECK_KERNEL
   } else {
     gpu_shift_atom<<<1, 1>>>(dx, position_per_atom.data() + number_of_atoms * 2 + n2);
-    CUDA_CHECK_KERNEL
+    GPU_CHECK_KERNEL
   }
 }
 
@@ -67,10 +68,10 @@ static void get_f(
     box, position_per_atom, type, group, potential_per_atom, force_per_atom, virial_per_atom);
 
   size_t M = sizeof(double);
-  CHECK(cudaMemcpy(f + 0, force_per_atom.data() + n1, M, cudaMemcpyDeviceToHost));
-  CHECK(cudaMemcpy(f + 1, force_per_atom.data() + n1 + number_of_atoms, M, cudaMemcpyDeviceToHost));
+  CHECK(gpuMemcpy(f + 0, force_per_atom.data() + n1, M, gpuMemcpyDeviceToHost));
+  CHECK(gpuMemcpy(f + 1, force_per_atom.data() + n1 + number_of_atoms, M, gpuMemcpyDeviceToHost));
   CHECK(
-    cudaMemcpy(f + 2, force_per_atom.data() + n1 + number_of_atoms * 2, M, cudaMemcpyDeviceToHost));
+    gpuMemcpy(f + 2, force_per_atom.data() + n1 + number_of_atoms * 2, M, gpuMemcpyDeviceToHost));
 
   shift_atom(-dx, n2, beta, position_per_atom);
 }