Improve GPU performance, especially without electrostatics
LSchwiebert committed Sep 17, 2020 · 1 parent a6ad643 · commit 9136af8
Showing 2 changed files with 61 additions and 50 deletions.
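Both files make the same structural change: the real-space electrostatic buffers (gpu_REn, gpu_final_REn) and the reductions, per-thread stores, and frees that go with them now run only when electrostatic is true, so charge-free simulations skip those allocations, global-memory writes, device reductions, and copies entirely. Two smaller tweaks ride along: half-box lengths are computed by multiplying by 0.5 instead of dividing by 2.0, and temporaries in the force kernel are declared at first use.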
57 changes: 35 additions & 22 deletions src/GPU/CalculateEnergyCUDAKernel.cu
@@ -65,11 +65,13 @@ void CallBoxInterGPU(VariablesCUDA *vars,
CUMALLOC((void**) &gpu_particleCharge, particleCharge.size() * sizeof(double));
CUMALLOC((void**) &gpu_particleKind, particleKind.size() * sizeof(int));
CUMALLOC((void**) &gpu_particleMol, particleMol.size() * sizeof(int));
CUMALLOC((void**) &gpu_REn, energyVectorLen * sizeof(double));
CUMALLOC((void**) &gpu_LJEn, energyVectorLen * sizeof(double));
CUMALLOC((void**) &gpu_final_REn, sizeof(double));
CUMALLOC((void**) &gpu_final_LJEn, sizeof(double));

if (electrostatic) {
  CUMALLOC((void**) &gpu_REn, energyVectorLen * sizeof(double));
  CUMALLOC((void**) &gpu_final_REn, sizeof(double));
}

// Copy necessary data to GPU
cudaMemcpy(gpu_neighborList, &neighborlist1D[0], neighborListCount * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(gpu_cellStartIndex, &cellStartIndex[0], cellStartIndex.size() * sizeof(int), cudaMemcpyHostToDevice);
@@ -85,9 +85,9 @@ void CallBoxInterGPU(VariablesCUDA *vars,
                              boxAxes.GetAxis(box).y,
                              boxAxes.GetAxis(box).z);

double3 halfAx = make_double3(boxAxes.GetAxis(box).x / 2.0,
                              boxAxes.GetAxis(box).y / 2.0,
                              boxAxes.GetAxis(box).z / 2.0);
double3 halfAx = make_double3(boxAxes.GetAxis(box).x * 0.5,
                              boxAxes.GetAxis(box).y * 0.5,
                              boxAxes.GetAxis(box).z * 0.5);
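The halfAx change above swaps three double-precision divisions for multiplications. Floating-point division is among the slowest arithmetic operations on a GPU, while multiplying by 0.5 is a single fast instruction, and because 2 is a power of two the result is bit-identical.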

BoxInterGPU <<< blocksPerGrid, threadsPerBlock>>>(gpu_cellStartIndex,
vars->gpu_cellVector,
@@ -143,28 +145,37 @@ void CallBoxInterGPU(VariablesCUDA *vars,
// ReduceSum
void * d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_REn,
                  gpu_final_REn, energyVectorLen);
CubDebugExit(CUMALLOC(&d_temp_storage, temp_storage_bytes));
DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_REn,
                  gpu_final_REn, energyVectorLen);
// LJ ReduceSum
DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_LJEn,
                  gpu_final_LJEn, energyVectorLen);
CUFREE(d_temp_storage);
CubDebugExit(CUMALLOC(&d_temp_storage, temp_storage_bytes));
DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_LJEn,
                  gpu_final_LJEn, energyVectorLen);
// Copy back the result to CPU ! :)
CubDebugExit(cudaMemcpy(&REn, gpu_final_REn, sizeof(double),
                        cudaMemcpyDeviceToHost));
CubDebugExit(cudaMemcpy(&LJEn, gpu_final_LJEn, sizeof(double),
                        cudaMemcpyDeviceToHost));
if (electrostatic) {
  // Real Term ReduceSum
  DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_REn,
                    gpu_final_REn, energyVectorLen);
  // Copy back the result to CPU ! :)
  CubDebugExit(cudaMemcpy(&REn, gpu_final_REn, sizeof(double),
                          cudaMemcpyDeviceToHost));
}
else {
  REn = 0.0;
}
CUFREE(d_temp_storage);

CUFREE(gpu_particleCharge);
CUFREE(gpu_particleKind);
CUFREE(gpu_particleMol);
CUFREE(gpu_REn);
CUFREE(gpu_LJEn);
CUFREE(gpu_final_REn);
CUFREE(gpu_final_LJEn);
if (electrostatic) {
  CUFREE(gpu_REn);
  CUFREE(gpu_final_REn);
}
CUFREE(gpu_neighborList);
CUFREE(gpu_cellStartIndex);
}
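A note on the calling pattern above, since the doubled call can look like a bug: cub::DeviceReduce::Sum is a two-phase API. When the temp-storage pointer is NULL, the call only writes the scratch size it needs into temp_storage_bytes; the second call with the allocated buffer actually performs the reduction. A minimal self-contained sketch of the pattern in plain CUDA (illustrative names, without GOMC's CUMALLOC/CUFREE wrappers):

    #include <cub/cub.cuh>

    // Sum n doubles from d_in into the single double at d_out.
    void SumOnDevice(const double *d_in, double *d_out, int n) {
      void *d_temp_storage = NULL;
      size_t temp_storage_bytes = 0;
      // Phase 1: d_temp_storage is NULL, so this only sets temp_storage_bytes.
      cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, n);
      cudaMalloc(&d_temp_storage, temp_storage_bytes);
      // Phase 2: identical arguments; now the reduction runs.
      cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, n);
      cudaFree(d_temp_storage);
    }

Since both reductions here have the same length and element type, the commit sizes and allocates the scratch buffer once for the LJ sum, reuses it for the real term when electrostatics are on, and frees it a single time at the end.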
@@ -247,7 +258,7 @@ __global__ void BoxInterGPU(int *gpu_cellStartIndex,

if(currentParticle < neighborParticle && gpu_particleMol[currentParticle] != gpu_particleMol[neighborParticle]) {
// Check if they are within rcut
double distSq = 0;
double distSq = 0.0;

if(InRcutGPU(distSq, gpu_x, gpu_y, gpu_z,
currentParticle, neighborParticle,
@@ -264,6 +275,10 @@

double lambdaVDW = DeviceGetLambdaVDW(mA, kA, mB, kB, box, gpu_isFraction,
                                      gpu_molIndex, gpu_kindIndex, gpu_lambdaVDW);
LJEn += CalcEnGPU(distSq, kA, kB, gpu_sigmaSq, gpu_n, gpu_epsilon_Cn,
                  gpu_VDW_Kind[0], gpu_isMartini[0], gpu_rCut[0],
                  gpu_rOn[0], gpu_count[0], lambdaVDW, sc_sigma_6,
                  sc_alpha, sc_power, gpu_rMin, gpu_rMaxSq, gpu_expConst);

if(electrostatic) {
static const double qqFact = 167000.0;
@@ -278,14 +293,12 @@
sc_sigma_6, sc_alpha, sc_power, gpu_sigmaSq,
gpu_count[0]);
}
LJEn += CalcEnGPU(distSq, kA, kB, gpu_sigmaSq, gpu_n, gpu_epsilon_Cn,
                  gpu_VDW_Kind[0], gpu_isMartini[0], gpu_rCut[0],
                  gpu_rOn[0], gpu_count[0], lambdaVDW, sc_sigma_6,
                  sc_alpha, sc_power, gpu_rMin, gpu_rMaxSq, gpu_expConst);
}
}
}
gpu_REn[threadID] = REn;
if(electrostatic) {
  gpu_REn[threadID] = REn;
}
gpu_LJEn[threadID] = LJEn;
}
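On the kernel side, the LJ energy is now accumulated for every pair inside the cutoff, while the real-term accumulation and the final store to gpu_REn are both gated on electrostatic; gpu_REn is not even allocated in the LJ-only case. Because electrostatic holds the same value for every thread, the branch causes no warp divergence; it simply removes a global-memory store. A hypothetical stripped-down kernel showing the shape (names are illustrative, not GOMC's):

    __global__ void PairEnergyKernel(int n, bool electrostatic,
                                     double *gpu_LJEn, double *gpu_REn) {
      int threadID = blockIdx.x * blockDim.x + threadIdx.x;
      if (threadID >= n) return;
      double LJEn = 0.0, REn = 0.0;
      // ... accumulate LJ pair terms into LJEn, and real-space
      // electrostatic terms into REn only when electrostatic is true ...
      gpu_LJEn[threadID] = LJEn;
      if (electrostatic)          // gpu_REn may be unallocated otherwise
        gpu_REn[threadID] = REn;
    }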

Expand Down
54 changes: 26 additions & 28 deletions src/GPU/CalculateForceCUDAKernel.cu
@@ -664,16 +664,14 @@ __global__ void BoxInterForceGPU(int *gpu_cellStartIndex,

virComponents = make_double3(0.0, 0.0, 0.0);

double pRF = 0.0, qi_qj, pVF = 0.0;
double lambdaVDW = 0.0, lambdaCoulomb = 0.0;
int threadID = blockIdx.x * blockDim.x + threadIdx.x;
//tensors for VDW and real part of electrostatic
//tensors for VDW
gpu_vT11[threadID] = 0.0, gpu_vT22[threadID] = 0.0, gpu_vT33[threadID] = 0.0;
// extra tensors reserved for later on
gpu_vT12[threadID] = 0.0, gpu_vT13[threadID] = 0.0, gpu_vT23[threadID] = 0.0;

if(electrostatic) {
  //tensors for VDW and real part of electrostatic
  //tensors for real part of electrostatic
  gpu_rT11[threadID] = 0.0, gpu_rT22[threadID] = 0.0, gpu_rT33[threadID] = 0.0;
  // extra tensors reserved for later on
  gpu_rT12[threadID] = 0.0, gpu_rT13[threadID] = 0.0, gpu_rT23[threadID] = 0.0;
@@ -720,36 +718,14 @@ __global__ void BoxInterForceGPU(int *gpu_cellStartIndex,
int mA = gpu_particleMol[currentParticle];
int mB = gpu_particleMol[neighborParticle];

lambdaVDW = DeviceGetLambdaVDW(mA, kA, mB, kB, box, gpu_isFraction,
double lambdaVDW = DeviceGetLambdaVDW(mA, kA, mB, kB, box, gpu_isFraction,
                                      gpu_molIndex, gpu_kindIndex,
                                      gpu_lambdaVDW);

diff_com = Difference(gpu_comx, gpu_comy, gpu_comz, mA, mB);
diff_com = MinImageGPU(diff_com, axis, halfAx);

if(electrostatic) {
  qi_qj = cA * cB;
  lambdaCoulomb = DeviceGetLambdaCoulomb(mA, kA, mB, kB, box,
                                         gpu_isFraction, gpu_molIndex,
                                         gpu_kindIndex, gpu_lambdaCoulomb);
  pRF = CalcCoulombForceGPU(distSq, qi_qj, gpu_VDW_Kind[0], gpu_ewald[0],
                            gpu_isMartini[0], gpu_alpha[box],
                            gpu_rCutCoulomb[box], gpu_diElectric_1[0],
                            gpu_sigmaSq, sc_coul, sc_sigma_6, sc_alpha,
                            sc_power, lambdaCoulomb, gpu_count[0],
                            kA, kB);

  gpu_rT11[threadID] += pRF * (virComponents.x * diff_com.x);
  gpu_rT22[threadID] += pRF * (virComponents.y * diff_com.y);
  gpu_rT33[threadID] += pRF * (virComponents.z * diff_com.z);

  //extra tensor calculations
  gpu_rT12[threadID] += pRF * (0.5 * (virComponents.x * diff_com.y + virComponents.y * diff_com.x));
  gpu_rT13[threadID] += pRF * (0.5 * (virComponents.x * diff_com.z + virComponents.z * diff_com.x));
  gpu_rT23[threadID] += pRF * (0.5 * (virComponents.y * diff_com.z + virComponents.z * diff_com.y));
}

pVF = CalcEnForceGPU(distSq, kA, kB,
double pVF = CalcEnForceGPU(distSq, kA, kB,
                            gpu_sigmaSq, gpu_n, gpu_epsilon_Cn, gpu_rCut[0],
                            gpu_rOn[0], gpu_isMartini[0], gpu_VDW_Kind[0],
                            gpu_count[0], lambdaVDW, sc_sigma_6, sc_alpha,
@@ -763,6 +739,28 @@
gpu_vT12[threadID] += pVF * (0.5 * (virComponents.x * diff_com.y + virComponents.y * diff_com.x));
gpu_vT13[threadID] += pVF * (0.5 * (virComponents.x * diff_com.z + virComponents.z * diff_com.x));
gpu_vT23[threadID] += pVF * (0.5 * (virComponents.y * diff_com.z + virComponents.z * diff_com.y));

if(electrostatic) {
  double qi_qj = cA * cB;
  double lambdaCoulomb = DeviceGetLambdaCoulomb(mA, kA, mB, kB, box,
                                                gpu_isFraction, gpu_molIndex,
                                                gpu_kindIndex, gpu_lambdaCoulomb);
  double pRF = CalcCoulombForceGPU(distSq, qi_qj, gpu_VDW_Kind[0], gpu_ewald[0],
                                   gpu_isMartini[0], gpu_alpha[box],
                                   gpu_rCutCoulomb[box], gpu_diElectric_1[0],
                                   gpu_sigmaSq, sc_coul, sc_sigma_6, sc_alpha,
                                   sc_power, lambdaCoulomb, gpu_count[0],
                                   kA, kB);

  gpu_rT11[threadID] += pRF * (virComponents.x * diff_com.x);
  gpu_rT22[threadID] += pRF * (virComponents.y * diff_com.y);
  gpu_rT33[threadID] += pRF * (virComponents.z * diff_com.z);

  //extra tensor calculations
  gpu_rT12[threadID] += pRF * (0.5 * (virComponents.x * diff_com.y + virComponents.y * diff_com.x));
  gpu_rT13[threadID] += pRF * (0.5 * (virComponents.x * diff_com.z + virComponents.z * diff_com.x));
  gpu_rT23[threadID] += pRF * (0.5 * (virComponents.y * diff_com.z + virComponents.z * diff_com.y));
}
}
}
}
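The VDW and real-term blocks accumulate the same symmetrized virial tensor: diagonal terms are p * v_a * d_a and off-diagonal terms are p * 0.5 * (v_a * d_b + v_b * d_a), where v holds the pair separation components (virComponents) and d the molecule center-of-mass difference (diff_com). The kernel writes these sums inline; a device helper like the following is just a compact way to show the algebra (a sketch, not GOMC code):

    // Accumulate one pair's contribution p into a symmetric 3x3 virial tensor.
    __device__ void AccumulateVirial(double p, double3 v, double3 d,
                                     double &t11, double &t22, double &t33,
                                     double &t12, double &t13, double &t23) {
      t11 += p * v.x * d.x;
      t22 += p * v.y * d.y;
      t33 += p * v.z * d.z;
      t12 += p * 0.5 * (v.x * d.y + v.y * d.x);
      t13 += p * 0.5 * (v.x * d.z + v.z * d.x);
      t23 += p * 0.5 * (v.y * d.z + v.z * d.y);
    }

As in the energy kernel, the electrostatic block now follows the unconditional VDW work, so a charge-free run executes one straight-line path with no real-term loads or stores.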
