Initial performance optimizations that reduce memcpy and mallocs

GOMC-WSU · Jun 19, 2024 · ba4ecb4 · ba4ecb4
1 parent ff3f67a
commit ba4ecb4
Showing 8 changed files with 374 additions and 275 deletions.
diff --git a/src/Ewald.cpp b/src/Ewald.cpp
@@ -489,13 +489,27 @@ double Ewald::SwapDestRecip(const cbmc::TrialMol &newMol, const uint box,
     uint length = thisKind.NumAtoms();
 #ifdef GOMC_CUDA
     bool insert = true;
-    std::vector<double> MolCharge;
-    for (uint p = 0; p < length; p++) {
-      MolCharge.push_back(thisKind.AtomCharge(p));
+    std::vector<double> molCharges;
+	int charges = 0;
+    for (uint p = 0; p < length; ++p) {
+	  if (thisKind.AtomCharge(p) != 0.0) {
+        molCharges.push_back(thisKind.AtomCharge(p));
+        if (p > charges) {
+          molCoords.Set(charges, molCoords[p]);
+         }
+        charges++;
+      }
+    }
+
+    CallSwapReciprocalGPU(ff.particles->getCUDAVars(), molCoords, molCharges,
+                          imageSizeRef[box], sumRnew[box], sumInew[box],
+                          sumRref[box], sumIref[box], insert, energyRecipNew, box);
+    //If there are no charged particles, the energy doesn't change, but we need
+	//to run CallSwapReciprocalGPU to make sure the sumRnew and sumInew arrays
+	//have correct values
+	if (charges == 0) {
+      energyRecipNew = sysPotRef.boxEnergy[box].recip;
     }
-    CallSwapReciprocalGPU(ff.particles->getCUDAVars(), molCoords, MolCharge,
-                          imageSizeRef[box], sumRnew[box], sumInew[box], insert,
-                          energyRecipNew, box);
 #else
     uint startAtom = mols.MolStart(molIndex);
 #ifdef _OPENMP
@@ -668,14 +682,26 @@ double Ewald::SwapSourceRecip(const cbmc::TrialMol &oldMol, const uint box,
     uint length = thisKind.NumAtoms();
 #ifdef GOMC_CUDA
     bool insert = false;
-    std::vector<double> MolCharge;
-    for (uint p = 0; p < length; p++) {
-      MolCharge.push_back(thisKind.AtomCharge(p));
+    std::vector<double> molCharges;
+	int charges = 0;
+    for (uint p = 0; p < length; ++p) {
+	  if (thisKind.AtomCharge(p) != 0.0) {
+        molCharges.push_back(thisKind.AtomCharge(p));
+        if (p > charges) {
+          molCoords.Set(charges, molCoords[p]);
+        }
+        charges++;
+      }
+    }
+    CallSwapReciprocalGPU(ff.particles->getCUDAVars(), molCoords, molCharges,
+                          imageSizeRef[box], sumRnew[box], sumInew[box],
+                          sumRref[box], sumIref[box], insert, energyRecipNew, box);
+    //If there are no charged particles, the energy doesn't change, but we need
+	//to run CallSwapReciprocalGPU to make sure the sumRnew and sumInew arrays
+	//have correct values
+	if (charges == 0) {
+      energyRecipNew = sysPotRef.boxEnergy[box].recip;
     }
-    CallSwapReciprocalGPU(ff.particles->getCUDAVars(), molCoords, MolCharge,
-                          imageSizeRef[box], sumRnew[box], sumInew[box], insert,
-                          energyRecipNew, box);
-
 #else
     uint startAtom = mols.MolStart(molIndex);
 #ifdef _OPENMP
@@ -725,6 +751,15 @@ double Ewald::MolExchangeReciprocal(const std::vector<cbmc::TrialMol> &newMol,
 
   if (box < BOXES_WITH_U_NB) {
     GOMC_EVENT_START(1, GomcProfileEvent::RECIP_MEMC_ENERGY);
+// Because MolExchangeReciprocal does not have a matching GPU function, this is
+// a stub function to copy the GPU sumRref and sumIref vectors to the CPU in
+// order to calcuate the new sums. If this function is ported to the GPU, this
+// call should be removed.
+#ifdef GOMC_CUDA
+  CallMolExchangeReciprocalStartGPU(ff.particles->getCUDAVars(), imageSizeRef[box],
+                               sumRref[box], sumIref[box], box);
+#endif
+
     uint lengthNew, lengthOld;
     MoleculeKind const &thisKindNew = newMol[0].GetKind();
     MoleculeKind const &thisKindOld = oldMol[0].GetKind();

diff --git a/src/GPU/CalculateEnergyCUDAKernel.cu b/src/GPU/CalculateEnergyCUDAKernel.cu
@@ -40,10 +40,9 @@ void CallBoxInterGPU(VariablesCUDA *vars, const std::vector<int> &cellVector,
   int energyVectorLen;
   double *gpu_particleCharge;
   double *gpu_REn, *gpu_LJEn;
-  double *gpu_final_REn, *gpu_final_LJEn;
 
   // Run the kernel
-  threadsPerBlock = 256;
+  threadsPerBlock = 128;
   blocksPerGrid = numberOfCells * NUMBER_OF_NEIGHBOR_CELL;
   energyVectorLen = blocksPerGrid * threadsPerBlock;
 
@@ -62,10 +61,8 @@ void CallBoxInterGPU(VariablesCUDA *vars, const std::vector<int> &cellVector,
   CUMALLOC((void **)&gpu_particleKind, particleKind.size() * sizeof(int));
   CUMALLOC((void **)&gpu_particleMol, particleMol.size() * sizeof(int));
   CUMALLOC((void **)&gpu_LJEn, energyVectorLen * sizeof(double));
-  CUMALLOC((void **)&gpu_final_LJEn, sizeof(double));
   if (electrostatic) {
     CUMALLOC((void **)&gpu_REn, energyVectorLen * sizeof(double));
-    CUMALLOC((void **)&gpu_final_REn, sizeof(double));
   }
 
   // Copy necessary data to GPU
@@ -108,27 +105,29 @@ void CallBoxInterGPU(VariablesCUDA *vars, const std::vector<int> &cellVector,
       vars->gpu_Invcell_z[box], sc_coul, sc_sigma_6, sc_alpha, sc_power,
       vars->gpu_rMin, vars->gpu_rMaxSq, vars->gpu_expConst, vars->gpu_molIndex,
       vars->gpu_lambdaVDW, vars->gpu_lambdaCoulomb, vars->gpu_isFraction, box);
+#ifndef NDEBUG
   cudaDeviceSynchronize();
   checkLastErrorCUDA(__FILE__, __LINE__);
+#endif
 
   // ReduceSum
   void *d_temp_storage = NULL;
   size_t temp_storage_bytes = 0;
   // LJ ReduceSum
   DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_LJEn,
-                    gpu_final_LJEn, energyVectorLen);
+                    vars->gpu_finalVal, energyVectorLen);
   CubDebugExit(CUMALLOC(&d_temp_storage, temp_storage_bytes));
   DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_LJEn,
-                    gpu_final_LJEn, energyVectorLen);
+                    vars->gpu_finalVal, energyVectorLen);
   // Copy back the result to CPU ! :)
-  CubDebugExit(cudaMemcpy(&LJEn, gpu_final_LJEn, sizeof(double),
+  CubDebugExit(cudaMemcpy(&LJEn, vars->gpu_finalVal, sizeof(double),
                           cudaMemcpyDeviceToHost));
   if (electrostatic) {
     // Real Term ReduceSum
     DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_REn,
-                      gpu_final_REn, energyVectorLen);
+                      vars->gpu_finalVal, energyVectorLen);
     // Copy back the result to CPU ! :)
-    CubDebugExit(cudaMemcpy(&REn, gpu_final_REn, sizeof(double),
+    CubDebugExit(cudaMemcpy(&REn, vars->gpu_finalVal, sizeof(double),
                             cudaMemcpyDeviceToHost));
   } else {
     REn = 0.0;
@@ -139,10 +138,8 @@ void CallBoxInterGPU(VariablesCUDA *vars, const std::vector<int> &cellVector,
   CUFREE(gpu_particleKind);
   CUFREE(gpu_particleMol);
   CUFREE(gpu_LJEn);
-  CUFREE(gpu_final_LJEn);
   if (electrostatic) {
     CUFREE(gpu_REn);
-    CUFREE(gpu_final_REn);
   }
   CUFREE(gpu_neighborList);
   CUFREE(gpu_cellStartIndex);