From ba4ecb4caa63bbc4c82a19e6e167e238afdeb69b Mon Sep 17 00:00:00 2001
From: Loren Schwiebert
Date: Wed, 19 Jun 2024 10:22:02 -0400
Subject: [PATCH] Initial performance optimizations that reduce memcpy and mallocs

---
 src/Ewald.cpp                            |  61 ++++-
 src/GPU/CalculateEnergyCUDAKernel.cu     |  19 +-
 src/GPU/CalculateEwaldCUDAKernel.cu      | 293 ++++++++++++-----------
 src/GPU/CalculateEwaldCUDAKernel.cuh     |  30 ++-
 src/GPU/CalculateForceCUDAKernel.cu      | 129 +++++-----
 src/GPU/ConstantDefinitionsCUDAKernel.cu |  34 ++-
 src/GPU/TransformParticlesCUDAKernel.cu  |  22 +-
 src/GPU/VariablesCUDA.cuh                |  61 +++--
 8 files changed, 374 insertions(+), 275 deletions(-)

diff --git a/src/Ewald.cpp b/src/Ewald.cpp
index de23f1bc7..da70d8832 100644
--- a/src/Ewald.cpp
+++ b/src/Ewald.cpp
@@ -489,13 +489,27 @@ double Ewald::SwapDestRecip(const cbmc::TrialMol &newMol, const uint box,
   uint length = thisKind.NumAtoms();
 #ifdef GOMC_CUDA
   bool insert = true;
-  std::vector<double> MolCharge;
-  for (uint p = 0; p < length; p++) {
-    MolCharge.push_back(thisKind.AtomCharge(p));
+  std::vector<double> molCharges;
+  int charges = 0;
+  for (uint p = 0; p < length; ++p) {
+    if (thisKind.AtomCharge(p) != 0.0) {
+      molCharges.push_back(thisKind.AtomCharge(p));
+      if (p > charges) {
+        molCoords.Set(charges, molCoords[p]);
+      }
+      charges++;
+    }
+  }
+
+  CallSwapReciprocalGPU(ff.particles->getCUDAVars(), molCoords, molCharges,
+                        imageSizeRef[box], sumRnew[box], sumInew[box],
+                        sumRref[box], sumIref[box], insert, energyRecipNew, box);
+  // If there are no charged particles, the energy doesn't change, but we need
+  // to run CallSwapReciprocalGPU to make sure the sumRnew and sumInew arrays
+  // have correct values.
+  if (charges == 0) {
+    energyRecipNew = sysPotRef.boxEnergy[box].recip;
   }
-  CallSwapReciprocalGPU(ff.particles->getCUDAVars(), molCoords, MolCharge,
-                        imageSizeRef[box], sumRnew[box], sumInew[box], insert,
-                        energyRecipNew, box);
 #else
   uint startAtom = mols.MolStart(molIndex);
 #ifdef _OPENMP
@@ -668,14 +682,26 @@ double Ewald::SwapSourceRecip(const cbmc::TrialMol &oldMol, const uint box,
   uint length = thisKind.NumAtoms();
 #ifdef GOMC_CUDA
   bool insert = false;
-  std::vector<double> MolCharge;
-  for (uint p = 0; p < length; p++) {
-    MolCharge.push_back(thisKind.AtomCharge(p));
+  std::vector<double> molCharges;
+  int charges = 0;
+  for (uint p = 0; p < length; ++p) {
+    if (thisKind.AtomCharge(p) != 0.0) {
+      molCharges.push_back(thisKind.AtomCharge(p));
+      if (p > charges) {
+        molCoords.Set(charges, molCoords[p]);
+      }
+      charges++;
+    }
+  }
+  CallSwapReciprocalGPU(ff.particles->getCUDAVars(), molCoords, molCharges,
+                        imageSizeRef[box], sumRnew[box], sumInew[box],
+                        sumRref[box], sumIref[box], insert, energyRecipNew, box);
+  // If there are no charged particles, the energy doesn't change, but we need
+  // to run CallSwapReciprocalGPU to make sure the sumRnew and sumInew arrays
+  // have correct values.
+  if (charges == 0) {
+    energyRecipNew = sysPotRef.boxEnergy[box].recip;
   }
-  CallSwapReciprocalGPU(ff.particles->getCUDAVars(), molCoords, MolCharge,
-                        imageSizeRef[box], sumRnew[box], sumInew[box], insert,
-                        energyRecipNew, box);
-
 #else
   uint startAtom = mols.MolStart(molIndex);
 #ifdef _OPENMP
@@ -725,6 +751,15 @@ double Ewald::MolExchangeReciprocal(const std::vector<cbmc::TrialMol> &newMol,
   if (box < BOXES_WITH_U_NB) {
     GOMC_EVENT_START(1, GomcProfileEvent::RECIP_MEMC_ENERGY);

+// Because MolExchangeReciprocal does not have a matching GPU function, this is
+// a stub function to copy the GPU sumRref and sumIref vectors to the CPU in
+// order to calculate the new sums. If this function is ported to the GPU, this
+// call should be removed.
+#ifdef GOMC_CUDA
+    CallMolExchangeReciprocalStartGPU(ff.particles->getCUDAVars(), imageSizeRef[box],
+                                      sumRref[box], sumIref[box], box);
+#endif
+
     uint lengthNew, lengthOld;
     MoleculeKind const &thisKindNew = newMol[0].GetKind();
     MoleculeKind const &thisKindOld = oldMol[0].GetKind();
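The new loops in SwapDestRecip and SwapSourceRecip drop zero-charge atoms and
compact the surviving coordinates to the front of molCoords, so the GPU kernels
only iterate over charged particles. A minimal standalone sketch of the same
idiom, using hypothetical plain arrays in place of GOMC's XYZArray and
AtomCharge (CompactCharged is illustrative, not part of this patch):

    #include <vector>

    // Keep only charged atoms, compacting charges and coordinates in parallel
    // so the first `charges` slots hold the charged subset.
    int CompactCharged(std::vector<double> &x, std::vector<double> &y,
                       std::vector<double> &z, const std::vector<double> &charge,
                       std::vector<double> &molCharges) {
      int charges = 0;
      for (size_t p = 0; p < charge.size(); ++p) {
        if (charge[p] != 0.0) {
          molCharges.push_back(charge[p]);
          if (p > static_cast<size_t>(charges)) { // shift left over skipped zeros
            x[charges] = x[p];
            y[charges] = y[p];
            z[charges] = z[p];
          }
          ++charges;
        }
      }
      return charges; // number of charged atoms now at the front
    }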
diff --git a/src/GPU/CalculateEnergyCUDAKernel.cu b/src/GPU/CalculateEnergyCUDAKernel.cu
index ff486d6b5..e3c0c7205 100644
--- a/src/GPU/CalculateEnergyCUDAKernel.cu
+++ b/src/GPU/CalculateEnergyCUDAKernel.cu
@@ -40,10 +40,9 @@ void CallBoxInterGPU(VariablesCUDA *vars, const std::vector<int> &cellVector,
   int energyVectorLen;
   double *gpu_particleCharge;
   double *gpu_REn, *gpu_LJEn;
-  double *gpu_final_REn, *gpu_final_LJEn;

   // Run the kernel
-  threadsPerBlock = 256;
+  threadsPerBlock = 128;
   blocksPerGrid = numberOfCells * NUMBER_OF_NEIGHBOR_CELL;
   energyVectorLen = blocksPerGrid * threadsPerBlock;

@@ -62,10 +61,8 @@ void CallBoxInterGPU(VariablesCUDA *vars, const std::vector<int> &cellVector,
   CUMALLOC((void **)&gpu_particleKind, particleKind.size() * sizeof(int));
   CUMALLOC((void **)&gpu_particleMol, particleMol.size() * sizeof(int));
   CUMALLOC((void **)&gpu_LJEn, energyVectorLen * sizeof(double));
-  CUMALLOC((void **)&gpu_final_LJEn, sizeof(double));
   if (electrostatic) {
     CUMALLOC((void **)&gpu_REn, energyVectorLen * sizeof(double));
-    CUMALLOC((void **)&gpu_final_REn, sizeof(double));
   }

   // Copy necessary data to GPU
@@ -108,27 +105,29 @@ void CallBoxInterGPU(VariablesCUDA *vars, const std::vector<int> &cellVector,
       vars->gpu_Invcell_z[box], sc_coul, sc_sigma_6, sc_alpha, sc_power,
       vars->gpu_rMin, vars->gpu_rMaxSq, vars->gpu_expConst, vars->gpu_molIndex,
       vars->gpu_lambdaVDW, vars->gpu_lambdaCoulomb, vars->gpu_isFraction, box);
+#ifndef NDEBUG
   cudaDeviceSynchronize();
   checkLastErrorCUDA(__FILE__, __LINE__);
+#endif

   // ReduceSum
   void *d_temp_storage = NULL;
   size_t temp_storage_bytes = 0;
   // LJ ReduceSum
   DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_LJEn,
-                    gpu_final_LJEn, energyVectorLen);
+                    vars->gpu_finalVal, energyVectorLen);
   CubDebugExit(CUMALLOC(&d_temp_storage, temp_storage_bytes));
   DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_LJEn,
-                    gpu_final_LJEn, energyVectorLen);
+                    vars->gpu_finalVal, energyVectorLen);
   // Copy back the result to CPU ! :)
-  CubDebugExit(cudaMemcpy(&LJEn, gpu_final_LJEn, sizeof(double),
+  CubDebugExit(cudaMemcpy(&LJEn, vars->gpu_finalVal, sizeof(double),
                           cudaMemcpyDeviceToHost));
   if (electrostatic) {
     // Real Term ReduceSum
     DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_REn,
-                      gpu_final_REn, energyVectorLen);
+                      vars->gpu_finalVal, energyVectorLen);
     // Copy back the result to CPU ! :)
-    CubDebugExit(cudaMemcpy(&REn, gpu_final_REn, sizeof(double),
+    CubDebugExit(cudaMemcpy(&REn, vars->gpu_finalVal, sizeof(double),
                             cudaMemcpyDeviceToHost));
   } else {
     REn = 0.0;
@@ -139,10 +138,8 @@ void CallBoxInterGPU(VariablesCUDA *vars, const std::vector<int> &cellVector,
   CUFREE(gpu_particleKind);
   CUFREE(gpu_particleMol);
   CUFREE(gpu_LJEn);
-  CUFREE(gpu_final_LJEn);
   if (electrostatic) {
     CUFREE(gpu_REn);
-    CUFREE(gpu_final_REn);
   }
   CUFREE(gpu_neighborList);
   CUFREE(gpu_cellStartIndex);
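The recurring change throughout this patch replaces per-call CUMALLOC/CUFREE of
a one-element reduction output (and, in the Ewald paths, the cub temp storage)
with the persistent vars->gpu_finalVal and cub_reduce_storage buffers sized once
in InitEwaldVariablesCUDA. A minimal sketch of that pattern, with a hypothetical
ReduceCtx struct standing in for the relevant VariablesCUDA members:

    #include <cub/cub.cuh>

    struct ReduceCtx {          // stand-in for the VariablesCUDA members
      void *storage = nullptr;  // cub_reduce_storage
      size_t storageSize = 0;   // cub_reduce_storage_size
      double *d_out = nullptr;  // gpu_finalVal
    };

    // One-time setup: size the temp storage for the largest reduction and
    // keep it for the life of the run, instead of CUMALLOC/CUFREE per call.
    void InitReduce(ReduceCtx &ctx, const double *d_in, int maxItems) {
      cudaMalloc(&ctx.d_out, sizeof(double));
      // A call with storage == nullptr only queries the required size.
      cub::DeviceReduce::Sum(ctx.storage, ctx.storageSize, d_in, ctx.d_out, maxItems);
      cudaMalloc(&ctx.storage, ctx.storageSize);
    }

    double SumOnDevice(ReduceCtx &ctx, const double *d_in, int items) {
      // Reuse the preallocated storage; items must not exceed the maxItems
      // used at setup, which is what sizing by imageTotal guarantees here.
      cub::DeviceReduce::Sum(ctx.storage, ctx.storageSize, d_in, ctx.d_out, items);
      double result;
      cudaMemcpy(&result, ctx.d_out, sizeof(double), cudaMemcpyDeviceToHost);
      return result;
    }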
diff --git a/src/GPU/CalculateEwaldCUDAKernel.cu b/src/GPU/CalculateEwaldCUDAKernel.cu
index 5812426b6..25396a7da 100644
--- a/src/GPU/CalculateEwaldCUDAKernel.cu
+++ b/src/GPU/CalculateEwaldCUDAKernel.cu
@@ -36,46 +36,34 @@ void CallBoxReciprocalSetupGPU(VariablesCUDA *vars, XYZArray const &coords,
                                double *prefact, double *hsqr,
                                double &energyRecip, uint box) {
   double *gpu_particleCharge;
-  double *gpu_energyRecip;
-  double *gpu_final_energyRecip;
   int atomNumber = coords.Count();

   CUMALLOC((void **)&gpu_particleCharge,
            particleCharge.size() * sizeof(double));
-  CUMALLOC((void **)&gpu_energyRecip, imageSize * sizeof(double));
-  CUMALLOC((void **)&gpu_final_energyRecip, sizeof(double));

   cudaMemcpy(gpu_particleCharge, &particleCharge[0],
              particleCharge.size() * sizeof(double), cudaMemcpyHostToDevice);
-  checkLastErrorCUDA(__FILE__, __LINE__);
   cudaMemcpy(vars->gpu_x, coords.x, atomNumber * sizeof(double),
              cudaMemcpyHostToDevice);
-  checkLastErrorCUDA(__FILE__, __LINE__);
   cudaMemcpy(vars->gpu_y, coords.y, atomNumber * sizeof(double),
              cudaMemcpyHostToDevice);
-  checkLastErrorCUDA(__FILE__, __LINE__);
   cudaMemcpy(vars->gpu_z, coords.z, atomNumber * sizeof(double),
              cudaMemcpyHostToDevice);
-  checkLastErrorCUDA(__FILE__, __LINE__);
   cudaMemcpy(vars->gpu_kx[box], kx, imageSize * sizeof(double),
              cudaMemcpyHostToDevice);
-  checkLastErrorCUDA(__FILE__, __LINE__);
   cudaMemcpy(vars->gpu_ky[box], ky, imageSize * sizeof(double),
              cudaMemcpyHostToDevice);
-  checkLastErrorCUDA(__FILE__, __LINE__);
   cudaMemcpy(vars->gpu_kz[box], kz, imageSize * sizeof(double),
              cudaMemcpyHostToDevice);
-  checkLastErrorCUDA(__FILE__, __LINE__);
   cudaMemcpy(vars->gpu_prefact[box], prefact, imageSize * sizeof(double),
              cudaMemcpyHostToDevice);
-  checkLastErrorCUDA(__FILE__, __LINE__);
   cudaMemcpy(vars->gpu_hsqr[box], hsqr, imageSize * sizeof(double),
              cudaMemcpyHostToDevice);
-  checkLastErrorCUDA(__FILE__, __LINE__);
   cudaMemset(vars->gpu_sumRnew[box], 0, imageSize * sizeof(double));
-  checkLastErrorCUDA(__FILE__, __LINE__);
   cudaMemset(vars->gpu_sumInew[box], 0, imageSize * sizeof(double));
+#ifndef NDEBUG
   checkLastErrorCUDA(__FILE__, __LINE__);
+#endif

   dim3 threadsPerBlock(256, 1, 1);
   dim3 blocksPerGrid((int)(imageSize / threadsPerBlock.x) + 1,
@@ -84,37 +72,33 @@ void CallBoxReciprocalSetupGPU(VariablesCUDA *vars, XYZArray const &coords,
       vars->gpu_x, vars->gpu_y, vars->gpu_z, vars->gpu_kx[box],
       vars->gpu_ky[box], vars->gpu_kz[box], atomNumber, gpu_particleCharge,
       vars->gpu_sumRnew[box], vars->gpu_sumInew[box], imageSize);
+#ifndef NDEBUG
   cudaDeviceSynchronize();
   checkLastErrorCUDA(__FILE__, __LINE__);
+#endif

   // Need just one thread per image for this kernel.
   blocksPerGrid.y = 1;
   BoxReciprocalGPU<<<blocksPerGrid, threadsPerBlock>>>(
       vars->gpu_prefact[box], vars->gpu_sumRnew[box], vars->gpu_sumInew[box],
-      gpu_energyRecip, imageSize);
+      vars->gpu_recipEnergies, imageSize);
+#ifndef NDEBUG
   cudaDeviceSynchronize();
   checkLastErrorCUDA(__FILE__, __LINE__);
+#endif

-  cudaMemcpy(sumRnew, vars->gpu_sumRnew[box], imageSize * sizeof(double),
-             cudaMemcpyDeviceToHost);
-  cudaMemcpy(sumInew, vars->gpu_sumInew[box], imageSize * sizeof(double),
-             cudaMemcpyDeviceToHost);
+  // cudaMemcpy(sumRnew, vars->gpu_sumRnew[box], imageSize * sizeof(double),
+  //            cudaMemcpyDeviceToHost);
+  // cudaMemcpy(sumInew, vars->gpu_sumInew[box], imageSize * sizeof(double),
+  //            cudaMemcpyDeviceToHost);

   // ReduceSum
-  void *d_temp_storage = NULL;
-  size_t temp_storage_bytes = 0;
-  DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_energyRecip,
-                    gpu_final_energyRecip, imageSize);
-  CUMALLOC(&d_temp_storage, temp_storage_bytes);
-  DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_energyRecip,
-                    gpu_final_energyRecip, imageSize);
-  cudaMemcpy(&energyRecip, gpu_final_energyRecip, sizeof(double),
+  DeviceReduce::Sum(vars->cub_reduce_storage, vars->cub_reduce_storage_size,
+                    vars->gpu_recipEnergies, vars->gpu_finalVal, imageSize);
+  cudaMemcpy(&energyRecip, vars->gpu_finalVal, sizeof(double),
              cudaMemcpyDeviceToHost);

   CUFREE(gpu_particleCharge);
-  CUFREE(gpu_energyRecip);
-  CUFREE(gpu_final_energyRecip);
-  CUFREE(d_temp_storage);
 }
 // Use this function when calculating the reciprocal terms
@@ -124,31 +108,24 @@ void CallBoxReciprocalSumsGPU(VariablesCUDA *vars, XYZArray const &coords,
                               uint imageSize, double *sumRnew, double *sumInew,
                               double &energyRecip, uint box) {
   double *gpu_particleCharge;
-  double *gpu_energyRecip;
-  double *gpu_final_energyRecip;
   int atomNumber = coords.Count();

   CUMALLOC((void **)&gpu_particleCharge,
            particleCharge.size() * sizeof(double));
-  CUMALLOC((void **)&gpu_energyRecip, imageSize * sizeof(double));
-  CUMALLOC((void **)&gpu_final_energyRecip, sizeof(double));

   cudaMemcpy(gpu_particleCharge, &particleCharge[0],
              particleCharge.size() * sizeof(double), cudaMemcpyHostToDevice);
-  checkLastErrorCUDA(__FILE__, __LINE__);
   cudaMemcpy(vars->gpu_x, coords.x, atomNumber * sizeof(double),
              cudaMemcpyHostToDevice);
-  checkLastErrorCUDA(__FILE__, __LINE__);
   cudaMemcpy(vars->gpu_y, coords.y, atomNumber * sizeof(double),
              cudaMemcpyHostToDevice);
-  checkLastErrorCUDA(__FILE__, __LINE__);
   cudaMemcpy(vars->gpu_z, coords.z, atomNumber * sizeof(double),
              cudaMemcpyHostToDevice);
-  checkLastErrorCUDA(__FILE__, __LINE__);
   cudaMemset(vars->gpu_sumRnew[box], 0, imageSize * sizeof(double));
-  checkLastErrorCUDA(__FILE__, __LINE__);
   cudaMemset(vars->gpu_sumInew[box], 0, imageSize * sizeof(double));
+#ifndef NDEBUG
   checkLastErrorCUDA(__FILE__, __LINE__);
+#endif

   dim3 threadsPerBlock(256, 1, 1);
   dim3 blocksPerGrid((int)(imageSize / threadsPerBlock.x) + 1,
@@ -158,37 +135,33 @@ void CallBoxReciprocalSumsGPU(VariablesCUDA *vars, XYZArray const &coords,
       vars->gpu_kyRef[box], vars->gpu_kzRef[box], atomNumber,
       gpu_particleCharge, vars->gpu_sumRnew[box], vars->gpu_sumInew[box],
       imageSize);
+#ifndef NDEBUG
   cudaDeviceSynchronize();
   checkLastErrorCUDA(__FILE__, __LINE__);
+#endif

   // Need just one thread per image for this kernel.
   blocksPerGrid.y = 1;
   BoxReciprocalGPU<<<blocksPerGrid, threadsPerBlock>>>(
       vars->gpu_prefactRef[box], vars->gpu_sumRnew[box], vars->gpu_sumInew[box],
-      gpu_energyRecip, imageSize);
+      vars->gpu_recipEnergies, imageSize);
+#ifndef NDEBUG
   cudaDeviceSynchronize();
   checkLastErrorCUDA(__FILE__, __LINE__);
+#endif

-  cudaMemcpy(sumRnew, vars->gpu_sumRnew[box], imageSize * sizeof(double),
-             cudaMemcpyDeviceToHost);
-  cudaMemcpy(sumInew, vars->gpu_sumInew[box], imageSize * sizeof(double),
-             cudaMemcpyDeviceToHost);
+  // cudaMemcpy(sumRnew, vars->gpu_sumRnew[box], imageSize * sizeof(double),
+  //            cudaMemcpyDeviceToHost);
+  // cudaMemcpy(sumInew, vars->gpu_sumInew[box], imageSize * sizeof(double),
+  //            cudaMemcpyDeviceToHost);

   // ReduceSum
-  void *d_temp_storage = NULL;
-  size_t temp_storage_bytes = 0;
-  DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_energyRecip,
-                    gpu_final_energyRecip, imageSize);
-  CUMALLOC(&d_temp_storage, temp_storage_bytes);
-  DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_energyRecip,
-                    gpu_final_energyRecip, imageSize);
-  cudaMemcpy(&energyRecip, gpu_final_energyRecip, sizeof(double),
+  DeviceReduce::Sum(vars->cub_reduce_storage, vars->cub_reduce_storage_size,
+                    vars->gpu_recipEnergies, vars->gpu_finalVal, imageSize);
+  cudaMemcpy(&energyRecip, vars->gpu_finalVal, sizeof(double),
              cudaMemcpyDeviceToHost);

   CUFREE(gpu_particleCharge);
-  CUFREE(gpu_energyRecip);
-  CUFREE(gpu_final_energyRecip);
-  CUFREE(d_temp_storage);
 }
 __global__ void BoxReciprocalSumsGPU(double *gpu_x, double *gpu_y,
@@ -233,13 +206,13 @@ __global__ void BoxReciprocalSumsGPU(double *gpu_x, double *gpu_y,
 }

 __global__ void BoxReciprocalGPU(double *gpu_prefact, double *gpu_sumRnew,
-                                 double *gpu_sumInew, double *gpu_energyRecip,
+                                 double *gpu_sumInew, double *gpu_recipEnergies,
                                  int imageSize) {
   int threadID = blockIdx.x * blockDim.x + threadIdx.x;
   if (threadID >= imageSize)
     return;

-  gpu_energyRecip[threadID] = ((gpu_sumRnew[threadID] * gpu_sumRnew[threadID] +
+  gpu_recipEnergies[threadID] = ((gpu_sumRnew[threadID] * gpu_sumRnew[threadID] +
                                  gpu_sumInew[threadID] * gpu_sumInew[threadID]) *
                                 gpu_prefact[threadID]);
 }

@@ -254,12 +227,9 @@ void CallMolReciprocalGPU(VariablesCUDA *vars, XYZArray const &currentCoords,
   int newCoordsNumber = newCoords.Count();
   double *gpu_particleCharge;
   int blocksPerGrid, threadsPerBlock;
-  double *gpu_energyRecipNew, *gpu_final_energyRecipNew;

   CUMALLOC((void **)&gpu_particleCharge,
            particleCharge.size() * sizeof(double));
-  CUMALLOC((void **)&gpu_energyRecipNew, imageSize * sizeof(double));
-  CUMALLOC((void **)&gpu_final_energyRecipNew, sizeof(double));

   cudaMemcpy(gpu_particleCharge, &particleCharge[0],
              particleCharge.size() * sizeof(double), cudaMemcpyHostToDevice);
@@ -278,38 +248,32 @@ void CallMolReciprocalGPU(VariablesCUDA *vars, XYZArray const &currentCoords,
   cudaMemcpy(vars->gpu_nz, newCoords.z, newCoordsNumber * sizeof(double),
              cudaMemcpyHostToDevice);

-  threadsPerBlock = 256;
-  blocksPerGrid = (int)(imageSize / threadsPerBlock) + 1;
+  threadsPerBlock = 128;
+  blocksPerGrid = (imageSize + threadsPerBlock - 1) / threadsPerBlock;
   MolReciprocalGPU<<<blocksPerGrid, threadsPerBlock>>>(
       vars->gpu_x, vars->gpu_y, vars->gpu_z, vars->gpu_nx, vars->gpu_ny,
       vars->gpu_nz, vars->gpu_kxRef[box], vars->gpu_kyRef[box],
       vars->gpu_kzRef[box], atomNumber, gpu_particleCharge,
       vars->gpu_sumRnew[box], vars->gpu_sumInew[box], vars->gpu_sumRref[box],
-      vars->gpu_sumIref[box], vars->gpu_prefactRef[box], gpu_energyRecipNew,
+      vars->gpu_sumIref[box], vars->gpu_prefactRef[box], vars->gpu_recipEnergies,
       imageSize);
+#ifndef NDEBUG
   cudaDeviceSynchronize();
   checkLastErrorCUDA(__FILE__, __LINE__);
+#endif

-  cudaMemcpy(sumRnew, vars->gpu_sumRnew[box], imageSize * sizeof(double),
-             cudaMemcpyDeviceToHost);
-  cudaMemcpy(sumInew, vars->gpu_sumInew[box], imageSize * sizeof(double),
-             cudaMemcpyDeviceToHost);
+  // cudaMemcpy(sumRnew, vars->gpu_sumRnew[box], imageSize * sizeof(double),
+  //            cudaMemcpyDeviceToHost);
+  // cudaMemcpy(sumInew, vars->gpu_sumInew[box], imageSize * sizeof(double),
+  //            cudaMemcpyDeviceToHost);

   // ReduceSum
-  void *d_temp_storage = NULL;
-  size_t temp_storage_bytes = 0;
-  DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_energyRecipNew,
-                    gpu_final_energyRecipNew, imageSize);
-  CUMALLOC(&d_temp_storage, temp_storage_bytes);
-  DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_energyRecipNew,
-                    gpu_final_energyRecipNew, imageSize);
-  cudaMemcpy(&energyRecipNew, gpu_final_energyRecipNew, sizeof(double),
+  DeviceReduce::Sum(vars->cub_reduce_storage, vars->cub_reduce_storage_size,
+                    vars->gpu_recipEnergies, vars->gpu_finalVal, imageSize);
+  cudaMemcpy(&energyRecipNew, vars->gpu_finalVal, sizeof(double),
              cudaMemcpyDeviceToHost);

   CUFREE(gpu_particleCharge);
-  CUFREE(gpu_energyRecipNew);
-  CUFREE(gpu_final_energyRecipNew);
-  CUFREE(d_temp_storage);
 }
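The launch computations here also switch from (int)(imageSize / threadsPerBlock) + 1
to the round-up form. The two differ exactly when imageSize is a multiple of the
block size, where the old form launches one fully idle extra block. A quick check:

    // Old vs. new grid sizing for imageSize = 256, threadsPerBlock = 128:
    int oldBlocks = (256 / 128) + 1;        // 3 blocks: one runs fully idle
    int newBlocks = (256 + 128 - 1) / 128;  // 2 blocks: exact fit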
 // Calculate reciprocal term for lambdaNew and Old with same coordinates
@@ -323,12 +287,9 @@ void CallChangeLambdaMolReciprocalGPU(VariablesCUDA *vars,
   int atomNumber = coords.Count();
   double *gpu_particleCharge;
   int blocksPerGrid, threadsPerBlock;
-  double *gpu_energyRecipNew, *gpu_final_energyRecipNew;

   CUMALLOC((void **)&gpu_particleCharge,
            particleCharge.size() * sizeof(double));
-  CUMALLOC((void **)&gpu_energyRecipNew, imageSize * sizeof(double));
-  CUMALLOC((void **)&gpu_final_energyRecipNew, sizeof(double));

   cudaMemcpy(gpu_particleCharge, &particleCharge[0],
              particleCharge.size() * sizeof(double), cudaMemcpyHostToDevice);
@@ -347,49 +308,46 @@ void CallChangeLambdaMolReciprocalGPU(VariablesCUDA *vars,
       vars->gpu_kyRef[box], vars->gpu_kzRef[box], atomNumber,
       gpu_particleCharge, vars->gpu_sumRnew[box], vars->gpu_sumInew[box],
       vars->gpu_sumRref[box], vars->gpu_sumIref[box], vars->gpu_prefactRef[box],
-      gpu_energyRecipNew, lambdaCoef, imageSize);
-
+      vars->gpu_recipEnergies, lambdaCoef, imageSize);
+#ifndef NDEBUG
   cudaDeviceSynchronize();
   checkLastErrorCUDA(__FILE__, __LINE__);
+#endif

-  cudaMemcpy(sumRnew, vars->gpu_sumRnew[box], imageSize * sizeof(double),
-             cudaMemcpyDeviceToHost);
-  cudaMemcpy(sumInew, vars->gpu_sumInew[box], imageSize * sizeof(double),
-             cudaMemcpyDeviceToHost);
+  // cudaMemcpy(sumRnew, vars->gpu_sumRnew[box], imageSize * sizeof(double),
+  //            cudaMemcpyDeviceToHost);
+  // cudaMemcpy(sumInew, vars->gpu_sumInew[box], imageSize * sizeof(double),
+  //            cudaMemcpyDeviceToHost);

   // ReduceSum
-  void *d_temp_storage = NULL;
-  size_t temp_storage_bytes = 0;
-  DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_energyRecipNew,
-                    gpu_final_energyRecipNew, imageSize);
-  CUMALLOC(&d_temp_storage, temp_storage_bytes);
-  DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_energyRecipNew,
-                    gpu_final_energyRecipNew, imageSize);
-  cudaMemcpy(&energyRecipNew, gpu_final_energyRecipNew, sizeof(double),
+  DeviceReduce::Sum(vars->cub_reduce_storage, vars->cub_reduce_storage_size,
+                    vars->gpu_recipEnergies, vars->gpu_finalVal, imageSize);
+  cudaMemcpy(&energyRecipNew, vars->gpu_finalVal, sizeof(double),
              cudaMemcpyDeviceToHost);

   CUFREE(gpu_particleCharge);
-  CUFREE(gpu_energyRecipNew);
-  CUFREE(gpu_final_energyRecipNew);
-  CUFREE(d_temp_storage);
 }
 void CallSwapReciprocalGPU(VariablesCUDA *vars, XYZArray const &coords,
                            const std::vector<double> &particleCharge,
                            uint imageSize, double *sumRnew, double *sumInew,
-                           const bool insert, double &energyRecipNew,
-                           uint box) {
+                           double *sumRref, double *sumIref, const bool insert,
+                           double &energyRecipNew, uint box) {
+  // If there are no charged particles in this molecule, skip the computation
+  if (particleCharge.size() == 0) {
+    CopyRefToNewCUDA(vars, box, imageSize);
+    // std::memcpy(sumRnew, sumRref, sizeof(double) * imageSize);
+    // std::memcpy(sumInew, sumIref, sizeof(double) * imageSize);
+    return;
+  }
   // Calculate atom number
   int atomNumber = coords.Count(); // given coordinates
   double *gpu_particleCharge;
   int blocksPerGrid, threadsPerBlock;
-  double *gpu_energyRecipNew, *gpu_final_energyRecipNew;

   CUMALLOC((void **)&gpu_particleCharge,
            particleCharge.size() * sizeof(double));
-  CUMALLOC((void **)&gpu_energyRecipNew, imageSize * sizeof(double));
-  CUMALLOC((void **)&gpu_final_energyRecipNew, sizeof(double));

   cudaMemcpy(gpu_particleCharge, &particleCharge[0],
              particleCharge.size() * sizeof(double), cudaMemcpyHostToDevice);
@@ -401,39 +359,32 @@ void CallSwapReciprocalGPU(VariablesCUDA *vars, XYZArray const &coords,
   cudaMemcpy(vars->gpu_z, coords.z, atomNumber * sizeof(double),
              cudaMemcpyHostToDevice);

-  threadsPerBlock = 256;
-  blocksPerGrid = (int)(imageSize / threadsPerBlock) + 1;
+  threadsPerBlock = 128;
+  blocksPerGrid = (imageSize + threadsPerBlock - 1) / threadsPerBlock;
+  // NewSwapReciprocalGPU<<<blocksPerGrid, threadsPerBlock>>>(
+  //     vars, atomNumber, box, gpu_particleCharge, insert, imageSize);
   SwapReciprocalGPU<<<blocksPerGrid, threadsPerBlock>>>(
       vars->gpu_x, vars->gpu_y, vars->gpu_z, vars->gpu_kxRef[box],
-      vars->gpu_kyRef[box], vars->gpu_kzRef[box], atomNumber,
+      vars->gpu_kyRef[box], vars->gpu_kzRef[box], particleCharge.size(),
       gpu_particleCharge, vars->gpu_sumRnew[box], vars->gpu_sumInew[box],
       vars->gpu_sumRref[box], vars->gpu_sumIref[box], vars->gpu_prefactRef[box],
-      insert, gpu_energyRecipNew, imageSize);
+      insert, vars->gpu_recipEnergies, imageSize);
+#ifndef NDEBUG
   cudaDeviceSynchronize();
   checkLastErrorCUDA(__FILE__, __LINE__);
-  //#ifndef NDEBUG
-  // In the future maybe we could remove this for Nondebug?
-  cudaMemcpy(sumRnew, vars->gpu_sumRnew[box], imageSize * sizeof(double),
-             cudaMemcpyDeviceToHost);
-  cudaMemcpy(sumInew, vars->gpu_sumInew[box], imageSize * sizeof(double),
-             cudaMemcpyDeviceToHost);
-  //#endif
+#endif
+  // cudaMemcpy(sumRnew, vars->gpu_sumRnew[box], imageSize * sizeof(double),
+  //            cudaMemcpyDeviceToHost);
+  // cudaMemcpy(sumInew, vars->gpu_sumInew[box], imageSize * sizeof(double),
+  //            cudaMemcpyDeviceToHost);

   // ReduceSum
-  void *d_temp_storage = NULL;
-  size_t temp_storage_bytes = 0;
-  DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_energyRecipNew,
-                    gpu_final_energyRecipNew, imageSize);
-  CUMALLOC(&d_temp_storage, temp_storage_bytes);
-  DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_energyRecipNew,
-                    gpu_final_energyRecipNew, imageSize);
-  cudaMemcpy(&energyRecipNew, gpu_final_energyRecipNew, sizeof(double),
+  DeviceReduce::Sum(vars->cub_reduce_storage, vars->cub_reduce_storage_size,
+                    vars->gpu_recipEnergies, vars->gpu_finalVal, imageSize);
+  cudaMemcpy(&energyRecipNew, vars->gpu_finalVal, sizeof(double),
              cudaMemcpyDeviceToHost);

   CUFREE(gpu_particleCharge);
-  CUFREE(gpu_energyRecipNew);
-  CUFREE(gpu_final_energyRecipNew);
-  CUFREE(d_temp_storage);
 }
 void CallMolExchangeReciprocalGPU(VariablesCUDA *vars, uint imageSize,
@@ -444,6 +395,14 @@ void CallMolExchangeReciprocalGPU(VariablesCUDA *vars, uint imageSize,
                                   cudaMemcpyHostToDevice);
 }

+void CallMolExchangeReciprocalStartGPU(VariablesCUDA *vars, uint imageSize,
+                                       double *sumRref, double *sumIref,
+                                       uint box) {
+  cudaMemcpy(sumRref, vars->gpu_sumRref[box], imageSize * sizeof(double),
+             cudaMemcpyDeviceToHost);
+  cudaMemcpy(sumIref, vars->gpu_sumIref[box], imageSize * sizeof(double),
+             cudaMemcpyDeviceToHost);
+}
+
 void CallBoxForceReciprocalGPU(
     VariablesCUDA *vars, XYZArray &atomForceRec, XYZArray &molForceRec,
     const std::vector<double> &particleCharge,
@@ -512,8 +471,10 @@ void CallBoxForceReciprocalGPU(
   cudaMemcpy(gpu_lengthMol, &lengthMol[0], sizeof(int) * lengthMol.size(),
              cudaMemcpyHostToDevice);
-
+#ifndef NDEBUG
   checkLastErrorCUDA(__FILE__, __LINE__);
+#endif
+
   BoxForceReciprocalGPU<<<blocksPerGrid, threadsPerBlock>>>(
       vars->gpu_aForceRecx, vars->gpu_aForceRecy, vars->gpu_aForceRecz,
       vars->gpu_mForceRecx, vars->gpu_mForceRecy, vars->gpu_mForceRecz,
@@ -527,8 +488,10 @@ void CallBoxForceReciprocalGPU(
       vars->gpu_Invcell_x[box], vars->gpu_Invcell_y[box],
       vars->gpu_Invcell_z[box], vars->gpu_nonOrth, boxAxes.GetAxis(box).x,
       boxAxes.GetAxis(box).y, boxAxes.GetAxis(box).z, box, atomCount);
+#ifndef NDEBUG
   cudaDeviceSynchronize();
   checkLastErrorCUDA(__FILE__, __LINE__);
+#endif

   cudaMemcpy(atomForceRec.x, vars->gpu_aForceRecx, sizeof(double) * atomCount,
              cudaMemcpyDeviceToHost);
@@ -543,7 +506,9 @@ void CallBoxForceReciprocalGPU(
   cudaMemcpy(molForceRec.z, vars->gpu_mForceRecz, sizeof(double) * molCount,
              cudaMemcpyDeviceToHost);

+#ifndef NDEBUG
   cudaDeviceSynchronize();
+#endif
   delete[] arr_particleHasNoCharge;
   CUFREE(gpu_particleCharge);
   CUFREE(gpu_particleHasNoCharge);
@@ -647,6 +612,42 @@ __global__ void BoxForceReciprocalGPU(
   atomicAdd(&gpu_mForceRecz[moleculeID], forceZ);
 }

+__global__ void NewSwapReciprocalGPU(VariablesCUDA *vars, int atomNumber,
+                                     uint box, double *gpu_particleCharge,
+                                     const bool insert, int imageSize) {
+  int threadID = blockIdx.x * blockDim.x + threadIdx.x;
+  if (threadID >= imageSize) return;
+
+  double sumReal = 0.0, sumImaginary = 0.0;
+
+  #pragma unroll 4
+  for (int p = 0; p < atomNumber; ++p) {
+    double dotProduct =
+        DotProductGPU((vars->gpu_kx[box])[threadID],
+                      (vars->gpu_ky[box])[threadID], (vars->gpu_kz[box])[threadID],
+                      vars->gpu_x[p], vars->gpu_y[p], vars->gpu_z[p]);
+    double dotsin, dotcos;
+    sincos(dotProduct, &dotsin, &dotcos);
+    sumReal += (gpu_particleCharge[p] * dotcos);
+    sumImaginary += (gpu_particleCharge[p] * dotsin);
+  }
+
+  // If we insert the molecule into the box, we add the sum value.
+  // Otherwise, we subtract the sum value.
+  if (insert) {
+    (vars->gpu_sumRnew[box])[threadID] = (vars->gpu_sumRref[box])[threadID] + sumReal;
+    (vars->gpu_sumInew[box])[threadID] = (vars->gpu_sumIref[box])[threadID] + sumImaginary;
+  } else {
+    (vars->gpu_sumRnew[box])[threadID] = (vars->gpu_sumRref[box])[threadID] - sumReal;
+    (vars->gpu_sumInew[box])[threadID] = (vars->gpu_sumIref[box])[threadID] - sumImaginary;
+  }
+
+  vars->gpu_recipEnergies[threadID] =
+      (((vars->gpu_sumRnew[box])[threadID] * (vars->gpu_sumRnew[box])[threadID] +
+        (vars->gpu_sumInew[box])[threadID] * (vars->gpu_sumInew[box])[threadID]) *
+       (vars->gpu_prefactRef[box])[threadID]);
+}
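One caution on the new (still disabled) NewSwapReciprocalGPU path: this kernel
dereferences the VariablesCUDA struct itself on the device, while the `vars`
passed around elsewhere is a host-side struct holding device pointers, and its
per-box pointer tables (gpu_kx and friends, freed with delete[] in
DestroyEwaldCUDAVars) live in host memory. The commented-out launch in
CallSwapReciprocalGPU can only be enabled once a device-visible copy of the
struct and those tables exists. A heavily hedged sketch of one possible setup
(gpu_vars is hypothetical and not part of this patch):

    // Hypothetical: mirror the struct and its per-box pointer tables into
    // managed memory so both host and device can dereference them.
    VariablesCUDA *gpu_vars;
    cudaMallocManaged(&gpu_vars, sizeof(VariablesCUDA));
    *gpu_vars = *vars;  // shallow copy; members are already device pointers
    cudaMallocManaged(&gpu_vars->gpu_kx, BOX_TOTAL * sizeof(double *));
    memcpy(gpu_vars->gpu_kx, vars->gpu_kx, BOX_TOTAL * sizeof(double *));
    // ...repeat for gpu_ky, gpu_kz, gpu_sumRnew, gpu_sumInew, gpu_sumRref,
    // gpu_sumIref, and gpu_prefactRef before enabling the kernel launch.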
 __global__ void SwapReciprocalGPU(double *gpu_x, double *gpu_y, double *gpu_z,
                                   double *gpu_kx, double *gpu_ky,
                                   double *gpu_kz, int atomNumber,
@@ -654,14 +655,15 @@ __global__ void SwapReciprocalGPU(double *gpu_x, double *gpu_y, double *gpu_z,
                                   double *gpu_sumRnew, double *gpu_sumInew,
                                   double *gpu_sumRref, double *gpu_sumIref,
                                   double *gpu_prefactRef, const bool insert,
-                                  double *gpu_energyRecipNew, int imageSize) {
+                                  double *gpu_recipEnergies, int imageSize) {
+
   int threadID = blockIdx.x * blockDim.x + threadIdx.x;
-  if (threadID >= imageSize)
-    return;
+  if (threadID >= imageSize) return;

   double sumReal = 0.0, sumImaginary = 0.0;

-  for (int p = 0; p < atomNumber; p++) {
+  #pragma unroll 4
+  for (int p = 0; p < atomNumber; ++p) {
     double dotProduct =
         DotProductGPU(gpu_kx[threadID], gpu_ky[threadID], gpu_kz[threadID],
                       gpu_x[p], gpu_y[p], gpu_z[p]);
@@ -671,7 +673,7 @@ __global__ void SwapReciprocalGPU(double *gpu_x, double *gpu_y, double *gpu_z,
     sumImaginary += (gpu_particleCharge[p] * dotsin);
   }

-  // If we insert the molecule to the box, we add the sum value.
+  // If we insert the molecule into the box, we add the sum value.
   // Otherwise, we subtract the sum value
   if (insert) {
     gpu_sumRnew[threadID] = gpu_sumRref[threadID] + sumReal;
@@ -681,7 +683,7 @@ __global__ void SwapReciprocalGPU(double *gpu_x, double *gpu_y, double *gpu_z,
     gpu_sumInew[threadID] = gpu_sumIref[threadID] - sumImaginary;
   }

-  gpu_energyRecipNew[threadID] =
+  gpu_recipEnergies[threadID] =
       ((gpu_sumRnew[threadID] * gpu_sumRnew[threadID] +
         gpu_sumInew[threadID] * gpu_sumInew[threadID]) *
        gpu_prefactRef[threadID]);
 }
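The `#pragma unroll 4` added to these charge loops requests a partial, 4-way
unroll even though atomNumber is only known at run time; the compiler emits a
main loop that handles four atoms per iteration plus a remainder loop.
Schematically (accumulate() is a hypothetical stand-in for the sincos/charge
update in the kernel bodies):

    int p = 0;
    for (; p + 3 < atomNumber; p += 4) {  // main body: four atoms per trip
      accumulate(p); accumulate(p + 1); accumulate(p + 2); accumulate(p + 3);
    }
    for (; p < atomNumber; ++p)           // remainder: up to three leftovers
      accumulate(p);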
@@ -694,15 +696,15 @@ __global__ void MolReciprocalGPU(double *gpu_cx, double *gpu_cy, double *gpu_cz,
                                  double *gpu_sumRnew, double *gpu_sumInew,
                                  double *gpu_sumRref, double *gpu_sumIref,
                                  double *gpu_prefactRef,
-                                 double *gpu_energyRecipNew, int imageSize) {
+                                 double *gpu_recipEnergies, int imageSize) {
   int threadID = blockIdx.x * blockDim.x + threadIdx.x;
   if (threadID >= imageSize)
     return;
-  double sumRealOld = 0.0, sumImaginaryOld = 0.0;
-  double sumRealNew = 0.0, sumImaginaryNew = 0.0;
+  double sumReal = 0.0, sumImaginary = 0.0;

-  for (int p = 0; p < atomNumber; p++) {
+  #pragma unroll 4
+  for (int p = 0; p < atomNumber; ++p) {
     double dotProductOld =
         DotProductGPU(gpu_kx[threadID], gpu_ky[threadID], gpu_kz[threadID],
                       gpu_cx[p], gpu_cy[p], gpu_cz[p]);
@@ -711,19 +713,18 @@ __global__ void MolReciprocalGPU(double *gpu_cx, double *gpu_cy, double *gpu_cz,
                       gpu_nx[p], gpu_ny[p], gpu_nz[p]);
     double oldsin, oldcos;
     sincos(dotProductOld, &oldsin, &oldcos);
-    sumRealOld += (gpu_particleCharge[p] * oldcos);
-    sumImaginaryOld += (gpu_particleCharge[p] * oldsin);
+    sumReal -= (gpu_particleCharge[p] * oldcos);
+    sumImaginary -= (gpu_particleCharge[p] * oldsin);
     double newsin, newcos;
     sincos(dotProductNew, &newsin, &newcos);
-    sumRealNew += (gpu_particleCharge[p] * newcos);
-    sumImaginaryNew += (gpu_particleCharge[p] * newsin);
+    sumReal += (gpu_particleCharge[p] * newcos);
+    sumImaginary += (gpu_particleCharge[p] * newsin);
   }

-  gpu_sumRnew[threadID] = gpu_sumRref[threadID] - sumRealOld + sumRealNew;
-  gpu_sumInew[threadID] =
-      gpu_sumIref[threadID] - sumImaginaryOld + sumImaginaryNew;
+  gpu_sumRnew[threadID] = gpu_sumRref[threadID] + sumReal;
+  gpu_sumInew[threadID] = gpu_sumIref[threadID] + sumImaginary;

-  gpu_energyRecipNew[threadID] =
+  gpu_recipEnergies[threadID] =
       ((gpu_sumRnew[threadID] * gpu_sumRnew[threadID] +
         gpu_sumInew[threadID] * gpu_sumInew[threadID]) *
        gpu_prefactRef[threadID]);
@@ -733,7 +734,7 @@ __global__ void ChangeLambdaMolReciprocalGPU(
     double *gpu_x, double *gpu_y, double *gpu_z, double *gpu_kx, double *gpu_ky,
     double *gpu_kz, int atomNumber, double *gpu_particleCharge,
     double *gpu_sumRnew, double *gpu_sumInew, double *gpu_sumRref,
-    double *gpu_sumIref, double *gpu_prefactRef, double *gpu_energyRecipNew,
+    double *gpu_sumIref, double *gpu_prefactRef, double *gpu_recipEnergies,
     double lambdaCoef, int imageSize) {
   int threadID = blockIdx.x * blockDim.x + threadIdx.x;
   if (threadID >= imageSize)
@@ -754,7 +755,7 @@ __global__ void ChangeLambdaMolReciprocalGPU(
   gpu_sumRnew[threadID] = gpu_sumRref[threadID] + lambdaCoef * sumRealNew;
   gpu_sumInew[threadID] = gpu_sumIref[threadID] + lambdaCoef * sumImaginaryNew;

-  gpu_energyRecipNew[threadID] =
+  gpu_recipEnergies[threadID] =
       ((gpu_sumRnew[threadID] * gpu_sumRnew[threadID] +
         gpu_sumInew[threadID] * gpu_sumInew[threadID]) *
        gpu_prefactRef[threadID]);
diff --git a/src/GPU/CalculateEwaldCUDAKernel.cuh b/src/GPU/CalculateEwaldCUDAKernel.cuh
index 16a3fc532..a4fb130f2 100644
--- a/src/GPU/CalculateEwaldCUDAKernel.cuh
+++ b/src/GPU/CalculateEwaldCUDAKernel.cuh
@@ -61,7 +61,7 @@ void CallMolReciprocalGPU(VariablesCUDA *vars,
                           uint imageSize,
                           double *sumRnew,
                           double *sumInew,
-                          double &energyRecipNew,
+                          double &energyRecip,
                           uint box);

 //Calculate reciprocal term for lambdaNew and Old with same coordinates
@@ -71,7 +71,7 @@ void CallChangeLambdaMolReciprocalGPU(VariablesCUDA *vars,
                                       uint imageSize,
                                       double *sumRnew,
                                       double *sumInew,
-                                      double &energyRecipNew,
+                                      double &energyRecip,
                                       const double lambdaCoef,
                                       uint box);

@@ -81,8 +81,10 @@ void CallSwapReciprocalGPU(VariablesCUDA *vars,
                            uint imageSize,
                            double *sumRnew,
                            double *sumInew,
+                           double *sumRref,
+                           double *sumIref,
                            const bool insert,
-                           double &energyRecipNew,
+                           double &energyRecip,
                            uint box);

 void CallMolExchangeReciprocalGPU(VariablesCUDA *vars,
@@ -91,6 +93,12 @@ void CallMolExchangeReciprocalGPU(VariablesCUDA *vars,
                                   double *sumInew,
                                   uint box);

+void CallMolExchangeReciprocalStartGPU(VariablesCUDA *vars,
+                                       uint imageSize,
+                                       double *sumRref,
+                                       double *sumIref,
+                                       uint box);
+
 __global__ void BoxForceReciprocalGPU(double *gpu_aForceRecx,
                                       double *gpu_aForceRecy,
                                       double *gpu_aForceRecz,
@@ -154,7 +162,7 @@ __global__ void MolReciprocalGPU(double *gpu_cx, double *gpu_cy, double *gpu_cz,
                                  double *gpu_sumRref,
                                  double *gpu_sumIref,
                                  double *gpu_prefactRef,
-                                 double *gpu_energyRecipNew,
+                                 double *gpu_recipEnergies,
                                  int imageSize);

 __global__ void ChangeLambdaMolReciprocalGPU(double *gpu_x, double *gpu_y, double *gpu_z,
@@ -166,7 +174,7 @@ __global__ void ChangeLambdaMolReciprocalGPU(double *gpu_x, double *gpu_y, doubl
                                              double *gpu_sumRref,
                                              double *gpu_sumIref,
                                              double *gpu_prefactRef,
-                                             double *gpu_energyRecipNew,
+                                             double *gpu_recipEnergies,
                                              double lambdaCoef,
                                              int imageSize);

@@ -180,13 +188,21 @@ __global__ void SwapReciprocalGPU(double *gpu_x, double *gpu_y, double *gpu_z,
                                   double *gpu_sumIref,
                                   double *gpu_prefactRef,
                                   const bool insert,
-                                  double *gpu_energyRecipNew,
+                                  double *gpu_recipEnergies,
+                                  int imageSize);
+
+__global__ void NewSwapReciprocalGPU(VariablesCUDA *vars,
+                                     int atomNumber,
+                                     uint box,
+                                     double *gpu_particleCharge,
+                                     const bool insert,
                                   int imageSize);

 __global__ void BoxReciprocalGPU(double *gpu_prefact,
                                  double *gpu_sumRnew,
                                  double *gpu_sumInew,
-                                 double *gpu_energyRecip,
+                                 double *gpu_recipEnergies,
                                  int imageSize);

 #endif /*GOMC_CUDA*/
diff --git a/src/GPU/CalculateForceCUDAKernel.cu b/src/GPU/CalculateForceCUDAKernel.cu
index c827a803d..155855107 100644
--- a/src/GPU/CalculateForceCUDAKernel.cu
+++ b/src/GPU/CalculateForceCUDAKernel.cu
@@ -43,7 +43,6 @@ void CallBoxInterForceGPU(
   int blocksPerGrid, threadsPerBlock;
   int energyVectorLen = 0;
   double *gpu_particleCharge;
-  double *gpu_final_value;

   // Run the kernel...
   threadsPerBlock = 128;
@@ -64,7 +63,6 @@ void CallBoxInterForceGPU(
            particleCharge.size() * sizeof(double));
   CUMALLOC((void **)&gpu_particleKind, particleKind.size() * sizeof(int));
   CUMALLOC((void **)&gpu_particleMol, particleMol.size() * sizeof(int));
-  CUMALLOC((void **)&gpu_final_value, sizeof(double));
   if (electrostatic) {
     CUMALLOC((void **)&vars->gpu_rT11, energyVectorLen * sizeof(double));
     CUMALLOC((void **)&vars->gpu_rT12, energyVectorLen * sizeof(double));
@@ -79,7 +77,9 @@ void CallBoxInterForceGPU(
   CUMALLOC((void **)&vars->gpu_vT22, energyVectorLen * sizeof(double));
   CUMALLOC((void **)&vars->gpu_vT23, energyVectorLen * sizeof(double));
   CUMALLOC((void **)&vars->gpu_vT33, energyVectorLen * sizeof(double));
+#ifndef NDEBUG
   checkLastErrorCUDA(__FILE__, __LINE__);
+#endif

   cudaMemcpy(vars->gpu_mapParticleToCell, &mapParticleToCell[0],
              atomNumber * sizeof(int), cudaMemcpyHostToDevice);
@@ -132,53 +132,56 @@ void CallBoxInterForceGPU(
       sc_coul, sc_sigma_6, sc_alpha, sc_power, vars->gpu_rMin,
       vars->gpu_rMaxSq, vars->gpu_expConst, vars->gpu_molIndex,
       vars->gpu_lambdaVDW, vars->gpu_lambdaCoulomb, vars->gpu_isFraction, box);
-  checkLastErrorCUDA(__FILE__, __LINE__);
+#ifndef NDEBUG
   cudaDeviceSynchronize();
+  checkLastErrorCUDA(__FILE__, __LINE__);
+#endif
+
   // ReduceSum
   // Virial of LJ
   void *d_temp_storage = NULL;
   size_t temp_storage_bytes = 0;
   DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, vars->gpu_vT11,
-                    gpu_final_value, energyVectorLen);
+                    vars->gpu_finalVal, energyVectorLen);
   CUMALLOC(&d_temp_storage, temp_storage_bytes);
   DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, vars->gpu_vT11,
-                    gpu_final_value, energyVectorLen);
-  cudaMemcpy(&vT11, gpu_final_value, sizeof(double), cudaMemcpyDeviceToHost);
+                    vars->gpu_finalVal, energyVectorLen);
+  cudaMemcpy(&vT11, vars->gpu_finalVal, sizeof(double), cudaMemcpyDeviceToHost);
   DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, vars->gpu_vT12,
-                    gpu_final_value, energyVectorLen);
-  cudaMemcpy(&vT12, gpu_final_value, sizeof(double), cudaMemcpyDeviceToHost);
+                    vars->gpu_finalVal, energyVectorLen);
+  cudaMemcpy(&vT12, vars->gpu_finalVal, sizeof(double), cudaMemcpyDeviceToHost);
   DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, vars->gpu_vT13,
-                    gpu_final_value, energyVectorLen);
-  cudaMemcpy(&vT13, gpu_final_value, sizeof(double), cudaMemcpyDeviceToHost);
+                    vars->gpu_finalVal, energyVectorLen);
+  cudaMemcpy(&vT13, vars->gpu_finalVal, sizeof(double), cudaMemcpyDeviceToHost);
   DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, vars->gpu_vT22,
-                    gpu_final_value, energyVectorLen);
-  cudaMemcpy(&vT22, gpu_final_value, sizeof(double), cudaMemcpyDeviceToHost);
+                    vars->gpu_finalVal, energyVectorLen);
+  cudaMemcpy(&vT22, vars->gpu_finalVal, sizeof(double), cudaMemcpyDeviceToHost);
   DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, vars->gpu_vT23,
-                    gpu_final_value, energyVectorLen);
-  cudaMemcpy(&vT23, gpu_final_value, sizeof(double), cudaMemcpyDeviceToHost);
+                    vars->gpu_finalVal, energyVectorLen);
+  cudaMemcpy(&vT23, vars->gpu_finalVal, sizeof(double), cudaMemcpyDeviceToHost);
   DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, vars->gpu_vT33,
-                    gpu_final_value, energyVectorLen);
-  cudaMemcpy(&vT33, gpu_final_value, sizeof(double), cudaMemcpyDeviceToHost);
+                    vars->gpu_finalVal, energyVectorLen);
+  cudaMemcpy(&vT33, vars->gpu_finalVal, sizeof(double), cudaMemcpyDeviceToHost);

   if (electrostatic) {
     // ReduceSum
     // Virial of Coulomb
     DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, vars->gpu_rT11,
-                      gpu_final_value, energyVectorLen);
-    cudaMemcpy(&rT11, gpu_final_value, sizeof(double), cudaMemcpyDeviceToHost);
+                      vars->gpu_finalVal, energyVectorLen);
+    cudaMemcpy(&rT11, vars->gpu_finalVal, sizeof(double), cudaMemcpyDeviceToHost);
     DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, vars->gpu_rT12,
-                      gpu_final_value, energyVectorLen);
-    cudaMemcpy(&rT12, gpu_final_value, sizeof(double), cudaMemcpyDeviceToHost);
+                      vars->gpu_finalVal, energyVectorLen);
+    cudaMemcpy(&rT12, vars->gpu_finalVal, sizeof(double), cudaMemcpyDeviceToHost);
     DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, vars->gpu_rT13,
-                      gpu_final_value, energyVectorLen);
-    cudaMemcpy(&rT13, gpu_final_value, sizeof(double), cudaMemcpyDeviceToHost);
+                      vars->gpu_finalVal, energyVectorLen);
+    cudaMemcpy(&rT13, vars->gpu_finalVal, sizeof(double), cudaMemcpyDeviceToHost);
     DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, vars->gpu_rT22,
-                      gpu_final_value, energyVectorLen);
-    cudaMemcpy(&rT22, gpu_final_value, sizeof(double), cudaMemcpyDeviceToHost);
+                      vars->gpu_finalVal, energyVectorLen);
+    cudaMemcpy(&rT22, vars->gpu_finalVal, sizeof(double), cudaMemcpyDeviceToHost);
    DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, vars->gpu_rT23,
-                      gpu_final_value, energyVectorLen);
-    cudaMemcpy(&rT23, gpu_final_value, sizeof(double), cudaMemcpyDeviceToHost);
+                      vars->gpu_finalVal, energyVectorLen);
+    cudaMemcpy(&rT23, vars->gpu_finalVal, sizeof(double), cudaMemcpyDeviceToHost);
     DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, vars->gpu_rT33,
-                      gpu_final_value, energyVectorLen);
-    cudaMemcpy(&rT33, gpu_final_value, sizeof(double), cudaMemcpyDeviceToHost);
+                      vars->gpu_finalVal, energyVectorLen);
+    cudaMemcpy(&rT33, vars->gpu_finalVal, sizeof(double), cudaMemcpyDeviceToHost);
   }

   if (electrostatic) {
@@ -199,7 +202,6 @@ void CallBoxInterForceGPU(
   CUFREE(gpu_particleKind);
   CUFREE(gpu_particleMol);
   CUFREE(gpu_particleCharge);
-  CUFREE(gpu_final_value);
   CUFREE(gpu_neighborList);
   CUFREE(gpu_cellStartIndex);
 }
@@ -226,7 +228,6 @@ void CallBoxForceGPU(VariablesCUDA *vars, const std::vector<int> &cellVector,
   int blocksPerGrid, threadsPerBlock, energyVectorLen;
   double *gpu_particleCharge;
   double *gpu_REn, *gpu_LJEn;
-  double *gpu_final_REn, *gpu_final_LJEn;
   double cpu_final_REn = 0.0, cpu_final_LJEn = 0.0;

   threadsPerBlock = 128;
@@ -248,10 +249,8 @@ void CallBoxForceGPU(VariablesCUDA *vars, const std::vector<int> &cellVector,
   CUMALLOC((void **)&gpu_particleKind, particleKind.size() * sizeof(int));
   CUMALLOC((void **)&gpu_particleMol, particleMol.size() * sizeof(int));
   CUMALLOC((void **)&gpu_LJEn, energyVectorLen * sizeof(double));
-  CUMALLOC((void **)&gpu_final_LJEn, sizeof(double));
   if (electrostatic) {
     CUMALLOC((void **)&gpu_REn, energyVectorLen * sizeof(double));
-    CUMALLOC((void **)&gpu_final_REn, sizeof(double));
   }

   // Copy necessary data to GPU
@@ -311,25 +310,28 @@ void CallBoxForceGPU(VariablesCUDA *vars, const std::vector<int> &cellVector,
       vars->gpu_mForcez, sc_coul, sc_sigma_6, sc_alpha, sc_power,
       vars->gpu_rMin, vars->gpu_rMaxSq, vars->gpu_expConst, vars->gpu_molIndex,
       vars->gpu_lambdaVDW, vars->gpu_lambdaCoulomb, vars->gpu_isFraction, box);
+#ifndef NDEBUG
   cudaDeviceSynchronize();
   checkLastErrorCUDA(__FILE__, __LINE__);
+#endif
+
   // LJ ReduceSum
   void *d_temp_storage = NULL;
   size_t temp_storage_bytes = 0;
   DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_LJEn,
-                    gpu_final_LJEn, energyVectorLen);
+                    vars->gpu_finalVal, energyVectorLen);
   CubDebugExit(CUMALLOC(&d_temp_storage, temp_storage_bytes));
   DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_LJEn,
-                    gpu_final_LJEn, energyVectorLen);
+                    vars->gpu_finalVal, energyVectorLen);
   // Copy the result back to CPU ! :)
-  CubDebugExit(cudaMemcpy(&cpu_final_LJEn, gpu_final_LJEn, sizeof(double),
+  CubDebugExit(cudaMemcpy(&cpu_final_LJEn, vars->gpu_finalVal, sizeof(double),
                           cudaMemcpyDeviceToHost));
   if (electrostatic) {
     // Real Term ReduceSum
     DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_REn,
-                      gpu_final_REn, energyVectorLen);
+                      vars->gpu_finalVal, energyVectorLen);
     // Copy the result back to CPU ! :)
-    CubDebugExit(cudaMemcpy(&cpu_final_REn, gpu_final_REn, sizeof(double),
+    CubDebugExit(cudaMemcpy(&cpu_final_REn, vars->gpu_finalVal, sizeof(double),
                             cudaMemcpyDeviceToHost));
   }
   CUFREE(d_temp_storage);
@@ -349,16 +351,16 @@ void CallBoxForceGPU(VariablesCUDA *vars, const std::vector<int> &cellVector,
              cudaMemcpyDeviceToHost);
   cudaMemcpy(mForcez, vars->gpu_mForcez, sizeof(double) * molCount,
              cudaMemcpyDeviceToHost);
+#ifndef NDEBUG
   cudaDeviceSynchronize();
+#endif

   CUFREE(gpu_particleCharge);
   CUFREE(gpu_particleKind);
   CUFREE(gpu_particleMol);
   CUFREE(gpu_LJEn);
-  CUFREE(gpu_final_LJEn);
   if (electrostatic) {
     CUFREE(gpu_REn);
-    CUFREE(gpu_final_REn);
   }
   CUFREE(gpu_neighborList);
   CUFREE(gpu_cellStartIndex);
@@ -372,11 +374,9 @@ void CallVirialReciprocalGPU(VariablesCUDA *vars, XYZArray const &currentCoords,
                              uint imageSize, double constVal, uint box) {
   int atomNumber = currentCoords.Count();
   double *gpu_particleCharge;
-  double *gpu_final_value;

   CUMALLOC((void **)&gpu_particleCharge,
            particleCharge.size() * sizeof(double));
-  CUMALLOC((void **)&gpu_final_value, sizeof(double));

   cudaMemcpy(vars->gpu_x, currentCoords.x, atomNumber * sizeof(double),
              cudaMemcpyHostToDevice);
@@ -418,33 +418,30 @@ void CallVirialReciprocalGPU(VariablesCUDA *vars, XYZArray const &currentCoords,
       vars->gpu_sumRref[box], vars->gpu_sumIref[box], gpu_particleCharge,
       vars->gpu_rT11, vars->gpu_rT12, vars->gpu_rT13, vars->gpu_rT22,
       vars->gpu_rT23, vars->gpu_rT33, constVal, imageSize, atomNumber);
-
+#ifndef NDEBUG
   cudaDeviceSynchronize();
   checkLastErrorCUDA(__FILE__, __LINE__);
+#endif
+
   // ReduceSum
   // Virial of Reciprocal
-  void *d_temp_storage = NULL;
-  size_t temp_storage_bytes = 0;
-  DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, vars->gpu_rT11,
-                    gpu_final_value, imageSize);
-  CUMALLOC(&d_temp_storage, temp_storage_bytes);
-  DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, vars->gpu_rT11,
-                    gpu_final_value, imageSize);
-  cudaMemcpy(&rT11, gpu_final_value, sizeof(double), cudaMemcpyDeviceToHost);
-  DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, vars->gpu_rT12,
-                    gpu_final_value, imageSize);
-  cudaMemcpy(&rT12, gpu_final_value, sizeof(double), cudaMemcpyDeviceToHost);
-  DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, vars->gpu_rT13,
-                    gpu_final_value, imageSize);
-  cudaMemcpy(&rT13, gpu_final_value, sizeof(double), cudaMemcpyDeviceToHost);
-  DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, vars->gpu_rT22,
-                    gpu_final_value, imageSize);
-  cudaMemcpy(&rT22, gpu_final_value, sizeof(double), cudaMemcpyDeviceToHost);
-  DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, vars->gpu_rT23,
-                    gpu_final_value, imageSize);
-  cudaMemcpy(&rT23, gpu_final_value, sizeof(double), cudaMemcpyDeviceToHost);
-  DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, vars->gpu_rT33,
-                    gpu_final_value, imageSize);
-  cudaMemcpy(&rT33, gpu_final_value, sizeof(double), cudaMemcpyDeviceToHost);
+  DeviceReduce::Sum(vars->cub_reduce_storage, vars->cub_reduce_storage_size,
+                    vars->gpu_rT11, vars->gpu_finalVal, imageSize);
+  cudaMemcpy(&rT11, vars->gpu_finalVal, sizeof(double), cudaMemcpyDeviceToHost);
+  DeviceReduce::Sum(vars->cub_reduce_storage, vars->cub_reduce_storage_size,
+                    vars->gpu_rT12, vars->gpu_finalVal, imageSize);
+  cudaMemcpy(&rT12, vars->gpu_finalVal, sizeof(double), cudaMemcpyDeviceToHost);
+  DeviceReduce::Sum(vars->cub_reduce_storage, vars->cub_reduce_storage_size,
+                    vars->gpu_rT13, vars->gpu_finalVal, imageSize);
+  cudaMemcpy(&rT13, vars->gpu_finalVal, sizeof(double), cudaMemcpyDeviceToHost);
+  DeviceReduce::Sum(vars->cub_reduce_storage, vars->cub_reduce_storage_size,
+                    vars->gpu_rT22, vars->gpu_finalVal, imageSize);
+  cudaMemcpy(&rT22, vars->gpu_finalVal, sizeof(double), cudaMemcpyDeviceToHost);
+  DeviceReduce::Sum(vars->cub_reduce_storage, vars->cub_reduce_storage_size,
+                    vars->gpu_rT23, vars->gpu_finalVal, imageSize);
+  cudaMemcpy(&rT23, vars->gpu_finalVal, sizeof(double), cudaMemcpyDeviceToHost);
+  DeviceReduce::Sum(vars->cub_reduce_storage, vars->cub_reduce_storage_size,
+                    vars->gpu_rT33, vars->gpu_finalVal, imageSize);
+  cudaMemcpy(&rT33, vars->gpu_finalVal, sizeof(double), cudaMemcpyDeviceToHost);

   CUFREE(vars->gpu_rT11);
   CUFREE(vars->gpu_rT12);
@@ -453,8 +450,6 @@ void CallVirialReciprocalGPU(VariablesCUDA *vars, XYZArray const &currentCoords,
   CUFREE(vars->gpu_rT23);
   CUFREE(vars->gpu_rT33);
   CUFREE(gpu_particleCharge);
-  CUFREE(gpu_final_value);
-  CUFREE(d_temp_storage);
 }

 __global__ void BoxInterForceGPU(
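The six virial-tensor reductions above repeat the same Sum-then-memcpy pair,
and all of them now funnel through vars->gpu_finalVal. A small helper would
keep the call sites compact; a sketch (reduceToHost is hypothetical and assumes
cub_reduce_storage was sized for the largest element count it will ever see):

    // Reduce one device array into vars->gpu_finalVal and fetch the scalar;
    // mirrors the pattern used for vT11..vT33 and rT11..rT33.
    static double reduceToHost(VariablesCUDA *vars, const double *d_in, int n) {
      cub::DeviceReduce::Sum(vars->cub_reduce_storage,
                             vars->cub_reduce_storage_size,
                             d_in, vars->gpu_finalVal, n);
      double h = 0.0;
      cudaMemcpy(&h, vars->gpu_finalVal, sizeof(double), cudaMemcpyDeviceToHost);
      return h;
    }

    // Usage at the virial call sites:
    // vT11 = reduceToHost(vars, vars->gpu_vT11, energyVectorLen);
    // vT12 = reduceToHost(vars, vars->gpu_vT12, energyVectorLen);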
diff --git a/src/GPU/ConstantDefinitionsCUDAKernel.cu b/src/GPU/ConstantDefinitionsCUDAKernel.cu
index eda129319..1395987ea 100644
--- a/src/GPU/ConstantDefinitionsCUDAKernel.cu
+++ b/src/GPU/ConstantDefinitionsCUDAKernel.cu
@@ -8,8 +8,9 @@ along with this program, also can be found at
 #ifdef GOMC_CUDA
 #include
 #include
-#include
+#include "cub/cub.cuh"
+#include
 #include

 #include "CUDAMemoryManager.cuh"
@@ -48,8 +49,9 @@ void InitGPUForceField(VariablesCUDA &vars, double const *sigmaSq,
   CUMALLOC((void **)&vars.gpu_alpha, BOX_TOTAL * sizeof(double));
   CUMALLOC((void **)&vars.gpu_ewald, sizeof(int));
   CUMALLOC((void **)&vars.gpu_diElectric_1, sizeof(double));
+  CUMALLOC((void **)&vars.gpu_finalVal, sizeof(double));

-  // allocate gpu memory for lambda variables
+  // allocate GPU memory for lambda variables
   CUMALLOC((void **)&vars.gpu_molIndex, (int)BOX_TOTAL * sizeof(int));
   CUMALLOC((void **)&vars.gpu_lambdaVDW, (int)BOX_TOTAL * sizeof(double));
   CUMALLOC((void **)&vars.gpu_lambdaCoulomb, (int)BOX_TOTAL * sizeof(double));
@@ -75,7 +77,9 @@ void InitGPUForceField(VariablesCUDA &vars, double const *sigmaSq,
   cudaMemcpy(vars.gpu_ewald, &ewald, sizeof(int), cudaMemcpyHostToDevice);
   cudaMemcpy(vars.gpu_diElectric_1, &diElectric_1, sizeof(double),
              cudaMemcpyHostToDevice);
+#ifndef NDEBUG
   checkLastErrorCUDA(__FILE__, __LINE__);
+#endif
 }

 void InitCoordinatesCUDA(VariablesCUDA *vars, uint atomNumber,
@@ -138,7 +142,9 @@ void InitCoordinatesCUDA(VariablesCUDA *vars, uint atomNumber,
   CUMALLOC((void **)&vars->gpu_mForceRecz, maxMolNumber * sizeof(double));
   CUMALLOC((void **)&vars->gpu_cellVector, atomNumber * sizeof(int));
   CUMALLOC((void **)&vars->gpu_mapParticleToCell, atomNumber * sizeof(int));
+#ifndef NDEBUG
   checkLastErrorCUDA(__FILE__, __LINE__);
+#endif
 }

 void InitExp6VariablesCUDA(VariablesCUDA *vars, double *rMin, double *expConst,
@@ -153,7 +159,9 @@ void InitExp6VariablesCUDA(VariablesCUDA *vars, double *rMin, double *expConst,
              cudaMemcpyHostToDevice);
   cudaMemcpy(vars->gpu_expConst, expConst, size * sizeof(double),
              cudaMemcpyHostToDevice);
+#ifndef NDEBUG
   checkLastErrorCUDA(__FILE__, __LINE__);
+#endif
 }

 void InitEwaldVariablesCUDA(VariablesCUDA *vars, uint imageTotal) {
@@ -189,7 +197,15 @@ void InitEwaldVariablesCUDA(VariablesCUDA *vars, uint imageTotal) {
     CUMALLOC((void **)&vars->gpu_hsqr[b], imageTotal * sizeof(double));
     CUMALLOC((void **)&vars->gpu_hsqrRef[b], imageTotal * sizeof(double));
   }
+  CUMALLOC((void **)&vars->gpu_recipEnergies, imageTotal * sizeof(double));
+  // Allocate space for the cub reduction operations on the Ewald arrays,
+  // sized here for the maximum image count.
+  cub::DeviceReduce::Sum(vars->cub_reduce_storage, vars->cub_reduce_storage_size,
+                         vars->gpu_recipEnergies, vars->gpu_finalVal, imageTotal);
+  CUMALLOC(&vars->cub_reduce_storage, vars->cub_reduce_storage_size);
+#ifndef NDEBUG
   checkLastErrorCUDA(__FILE__, __LINE__);
+#endif
 }

 void CopyCurrentToRefCUDA(VariablesCUDA *vars, uint box, uint imageTotal) {
@@ -207,7 +223,9 @@ void CopyCurrentToRefCUDA(VariablesCUDA *vars, uint box, uint imageTotal) {
              imageTotal * sizeof(double), cudaMemcpyDeviceToDevice);
   cudaMemcpy(vars->gpu_kzRef[box], vars->gpu_kz[box],
              imageTotal * sizeof(double), cudaMemcpyDeviceToDevice);
+#ifndef NDEBUG
   checkLastErrorCUDA(__FILE__, __LINE__);
+#endif
 }

 void CopyRefToNewCUDA(VariablesCUDA *vars, uint box, uint imageTotal) {
@@ -215,7 +233,9 @@ void CopyRefToNewCUDA(VariablesCUDA *vars, uint box, uint imageTotal) {
              imageTotal * sizeof(double), cudaMemcpyDeviceToDevice);
   cudaMemcpy(vars->gpu_sumInew[box], vars->gpu_sumIref[box],
              imageTotal * sizeof(double), cudaMemcpyDeviceToDevice);
+#ifndef NDEBUG
   checkLastErrorCUDA(__FILE__, __LINE__);
+#endif
 }

 void UpdateRecipVecCUDA(VariablesCUDA *vars, uint box) {
@@ -259,7 +279,9 @@ void UpdateCellBasisCUDA(VariablesCUDA *vars, uint box, double *cellBasis_x,
   cudaMemcpy(vars->gpu_cell_z[box], cellBasis_z, 3 * sizeof(double),
              cudaMemcpyHostToDevice);
   cudaMemcpy(vars->gpu_nonOrth, &nonOrth, sizeof(int), cudaMemcpyHostToDevice);
+#ifndef NDEBUG
   checkLastErrorCUDA(__FILE__, __LINE__);
+#endif
 }

 void UpdateInvCellBasisCUDA(VariablesCUDA *vars, uint box,
@@ -273,7 +295,9 @@ void UpdateInvCellBasisCUDA(VariablesCUDA *vars, uint box,
   cudaMemcpy(vars->gpu_Invcell_z[box], invCellBasis_z, 3 * sizeof(double),
              cudaMemcpyHostToDevice);
   cudaMemcpy(vars->gpu_nonOrth, &nonOrth, sizeof(int), cudaMemcpyHostToDevice);
+#ifndef NDEBUG
   checkLastErrorCUDA(__FILE__, __LINE__);
+#endif
 }

 void DestroyEwaldCUDAVars(VariablesCUDA *vars) {
@@ -293,6 +317,9 @@ void DestroyEwaldCUDAVars(VariablesCUDA *vars) {
     CUFREE(vars->gpu_hsqr[b]);
     CUFREE(vars->gpu_hsqrRef[b]);
   }
+  CUFREE(vars->gpu_recipEnergies);
+  CUFREE(vars->cub_reduce_storage);
+
   delete[] vars->gpu_kx;
   delete[] vars->gpu_ky;
   delete[] vars->gpu_kz;
@@ -329,6 +356,7 @@ void DestroyCUDAVars(VariablesCUDA *vars) {
   CUFREE(vars->gpu_alpha);
   CUFREE(vars->gpu_ewald);
   CUFREE(vars->gpu_diElectric_1);
+  CUFREE(vars->gpu_finalVal);
   CUFREE(vars->gpu_x);
   CUFREE(vars->gpu_y);
   CUFREE(vars->gpu_z);
@@ -376,7 +404,7 @@ void DestroyCUDAVars(VariablesCUDA *vars) {
     CUFREE(vars->gpu_Invcell_z[b]);
   }

-  // delete gpu memory for lambda variables
+  // delete GPU memory for lambda variables
   CUFREE(vars->gpu_molIndex);
   CUFREE(vars->gpu_lambdaVDW);
   CUFREE(vars->gpu_lambdaCoulomb);
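The patch wraps every cudaDeviceSynchronize()/checkLastErrorCUDA() pair in
#ifndef NDEBUG so release builds skip the host-device synchronization. If this
keeps spreading, the pairs could collapse into a single macro; a sketch
(CUDA_SYNC_CHECK is hypothetical, not part of this patch):

    // Debug builds synchronize and check the last CUDA error;
    // release (NDEBUG) builds compile to nothing.
    #ifndef NDEBUG
    #define CUDA_SYNC_CHECK()                   \
      do {                                      \
        cudaDeviceSynchronize();                \
        checkLastErrorCUDA(__FILE__, __LINE__); \
      } while (0)
    #else
    #define CUDA_SYNC_CHECK() ((void)0)
    #endif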
diff --git a/src/GPU/TransformParticlesCUDAKernel.cu b/src/GPU/TransformParticlesCUDAKernel.cu
index a17f0035b..b7c2cc75c 100644
--- a/src/GPU/TransformParticlesCUDAKernel.cu
+++ b/src/GPU/TransformParticlesCUDAKernel.cu
@@ -273,8 +273,10 @@ void CallTranslateParticlesGPU(
              cudaMemcpyHostToDevice);
   cudaMemcpy(vars->gpu_comz, newCOMs.z, molCount * sizeof(double),
              cudaMemcpyHostToDevice);
-
+#ifndef NDEBUG
   checkLastErrorCUDA(__FILE__, __LINE__);
+#endif
+
   TranslateParticlesKernel<<<blocksPerGrid, threadsPerBlock>>>(
       molCount, t_max, vars->gpu_mForcex, vars->gpu_mForcey, vars->gpu_mForcez,
       vars->gpu_inForceRange, step, key, seed, vars->gpu_x, vars->gpu_y,
@@ -285,8 +287,10 @@ void CallTranslateParticlesGPU(
       lambdaBETA, vars->gpu_t_k_x, vars->gpu_t_k_y, vars->gpu_t_k_z,
       gpu_isMoleculeInvolved, vars->gpu_mForceRecx, vars->gpu_mForceRecy,
       vars->gpu_mForceRecz);
+#ifndef NDEBUG
   cudaDeviceSynchronize();
   checkLastErrorCUDA(__FILE__, __LINE__);
+#endif

   cudaMemcpy(newMolPos.x, vars->gpu_x, atomCount * sizeof(double),
              cudaMemcpyDeviceToHost);
@@ -310,7 +314,9 @@ void CallTranslateParticlesGPU(
              cudaMemcpyDeviceToHost);
   CUFREE(gpu_isMoleculeInvolved);
   CUFREE(gpu_particleMol);
+#ifndef NDEBUG
   checkLastErrorCUDA(__FILE__, __LINE__);
+#endif
 }

 void CallRotateParticlesGPU(
@@ -362,8 +368,10 @@ void CallRotateParticlesGPU(
       vars->gpu_Invcell_y[box], vars->gpu_Invcell_z[box], vars->gpu_nonOrth,
       lambdaBETA, vars->gpu_r_k_x, vars->gpu_r_k_y, vars->gpu_r_k_z,
       gpu_isMoleculeInvolved);
+#ifndef NDEBUG
   cudaDeviceSynchronize();
   checkLastErrorCUDA(__FILE__, __LINE__);
+#endif

   cudaMemcpy(newMolPos.x, vars->gpu_x, atomCount * sizeof(double),
              cudaMemcpyDeviceToHost);
@@ -381,7 +389,9 @@ void CallRotateParticlesGPU(
              cudaMemcpyDeviceToHost);
   CUFREE(gpu_isMoleculeInvolved);
   CUFREE(gpu_particleMol);
+#ifndef NDEBUG
   checkLastErrorCUDA(__FILE__, __LINE__);
+#endif
 }

 __global__ void TranslateParticlesKernel(
@@ -603,9 +613,10 @@ void BrownianMotionRotateParticlesGPU(
       vars->gpu_Invcell_x[box], vars->gpu_Invcell_y[box],
       vars->gpu_Invcell_z[box], axis, halfAx, atomCount, r_max, step, key,
       seed, BETA);
-
+#ifndef NDEBUG
   cudaDeviceSynchronize();
   checkLastErrorCUDA(__FILE__, __LINE__);
+#endif

   cudaMemcpy(newMolPos.x, vars->gpu_x, atomCount * sizeof(double),
              cudaMemcpyDeviceToHost);
@@ -620,7 +631,9 @@ void BrownianMotionRotateParticlesGPU(
   cudaMemcpy(r_k.z, vars->gpu_r_k_z, molCount * sizeof(double),
              cudaMemcpyDeviceToHost);
   CUFREE(gpu_moleculeInvolved);
+#ifndef NDEBUG
   checkLastErrorCUDA(__FILE__, __LINE__);
+#endif
 }

 template
@@ -819,9 +832,10 @@ void BrownianMotionTranslateParticlesGPU(
       vars->gpu_Invcell_x[box], vars->gpu_Invcell_y[box],
       vars->gpu_Invcell_z[box], axis, halfAx, atomCount, t_max, step, key,
       seed, BETA);
-
+#ifndef NDEBUG
   cudaDeviceSynchronize();
   checkLastErrorCUDA(__FILE__, __LINE__);
+#endif

   cudaMemcpy(newMolPos.x, vars->gpu_x, atomCount * sizeof(double),
              cudaMemcpyDeviceToHost);
@@ -842,7 +856,9 @@ void BrownianMotionTranslateParticlesGPU(
   cudaMemcpy(t_k.z, vars->gpu_t_k_z, molCount * sizeof(double),
              cudaMemcpyDeviceToHost);
   CUFREE(gpu_moleculeInvolved);
+#ifndef NDEBUG
   checkLastErrorCUDA(__FILE__, __LINE__);
+#endif
 }

 template
diff --git a/src/GPU/VariablesCUDA.cuh b/src/GPU/VariablesCUDA.cuh
index 687b6e2c6..486b23642 100644
--- a/src/GPU/VariablesCUDA.cuh
+++ b/src/GPU/VariablesCUDA.cuh
@@ -26,6 +26,7 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort =
   }
 }

+#ifndef NDEBUG
 inline void checkLastErrorCUDA(const char *file, int line)
 {
   cudaError_t code = cudaGetLastError();
@@ -34,6 +35,7 @@ inline void checkLastErrorCUDA(const char *file, int line)
     exit(code);
   }
 }
+#endif

 inline void printFreeMemory()
 {
@@ -58,33 +60,40 @@ class VariablesCUDA
 public:
   VariablesCUDA()
   {
-    gpu_sigmaSq = NULL;
-    gpu_epsilon_Cn = NULL;
-    gpu_n = NULL;
-    gpu_VDW_Kind = NULL;
-    gpu_isMartini = NULL;
-    gpu_count = NULL;
-    gpu_rCut = NULL;
-    gpu_rCutLow = NULL;
-    gpu_rOn = NULL;
-    gpu_alpha = NULL;
-    gpu_rCutCoulomb = NULL;
-    gpu_ewald = NULL;
-    gpu_diElectric_1 = NULL;
-    gpu_aForcex = NULL;
-    gpu_aForcey = NULL;
-    gpu_aForcez = NULL;
-    gpu_mForcex = NULL;
-    gpu_mForcey = NULL;
-    gpu_mForcez = NULL;
-    gpu_startAtomIdx = NULL;
+    cub_reduce_storage_size = 0;
+    cub_reduce_storage = nullptr;
+    gpu_sigmaSq = nullptr;
+    gpu_epsilon_Cn = nullptr;
+    gpu_n = nullptr;
+    gpu_VDW_Kind = nullptr;
+    gpu_isMartini = nullptr;
+    gpu_count = nullptr;
+    gpu_startAtomIdx = nullptr;
+    gpu_rCut = nullptr;
+    gpu_rCutCoulomb = nullptr;
+    gpu_rCutLow = nullptr;
+    gpu_rOn = nullptr;
+    gpu_alpha = nullptr;
+    gpu_ewald = nullptr;
+    gpu_diElectric_1 = nullptr;
+    gpu_finalVal = nullptr;
+    gpu_aForcex = nullptr;
+    gpu_aForcey = nullptr;
+    gpu_aForcez = nullptr;
+    gpu_mForcex = nullptr;
+    gpu_mForcey = nullptr;
+    gpu_mForcez = nullptr;

-    // setting lambda values to null
-    gpu_molIndex = NULL;
-    gpu_lambdaVDW = NULL;
-    gpu_lambdaCoulomb = NULL;
-    gpu_isFraction = NULL;
+    // setting lambda values to null
+    gpu_molIndex = nullptr;
+    gpu_lambdaVDW = nullptr;
+    gpu_lambdaCoulomb = nullptr;
+    gpu_isFraction = nullptr;
   }
+
+  size_t cub_reduce_storage_size;
+  void *cub_reduce_storage;
   double *gpu_sigmaSq;
   double *gpu_epsilon_Cn;
   double *gpu_n;
@@ -99,6 +108,7 @@ public:
   double *gpu_alpha;
   int *gpu_ewald;
   double *gpu_diElectric_1;
+  double *gpu_finalVal;
   double *gpu_x, *gpu_y, *gpu_z;
   double *gpu_nx, *gpu_ny, *gpu_nz;
   double *gpu_dx, *gpu_dy, *gpu_dz;
@@ -107,6 +117,7 @@ public:
   double **gpu_sumRnew, **gpu_sumInew, **gpu_sumRref, **gpu_sumIref;
   double **gpu_prefact, **gpu_prefactRef;
   double **gpu_hsqr, **gpu_hsqrRef;
+  double *gpu_recipEnergies;
   double *gpu_comx, *gpu_comy, *gpu_comz;
   double *gpu_rT11, *gpu_rT12, *gpu_rT13;
   double *gpu_rT22, *gpu_rT23, *gpu_rT33;