From 7d5721a7a096be02b565293e05683b52d82bf085 Mon Sep 17 00:00:00 2001
From: Loren Schwiebert
Date: Tue, 27 Aug 2024 19:57:30 -0400
Subject: [PATCH] Change BoxForceReciprocalGPU implementation to eliminate roundoff

---
 src/Ewald.cpp                        |  26 ++---
 src/GPU/CalculateEwaldCUDAKernel.cu  | 162 ++++++++++++---------------
 src/GPU/CalculateEwaldCUDAKernel.cuh |  11 +-
 3 files changed, 91 insertions(+), 108 deletions(-)

diff --git a/src/Ewald.cpp b/src/Ewald.cpp
index de23f1bc7..6dcd353a6 100644
--- a/src/Ewald.cpp
+++ b/src/Ewald.cpp
@@ -1501,11 +1501,8 @@ void Ewald::BoxForceReciprocal(XYZArray const &molCoords,
   double constValue = ff.alpha[box] * M_2_SQRTPI;
 
 #ifdef GOMC_CUDA
-  bool *particleUsed;
-  particleUsed = new bool[atomForceRec.Count()];
-  memset((void *)particleUsed, false, atomForceRec.Count() * sizeof(bool));
+  std::vector<int> particleUsed;
 #if ENSEMBLE == GEMC || ENSEMBLE == GCMC
-  memset((void *)particleUsed, false, atomForceRec.Count() * sizeof(bool));
   MoleculeLookup::box_iterator thisMol = molLookup.BoxBegin(box);
   MoleculeLookup::box_iterator end = molLookup.BoxEnd(box);
   while (thisMol != end) {
@@ -1514,25 +1511,28 @@ void Ewald::BoxForceReciprocal(XYZArray const &molCoords,
     uint start = mols.MolStart(molIndex);
     for (uint p = 0; p < length; p++) {
       atomForceRec.Set(start + p, 0.0, 0.0, 0.0);
-      particleUsed[start + p] = true;
+      // Need to include only the charged atoms
+      if (particleCharge[start + p] != 0.0)
+        particleUsed.push_back(start + p);
     }
     molForceRec.Set(molIndex, 0.0, 0.0, 0.0);
     thisMol++;
   }
 #else
   // Only one box, so clear all atoms and molecules and mark all particles as
-  // Used
+  // used
   atomForceRec.Reset();
   molForceRec.Reset();
-  memset((void *)particleUsed, true, atomForceRec.Count() * sizeof(bool));
+  for (uint i = 0; i < atomForceRec.Count(); ++i) {
+    particleUsed.push_back(i);
+  }
 #endif
-  CallBoxForceReciprocalGPU(
-      ff.particles->getCUDAVars(), atomForceRec, molForceRec, particleCharge,
-      particleMol, particleHasNoCharge, particleUsed, startMol, lengthMol,
-      ff.alpha[box], ff.alphaSq[box], constValue, imageSizeRef[box],
-      molCoords, currentAxes, box);
-  delete[] particleUsed;
+  CallBoxForceReciprocalGPU(ff.particles->getCUDAVars(), atomForceRec,
+                            molForceRec, particleCharge, particleMol,
+                            particleUsed, startMol, lengthMol, ff.alpha[box],
+                            ff.alphaSq[box], constValue, imageSizeRef[box],
+                            molCoords, currentAxes, box);
 #else
   // molecule iterator
   MoleculeLookup::box_iterator thisMol = molLookup.BoxBegin(box);
diff --git a/src/GPU/CalculateEwaldCUDAKernel.cu b/src/GPU/CalculateEwaldCUDAKernel.cu
index 5812426b6..c1f02d756 100644
--- a/src/GPU/CalculateEwaldCUDAKernel.cu
+++ b/src/GPU/CalculateEwaldCUDAKernel.cu
@@ -23,6 +23,7 @@ using namespace cub;
 
 #define IMAGES_PER_BLOCK 64
 #define PARTICLES_PER_BLOCK 64
+#define THREADS_PER_BLOCK 128
 
 #define FULL_MASK 0xffffffff
 
@@ -448,7 +449,7 @@ void CallBoxForceReciprocalGPU(
     VariablesCUDA *vars, XYZArray &atomForceRec, XYZArray &molForceRec,
     const std::vector<double> &particleCharge,
     const std::vector<int> &particleMol,
-    const std::vector<bool> &particleHasNoCharge, const bool *particleUsed,
+    const std::vector<int> &particleUsed,
     const std::vector<int> &startMol, const std::vector<int> &lengthMol,
     double alpha, double alphaSq, double constValue, uint imageSize,
     XYZArray const &molCoords, BoxDimensions const &boxAxes, int box) {
@@ -456,27 +457,15 @@ void CallBoxForceReciprocalGPU(
   int molCount = molForceRec.Count();
   double *gpu_particleCharge;
   int *gpu_particleMol;
-  bool *gpu_particleHasNoCharge, *gpu_particleUsed;
-  bool *arr_particleHasNoCharge = new bool[particleHasNoCharge.size()];
-  int *gpu_startMol, *gpu_lengthMol;
-
-  // particleHasNoCharge is stored in vector<bool>, so in order to copy it to
-  // GPU it needs to be stored in bool[]. because: std::vector<bool> : Does not
-  // necessarily store its elements as a contiguous array
-  for (int i = 0; i < particleHasNoCharge.size(); i++) {
-    arr_particleHasNoCharge[i] = particleHasNoCharge[i];
-  }
+  int *gpu_particleUsed, *gpu_startMol, *gpu_lengthMol;
 
   // calculate block and grid sizes
-  dim3 threadsPerBlock(256, 1, 1);
-  int blocksPerGridX = (int)(atomCount / threadsPerBlock.x) + 1;
-  int blocksPerGridY = (int)(imageSize / IMAGES_PER_BLOCK) + 1;
-  dim3 blocksPerGrid(blocksPerGridX, blocksPerGridY, 1);
+  int threadsPerBlock = THREADS_PER_BLOCK;
+  int blocksPerGrid = particleUsed.size();
 
   CUMALLOC((void **)&gpu_particleCharge,
            particleCharge.size() * sizeof(double));
-  CUMALLOC((void **)&gpu_particleHasNoCharge,
-           particleHasNoCharge.size() * sizeof(bool));
-  CUMALLOC((void **)&gpu_particleUsed, atomCount * sizeof(bool));
+  CUMALLOC((void **)&gpu_particleUsed, particleUsed.size() * sizeof(int));
   CUMALLOC((void **)&gpu_startMol, startMol.size() * sizeof(int));
   CUMALLOC((void **)&gpu_lengthMol, lengthMol.size() * sizeof(int));
@@ -498,9 +487,7 @@ void CallBoxForceReciprocalGPU(
              sizeof(double) * particleCharge.size(), cudaMemcpyHostToDevice);
   cudaMemcpy(gpu_particleMol, &particleMol[0], sizeof(int) * particleMol.size(),
              cudaMemcpyHostToDevice);
-  cudaMemcpy(gpu_particleHasNoCharge, arr_particleHasNoCharge,
-             sizeof(bool) * particleHasNoCharge.size(), cudaMemcpyHostToDevice);
-  cudaMemcpy(gpu_particleUsed, particleUsed, sizeof(bool) * atomCount,
+  cudaMemcpy(gpu_particleUsed, &particleUsed[0], sizeof(int) * particleUsed.size(),
              cudaMemcpyHostToDevice);
   cudaMemcpy(vars->gpu_x, molCoords.x, sizeof(double) * atomCount,
              cudaMemcpyHostToDevice);
@@ -517,16 +504,16 @@ void CallBoxForceReciprocalGPU(
   BoxForceReciprocalGPU<<<blocksPerGrid, threadsPerBlock>>>(
       vars->gpu_aForceRecx, vars->gpu_aForceRecy, vars->gpu_aForceRecz,
       vars->gpu_mForceRecx, vars->gpu_mForceRecy, vars->gpu_mForceRecz,
-      gpu_particleCharge, gpu_particleMol, gpu_particleHasNoCharge,
-      gpu_particleUsed, gpu_startMol, gpu_lengthMol, alpha, alphaSq, constValue,
-      imageSize, vars->gpu_kxRef[box], vars->gpu_kyRef[box],
-      vars->gpu_kzRef[box], vars->gpu_x, vars->gpu_y, vars->gpu_z,
-      vars->gpu_prefactRef[box], vars->gpu_sumRnew[box], vars->gpu_sumInew[box],
-      vars->gpu_isFraction, vars->gpu_molIndex, vars->gpu_lambdaCoulomb,
-      vars->gpu_cell_x[box], vars->gpu_cell_y[box], vars->gpu_cell_z[box],
-      vars->gpu_Invcell_x[box], vars->gpu_Invcell_y[box],
-      vars->gpu_Invcell_z[box], vars->gpu_nonOrth, boxAxes.GetAxis(box).x,
-      boxAxes.GetAxis(box).y, boxAxes.GetAxis(box).z, box, atomCount);
+      gpu_particleCharge, gpu_particleMol, gpu_particleUsed, gpu_startMol,
+      gpu_lengthMol, alpha, alphaSq, constValue, imageSize,
+      vars->gpu_kxRef[box], vars->gpu_kyRef[box], vars->gpu_kzRef[box],
+      vars->gpu_x, vars->gpu_y, vars->gpu_z, vars->gpu_prefactRef[box],
+      vars->gpu_sumRnew[box], vars->gpu_sumInew[box], vars->gpu_isFraction,
+      vars->gpu_molIndex, vars->gpu_lambdaCoulomb, vars->gpu_cell_x[box],
+      vars->gpu_cell_y[box], vars->gpu_cell_z[box], vars->gpu_Invcell_x[box],
+      vars->gpu_Invcell_y[box], vars->gpu_Invcell_z[box], vars->gpu_nonOrth,
+      boxAxes.GetAxis(box).x, boxAxes.GetAxis(box).y, boxAxes.GetAxis(box).z,
+      box);
   cudaDeviceSynchronize();
   checkLastErrorCUDA(__FILE__, __LINE__);
@@ -544,9 +531,7 @@ void CallBoxForceReciprocalGPU(
                        cudaMemcpyDeviceToHost);
   cudaDeviceSynchronize();
 
-  delete[] arr_particleHasNoCharge;
   CUFREE(gpu_particleCharge);
-  CUFREE(gpu_particleHasNoCharge);
   CUFREE(gpu_particleUsed);
   CUFREE(gpu_startMol);
   CUFREE(gpu_lengthMol);
@@ -556,70 +541,55 @@ __global__ void BoxForceReciprocalGPU(
     double *gpu_aForceRecx, double *gpu_aForceRecy, double *gpu_aForceRecz,
     double *gpu_mForceRecx, double *gpu_mForceRecy, double *gpu_mForceRecz,
-    double *gpu_particleCharge, int *gpu_particleMol,
-    bool *gpu_particleHasNoCharge, bool *gpu_particleUsed, int *gpu_startMol,
-    int *gpu_lengthMol, double alpha, double alphaSq, double constValue,
-    int imageSize, double *gpu_kx, double *gpu_ky, double *gpu_kz,
-    double *gpu_x, double *gpu_y, double *gpu_z, double *gpu_prefact,
-    double *gpu_sumRnew, double *gpu_sumInew, bool *gpu_isFraction,
-    int *gpu_molIndex, double *gpu_lambdaCoulomb, double *gpu_cell_x,
-    double *gpu_cell_y, double *gpu_cell_z, double *gpu_Invcell_x,
-    double *gpu_Invcell_y, double *gpu_Invcell_z, int *gpu_nonOrth, double axx,
-    double axy, double axz, int box, int atomCount) {
-  __shared__ double shared_kvector[IMAGES_PER_BLOCK * 3];
-  int particleID = blockDim.x * blockIdx.x + threadIdx.x;
-  int offset_vector_index = blockIdx.y * IMAGES_PER_BLOCK;
-  int numberOfVectors = min(IMAGES_PER_BLOCK, imageSize - offset_vector_index);
-
-  if (threadIdx.x < numberOfVectors) {
-    shared_kvector[threadIdx.x * 3] = gpu_kx[offset_vector_index + threadIdx.x];
-    shared_kvector[threadIdx.x * 3 + 1] =
-        gpu_ky[offset_vector_index + threadIdx.x];
-    shared_kvector[threadIdx.x * 3 + 2] =
-        gpu_kz[offset_vector_index + threadIdx.x];
+    double *gpu_particleCharge, int *gpu_particleMol, const int *gpu_particleUsed,
+    int *gpu_startMol, int *gpu_lengthMol, double alpha, double alphaSq,
+    double constValue, int imageSize, double *gpu_kx, double *gpu_ky,
+    double *gpu_kz, double *gpu_x, double *gpu_y, double *gpu_z,
+    double *gpu_prefact, double *gpu_sumRnew, double *gpu_sumInew,
+    bool *gpu_isFraction, int *gpu_molIndex, double *gpu_lambdaCoulomb,
+    double *gpu_cell_x, double *gpu_cell_y, double *gpu_cell_z,
+    double *gpu_Invcell_x, double *gpu_Invcell_y, double *gpu_Invcell_z,
+    int *gpu_nonOrth, double axx, double axy, double axz, int box) {
+
+  __shared__ int particleID, moleculeID;
+  __shared__ double x, y, z, lambdaCoef, fixed;
+
+  if (threadIdx.x == 0) {
+    // The particleID is the atom that corresponds to this particleUsed entry
+    particleID = gpu_particleUsed[blockIdx.x];
+    moleculeID = gpu_particleMol[particleID];
+    x = gpu_x[particleID];
+    y = gpu_y[particleID];
+    z = gpu_z[particleID];
+    lambdaCoef = DeviceGetLambdaCoulomb(moleculeID, box, gpu_isFraction,
+                                        gpu_molIndex, gpu_lambdaCoulomb);
+    fixed = 2.0 * lambdaCoef * gpu_particleCharge[particleID];
   }
+  __syncthreads();
 
-  if (particleID >= atomCount || !gpu_particleUsed[particleID])
-    return;
   double forceX = 0.0, forceY = 0.0, forceZ = 0.0;
-  int moleculeID = gpu_particleMol[particleID];
-  if (gpu_particleHasNoCharge[particleID])
-    return;
-
-  double x = gpu_x[particleID];
-  double y = gpu_y[particleID];
-  double z = gpu_z[particleID];
-  double lambdaCoef = DeviceGetLambdaCoulomb(moleculeID, box, gpu_isFraction,
-                                             gpu_molIndex, gpu_lambdaCoulomb);
-
-  __syncthreads();
   // loop over images
-  for (int vectorIndex = 0; vectorIndex < numberOfVectors; vectorIndex++) {
-    double dot = x * shared_kvector[vectorIndex * 3] +
-                 y * shared_kvector[vectorIndex * 3 + 1] +
-                 z * shared_kvector[vectorIndex * 3 + 2];
+  for (int image = threadIdx.x; image < imageSize; image += THREADS_PER_BLOCK) {
+    double dot = x * gpu_kx[image] + y * gpu_ky[image] + z * gpu_kz[image];
     double dotsin, dotcos;
     sincos(dot, &dotsin, &dotcos);
-    double factor = 2.0 * gpu_particleCharge[particleID] *
-                    gpu_prefact[offset_vector_index + vectorIndex] *
-                    lambdaCoef *
-                    (dotsin * gpu_sumRnew[offset_vector_index + vectorIndex] -
-                     dotcos * gpu_sumInew[offset_vector_index + vectorIndex]);
-
-    forceX += factor * shared_kvector[vectorIndex * 3];
-    forceY += factor * shared_kvector[vectorIndex * 3 + 1];
-    forceZ += factor * shared_kvector[vectorIndex * 3 + 2];
+    double factor = fixed * gpu_prefact[image] * (dotsin * gpu_sumRnew[image] -
+                                                  dotcos * gpu_sumInew[image]);
+    forceX += factor * gpu_kx[image];
+    forceY += factor * gpu_ky[image];
+    forceZ += factor * gpu_kz[image];
   }
 
   // loop over other particles within the same molecule
-  if (blockIdx.y == 0) {
+  // Pick the thread most likely to exit the for loop early
+  if (threadIdx.x == THREADS_PER_BLOCK - 1) {
     double intraForce = 0.0, distSq = 0.0, dist = 0.0;
     double3 distVect;
     int lastParticleWithinSameMolecule =
         gpu_startMol[particleID] + gpu_lengthMol[particleID];
     for (int otherParticle = gpu_startMol[particleID];
-         otherParticle < lastParticleWithinSameMolecule; otherParticle++) {
+         otherParticle < lastParticleWithinSameMolecule; ++otherParticle) {
       if (particleID != otherParticle) {
         DeviceInRcut(distSq, distVect, gpu_x, gpu_y, gpu_z, particleID,
                      otherParticle, axx, axy, axz, *gpu_nonOrth, gpu_cell_x,
@@ -631,20 +601,36 @@ __global__ void BoxForceReciprocalGPU(
         double qiqj = gpu_particleCharge[particleID] *
                       gpu_particleCharge[otherParticle] * qqFactGPU;
         intraForce = qiqj * lambdaCoef * lambdaCoef / distSq;
-        intraForce *= ((erf(alpha * dist) / dist) - constValue * expConstValue);
+        intraForce *= (erf(alpha * dist) / dist) - constValue * expConstValue;
         forceX -= intraForce * distVect.x;
         forceY -= intraForce * distVect.y;
         forceZ -= intraForce * distVect.z;
       }
     }
   }
+  __syncthreads();
 
-  atomicAdd(&gpu_aForceRecx[particleID], forceX);
-  atomicAdd(&gpu_aForceRecy[particleID], forceY);
-  atomicAdd(&gpu_aForceRecz[particleID], forceZ);
-  atomicAdd(&gpu_mForceRecx[moleculeID], forceX);
-  atomicAdd(&gpu_mForceRecy[moleculeID], forceY);
-  atomicAdd(&gpu_mForceRecz[moleculeID], forceZ);
+  // Specialize BlockReduce for a 1D block of threads of type double
+  using BlockReduce = cub::BlockReduce<double, THREADS_PER_BLOCK>;
+
+  // Allocate shared memory for BlockReduce
+  __shared__ typename BlockReduce::TempStorage forceX_temp_storage;
+  __shared__ typename BlockReduce::TempStorage forceY_temp_storage;
+  __shared__ typename BlockReduce::TempStorage forceZ_temp_storage;
+
+  // Compute the block-wide sum for thread 0
+  double aggregateX = BlockReduce(forceX_temp_storage).Sum(forceX);
+  double aggregateY = BlockReduce(forceY_temp_storage).Sum(forceY);
+  double aggregateZ = BlockReduce(forceZ_temp_storage).Sum(forceZ);
+
+  if (threadIdx.x == 0) {
+    gpu_aForceRecx[particleID] = aggregateX;
+    gpu_aForceRecy[particleID] = aggregateY;
+    gpu_aForceRecz[particleID] = aggregateZ;
+    atomicAdd(&gpu_mForceRecx[moleculeID], aggregateX);
+    atomicAdd(&gpu_mForceRecy[moleculeID], aggregateY);
+    atomicAdd(&gpu_mForceRecz[moleculeID], aggregateZ);
+  }
 }
 
 __global__ void SwapReciprocalGPU(double *gpu_x, double *gpu_y, double *gpu_z,
diff --git a/src/GPU/CalculateEwaldCUDAKernel.cuh b/src/GPU/CalculateEwaldCUDAKernel.cuh
index 16a3fc532..15b20b42c 100644
--- a/src/GPU/CalculateEwaldCUDAKernel.cuh
+++ b/src/GPU/CalculateEwaldCUDAKernel.cuh
@@ -19,8 +19,7 @@ void CallBoxForceReciprocalGPU(VariablesCUDA *vars,
                                XYZArray &molForceRec,
                                const std::vector<double> &particleCharge,
                               const std::vector<int> &particleMol,
-                               const std::vector<bool> &particleHasNoCharge,
-                               const bool *particleUsed,
+                               const std::vector<int> &particleUsed,
                                const std::vector<int> &startMol,
                                const std::vector<int> &lengthMol,
                                double alpha,
@@ -99,9 +98,8 @@ __global__ void BoxForceReciprocalGPU(double *gpu_aForceRecx,
                                       double *gpu_mForceRecz,
                                       double *gpu_particleCharge,
                                       int *gpu_particleMol,
-                                      bool *gpu_particleHasNoCharge,
-                                      bool *gpu_particleUsed,
-                                      int *gpu_startMol,
+                                      const int *gpu_particleUsed,
+                                      int *gpu_startMol,
                                       int *gpu_lengthMol,
                                       double alpha,
                                       double alphaSq,
@@ -129,8 +127,7 @@ __global__ void BoxForceReciprocalGPU(double *gpu_aForceRecx,
                                       double axx,
                                       double axy,
                                       double axz,
-                                      int box,
-                                      int atomCount);
+                                      int box);
 
 __global__ void BoxReciprocalSumsGPU(double * gpu_x,
                                      double * gpu_y,