From 9136af83d3252d140eb8ae828c5161fa7e6d9269 Mon Sep 17 00:00:00 2001
From: LSchwiebert
Date: Thu, 17 Sep 2020 17:22:33 -0400
Subject: [PATCH] Improve GPU performance especially without electrostatics

---
 src/GPU/CalculateEnergyCUDAKernel.cu | 57 +++++++++++++++++-----------
 src/GPU/CalculateForceCUDAKernel.cu  | 54 +++++++++++++-------------
 2 files changed, 61 insertions(+), 50 deletions(-)

diff --git a/src/GPU/CalculateEnergyCUDAKernel.cu b/src/GPU/CalculateEnergyCUDAKernel.cu
index 905fdbfce..46d9f2c61 100644
--- a/src/GPU/CalculateEnergyCUDAKernel.cu
+++ b/src/GPU/CalculateEnergyCUDAKernel.cu
@@ -65,11 +65,13 @@ void CallBoxInterGPU(VariablesCUDA *vars,
   CUMALLOC((void**) &gpu_particleCharge, particleCharge.size() * sizeof(double));
   CUMALLOC((void**) &gpu_particleKind, particleKind.size() * sizeof(int));
   CUMALLOC((void**) &gpu_particleMol, particleMol.size() * sizeof(int));
-  CUMALLOC((void**) &gpu_REn, energyVectorLen * sizeof(double));
   CUMALLOC((void**) &gpu_LJEn, energyVectorLen * sizeof(double));
-  CUMALLOC((void**) &gpu_final_REn, sizeof(double));
   CUMALLOC((void**) &gpu_final_LJEn, sizeof(double));
-
+  if (electrostatic) {
+    CUMALLOC((void**) &gpu_REn, energyVectorLen * sizeof(double));
+    CUMALLOC((void**) &gpu_final_REn, sizeof(double));
+  }
+
   // Copy necessary data to GPU
   cudaMemcpy(gpu_neighborList, &neighborlist1D[0], neighborListCount * sizeof(int), cudaMemcpyHostToDevice);
   cudaMemcpy(gpu_cellStartIndex, &cellStartIndex[0], cellStartIndex.size() * sizeof(int), cudaMemcpyHostToDevice);
@@ -85,9 +87,9 @@ void CallBoxInterGPU(VariablesCUDA *vars,
                                 boxAxes.GetAxis(box).y,
                                 boxAxes.GetAxis(box).z);
 
-  double3 halfAx = make_double3(boxAxes.GetAxis(box).x / 2.0,
-                                boxAxes.GetAxis(box).y / 2.0,
-                                boxAxes.GetAxis(box).z / 2.0);
+  double3 halfAx = make_double3(boxAxes.GetAxis(box).x * 0.5,
+                                boxAxes.GetAxis(box).y * 0.5,
+                                boxAxes.GetAxis(box).z * 0.5);
 
   BoxInterGPU <<< blocksPerGrid, threadsPerBlock>>>(gpu_cellStartIndex,
                                                     vars->gpu_cellVector,
@@ -143,28 +145,37 @@ void CallBoxInterGPU(VariablesCUDA *vars,
   // ReduceSum
   void * d_temp_storage = NULL;
   size_t temp_storage_bytes = 0;
-  DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_REn,
-                    gpu_final_REn, energyVectorLen);
-  CubDebugExit(CUMALLOC(&d_temp_storage, temp_storage_bytes));
-  DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_REn,
-                    gpu_final_REn, energyVectorLen);
   // LJ ReduceSum
   DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_LJEn,
                     gpu_final_LJEn, energyVectorLen);
-  CUFREE(d_temp_storage);
+  CubDebugExit(CUMALLOC(&d_temp_storage, temp_storage_bytes));
+  DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_LJEn,
+                    gpu_final_LJEn, energyVectorLen);
   // Copy back the result to CPU ! :)
-  CubDebugExit(cudaMemcpy(&REn, gpu_final_REn, sizeof(double),
-                          cudaMemcpyDeviceToHost));
   CubDebugExit(cudaMemcpy(&LJEn, gpu_final_LJEn, sizeof(double),
                           cudaMemcpyDeviceToHost));
+  if (electrostatic) {
+    // Real Term ReduceSum
+    DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_REn,
+                      gpu_final_REn, energyVectorLen);
+    // Copy back the result to CPU ! :)
+    CubDebugExit(cudaMemcpy(&REn, gpu_final_REn, sizeof(double),
+                            cudaMemcpyDeviceToHost));
+  }
+  else {
+    REn = 0.0;
+  }
+  CUFREE(d_temp_storage);
   CUFREE(gpu_particleCharge);
   CUFREE(gpu_particleKind);
   CUFREE(gpu_particleMol);
-  CUFREE(gpu_REn);
   CUFREE(gpu_LJEn);
-  CUFREE(gpu_final_REn);
   CUFREE(gpu_final_LJEn);
+  if (electrostatic) {
+    CUFREE(gpu_REn);
+    CUFREE(gpu_final_REn);
+  }
   CUFREE(gpu_neighborList);
   CUFREE(gpu_cellStartIndex);
 }
@@ -247,7 +258,7 @@ __global__ void BoxInterGPU(int *gpu_cellStartIndex,
       if(currentParticle < neighborParticle &&
          gpu_particleMol[currentParticle] != gpu_particleMol[neighborParticle]) {
         // Check if they are within rcut
-        double distSq = 0;
+        double distSq = 0.0;
         if(InRcutGPU(distSq, gpu_x, gpu_y, gpu_z,
                      currentParticle, neighborParticle,
@@ -264,6 +275,10 @@ __global__ void BoxInterGPU(int *gpu_cellStartIndex,
           double lambdaVDW = DeviceGetLambdaVDW(mA, kA, mB, kB, box, gpu_isFraction,
                                                 gpu_molIndex, gpu_kindIndex, gpu_lambdaVDW);
+          LJEn += CalcEnGPU(distSq, kA, kB, gpu_sigmaSq, gpu_n, gpu_epsilon_Cn,
+                            gpu_VDW_Kind[0], gpu_isMartini[0], gpu_rCut[0],
+                            gpu_rOn[0], gpu_count[0], lambdaVDW, sc_sigma_6,
+                            sc_alpha, sc_power, gpu_rMin, gpu_rMaxSq, gpu_expConst);
           if(electrostatic) {
             static const double qqFact = 167000.0;
@@ -278,14 +293,12 @@
                                     sc_sigma_6, sc_alpha, sc_power, gpu_sigmaSq,
                                     gpu_count[0]);
           }
-          LJEn += CalcEnGPU(distSq, kA, kB, gpu_sigmaSq, gpu_n, gpu_epsilon_Cn,
-                            gpu_VDW_Kind[0], gpu_isMartini[0], gpu_rCut[0],
-                            gpu_rOn[0], gpu_count[0], lambdaVDW, sc_sigma_6,
-                            sc_alpha, sc_power, gpu_rMin, gpu_rMaxSq, gpu_expConst);
         }
       }
     }
 
-  gpu_REn[threadID] = REn;
+  if(electrostatic) {
+    gpu_REn[threadID] = REn;
+  }
   gpu_LJEn[threadID] = LJEn;
 }
diff --git a/src/GPU/CalculateForceCUDAKernel.cu b/src/GPU/CalculateForceCUDAKernel.cu
index fabda6e93..b3fe7c21c 100644
--- a/src/GPU/CalculateForceCUDAKernel.cu
+++ b/src/GPU/CalculateForceCUDAKernel.cu
@@ -664,16 +664,14 @@ __global__ void BoxInterForceGPU(int *gpu_cellStartIndex,
   virComponents = make_double3(0.0, 0.0, 0.0);
 
-  double pRF = 0.0, qi_qj, pVF = 0.0;
-  double lambdaVDW = 0.0, lambdaCoulomb = 0.0;
   int threadID = blockIdx.x * blockDim.x + threadIdx.x;
-  //tensors for VDW and real part of electrostatic
+  //tensors for VDW
   gpu_vT11[threadID] = 0.0, gpu_vT22[threadID] = 0.0, gpu_vT33[threadID] = 0.0;
   // extra tensors reserved for later on
   gpu_vT12[threadID] = 0.0, gpu_vT13[threadID] = 0.0, gpu_vT23[threadID] = 0.0;
 
   if(electrostatic) {
-    //tensors for VDW and real part of electrostatic
+    //tensors for real part of electrostatic
     gpu_rT11[threadID] = 0.0, gpu_rT22[threadID] = 0.0, gpu_rT33[threadID] = 0.0;
     // extra tensors reserved for later on
     gpu_rT12[threadID] = 0.0, gpu_rT13[threadID] = 0.0, gpu_rT23[threadID] = 0.0;
@@ -720,36 +718,14 @@ __global__ void BoxInterForceGPU(int *gpu_cellStartIndex,
           int mA = gpu_particleMol[currentParticle];
           int mB = gpu_particleMol[neighborParticle];
 
-          lambdaVDW = DeviceGetLambdaVDW(mA, kA, mB, kB, box, gpu_isFraction,
+          double lambdaVDW = DeviceGetLambdaVDW(mA, kA, mB, kB, box, gpu_isFraction,
                                          gpu_molIndex, gpu_kindIndex, gpu_lambdaVDW);
 
           diff_com = Difference(gpu_comx, gpu_comy, gpu_comz, mA, mB);
           diff_com = MinImageGPU(diff_com, axis, halfAx);
-          if(electrostatic) {
-            qi_qj = cA * cB;
-            lambdaCoulomb = DeviceGetLambdaCoulomb(mA, kA, mB, kB, box,
-                                                   gpu_isFraction, gpu_molIndex,
-                                                   gpu_kindIndex, gpu_lambdaCoulomb);
-            pRF = CalcCoulombForceGPU(distSq, qi_qj, gpu_VDW_Kind[0], gpu_ewald[0],
-                                      gpu_isMartini[0], gpu_alpha[box],
-                                      gpu_rCutCoulomb[box], gpu_diElectric_1[0],
-                                      gpu_sigmaSq, sc_coul, sc_sigma_6, sc_alpha,
-                                      sc_power, lambdaCoulomb, gpu_count[0],
-                                      kA, kB);
-
-            gpu_rT11[threadID] += pRF * (virComponents.x * diff_com.x);
-            gpu_rT22[threadID] += pRF * (virComponents.y * diff_com.y);
-            gpu_rT33[threadID] += pRF * (virComponents.z * diff_com.z);
-
-            //extra tensor calculations
-            gpu_rT12[threadID] += pRF * (0.5 * (virComponents.x * diff_com.y + virComponents.y * diff_com.x));
-            gpu_rT13[threadID] += pRF * (0.5 * (virComponents.x * diff_com.z + virComponents.z * diff_com.x));
-            gpu_rT23[threadID] += pRF * (0.5 * (virComponents.y * diff_com.z + virComponents.z * diff_com.y));
-          }
-
-          pVF = CalcEnForceGPU(distSq, kA, kB,
+          double pVF = CalcEnForceGPU(distSq, kA, kB,
                                gpu_sigmaSq, gpu_n, gpu_epsilon_Cn,
                                gpu_rCut[0], gpu_rOn[0], gpu_isMartini[0],
                                gpu_VDW_Kind[0], gpu_count[0], lambdaVDW, sc_sigma_6, sc_alpha,
@@ -763,6 +739,28 @@
           gpu_vT12[threadID] += pVF * (0.5 * (virComponents.x * diff_com.y + virComponents.y * diff_com.x));
           gpu_vT13[threadID] += pVF * (0.5 * (virComponents.x * diff_com.z + virComponents.z * diff_com.x));
           gpu_vT23[threadID] += pVF * (0.5 * (virComponents.y * diff_com.z + virComponents.z * diff_com.y));
+
+          if(electrostatic) {
+            double qi_qj = cA * cB;
+            double lambdaCoulomb = DeviceGetLambdaCoulomb(mA, kA, mB, kB, box,
+                                                          gpu_isFraction, gpu_molIndex,
+                                                          gpu_kindIndex, gpu_lambdaCoulomb);
+            double pRF = CalcCoulombForceGPU(distSq, qi_qj, gpu_VDW_Kind[0], gpu_ewald[0],
+                                             gpu_isMartini[0], gpu_alpha[box],
+                                             gpu_rCutCoulomb[box], gpu_diElectric_1[0],
+                                             gpu_sigmaSq, sc_coul, sc_sigma_6, sc_alpha,
+                                             sc_power, lambdaCoulomb, gpu_count[0],
+                                             kA, kB);
+
+            gpu_rT11[threadID] += pRF * (virComponents.x * diff_com.x);
+            gpu_rT22[threadID] += pRF * (virComponents.y * diff_com.y);
+            gpu_rT33[threadID] += pRF * (virComponents.z * diff_com.z);
+
+            //extra tensor calculations
+            gpu_rT12[threadID] += pRF * (0.5 * (virComponents.x * diff_com.y + virComponents.y * diff_com.x));
+            gpu_rT13[threadID] += pRF * (0.5 * (virComponents.x * diff_com.z + virComponents.z * diff_com.x));
+            gpu_rT23[threadID] += pRF * (0.5 * (virComponents.y * diff_com.z + virComponents.z * diff_com.y));
+          }
         }
       }
     }
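
Note on the reduction in CalculateEnergyCUDAKernel.cu: DeviceReduce::Sum follows CUB's standard two-pass idiom. The first call, made while d_temp_storage is still NULL, only writes the required scratch size into temp_storage_bytes; the second call, after the CUMALLOC, performs the actual reduction. Because the real-term (gpu_REn) and LJ (gpu_LJEn) reductions have the same element type and length, the scratch buffer sized for the LJ reduction is guaranteed to be large enough for the real-term one as well, which is what lets the patch keep a single temp-storage allocation and move the electrostatic reduction inside the if (electrostatic) block. The following is a minimal standalone sketch of that idiom, not GOMC code: plain cudaMalloc/cudaFree/cudaMemset stand in for the CUMALLOC/CUFREE wrappers, and the per-thread energy buffers hold dummy zeros instead of kernel output.

#include <cub/cub.cuh>
#include <cstdio>

int main() {
  const int energyVectorLen = 1 << 20;
  const bool electrostatic = false;   // e.g., a system with no charges

  // LJ buffers are always needed; real-term buffers only with electrostatics.
  double *gpu_LJEn, *gpu_final_LJEn;
  double *gpu_REn = NULL, *gpu_final_REn = NULL;
  cudaMalloc(&gpu_LJEn, energyVectorLen * sizeof(double));
  cudaMemset(gpu_LJEn, 0, energyVectorLen * sizeof(double));
  cudaMalloc(&gpu_final_LJEn, sizeof(double));
  if (electrostatic) {
    cudaMalloc(&gpu_REn, energyVectorLen * sizeof(double));
    cudaMemset(gpu_REn, 0, energyVectorLen * sizeof(double));
    cudaMalloc(&gpu_final_REn, sizeof(double));
  }

  // Pass 1: with d_temp_storage == NULL, CUB only reports the scratch size.
  void *d_temp_storage = NULL;
  size_t temp_storage_bytes = 0;
  cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_LJEn,
                         gpu_final_LJEn, energyVectorLen);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);

  // Pass 2: the actual LJ reduction.
  cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_LJEn,
                         gpu_final_LJEn, energyVectorLen);

  double LJEn = 0.0, REn = 0.0;
  cudaMemcpy(&LJEn, gpu_final_LJEn, sizeof(double), cudaMemcpyDeviceToHost);

  if (electrostatic) {
    // Same length and element type as the LJ reduction, so the scratch
    // buffer sized above can be reused without reallocation.
    cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_REn,
                           gpu_final_REn, energyVectorLen);
    cudaMemcpy(&REn, gpu_final_REn, sizeof(double), cudaMemcpyDeviceToHost);
  }

  printf("LJEn = %f, REn = %f\n", LJEn, REn);

  cudaFree(d_temp_storage);
  cudaFree(gpu_LJEn);
  cudaFree(gpu_final_LJEn);
  if (electrostatic) {
    cudaFree(gpu_REn);
    cudaFree(gpu_final_REn);
  }
  return 0;
}

In the non-electrostatic path nothing is copied back from the device, which is why the host-side REn must be zeroed explicitly, exactly as the patch's else { REn = 0.0; } branch does.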