Improve GPU performance, especially without electrostatics
LSchwiebert committed Sep 17, 2020 · 1 parent a6ad643 · commit 9136af8
Showing 2 changed files with 61 additions and 50 deletions.
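Both files make the same structural change: the real-space electrostatic buffers (gpu_REn, gpu_final_REn) and the reductions, per-thread stores, and frees that go with them now run only when electrostatic is true, so charge-free simulations skip those allocations, global-memory writes, device reductions, and copies entirely. Two smaller tweaks ride along: half-box lengths are computed by multiplying by 0.5 instead of dividing by 2.0, and temporaries in the force kernel are declared at first use.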
57 changes: 35 additions & 22 deletions src/GPU/CalculateEnergyCUDAKernel.cu
@@ -65,11 +65,13 @@ void CallBoxInterGPU(VariablesCUDA *vars,
CUMALLOC((void**) &gpu_particleCharge, particleCharge.size() * sizeof(double));
CUMALLOC((void**) &gpu_particleKind, particleKind.size() * sizeof(int));
CUMALLOC((void**) &gpu_particleMol, particleMol.size() * sizeof(int));
CUMALLOC((void**) &gpu_REn, energyVectorLen * sizeof(double));
CUMALLOC((void**) &gpu_LJEn, energyVectorLen * sizeof(double));
CUMALLOC((void**) &gpu_final_REn, sizeof(double));
CUMALLOC((void**) &gpu_final_LJEn, sizeof(double));

if (electrostatic) {
  CUMALLOC((void**) &gpu_REn, energyVectorLen * sizeof(double));
  CUMALLOC((void**) &gpu_final_REn, sizeof(double));
}

// Copy necessary data to GPU
cudaMemcpy(gpu_neighborList, &neighborlist1D[0], neighborListCount * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(gpu_cellStartIndex, &cellStartIndex[0], cellStartIndex.size() * sizeof(int), cudaMemcpyHostToDevice);
@@ -85,9 +85,9 @@ void CallBoxInterGPU(VariablesCUDA *vars,
                              boxAxes.GetAxis(box).y,
                              boxAxes.GetAxis(box).z);

double3 halfAx = make_double3(boxAxes.GetAxis(box).x / 2.0,
                              boxAxes.GetAxis(box).y / 2.0,
                              boxAxes.GetAxis(box).z / 2.0);
double3 halfAx = make_double3(boxAxes.GetAxis(box).x * 0.5,
                              boxAxes.GetAxis(box).y * 0.5,
                              boxAxes.GetAxis(box).z * 0.5);
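The halfAx change above swaps three double-precision divisions for multiplications. Floating-point division is among the slowest arithmetic operations on a GPU, while multiplying by 0.5 is a single fast instruction, and because 2 is a power of two the result is bit-identical.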

BoxInterGPU <<< blocksPerGrid, threadsPerBlock>>>(gpu_cellStartIndex,
vars->gpu_cellVector,
@@ -143,28 +145,37 @@ void CallBoxInterGPU(VariablesCUDA *vars,
// ReduceSum
void * d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_REn,
                  gpu_final_REn, energyVectorLen);
CubDebugExit(CUMALLOC(&d_temp_storage, temp_storage_bytes));
DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_REn,
                  gpu_final_REn, energyVectorLen);
// LJ ReduceSum
DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_LJEn,
                  gpu_final_LJEn, energyVectorLen);
CUFREE(d_temp_storage);
CubDebugExit(CUMALLOC(&d_temp_storage, temp_storage_bytes));
DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_LJEn,
                  gpu_final_LJEn, energyVectorLen);
// Copy back the result to CPU ! :)
CubDebugExit(cudaMemcpy(&REn, gpu_final_REn, sizeof(double),
                        cudaMemcpyDeviceToHost));
CubDebugExit(cudaMemcpy(&LJEn, gpu_final_LJEn, sizeof(double),
                        cudaMemcpyDeviceToHost));
if (electrostatic) {
  // Real Term ReduceSum
  DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, gpu_REn,
                    gpu_final_REn, energyVectorLen);
  // Copy back the result to CPU ! :)
  CubDebugExit(cudaMemcpy(&REn, gpu_final_REn, sizeof(double),
                          cudaMemcpyDeviceToHost));
}
else {
  REn = 0.0;
}
CUFREE(d_temp_storage);

CUFREE(gpu_particleCharge);
CUFREE(gpu_particleKind);
CUFREE(gpu_particleMol);
CUFREE(gpu_REn);
CUFREE(gpu_LJEn);
CUFREE(gpu_final_REn);
CUFREE(gpu_final_LJEn);
if (electrostatic) {
  CUFREE(gpu_REn);
  CUFREE(gpu_final_REn);
}
CUFREE(gpu_neighborList);
CUFREE(gpu_cellStartIndex);
}
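A note on the calling pattern above, since the doubled call can look like a bug: cub::DeviceReduce::Sum is a two-phase API. When the temp-storage pointer is NULL, the call only writes the scratch size it needs into temp_storage_bytes; the second call with the allocated buffer actually performs the reduction. A minimal self-contained sketch of the pattern in plain CUDA (illustrative names, without GOMC's CUMALLOC/CUFREE wrappers):

    #include <cub/cub.cuh>

    // Sum n doubles from d_in into the single double at d_out.
    void SumOnDevice(const double *d_in, double *d_out, int n) {
      void *d_temp_storage = NULL;
      size_t temp_storage_bytes = 0;
      // Phase 1: d_temp_storage is NULL, so this only sets temp_storage_bytes.
      cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, n);
      cudaMalloc(&d_temp_storage, temp_storage_bytes);
      // Phase 2: identical arguments; now the reduction runs.
      cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, n);
      cudaFree(d_temp_storage);
    }

Since both reductions here have the same length and element type, the commit sizes and allocates the scratch buffer once for the LJ sum, reuses it for the real term when electrostatics are on, and frees it a single time at the end.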
@@ -247,7 +258,7 @@ __global__ void BoxInterGPU(int *gpu_cellStartIndex,

if(currentParticle < neighborParticle && gpu_particleMol[currentParticle] != gpu_particleMol[neighborParticle]) {
// Check if they are within rcut
double distSq = 0;
double distSq = 0.0;

if(InRcutGPU(distSq, gpu_x, gpu_y, gpu_z,
currentParticle, neighborParticle,
@@ -264,6 +275,10 @@

double lambdaVDW = DeviceGetLambdaVDW(mA, kA, mB, kB, box, gpu_isFraction,
                                      gpu_molIndex, gpu_kindIndex, gpu_lambdaVDW);
LJEn += CalcEnGPU(distSq, kA, kB, gpu_sigmaSq, gpu_n, gpu_epsilon_Cn,
                  gpu_VDW_Kind[0], gpu_isMartini[0], gpu_rCut[0],
                  gpu_rOn[0], gpu_count[0], lambdaVDW, sc_sigma_6,
                  sc_alpha, sc_power, gpu_rMin, gpu_rMaxSq, gpu_expConst);

if(electrostatic) {
static const double qqFact = 167000.0;
@@ -278,14 +293,12 @@
sc_sigma_6, sc_alpha, sc_power, gpu_sigmaSq,
gpu_count[0]);
}
LJEn += CalcEnGPU(distSq, kA, kB, gpu_sigmaSq, gpu_n, gpu_epsilon_Cn,
                  gpu_VDW_Kind[0], gpu_isMartini[0], gpu_rCut[0],
                  gpu_rOn[0], gpu_count[0], lambdaVDW, sc_sigma_6,
                  sc_alpha, sc_power, gpu_rMin, gpu_rMaxSq, gpu_expConst);
}
}
}
gpu_REn[threadID] = REn;
if(electrostatic) {
  gpu_REn[threadID] = REn;
}
gpu_LJEn[threadID] = LJEn;
}
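On the kernel side, the LJ energy is now accumulated for every pair inside the cutoff, while the real-term accumulation and the final store to gpu_REn are both gated on electrostatic; gpu_REn is not even allocated in the LJ-only case. Because electrostatic holds the same value for every thread, the branch causes no warp divergence; it simply removes a global-memory store. A hypothetical stripped-down kernel showing the shape (names are illustrative, not GOMC's):

    __global__ void PairEnergyKernel(int n, bool electrostatic,
                                     double *gpu_LJEn, double *gpu_REn) {
      int threadID = blockIdx.x * blockDim.x + threadIdx.x;
      if (threadID >= n) return;
      double LJEn = 0.0, REn = 0.0;
      // ... accumulate LJ pair terms into LJEn, and real-space
      // electrostatic terms into REn only when electrostatic is true ...
      gpu_LJEn[threadID] = LJEn;
      if (electrostatic)          // gpu_REn may be unallocated otherwise
        gpu_REn[threadID] = REn;
    }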

Expand Down
54 changes: 26 additions & 28 deletions src/GPU/CalculateForceCUDAKernel.cu
@@ -664,16 +664,14 @@ __global__ void BoxInterForceGPU(int *gpu_cellStartIndex,

virComponents = make_double3(0.0, 0.0, 0.0);

double pRF = 0.0, qi_qj, pVF = 0.0;
double lambdaVDW = 0.0, lambdaCoulomb = 0.0;
int threadID = blockIdx.x * blockDim.x + threadIdx.x;
//tensors for VDW and real part of electrostatic
//tensors for VDW
gpu_vT11[threadID] = 0.0, gpu_vT22[threadID] = 0.0, gpu_vT33[threadID] = 0.0;
// extra tensors reserved for later on
gpu_vT12[threadID] = 0.0, gpu_vT13[threadID] = 0.0, gpu_vT23[threadID] = 0.0;

if(electrostatic) {
  //tensors for VDW and real part of electrostatic
  //tensors for real part of electrostatic
  gpu_rT11[threadID] = 0.0, gpu_rT22[threadID] = 0.0, gpu_rT33[threadID] = 0.0;
  // extra tensors reserved for later on
  gpu_rT12[threadID] = 0.0, gpu_rT13[threadID] = 0.0, gpu_rT23[threadID] = 0.0;
@@ -720,36 +718,14 @@ __global__ void BoxInterForceGPU(int *gpu_cellStartIndex,
int mA = gpu_particleMol[currentParticle];
int mB = gpu_particleMol[neighborParticle];

lambdaVDW = DeviceGetLambdaVDW(mA, kA, mB, kB, box, gpu_isFraction,
double lambdaVDW = DeviceGetLambdaVDW(mA, kA, mB, kB, box, gpu_isFraction,
                                      gpu_molIndex, gpu_kindIndex,
                                      gpu_lambdaVDW);

diff_com = Difference(gpu_comx, gpu_comy, gpu_comz, mA, mB);
diff_com = MinImageGPU(diff_com, axis, halfAx);

if(electrostatic) {
  qi_qj = cA * cB;
  lambdaCoulomb = DeviceGetLambdaCoulomb(mA, kA, mB, kB, box,
                                         gpu_isFraction, gpu_molIndex,
                                         gpu_kindIndex, gpu_lambdaCoulomb);
  pRF = CalcCoulombForceGPU(distSq, qi_qj, gpu_VDW_Kind[0], gpu_ewald[0],
                            gpu_isMartini[0], gpu_alpha[box],
                            gpu_rCutCoulomb[box], gpu_diElectric_1[0],
                            gpu_sigmaSq, sc_coul, sc_sigma_6, sc_alpha,
                            sc_power, lambdaCoulomb, gpu_count[0],
                            kA, kB);

  gpu_rT11[threadID] += pRF * (virComponents.x * diff_com.x);
  gpu_rT22[threadID] += pRF * (virComponents.y * diff_com.y);
  gpu_rT33[threadID] += pRF * (virComponents.z * diff_com.z);

  //extra tensor calculations
  gpu_rT12[threadID] += pRF * (0.5 * (virComponents.x * diff_com.y + virComponents.y * diff_com.x));
  gpu_rT13[threadID] += pRF * (0.5 * (virComponents.x * diff_com.z + virComponents.z * diff_com.x));
  gpu_rT23[threadID] += pRF * (0.5 * (virComponents.y * diff_com.z + virComponents.z * diff_com.y));
}

pVF = CalcEnForceGPU(distSq, kA, kB,
double pVF = CalcEnForceGPU(distSq, kA, kB,
                            gpu_sigmaSq, gpu_n, gpu_epsilon_Cn, gpu_rCut[0],
                            gpu_rOn[0], gpu_isMartini[0], gpu_VDW_Kind[0],
                            gpu_count[0], lambdaVDW, sc_sigma_6, sc_alpha,
@@ -763,6 +739,28 @@
gpu_vT12[threadID] += pVF * (0.5 * (virComponents.x * diff_com.y + virComponents.y * diff_com.x));
gpu_vT13[threadID] += pVF * (0.5 * (virComponents.x * diff_com.z + virComponents.z * diff_com.x));
gpu_vT23[threadID] += pVF * (0.5 * (virComponents.y * diff_com.z + virComponents.z * diff_com.y));

if(electrostatic) {
  double qi_qj = cA * cB;
  double lambdaCoulomb = DeviceGetLambdaCoulomb(mA, kA, mB, kB, box,
                                                gpu_isFraction, gpu_molIndex,
                                                gpu_kindIndex, gpu_lambdaCoulomb);
  double pRF = CalcCoulombForceGPU(distSq, qi_qj, gpu_VDW_Kind[0], gpu_ewald[0],
                                   gpu_isMartini[0], gpu_alpha[box],
                                   gpu_rCutCoulomb[box], gpu_diElectric_1[0],
                                   gpu_sigmaSq, sc_coul, sc_sigma_6, sc_alpha,
                                   sc_power, lambdaCoulomb, gpu_count[0],
                                   kA, kB);

  gpu_rT11[threadID] += pRF * (virComponents.x * diff_com.x);
  gpu_rT22[threadID] += pRF * (virComponents.y * diff_com.y);
  gpu_rT33[threadID] += pRF * (virComponents.z * diff_com.z);

  //extra tensor calculations
  gpu_rT12[threadID] += pRF * (0.5 * (virComponents.x * diff_com.y + virComponents.y * diff_com.x));
  gpu_rT13[threadID] += pRF * (0.5 * (virComponents.x * diff_com.z + virComponents.z * diff_com.x));
  gpu_rT23[threadID] += pRF * (0.5 * (virComponents.y * diff_com.z + virComponents.z * diff_com.y));
}
}
}
}
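The VDW and real-term blocks accumulate the same symmetrized virial tensor: diagonal terms are p * v_a * d_a and off-diagonal terms are p * 0.5 * (v_a * d_b + v_b * d_a), where v holds the pair separation components (virComponents) and d the molecule center-of-mass difference (diff_com). The kernel writes these sums inline; a device helper like the following is just a compact way to show the algebra (a sketch, not GOMC code):

    // Accumulate one pair's contribution p into a symmetric 3x3 virial tensor.
    __device__ void AccumulateVirial(double p, double3 v, double3 d,
                                     double &t11, double &t22, double &t33,
                                     double &t12, double &t13, double &t23) {
      t11 += p * v.x * d.x;
      t22 += p * v.y * d.y;
      t33 += p * v.z * d.z;
      t12 += p * 0.5 * (v.x * d.y + v.y * d.x);
      t13 += p * 0.5 * (v.x * d.z + v.z * d.x);
      t23 += p * 0.5 * (v.y * d.z + v.z * d.y);
    }

As in the energy kernel, the electrostatic block now follows the unconditional VDW work, so a charge-free run executes one straight-line path with no real-term loads or stores.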
