Skip to content

Commit

Permalink
Ali's patch to reduce cudamallocs/cudafrees in BoxInterForce
Browse files Browse the repository at this point in the history
  • Loading branch information
LSchwiebert committed Oct 18, 2024
1 parent 8c67339 commit 5d4138e
Show file tree
Hide file tree
Showing 3 changed files with 83 additions and 63 deletions.
88 changes: 27 additions & 61 deletions src/GPU/CalculateForceCUDAKernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -54,20 +54,7 @@ void CallBoxInterForceGPU(

CUMALLOC((void **)&gpu_neighborList, numberOfCellPairs * sizeof(int));
CUMALLOC((void **)&gpu_cellStartIndex, cellStartIndex.size() * sizeof(int));
if (electrostatic) {
CUMALLOC((void **)&vars->gpu_rT11, energyVectorLen * sizeof(double));
CUMALLOC((void **)&vars->gpu_rT12, energyVectorLen * sizeof(double));
CUMALLOC((void **)&vars->gpu_rT13, energyVectorLen * sizeof(double));
CUMALLOC((void **)&vars->gpu_rT22, energyVectorLen * sizeof(double));
CUMALLOC((void **)&vars->gpu_rT23, energyVectorLen * sizeof(double));
CUMALLOC((void **)&vars->gpu_rT33, energyVectorLen * sizeof(double));
}
CUMALLOC((void **)&vars->gpu_vT11, energyVectorLen * sizeof(double));
CUMALLOC((void **)&vars->gpu_vT12, energyVectorLen * sizeof(double));
CUMALLOC((void **)&vars->gpu_vT13, energyVectorLen * sizeof(double));
CUMALLOC((void **)&vars->gpu_vT22, energyVectorLen * sizeof(double));
CUMALLOC((void **)&vars->gpu_vT23, energyVectorLen * sizeof(double));
CUMALLOC((void **)&vars->gpu_vT33, energyVectorLen * sizeof(double));
UpdateEnergyVecs(vars, energyVectorLen, electrostatic);
#ifndef NDEBUG
checkLastErrorCUDA(__FILE__, __LINE__);
#endif
Expand Down Expand Up @@ -97,8 +84,8 @@ void CallBoxInterForceGPU(
boxAxes.GetAxis(box).z);

double3 halfAx =
make_double3(boxAxes.GetAxis(box).x / 2.0, boxAxes.GetAxis(box).y / 2.0,
boxAxes.GetAxis(box).z / 2.0);
make_double3(boxAxes.GetAxis(box).x * 0.5, boxAxes.GetAxis(box).y * 0.5,
boxAxes.GetAxis(box).z * 0.5);

BoxInterForceGPU<<<blocksPerGrid, threadsPerBlock>>>(
gpu_cellStartIndex, vars->gpu_cellVector, gpu_neighborList, numberOfCells,
Expand All @@ -122,74 +109,53 @@ void CallBoxInterForceGPU(
checkLastErrorCUDA(__FILE__, __LINE__);
#endif

// ReduceSum // Virial of LJ
void *d_temp_storage = NULL;
size_t temp_storage_bytes = 0;
DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, vars->gpu_vT11,
vars->gpu_finalVal, energyVectorLen);
CUMALLOC(&d_temp_storage, temp_storage_bytes);
DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, vars->gpu_vT11,
vars->gpu_finalVal, energyVectorLen);
DeviceReduce::Sum(vars->cub_reduce_storage, vars->cub_reduce_storage_size,
vars->gpu_vT11, vars->gpu_finalVal, energyVectorLen);
cudaMemcpy(&vT11, vars->gpu_finalVal, sizeof(double), cudaMemcpyDeviceToHost);
DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, vars->gpu_vT12,
vars->gpu_finalVal, energyVectorLen);
DeviceReduce::Sum(vars->cub_reduce_storage, vars->cub_reduce_storage_size,
vars->gpu_vT12, vars->gpu_finalVal, energyVectorLen);
cudaMemcpy(&vT12, vars->gpu_finalVal, sizeof(double), cudaMemcpyDeviceToHost);
DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, vars->gpu_vT13,
vars->gpu_finalVal, energyVectorLen);
DeviceReduce::Sum(vars->cub_reduce_storage, vars->cub_reduce_storage_size,
vars->gpu_vT13, vars->gpu_finalVal, energyVectorLen);
cudaMemcpy(&vT13, vars->gpu_finalVal, sizeof(double), cudaMemcpyDeviceToHost);
DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, vars->gpu_vT22,
vars->gpu_finalVal, energyVectorLen);
DeviceReduce::Sum(vars->cub_reduce_storage, vars->cub_reduce_storage_size,
vars->gpu_vT22, vars->gpu_finalVal, energyVectorLen);
cudaMemcpy(&vT22, vars->gpu_finalVal, sizeof(double), cudaMemcpyDeviceToHost);
DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, vars->gpu_vT23,
vars->gpu_finalVal, energyVectorLen);
DeviceReduce::Sum(vars->cub_reduce_storage, vars->cub_reduce_storage_size,
vars->gpu_vT23, vars->gpu_finalVal, energyVectorLen);
cudaMemcpy(&vT23, vars->gpu_finalVal, sizeof(double), cudaMemcpyDeviceToHost);
DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, vars->gpu_vT33,
vars->gpu_finalVal, energyVectorLen);
DeviceReduce::Sum(vars->cub_reduce_storage, vars->cub_reduce_storage_size,
vars->gpu_vT33, vars->gpu_finalVal, energyVectorLen);
cudaMemcpy(&vT33, vars->gpu_finalVal, sizeof(double), cudaMemcpyDeviceToHost);

if (electrostatic) {
// ReduceSum // Virial of Coulomb
DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, vars->gpu_rT11,
vars->gpu_finalVal, energyVectorLen);
DeviceReduce::Sum(vars->cub_reduce_storage, vars->cub_reduce_storage_size,
vars->gpu_rT11, vars->gpu_finalVal, energyVectorLen);
cudaMemcpy(&rT11, vars->gpu_finalVal, sizeof(double),
cudaMemcpyDeviceToHost);
DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, vars->gpu_rT12,
vars->gpu_finalVal, energyVectorLen);
DeviceReduce::Sum(vars->cub_reduce_storage, vars->cub_reduce_storage_size,
vars->gpu_rT12, vars->gpu_finalVal, energyVectorLen);
cudaMemcpy(&rT12, vars->gpu_finalVal, sizeof(double),
cudaMemcpyDeviceToHost);
DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, vars->gpu_rT13,
vars->gpu_finalVal, energyVectorLen);
DeviceReduce::Sum(vars->cub_reduce_storage, vars->cub_reduce_storage_size,
vars->gpu_rT13, vars->gpu_finalVal, energyVectorLen);
cudaMemcpy(&rT13, vars->gpu_finalVal, sizeof(double),
cudaMemcpyDeviceToHost);
DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, vars->gpu_rT22,
vars->gpu_finalVal, energyVectorLen);
DeviceReduce::Sum(vars->cub_reduce_storage, vars->cub_reduce_storage_size,
vars->gpu_rT22, vars->gpu_finalVal, energyVectorLen);
cudaMemcpy(&rT22, vars->gpu_finalVal, sizeof(double),
cudaMemcpyDeviceToHost);
DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, vars->gpu_rT23,
vars->gpu_finalVal, energyVectorLen);
DeviceReduce::Sum(vars->cub_reduce_storage, vars->cub_reduce_storage_size,
vars->gpu_rT23, vars->gpu_finalVal, energyVectorLen);
cudaMemcpy(&rT23, vars->gpu_finalVal, sizeof(double),
cudaMemcpyDeviceToHost);
DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, vars->gpu_rT33,
vars->gpu_finalVal, energyVectorLen);
DeviceReduce::Sum(vars->cub_reduce_storage, vars->cub_reduce_storage_size,
vars->gpu_rT33, vars->gpu_finalVal, energyVectorLen);
cudaMemcpy(&rT33, vars->gpu_finalVal, sizeof(double),
cudaMemcpyDeviceToHost);
}

if (electrostatic) {
CUFREE(vars->gpu_rT11);
CUFREE(vars->gpu_rT12);
CUFREE(vars->gpu_rT13);
CUFREE(vars->gpu_rT22);
CUFREE(vars->gpu_rT23);
CUFREE(vars->gpu_rT33);
}
CUFREE(vars->gpu_vT11);
CUFREE(vars->gpu_vT12);
CUFREE(vars->gpu_vT13);
CUFREE(vars->gpu_vT22);
CUFREE(vars->gpu_vT23);
CUFREE(vars->gpu_vT33);
CUFREE(d_temp_storage);
CUFREE(gpu_neighborList);
CUFREE(gpu_cellStartIndex);
}
Expand Down
40 changes: 38 additions & 2 deletions src/GPU/ConstantDefinitionsCUDAKernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -337,14 +337,38 @@ void UpdateEnergyVecs(VariablesCUDA *vars, int newVecLen, bool electrostatic) {
// Free the current allocations if this isn't the first allocation
if (vars->gpu_energyVecLen > 0) {
CUFREE(vars->gpu_LJEn);
CUFREE(vars->gpu_vT11);
CUFREE(vars->gpu_vT12);
CUFREE(vars->gpu_vT13);
CUFREE(vars->gpu_vT22);
CUFREE(vars->gpu_vT23);
CUFREE(vars->gpu_vT33);
if (electrostatic) {
CUFREE(vars->gpu_REn);
CUFREE(vars->gpu_rT11);
CUFREE(vars->gpu_rT12);
CUFREE(vars->gpu_rT13);
CUFREE(vars->gpu_rT22);
CUFREE(vars->gpu_rT23);
CUFREE(vars->gpu_rT33);
}
}
vars->gpu_energyVecLen = newVecLen;
CUMALLOC((void **)&vars->gpu_LJEn, vars->gpu_energyVecLen * sizeof(double));
CUMALLOC((void **)&vars->gpu_LJEn, newVecLen * sizeof(double));
CUMALLOC((void **)&vars->gpu_vT11, newVecLen * sizeof(double));
CUMALLOC((void **)&vars->gpu_vT12, newVecLen * sizeof(double));
CUMALLOC((void **)&vars->gpu_vT13, newVecLen * sizeof(double));
CUMALLOC((void **)&vars->gpu_vT22, newVecLen * sizeof(double));
CUMALLOC((void **)&vars->gpu_vT23, newVecLen * sizeof(double));
CUMALLOC((void **)&vars->gpu_vT33, newVecLen * sizeof(double));
if (electrostatic) {
CUMALLOC((void **)&vars->gpu_REn, vars->gpu_energyVecLen * sizeof(double));
CUMALLOC((void **)&vars->gpu_REn, newVecLen * sizeof(double));
CUMALLOC((void **)&vars->gpu_rT11, newVecLen * sizeof(double));
CUMALLOC((void **)&vars->gpu_rT12, newVecLen * sizeof(double));
CUMALLOC((void **)&vars->gpu_rT13, newVecLen * sizeof(double));
CUMALLOC((void **)&vars->gpu_rT22, newVecLen * sizeof(double));
CUMALLOC((void **)&vars->gpu_rT23, newVecLen * sizeof(double));
CUMALLOC((void **)&vars->gpu_rT33, newVecLen * sizeof(double));
}

// Check if more temporary storage is needed for this larger reduction size.
Expand Down Expand Up @@ -445,6 +469,18 @@ void DestroyCUDAVars(VariablesCUDA *vars) {
CUFREE(vars->gpu_comz);
CUFREE(vars->gpu_LJEn);
CUFREE(vars->gpu_REn);
CUFREE(vars->gpu_rT11);
CUFREE(vars->gpu_rT12);
CUFREE(vars->gpu_rT13);
CUFREE(vars->gpu_rT22);
CUFREE(vars->gpu_rT23);
CUFREE(vars->gpu_rT33);
CUFREE(vars->gpu_vT11);
CUFREE(vars->gpu_vT12);
CUFREE(vars->gpu_vT13);
CUFREE(vars->gpu_vT22);
CUFREE(vars->gpu_vT23);
CUFREE(vars->gpu_vT33);
CUFREE(vars->gpu_r_k_x);
CUFREE(vars->gpu_r_k_y);
CUFREE(vars->gpu_r_k_z);
Expand Down
18 changes: 18 additions & 0 deletions src/GPU/VariablesCUDA.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,24 @@ public:
gpu_energyVecLen = 0;
gpu_LJEn = nullptr;
gpu_REn = nullptr;
gpu_rT11 = nullptr;
gpu_rT12 = nullptr;
gpu_rT13 = nullptr;
gpu_rT22 = nullptr;
gpu_rT23 = nullptr;
gpu_rT33 = nullptr;
gpu_vT11 = nullptr;
gpu_vT12 = nullptr;
gpu_vT13 = nullptr;
gpu_vT22 = nullptr;
gpu_vT23 = nullptr;
gpu_vT33 = nullptr;
gpu_wT11 = nullptr;
gpu_wT12 = nullptr;
gpu_wT13 = nullptr;
gpu_wT22 = nullptr;
gpu_wT23 = nullptr;
gpu_wT33 = nullptr;

// setting lambda values to nullptr
gpu_molIndex = nullptr;
Expand Down

0 comments on commit 5d4138e

Please sign in to comment.