Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/GPU-dev'
Browse files Browse the repository at this point in the history
  • Loading branch information
maorz1998 committed Dec 23, 2023
2 parents 85bedbe + 362d74d commit 4ec0c1e
Show file tree
Hide file tree
Showing 41 changed files with 11,492 additions and 4,393 deletions.
137 changes: 119 additions & 18 deletions applications/solvers/dfLowMachFoam/EEqn.H
Original file line number Diff line number Diff line change
@@ -1,36 +1,137 @@
{
volScalarField& he = thermo.he();

#if defined GPUSolverNew_
double *h_he = dfDataBase.getFieldPointer("he", location::cpu, position::internal);
double *h_boundary_he = dfDataBase.getFieldPointer("he", location::cpu, position::boundary);

EEqn_GPU.process();
EEqn_GPU.sync();
// EEqn_GPU.postProcess(h_he, h_boundary_he);

// copy h_he to he(cpu)
// memcpy(&he[0], h_he, dfDataBase.cell_value_bytes);

//DEBUG_TRACE;
//he.correctBoundaryConditions();
//DEBUG_TRACE;

#if defined DEBUG_
fvScalarMatrix EEqn
(

fvm::ddt(rho, he) + mvConvection->fvmDiv(phi, he)
+ fvc::ddt(rho, K) + fvc::div(phi, K)
- dpdt
==
(
turbName == "laminar"
?
(
fvm::laplacian(turbulence->alpha(), he)
- diffAlphaD
+ fvc::div(hDiffCorrFlux)
)
:
(
fvm::laplacian(turbulence->alphaEff(), he)
)
)
);
// EEqn.relax();
EEqn.solve("ha");
// checkResult
// TODO: for temp, now we compare ldu, finally we compare csr
std::vector<double> h_internal_coeffs(dfDataBase.num_boundary_surfaces);
std::vector<double> h_boundary_coeffs(dfDataBase.num_boundary_surfaces);

offset = 0;
forAll(he.boundaryField(), patchi)
{
const fvPatchScalarField& patchHe = he.boundaryField()[patchi];
int patchSize = patchHe.size();
const double* internal_coeff_ptr = &EEqn.internalCoeffs()[patchi][0];
const double* boundary_coeff_ptr = &EEqn.boundaryCoeffs()[patchi][0];
if (patchHe.type() == "processor"
|| patchHe.type() == "processorCyclic") {
memcpy(h_internal_coeffs.data() + offset, internal_coeff_ptr, patchSize * sizeof(double));
memset(h_internal_coeffs.data() + offset + patchSize, 0, patchSize * sizeof(double));
memcpy(h_boundary_coeffs.data() + offset, boundary_coeff_ptr, patchSize * sizeof(double));
memset(h_boundary_coeffs.data() + offset + patchSize, 0, patchSize * sizeof(double));
offset += patchSize * 2;
} else {
memcpy(h_internal_coeffs.data() + offset, internal_coeff_ptr, patchSize * sizeof(double));
memcpy(h_boundary_coeffs.data() + offset, boundary_coeff_ptr, patchSize * sizeof(double));
offset += patchSize;
}
}

double *h_boundary_he_tmp = new double[dfDataBase.num_boundary_surfaces];
offset = 0;
forAll(he.boundaryField(), patchi)
{
const fvPatchScalarField& patchHe = he.boundaryField()[patchi];
int patchSize = patchHe.size();
if (patchHe.type() == "processor"
|| patchHe.type() == "processorCyclic") {
const scalarField& patchHeInternal = dynamic_cast<const processorFvPatchField<scalar>&>(patchHe).patchInternalField()();
memcpy(h_boundary_he_tmp + offset, &patchHe[0], patchSize * sizeof(double));
memcpy(h_boundary_he_tmp + offset + patchSize, &patchHeInternal[0], patchSize * sizeof(double));
offset += patchSize * 2;
} else {
memcpy(h_boundary_he_tmp + offset, &patchHe[0], patchSize * sizeof(double));
offset += patchSize;
}
}

bool printFlag = false;
int rank = -1;
if (mpi_init_flag) {
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
}
if (!mpi_init_flag || rank == 0) {
// DEBUG_TRACE;
// EEqn_GPU.compareResult(&EEqn.lower()[0], &EEqn.upper()[0], &EEqn.diag()[0], &EEqn.source()[0],
// h_internal_coeffs.data(), h_boundary_coeffs.data(), printFlag);
// DEBUG_TRACE;
// EEqn_GPU.compareHe(&he[0], h_boundary_he_tmp, printFlag);
}

delete h_boundary_he_tmp;

#endif

#else
start1 = std::clock();
fvScalarMatrix EEqn
(

fvm::ddt(rho, he) + mvConvection->fvmDiv(phi, he)
+ fvc::ddt(rho, K) + fvc::div(phi, K)
- dpdt
==
fvm::ddt(rho, he) + mvConvection->fvmDiv(phi, he)
+ fvc::ddt(rho, K) + fvc::div(phi, K)
- dpdt
==
(
turbName == "laminar"
?
(
fvm::laplacian(turbulence->alpha(), he)
- diffAlphaD
+ fvc::div(hDiffCorrFlux)
)
:
(
turbName == "laminar"
?
(
fvm::laplacian(turbulence->alpha(), he)
- diffAlphaD
+ fvc::div(hDiffCorrFlux)
)
:
(
fvm::laplacian(turbulence->alphaEff(), he)
)
fvm::laplacian(turbulence->alphaEff(), he)
)
);
)
);
end1 = std::clock();
time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);

EEqn.relax();
// EEqn.relax();
start1 = std::clock();
EEqn.solve("ha");
end1 = std::clock();
time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
time_monitor_EEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
#endif
}
8 changes: 4 additions & 4 deletions applications/solvers/dfLowMachFoam/Make/options
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ EXE_INC = -std=c++14 \
$(PFLAGS) $(PINC) \
$(if $(LIBTORCH_ROOT),-DUSE_LIBTORCH,) \
$(if $(PYTHON_INC_DIR),-DUSE_PYTORCH,) \
$(if $(AMGX_DIR),-DGPUSolver_,) \
-I$(LIB_SRC)/transportModels/compressible/lnInclude \
-I$(LIB_SRC)/thermophysicalModels/basic/lnInclude \
-I$(LIB_SRC)/TurbulenceModels/turbulenceModels/lnInclude \
Expand All @@ -28,7 +27,7 @@ EXE_INC = -std=c++14 \
$(if $(LIBTORCH_ROOT),-I$(LIBTORCH_ROOT)/include/torch/csrc/api/include,) \
$(PYTHON_INC_DIR) \
$(if $(AMGX_DIR), -I$(DF_ROOT)/src_gpu,) \
$(if $(AMGX_DIR), -I/usr/local/cuda-11.6/include,) \
$(if $(AMGX_DIR), -I/usr/local/cuda/include,) \
$(if $(AMGX_DIR), -I$(AMGX_DIR)/include,)

EXE_LIBS = \
Expand All @@ -50,6 +49,7 @@ EXE_LIBS = \
$(if $(LIBTORCH_ROOT),-lpthread,) \
$(if $(LIBTORCH_ROOT),$(DF_SRC)/dfChemistryModel/DNNInferencer/build/libDNNInferencer.so,) \
$(if $(PYTHON_LIB_DIR),$(PYTHON_LIB_DIR),) \
$(if $(AMGX_DIR), /usr/local/cuda-11.6/lib64/libcudart.so,) \
$(if $(AMGX_DIR), /usr/local/cuda/lib64/libcudart.so,) \
$(if $(AMGX_DIR), /usr/local/cuda/lib64/libnccl.so,) \
$(if $(AMGX_DIR), $(DF_ROOT)/src_gpu/build/libdfMatrix.so,) \
$(if $(AMGX_DIR), $(AMGX_DIR)/build/libamgxsh.so,)
$(if $(AMGX_DIR), $(AMGX_DIR)/build/libamgxsh.so,)
166 changes: 145 additions & 21 deletions applications/solvers/dfLowMachFoam/UEqn.H
Original file line number Diff line number Diff line change
@@ -1,24 +1,148 @@
start1 = std::clock();
tmp<fvVectorMatrix> tUEqn
(
fvm::ddt(rho, U) + fvm::div(phi, U)
+ turbulence->divDevRhoReff(U)
);
fvVectorMatrix& UEqn = tUEqn.ref();

end1 = std::clock();
time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);

UEqn.relax();
start1 = std::clock();
if (pimple.momentumPredictor())
{
solve(UEqn == -fvc::grad(p));
// Solve the Momentum equation
#ifdef GPUSolverNew_

#if defined DEBUG_
// run CPU, for temp
TICK_START;
tmp<fvVectorMatrix> tUEqn
(
fvm::ddt(rho, U)
+
fvm::div(phi, U)
+
turbulence->divDevRhoReff(U)
);
fvVectorMatrix& UEqn = tUEqn.ref();
TICK_STOP(CPU assembly time);

volTensorField gradU = fvc::grad(U);

double *h_boundary_gradU = new double[dfDataBase.num_boundary_surfaces * 9];
offset = 0;
forAll(U.boundaryField(), patchi)
{
const fvPatchTensorField& patchGradU = gradU.boundaryField()[patchi];
int patchsize = patchGradU.size();
if (patchGradU.type() == "processor"
|| patchGradU.type() == "processorCyclic") {
// print info
if (dynamic_cast<const processorFvPatchField<tensor>&>(patchGradU).doTransform()) {
Info << "gradU transform = true" << endl;
} else {
Info << "gradU transform = false" << endl;
}
Info << "rank = " << dynamic_cast<const processorFvPatchField<tensor>&>(patchGradU).rank() << endl;

memcpy(h_boundary_gradU + 9*offset, &patchGradU[0][0], patchsize * 9 * sizeof(double));
tensorField patchGradUInternal =
dynamic_cast<const processorFvPatchField<tensor>&>(patchGradU).patchInternalField()();
memcpy(h_boundary_gradU + 9*offset + patchsize * 9, &patchGradUInternal[0][0], patchsize * 9 * sizeof(double));
offset += patchsize * 2;
} else {
memcpy(h_boundary_gradU + 9*offset, &patchGradU[0][0], patchsize * 9 * sizeof(double));
offset += patchsize;
}
}
#endif

// process
TICK_START;
UEqn_GPU.process();
UEqn_GPU.sync();
TICK_STOP(GPU process time);

// postProcess
// TICK_START;
// UEqn_GPU.postProcess(h_u);
// memcpy(&U[0][0], h_u, dfDataBase.cell_value_vec_bytes);
// U.correctBoundaryConditions();
// K = 0.5*magSqr(U);
// DEBUG_TRACE;
// TICK_STOP(post process time);

#if defined DEBUG_
// UEqn.relax();
TICK_START;
solve(UEqn == -fvc::grad(p));
K.oldTime();
K = 0.5*magSqr(U);
}
end1 = std::clock();
time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
time_monitor_UEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
TICK_STOP(CPU solve time);
// checkResult
// TODO: for temp, now we compare ldu, finally we compare csr
std::vector<double> h_internal_coeffs(dfDataBase.num_boundary_surfaces * 3);
std::vector<double> h_boundary_coeffs(dfDataBase.num_boundary_surfaces * 3);

offset = 0;
for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++)
{
const fvPatchVectorField& patchU = U.boundaryField()[patchi];
int patchsize = dfDataBase.patch_size[patchi];
const double* internal_coeff_ptr = &UEqn.internalCoeffs()[patchi][0][0];
const double* boundary_coeff_ptr = &UEqn.boundaryCoeffs()[patchi][0][0];
memcpy(h_internal_coeffs.data() + offset * 3, internal_coeff_ptr, patchsize * 3 * sizeof(double));
memcpy(h_boundary_coeffs.data() + offset * 3, boundary_coeff_ptr, patchsize * 3 * sizeof(double));
if (patchU.type() == "processor" || patchU.type() == "processorCyclic") offset += 2 * patchsize;
else offset += patchsize;
}

double *h_boundary_u_tmp = new double[dfDataBase.num_boundary_surfaces * 3];
offset = 0;
forAll(U.boundaryField(), patchi)
{
const fvPatchVectorField& patchU = U.boundaryField()[patchi];
int patchsize = dfDataBase.patch_size[patchi];

if (patchU.type() == "processor"
|| patchU.type() == "processorCyclic") {
memcpy(h_boundary_u_tmp + 3*offset, &patchU[0][0], 3*patchsize * sizeof(double));
vectorField patchUInternal =
dynamic_cast<const processorFvPatchField<vector>&>(patchU).patchInternalField()();
memcpy(h_boundary_u_tmp + 3*offset + 3*patchsize, &patchUInternal[0][0], 3*patchsize * sizeof(double));
offset += 2 * patchsize;
} else {
memcpy(h_boundary_u_tmp + 3*offset, &patchU[0][0], 3*patchsize * sizeof(double));
offset += patchsize;
}
}

bool printFlag = false;

int rank = -1;
if (mpi_init_flag) {
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
}

if (!mpi_init_flag || rank == 0) {
// UEqn_GPU.compareResult(&UEqn.lower()[0], &UEqn.upper()[0], &UEqn.diag()[0], &UEqn.source()[0][0],
// h_internal_coeffs.data(), h_boundary_coeffs.data(),
// // &gradU[0][0], h_boundary_gradU,
// printFlag);
// UEqn_GPU.compareU(&U[0][0], h_boundary_u_tmp, printFlag);
}
DEBUG_TRACE;
#endif

#else
start1 = std::clock();
tmp<fvVectorMatrix> tUEqn
(
fvm::ddt(rho, U) + fvm::div(phi, U)
+ turbulence->divDevRhoReff(U)
);
fvVectorMatrix& UEqn = tUEqn.ref();
end1 = std::clock();
time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);

// UEqn.relax();
start1 = std::clock();
if (pimple.momentumPredictor())
{
solve(UEqn == -fvc::grad(p));
K.oldTime();
K = 0.5*magSqr(U);
}
end1 = std::clock();
time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
time_monitor_UEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
#endif
Loading

0 comments on commit 4ec0c1e

Please sign in to comment.