Separate the GPU and CPU code in the .H header files under the dfLowM…

…achFoam solver.
deepmodeling · Apr 7, 2024 · 44217db · 44217db
1 parent 51d8df4
commit 44217db
Show file tree

Hide file tree

Showing 5 changed files with 686 additions and 0 deletions.
diff --git a/applications/solvers/dfLowMachFoam/EEqn_GPU.H b/applications/solvers/dfLowMachFoam/EEqn_GPU.H
@@ -0,0 +1,101 @@
+{
+    volScalarField& he = thermo.he();
+    double *h_he = dfDataBase.getFieldPointer("he", location::cpu, position::internal);
+    double *h_boundary_he = dfDataBase.getFieldPointer("he", location::cpu, position::boundary);
+
+    EEqn_GPU.process();
+    EEqn_GPU.sync();
+    // EEqn_GPU.postProcess(h_he, h_boundary_he);
+
+    // copy h_he to he(cpu)
+    // memcpy(&he[0], h_he, dfDataBase.cell_value_bytes);
+
+    //DEBUG_TRACE;
+    //he.correctBoundaryConditions();
+    //DEBUG_TRACE;
+
+#if defined DEBUG_
+    fvScalarMatrix EEqn
+    (
+
+        fvm::ddt(rho, he) + mvConvection->fvmDiv(phi, he)
+    +   fvc::ddt(rho, K) + fvc::div(phi, K)
+    -   dpdt
+    ==
+        (
+            turbName == "laminar"
+            ?
+            (
+                fvm::laplacian(turbulence->alpha(), he)
+            -   diffAlphaD
+            +   fvc::div(hDiffCorrFlux)
+            )
+            :
+            (
+                fvm::laplacian(turbulence->alphaEff(), he)
+            )
+        )
+    );
+    // EEqn.relax();
+    EEqn.solve("ha");
+    // checkResult
+    // TODO: for temp, now we compare ldu, finally we compare csr
+    std::vector<double> h_internal_coeffs(dfDataBase.num_boundary_surfaces);
+    std::vector<double> h_boundary_coeffs(dfDataBase.num_boundary_surfaces);
+
+    offset = 0;
+    forAll(he.boundaryField(), patchi)
+    {
+        const fvPatchScalarField& patchHe = he.boundaryField()[patchi];
+        int patchSize = patchHe.size();
+        const double* internal_coeff_ptr = &EEqn.internalCoeffs()[patchi][0];
+        const double* boundary_coeff_ptr = &EEqn.boundaryCoeffs()[patchi][0];
+        if (patchHe.type() == "processor"
+            || patchHe.type() == "processorCyclic") {
+            memcpy(h_internal_coeffs.data() + offset, internal_coeff_ptr, patchSize * sizeof(double));
+            memset(h_internal_coeffs.data() + offset + patchSize, 0, patchSize * sizeof(double));
+            memcpy(h_boundary_coeffs.data() + offset, boundary_coeff_ptr, patchSize * sizeof(double));
+            memset(h_boundary_coeffs.data() + offset + patchSize, 0, patchSize * sizeof(double));
+            offset += patchSize * 2;
+        } else {
+            memcpy(h_internal_coeffs.data() + offset, internal_coeff_ptr, patchSize * sizeof(double));
+            memcpy(h_boundary_coeffs.data() + offset, boundary_coeff_ptr, patchSize * sizeof(double));
+            offset += patchSize;
+        }
+    }
+
+    double *h_boundary_he_tmp = new double[dfDataBase.num_boundary_surfaces];
+    offset = 0;
+    forAll(he.boundaryField(), patchi)
+    {
+        const fvPatchScalarField& patchHe = he.boundaryField()[patchi];
+        int patchSize = patchHe.size();
+        if (patchHe.type() == "processor"
+            || patchHe.type() == "processorCyclic") {
+            const scalarField& patchHeInternal = dynamic_cast<const processorFvPatchField<scalar>&>(patchHe).patchInternalField()();
+            memcpy(h_boundary_he_tmp + offset, &patchHe[0], patchSize * sizeof(double));
+            memcpy(h_boundary_he_tmp + offset + patchSize, &patchHeInternal[0], patchSize * sizeof(double));
+            offset += patchSize * 2;
+        } else {
+            memcpy(h_boundary_he_tmp + offset, &patchHe[0], patchSize * sizeof(double));
+            offset += patchSize;
+        }
+    }
+
+    bool printFlag = false;
+    int rank = -1;
+    if (mpi_init_flag) {
+        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    }
+    if (!mpi_init_flag || rank == 0) {
+		// DEBUG_TRACE;
+		// EEqn_GPU.compareResult(&EEqn.lower()[0], &EEqn.upper()[0], &EEqn.diag()[0], &EEqn.source()[0],
+		// 		h_internal_coeffs.data(), h_boundary_coeffs.data(), printFlag);
+		// DEBUG_TRACE;
+        // EEqn_GPU.compareHe(&he[0], h_boundary_he_tmp, printFlag);
+    }
+
+    delete h_boundary_he_tmp;
+
+#endif
+}
diff --git a/applications/solvers/dfLowMachFoam/UEqn_GPU.H b/applications/solvers/dfLowMachFoam/UEqn_GPU.H
@@ -0,0 +1,122 @@
+// Solve the Momentum equation
+#if defined DEBUG_
+    // run CPU, for temp
+    TICK_START;
+    tmp<fvVectorMatrix> tUEqn
+    (
+        fvm::ddt(rho, U) 
+        + 
+        fvm::div(phi, U)
+        +  
+        turbulence->divDevRhoReff(U)
+    );
+    fvVectorMatrix& UEqn = tUEqn.ref();
+    TICK_STOP(CPU assembly time);
+
+    volTensorField gradU = fvc::grad(U);
+
+    double *h_boundary_gradU = new double[dfDataBase.num_boundary_surfaces * 9];
+    offset = 0;
+    forAll(U.boundaryField(), patchi)
+    {
+        const fvPatchTensorField& patchGradU = gradU.boundaryField()[patchi];
+        int patchsize = patchGradU.size();
+        if (patchGradU.type() == "processor"
+                || patchGradU.type() == "processorCyclic") {
+            // print info
+            if (dynamic_cast<const processorFvPatchField<tensor>&>(patchGradU).doTransform()) {
+                Info << "gradU transform = true" << endl;
+            } else {
+                Info << "gradU transform = false" << endl;
+            }
+            Info << "rank = " << dynamic_cast<const processorFvPatchField<tensor>&>(patchGradU).rank() << endl;
+
+            memcpy(h_boundary_gradU + 9*offset, &patchGradU[0][0], patchsize * 9 * sizeof(double));
+            tensorField patchGradUInternal = 
+                    dynamic_cast<const processorFvPatchField<tensor>&>(patchGradU).patchInternalField()();
+            memcpy(h_boundary_gradU + 9*offset + patchsize * 9, &patchGradUInternal[0][0], patchsize * 9 * sizeof(double));
+            offset += patchsize * 2;
+        } else {
+            memcpy(h_boundary_gradU + 9*offset, &patchGradU[0][0], patchsize * 9 * sizeof(double));
+            offset += patchsize;
+        }
+    }
+#endif
+
+    // process
+    TICK_START;
+    UEqn_GPU.process();
+    UEqn_GPU.sync();
+    TICK_STOP(GPU process time);
+
+    // postProcess
+    // TICK_START;
+    // UEqn_GPU.postProcess(h_u);
+    // memcpy(&U[0][0], h_u, dfDataBase.cell_value_vec_bytes);
+    // U.correctBoundaryConditions();
+    // K = 0.5*magSqr(U);
+    // DEBUG_TRACE;
+    // TICK_STOP(post process time);
+
+#if defined DEBUG_
+    // UEqn.relax();
+    TICK_START;
+    solve(UEqn == -fvc::grad(p));
+    K.oldTime();
+    K = 0.5*magSqr(U);
+    TICK_STOP(CPU solve time);
+    // checkResult
+    // TODO: for temp, now we compare ldu, finally we compare csr
+    std::vector<double> h_internal_coeffs(dfDataBase.num_boundary_surfaces * 3);
+    std::vector<double> h_boundary_coeffs(dfDataBase.num_boundary_surfaces * 3);
+
+    offset = 0;
+    for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++)
+    {
+        const fvPatchVectorField& patchU = U.boundaryField()[patchi];
+        int patchsize = dfDataBase.patch_size[patchi];
+        const double* internal_coeff_ptr = &UEqn.internalCoeffs()[patchi][0][0];
+        const double* boundary_coeff_ptr = &UEqn.boundaryCoeffs()[patchi][0][0];
+        memcpy(h_internal_coeffs.data() + offset * 3, internal_coeff_ptr, patchsize * 3 * sizeof(double));
+        memcpy(h_boundary_coeffs.data() + offset * 3, boundary_coeff_ptr, patchsize * 3 * sizeof(double));
+        if (patchU.type() == "processor" || patchU.type() == "processorCyclic") offset += 2 * patchsize;
+        else offset += patchsize;
+    }
+
+    double *h_boundary_u_tmp = new double[dfDataBase.num_boundary_surfaces * 3];
+    offset = 0;
+    forAll(U.boundaryField(), patchi)
+    {
+        const fvPatchVectorField& patchU = U.boundaryField()[patchi];
+        int patchsize = dfDataBase.patch_size[patchi];
+
+        if (patchU.type() == "processor"
+            || patchU.type() == "processorCyclic") {
+            memcpy(h_boundary_u_tmp + 3*offset, &patchU[0][0], 3*patchsize * sizeof(double));
+            vectorField patchUInternal = 
+                    dynamic_cast<const processorFvPatchField<vector>&>(patchU).patchInternalField()();
+            memcpy(h_boundary_u_tmp + 3*offset + 3*patchsize, &patchUInternal[0][0], 3*patchsize * sizeof(double));
+            offset += 2 * patchsize;
+        } else {
+            memcpy(h_boundary_u_tmp + 3*offset, &patchU[0][0], 3*patchsize * sizeof(double));
+            offset += patchsize;
+        }
+    }
+
+    bool printFlag = false;
+
+    int rank = -1;
+    if (mpi_init_flag) {
+        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    }
+
+    if (!mpi_init_flag || rank == 0) {
+        // UEqn_GPU.compareResult(&UEqn.lower()[0], &UEqn.upper()[0], &UEqn.diag()[0], &UEqn.source()[0][0],
+        //     h_internal_coeffs.data(), h_boundary_coeffs.data(), 
+        //     // &gradU[0][0], h_boundary_gradU,
+        //     printFlag);
+        // UEqn_GPU.compareU(&U[0][0], h_boundary_u_tmp, printFlag);
+    }
+    DEBUG_TRACE;
+#endif
+