From 44217dbc3d5c58fb0135d2dd3d8d3f57f27c5c6e Mon Sep 17 00:00:00 2001
From: uptonwu <uptonwu@163.com>
Date: Sun, 7 Apr 2024 15:13:09 +0800
Subject: [PATCH] Separate the GPU and CPU code in the .H header files under
 the dfLowMachFoam solver.

---
 applications/solvers/dfLowMachFoam/EEqn_GPU.H | 101 +++++++
 applications/solvers/dfLowMachFoam/UEqn_GPU.H | 122 +++++++++
 applications/solvers/dfLowMachFoam/YEqn_GPU.H | 247 ++++++++++++++++++
 applications/solvers/dfLowMachFoam/pEqn.H     | 129 +++++++++
 .../solvers/dfLowMachFoam/rhoEqn_GPU.H        |  87 ++++++
 5 files changed, 686 insertions(+)
 create mode 100644 applications/solvers/dfLowMachFoam/EEqn_GPU.H
 create mode 100644 applications/solvers/dfLowMachFoam/UEqn_GPU.H
 create mode 100644 applications/solvers/dfLowMachFoam/YEqn_GPU.H
 create mode 100644 applications/solvers/dfLowMachFoam/pEqn.H
 create mode 100644 applications/solvers/dfLowMachFoam/rhoEqn_GPU.H
diff --git a/applications/solvers/dfLowMachFoam/EEqn_GPU.H b/applications/solvers/dfLowMachFoam/EEqn_GPU.H
new file mode 100644
index 000000000..ad118b3a9
--- /dev/null
+++ b/applications/solvers/dfLowMachFoam/EEqn_GPU.H
@@ -0,0 +1,101 @@
+{
+    volScalarField& he = thermo.he();
+    double *h_he = dfDataBase.getFieldPointer("he", location::cpu, position::internal);
+    double *h_boundary_he = dfDataBase.getFieldPointer("he", location::cpu, position::boundary);
+
+    EEqn_GPU.process();
+    EEqn_GPU.sync();
+    // EEqn_GPU.postProcess(h_he, h_boundary_he);
+
+    // copy h_he to he(cpu)
+    // memcpy(&he[0], h_he, dfDataBase.cell_value_bytes);
+
+    //DEBUG_TRACE;
+    //he.correctBoundaryConditions();
+    //DEBUG_TRACE;
+
+#if defined DEBUG_
+    fvScalarMatrix EEqn
+    (
+
+        fvm::ddt(rho, he) + mvConvection->fvmDiv(phi, he)
+    +   fvc::ddt(rho, K) + fvc::div(phi, K)
+    -   dpdt
+    ==
+        (
+            turbName == "laminar"
+            ?
+            (
+                fvm::laplacian(turbulence->alpha(), he)
+            -   diffAlphaD
+            +   fvc::div(hDiffCorrFlux)
+            )
+            :
+            (
+                fvm::laplacian(turbulence->alphaEff(), he)
+            )
+        )
+    );
+    // EEqn.relax();
+    EEqn.solve("ha");
+    // checkResult
+    // TODO: for temp, now we compare ldu, finally we compare csr
+    std::vector<double> h_internal_coeffs(dfDataBase.num_boundary_surfaces);
+    std::vector<double> h_boundary_coeffs(dfDataBase.num_boundary_surfaces);
+
+    offset = 0;
+    forAll(he.boundaryField(), patchi)
+    {
+        const fvPatchScalarField& patchHe = he.boundaryField()[patchi];
+        int patchSize = patchHe.size();
+        const double* internal_coeff_ptr = &EEqn.internalCoeffs()[patchi][0];
+        const double* boundary_coeff_ptr = &EEqn.boundaryCoeffs()[patchi][0];
+        if (patchHe.type() == "processor"
+            || patchHe.type() == "processorCyclic") {
+            memcpy(h_internal_coeffs.data() + offset, internal_coeff_ptr, patchSize * sizeof(double));
+            memset(h_internal_coeffs.data() + offset + patchSize, 0, patchSize * sizeof(double));
+            memcpy(h_boundary_coeffs.data() + offset, boundary_coeff_ptr, patchSize * sizeof(double));
+            memset(h_boundary_coeffs.data() + offset + patchSize, 0, patchSize * sizeof(double));
+            offset += patchSize * 2;
+        } else {
+            memcpy(h_internal_coeffs.data() + offset, internal_coeff_ptr, patchSize * sizeof(double));
+            memcpy(h_boundary_coeffs.data() + offset, boundary_coeff_ptr, patchSize * sizeof(double));
+            offset += patchSize;
+        }
+    }
+
+    double *h_boundary_he_tmp = new double[dfDataBase.num_boundary_surfaces];
+    offset = 0;
+    forAll(he.boundaryField(), patchi)
+    {
+        const fvPatchScalarField& patchHe = he.boundaryField()[patchi];
+        int patchSize = patchHe.size();
+        if (patchHe.type() == "processor"
+            || patchHe.type() == "processorCyclic") {
+            const scalarField& patchHeInternal = dynamic_cast<const processorFvPatchField<scalar>&>(patchHe).patchInternalField()();
+            memcpy(h_boundary_he_tmp + offset, &patchHe[0], patchSize * sizeof(double));
+            memcpy(h_boundary_he_tmp + offset + patchSize, &patchHeInternal[0], patchSize * sizeof(double));
+            offset += patchSize * 2;
+        } else {
+            memcpy(h_boundary_he_tmp + offset, &patchHe[0], patchSize * sizeof(double));
+            offset += patchSize;
+        }
+    }
+
+    bool printFlag = false;
+    int rank = -1;
+    if (mpi_init_flag) {
+        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    }
+    if (!mpi_init_flag || rank == 0) {
+		// DEBUG_TRACE;
+		// EEqn_GPU.compareResult(&EEqn.lower()[0], &EEqn.upper()[0], &EEqn.diag()[0], &EEqn.source()[0],
+		// 		h_internal_coeffs.data(), h_boundary_coeffs.data(), printFlag);
+		// DEBUG_TRACE;
+        // EEqn_GPU.compareHe(&he[0], h_boundary_he_tmp, printFlag);
+    }
+
+    delete h_boundary_he_tmp;
+
+#endif
+}
diff --git a/applications/solvers/dfLowMachFoam/UEqn_GPU.H b/applications/solvers/dfLowMachFoam/UEqn_GPU.H
new file mode 100644
index 000000000..8859f41ba
--- /dev/null
+++ b/applications/solvers/dfLowMachFoam/UEqn_GPU.H
@@ -0,0 +1,122 @@
+// Solve the Momentum equation
+#if defined DEBUG_
+    // run CPU, for temp
+    TICK_START;
+    tmp<fvVectorMatrix> tUEqn
+    (
+        fvm::ddt(rho, U) 
+        + 
+        fvm::div(phi, U)
+        +  
+        turbulence->divDevRhoReff(U)
+    );
+    fvVectorMatrix& UEqn = tUEqn.ref();
+    TICK_STOP(CPU assembly time);
+
+    volTensorField gradU = fvc::grad(U);
+
+    double *h_boundary_gradU = new double[dfDataBase.num_boundary_surfaces * 9];
+    offset = 0;
+    forAll(U.boundaryField(), patchi)
+    {
+        const fvPatchTensorField& patchGradU = gradU.boundaryField()[patchi];
+        int patchsize = patchGradU.size();
+        if (patchGradU.type() == "processor"
+                || patchGradU.type() == "processorCyclic") {
+            // print info
+            if (dynamic_cast<const processorFvPatchField<tensor>&>(patchGradU).doTransform()) {
+                Info << "gradU transform = true" << endl;
+            } else {
+                Info << "gradU transform = false" << endl;
+            }
+            Info << "rank = " << dynamic_cast<const processorFvPatchField<tensor>&>(patchGradU).rank() << endl;
+
+            memcpy(h_boundary_gradU + 9*offset, &patchGradU[0][0], patchsize * 9 * sizeof(double));
+            tensorField patchGradUInternal = 
+                    dynamic_cast<const processorFvPatchField<tensor>&>(patchGradU).patchInternalField()();
+            memcpy(h_boundary_gradU + 9*offset + patchsize * 9, &patchGradUInternal[0][0], patchsize * 9 * sizeof(double));
+            offset += patchsize * 2;
+        } else {
+            memcpy(h_boundary_gradU + 9*offset, &patchGradU[0][0], patchsize * 9 * sizeof(double));
+            offset += patchsize;
+        }
+    }
+#endif
+
+    // process
+    TICK_START;
+    UEqn_GPU.process();
+    UEqn_GPU.sync();
+    TICK_STOP(GPU process time);
+
+    // postProcess
+    // TICK_START;
+    // UEqn_GPU.postProcess(h_u);
+    // memcpy(&U[0][0], h_u, dfDataBase.cell_value_vec_bytes);
+    // U.correctBoundaryConditions();
+    // K = 0.5*magSqr(U);
+    // DEBUG_TRACE;
+    // TICK_STOP(post process time);
+
+#if defined DEBUG_
+    // UEqn.relax();
+    TICK_START;
+    solve(UEqn == -fvc::grad(p));
+    K.oldTime();
+    K = 0.5*magSqr(U);
+    TICK_STOP(CPU solve time);
+    // checkResult
+    // TODO: for temp, now we compare ldu, finally we compare csr
+    std::vector<double> h_internal_coeffs(dfDataBase.num_boundary_surfaces * 3);
+    std::vector<double> h_boundary_coeffs(dfDataBase.num_boundary_surfaces * 3);
+
+    offset = 0;
+    for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++)
+    {
+        const fvPatchVectorField& patchU = U.boundaryField()[patchi];
+        int patchsize = dfDataBase.patch_size[patchi];
+        const double* internal_coeff_ptr = &UEqn.internalCoeffs()[patchi][0][0];
+        const double* boundary_coeff_ptr = &UEqn.boundaryCoeffs()[patchi][0][0];
+        memcpy(h_internal_coeffs.data() + offset * 3, internal_coeff_ptr, patchsize * 3 * sizeof(double));
+        memcpy(h_boundary_coeffs.data() + offset * 3, boundary_coeff_ptr, patchsize * 3 * sizeof(double));
+        if (patchU.type() == "processor" || patchU.type() == "processorCyclic") offset += 2 * patchsize;
+        else offset += patchsize;
+    }
+
+    double *h_boundary_u_tmp = new double[dfDataBase.num_boundary_surfaces * 3];
+    offset = 0;
+    forAll(U.boundaryField(), patchi)
+    {
+        const fvPatchVectorField& patchU = U.boundaryField()[patchi];
+        int patchsize = dfDataBase.patch_size[patchi];
+
+        if (patchU.type() == "processor"
+            || patchU.type() == "processorCyclic") {
+            memcpy(h_boundary_u_tmp + 3*offset, &patchU[0][0], 3*patchsize * sizeof(double));
+            vectorField patchUInternal = 
+                    dynamic_cast<const processorFvPatchField<vector>&>(patchU).patchInternalField()();
+            memcpy(h_boundary_u_tmp + 3*offset + 3*patchsize, &patchUInternal[0][0], 3*patchsize * sizeof(double));
+            offset += 2 * patchsize;
+        } else {
+            memcpy(h_boundary_u_tmp + 3*offset, &patchU[0][0], 3*patchsize * sizeof(double));
+            offset += patchsize;
+        }
+    }
+
+    bool printFlag = false;
+
+    int rank = -1;
+    if (mpi_init_flag) {
+        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    }
+    
+    if (!mpi_init_flag || rank == 0) {
+        // UEqn_GPU.compareResult(&UEqn.lower()[0], &UEqn.upper()[0], &UEqn.diag()[0], &UEqn.source()[0][0],
+        //     h_internal_coeffs.data(), h_boundary_coeffs.data(), 
+        //     // &gradU[0][0], h_boundary_gradU,
+        //     printFlag);
+        // UEqn_GPU.compareU(&U[0][0], h_boundary_u_tmp, printFlag);
+    }
+    DEBUG_TRACE;
+#endif
+
diff --git a/applications/solvers/dfLowMachFoam/YEqn_GPU.H b/applications/solvers/dfLowMachFoam/YEqn_GPU.H
new file mode 100644
index 000000000..85d1bb779
--- /dev/null
+++ b/applications/solvers/dfLowMachFoam/YEqn_GPU.H
@@ -0,0 +1,247 @@
+#if defined DEBUG_
+    hDiffCorrFlux = Zero;
+    diffAlphaD = Zero;
+    sumYDiffError = Zero;
+
+    tmp<fv::convectionScheme<scalar>> mvConvection
+    (
+        fv::convectionScheme<scalar>::New
+        (
+            mesh,
+            fields,
+            phi,
+            mesh.divScheme("div(phi,Yi_h)")
+        )
+    );
+    // auto& mgcs = dynamic_cast<fv::multivariateGaussConvectionScheme<scalar>&>(mvConvection.ref());
+    // tmp<surfaceInterpolationScheme<scalar>> tinterpScheme_ = mgcs.interpolationScheme()()(Y[0]);
+    // tmp<surfaceScalarField> tweights = tinterpScheme_().weights(Y[0]);
+    // const surfaceScalarField& weights = tweights();
+    // Info << "CPU weights\n" << weights << endl;
+
+    // auto& limitedScheme_ = dynamic_cast<const limitedSurfaceInterpolationScheme<scalar>&>(tinterpScheme_());
+    // Info << "CPU limiter\n" << limitedScheme_.limiter(Y[0]) << endl;
+
+    forAll(Y, i)
+    {
+        sumYDiffError += chemistry->rhoD(i)*fvc::grad(Y[i]);
+    }
+    const surfaceScalarField phiUc = linearInterpolate(sumYDiffError) & mesh.Sf();
+#endif
+
+//MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM);
+label flag_mpi_init;
+MPI_Initialized(&flag_mpi_init);
+if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM);
+
+{
+#if defined DEBUG_
+    // run CPU
+    volScalarField Yt(0.0*Y[0]);
+    int speciesIndex = 0;
+    forAll(Y, i)
+    {
+        volScalarField& Yi = Y[i];
+        hDiffCorrFlux += chemistry->hai(i)*(chemistry->rhoD(i)*fvc::grad(Yi) - Yi*sumYDiffError);
+        diffAlphaD += fvc::laplacian(thermo.alpha()*chemistry->hai(i), Yi);
+        if (i != inertIndex)
+        {
+            start1 = std::clock();
+            tmp<volScalarField> DEff = chemistry->rhoD(i) + turbulence->mut()/Sct;
+
+            #ifdef ODE_GPU_SOLVER
+            fvScalarMatrix YiEqn
+            (
+                fvm::ddt(rho, Yi)
+            +
+                (
+                    turbName == "laminar"
+                    ?  (mvConvection->fvmDiv(phi, Yi) + mvConvection->fvmDiv(phiUc, Yi))
+                    :   mvConvection->fvmDiv(phi, Yi)
+                )
+            ==
+                (
+                    splitting
+                    ?   fvm::laplacian(DEff(), Yi)
+                    :  (fvm::laplacian(DEff(), Yi) + RR_GPU[i])
+                    )
+            );
+            #else
+            fvScalarMatrix YiEqn
+            (
+                fvm::ddt(rho, Yi)
+            +
+                (
+                    turbName == "laminar"
+                    ?  (mvConvection->fvmDiv(phi, Yi) + mvConvection->fvmDiv(phiUc, Yi))
+                    :   mvConvection->fvmDiv(phi, Yi)
+                )
+            ==
+                (
+                    splitting
+                    ?   fvm::laplacian(DEff(), Yi)
+                    :  (fvm::laplacian(DEff(), Yi) + combustion->R(Yi))
+                    )
+            );
+            #endif
+            
+            end1 = std::clock();
+            time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
+            // YiEqn.relax();
+
+            start1 = std::clock();
+            YiEqn.solve("Yi");
+            end1 = std::clock();
+            time_monitor_YEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
+
+            Yi.max(0.0);
+            Yt += Yi;
+            ++speciesIndex;
+        }
+    }
+    Y[inertIndex] = scalar(1) - Yt;
+    Y[inertIndex].max(0.0);
+
+    int specie_index = 0;
+
+    // should compute grad_yi before YiEqn.solve()
+    const volVectorField grad_yi = fvc::grad(Y[specie_index]);
+
+    tmp<volScalarField> DEff = chemistry->rhoD(specie_index) + turbulence->mut()/Sct;
+    fvScalarMatrix YiEqn
+        (
+         fvm::ddt(rho, Y[specie_index])
+         + mvConvection->fvmDiv(phi, Y[specie_index])
+         + mvConvection->fvmDiv(phiUc, Y[specie_index])
+         ==
+         fvm::laplacian(DEff(), Y[specie_index])
+        );
+    // YiEqn.relax();
+    // YiEqn.solve("Yi");
+    // Y[specie_index].max(0.0);
+#endif
+
+    // process
+    YEqn_GPU.process();
+    YEqn_GPU.sync();
+
+#if defined DEBUG_
+    std::vector<double> h_boundary_diffAlphaD;
+    std::vector<double> h_boundary_grad_yi;
+    std::vector<double> h_boundary_sumYDiffError;
+    std::vector<double> h_boundary_hDiffCorrFlux;
+    std::vector<double> h_boundary_phiUc;
+    h_boundary_diffAlphaD.resize(dfDataBase.num_boundary_surfaces);
+    h_boundary_grad_yi.resize(dfDataBase.num_boundary_surfaces * 3);
+    h_boundary_sumYDiffError.resize(dfDataBase.num_boundary_surfaces * 3);
+    h_boundary_hDiffCorrFlux.resize(dfDataBase.num_boundary_surfaces * 3);
+    h_boundary_phiUc.resize(dfDataBase.num_boundary_surfaces);
+    offset = 0;
+    forAll(diffAlphaD.boundaryField(), patchi)
+    {
+        //const scalarField& patchdiffAlphaD = diffAlphaD.boundaryField()[patchi];
+        const fvPatchScalarField& patchdiffAlphaD = diffAlphaD.boundaryField()[patchi];
+        const fvPatchVectorField& patchgradyi = grad_yi.boundaryField()[patchi];
+        const fvPatchVectorField& patchsumYDiffError = sumYDiffError.boundaryField()[patchi];
+        const fvPatchVectorField& patchhDiffCorrFlux = hDiffCorrFlux.boundaryField()[patchi];
+        const fvsPatchScalarField& patchphiUc = phiUc.boundaryField()[patchi];
+        int patchSize = patchdiffAlphaD.size();
+        if (patchdiffAlphaD.type() == "processor"
+            || patchdiffAlphaD.type() == "processorCyclic") {
+            scalarField patchdiffAlphaDInternal = dynamic_cast<const processorFvPatchField<scalar>&>(patchdiffAlphaD).patchInternalField()();
+            vectorField patchgradyiInternal = dynamic_cast<const processorFvPatchField<vector>&>(patchgradyi).patchInternalField()();
+            vectorField patchsumYDiffErrorInternal = dynamic_cast<const processorFvPatchField<vector>&>(patchsumYDiffError).patchInternalField()();
+            vectorField patchhDiffCorrFluxInternal = dynamic_cast<const processorFvPatchField<vector>&>(patchhDiffCorrFlux).patchInternalField()();
+            memcpy(h_boundary_diffAlphaD.data() + offset, &patchdiffAlphaD[0], patchSize*sizeof(double));
+            memcpy(h_boundary_diffAlphaD.data() + offset + patchSize, &patchdiffAlphaDInternal[0], patchSize*sizeof(double));
+            memcpy(h_boundary_grad_yi.data() + offset * 3, &patchgradyi[0][0], patchSize * 3 *sizeof(double));
+            memcpy(h_boundary_grad_yi.data() + (offset + patchSize) * 3, &patchgradyiInternal[0][0], patchSize * 3 *sizeof(double));
+            memcpy(h_boundary_sumYDiffError.data() + offset * 3, &patchsumYDiffError[0][0], patchSize * 3 *sizeof(double));
+            memcpy(h_boundary_sumYDiffError.data() + (offset + patchSize) * 3, &patchsumYDiffErrorInternal[0][0], patchSize * 3 *sizeof(double));
+            memcpy(h_boundary_hDiffCorrFlux.data() + offset * 3, &patchhDiffCorrFlux[0][0], patchSize * 3 *sizeof(double));
+            memcpy(h_boundary_hDiffCorrFlux.data() + (offset + patchSize) * 3, &patchhDiffCorrFluxInternal[0][0], patchSize * 3 *sizeof(double));
+            memcpy(h_boundary_phiUc.data() + offset, &patchphiUc[0], patchSize*sizeof(double));
+            memcpy(h_boundary_phiUc.data() + offset, &patchphiUc[0], patchSize*sizeof(double));
+            offset += patchSize * 2;
+        } else {
+            memcpy(h_boundary_diffAlphaD.data() + offset, &patchdiffAlphaD[0], patchSize*sizeof(double));
+            memcpy(h_boundary_grad_yi.data() + offset * 3, &patchgradyi[0][0], patchSize * 3 *sizeof(double));
+            memcpy(h_boundary_sumYDiffError.data() + offset * 3, &patchsumYDiffError[0][0], patchSize * 3 *sizeof(double));
+            memcpy(h_boundary_hDiffCorrFlux.data() + offset * 3, &patchhDiffCorrFlux[0][0], patchSize * 3 *sizeof(double));
+            memcpy(h_boundary_phiUc.data() + offset, &patchphiUc[0], patchSize*sizeof(double));
+            offset += patchSize;
+        }
+    }
+    DEBUG_TRACE;
+    // YEqn_GPU.comparediffAlphaD(&diffAlphaD[0], h_boundary_diffAlphaD.data(), false);
+    // YEqn_GPU.comparegradyi(&grad_yi[0][0], h_boundary_grad_yi.data(), specie_index, false);
+    // YEqn_GPU.comparesumYDiffError(&sumYDiffError[0][0], h_boundary_sumYDiffError.data(), false);
+    // YEqn_GPU.comparehDiffCorrFlux(&hDiffCorrFlux[0][0], h_boundary_hDiffCorrFlux.data(), false);
+    // YEqn_GPU.comparephiUc(&phiUc[0], h_boundary_phiUc.data(), false);
+    DEBUG_TRACE;
+
+    // checkResult
+    // TODO: for temp, now we compare ldu, finally we compare csr
+    std::vector<double> yeqn_h_internal_coeffs(dfDataBase.num_boundary_surfaces);
+    std::vector<double> yeqn_h_boundary_coeffs(dfDataBase.num_boundary_surfaces);
+
+    offset = 0;
+    forAll(Y[specie_index].boundaryField(), patchi)
+    {
+        const fvPatchScalarField& patchYi = Y[specie_index].boundaryField()[patchi];
+        int patchsize = patchYi.size();
+        const double* internal_coeff_ptr = &YiEqn.internalCoeffs()[patchi][0];
+        const double* boundary_coeff_ptr = &YiEqn.boundaryCoeffs()[patchi][0];
+        if (patchYi.type() == "processor"
+            || patchYi.type() == "processorCyclic") {
+            memcpy(yeqn_h_internal_coeffs.data() + offset, internal_coeff_ptr, patchsize * sizeof(double));
+            memset(yeqn_h_internal_coeffs.data() + offset + patchsize, 0, patchsize * sizeof(double));
+            memcpy(yeqn_h_boundary_coeffs.data() + offset, boundary_coeff_ptr, patchsize * sizeof(double));
+            memset(yeqn_h_boundary_coeffs.data() + offset + patchsize, 0, patchsize * sizeof(double));
+            offset += patchsize * 2;
+        } else {
+            memcpy(yeqn_h_internal_coeffs.data() + offset, internal_coeff_ptr, patchsize * sizeof(double));
+            memcpy(yeqn_h_boundary_coeffs.data() + offset, boundary_coeff_ptr, patchsize * sizeof(double));
+            offset += patchsize;
+        }
+    }
+    // NOTE: ldu and yi can't be compared at the same time
+    // to compare ldu data, you should open both DEBUG_ and DEBUG_CHECK_LDU in src_gpu
+    // to compare yi, you should only open DEBUG_ in src_gpu.
+    // Besides, if you compare ldu data, be patient to keep specie_index in YEqn.H and dfYEqn.cu the same.
+    //DEBUG_TRACE;
+    bool printFlag = false;
+    int rank = -1;
+    if (mpi_init_flag) {
+        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    }
+    if (!mpi_init_flag || rank == 0) {
+        // YEqn_GPU.compareResult(&YiEqn.lower()[0], &YiEqn.upper()[0], &YiEqn.diag()[0], &YiEqn.source()[0],
+        //         yeqn_h_internal_coeffs.data(), yeqn_h_boundary_coeffs.data(), printFlag);
+    }
+
+    DEBUG_TRACE;
+    // YEqn_GPU.compareYi(&Y[specie_index][0], specie_index, false);
+    // DEBUG_TRACE;
+#endif
+
+    // postProcess
+    double *h_y = dfDataBase.getFieldPointer("y", location::cpu, position::internal);
+    double *h_boundary_y = dfDataBase.getFieldPointer("y", location::cpu, position::boundary);
+    // YEqn_GPU.postProcess(h_y, h_boundary_y);
+    DEBUG_TRACE;
+
+    // copy h_y to Y(cpu)
+    // offset = 0;
+    // forAll(Y, i)
+    // {
+    //     volScalarField& Yi = Y[i];
+    //     memcpy(&Yi[0], h_y + offset, Yi.size() * sizeof(double));
+    //     offset += Yi.size();
+    //     Yi.correctBoundaryConditions();
+    // }
+    DEBUG_TRACE;
+
+    fflush(stderr);
+   
+}
\ No newline at end of file
diff --git a/applications/solvers/dfLowMachFoam/pEqn.H b/applications/solvers/dfLowMachFoam/pEqn.H
new file mode 100644
index 000000000..aa379dde7
--- /dev/null
+++ b/applications/solvers/dfLowMachFoam/pEqn.H
@@ -0,0 +1,129 @@
+if (!pimple.simpleRho())
+{
+    rho = thermo.rho();
+}
+
+// Thermodynamic density needs to be updated by psi*d(p) after the
+// pressure solution
+const volScalarField psip0(psi*p);
+
+volScalarField rAU(1.0/UEqn.A());
+surfaceScalarField rhorAUf("rhorAUf", fvc::interpolate(rho*rAU));
+volVectorField HbyA(constrainHbyA(rAU*UEqn.H(), U, p));
+
+if (pimple.nCorrPiso() <= 1)
+{
+    tUEqn.clear();
+}
+
+surfaceScalarField phiHbyA
+(
+    "phiHbyA",
+    fvc::interpolate(rho)*fvc::flux(HbyA)
+  + rhorAUf*fvc::ddtCorr(rho, U, phi, rhoUf)
+);
+
+fvc::makeRelative(phiHbyA, rho, U);
+
+label flag_mpi_init;
+MPI_Initialized(&flag_mpi_init);
+if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM);
+
+// Update the pressure BCs to ensure flux consistency
+constrainPressure(p, rho, U, phiHbyA, rhorAUf);
+//start = std::clock();
+if (pimple.transonic())
+{
+    surfaceScalarField phid
+    (
+        "phid",
+        (fvc::interpolate(psi)/fvc::interpolate(rho))*phiHbyA
+    );
+
+    phiHbyA -= fvc::interpolate(psi*p)*phiHbyA/fvc::interpolate(rho);
+
+    fvScalarMatrix pDDtEqn
+    (
+        fvc::ddt(rho) + psi*correction(fvm::ddt(p))
+      + fvc::div(phiHbyA) + fvm::div(phid, p)
+    );
+
+    while (pimple.correctNonOrthogonal())
+    {
+        fvScalarMatrix pEqn(pDDtEqn - fvm::laplacian(rhorAUf, p));
+
+        // Relax the pressure equation to ensure diagonal-dominance
+        pEqn.relax();
+
+        pEqn.solve();
+
+        if (pimple.finalNonOrthogonalIter())
+        {
+            phi = phiHbyA + pEqn.flux();
+        }
+    }
+}
+else
+{
+    fvScalarMatrix pDDtEqn
+    (
+        fvc::ddt(rho) + 
+        psi*correction(fvm::ddt(p))
+        + fvc::div(phiHbyA)
+    );
+
+    while (pimple.correctNonOrthogonal())
+    {
+        fvScalarMatrix pEqn(pDDtEqn - fvm::laplacian(rhorAUf, p));
+        
+ 
+        pEqn.solve();
+
+        
+        if (pimple.finalNonOrthogonalIter())
+        {
+            phi = phiHbyA + pEqn.flux();
+        }
+    }
+
+}
+
+
+bool limitedp = pressureControl.limit(p);
+
+// Thermodynamic density update
+thermo.correctRho(psi*p - psip0);
+
+
+if (limitedp)
+{
+    rho = thermo.rho();
+}
+
+#include "rhoEqn.H"
+#include "compressibleContinuityErrs.H"
+
+// Explicitly relax pressure for momentum corrector
+// p.relax();
+
+U = HbyA - rAU*fvc::grad(p);
+U.correctBoundaryConditions();
+K = 0.5*magSqr(U);
+
+if (pimple.simpleRho())
+{
+    rho = thermo.rho();
+}
+
+// Correct rhoUf if the mesh is moving
+fvc::correctRhoUf(rhoUf, rho, U, phi);
+
+if (thermo.dpdt())
+{
+    dpdt = fvc::ddt(p);
+
+    if (mesh.moving())
+    {
+        dpdt -= fvc::div(fvc::meshPhi(rho, U), p);
+    }
+}
\ No newline at end of file
diff --git a/applications/solvers/dfLowMachFoam/rhoEqn_GPU.H b/applications/solvers/dfLowMachFoam/rhoEqn_GPU.H
new file mode 100644
index 000000000..4aa794f8a
--- /dev/null
+++ b/applications/solvers/dfLowMachFoam/rhoEqn_GPU.H
@@ -0,0 +1,87 @@
+/*---------------------------------------------------------------------------*\
+  =========                 |
+  \\      /  F ield         | OpenFOAM: The Open Source CFD Toolbox
+   \\    /   O peration     | Website:  https://openfoam.org
+    \\  /    A nd           | Copyright (C) 2011-2018 OpenFOAM Foundation
+     \\/     M anipulation  |
+-------------------------------------------------------------------------------
+License
+    This file is part of OpenFOAM.
+
+    OpenFOAM is free software: you can redistribute it and/or modify it
+    under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    OpenFOAM is distributed in the hope that it will be useful, but WITHOUT
+    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+    FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+    for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with OpenFOAM.  If not, see <http://www.gnu.org/licenses/>.
+
+Global
+    rhoEqn
+
+Description
+    Solve the continuity for density.
+
+\*---------------------------------------------------------------------------*/
+    double *h_rho = dfDataBase.getFieldPointer("rho", location::cpu, position::internal);
+    double *h_boundary_rho = dfDataBase.getFieldPointer("rho", location::cpu, position::boundary);
+
+    TICK_START;
+    rhoEqn_GPU.process();
+    rhoEqn_GPU.sync();
+    TICK_STOP(GPU process time);
+
+    // rhoEqn_GPU.postProcess(h_rho);
+    // rho.oldTime();
+    // memcpy(&rho[0], h_rho, dfDataBase.cell_value_bytes);
+    // rho.correctBoundaryConditions();
+
+
+#ifdef DEBUG_
+    // checkValue
+    TICK_START;
+    fvScalarMatrix rhoEqn
+    (
+        fvm::ddt(rho)
+      + fvc::div(phi)
+    );
+    rhoEqn.solve();
+    TICK_STOP(CPU process time);
+
+    int rank = -1;
+    if (mpi_init_flag) {
+        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    }
+
+    // if (!mpi_init_flag || rank == 0) {
+    //     rhoEqn_GPU.compareResult(&rhoEqn.diag()[0], &rhoEqn.source()[0], false);
+    // }
+
+    offset = 0;
+    forAll(rho.boundaryField(), patchi)
+    {
+        const fvPatchScalarField& patchRho = rho.boundaryField()[patchi];
+        int patchsize = patchRho.size();
+        if (patchRho.type() == "processor"
+            || patchRho.type() == "processorCyclic") {
+            memcpy(h_boundary_rho + offset, &patchRho[0], patchsize * sizeof(double));
+            scalarField patchRhoInternal = 
+                    dynamic_cast<const processorFvPatchField<scalar>&>(patchRho).patchInternalField()();
+            memcpy(h_boundary_rho + offset + patchsize, &patchRhoInternal[0], patchsize * sizeof(double));
+            offset += patchsize * 2;
+        } else {
+            memcpy(h_boundary_rho + offset, &patchRho[0], patchsize * sizeof(double));
+            offset += patchsize;
+        }
+    }
+    // if (!mpi_init_flag || rank == 0) {
+    //     rhoEqn_GPU.compareRho(&rho[0], h_boundary_rho, false);
+    // }
+#endif
+
+// ************************************************************************* //