diff --git a/applications/solvers/dfLowMachFoam/EEqn.H b/applications/solvers/dfLowMachFoam/EEqn.H
index aac84fab9..46f59b798 100644
--- a/applications/solvers/dfLowMachFoam/EEqn.H
+++ b/applications/solvers/dfLowMachFoam/EEqn.H
@@ -1,36 +1,137 @@
 {
     volScalarField& he = thermo.he();
-    
+#if defined GPUSolverNew_
+    double *h_he = dfDataBase.getFieldPointer("he", location::cpu, position::internal);
+    double *h_boundary_he = dfDataBase.getFieldPointer("he", location::cpu, position::boundary);
+
+    EEqn_GPU.process();
+    EEqn_GPU.sync();
+    // EEqn_GPU.postProcess(h_he, h_boundary_he);
+
+    // copy h_he to he(cpu)
+    // memcpy(&he[0], h_he, dfDataBase.cell_value_bytes);
+
+    //DEBUG_TRACE;
+    //he.correctBoundaryConditions();
+    //DEBUG_TRACE;
+
+#if defined DEBUG_
+    fvScalarMatrix EEqn
+    (
+
+        fvm::ddt(rho, he) + mvConvection->fvmDiv(phi, he)
+    +   fvc::ddt(rho, K) + fvc::div(phi, K)
+    -   dpdt
+    ==
+        (
+            turbName == "laminar"
+            ?
+            (
+                fvm::laplacian(turbulence->alpha(), he)
+            -   diffAlphaD
+            +   fvc::div(hDiffCorrFlux)
+            )
+            :
+            (
+                fvm::laplacian(turbulence->alphaEff(), he)
+            )
+        )
+    );
+    // EEqn.relax();
+    EEqn.solve("ha");
+    // checkResult
+    // TODO: for temp, now we compare ldu, finally we compare csr
+    std::vector<double> h_internal_coeffs(dfDataBase.num_boundary_surfaces);
+    std::vector<double> h_boundary_coeffs(dfDataBase.num_boundary_surfaces);
+
+    offset = 0;
+    forAll(he.boundaryField(), patchi)
+    {
+        const fvPatchScalarField& patchHe = he.boundaryField()[patchi];
+        int patchSize = patchHe.size();
+        const double* internal_coeff_ptr = &EEqn.internalCoeffs()[patchi][0];
+        const double* boundary_coeff_ptr = &EEqn.boundaryCoeffs()[patchi][0];
+        if (patchHe.type() == "processor"
+            || patchHe.type() == "processorCyclic") {
+            memcpy(h_internal_coeffs.data() + offset, internal_coeff_ptr, patchSize * sizeof(double));
+            memset(h_internal_coeffs.data() + offset + patchSize, 0, patchSize * sizeof(double));
+            memcpy(h_boundary_coeffs.data() + offset, boundary_coeff_ptr, patchSize * sizeof(double));
+            memset(h_boundary_coeffs.data() + offset + patchSize, 0, patchSize * sizeof(double));
+            offset += patchSize * 2;
+        } else {
+            memcpy(h_internal_coeffs.data() + offset, internal_coeff_ptr, patchSize * sizeof(double));
+            memcpy(h_boundary_coeffs.data() + offset, boundary_coeff_ptr, patchSize * sizeof(double));
+            offset += patchSize;
+        }
+    }
+
+    // Flatten the boundary values of he patch by patch (DEBUG_ comparison data for the GPU result).
+    double *h_boundary_he_tmp = new double[dfDataBase.num_boundary_surfaces];
+    offset = 0;
+    forAll(he.boundaryField(), patchi)
+    {
+        const fvPatchScalarField& patchHe = he.boundaryField()[patchi];
+        int patchSize = patchHe.size();
+        if (patchHe.type() == "processor" || patchHe.type() == "processorCyclic") {
+            // Copy by value: patchInternalField() returns a temporary tmp<>, so a reference to its content would dangle.
+            const scalarField patchHeInternal = dynamic_cast<const processorFvPatchField<scalar>&>(patchHe).patchInternalField()();
+            memcpy(h_boundary_he_tmp + offset, &patchHe[0], patchSize * sizeof(double));
+            memcpy(h_boundary_he_tmp + offset + patchSize, &patchHeInternal[0], patchSize * sizeof(double));
+            offset += patchSize * 2; // processor patches occupy two slots: patch values, then patchInternalField() values
+        } else {
+            memcpy(h_boundary_he_tmp + offset, &patchHe[0], patchSize * sizeof(double));
+            offset += patchSize;
+        }
+    }
+
+    bool printFlag = false;
+    int rank = -1;
+    if (mpi_init_flag) {
+        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    }
+    if (!mpi_init_flag || rank == 0) {
+        // DEBUG_TRACE;
+        // EEqn_GPU.compareResult(&EEqn.lower()[0], &EEqn.upper()[0], &EEqn.diag()[0], &EEqn.source()[0],
+        //         h_internal_coeffs.data(), h_boundary_coeffs.data(), printFlag);
+        // DEBUG_TRACE;
+        // EEqn_GPU.compareHe(&he[0], h_boundary_he_tmp, printFlag);
+    }
+    delete[] h_boundary_he_tmp; // array form must match the new[] above
+
+#endif
+
+#else
     start1 = std::clock();
     fvScalarMatrix EEqn
     (
 
-            fvm::ddt(rho, he) + mvConvection->fvmDiv(phi, he)
-        +   fvc::ddt(rho, K) + fvc::div(phi, K)
-        -   dpdt
-        ==
+        fvm::ddt(rho, he) + mvConvection->fvmDiv(phi, he)
+    +   fvc::ddt(rho, K) + fvc::div(phi, K)
+    -   dpdt
+    ==
+        (
+            turbName == "laminar"
+            ?
+            (
+                fvm::laplacian(turbulence->alpha(), he)
+            -   diffAlphaD
+            +   fvc::div(hDiffCorrFlux)
+            )
+            :
             (
-                turbName == "laminar"
-                ?
-                (
-                    fvm::laplacian(turbulence->alpha(), he)
-                -   diffAlphaD
-                +   fvc::div(hDiffCorrFlux)
-                )
-                :
-                (
-                    fvm::laplacian(turbulence->alphaEff(), he)
-                )
+                fvm::laplacian(turbulence->alphaEff(), he)
             )
-        );
+        )
+    );
     end1 = std::clock();
     time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
     time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
 
-    EEqn.relax();
+    // EEqn.relax();
     start1 = std::clock();
     EEqn.solve("ha");
     end1 = std::clock();
     time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
     time_monitor_EEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
+#endif
 }
diff --git a/applications/solvers/dfLowMachFoam/Make/options b/applications/solvers/dfLowMachFoam/Make/options
index 67c743453..668a3133a 100644
--- a/applications/solvers/dfLowMachFoam/Make/options
+++ b/applications/solvers/dfLowMachFoam/Make/options
@@ -9,7 +9,6 @@ EXE_INC = -std=c++14 \
     $(PFLAGS) $(PINC) \
     $(if $(LIBTORCH_ROOT),-DUSE_LIBTORCH,) \
     $(if $(PYTHON_INC_DIR),-DUSE_PYTORCH,) \
-    $(if $(AMGX_DIR),-DGPUSolver_,) \
     -I$(LIB_SRC)/transportModels/compressible/lnInclude \
     -I$(LIB_SRC)/thermophysicalModels/basic/lnInclude \
     -I$(LIB_SRC)/TurbulenceModels/turbulenceModels/lnInclude \
@@ -28,7 +27,7 @@ EXE_INC = -std=c++14 \
     $(if $(LIBTORCH_ROOT),-I$(LIBTORCH_ROOT)/include/torch/csrc/api/include,) \
     $(PYTHON_INC_DIR) \
     $(if $(AMGX_DIR), -I$(DF_ROOT)/src_gpu,) \
-    $(if $(AMGX_DIR), -I/usr/local/cuda-11.6/include,) \
+    $(if $(AMGX_DIR), -I/usr/local/cuda/include,) \
     $(if $(AMGX_DIR), -I$(AMGX_DIR)/include,)
 
 EXE_LIBS = \
@@ -50,6 +49,7 @@ EXE_LIBS = \
     $(if $(LIBTORCH_ROOT),-lpthread,) \
     $(if $(LIBTORCH_ROOT),$(DF_SRC)/dfChemistryModel/DNNInferencer/build/libDNNInferencer.so,) \
     $(if $(PYTHON_LIB_DIR),$(PYTHON_LIB_DIR),) \
-    $(if $(AMGX_DIR), /usr/local/cuda-11.6/lib64/libcudart.so,) \
+    $(if $(AMGX_DIR), /usr/local/cuda/lib64/libcudart.so,) \
+	$(if $(AMGX_DIR), /usr/local/cuda/lib64/libnccl.so,) \
     $(if $(AMGX_DIR), $(DF_ROOT)/src_gpu/build/libdfMatrix.so,) \
-    $(if $(AMGX_DIR), $(AMGX_DIR)/build/libamgxsh.so,)
+    $(if $(AMGX_DIR), $(AMGX_DIR)/build/libamgxsh.so,)
\ No newline at end of file
diff --git a/applications/solvers/dfLowMachFoam/UEqn.H b/applications/solvers/dfLowMachFoam/UEqn.H
index 40067eac5..a0a9689b1 100644
--- a/applications/solvers/dfLowMachFoam/UEqn.H
+++ b/applications/solvers/dfLowMachFoam/UEqn.H
@@ -1,24 +1,148 @@
-start1 = std::clock();
-tmp<fvVectorMatrix> tUEqn
-(
-    fvm::ddt(rho, U) + fvm::div(phi, U)
-  + turbulence->divDevRhoReff(U) 
-);
-fvVectorMatrix& UEqn = tUEqn.ref();
-
-end1 = std::clock();
-time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
-
-UEqn.relax();
-start1 = std::clock();
-if (pimple.momentumPredictor())
-{
-    solve(UEqn == -fvc::grad(p));
+// Solve the Momentum equation
+#ifdef GPUSolverNew_
+
+#if defined DEBUG_
+    // run CPU, for temp
+    TICK_START;
+    tmp<fvVectorMatrix> tUEqn
+    (
+        fvm::ddt(rho, U) 
+        + 
+        fvm::div(phi, U)
+        +  
+        turbulence->divDevRhoReff(U)
+    );
+    fvVectorMatrix& UEqn = tUEqn.ref();
+    TICK_STOP(CPU assembly time);
+
+    volTensorField gradU = fvc::grad(U);
+    // Stage gradU boundary values (9 doubles per face). The vector owns the memory and frees it at scope exit.
+    std::vector<double> h_boundary_gradU_storage(dfDataBase.num_boundary_surfaces * 9);
+    double *h_boundary_gradU = h_boundary_gradU_storage.data(); // non-owning alias; do not delete
+    offset = 0;
+    forAll(U.boundaryField(), patchi)
+    {
+        const fvPatchTensorField& patchGradU = gradU.boundaryField()[patchi];
+        int patchsize = patchGradU.size();
+        if (patchGradU.type() == "processor"
+                || patchGradU.type() == "processorCyclic") {
+            // print info
+            if (dynamic_cast<const processorFvPatchField<tensor>&>(patchGradU).doTransform()) {
+                Info << "gradU transform = true" << endl;
+            } else {
+                Info << "gradU transform = false" << endl;
+            }
+            Info << "rank = " << dynamic_cast<const processorFvPatchField<tensor>&>(patchGradU).rank() << endl;
+
+            memcpy(h_boundary_gradU + 9*offset, &patchGradU[0][0], patchsize * 9 * sizeof(double));
+            tensorField patchGradUInternal = dynamic_cast<const processorFvPatchField<tensor>&>(patchGradU).patchInternalField()();
+            memcpy(h_boundary_gradU + 9*offset + patchsize * 9, &patchGradUInternal[0][0], patchsize * 9 * sizeof(double));
+            offset += patchsize * 2; // processor patches occupy two slots: patch values, then patchInternalField() values
+        } else {
+            memcpy(h_boundary_gradU + 9*offset, &patchGradU[0][0], patchsize * 9 * sizeof(double));
+            offset += patchsize;
+        }
+    }
+#endif
+
+    // process
+    TICK_START;
+    UEqn_GPU.process();
+    UEqn_GPU.sync();
+    TICK_STOP(GPU process time);
+
+    // postProcess
+    // TICK_START;
+    // UEqn_GPU.postProcess(h_u);
+    // memcpy(&U[0][0], h_u, dfDataBase.cell_value_vec_bytes);
+    // U.correctBoundaryConditions();
+    // K = 0.5*magSqr(U);
+    // DEBUG_TRACE;
+    // TICK_STOP(post process time);
 
+#if defined DEBUG_
+    // UEqn.relax();
+    TICK_START;
+    solve(UEqn == -fvc::grad(p));
+    K.oldTime();
     K = 0.5*magSqr(U);
-}
-end1 = std::clock();
-time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-time_monitor_UEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    TICK_STOP(CPU solve time);
+    // checkResult
+    // TODO: for temp, now we compare ldu, finally we compare csr
+    std::vector<double> h_internal_coeffs(dfDataBase.num_boundary_surfaces * 3);
+    std::vector<double> h_boundary_coeffs(dfDataBase.num_boundary_surfaces * 3);
+
+    offset = 0;
+    for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++)
+    {
+        const fvPatchVectorField& patchU = U.boundaryField()[patchi];
+        int patchsize = dfDataBase.patch_size[patchi];
+        const double* internal_coeff_ptr = &UEqn.internalCoeffs()[patchi][0][0];
+        const double* boundary_coeff_ptr = &UEqn.boundaryCoeffs()[patchi][0][0];
+        memcpy(h_internal_coeffs.data() + offset * 3, internal_coeff_ptr, patchsize * 3 * sizeof(double));
+        memcpy(h_boundary_coeffs.data() + offset * 3, boundary_coeff_ptr, patchsize * 3 * sizeof(double));
+        if (patchU.type() == "processor" || patchU.type() == "processorCyclic") offset += 2 * patchsize;
+        else offset += patchsize;
+    }
+
+    // Stage U boundary values (3 doubles per face). The vector owns the memory and frees it at scope exit.
+    std::vector<double> h_boundary_u_storage(dfDataBase.num_boundary_surfaces * 3);
+    double *h_boundary_u_tmp = h_boundary_u_storage.data(); // non-owning alias; do not delete
+    offset = 0;
+    forAll(U.boundaryField(), patchi)
+    {
+        const fvPatchVectorField& patchU = U.boundaryField()[patchi];
+        int patchsize = dfDataBase.patch_size[patchi];
+        if (patchU.type() == "processor"
+            || patchU.type() == "processorCyclic") {
+            memcpy(h_boundary_u_tmp + 3*offset, &patchU[0][0], 3*patchsize * sizeof(double));
+            vectorField patchUInternal = dynamic_cast<const processorFvPatchField<vector>&>(patchU).patchInternalField()();
+            memcpy(h_boundary_u_tmp + 3*offset + 3*patchsize, &patchUInternal[0][0], 3*patchsize * sizeof(double));
+            offset += 2 * patchsize; // processor patches occupy two slots: patch values, then patchInternalField() values
+        } else {
+            memcpy(h_boundary_u_tmp + 3*offset, &patchU[0][0], 3*patchsize * sizeof(double));
+            offset += patchsize;
+        }
+    }
+
+    bool printFlag = false;
+
+    int rank = -1;
+    if (mpi_init_flag) {
+        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    }
+    
+    if (!mpi_init_flag || rank == 0) {
+        // UEqn_GPU.compareResult(&UEqn.lower()[0], &UEqn.upper()[0], &UEqn.diag()[0], &UEqn.source()[0][0],
+        //     h_internal_coeffs.data(), h_boundary_coeffs.data(), 
+        //     // &gradU[0][0], h_boundary_gradU,
+        //     printFlag);
+        // UEqn_GPU.compareU(&U[0][0], h_boundary_u_tmp, printFlag);
+    }
+    DEBUG_TRACE;
+#endif
+
+#else
+    start1 = std::clock();
+    tmp<fvVectorMatrix> tUEqn
+    (
+        fvm::ddt(rho, U) + fvm::div(phi, U)
+    + turbulence->divDevRhoReff(U) 
+    );
+    fvVectorMatrix& UEqn = tUEqn.ref();
+    end1 = std::clock();
+    time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
 
+    // UEqn.relax();
+    start1 = std::clock();
+    if (pimple.momentumPredictor())
+    {
+        solve(UEqn == -fvc::grad(p));
+        K.oldTime();
+        K = 0.5*magSqr(U);
+    }
+    end1 = std::clock();
+    time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_UEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
+#endif
diff --git a/applications/solvers/dfLowMachFoam/YEqn.H b/applications/solvers/dfLowMachFoam/YEqn.H
index f2b557b81..2120408d0 100644
--- a/applications/solvers/dfLowMachFoam/YEqn.H
+++ b/applications/solvers/dfLowMachFoam/YEqn.H
@@ -1,27 +1,67 @@
-hDiffCorrFlux = Zero;
-diffAlphaD = Zero;
-sumYDiffError = Zero;
+#ifdef GPUSolverNew_
+#if defined DEBUG_
+    hDiffCorrFlux = Zero;
+    diffAlphaD = Zero;
+    sumYDiffError = Zero;
 
-tmp<fv::convectionScheme<scalar>> mvConvection
-(
-    fv::convectionScheme<scalar>::New
+    tmp<fv::convectionScheme<scalar>> mvConvection
     (
-        mesh,
-        fields,
-        phi,
-        mesh.divScheme("div(phi,Yi_h)")
-    )
-);
-
-start1 = std::clock();
-forAll(Y, i)
-{
-    sumYDiffError += chemistry->rhoD(i)*fvc::grad(Y[i]);
-}
-const surfaceScalarField phiUc = linearInterpolate(sumYDiffError) & mesh.Sf();
-start1 = std::clock();
-time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-time_monitor_YEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);  
+        fv::convectionScheme<scalar>::New
+        (
+            mesh,
+            fields,
+            phi,
+            mesh.divScheme("div(phi,Yi_h)")
+        )
+    );
+    // auto& mgcs = dynamic_cast<fv::multivariateGaussConvectionScheme<scalar>&>(mvConvection.ref());
+    // tmp<surfaceInterpolationScheme<scalar>> tinterpScheme_ = mgcs.interpolationScheme()()(Y[0]);
+    // tmp<surfaceScalarField> tweights = tinterpScheme_().weights(Y[0]);
+    // const surfaceScalarField& weights = tweights();
+    // Info << "CPU weights\n" << weights << endl;
+
+    // auto& limitedScheme_ = dynamic_cast<const limitedSurfaceInterpolationScheme<scalar>&>(tinterpScheme_());
+    // Info << "CPU limiter\n" << limitedScheme_.limiter(Y[0]) << endl;
+
+    forAll(Y, i)
+    {
+        sumYDiffError += chemistry->rhoD(i)*fvc::grad(Y[i]);
+    }
+    const surfaceScalarField phiUc = linearInterpolate(sumYDiffError) & mesh.Sf();
+#endif
+#else // should only for CPUSolver
+    hDiffCorrFlux = Zero;
+    diffAlphaD = Zero;
+    sumYDiffError = Zero;
+
+    tmp<fv::convectionScheme<scalar>> mvConvection
+    (
+        fv::convectionScheme<scalar>::New
+        (
+            mesh,
+            fields,
+            phi,
+            mesh.divScheme("div(phi,Yi_h)")
+        )
+    );
+
+    // auto& mgcs = dynamic_cast<fv::multivariateGaussConvectionScheme<scalar>&>(mvConvection.ref());
+    // tmp<surfaceInterpolationScheme<scalar>> tinterpScheme_ = mgcs.interpolationScheme()()(Y[0]);
+    // tmp<surfaceScalarField> tweights = tinterpScheme_().weights(Y[0]);
+    // const surfaceScalarField& weights = tweights();
+    // Info << "CPU weights\n" << weights << endl;
+
+    start1 = std::clock(); // start stamp for accumulating sumYDiffError and building phiUc
+    forAll(Y, i)
+    {
+        sumYDiffError += chemistry->rhoD(i)*fvc::grad(Y[i]);
+    }
+    // Info << "sumYDiffError\n" << sumYDiffError << endl;
+    const surfaceScalarField phiUc = linearInterpolate(sumYDiffError) & mesh.Sf();
+    end1 = std::clock(); // stop stamp; start1 was overwritten here before, so the sums below used a stale end1
+    time_monitor_YEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    time_monitor_YEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
+#endif
 
 //MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM);
 label flag_mpi_init;
@@ -29,6 +69,198 @@ MPI_Initialized(&flag_mpi_init);
 if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM);
 
 {
+
+#ifdef GPUSolverNew_
+#if defined DEBUG_
+    // run CPU
+    volScalarField Yt(0.0*Y[0]);
+    int speciesIndex = 0;
+    forAll(Y, i)
+    {
+        volScalarField& Yi = Y[i];
+        hDiffCorrFlux += chemistry->hai(i)*(chemistry->rhoD(i)*fvc::grad(Yi) - Yi*sumYDiffError);
+        diffAlphaD += fvc::laplacian(thermo.alpha()*chemistry->hai(i), Yi);
+        if (i != inertIndex)
+        {
+            start1 = std::clock();
+            tmp<volScalarField> DEff = chemistry->rhoD(i) + turbulence->mut()/Sct;
+
+            fvScalarMatrix YiEqn
+            (
+                fvm::ddt(rho, Yi)
+            +
+                (
+                    turbName == "laminar"
+                    ?  (mvConvection->fvmDiv(phi, Yi) + mvConvection->fvmDiv(phiUc, Yi))
+                    :   mvConvection->fvmDiv(phi, Yi)
+                )
+            ==
+                (
+                    splitting
+                    ?   fvm::laplacian(DEff(), Yi)
+                    :  (fvm::laplacian(DEff(), Yi) + combustion->R(Yi))
+                    )
+            );
+            
+            end1 = std::clock();
+            time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
+            // YiEqn.relax();
+
+            start1 = std::clock();
+            YiEqn.solve("Yi");
+            end1 = std::clock();
+            time_monitor_YEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
+
+            Yi.max(0.0);
+            Yt += Yi;
+            ++speciesIndex;
+        }
+    }
+    Y[inertIndex] = scalar(1) - Yt;
+    Y[inertIndex].max(0.0);
+
+    int specie_index = 0;
+
+    // should compute grad_yi before YiEqn.solve()
+    const volVectorField grad_yi = fvc::grad(Y[specie_index]);
+
+    tmp<volScalarField> DEff = chemistry->rhoD(specie_index) + turbulence->mut()/Sct;
+    fvScalarMatrix YiEqn
+        (
+         fvm::ddt(rho, Y[specie_index])
+         + mvConvection->fvmDiv(phi, Y[specie_index])
+         + mvConvection->fvmDiv(phiUc, Y[specie_index])
+         ==
+         fvm::laplacian(DEff(), Y[specie_index])
+        );
+    // YiEqn.relax();
+    // YiEqn.solve("Yi");
+    // Y[specie_index].max(0.0);
+#endif
+
+    // process
+    YEqn_GPU.process();
+    YEqn_GPU.sync();
+
+#if defined DEBUG_
+    std::vector<double> h_boundary_diffAlphaD;
+    std::vector<double> h_boundary_grad_yi;
+    std::vector<double> h_boundary_sumYDiffError;
+    std::vector<double> h_boundary_hDiffCorrFlux;
+    std::vector<double> h_boundary_phiUc;
+    h_boundary_diffAlphaD.resize(dfDataBase.num_boundary_surfaces);
+    h_boundary_grad_yi.resize(dfDataBase.num_boundary_surfaces * 3);
+    h_boundary_sumYDiffError.resize(dfDataBase.num_boundary_surfaces * 3);
+    h_boundary_hDiffCorrFlux.resize(dfDataBase.num_boundary_surfaces * 3);
+    h_boundary_phiUc.resize(dfDataBase.num_boundary_surfaces);
+    offset = 0;
+    forAll(diffAlphaD.boundaryField(), patchi)
+    {
+        //const scalarField& patchdiffAlphaD = diffAlphaD.boundaryField()[patchi];
+        const fvPatchScalarField& patchdiffAlphaD = diffAlphaD.boundaryField()[patchi];
+        const fvPatchVectorField& patchgradyi = grad_yi.boundaryField()[patchi];
+        const fvPatchVectorField& patchsumYDiffError = sumYDiffError.boundaryField()[patchi];
+        const fvPatchVectorField& patchhDiffCorrFlux = hDiffCorrFlux.boundaryField()[patchi];
+        const fvsPatchScalarField& patchphiUc = phiUc.boundaryField()[patchi];
+        int patchSize = patchdiffAlphaD.size();
+        if (patchdiffAlphaD.type() == "processor"
+            || patchdiffAlphaD.type() == "processorCyclic") { // processor patches: two patchSize-long slots per field
+            scalarField patchdiffAlphaDInternal = dynamic_cast<const processorFvPatchField<scalar>&>(patchdiffAlphaD).patchInternalField()();
+            vectorField patchgradyiInternal = dynamic_cast<const processorFvPatchField<vector>&>(patchgradyi).patchInternalField()();
+            vectorField patchsumYDiffErrorInternal = dynamic_cast<const processorFvPatchField<vector>&>(patchsumYDiffError).patchInternalField()();
+            vectorField patchhDiffCorrFluxInternal = dynamic_cast<const processorFvPatchField<vector>&>(patchhDiffCorrFlux).patchInternalField()();
+            memcpy(h_boundary_diffAlphaD.data() + offset, &patchdiffAlphaD[0], patchSize*sizeof(double)); // slot 1: patch values
+            memcpy(h_boundary_diffAlphaD.data() + offset + patchSize, &patchdiffAlphaDInternal[0], patchSize*sizeof(double)); // slot 2: patchInternalField() values
+            memcpy(h_boundary_grad_yi.data() + offset * 3, &patchgradyi[0][0], patchSize * 3 *sizeof(double));
+            memcpy(h_boundary_grad_yi.data() + (offset + patchSize) * 3, &patchgradyiInternal[0][0], patchSize * 3 *sizeof(double));
+            memcpy(h_boundary_sumYDiffError.data() + offset * 3, &patchsumYDiffError[0][0], patchSize * 3 *sizeof(double));
+            memcpy(h_boundary_sumYDiffError.data() + (offset + patchSize) * 3, &patchsumYDiffErrorInternal[0][0], patchSize * 3 *sizeof(double));
+            memcpy(h_boundary_hDiffCorrFlux.data() + offset * 3, &patchhDiffCorrFlux[0][0], patchSize * 3 *sizeof(double));
+            memcpy(h_boundary_hDiffCorrFlux.data() + (offset + patchSize) * 3, &patchhDiffCorrFluxInternal[0][0], patchSize * 3 *sizeof(double));
+            memcpy(h_boundary_phiUc.data() + offset, &patchphiUc[0], patchSize*sizeof(double));
+            memcpy(h_boundary_phiUc.data() + offset, &patchphiUc[0], patchSize*sizeof(double)); // NOTE(review): exact duplicate of the line above; slot 2 (offset + patchSize) is never written for phiUc - confirm the intended destination
+            offset += patchSize * 2;
+        } else { // all other patch types: one patchSize-long slot per field
+            memcpy(h_boundary_diffAlphaD.data() + offset, &patchdiffAlphaD[0], patchSize*sizeof(double));
+            memcpy(h_boundary_grad_yi.data() + offset * 3, &patchgradyi[0][0], patchSize * 3 *sizeof(double));
+            memcpy(h_boundary_sumYDiffError.data() + offset * 3, &patchsumYDiffError[0][0], patchSize * 3 *sizeof(double));
+            memcpy(h_boundary_hDiffCorrFlux.data() + offset * 3, &patchhDiffCorrFlux[0][0], patchSize * 3 *sizeof(double));
+            memcpy(h_boundary_phiUc.data() + offset, &patchphiUc[0], patchSize*sizeof(double));
+            offset += patchSize;
+        }
+    }
+    DEBUG_TRACE;
+    // YEqn_GPU.comparediffAlphaD(&diffAlphaD[0], h_boundary_diffAlphaD.data(), false);
+    // YEqn_GPU.comparegradyi(&grad_yi[0][0], h_boundary_grad_yi.data(), specie_index, false);
+    // YEqn_GPU.comparesumYDiffError(&sumYDiffError[0][0], h_boundary_sumYDiffError.data(), false);
+    // YEqn_GPU.comparehDiffCorrFlux(&hDiffCorrFlux[0][0], h_boundary_hDiffCorrFlux.data(), false);
+    // YEqn_GPU.comparephiUc(&phiUc[0], h_boundary_phiUc.data(), false);
+    DEBUG_TRACE;
+
+    // checkResult
+    // TODO: for temp, now we compare ldu, finally we compare csr
+    std::vector<double> yeqn_h_internal_coeffs(dfDataBase.num_boundary_surfaces);
+    std::vector<double> yeqn_h_boundary_coeffs(dfDataBase.num_boundary_surfaces);
+
+    offset = 0;
+    forAll(Y[specie_index].boundaryField(), patchi)
+    {
+        const fvPatchScalarField& patchYi = Y[specie_index].boundaryField()[patchi];
+        int patchsize = patchYi.size();
+        const double* internal_coeff_ptr = &YiEqn.internalCoeffs()[patchi][0];
+        const double* boundary_coeff_ptr = &YiEqn.boundaryCoeffs()[patchi][0];
+        if (patchYi.type() == "processor"
+            || patchYi.type() == "processorCyclic") {
+            memcpy(yeqn_h_internal_coeffs.data() + offset, internal_coeff_ptr, patchsize * sizeof(double));
+            memset(yeqn_h_internal_coeffs.data() + offset + patchsize, 0, patchsize * sizeof(double));
+            memcpy(yeqn_h_boundary_coeffs.data() + offset, boundary_coeff_ptr, patchsize * sizeof(double));
+            memset(yeqn_h_boundary_coeffs.data() + offset + patchsize, 0, patchsize * sizeof(double));
+            offset += patchsize * 2;
+        } else {
+            memcpy(yeqn_h_internal_coeffs.data() + offset, internal_coeff_ptr, patchsize * sizeof(double));
+            memcpy(yeqn_h_boundary_coeffs.data() + offset, boundary_coeff_ptr, patchsize * sizeof(double));
+            offset += patchsize;
+        }
+    }
+    // NOTE: ldu and yi can't be compared at the same time
+    // to compare ldu data, you should open both DEBUG_ and DEBUG_CHECK_LDU in src_gpu
+    // to compare yi, you should only open DEBUG_ in src_gpu.
+    // Besides, if you compare ldu data, be patient to keep specie_index in YEqn.H and dfYEqn.cu the same.
+    //DEBUG_TRACE;
+    bool printFlag = false;
+    int rank = -1;
+    if (mpi_init_flag) {
+        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    }
+    if (!mpi_init_flag || rank == 0) {
+        // YEqn_GPU.compareResult(&YiEqn.lower()[0], &YiEqn.upper()[0], &YiEqn.diag()[0], &YiEqn.source()[0],
+        //         yeqn_h_internal_coeffs.data(), yeqn_h_boundary_coeffs.data(), printFlag);
+    }
+
+    DEBUG_TRACE;
+    // YEqn_GPU.compareYi(&Y[specie_index][0], specie_index, false);
+    // DEBUG_TRACE;
+#endif
+
+    // postProcess
+    double *h_y = dfDataBase.getFieldPointer("y", location::cpu, position::internal);
+    double *h_boundary_y = dfDataBase.getFieldPointer("y", location::cpu, position::boundary);
+    // YEqn_GPU.postProcess(h_y, h_boundary_y);
+    DEBUG_TRACE;
+
+    // copy h_y to Y(cpu)
+    // offset = 0;
+    // forAll(Y, i)
+    // {
+    //     volScalarField& Yi = Y[i];
+    //     memcpy(&Yi[0], h_y + offset, Yi.size() * sizeof(double));
+    //     offset += Yi.size();
+    //     Yi.correctBoundaryConditions();
+    // }
+    DEBUG_TRACE;
+
+    fflush(stderr);
+#else
     if (!splitting)
     {
         std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
@@ -72,7 +304,7 @@ if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM);
             
             end1 = std::clock();
             time_monitor_YEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
-            YiEqn.relax();
+            // YiEqn.relax();
 
             start1 = std::clock();
             YiEqn.solve("Yi");
@@ -88,4 +320,5 @@ if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM);
     Y[inertIndex].max(0.0);
     end2 = std::clock();
     time_monitor_YEqn += double(end2 - start2) / double(CLOCKS_PER_SEC);
+#endif
 }
diff --git a/applications/solvers/dfLowMachFoam/createFields.H b/applications/solvers/dfLowMachFoam/createFields.H
index 15413d462..13a8fac4f 100644
--- a/applications/solvers/dfLowMachFoam/createFields.H
+++ b/applications/solvers/dfLowMachFoam/createFields.H
@@ -9,7 +9,9 @@ fluidThermo& thermo = *pThermo;
 
 const volScalarField& psi = thermo.psi();
 volScalarField& p = thermo.p();
+p.correctBoundaryConditions();
 volScalarField& T = thermo.T();
+T.correctBoundaryConditions();
 volScalarField rho
 (
     IOobject
@@ -37,6 +39,7 @@ volVectorField U
     ),
     mesh
 );
+U.correctBoundaryConditions();
 
 #include "compressibleCreatePhi.H"
 
@@ -87,6 +90,10 @@ const word turbName(mesh.objectRegistry::lookupObject<IOdictionary>("turbulenceP
 
 dfChemistryModel<basicThermo>* chemistry = combustion->chemistry();
 PtrList<volScalarField>& Y = chemistry->Y();
+forAll(Y, i)
+{
+    Y[i].correctBoundaryConditions();
+}
 const word inertSpecie(chemistry->lookup("inertSpecie"));
 const label inertIndex(chemistry->species()[inertSpecie]);
 chemistry->setEnergyName("ha");
@@ -98,6 +105,7 @@ if (combModelName != "flareFGM")
     chemistry->correctThermo();
     Info<< "At initial time, min/max(T) = " << min(T).value() << ", " << max(T).value() << endl;
 }
+rho = thermo.rho();
 
 //for dpdt
 
@@ -168,6 +176,22 @@ IOdictionary CanteraTorchProperties
         IOobject::NO_WRITE
     )
 );
+
+volScalarField UEqn_A // scalar volume field named "A(<U name>)"; presumably the momentum-matrix A coefficient - TODO confirm against its users
+(
+    IOobject
+    (
+        "A("+U.name()+')',
+        runTime.timeName(),
+        mesh,
+        IOobject::NO_READ, // constructed in memory, not read from the case
+        IOobject::NO_WRITE // not written with the time directories
+    ),
+    mesh,
+    dimensionedScalar(dimensionSet(1,-3,-1,0,0,0,0), Zero), // zero-valued; exponents 1,-3,-1 are mass, length, time in OpenFOAM's dimensionSet order
+    extrapolatedCalculatedFvPatchScalarField::typeName // patch type used for every boundary patch
+);
+
 const Switch splitting = CanteraTorchProperties.lookupOrDefault("splittingStrategy", false);
 #ifdef USE_PYTORCH
     const Switch log_ = CanteraTorchProperties.subDict("TorchSettings").lookupOrDefault("log", false);
diff --git a/applications/solvers/dfLowMachFoam/createGPUSolver.H b/applications/solvers/dfLowMachFoam/createGPUSolver.H
new file mode 100644
index 000000000..79455fb22
--- /dev/null
+++ b/applications/solvers/dfLowMachFoam/createGPUSolver.H
@@ -0,0 +1,709 @@
+#include <queue>
+
+ncclUniqueId nccl_id; // file-scope NCCL/MPI state; initNccl() hands these to ncclInit()
+ncclComm_t nccl_comm;
+int nRanks, myRank, localRank, mpi_init_flag = 0; // the "= 0" initialiser belongs to mpi_init_flag only
+// Shared GPU database followed by the per-equation GPU objects; each one is constructed from dfDataBase.
+dfMatrixDataBase dfDataBase;
+dfThermo thermo_GPU(dfDataBase);
+dfChemistrySolver chemistrySolver_GPU(dfDataBase);
+dfRhoEqn rhoEqn_GPU(dfDataBase);
+dfUEqn UEqn_GPU(dfDataBase);
+dfYEqn YEqn_GPU(dfDataBase, chemistrySolver_GPU); // additionally receives the chemistry solver
+dfEEqn EEqn_GPU(dfDataBase, thermo_GPU); // additionally receives the thermo object
+dfpEqn pEqn_GPU(dfDataBase);
+
+#if defined(DEBUG_)
+template <typename T> // T must be one of the six surface*/vol* Scalar/Vector/Tensor field types tested below
+void getTypeInfo(size_t *stride, size_t *internal_size, size_t *boundary_size) { // results are written through the three pointers
+    size_t s = 1; // number of double components per field value (1, 3 or 9)
+    bool isVol = false; // true: sized by num_cells; false: sized by num_surfaces
+    if (typeid(T) == typeid(surfaceScalarField)) {
+        s = 1;
+        isVol = false;
+    } else if (typeid(T) == typeid(surfaceVectorField)) {
+        s = 3;
+        isVol = false;
+    } else if (typeid(T) == typeid(surfaceTensorField)) {
+        s = 9;
+        isVol = false;
+    } else if (typeid(T) == typeid(volScalarField)) {
+        s = 1;
+        isVol = true;
+    } else if (typeid(T) == typeid(volVectorField)) {
+        s = 3;
+        isVol = true;
+    } else if (typeid(T) == typeid(volTensorField)) {
+        s = 9;
+        isVol = true;
+    } else {
+        fprintf(stderr, "ERROR! Unsupported field type()!\n");
+        exit(EXIT_FAILURE); // any other T terminates the program at run time
+    }
+    *stride = s;
+    *internal_size = (isVol ? dfDataBase.num_cells : dfDataBase.num_surfaces) * s; // count of doubles, already scaled by stride
+    *boundary_size = dfDataBase.num_boundary_surfaces * s; // count of doubles, already scaled by stride
+}
+
+template <typename T> // queues &field[0] first, then &patchField[0] for every boundary patch in patch order
+void getFieldPtr(std::queue<double*>& fieldPtrQue, T& field){
+    fieldPtrQue.push(&field[0]); // NOTE(review): the queue holds double*, so this presumably only compiles for scalar fields — confirm
+    forAll(field.boundaryField(), patchi){
+        auto& patchField = field.boundaryFieldRef()[patchi];
+        fieldPtrQue.push(&patchField[0]); // address of the first value of this patch
+    }
+};
+
+// Debug helper: overwrites the internal and boundary values of `field` with pseudo-random numbers
+// in [-0.5, 0.5) taken from rand(), then calls field.correctBoundaryConditions().
+// getTypeInfo() aborts the program for unsupported field types.
+template <typename T>
+void randomInitField(T& field) {
+    size_t stride = 0;
+    size_t internal_size = 0; // getTypeInfo() returns this already multiplied by stride
+    size_t boundary_size = 0; // filled by getTypeInfo() but not used here
+    getTypeInfo<T>(&stride, &internal_size, &boundary_size);
+    const size_t internal_value_bytes = internal_size * sizeof(double);
+    std::queue<double*> fieldPtrQue;
+    getFieldPtr(fieldPtrQue, field);
+
+    // internal values
+    // Take the pointer by value: a reference to front() would dangle as soon as pop() runs.
+    double *field_internal_ptr = fieldPtrQue.front();
+    fieldPtrQue.pop();
+    std::vector<double> init_field_internal(internal_size);
+    for (size_t i = 0; i < internal_size; i++) {
+        init_field_internal[i] = (rand() % 10000 - 5000) / 10000.0;
+    }
+    memcpy(field_internal_ptr, init_field_internal.data(), internal_value_bytes);
+
+    // boundary values: getFieldPtr() queued one pointer per patch, in patch order
+    forAll(field.boundaryField(), patchi)
+    {
+        auto& patchField = field.boundaryFieldRef()[patchi];
+        const size_t patch_values = patchField.size() * stride;
+        double *field_boundary_ptr = fieldPtrQue.front();
+        fieldPtrQue.pop();
+        std::vector<double> init_field_boundary(patch_values);
+        for (size_t i = 0; i < patch_values; i++) {
+            init_field_boundary[i] = (rand() % 10000 - 5000) / 10000.0;
+        }
+        memcpy(field_boundary_ptr, init_field_boundary.data(), patch_values * sizeof(double));
+    }
+
+    // re-evaluate the boundary conditions after the raw overwrite
+    field.correctBoundaryConditions();
+}
+#endif
+
+void initNccl() { // forwards OpenFOAM's MPI communicator and the file-scope NCCL/rank variables to ncclInit()
+    ncclInit(PstreamGlobals::MPI_COMM_FOAM, nccl_comm, nccl_id, &nRanks, &myRank, &localRank, &mpi_init_flag);
+}
+
+// Fills the global dfDataBase from the OpenFOAM mesh: sizes, MPI/NCCL and cyclic-patch info, AmgX settings,
+// LDU and processor-interface addressing, and the constant geometric fields (internal and boundary).
+void createGPUBase(const IOdictionary& CanteraTorchProperties, fvMesh& mesh, PtrList<volScalarField>& Y) {
+    // prepare constant values: num_cells, num_surfaces, num_boundary_surfaces,
+    // num_patches, patch_size, num_species, rdelta_t
+    const labelUList& owner = mesh.owner();
+    const labelUList& neighbour = mesh.neighbour();
+    int num_cells = mesh.nCells();
+    int num_total_cells = Foam::returnReduce(num_cells, sumOp<label>());
+    int num_surfaces = neighbour.size();
+    int num_boundary_surfaces = 0;
+    int num_patches = 0;
+    std::vector<int> patch_size;
+    forAll(mesh.boundary(), patchi) {
+        const fvPatchScalarField& patchY = Y[0].boundaryField()[patchi];
+        int patchsize = patchY.size();
+        patch_size.push_back(patchsize);
+        if (patchY.type() == "processor"
+            || patchY.type() == "processorCyclic") {
+            num_boundary_surfaces += patchsize * 2; // patchNeighbourfield and patchInternalfield
+        } else {
+            num_boundary_surfaces += patchsize;
+        }
+        num_patches++;
+    }
+    // prepare interface info
+    const globalIndex globalNumbering(num_cells);
+    const lduInterfacePtrsList interfaces(mesh.interfaces());
+    const lduAddressing& lduAddr = mesh.lduAddr();
+    labelList globalCells(num_cells);
+    forAll(globalCells, celli)
+    {
+        globalCells[celli] = globalNumbering.toGlobal(Pstream::myProcNo(), celli);
+    }
+    const label nReq = Pstream::nRequests();
+    label nProcValues = 0;
+    // send global cells
+    forAll(interfaces, patchi)
+    {
+        if (interfaces.set(patchi))
+        {
+            nProcValues += lduAddr.patchAddr(patchi).size();
+
+            // send patchInternalField
+            interfaces[patchi].initInternalFieldTransfer
+            (
+                Pstream::commsTypes::nonBlocking,
+                globalCells
+            );
+        }
+    }
+    // TODO: get deltaT from time API
+    double rDeltaT = 1 / 1e-6;
+    dfDataBase.setConstantValues(num_cells, num_total_cells, num_surfaces, num_boundary_surfaces,
+            num_patches, nProcValues, patch_size, Y.size(), rDeltaT);
+
+    // wyr: now there is no other place to prepare nccl info, thus mpi must be initialized at beginning.
+    int flag_mpi_init = 0; // MPI_Initialized() writes through an int*, independent of the label width
+    MPI_Initialized(&flag_mpi_init);
+    if(flag_mpi_init) {
+        std::vector<int> GPUNeighbProcNo(dfDataBase.num_patches, -1);
+        // get basic communication info from of
+        forAll(Y[0].boundaryField(), patchi) {
+            if (Y[0].boundaryField()[patchi].type() == "processor"
+                || Y[0].boundaryField()[patchi].type() == "processorCyclic") {
+                GPUNeighbProcNo[patchi] = dynamic_cast<const processorFvPatchField<scalar>&>(Y[0].boundaryField()[patchi]).neighbProcNo();
+            }
+        }
+        // prepare mpi and nccl info
+        dfDataBase.setCommInfo(PstreamGlobals::MPI_COMM_FOAM, nccl_comm, nccl_id, nRanks, myRank, localRank, GPUNeighbProcNo);
+    }
+
+    // get cyclic neighbor when has cyclic patch
+    // - get boundary Index
+    std::map<std::string, int> patchNameToIndex;
+    forAll(Y[0].boundaryField(), patchi) {
+        patchNameToIndex[Y[0].boundaryField()[patchi].patch().name()] = patchi;
+    }
+    // - get cyclic neighbor
+    std::vector<int> cyclicNeighbor(dfDataBase.num_patches, -1);
+    forAll(Y[0].boundaryField(), patchi) {
+        if (Y[0].boundaryField()[patchi].type() == "cyclic") {
+            cyclicNeighbor[patchi] = patchNameToIndex[dynamic_cast<const cyclicFvPatchField<scalar>&>(Y[0].boundaryField()[patchi]).
+                    cyclicPatch().cyclicPatch().neighbPatch().name()];
+        }
+    }
+    dfDataBase.setCyclicInfo(cyclicNeighbor);
+
+    // prepare cuda resources
+    dfDataBase.prepareCudaResources();
+
+    // setup amgx solvers
+    string mode_string = "dDDI";
+    string u_setting_path = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("UEqnSettingPath", string(""));
+    string p_setting_path = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("pEqnSettingPath", string(""));
+    dfDataBase.setAmgxSolvers(mode_string, u_setting_path, p_setting_path);
+
+    // prepare constant indexes: owner, neighbor, procRows, procCols
+    if (Pstream::parRun())
+    {
+        Pstream::waitRequests(nReq);
+    }
+    labelField procRows(nProcValues, 0);
+    labelField procCols(nProcValues, 0);
+    nProcValues = 0;
+
+    forAll(interfaces, patchi)
+    {
+        if (interfaces.set(patchi))
+        {
+            // local cell index
+            const labelUList& faceCells = lduAddr.patchAddr(patchi);
+            const label len = faceCells.size();
+
+            // global col index
+            labelField nbrCells
+            (
+                interfaces[patchi].internalFieldTransfer
+                (
+                    Pstream::commsTypes::nonBlocking,
+                    globalCells
+                )
+            );
+
+            if (faceCells.size() != nbrCells.size())
+            {
+                FatalErrorInFunction
+                    << "Mismatch in interface sizes (AMI?)" << nl
+                    << "Have " << faceCells.size() << " != "
+                    << nbrCells.size() << nl
+                    << exit(FatalError);
+            }
+
+            // for AMGx: Local rows, Global columns
+            SubList<label>(procRows, len, nProcValues) = faceCells;
+            SubList<label>(procCols, len, nProcValues) = nbrCells;
+            nProcValues += len;
+        }
+    }
+    label globalOffset = globalNumbering.toGlobal(0);
+    dfDataBase.setConstantIndexes(&owner[0], &neighbour[0], &procRows[0], &procCols[0], globalOffset);
+
+    // prepare internal and boundary of sf, mag_sf, weights, delta_coeffs, volume
+    double *boundary_sf = new double[3 * num_boundary_surfaces];
+    double *boundary_mag_sf = new double[num_boundary_surfaces];
+    double *boundary_delta_coeffs = new double[num_boundary_surfaces];
+    double *boundary_weights = new double[num_boundary_surfaces];
+    int *boundary_face_cell = new int[num_boundary_surfaces]; // NOTE(review): filled by memcpy from label lists, so label must be int-sized — confirm
+    std::vector<int> patch_type_calculated(num_patches, 5); // default patch type is calculated
+    std::vector<int> patch_type_extrapolated(num_patches, 8); // default patch type is extrapolated
+
+    int offset = 0;
+    forAll(mesh.boundary(), patchi) {
+        const fvPatchScalarField& patchY = Y[0].boundaryField()[patchi];
+        const vectorField& pSf = mesh.Sf().boundaryField()[patchi];
+        const scalarField& pMagSf = mesh.magSf().boundaryField()[patchi];
+        const scalarField& pDeltaCoeffs = mesh.deltaCoeffs().boundaryField()[patchi];
+        const scalarField& pWeights = mesh.surfaceInterpolation::weights().boundaryField()[patchi];
+        const labelUList& pFaceCells = mesh.boundary()[patchi].faceCells();
+
+        int patchsize = pMagSf.size();
+
+        if (patchY.type() == "processor") {
+            patch_type_calculated[patchi] = 7; // patchi type is processor
+            patch_type_extrapolated[patchi] = 7; // patchi type is processor
+
+            memcpy(boundary_sf + 3*offset, &pSf[0][0], 3*patchsize*sizeof(double));
+            memcpy(boundary_sf + 3*offset + 3*patchsize, &pSf[0][0], 3*patchsize*sizeof(double));
+
+            memcpy(boundary_mag_sf + offset, &pMagSf[0], patchsize*sizeof(double));
+            memcpy(boundary_mag_sf + offset + patchsize, &pMagSf[0], patchsize*sizeof(double));
+
+            memcpy(boundary_delta_coeffs + offset, &pDeltaCoeffs[0], patchsize*sizeof(double));
+            memcpy(boundary_delta_coeffs + offset + patchsize, &pDeltaCoeffs[0], patchsize*sizeof(double));
+
+            memcpy(boundary_weights + offset, &pWeights[0], patchsize*sizeof(double));
+            memcpy(boundary_weights + offset + patchsize, &pWeights[0], patchsize*sizeof(double));
+
+            memcpy(boundary_face_cell + offset, &pFaceCells[0], patchsize * sizeof(int));
+            memcpy(boundary_face_cell + offset + patchsize, &pFaceCells[0], patchsize * sizeof(int));
+
+            offset += patchsize * 2;
+        } else if (patchY.type() == "processorCyclic") {
+            patch_type_calculated[patchi] = 10; // patchi type is processorCyclic
+            patch_type_extrapolated[patchi] = 10; // patchi type is processorCyclic
+
+            memcpy(boundary_sf + 3*offset, &pSf[0][0], 3*patchsize*sizeof(double));
+            memcpy(boundary_sf + 3*offset + 3*patchsize, &pSf[0][0], 3*patchsize*sizeof(double));
+
+            memcpy(boundary_mag_sf + offset, &pMagSf[0], patchsize*sizeof(double));
+            memcpy(boundary_mag_sf + offset + patchsize, &pMagSf[0], patchsize*sizeof(double));
+
+            memcpy(boundary_delta_coeffs + offset, &pDeltaCoeffs[0], patchsize*sizeof(double));
+            memcpy(boundary_delta_coeffs + offset + patchsize, &pDeltaCoeffs[0], patchsize*sizeof(double));
+
+            memcpy(boundary_weights + offset, &pWeights[0], patchsize*sizeof(double));
+            memcpy(boundary_weights + offset + patchsize, &pWeights[0], patchsize*sizeof(double));
+
+            memcpy(boundary_face_cell + offset, &pFaceCells[0], patchsize * sizeof(int));
+            memcpy(boundary_face_cell + offset + patchsize, &pFaceCells[0], patchsize * sizeof(int));
+
+            offset += patchsize * 2;
+        } else if (patchY.type() == "cyclic") {
+            patch_type_calculated[patchi] = 6; // patchi type is cyclic
+            patch_type_extrapolated[patchi] = 6; // patchi type is cyclic
+
+            memcpy(boundary_sf + 3*offset, &pSf[0][0], 3*patchsize*sizeof(double));
+            memcpy(boundary_mag_sf + offset, &pMagSf[0], patchsize*sizeof(double));
+            memcpy(boundary_delta_coeffs + offset, &pDeltaCoeffs[0], patchsize*sizeof(double));
+            memcpy(boundary_weights + offset, &pWeights[0], patchsize*sizeof(double));
+            memcpy(boundary_face_cell + offset, &pFaceCells[0], patchsize * sizeof(int));
+
+            offset += patchsize;
+        } else {
+            memcpy(boundary_sf + 3*offset, &pSf[0][0], 3*patchsize*sizeof(double));
+            memcpy(boundary_mag_sf + offset, &pMagSf[0], patchsize*sizeof(double));
+            memcpy(boundary_delta_coeffs + offset, &pDeltaCoeffs[0], patchsize*sizeof(double));
+            memcpy(boundary_weights + offset, &pWeights[0], patchsize*sizeof(double));
+            memcpy(boundary_face_cell + offset, &pFaceCells[0], patchsize * sizeof(int));
+
+            offset += patchsize;
+        }
+    }
+
+    dfDataBase.createConstantFieldsInternal();
+    dfDataBase.createConstantFieldsBoundary();
+
+    // construct mesh distance for limitedLinear scheme
+    vectorField meshDistance = mesh.Sf();
+    forAll(meshDistance, facei) {
+        label own = owner[facei];
+        label nei = neighbour[facei];
+        meshDistance[facei] = mesh.C()[nei] - mesh.C()[own];
+    }
+
+    dfDataBase.initConstantFieldsInternal(&mesh.Sf()[0][0], &mesh.magSf()[0], &mesh.surfaceInterpolation::weights()[0],
+            &mesh.deltaCoeffs()[0], &mesh.V()[0], &meshDistance[0][0]);
+    dfDataBase.initConstantFieldsBoundary(boundary_sf, boundary_mag_sf, boundary_delta_coeffs, boundary_weights, boundary_face_cell,
+            patch_type_calculated, patch_type_extrapolated);
+
+    dfDataBase.createNonConstantFieldsInternal();
+    dfDataBase.createNonConstantFieldsBoundary();
+
+    delete[] boundary_sf; // arrays obtained from new[] must be released with delete[]
+    delete[] boundary_mag_sf;
+    delete[] boundary_delta_coeffs;
+    delete[] boundary_weights;
+    delete[] boundary_face_cell;
+}
+
+// Stages rho and phi (internal + boundary values) in dfDataBase's host buffers, then initialises rhoEqn_GPU.
+void createGPURhoEqn(const volScalarField& rho, const surfaceScalarField& phi) {
+    DEBUG_TRACE;
+    std::vector<int> patchTypes(dfDataBase.num_patches);
+
+    double *hostRho = dfDataBase.getFieldPointer("rho", location::cpu, position::internal);
+    double *hostPhi = dfDataBase.getFieldPointer("phi", location::cpu, position::internal);
+    double *hostBoundaryPhi = dfDataBase.getFieldPointer("phi", location::cpu, position::boundary);
+    double *hostBoundaryRho = dfDataBase.getFieldPointer("rho", location::cpu, position::boundary);
+    memcpy(hostPhi, &phi[0], dfDataBase.surface_value_bytes);
+    memcpy(hostRho, &rho[0], dfDataBase.cell_value_bytes);
+
+    // Patches are packed back to back. A processor/processorCyclic patch occupies two slots of
+    // nFaces values: the patch values, then the patch-internal rho (phi's second slot is zeroed).
+    int cursor = 0;
+    forAll(rho.boundaryField(), patchi)
+    {
+        const fvPatchScalarField& rhoPatch = rho.boundaryField()[patchi];
+        const fvsPatchScalarField& phiPatch = phi.boundaryField()[patchi];
+        constructBoundarySelectorPerPatch(&(patchTypes[patchi]), rhoPatch.type());
+        const int nFaces = rhoPatch.size();
+        const size_t slotBytes = nFaces * sizeof(double);
+        const bool isProcPatch = rhoPatch.type() == "processor" || rhoPatch.type() == "processorCyclic";
+        if (!isProcPatch) {
+            // ordinary patch: one slot per field
+            memcpy(hostBoundaryRho + cursor, &rhoPatch[0], slotBytes);
+            memcpy(hostBoundaryPhi + cursor, &phiPatch[0], slotBytes);
+            cursor += nFaces;
+            continue;
+        }
+
+        const auto& procRho = dynamic_cast<const processorFvPatchField<scalar>&>(rhoPatch);
+        Info << (procRho.doTransform() ? "gradU transform = true" : "gradU transform = false") << endl;
+        Info << "rank = " << procRho.rank() << endl;
+
+        memcpy(hostBoundaryRho + cursor, &rhoPatch[0], slotBytes);
+        scalarField rhoInternal = procRho.patchInternalField()();
+        memcpy(hostBoundaryRho + cursor + nFaces, &rhoInternal[0], slotBytes);
+
+        memcpy(hostBoundaryPhi + cursor, &phiPatch[0], slotBytes);
+        memset(hostBoundaryPhi + cursor + nFaces, 0, slotBytes);
+        cursor += 2 * nFaces;
+    }
+    rhoEqn_GPU.setConstantValues();
+    rhoEqn_GPU.setConstantFields(patchTypes);
+    rhoEqn_GPU.initNonConstantFields(hostRho, hostPhi, hostBoundaryRho, hostBoundaryPhi);
+    rhoEqn_GPU.createNonConstantLduAndCsrFields();
+}
+
+// Configures UEqn_GPU and uploads the initial velocity (internal + boundary values) through dfDataBase's host buffers.
+void createGPUUEqn(const IOdictionary& CanteraTorchProperties, const volVectorField& U) {
+    // TODO need remove amgx solver setting
+    // AmgX mode string and the path of the UEqn solver configuration
+    string mode_string = "dDDI";
+    string settingPath = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("UEqnSettingPath", string(""));
+    UEqn_GPU.setConstantValues(mode_string, settingPath);
+
+    // one boundary-condition selector per patch, filled in the loop below
+    std::vector<int> patchTypes(dfDataBase.num_patches);
+
+    double *hostU = dfDataBase.getFieldPointer("u", location::cpu, position::internal);
+    double *hostBoundaryU = dfDataBase.getFieldPointer("u", location::cpu, position::boundary);
+    memcpy(hostU, &U[0][0], dfDataBase.cell_value_vec_bytes);
+
+    // `cursor` counts faces; every face stores three doubles, hence the factor 3 on each offset.
+    int cursor = 0;
+    forAll(U.boundaryField(), patchi)
+    {
+        const fvPatchVectorField& uPatch = U.boundaryField()[patchi];
+        constructBoundarySelectorPerPatch(&(patchTypes[patchi]), uPatch.type());
+        const int nFaces = uPatch.size();
+        const size_t slotBytes = nFaces * 3 * sizeof(double);
+        const bool isProcPatch = uPatch.type() == "processor" || uPatch.type() == "processorCyclic";
+
+        if (!isProcPatch) {
+            memcpy(hostBoundaryU + 3*cursor, &uPatch[0][0], slotBytes);
+            cursor += nFaces;
+            continue;
+        }
+
+        const auto& procU = dynamic_cast<const processorFvPatchField<vector>&>(uPatch);
+        Info << (procU.doTransform() ? "U transform = true" : "U transform = false") << endl;
+        Info << "rank = " << procU.rank() << endl;
+
+        // processor patches take two slots: patch values, then the patch-internal values
+        memcpy(hostBoundaryU + 3*cursor, &uPatch[0][0], slotBytes);
+        vectorField uInternal = procU.patchInternalField()();
+        memcpy(hostBoundaryU + 3*cursor + 3*nFaces, &uInternal[0][0], slotBytes);
+        cursor += 2 * nFaces;
+    }
+    UEqn_GPU.setConstantFields(patchTypes);
+
+    // create the non-constant fields, then pass in the staged velocity
+    UEqn_GPU.createNonConstantFieldsInternal();
+    UEqn_GPU.createNonConstantFieldsBoundary();
+    UEqn_GPU.createNonConstantLduAndCsrFields();
+
+    UEqn_GPU.initNonConstantFieldsInternal(hostU, hostBoundaryU);
+    UEqn_GPU.initNonConstantFieldsBoundary();
+}
+
+// Configures YEqn_GPU (unity Lewis numbers) and uploads every species mass fraction via dfDataBase.h_y / h_boundary_y.
+void createGPUYEqn(const IOdictionary& CanteraTorchProperties, PtrList<volScalarField>& Y, const int inertIndex) {
+    DEBUG_TRACE;
+    // TODO need remove amgx solver setting
+    // AmgX mode string and configuration path (this reads the "UEqnSettingPath" entry)
+    string mode_string = "dDDI";
+    string settingPath = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("UEqnSettingPath", string(""));
+    YEqn_GPU.setConstantValues(mode_string, settingPath, inertIndex);
+
+    // boundary-condition selectors come from the first species' patches
+    std::vector<int> patchTypes(dfDataBase.num_patches);
+    fprintf(stderr, "num_species: %d\n", dfDataBase.num_species);
+    const auto& firstSpeciesPatches = Y[0].boundaryField();
+    forAll(firstSpeciesPatches, patchi)
+    {
+        constructBoundarySelectorPerPatch(&(patchTypes[patchi]), firstSpeciesPatches[patchi].type());
+    }
+    // unity Lewis number for every species
+    std::vector<double> lewisNumbers(dfDataBase.num_species, 1.);
+    YEqn_GPU.setConstantFields(patchTypes, lewisNumbers);
+
+    // create the non-constant fields before the mass fractions are staged
+    YEqn_GPU.createNonConstantFieldsInternal();
+    YEqn_GPU.createNonConstantFieldsBoundary();
+    YEqn_GPU.createNonConstantLduAndCsrFields();
+
+    // species are stored one after another; `cursor` keeps running across species in the boundary buffer
+    int cursor = 0;
+    forAll(Y, speciesI) {
+        volScalarField& Yi = Y[speciesI];
+        memcpy(dfDataBase.h_y + speciesI * dfDataBase.num_cells, &Yi[0], dfDataBase.num_cells * sizeof(double));
+        forAll(Yi.boundaryField(), patchi) {
+            const fvPatchScalarField& yPatch = Yi.boundaryField()[patchi];
+            const int nFaces = yPatch.size();
+            const size_t slotBytes = nFaces * sizeof(double);
+            const bool isProcPatch = yPatch.type() == "processor" || yPatch.type() == "processorCyclic";
+            if (isProcPatch) {
+                // two slots: patch values, then the patch-internal values
+                scalarField yInternal = dynamic_cast<const processorFvPatchField<scalar>&>(yPatch).patchInternalField()();
+                memcpy(dfDataBase.h_boundary_y + cursor, &yPatch[0], slotBytes);
+                memcpy(dfDataBase.h_boundary_y + cursor + nFaces, &yInternal[0], slotBytes);
+            } else {
+                memcpy(dfDataBase.h_boundary_y + cursor, &yPatch[0], slotBytes);
+            }
+            cursor += isProcPatch ? 2 * nFaces : nFaces;
+        }
+    }
+    YEqn_GPU.initNonConstantFieldsInternal(dfDataBase.h_y);
+    YEqn_GPU.initNonConstantFieldsBoundary(dfDataBase.h_boundary_y);
+}
+
+// Configures EEqn_GPU and uploads the energy field he (internal + boundary values); K only supplies patch types.
+void createGPUEEqn(const IOdictionary& CanteraTorchProperties, volScalarField& he, volScalarField& K) {
+    DEBUG_TRACE;
+    // TODO need remove amgx solver setting
+    // AmgX mode string and configuration path (this reads the "UEqnSettingPath" entry)
+    string mode_string = "dDDI";
+    string settingPath = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("UEqnSettingPath", string(""));
+    EEqn_GPU.setConstantValues(mode_string, settingPath);
+
+    // boundary-condition selectors for he and K, patch by patch
+    std::vector<int> heTypes(dfDataBase.num_patches);
+    std::vector<int> kTypes(dfDataBase.num_patches);
+    forAll(he.boundaryField(), patchi)
+    {
+        constructBoundarySelectorPerPatch(&(heTypes[patchi]), he.boundaryField()[patchi].type());
+        constructBoundarySelectorPerPatch(&(kTypes[patchi]), K.boundaryField()[patchi].type());
+    }
+    EEqn_GPU.setConstantFields(heTypes, kTypes);
+
+    // create the non-constant fields before he is staged
+    EEqn_GPU.createNonConstantFieldsInternal();
+    EEqn_GPU.createNonConstantFieldsBoundary();
+    EEqn_GPU.createNonConstantLduAndCsrFields();
+
+    double *hostHe = dfDataBase.getFieldPointer("he", location::cpu, position::internal);
+    double *hostBoundaryHe = dfDataBase.getFieldPointer("he", location::cpu, position::boundary);
+    memcpy(hostHe, &he[0], dfDataBase.cell_value_bytes);
+    int cursor = 0;
+    forAll(he.boundaryField(), patchi)
+    {
+        const fvPatchScalarField& hePatch = he.boundaryField()[patchi];
+        const int nFaces = hePatch.size();
+        const size_t slotBytes = nFaces * sizeof(double);
+        const bool isProcPatch = hePatch.type() == "processor" || hePatch.type() == "processorCyclic";
+        if (isProcPatch) {
+            // two slots: patch values, then the patch-internal values
+            scalarField heInternal = dynamic_cast<const processorFvPatchField<scalar>&>(hePatch).patchInternalField()();
+            memcpy(hostBoundaryHe + cursor, &hePatch[0], slotBytes);
+            memcpy(hostBoundaryHe + cursor + nFaces, &heInternal[0], slotBytes);
+        } else {
+            memcpy(hostBoundaryHe + cursor, &hePatch[0], slotBytes);
+        }
+        cursor += isProcPatch ? 2 * nFaces : nFaces;
+    }
+    EEqn_GPU.initNonConstantFields(hostHe, hostBoundaryHe);
+}
+
+// Configures pEqn_GPU and uploads the pressure field p (internal + boundary values); U only supplies patch types.
+void createGPUpEqn(const IOdictionary& CanteraTorchProperties, volScalarField& p, const volVectorField& U) {
+    DEBUG_TRACE;
+    // TODO need remove amgx solver setting
+    // AmgX mode string and the path of the pEqn solver configuration
+    string mode_string = "dDDI";
+    string settingPath = CanteraTorchProperties.subDict("AmgxSettings").lookupOrDefault("pEqnSettingPath", string(""));
+    pEqn_GPU.setConstantValues(mode_string, settingPath);
+
+    // boundary-condition selectors for p and U, patch by patch
+    std::vector<int> pTypes(dfDataBase.num_patches);
+    std::vector<int> uTypes(dfDataBase.num_patches);
+
+    double *hostP = dfDataBase.getFieldPointer("p", location::cpu, position::internal);
+    double *hostBoundaryP = dfDataBase.getFieldPointer("p", location::cpu, position::boundary);
+    memcpy(hostP, &p[0], dfDataBase.cell_value_bytes);
+
+    int cursor = 0;
+    forAll(p.boundaryField(), patchi)
+    {
+        const fvPatchScalarField& pPatch = p.boundaryField()[patchi];
+        constructBoundarySelectorPerPatch(&(pTypes[patchi]), pPatch.type());
+        constructBoundarySelectorPerPatch(&(uTypes[patchi]), U.boundaryField()[patchi].type());
+        const int nFaces = pPatch.size();
+        const size_t slotBytes = nFaces * sizeof(double);
+        const bool isProcPatch = pPatch.type() == "processor" || pPatch.type() == "processorCyclic";
+
+        // every patch starts with its own values
+        memcpy(hostBoundaryP + cursor, &pPatch[0], slotBytes);
+        if (isProcPatch) {
+            // processor patches append the patch-internal values as a second slot
+            scalarField pInternal = dynamic_cast<const processorFvPatchField<scalar>&>(pPatch).patchInternalField()();
+            memcpy(hostBoundaryP + cursor + nFaces, &pInternal[0], slotBytes);
+        }
+        cursor += isProcPatch ? 2 * nFaces : nFaces;
+    }
+    pEqn_GPU.setConstantFields(uTypes, pTypes);
+    pEqn_GPU.initNonConstantFields(hostP, hostBoundaryP);
+
+    // The non-constant fields are created only after the initial p has been handed over.
+    // NOTE(review): confirm that pEqn_GPU really expects initNonConstantFields() before these calls.
+    pEqn_GPU.createNonConstantFieldsInternal();
+    pEqn_GPU.createNonConstantFieldsBoundary();
+    pEqn_GPU.createNonConstantLduAndCsrFields();
+}
+
+// Initialises thermo_GPU: mechanism constants, patch types of T, and the initial thermo fields (internal + boundary).
+void createGPUThermo(const IOdictionary& CanteraTorchProperties, volScalarField& T, volScalarField& he, 
+        const volScalarField& psi, const volScalarField& alpha, const volScalarField& mu,
+        const volScalarField& K, const volScalarField& dpdt, dfChemistryModel<basicThermo>* chemistry) {
+    DEBUG_TRACE;
+    // initialize dfThermo
+    string mechanismFile = CanteraTorchProperties.lookupOrDefault("CanteraMechanismFile", string(""));
+
+    thermo_GPU.setConstantValue(mechanismFile, dfDataBase.num_cells, dfDataBase.num_species);
+    init_const_coeff_ptr(thermo_GPU.nasa_coeffs, thermo_GPU.viscosity_coeffs, thermo_GPU.thermal_conductivity_coeffs, 
+            thermo_GPU.binary_diffusion_coeffs, thermo_GPU.molecular_weights);
+
+    // thermal variables in dataBase
+    // TODO: note that h_he & h_boundary_he are transfered to GPU in EEqn_GPU, too. We should delete one of them.
+    double *h_boundary_T = new double[dfDataBase.num_boundary_surfaces];
+    double *h_boundary_he = new double[dfDataBase.num_boundary_surfaces];
+    double *h_boundary_thermo_psi = new double[dfDataBase.num_boundary_surfaces];
+    double *h_boundary_thermo_alpha = new double[dfDataBase.num_boundary_surfaces];
+    double *h_boundary_mu = new double[dfDataBase.num_boundary_surfaces];
+    double *h_boundary_k = new double[dfDataBase.num_boundary_surfaces];
+    double *h_boundary_thermo_rhoD = new double[dfDataBase.num_boundary_surfaces * dfDataBase.num_species];
+    double *h_thermo_rhoD = new double[dfDataBase.num_cells * dfDataBase.num_species];
+
+    // initialize thermo boundary
+    std::vector<int> patch_type_T(dfDataBase.num_patches);
+    int offset = 0;
+    forAll(T.boundaryField(), patchi)
+    {
+        constructBoundarySelectorPerPatch(&(patch_type_T[patchi]), T.boundaryField()[patchi].type());
+        const fvPatchScalarField& patchT = T.boundaryField()[patchi];
+        const fvPatchScalarField& patchHe = he.boundaryField()[patchi];
+        const fvPatchScalarField& patchPsi = psi.boundaryField()[patchi];
+        const fvPatchScalarField& patchAlpha = alpha.boundaryField()[patchi];
+        const fvPatchScalarField& patchMu = mu.boundaryField()[patchi];
+        const fvPatchScalarField& patchK = K.boundaryField()[patchi];
+
+        int patchsize = patchT.size();
+        if (patchT.type() == "processor"
+            || patchT.type() == "processorCyclic") {
+            memcpy(h_boundary_T + offset, &patchT[0], patchsize * sizeof(double));
+            scalarField patchTInternal = 
+                    dynamic_cast<const processorFvPatchField<scalar>&>(patchT).patchInternalField()();
+            memcpy(h_boundary_T + offset + patchsize, &patchTInternal[0], patchsize * sizeof(double));
+
+            memcpy(h_boundary_he + offset, &patchHe[0], patchsize * sizeof(double));
+            scalarField patchHeInternal = 
+                    dynamic_cast<const processorFvPatchField<scalar>&>(patchHe).patchInternalField()();
+            memcpy(h_boundary_he + offset + patchsize, &patchHeInternal[0], patchsize * sizeof(double));
+
+            memcpy(h_boundary_thermo_psi + offset, &patchPsi[0], patchsize * sizeof(double));
+            scalarField patchPsiInternal = 
+                    dynamic_cast<const processorFvPatchField<scalar>&>(patchPsi).patchInternalField()();
+            memcpy(h_boundary_thermo_psi + offset + patchsize, &patchPsiInternal[0], patchsize * sizeof(double));
+
+            memcpy(h_boundary_thermo_alpha + offset, &patchAlpha[0], patchsize * sizeof(double));
+            scalarField patchAlphaInternal = 
+                    dynamic_cast<const processorFvPatchField<scalar>&>(patchAlpha).patchInternalField()();
+            memcpy(h_boundary_thermo_alpha + offset + patchsize, &patchAlphaInternal[0], patchsize * sizeof(double));
+
+            memcpy(h_boundary_mu + offset, &patchMu[0], patchsize * sizeof(double));
+            scalarField patchMuInternal = 
+                    dynamic_cast<const processorFvPatchField<scalar>&>(patchMu).patchInternalField()();
+            memcpy(h_boundary_mu + offset + patchsize, &patchMuInternal[0], patchsize * sizeof(double));
+
+            memcpy(h_boundary_k + offset, &patchK[0], patchsize * sizeof(double));
+            scalarField patchKInternal = 
+                    dynamic_cast<const processorFvPatchField<scalar>&>(patchK).patchInternalField()();
+            memcpy(h_boundary_k + offset + patchsize, &patchKInternal[0], patchsize * sizeof(double));
+
+            for (int i = 0; i < dfDataBase.num_species; i++) {
+                const fvPatchScalarField& patchRhoD = chemistry->rhoD(i).boundaryField()[patchi];
+                memcpy(h_boundary_thermo_rhoD + i * dfDataBase.num_boundary_surfaces + offset, &patchRhoD[0], patchsize * sizeof(double));
+                scalarField patchRhoDInternal = 
+                        dynamic_cast<const processorFvPatchField<scalar>&>(patchRhoD).patchInternalField()();
+                memcpy(h_boundary_thermo_rhoD + i * dfDataBase.num_boundary_surfaces + offset + patchsize, &patchRhoDInternal[0], patchsize * sizeof(double));
+            }
+
+            offset += patchsize * 2;
+        } else {
+            memcpy(h_boundary_T + offset, &patchT[0], patchsize * sizeof(double));
+            memcpy(h_boundary_he + offset, &patchHe[0], patchsize * sizeof(double));
+            memcpy(h_boundary_thermo_psi + offset, &patchPsi[0], patchsize * sizeof(double));
+            memcpy(h_boundary_thermo_alpha + offset, &patchAlpha[0], patchsize * sizeof(double));
+            memcpy(h_boundary_mu + offset, &patchMu[0], patchsize * sizeof(double));
+            memcpy(h_boundary_k + offset, &patchK[0], patchsize * sizeof(double));
+
+            for (int i = 0; i < dfDataBase.num_species; i++) {
+                const fvPatchScalarField& patchRhoD = chemistry->rhoD(i).boundaryField()[patchi];
+                memcpy(h_boundary_thermo_rhoD + i * dfDataBase.num_boundary_surfaces + offset, &patchRhoD[0], patchsize * sizeof(double));
+            }
+            offset += patchsize;
+        }
+    }
+    for (int i = 0; i < dfDataBase.num_species; i++) {
+        memcpy(h_thermo_rhoD + i * dfDataBase.num_cells, &chemistry->rhoD(i)[0], dfDataBase.num_cells * sizeof(double));
+    }
+    double *h_T = dfDataBase.getFieldPointer("T", location::cpu, position::internal);
+    memcpy(h_T, &T[0], dfDataBase.cell_value_bytes);
+
+    thermo_GPU.setConstantFields(patch_type_T);
+    thermo_GPU.initNonConstantFields(h_T, &he[0], &psi[0], &alpha[0], &mu[0], &K[0], &dpdt[0], h_thermo_rhoD,
+            h_boundary_T, h_boundary_he, h_boundary_thermo_psi, h_boundary_thermo_alpha, h_boundary_mu, h_boundary_k, h_boundary_thermo_rhoD);
+
+    delete[] h_boundary_T; // arrays obtained from new[] must be released with delete[]
+    delete[] h_boundary_he;
+    delete[] h_boundary_thermo_psi;
+    delete[] h_boundary_thermo_alpha;
+    delete[] h_boundary_mu;
+    delete[] h_boundary_k;
+    delete[] h_boundary_thermo_rhoD;
+    delete[] h_thermo_rhoD;
+}
diff --git a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
index da81dc9ea..6191067e4 100644
--- a/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
+++ b/applications/solvers/dfLowMachFoam/dfLowMachFoam.C
@@ -32,7 +32,7 @@ Description
     pseudo-transient simulations.
 
 \*---------------------------------------------------------------------------*/
-
+#include "stdlib.h"
 #include "dfChemistryModel.H"
 #include "CanteraMixture.H"
 // #include "hePsiThermo.H"
@@ -60,14 +60,55 @@ Description
 #include "basicThermo.H"
 #include "CombustionModel.H"
 
-#ifdef GPUSolver_
-#include "dfUEqn.H"
-#include "dfYEqn.H"
-#include "dfRhoEqn.H"
-#include "dfEEqn.H"
-#include <cuda_runtime.h>
-#include <thread>
-#include "upwind.H"
+// #define GPUSolverNew_
+// #define TIME
+// #define DEBUG_
+// #define SHOW_MEMINFO
+
+#ifdef GPUSolverNew_
+    #include "dfMatrixDataBase.H"
+    #include "AmgXSolver.H"
+    #include "dfUEqn.H"
+    #include "dfYEqn.H"
+    #include "dfRhoEqn.H"
+    #include "dfEEqn.H"
+    #include "dfpEqn.H"
+    #include "dfMatrixOpBase.H"
+    #include "dfNcclBase.H"
+    #include "dfThermo.H"
+    #include "dfChemistrySolver.H"
+    #include <cuda_runtime.h>
+    #include <thread>
+
+    #include "processorFvPatchField.H"
+    #include "cyclicFvPatchField.H"
+    #include "processorCyclicFvPatchField.H"
+    #include "createGPUSolver.H"
+
+    #include "upwind.H"
+    #include "CanteraMixture.H"
+    #include "multivariateGaussConvectionScheme.H"
+    #include "limitedSurfaceInterpolationScheme.H"
+#else
+    #include "processorFvPatchField.H"
+    #include "cyclicFvPatchField.H"
+    #include "multivariateGaussConvectionScheme.H"
+    #include "limitedSurfaceInterpolationScheme.H"
+    int myRank = -1;
+    int mpi_init_flag = 0;
+#endif
+
+int offset; // file-scope cursor reused by the solver fragments when packing boundary patches into flat buffers
+
+#ifdef TIME // timing helpers; they use the start_new/stop_new clock_t variables declared in main()
+    #define TICK_START \
+        start_new = std::clock(); 
+    #define TICK_STOP(prefix) \
+        stop_new = std::clock(); \
+        Foam::Info << #prefix << " time = " << double(stop_new - start_new) / double(CLOCKS_PER_SEC) << " s" << Foam::endl;
+#else // TIME not defined: both macros expand to nothing
+    #define TICK_START
+    #define TICK_STOP(prefix)
 #endif
 
 // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * //
@@ -148,6 +189,8 @@ int main(int argc, char *argv[])
 
     label timeIndex = 0;
     clock_t start, end, start1, end1, start2, end2;
+    clock_t start_new, stop_new;
+    double time_new = 0;
 
     turbulence->validate();
 
@@ -158,9 +201,32 @@ int main(int argc, char *argv[])
     }
 
     start1 = std::clock();
-    #ifdef GPUSolver_
-    #include "createdfSolver.H"
-    #endif
+#ifdef GPUSolverNew_
+    int mpi_init_flag; // written by MPI_Initialized() on the next line
+    checkMpiErrors(MPI_Initialized(&mpi_init_flag));
+    if(mpi_init_flag) { // NCCL is only initialised when MPI itself has been initialised
+        initNccl();
+    }
+    createGPUBase(CanteraTorchProperties, mesh, Y);
+    DEBUG_TRACE;
+#endif
+
+#ifdef GPUSolverNew_ // one create* call per GPU object; presumably these helpers come from createGPUSolver.H -- confirm
+    createGPUUEqn(CanteraTorchProperties, U);
+    createGPUYEqn(CanteraTorchProperties, Y, inertIndex);
+    createGPUEEqn(CanteraTorchProperties, thermo.he(), K);
+    createGPUpEqn(CanteraTorchProperties, p, U);
+    createGPURhoEqn(rho, phi);
+
+    const volScalarField& mu = thermo.mu(); // mu and alpha stay in scope: the time loop's DEBUG_ checks read them too
+    const volScalarField& alpha = thermo.alpha();
+    createGPUThermo(CanteraTorchProperties, T, thermo.he(), psi, alpha, mu, K, dpdt, chemistry);
+    if (chemistry->ifChemstry())
+    {
+        chemistrySolver_GPU.setConstantValue(dfDataBase.num_cells, dfDataBase.num_species, 4096); // NOTE(review): meaning of 4096 is not visible here -- confirm
+    }
+#endif
+
     end1 = std::clock();
     time_monitor_init += double(end1 - start1) / double(CLOCKS_PER_SEC);
 
@@ -187,7 +253,11 @@ int main(int argc, char *argv[])
         runTime++;
 
         Info<< "Time = " << runTime.timeName() << nl << endl;
-
+        
+        // store old time fields
+#ifdef GPUSolverNew_
+        dfDataBase.preTimeStep();
+#endif
         clock_t loop_start = std::clock();
         // --- Pressure-velocity PIMPLE corrector loop
         while (pimple.loop())
@@ -246,21 +316,142 @@ int main(int argc, char *argv[])
                 end = std::clock();
                 time_monitor_E += double(end - start) / double(CLOCKS_PER_SEC);
 
-                start = std::clock();
+            start = std::clock();
+            #ifdef GPUSolverNew_
+                thermo_GPU.correctThermo();
+                thermo_GPU.sync();
+            #if defined DEBUG_
+                // check correctThermo
+                int speciesIndex = 6;
+                chemistry->correctThermo(); // reference
+                double *h_boundary_T_tmp = new double[dfDataBase.num_boundary_surfaces];
+                double *h_boundary_he_tmp = new double[dfDataBase.num_boundary_surfaces];
+                double *h_boundary_mu_tmp = new double[dfDataBase.num_boundary_surfaces];
+                double *h_boundary_rho_tmp = new double[dfDataBase.num_boundary_surfaces];
+                double *h_boundary_thermo_alpha_tmp = new double[dfDataBase.num_boundary_surfaces];    
+                double *h_boundary_thermo_psi_tmp = new double[dfDataBase.num_boundary_surfaces];
+                double *h_boundary_thermo_rhoD_tmp = new double[dfDataBase.num_boundary_surfaces];
+                offset = 0;
+                forAll(T.boundaryField(), patchi)
+                {
+                    const fvPatchScalarField& patchHe = thermo.he().boundaryField()[patchi];
+                    const fvPatchScalarField& patchMu = mu.boundaryField()[patchi];
+                    const fvPatchScalarField& patchPsi = psi.boundaryField()[patchi];
+                    const fvPatchScalarField& patchAlpha = alpha.boundaryField()[patchi];
+                    const fvPatchScalarField& patchRho = thermo.rho()().boundaryField()[patchi];
+                    const fvPatchScalarField& patchT = T.boundaryField()[patchi];
+                    const fvPatchScalarField& patchRhoD = chemistry->rhoD(speciesIndex).boundaryField()[patchi];
+
+                    int patchsize = patchT.size();
+                    if (patchT.type() == "processor") {
+                        memcpy(h_boundary_T_tmp + offset, &patchT[0], patchsize * sizeof(double));
+                        scalarField patchTInternal = 
+                                dynamic_cast<const processorFvPatchField<scalar>&>(patchT).patchInternalField()();
+                        memcpy(h_boundary_T_tmp + offset + patchsize, &patchTInternal[0], patchsize * sizeof(double));
+
+                        memcpy(h_boundary_he_tmp + offset, &patchHe[0], patchsize * sizeof(double));
+                        scalarField patchHeInternal = 
+                                dynamic_cast<const processorFvPatchField<scalar>&>(patchHe).patchInternalField()();
+                        memcpy(h_boundary_he_tmp + offset + patchsize, &patchHeInternal[0], patchsize * sizeof(double));
+
+                        memcpy(h_boundary_thermo_psi_tmp + offset, &patchPsi[0], patchsize * sizeof(double));
+                        scalarField patchPsiInternal = 
+                                dynamic_cast<const processorFvPatchField<scalar>&>(patchPsi).patchInternalField()();
+                        memcpy(h_boundary_thermo_psi_tmp + offset + patchsize, &patchPsiInternal[0], patchsize * sizeof(double));
+
+                        memcpy(h_boundary_thermo_alpha_tmp + offset, &patchAlpha[0], patchsize * sizeof(double));
+                        scalarField patchAlphaInternal = 
+                                dynamic_cast<const processorFvPatchField<scalar>&>(patchAlpha).patchInternalField()();
+                        memcpy(h_boundary_thermo_alpha_tmp + offset + patchsize, &patchAlphaInternal[0], patchsize * sizeof(double));
+
+                        memcpy(h_boundary_mu_tmp + offset, &patchMu[0], patchsize * sizeof(double));
+                        scalarField patchMuInternal = 
+                                dynamic_cast<const processorFvPatchField<scalar>&>(patchMu).patchInternalField()();
+                        memcpy(h_boundary_mu_tmp + offset + patchsize, &patchMuInternal[0], patchsize * sizeof(double));
+
+                        memcpy(h_boundary_rho_tmp + offset, &patchRho[0], patchsize * sizeof(double));
+                        scalarField patchRhoInternal = 
+                                dynamic_cast<const processorFvPatchField<scalar>&>(patchRho).patchInternalField()();
+                        memcpy(h_boundary_rho_tmp + offset + patchsize, &patchRhoInternal[0], patchsize * sizeof(double));
+
+                        memcpy(h_boundary_thermo_rhoD_tmp + offset, &patchRhoD[0], patchsize * sizeof(double));
+                        scalarField patchRhoDInternal = 
+                                dynamic_cast<const processorFvPatchField<scalar>&>(patchRhoD).patchInternalField()();
+                        memcpy(h_boundary_thermo_rhoD_tmp + offset + patchsize, &patchRhoDInternal[0], patchsize * sizeof(double));
+
+                        offset += patchsize * 2;
+                    } else {
+                        memcpy(h_boundary_T_tmp + offset, &patchT[0], patchsize * sizeof(double));
+                        memcpy(h_boundary_he_tmp + offset, &patchHe[0], patchsize * sizeof(double));
+                        memcpy(h_boundary_thermo_psi_tmp + offset, &patchPsi[0], patchsize * sizeof(double));
+                        memcpy(h_boundary_thermo_alpha_tmp + offset, &patchAlpha[0], patchsize * sizeof(double));
+                        memcpy(h_boundary_mu_tmp + offset, &patchMu[0], patchsize * sizeof(double));
+                        memcpy(h_boundary_rho_tmp + offset, &patchRho[0], patchsize * sizeof(double));
+                        memcpy(h_boundary_thermo_rhoD_tmp + offset, &patchRhoD[0], patchsize * sizeof(double));
+
+                        offset += patchsize;
+                    }
+                }
+                bool printFlag = false;
+                int rank = -1;
+                if (mpi_init_flag) {
+                    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+                }
+                if (!mpi_init_flag || rank == 0) {
+                    // thermo_GPU.compareT(&T[0], h_boundary_T_tmp, printFlag);
+                    // thermo_GPU.compareHe(&thermo.he()[0], h_boundary_he_tmp, printFlag);
+                    // thermo_GPU.comparePsi(&psi[0], h_boundary_thermo_psi_tmp, printFlag);
+                    // thermo_GPU.compareAlpha(&alpha[0], h_boundary_thermo_alpha_tmp, printFlag);
+                    // thermo_GPU.compareMu(&mu[0], h_boundary_mu_tmp, printFlag);
+                    // thermo_GPU.compareRho(&thermo.rho()()[0], h_boundary_rho_tmp, printFlag);
+                    // thermo_GPU.compareRhoD(&chemistry->rhoD(speciesIndex)[0], h_boundary_thermo_rhoD_tmp, speciesIndex, printFlag);
+                }
+                delete[] h_boundary_T_tmp; // all seven buffers were allocated with new double[...]
+                delete[] h_boundary_he_tmp;
+                delete[] h_boundary_thermo_psi_tmp;
+                delete[] h_boundary_thermo_alpha_tmp;
+                delete[] h_boundary_mu_tmp;
+                delete[] h_boundary_rho_tmp;
+                delete[] h_boundary_thermo_rhoD_tmp; // previously never released (leaked every iteration)
+            #endif
+            #else
                 chemistry->correctThermo();
-                end = std::clock();
-                time_monitor_chemistry_correctThermo += double(end - start) / double(CLOCKS_PER_SEC);
+            #endif
+            end = std::clock();
+            time_monitor_chemistry_correctThermo += double(end - start) / double(CLOCKS_PER_SEC);
             }
             else
             {
                 combustion->correct();
             }
+            // update T for debug: copy the GPU temperature into the CPU field T (read by the min/max(T) report below)
+            #ifdef GPUSolverNew_
+            double *h_T = dfDataBase.getFieldPointer("T", location::cpu, position::internal);
+            double *h_boundary_T_tmp = new double[dfDataBase.num_boundary_surfaces];
+            thermo_GPU.updateCPUT(h_T, h_boundary_T_tmp);
+            memcpy(&T[0], h_T, T.size() * sizeof(double));
+            offset = 0;
+            forAll(T.boundaryField(), patchi) {
+                const fvPatchScalarField& const_patchT = T.boundaryField()[patchi];
+                fvPatchScalarField& patchT = const_cast<fvPatchScalarField&>(const_patchT);
+                const int patchsize = patchT.size();
+                // Only the first patchsize values of a patch's slot are copied back.
+                memcpy(&patchT[0], h_boundary_T_tmp + offset, patchsize * sizeof(double));
+                // "processor" patches occupy 2 * patchsize entries in the packed buffer,
+                // so the cursor skips the second half, which is not copied here.
+                // NOTE(review): "processorCyclic" is not treated as doubled here, unlike
+                // pEqn_GPU.H -- confirm against the layout written by updateCPUT.
+                offset += (patchT.type() == "processor") ? patchsize * 2 : patchsize;
+            }
+            delete[] h_boundary_T_tmp; // allocated with new[], so delete[] is required
+            #endif
 
             Info<< "min/max(T) = " << min(T).value() << ", " << max(T).value() << endl;
 
             // --- Pressure corrector loop
 
             start = std::clock();
+            int num_pimple_loop = pimple.nCorrPimple();
             while (pimple.correct())
             {
                 if (pimple.consistent())
@@ -269,8 +460,15 @@ int main(int argc, char *argv[])
                 }
                 else
                 {
-                    #include "pEqn.H"
+
+                #if defined GPUSolverNew_
+                    #include "pEqn_GPU.H"
+                #else
+                    #include "pEqn_CPU.H"
+                #endif
+                
                 }
+                num_pimple_loop --;
             }
             end = std::clock();
             time_monitor_p += double(end - start) / double(CLOCKS_PER_SEC);
@@ -286,7 +484,22 @@ int main(int argc, char *argv[])
         clock_t loop_end = std::clock();
         double loop_time = double(loop_end - loop_start) / double(CLOCKS_PER_SEC);
 
+#ifdef GPUSolverNew_
+        thermo_GPU.updateRho();
+        dfDataBase.postTimeStep();
+#if defined DEBUG_
         rho = thermo.rho();
+#endif
+#else
+        rho = thermo.rho();
+#endif
+
+#ifdef GPUSolverNew_
+        // write U: copy dfDataBase.h_u into the CPU field U ahead of runTime.write()
+        UEqn_GPU.postProcess(); // presumably fills dfDataBase.h_u from the GPU -- confirm
+        memcpy(&U[0][0], dfDataBase.h_u, dfDataBase.cell_value_vec_bytes);
+        U.correctBoundaryConditions();
+#endif
 
         runTime.write();
         Info<< "========Time Spent in diffenet parts========"<< endl;
@@ -349,6 +562,19 @@ int main(int argc, char *argv[])
         Info<< "ExecutionTime = " << runTime.elapsedCpuTime() << " s"
             << "  ClockTime = " << runTime.elapsedClockTime() << " s" << endl;
 
+#ifdef GPUSolverNew_
+#ifdef SHOW_MEMINFO
+        int rank = -1;
+        if (mpi_init_flag) {
+            MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+        }
+        if (!mpi_init_flag || rank == 0) { // print once: serial run, or rank 0 of an MPI run
+            fprintf(stderr, "show memory info...\n");
+            // Report (rather than silently ignore) a failure to run nvidia-smi.
+            if (system("nvidia-smi") != 0) fprintf(stderr, "nvidia-smi failed\n");
+        }
+#endif
+#endif
         time_monitor_other = 0;
         time_monitor_rho = 0;
         time_monitor_U = 0;
@@ -426,6 +652,19 @@ int main(int argc, char *argv[])
 #endif
     }
 
+#ifdef GPUSolverNew_
+    // clean cuda resources before main() exit.
+    // the destruct order should be reversed from the creation order
+    pEqn_GPU.cleanCudaResources();
+    EEqn_GPU.cleanCudaResources();
+    YEqn_GPU.cleanCudaResources();
+    UEqn_GPU.cleanCudaResources();
+    rhoEqn_GPU.cleanCudaResources(); // NOTE(review): createGPURhoEqn/createGPUThermo ran after createGPUpEqn, so this is not a strict reverse of the creation order -- confirm it does not matter
+    thermo_GPU.cleanCudaResources();
+    dfDataBase.resetAmgxSolvers();
+    dfDataBase.cleanCudaResources(); // the data base is released last, after every equation object
+#endif
+
     Info<< "End\n" << endl;
 
     return 0;
diff --git a/applications/solvers/dfLowMachFoam/pEqn.H b/applications/solvers/dfLowMachFoam/pEqn.H
deleted file mode 100644
index 34925327f..000000000
--- a/applications/solvers/dfLowMachFoam/pEqn.H
+++ /dev/null
@@ -1,203 +0,0 @@
-if (!pimple.simpleRho())
-{
-    rho = thermo.rho();
-}
-
-// Thermodynamic density needs to be updated by psi*d(p) after the
-// pressure solution
-const volScalarField psip0(psi*p);
-
-#ifdef GPUSolver_
-    // UEqn.H()
-    start1 = std::clock();
-    volVectorField UEqn_H
-    (
-        IOobject
-        (
-            "H("+U.name()+')',
-            runTime.timeName(),
-            mesh,
-            IOobject::NO_READ,
-            IOobject::NO_WRITE
-        ),
-        mesh,
-        dimensionedVector(dimensionSet(1,-2,-2,0,0,0,0), Zero),
-        extrapolatedCalculatedFvPatchScalarField::typeName
-    );
-    UEqn_GPU.H(&UEqn_H[0][0]);
-    end1 = std::clock();
-    time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    time_monitor_UEqn_H += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    time_monitor_UEqn_H_GPU_run += double(end1 - start1) / double(CLOCKS_PER_SEC);
-
-    start1 = std::clock();
-    UEqn_H.correctBoundaryConditions();
-    end1 = std::clock();
-    time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    time_monitor_UEqn_H += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    time_monitor_UEqn_H_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC);
-
-    // UEqn.A()
-    start1 = std::clock();
-    volScalarField UEqn_A
-    (
-        IOobject
-        (
-            "A("+U.name()+')',
-            runTime.timeName(),
-            mesh,
-            IOobject::NO_READ,
-            IOobject::NO_WRITE
-        ),
-        mesh,
-        dimensionedScalar(dimensionSet(1,-3,-1,0,0,0,0), Zero),
-        extrapolatedCalculatedFvPatchScalarField::typeName
-    );
-    UEqn_GPU.A(&UEqn_A[0]);
-    end1 = std::clock();
-    time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    time_monitor_UEqn_A += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    time_monitor_UEqn_A_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC);
-
-    start1 = std::clock();
-    UEqn_A.correctBoundaryConditions();
-    end1 = std::clock();
-    time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    time_monitor_UEqn_A += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    time_monitor_UEqn_A_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC);
-#endif
-
-start2 = std::clock();
-#ifdef GPUSolver_
-    volScalarField rAU(1.0/UEqn_A);
-    surfaceScalarField rhorAUf("rhorAUf", fvc::interpolate(rho*rAU));
-    volVectorField HbyA(constrainHbyA(rAU*UEqn_H, U, p));
-#else
-    volScalarField rAU(1.0/UEqn.A());
-    surfaceScalarField rhorAUf("rhorAUf", fvc::interpolate(rho*rAU));
-    volVectorField HbyA(constrainHbyA(rAU*UEqn.H(), U, p));
-
-    if (pimple.nCorrPiso() <= 1)
-    {
-        tUEqn.clear();
-    }
-#endif
-
-surfaceScalarField phiHbyA
-(
-    "phiHbyA",
-    fvc::interpolate(rho)*fvc::flux(HbyA)
-  + rhorAUf*fvc::ddtCorr(rho, U, phi, rhoUf)
-);
-
-fvc::makeRelative(phiHbyA, rho, U);
-
-// Update the pressure BCs to ensure flux consistency
-constrainPressure(p, rho, U, phiHbyA, rhorAUf);
-
-if (pimple.transonic())
-{
-    surfaceScalarField phid
-    (
-        "phid",
-        (fvc::interpolate(psi)/fvc::interpolate(rho))*phiHbyA
-    );
-
-    phiHbyA -= fvc::interpolate(psi*p)*phiHbyA/fvc::interpolate(rho);
-
-    fvScalarMatrix pDDtEqn
-    (
-        fvc::ddt(rho) + psi*correction(fvm::ddt(p))
-      + fvc::div(phiHbyA) + fvm::div(phid, p)
-    );
-
-    while (pimple.correctNonOrthogonal())
-    {
-        fvScalarMatrix pEqn(pDDtEqn - fvm::laplacian(rhorAUf, p));
-
-        // Relax the pressure equation to ensure diagonal-dominance
-        pEqn.relax();
-
-        start1 = std::clock();
-        pEqn.solve();
-        end1 = std::clock();
-        time_monitor_pEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
-
-        if (pimple.finalNonOrthogonalIter())
-        {
-            phi = phiHbyA + pEqn.flux();
-        }
-    }
-}
-else
-{
-    fvScalarMatrix pDDtEqn
-    (
-        fvc::ddt(rho) + psi*correction(fvm::ddt(p))
-      + fvc::div(phiHbyA)
-    );
-
-    while (pimple.correctNonOrthogonal())
-    {
-        fvScalarMatrix pEqn(pDDtEqn - fvm::laplacian(rhorAUf, p));
-
-        start1 = std::clock();
-        pEqn.solve();
-        end1 = std::clock();
-        time_monitor_pEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
-
-        if (pimple.finalNonOrthogonalIter())
-        {
-            phi = phiHbyA + pEqn.flux();
-        }
-    }
-}
-
-bool limitedp = pressureControl.limit(p);
-
-// Thermodynamic density update
-thermo.correctRho(psi*p - psip0);
-
-if (limitedp)
-{
-    rho = thermo.rho();
-}
-
-#include "rhoEqn.H"
-#include "compressibleContinuityErrs.H"
-
-// Explicitly relax pressure for momentum corrector
-p.relax();
-
-U = HbyA - rAU*fvc::grad(p);
-U.correctBoundaryConditions();
-K = 0.5*magSqr(U);
-
-#ifdef GPUSolver_
-    start1 = std::clock();
-    UEqn_GPU.correctPsi(&U[0][0]);
-    end1 = std::clock();
-    time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    time_monitor_UEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC);
-#endif
-
-if (pimple.simpleRho())
-{
-    rho = thermo.rho();
-}
-
-// Correct rhoUf if the mesh is moving
-fvc::correctRhoUf(rhoUf, rho, U, phi);
-
-if (thermo.dpdt())
-{
-    dpdt = fvc::ddt(p);
-
-    if (mesh.moving())
-    {
-        dpdt -= fvc::div(fvc::meshPhi(rho, U), p);
-    }
-}
-end2 = std::clock();
-time_monitor_pEqn += double(end2 - start2) / double(CLOCKS_PER_SEC);
-
diff --git a/applications/solvers/dfLowMachFoam/pEqn_CPU.H b/applications/solvers/dfLowMachFoam/pEqn_CPU.H
new file mode 100644
index 000000000..aa379dde7
--- /dev/null
+++ b/applications/solvers/dfLowMachFoam/pEqn_CPU.H
@@ -0,0 +1,129 @@
+if (!pimple.simpleRho())
+{
+    rho = thermo.rho();
+}
+
+// Thermodynamic density needs to be updated by psi*d(p) after the
+// pressure solution
+const volScalarField psip0(psi*p);
+
+volScalarField rAU(1.0/UEqn.A());
+surfaceScalarField rhorAUf("rhorAUf", fvc::interpolate(rho*rAU));
+volVectorField HbyA(constrainHbyA(rAU*UEqn.H(), U, p));
+
+if (pimple.nCorrPiso() <= 1)
+{
+    tUEqn.clear();
+}
+
+surfaceScalarField phiHbyA
+(
+    "phiHbyA",
+    fvc::interpolate(rho)*fvc::flux(HbyA)
+  + rhorAUf*fvc::ddtCorr(rho, U, phi, rhoUf)
+);
+
+fvc::makeRelative(phiHbyA, rho, U);
+
+int flag_mpi_init = 0; // MPI_Initialized() expects int*; Foam::label can be a 64-bit type
+MPI_Initialized(&flag_mpi_init);
+if(flag_mpi_init) MPI_Barrier(PstreamGlobals::MPI_COMM_FOAM);
+
+// Update the pressure BCs to ensure flux consistency
+constrainPressure(p, rho, U, phiHbyA, rhorAUf);
+//start = std::clock();
+if (pimple.transonic())
+{
+    surfaceScalarField phid
+    (
+        "phid",
+        (fvc::interpolate(psi)/fvc::interpolate(rho))*phiHbyA
+    );
+
+    phiHbyA -= fvc::interpolate(psi*p)*phiHbyA/fvc::interpolate(rho);
+
+    fvScalarMatrix pDDtEqn
+    (
+        fvc::ddt(rho) + psi*correction(fvm::ddt(p))
+      + fvc::div(phiHbyA) + fvm::div(phid, p)
+    );
+
+    while (pimple.correctNonOrthogonal())
+    {
+        fvScalarMatrix pEqn(pDDtEqn - fvm::laplacian(rhorAUf, p));
+
+        // Relax the pressure equation to ensure diagonal-dominance
+        pEqn.relax();
+
+        pEqn.solve();
+
+        if (pimple.finalNonOrthogonalIter())
+        {
+            phi = phiHbyA + pEqn.flux();
+        }
+    }
+}
+else
+{
+    fvScalarMatrix pDDtEqn
+    (
+        fvc::ddt(rho) + 
+        psi*correction(fvm::ddt(p))
+        + fvc::div(phiHbyA)
+    );
+
+    while (pimple.correctNonOrthogonal())
+    {
+        fvScalarMatrix pEqn(pDDtEqn - fvm::laplacian(rhorAUf, p));
+        
+ 
+        pEqn.solve();
+
+        
+        if (pimple.finalNonOrthogonalIter())
+        {
+            phi = phiHbyA + pEqn.flux();
+        }
+    }
+
+}
+
+
+bool limitedp = pressureControl.limit(p);
+
+// Thermodynamic density update
+thermo.correctRho(psi*p - psip0);
+
+
+if (limitedp)
+{
+    rho = thermo.rho();
+}
+
+#include "rhoEqn.H"
+#include "compressibleContinuityErrs.H"
+
+// Explicitly relax pressure for momentum corrector
+// p.relax(); // NOTE(review): disabled here, but active in the pEqn.H this file replaces -- confirm intent
+
+U = HbyA - rAU*fvc::grad(p); // velocity from HbyA and the pressure gradient
+U.correctBoundaryConditions();
+K = 0.5*magSqr(U); // recompute K from the updated U
+
+if (pimple.simpleRho())
+{
+    rho = thermo.rho();
+}
+
+// Correct rhoUf if the mesh is moving
+fvc::correctRhoUf(rhoUf, rho, U, phi);
+
+if (thermo.dpdt())
+{
+    dpdt = fvc::ddt(p);
+
+    if (mesh.moving())
+    {
+        dpdt -= fvc::div(fvc::meshPhi(rho, U), p);
+    }
+}
\ No newline at end of file
diff --git a/applications/solvers/dfLowMachFoam/pEqn_GPU.H b/applications/solvers/dfLowMachFoam/pEqn_GPU.H
new file mode 100644
index 000000000..3dd060955
--- /dev/null
+++ b/applications/solvers/dfLowMachFoam/pEqn_GPU.H
@@ -0,0 +1,225 @@
+thermo_GPU.updateRho();
+
+// Thermodynamic density needs to be updated by psi*d(p) after the
+// pressure solution
+thermo_GPU.psip0(); // presumably the GPU counterpart of "psip0(psi*p)" in the DEBUG_ reference below -- confirm
+
+UEqn_GPU.getHbyA();
+pEqn_GPU.process();
+UEqn_GPU.sync(); // NOTE(review): syncs UEqn_GPU although the preceding call is pEqn_GPU.process() -- confirm this also covers the pEqn work
+
+#if defined DEBUG_
+    if (!pimple.simpleRho())
+    {
+        rho = thermo.rho();
+    }
+
+    const volScalarField psip0(psi*p);
+
+    volScalarField rAU(1/UEqn.A());
+    surfaceScalarField rhorAUf("rhorAUf", fvc::interpolate(rho*rAU));
+    volVectorField HbyA(constrainHbyA(rAU*UEqn.H(), U, p));
+
+    double *h_boundary_rAU = new double[dfDataBase.num_boundary_surfaces];
+    double *h_boundary_rhorAUf = (double*)calloc(dfDataBase.num_boundary_surfaces, sizeof(double));
+    double *h_boundary_HbyA = new double[3 * dfDataBase.num_boundary_surfaces];
+    offset = 0;
+    forAll(rAU.boundaryField(), patchi)
+    {
+        const fvPatchScalarField& patchrAU = rAU.boundaryField()[patchi];
+        const fvPatchVectorField& patchHbyA = HbyA.boundaryField()[patchi];
+        const fvsPatchScalarField& patchrhorAUf = rhorAUf.boundaryField()[patchi];
+        int patchSize = patchrAU.size();
+
+        if (patchrAU.type() == "processor"
+            || patchrAU.type() == "processorCyclic") {
+            memcpy(h_boundary_rAU + offset, &patchrAU[0], patchSize*sizeof(double));
+            scalarField patchrAUInternal = 
+                    dynamic_cast<const processorFvPatchField<scalar>&>(patchrAU).patchInternalField()();
+            memcpy(h_boundary_rAU + offset + patchSize, &patchrAUInternal[0], patchSize*sizeof(double));
+
+            memcpy(h_boundary_rhorAUf + offset, &patchrhorAUf[0], patchSize*sizeof(double));
+
+            memcpy(h_boundary_HbyA + offset * 3, &patchHbyA[0][0], patchSize*3*sizeof(double));
+            vectorField patchHbyAInternal = 
+                    dynamic_cast<const processorFvPatchField<vector>&>(patchHbyA).patchInternalField()();
+            memcpy(h_boundary_HbyA + offset * 3 + patchSize * 3, &patchHbyAInternal[0][0], patchSize*3*sizeof(double));
+
+            offset += patchSize * 2;
+        } else {
+            memcpy(h_boundary_rAU + offset, &patchrAU[0], patchSize*sizeof(double));
+            memcpy(h_boundary_rhorAUf + offset, &patchrhorAUf[0], patchSize*sizeof(double));
+            memcpy(h_boundary_HbyA + offset * 3, &patchHbyA[0][0], patchSize*3*sizeof(double));
+            offset += patchSize;
+        }
+    }
+    if (!mpi_init_flag || rank == 0) {
+        // UEqn_GPU.compareHbyA(&HbyA[0][0], h_boundary_HbyA, false);
+        // UEqn_GPU.comparerAU(&rAU[0], h_boundary_rAU, false);
+        // pEqn_GPU.comparerhorAUf(&rhorAUf[0], h_boundary_rhorAUf, false);
+    }
+
+    delete[] h_boundary_rAU;     // allocated with new[]  -> delete[]
+    free(h_boundary_rhorAUf);    // allocated with calloc -> free
+    delete[] h_boundary_HbyA;    // allocated with new[]  -> delete[]
+#endif
+
+
+#if defined DEBUG_
+    surfaceScalarField phiHbyA
+    (
+        "phiHbyA",
+        fvc::interpolate(rho)*fvc::flux(HbyA)
+    + rhorAUf*fvc::ddtCorr(rho, U, phi, rhoUf)
+    );
+
+    // Pack the CPU reference phiHbyA boundary values into one flat buffer for comparison
+    // with the GPU result. calloc zero-fills, so slots that are skipped below stay 0.
+    double *h_boundary_phiHbyA = (double*)calloc(dfDataBase.num_boundary_surfaces, sizeof(double));
+    offset = 0;
+    forAll(phiHbyA.boundaryField(), patchi)
+    {
+        const fvsPatchScalarField& patchphiHbyA = phiHbyA.boundaryField()[patchi];
+        const int patchSize = patchphiHbyA.size();
+        const bool isProcessorPatch = patchphiHbyA.type() == "processor"
+            || patchphiHbyA.type() == "processorCyclic";
+        memcpy(h_boundary_phiHbyA + offset, &patchphiHbyA[0], patchSize*sizeof(double));
+        // processor patches reserve two slots per face; the second slot is left untouched
+        offset += isProcessorPatch ? 2 * patchSize : patchSize;
+    }
+    if (!mpi_init_flag || rank == 0) {
+        // pEqn_GPU.comparephiHbyA(&phiHbyA[0], h_boundary_phiHbyA, false);
+    }
+    // memory from calloc() must be released with free(), not delete
+    free(h_boundary_phiHbyA);
+#endif
+
+#if defined DEBUG_
+    fvScalarMatrix pDDtEqn
+    (
+        fvc::ddt(rho) + 
+        psi*correction(fvm::ddt(p))
+        + fvc::div(phiHbyA)
+    );
+    while (pimple.correctNonOrthogonal())
+    {
+        fvScalarMatrix pEqn(pDDtEqn - fvm::laplacian(rhorAUf, p));
+        pEqn.solve();
+        if (pimple.finalNonOrthogonalIter())
+        {
+            phi = phiHbyA + pEqn.flux();
+        }
+        thermo.correctRho(psi*p - psip0);
+        // compare pEqn
+        std::vector<double> h_internal_coeffs(dfDataBase.num_boundary_surfaces);
+        std::vector<double> h_boundary_coeffs(dfDataBase.num_boundary_surfaces);
+
+        offset = 0;
+        for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++)
+        {
+            const fvPatchScalarField& patchP = p.boundaryField()[patchi];
+            int patchsize = dfDataBase.patch_size[patchi];
+            const double* internal_coeff_ptr = &pEqn.internalCoeffs()[patchi][0];
+            const double* boundary_coeff_ptr = &pEqn.boundaryCoeffs()[patchi][0];
+            memcpy(h_internal_coeffs.data() + offset, internal_coeff_ptr, patchsize * sizeof(double));
+            memcpy(h_boundary_coeffs.data() + offset, boundary_coeff_ptr, patchsize * sizeof(double));
+            if (patchP.type() == "processor" || patchP.type() == "processorCyclic") offset += 2 * patchsize;
+            else offset += patchsize;
+        }
+        if (!mpi_init_flag || rank == 0) {
+            // pEqn_GPU.compareResult(&pEqn.lower()[0], &pEqn.upper()[0], &pEqn.diag()[0], &pEqn.source()[0], 
+            //         h_internal_coeffs.data(), h_boundary_coeffs.data(), false);
+        }
+    }
+
+    // compare p: pack patch values (plus patch-internal values for processor patches)
+    double *h_boundary_p = new double[dfDataBase.num_boundary_surfaces];
+    offset = 0;
+    forAll(p.boundaryField(), patchi)
+    {
+        const fvPatchScalarField& patchP = p.boundaryField()[patchi];
+        const int patchsize = patchP.size();
+        // every patch type stores its own values first
+        memcpy(h_boundary_p + offset, &patchP[0], patchsize * sizeof(double));
+        offset += patchsize;
+        if (patchP.type() == "processor"
+            || patchP.type() == "processorCyclic") {
+            // processor patches additionally store the patch-internal field right after
+            scalarField patchPInternal =
+                    dynamic_cast<const processorFvPatchField<scalar>&>(patchP).patchInternalField()();
+            memcpy(h_boundary_p + offset, &patchPInternal[0], patchsize * sizeof(double));
+            offset += patchsize;
+        }
+    }
+    // pEqn_GPU.correctP(&p[0], h_boundary_p);
+    if (!mpi_init_flag || rank == 0) {
+        //pEqn_GPU.comparep(&p[0], h_boundary_p, false);
+    }
+    delete[] h_boundary_p; // allocated with new[], so delete[] is required
+
+    // compare phi: pack boundary fluxes; the second slot of processor patches is zeroed
+    double *h_boundary_phi = new double[dfDataBase.num_boundary_surfaces];
+    offset = 0;
+    forAll(phi.boundaryField(), patchi)
+    {
+        const fvsPatchScalarField& patchFlux = phi.boundaryField()[patchi];
+        const int patchSize = patchFlux.size();
+        memcpy(h_boundary_phi + offset, &patchFlux[0], patchSize*sizeof(double));
+        offset += patchSize;
+        if (patchFlux.type() == "processor"
+            || patchFlux.type() == "processorCyclic") {
+            // buffer comes from new[] (uninitialised), so clear the unused second slot
+            memset(h_boundary_phi + offset, 0, patchSize*sizeof(double));
+            offset += patchSize;
+        }
+    }
+    if (!mpi_init_flag || rank == 0) {
+        // pEqn_GPU.comparephi(&phi[0], h_boundary_phi, false);
+    }
+    delete[] h_boundary_phi; // allocated with new[], so delete[] is required
+#endif
+
+thermo_GPU.correctPsipRho();
+
+#include "rhoEqn.H"
+// #include "compressibleContinuityErrs.H" // TODO: implement this func in future
+
+#if defined DEBUG_
+
+    U = HbyA - rAU*fvc::grad(p);
+    U.correctBoundaryConditions();
+    K = 0.5*magSqr(U);
+
+    // check U: pack boundary velocities as 3 doubles per face
+    double *h_boundary_u_tmp_inp = new double[dfDataBase.num_boundary_surfaces * 3];
+    offset = 0;
+    forAll(U.boundaryField(), patchi)
+    {
+        const fvPatchVectorField& patchU = U.boundaryField()[patchi];
+        const int patchSize = patchU.size();
+
+        // every patch type stores its own values first
+        memcpy(h_boundary_u_tmp_inp + 3*offset, &patchU[0][0], patchSize*sizeof(double)*3);
+        offset += patchSize;
+        if (patchU.type() == "processor"
+            || patchU.type() == "processorCyclic") {
+            // processor patches additionally store the patch-internal field right after
+            vectorField patchUInternal =
+                    dynamic_cast<const processorFvPatchField<vector>&>(patchU).patchInternalField()();
+            memcpy(h_boundary_u_tmp_inp + 3*offset, &patchUInternal[0][0], patchSize*sizeof(double)*3);
+            offset += patchSize;
+        }
+    }
+    if (!mpi_init_flag || rank == 0) {
+        // pEqn_GPU.compareU(&U[0][0], h_boundary_u_tmp_inp, false);
+    }
+    delete[] h_boundary_u_tmp_inp; // allocated with new[], so delete[] is required
+#endif
+
+#if defined DEBUG_ 
+    dpdt = fvc::ddt(p);
+    // pEqn_GPU.comparedpdt(&dpdt[0], false);
+#endif
+
+// #undef CPUSolver_
+// #define GPUSolverNew_
diff --git a/applications/solvers/dfLowMachFoam/rhoEqn.H b/applications/solvers/dfLowMachFoam/rhoEqn.H
index 93965ca52..2f2d679b8 100644
--- a/applications/solvers/dfLowMachFoam/rhoEqn.H
+++ b/applications/solvers/dfLowMachFoam/rhoEqn.H
@@ -28,41 +28,62 @@ Description
     Solve the continuity for density.
 
 \*---------------------------------------------------------------------------*/
-#ifdef GPUSolver_
-{
-    start1 = std::clock();
-    rho.oldTime();
+#ifdef GPUSolverNew_
+    double *h_rho = dfDataBase.getFieldPointer("rho", location::cpu, position::internal); // only referenced by the commented-out copy-back below
+    double *h_boundary_rho = dfDataBase.getFieldPointer("rho", location::cpu, position::boundary); // filled in the DEBUG_ section
 
-    int offset = 0;
-    forAll(U.boundaryField(), patchi)
-    {
-        const fvsPatchScalarField& patchFlux = phi.boundaryField()[patchi];
-        int patchSize = patchFlux.size();
-        memcpy(boundary_phi_init+offset, &patchFlux[0], patchSize*sizeof(double));
-        offset += patchSize;
+    TICK_START;
+    rhoEqn_GPU.process();
+    rhoEqn_GPU.sync();
+    TICK_STOP(GPU process time);
+
+    // Copy-back of the GPU result into the CPU field rho is currently disabled:
+    // rhoEqn_GPU.postProcess(h_rho);
+    // rho.oldTime();
+    // memcpy(&rho[0], h_rho, dfDataBase.cell_value_bytes);
+    // rho.correctBoundaryConditions();
+
+#ifdef DEBUG_
+    // Debug check: assemble and solve the continuity equation on the CPU as a reference.
+    TICK_START;
+    fvScalarMatrix rhoEqn
+    (
+        fvm::ddt(rho)
+      + fvc::div(phi)
+    );
+    rhoEqn.solve();
+    TICK_STOP(CPU process time);
+
+    int rank = -1; // stays -1 when MPI has not been initialised
+    if (mpi_init_flag) {
+        MPI_Comm_rank(MPI_COMM_WORLD, &rank);
     }
-    end1 = std::clock();
-    time_monitor_rhoEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    time_monitor_rhoEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    time_monitor_rhoEqn_mtxAssembly_CPU_prepare += double(end1 - start1) / double(CLOCKS_PER_SEC);
 
-    start1 = std::clock();
-    rhoEqn_GPU.initializeTimeStep();
-    rhoEqn_GPU.fvc_div(&phi[0], boundary_phi_init);
-    rhoEqn_GPU.fvm_ddt(&rho.oldTime()[0]);
-    rhoEqn_GPU.sync();
-    end1 = std::clock();
-    time_monitor_rhoEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    time_monitor_rhoEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    time_monitor_rhoEqn_mtxAssembly_GPU_run += double(end1 - start1) / double(CLOCKS_PER_SEC);
+    // if (!mpi_init_flag || rank == 0) {
+    //     rhoEqn_GPU.compareResult(&rhoEqn.diag()[0], &rhoEqn.source()[0], false);
+    // }
 
-    start1 = std::clock();
-    rhoEqn_GPU.updatePsi(&rho.primitiveFieldRef()[0]);
-    rho.correctBoundaryConditions();
-    end1 = std::clock();
-    time_monitor_rhoEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
-    time_monitor_rhoEqn_correctBC += double(end1 - start1) / double(CLOCKS_PER_SEC);
-}
+    offset = 0; // NOTE(review): assigned but not declared in this file - confirm every including scope declares it
+    forAll(rho.boundaryField(), patchi)
+    {
+        const fvPatchScalarField& patchRho = rho.boundaryField()[patchi];
+        int patchsize = patchRho.size();
+        if (patchRho.type() == "processor"
+            || patchRho.type() == "processorCyclic") {
+            memcpy(h_boundary_rho + offset, &patchRho[0], patchsize * sizeof(double));
+            scalarField patchRhoInternal = 
+                    dynamic_cast<const processorFvPatchField<scalar>&>(patchRho).patchInternalField()();
+            memcpy(h_boundary_rho + offset + patchsize, &patchRhoInternal[0], patchsize * sizeof(double));
+            offset += patchsize * 2; // patch values followed by patch-internal values
+        } else {
+            memcpy(h_boundary_rho + offset, &patchRho[0], patchsize * sizeof(double));
+            offset += patchsize;
+        }
+    }
+    // if (!mpi_init_flag || rank == 0) {
+    //     rhoEqn_GPU.compareRho(&rho[0], h_boundary_rho, false);
+    // }
+#endif
 #else
 {
     start1 = std::clock();
@@ -82,5 +103,6 @@ Description
     time_monitor_rhoEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
 }
 #endif
-
+//#undef CPUSolver_
+//#define GPUSolverNew_
 // ************************************************************************* //
diff --git a/bashrc.in b/bashrc.in
index 8f517d87f..ac4e7d398 100644
--- a/bashrc.in
+++ b/bashrc.in
@@ -15,4 +15,5 @@ export DF_LIBBIN=pwd/platforms/$WM_OPTIONS/lib
 export PATH=$DF_APPBIN:$PATH
 export LD_LIBRARY_PATH=$DF_LIBBIN:$LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=$DF_ROOT/src_gpu/build:$LD_LIBRARY_PATH
-export LD_LIBRARY_PATH=$AMGX_DIR/build:$LD_LIBRARY_PATH
\ No newline at end of file
+export LD_LIBRARY_PATH=$AMGX_DIR/build:$LD_LIBRARY_PATH
+export LD_LIBRARY_PATH=$DF_ROOT/src/dfChemistryModel/DNNInferencer/build:$LD_LIBRARY_PATH
\ No newline at end of file
diff --git a/install.sh b/install.sh
index 7a699bb25..38ec26478 100755
--- a/install.sh
+++ b/install.sh
@@ -29,7 +29,7 @@ if [ $USE_GPUSOLVER = true ]; then
     mkdir build
     cd build
     cmake ..
-    make 
+    make -j"$(nproc)"  # bounded parallelism: a bare -j puts no limit on the number of simultaneous jobs
     export LD_LIBRARY_PATH=$DF_ROOT/src_gpu/build:$LD_LIBRARY_PATH
 fi
 cd $DF_ROOT
diff --git a/src/dfChemistryModel/dfChemistryModel.C b/src/dfChemistryModel/dfChemistryModel.C
index 20a785aa3..ee57bbb5c 100644
--- a/src/dfChemistryModel/dfChemistryModel.C
+++ b/src/dfChemistryModel/dfChemistryModel.C
@@ -512,19 +512,20 @@ void Foam::dfChemistryModel<ThermoType>::correctThermo()
 
                 mu_[celli] = mixture_.CanteraTransport()->viscosity(); // Pa-s
 
+                alpha_[celli] = mixture_.CanteraTransport()->thermalConductivity()/(CanteraGas_->cp_mass()); // kg/(m*s)
+                // thermalConductivity() W/m/K
+                // cp_mass()   J/kg/K
+
                 if (mixture_.transportModelName() == "UnityLewis")
                 {
-                    alpha_[celli] = mu_[celli] / 0.7;
                     forAll(rhoD_, i)
                     {
-                        rhoD_[i][celli] = alpha_[celli];
+                            rhoD_[i][celli] = alpha_[celli];
+                        }
                     }
-                }
+
                 else
                 {
-                    alpha_[celli] = mixture_.CanteraTransport()->thermalConductivity()/(CanteraGas_->cp_mass()); // kg/(m*s)
-                    // thermalConductivity() W/m/K
-                    // cp_mass()   J/kg/K
                     mixture_.CanteraTransport()->getMixDiffCoeffsMass(dTemp_.begin()); // m2/s
 
                     CanteraGas_->getEnthalpy_RT(hrtTemp_.begin()); //hrtTemp_=m_h0_RT non-dimension
@@ -593,9 +594,10 @@ void Foam::dfChemistryModel<ThermoType>::correctThermo()
 
                     pmu[facei] = mixture_.CanteraTransport()->viscosity();
 
+                    palpha[facei] = mixture_.CanteraTransport()->thermalConductivity()/(CanteraGas_->cp_mass());
+
                     if (mixture_.transportModelName() == "UnityLewis")
                     {
-                        palpha[facei] = pmu[facei] / 0.7;
                         forAll(rhoD_, i)
                         {
                             rhoD_[i].boundaryFieldRef()[patchi][facei] = palpha[facei];
@@ -603,7 +605,6 @@ void Foam::dfChemistryModel<ThermoType>::correctThermo()
                     }
                     else
                     {
-                        palpha[facei] = mixture_.CanteraTransport()->thermalConductivity()/(CanteraGas_->cp_mass());
                         mixture_.CanteraTransport()->getMixDiffCoeffsMass(dTemp_.begin());
 
                         CanteraGas_->getEnthalpy_RT(hrtTemp_.begin());
@@ -698,9 +699,10 @@ void Foam::dfChemistryModel<ThermoType>::correctThermo()
 
                         pmu[facei] = mixture_.CanteraTransport()->viscosity();
 
+                        palpha[facei] = mixture_.CanteraTransport()->thermalConductivity()/(CanteraGas_->cp_mass());
+
                         if (mixture_.transportModelName() == "UnityLewis")
                         {
-                            palpha[facei] = pmu[facei] / 0.7;
                             forAll(rhoD_, i)
                             {
                                 rhoD_[i].boundaryFieldRef()[patchi][facei] = palpha[facei];
@@ -708,7 +710,6 @@ void Foam::dfChemistryModel<ThermoType>::correctThermo()
                         }
                         else
                         {
-                            palpha[facei] = mixture_.CanteraTransport()->thermalConductivity()/(CanteraGas_->cp_mass());
                             mixture_.CanteraTransport()->getMixDiffCoeffsMass(dTemp_.begin());
 
                             CanteraGas_->getEnthalpy_RT(hrtTemp_.begin());
diff --git a/src/dfChemistryModel/dfChemistryModel.H b/src/dfChemistryModel/dfChemistryModel.H
index c10035380..097aaa6d0 100644
--- a/src/dfChemistryModel/dfChemistryModel.H
+++ b/src/dfChemistryModel/dfChemistryModel.H
@@ -354,6 +354,8 @@ public:
             }
         }
 
+        bool ifChemstry() const {return chemistry_;} // accessor: returns the chemistry_ member
+
     // profiling
 #if defined USE_LIBTORCH || defined USE_PYTORCH
     double time_allsolve() {return time_allsolve_;}
diff --git a/src_gpu/AmgXSolver.H b/src_gpu/AmgXSolver.H
index 190808934..8eb194a6f 100644
--- a/src_gpu/AmgXSolver.H
+++ b/src_gpu/AmgXSolver.H
@@ -27,31 +27,8 @@
 // PETSc
 // # include <petscvec.h>
 
-
-/** \brief A macro to check the returned CUDA error code.
- *
- * \param call [in] Function call to CUDA API.
- */
-# define CHECK(call)                                                        \
-do                                                    \
-{                                                     \
-    const cudaError_t error_code = call;              \
-    if (error_code != cudaSuccess)                    \
-    {                                                 \
-        printf("CUDA Error:\n");                      \
-        printf("    File:       %s\n", __FILE__);     \
-        printf("    Line:       %d\n", __LINE__);     \
-        printf("    Error code: %d\n", error_code);   \
-        printf("    Error text: %s\n",                \
-            cudaGetErrorString(error_code));          \
-        exit(1);                                      \
-    }                                                 \
-} while (0)
-
-
-
-
-
+// mpi
+#include <mpi.h>
 
 /** \brief A wrapper class for coupling PETSc and AmgX.
  *
@@ -100,14 +77,15 @@ class AmgXSolver
 
         /** \brief Construct a AmgXSolver instance.
          *
-         * \param comm [in] MPI communicator.
          * \param modeStr [in] A string; target mode of AmgX (e.g., dDDI).
          * \param cfgFile [in] A string; the path to AmgX configuration file.
+         * \param devID [in] An integer; the ID of the GPU device to be used.
          */
         AmgXSolver
         (
             const std::string &modeStr,
-            const std::string &cfgFile
+            const std::string &cfgFile,
+            int devID
         );
 
         /** \brief Destructor. */
@@ -115,15 +93,16 @@ class AmgXSolver
 
         /** \brief Initialize a AmgXSolver instance.
          *
-         * \param comm [in] MPI communicator.
          * \param modeStr [in] A string; target mode of AmgX (e.g., dDDI).
          * \param cfgFile [in] A string; the path to AmgX configuration file.
+         * \param devID [in] An integer; the ID of the GPU device to be used.
          *
          */
         void initialize
         (
             const std::string &modeStr,
-            const std::string &cfgFile
+            const std::string &cfgFile,
+            const int devID
         );
 
 
@@ -153,6 +132,7 @@ class AmgXSolver
         void setOperator
         (
             const int nRows,
+            const int nGlobalRows,
             const int nNz,
             const int *rowIndex,
             const int *colIndex,
@@ -258,9 +238,18 @@ class AmgXSolver
         /** \brief A flag indicating if this instance has been initialized. */
         bool                    isInitialised = false;
 
+        /** \brief A flag indicating whether MPI is enabled */
+        int                     isMPIEnabled;
+        
         /** \brief A parameter used by AmgX. */
         int                     ring;
 
+        /** \brief Number of MPI ranks in the global communicator. */
+        int                     mpiSize;
+
+        /** \brief The global MPI communicator (set to MPI_COMM_WORLD in initialize()). */
+        MPI_Comm                mpiWorld;
+
         /** \brief AmgX solver mode. */
         AMGX_Mode               mode;
 
@@ -302,8 +291,9 @@ class AmgXSolver
          * is in charge of initializing AmgX and the resource instance.
          *
          * \param cfgFile [in] Path to AmgX solver configuration file.
+         * \param devID [in] ID of the GPU device to be used.
          */
-        void initAmgX(const std::string &cfgFile);
+        void initAmgX(const std::string &cfgFile, int devID);
 };
 
 #endif
diff --git a/src_gpu/AmgXSolver.cu b/src_gpu/AmgXSolver.cu
index b0076e5c3..909a0abd9 100644
--- a/src_gpu/AmgXSolver.cu
+++ b/src_gpu/AmgXSolver.cu
@@ -13,6 +13,8 @@
 #include "AmgXSolver.H"
 #include <numeric>
 #include <limits>
+#include <mpi.h>
+#include "dfMatrixDataBase.H"
 
 // initialize AmgXSolver::count to 0
 int AmgXSolver::count = 0;
@@ -22,9 +24,9 @@ AMGX_resources_handle AmgXSolver::rsrc = nullptr;
 
 
 /* \implements AmgXSolver::AmgXSolver */
-AmgXSolver::AmgXSolver(const std::string &modeStr, const std::string &cfgFile)
+AmgXSolver::AmgXSolver(const std::string &modeStr, const std::string &cfgFile, const int devID)
 {
-    initialize(modeStr, cfgFile);
+    initialize(modeStr, cfgFile, devID);
 }
 
 
@@ -36,7 +38,7 @@ AmgXSolver::~AmgXSolver()
 
 
 /* \implements AmgXSolver::initialize */
-void AmgXSolver::initialize(const std::string &modeStr, const std::string &cfgFile)
+void AmgXSolver::initialize(const std::string &modeStr, const std::string &cfgFile, int devID)
 {
     
     // if this instance has already been initialized, skip
@@ -52,8 +54,15 @@ void AmgXSolver::initialize(const std::string &modeStr, const std::string &cfgFi
     // get the mode of AmgX solver
     setMode(modeStr);  
 
+    // check if MPI has been initialized
+    MPI_Initialized(&isMPIEnabled);
+    if (isMPIEnabled) {
+        MPI_Comm_size(MPI_COMM_WORLD, &mpiSize);
+        mpiWorld = MPI_COMM_WORLD;
+    }
+
     // initialize AmgX
-    initAmgX(cfgFile);  
+    initAmgX(cfgFile, devID);
 
     // a bool indicating if this instance is initialized
     isInitialised = true;
@@ -84,7 +93,7 @@ void AmgXSolver::setMode(const std::string &modeStr)
 
 
 /* \implements AmgXSolver::initAmgX */
- void AmgXSolver::initAmgX(const std::string &cfgFile)
+ void AmgXSolver::initAmgX(const std::string &cfgFile, int devID)
 {
     // only the first instance (AmgX solver) is in charge of initializing AmgX
     if (count == 1)
@@ -106,7 +115,13 @@ void AmgXSolver::setMode(const std::string &modeStr)
     AMGX_SAFE_CALL(AMGX_config_add_parameters(&cfg, "exception_handling=1"));
 
     // create an AmgX resource object, only the first instance is in charge
-    if (count == 1) AMGX_resources_create_simple(&rsrc, cfg);
+    if (count == 1) {
+        if (isMPIEnabled) {
+            AMGX_resources_create(&rsrc, cfg, &mpiWorld, 1, &devID);
+        } else {
+            AMGX_resources_create_simple(&rsrc, cfg);
+        }
+    }
 
     // create AmgX vector object for unknowns and RHS
     AMGX_vector_create(&AmgXP, rsrc, mode);
@@ -169,6 +184,7 @@ void AmgXSolver::finalize()
 void AmgXSolver::setOperator
 (
     const int nRows,
+    const int nGlobalRows,
     const int nNz,
     const int *rowIndex,
     const int *colIndex,
@@ -195,16 +211,56 @@ void AmgXSolver::setOperator
         exit(0);
     }
 
-    // upload matrix A to AmgX
-    AMGX_matrix_upload_all(
-        AmgXA, nRows, nNz, 1, 1, rowIndex, colIndex, value, nullptr);
-
-    // bind the matrix A to the solver
-    AMGX_solver_setup(solver, AmgXA);
-
-    // connect (bind) vectors to the matrix
-    AMGX_vector_bind(AmgXP, AmgXA);
-    AMGX_vector_bind(AmgXRHS, AmgXA);
+    // check if mpi initialize
+    if (!isMPIEnabled)
+    {
+        // upload matrix A to AmgX
+        AMGX_matrix_upload_all(
+            AmgXA, nRows, nNz, 1, 1, rowIndex, colIndex, value, nullptr);
+
+        // bind the matrix A to the solver
+        AMGX_solver_setup(solver, AmgXA);
+
+        // connect (bind) vectors to the matrix
+        AMGX_vector_bind(AmgXP, AmgXA);
+        AMGX_vector_bind(AmgXRHS, AmgXA);
+    } else {
+        MPI_Barrier(MPI_COMM_WORLD);
+
+        AMGX_distribution_handle dist;
+        AMGX_distribution_create(&dist, cfg);
+
+        // Must persist until after we call upload
+        std::vector<int> offsets(mpiSize + 1, 0);
+
+        // Determine the number of rows per GPU
+        std::vector<int> nRowsPerGPU(mpiSize);
+        MPI_Allgather(&nRows, 1, MPI_INT, nRowsPerGPU.data(), 1, MPI_INT, MPI_COMM_WORLD);
+
+        // Calculate the global offsets
+        std::partial_sum(nRowsPerGPU.begin(), nRowsPerGPU.end(), offsets.begin() + 1);
+        
+        AMGX_distribution_set_partition_data(
+            dist, AMGX_DIST_PARTITION_OFFSETS, offsets.data());
+        
+        // Set the column indices size, 32- / 64-bit
+        AMGX_distribution_set_32bit_colindices(dist, true);
+
+        AMGX_matrix_upload_distributed(
+            AmgXA, nGlobalRows, nRows, nNz, 1, 1, rowIndex,
+            colIndex, value, nullptr, dist);
+        
+        AMGX_distribution_destroy(dist);
+
+        // bind the matrix A to the solver
+        AMGX_solver_setup(solver, AmgXA);
+
+        // connect (bind) vectors to the matrix
+        AMGX_vector_bind(AmgXP, AmgXA);
+        AMGX_vector_bind(AmgXRHS, AmgXA);
+
+        MPI_Barrier(MPI_COMM_WORLD); 
+    }
 }
 
 
@@ -250,6 +306,8 @@ void AmgXSolver::solve(
     AMGX_vector_upload(AmgXP, nRows, 1, psi);
     AMGX_vector_upload(AmgXRHS, nRows, 1, rhs);
 
+    if (isMPIEnabled) MPI_Barrier(MPI_COMM_WORLD); 
+
     // Solve
     AMGX_solver_solve(solver, AmgXRHS, AmgXP);
 
@@ -268,13 +326,16 @@ void AmgXSolver::solve(
     // Download data from device
     AMGX_vector_download(AmgXP, psi);
 
+    if (isMPIEnabled) MPI_Barrier(MPI_COMM_WORLD);
+
     // get norm and iteration number
     double irnorm = 0., rnorm = 0.;
     int nIters = 0;
     getResidual(0, irnorm);
     getIters(nIters);
     getResidual(nIters, rnorm);
-    printf("Initial residual = %.10lf, Final residual = %.5e, No Iterations %d\n", irnorm, rnorm, nIters);
+    if (!isMPIEnabled || myRank == 0)
+        printf("Initial residual = %.10lf, Final residual = %.5e, No Iterations %d\n", irnorm, rnorm, nIters);
 
 }
 
diff --git a/src_gpu/CMakeLists.txt b/src_gpu/CMakeLists.txt
index 6e4a7efef..57058eeef 100644
--- a/src_gpu/CMakeLists.txt
+++ b/src_gpu/CMakeLists.txt
@@ -6,12 +6,16 @@ cmake_minimum_required(VERSION 3.5)
 project(dfMatrix LANGUAGES CXX CUDA)
 set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
+list(APPEND CMAKE_PREFIX_PATH /root/libtorch) # append instead of overwrite so a user-supplied prefix path is kept
 
 find_package(CUDA REQUIRED)
 find_package(MPI REQUIRED)
 find_package(CUDAToolkit REQUIRED)
+find_package(Torch REQUIRED)
 find_library(LIBAMGXSH amgxsh PATHS $ENV{AMGX_DIR}/build)
 
+add_compile_options(-arch=sm_70 -fmad=false)
+
 include_directories(
     ${MPI_INCLUDE_PATH}
     ${CUDA_INCLUDE_DIRS}
@@ -19,21 +23,30 @@ include_directories(
 )
 
 add_library(${PROJECT_NAME} 
-    SHARED 
-        dfUEqn.cu 
-        dfRhoEqn.cu 
+    SHARED
+        AmgXSolver.cu
+        dfMatrixDataBase.cu
+        dfMatrixOpBase.cu
+        dfNcclBase.cu
+        dfUEqn.cu
         dfYEqn.cu
         dfEEqn.cu
-        AmgXSolver.cu
-        dfMatrixDataBase.cu)
+        dfRhoEqn.cu
+        dfpEqn.cu
+        dfThermo.cu
+        dfChemistrySolver.cu)
 
 target_link_libraries(${PROJECT_NAME}
     ${MPI_LIBRARIES}
     ${CUDA_LIBRARIES}
     ${LIBAMGXSH}
+    ${TORCH_LIBRARIES}
 )
 target_compile_options(dfMatrix PUBLIC -g)
 option(DFMATRIX_ENABLE_DETAILED_DEBUG "Enable detailed debug build" OFF)
 if (DFMATRIX_ENABLE_DETAILED_DEBUG)
-    target_compile_definitions(${PROJECT_NAME} PRIVATE DEBUG)
+    target_compile_definitions(${PROJECT_NAME} PRIVATE DEBUG_)
 endif()
+
+# target_compile_definitions(${PROJECT_NAME} PRIVATE STREAM_ALLOCATOR)
+
diff --git a/src_gpu/GPUMesh.H b/src_gpu/GPUMesh.H
deleted file mode 100644
index 22cc05ed8..000000000
--- a/src_gpu/GPUMesh.H
+++ /dev/null
@@ -1,23 +0,0 @@
-#include <iostream>
-
-class GPUMesh
-{
-public:
-    int num_cells;
-    int num_faces;
-    int boundary_cells;
-    //... all variables needed to upload once
-
-public:
-    GPUMesh();
-    ~GPUMesh();
-};
-
-GPUMesh::GPUMesh()
-{
-    // same to the constructor of dfMatrix.C
-}
-
-GPUMesh::~GPUMesh()
-{
-}
diff --git a/src_gpu/GPUfield.H b/src_gpu/GPUfield.H
deleted file mode 100644
index d81afbb01..000000000
--- a/src_gpu/GPUfield.H
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
-GPU field
-UEqn
-GPUField
-rho_old, rho_new, vector_old, phi, p, nuEff, 
-p_bou, vector_old_bou, nuEff_bou, rho_new_bou, 
-ueqn_internalCoeffs, ueqn_boundaryCoeffs_init, ueqn_laplac_internalCoeffs_init, ueqn_laplac_boundaryCoeffs_init
-1. initialize a map
-2. implement methods to construct GPU pointer and return GPU pointer
-
-fvm__ddt(UEqn, rho, U)
-{
-    rho.oldtime()
-    rho.
-}
-*/
-#include <iostream>
-#include <unordered_map>
-
-#define TOSTRING(x) #x
-
-struct GPUField {
-    double* cur_internal;
-    double* cur_boundary;
-    double* old_internal;
-    double* old_boundary;
-};
-std::unordered_map<std::string, GPUField> GPUFields;
-
-// initialize: cudaMalloc, conducted at begining
-void initialize(std::string U, std::string p, std::string phi);
-// Q: 
-// 1. consider the different sizes bettween face values and cell values
-// 2. not all variables need all these four terms
-
-// update at the end of this time step
-// move current pointer as oldTime pointer
-void update();
-
-template<class Type, template<class> class PatchField, class GeoMesh>
-double* cur_internal(Foam::GeometryField<Type, PatchField, GeoMesh> var)
-{
-    if (!GPUFields[TOSTRING(var)].cur_internal) {
-        // 1. cudaMemcopy current internal field
-    }
-    return GPUFields[TOSTRING(var)].cur_internal;
-}
-
-template<class Type, template<class> class PatchField, class GeoMesh>
-double* cur_boundary(Foam::GeometryField<Type, PatchField, GeoMesh> var);
-
-template<class Type, template<class> class PatchField, class GeoMesh>
-double* old_internal(Foam::GeometryField<Type, PatchField, GeoMesh> var);
-
-template<class Type, template<class> class PatchField, class GeoMesh>
-double* old_boundary(Foam::GeometryField<Type, PatchField, GeoMesh> var);
-
diff --git a/src_gpu/GPUfield.cpp b/src_gpu/GPUfield.cpp
deleted file mode 100644
index e79dbdf07..000000000
--- a/src_gpu/GPUfield.cpp
+++ /dev/null
@@ -1,18 +0,0 @@
-/*
-GPU field
-UEqn
-GPUField
-rho_old, rho_new, vector_old, phi, p, nuEff, 
-p_bou, vector_old_bou, nuEff_bou, rho_new_bou, 
-ueqn_internalCoeffs, ueqn_boundaryCoeffs_init, ueqn_laplac_internalCoeffs_init, ueqn_laplac_boundaryCoeffs_init
-1. initialize a map
-2. implement methods to construct GPU pointer and return GPU pointer
-
-fvm__ddt(UEqn, rho, U)
-{
-    rho.oldtime()
-    rho.
-}
-*/
-#include <stdio.h>
-
diff --git a/src_gpu/dfChemistrySolver.H b/src_gpu/dfChemistrySolver.H
new file mode 100644
index 000000000..5a5201399
--- /dev/null
+++ b/src_gpu/dfChemistrySolver.H
@@ -0,0 +1,36 @@
+#pragma once
+
+#include <stdio.h>
+#include <unistd.h>
+#include <cuda_runtime.h>
+#include <torch/script.h>
+#include "dfMatrixDataBase.H"
+
+class dfChemistrySolver
+{
+private:
+    dfMatrixDataBase &dataBase_;                        // declared before device_: members are initialised in this order
+    cudaStream_t stream = nullptr;                      // assigned from dataBase_.stream in setConstantValue()
+    std::vector<torch::jit::script::Module> modules_;   // TorchScript modules, filled by setConstantValue()
+    torch::Device device_;
+    // Device buffers start as nullptr so that cudaFree() on a buffer that was never allocated is a no-op.
+    double *Xmu_ = nullptr, *Xstd_ = nullptr, *Ymu_ = nullptr, *Ystd_ = nullptr;
+    double *init_input_ = nullptr, *y_input_BCT = nullptr, *NN_output_ = nullptr;
+    int *d_reactCellIndex = nullptr;
+    int dim_input_ = 0, num_cells_ = 0, num_species_ = 0, num_modules_ = 0;
+    int batch_size_ = 0;
+    double unReactT_ = 0.;      // Inference() only selects cells whose temperature is >= this value
+    int inputsize_ = 0;         // number of cells selected by the most recent Inference() call
+public:
+    dfChemistrySolver(dfMatrixDataBase &dataBase)
+        : dataBase_(dataBase), device_(torch::kCUDA) {}
+    ~dfChemistrySolver();
+
+    void setConstantValue(int num_cells, int num_species, int batch_size);
+    void loadModels(const std::string dir);
+    void loadNormalization(const std::string dir);
+    void Inference(const double *h_T, const double *d_T,const double *p, const double *y,
+            const double *rho, double *RR);
+    // Blocks the host until the work queued on dataBase_.stream has finished.
+    void sync();
+};
\ No newline at end of file
diff --git a/src_gpu/dfChemistrySolver.cu b/src_gpu/dfChemistrySolver.cu
new file mode 100644
index 000000000..f4b3b051a
--- /dev/null
+++ b/src_gpu/dfChemistrySolver.cu
@@ -0,0 +1,210 @@
+#include "dfChemistrySolver.H"
+#include "dfMatrixOpBase.H"
+
+// One thread per selected cell. Writes the row [T, 101325, BCT(y_0) .. BCT(y_{dim-3})] into output
+// (row-major, dim entries per row) and stores the transformed fractions species-major in y_input_BCT.
+__global__ void construct_init_input(int num_thread, int num_cells, int dim, const double *T, const double *p,
+        const int *reactCellIndex, const double *y, double *y_input_BCT, double *output)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid < num_thread) {
+        const int cell = reactCellIndex[tid];
+        double *row = output + tid * dim;
+
+        row[0] = T[cell];
+        row[1] = 101325.; // a fixed value is fed instead of p[cell]
+        for (int s = 0; s < dim - 2; ++s) {
+            const double transformed = (pow(y[s * num_cells + cell], 0.1) - 1) * 10; // BCT: lambda = 0.1
+            row[2 + s] = transformed;
+            y_input_BCT[s * num_thread + tid] = transformed;
+        }
+    }
+}
+
+// One thread per row: output = (input - Xmu) / Xstd, column by column. num_cells is not used.
+__global__ void normalize_input(int num_thread, int num_cells, int dim, const double *input,
+        const double *Xmu, const double *Xstd, double *output)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid < num_thread) {
+        const int base = tid * dim;
+        for (int col = 0; col < dim; ++col)
+            output[base + col] = (input[base + col] - Xmu[col]) / Xstd[col];
+    }
+}
+
+// Scales the raw network output by Ystd/Ymu, adds the BCT-space input and applies the inverse transform.
+// Each element is read before it is written, so output may be the same buffer as output_init.
+__global__ void calculate_y_new(int num_thread, int num_modules, const double *output_init,
+        const double *y_input_BCT, const double *Ymu, const double *Ystd, double *output)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= num_thread)
+        return;
+    for (int m = 0; m < num_modules; ++m) {
+        const int slot = m * num_thread + tid;
+        const double bct = output_init[slot] * Ystd[m] + Ymu[m] + y_input_BCT[slot];
+        output[slot] = pow((bct * 0.1 + 1), 10); // rev-BCT: lambda = 0.1
+    }
+}
+
+// One thread per selected cell. Divides the predicted fractions by the sum of all predictions plus the
+// old value of the last species, then stores (y_new - y_old) * rho * (p / 101325) / delta_t in RR.
+__global__ void calculate_RR(int num_thread, int num_cells, int num_species, double delta_t,
+        const int *reactCellIndex, const double *rho, const double *y_old, const double *p,
+        double *y_NN, double *RR)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= num_thread)
+        return;
+    const int cell = reactCellIndex[tid];
+    const int num_predicted = num_species - 1;
+    double y_sum = 0.;
+    for (int s = 0; s < num_predicted; ++s)
+        y_sum += y_NN[s * num_thread + tid];
+    y_sum += y_old[num_predicted * num_cells + cell];
+    for (int s = 0; s < num_predicted; ++s) {
+        const double y_scaled = y_NN[s * num_thread + tid] / y_sum;
+        y_NN[s * num_thread + tid] = y_scaled;
+        RR[s * num_cells + cell] = (y_scaled - y_old[s * num_cells + cell]) * rho[cell]
+                * (p[cell] / 101325.) / delta_t; // correction
+    }
+}
+
+dfChemistrySolver::~dfChemistrySolver() {
+    for (double *buf : {Xmu_, Xstd_, Ymu_, Ystd_}) cudaFree(buf); // allocated in setConstantValue(); Inference() frees its own buffers
+}
+
+// Stores the problem sizes, uploads the hard-coded normalisation statistics to the device and loads the
+// TorchScript models new_Temporary_Chemical_<i>.pt (i = 0 .. num_species-2), converted to half precision.
+void dfChemistrySolver::setConstantValue(int num_cells, int num_species, int batch_size) {
+    this->num_cells_ = num_cells;
+    this->num_species_ = num_species;
+    this->batch_size_ = batch_size;
+    this->stream = dataBase_.stream;
+    dim_input_ = num_species + 2; // p, T, y
+    num_modules_ = num_species_ - 1;
+    unReactT_ = 610;
+
+    // normalisation parameters are currently set manually here
+    const std::vector<double> Xmu_vec = {1.2996375154e+03,  1.4349643303e+05, -4.3678815323e+00,
+            -5.8949183472e+00, -3.8840763486e+00, -5.5436246211e+00,
+            -6.0178199636e+00, -2.1469850084e+00, -6.9828365432e+00,
+            -7.7747568654e+00, -1.8571483828e-01};
+    const std::vector<double> Xstd_vec = {3.9612732767e+02, 1.8822821412e+04, 1.1226048640e+00, 6.8397462420e-01,
+            1.8879462146e+00, 1.2433158499e+00, 1.3169176600e+00, 4.3600457243e-01,
+            8.1820904505e-01, 8.0471805333e-01, 6.1020187522e-02};
+    const std::vector<double> Ymu_vec = {-0.0101101322, -0.0138129078, -0.0146349442, -0.0088870325,
+            -0.0075195178,  0.0020506931, -0.0103104668, -0.0192603020};
+    const std::vector<double> Ystd_vec = {0.0297933161, 0.0802139099, 0.0230954310, 0.1541940427,
+            0.1316836678, 0.0042975580, 0.1476416977, 0.0860471308};
+    // The tables have fixed lengths; refuse sizes that would make the copies below read past their end.
+    const size_t n_in = dim_input_, n_out = num_modules_;
+    if (Xmu_vec.size() != n_in || Xstd_vec.size() != n_in || Ymu_vec.size() != n_out || Ystd_vec.size() != n_out) {
+        std::cerr << "dfChemistrySolver: normalization tables do not match num_species = " << num_species << "\n";
+        exit(-1);
+    }
+    cudaMalloc(&Xmu_, sizeof(double) * dim_input_);
+    cudaMalloc(&Xstd_, sizeof(double) * dim_input_);
+    cudaMalloc(&Ymu_, sizeof(double) * num_modules_);
+    cudaMalloc(&Ystd_, sizeof(double) * num_modules_);
+    cudaMemcpy(Xmu_, Xmu_vec.data(), sizeof(double) * dim_input_, cudaMemcpyHostToDevice);
+    cudaMemcpy(Xstd_, Xstd_vec.data(), sizeof(double) * dim_input_, cudaMemcpyHostToDevice);
+    cudaMemcpy(Ymu_, Ymu_vec.data(), sizeof(double) * num_modules_, cudaMemcpyHostToDevice);
+    cudaMemcpy(Ystd_, Ystd_vec.data(), sizeof(double) * num_modules_, cudaMemcpyHostToDevice);
+    modules_.reserve(num_modules_);
+    for (int i = 0; i < num_modules_; ++i) {
+        const std::string model_path = "new_Temporary_Chemical_" + std::to_string(i) + ".pt";
+        try {
+            modules_.push_back(torch::jit::load(model_path));
+        }
+        catch (const c10::Error& e) {
+            std::cerr << "error loading the model " << model_path << ": " << e.what() << "\n";
+            exit(-1);
+        }
+        modules_[i].to(device_, torch::kHalf);
+    }
+}
+
+// Computes RR for every cell whose host temperature h_T[i] is at least unReactT_; other cells are left untouched.
+// h_T is read on the host; d_T, p, y, rho and RR are handed to kernels (presumably device pointers - confirm).
+void dfChemistrySolver::Inference(const double *h_T, const double *d_T,const double *p, const double *y,
+        const double *rho, double *RR) {
+    clock_t start = clock();
+    std::vector<int> reactCellIndex;
+    for (int i = 0; i < num_cells_; i++) {
+        if (h_T[i] >= unReactT_) {
+            reactCellIndex.push_back(i);
+        }
+    }
+    inputsize_ = reactCellIndex.size();
+    clock_t end = clock();
+    double elapsed_secs = double(end - start) / CLOCKS_PER_SEC;
+    std::cout << "construct input time: " << elapsed_secs << std::endl;
+    if (inputsize_ == 0) return; // no cell selected: a launch with zero blocks would be an invalid configuration
+
+#ifdef STREAM_ALLOCATOR
+    checkCudaErrors(cudaMallocAsync((void**)&init_input_, sizeof(double) * inputsize_ * dim_input_, stream));
+    checkCudaErrors(cudaMallocAsync((void**)&y_input_BCT, sizeof(double) * inputsize_ * num_species_, stream));
+    checkCudaErrors(cudaMallocAsync((void**)&NN_output_, sizeof(double) * inputsize_ * num_species_, stream));
+    checkCudaErrors(cudaMallocAsync((void**)&d_reactCellIndex, sizeof(int) * inputsize_, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_reactCellIndex, reactCellIndex.data(), sizeof(int) * inputsize_, cudaMemcpyHostToDevice, stream));
+#else
+    checkCudaErrors(cudaMalloc((void**)&init_input_, sizeof(double) * inputsize_ * dim_input_));
+    checkCudaErrors(cudaMalloc((void**)&y_input_BCT, sizeof(double) * inputsize_ * num_species_));
+    checkCudaErrors(cudaMalloc((void**)&NN_output_, sizeof(double) * inputsize_ * num_species_)); // was never allocated on this path
+    checkCudaErrors(cudaMalloc((void**)&d_reactCellIndex, sizeof(int) * inputsize_));
+    checkCudaErrors(cudaMemcpy(d_reactCellIndex, reactCellIndex.data(), sizeof(int) * inputsize_, cudaMemcpyHostToDevice));
+#endif
+    // construct input
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (inputsize_ + threads_per_block - 1) / threads_per_block;
+    construct_init_input<<<blocks_per_grid, threads_per_block, 0, stream>>>(inputsize_, num_cells_, dim_input_, d_T, p, 
+            d_reactCellIndex, y, y_input_BCT, init_input_);
+    normalize_input<<<blocks_per_grid, threads_per_block, 0, stream>>>(inputsize_, num_cells_, dim_input_, init_input_, 
+            Xmu_, Xstd_, init_input_);
+
+    // inference by torch; NOTE(review): the cudaMemcpy below is not issued on `stream` - confirm ordering with the kernels
+    TICK_INIT_EVENT;
+    TICK_START_EVENT;
+    for (int sample_start = 0; sample_start < inputsize_; sample_start += batch_size_) {
+        int sample_end = std::min(sample_start + batch_size_, inputsize_);
+        int sample_len = sample_end - sample_start;
+        at::Tensor torch_input = torch::from_blob(init_input_ + sample_start * dim_input_, {sample_len, dim_input_}, 
+                torch::TensorOptions().device(device_).dtype(torch::kDouble));
+        torch_input = torch_input.to(at::kHalf);
+        std::vector<torch::jit::IValue> INPUTS;
+        INPUTS.push_back(torch_input);
+        std::vector<at::Tensor> output(num_modules_);
+
+        for (int i = 0; i < num_modules_; ++i) {
+            output[i] = modules_[i].forward(INPUTS).toTensor();
+            output[i] = output[i].to(at::kDouble);
+            const double *d_output = output[i].data_ptr<double>();
+            cudaMemcpy(NN_output_ + (i * inputsize_ + sample_start), d_output, sizeof(double) * sample_len, cudaMemcpyDeviceToDevice);
+        }
+    }
+    TICK_END_EVENT(Inference);
+
+    calculate_y_new<<<blocks_per_grid, threads_per_block, 0, stream>>>(inputsize_, num_modules_, NN_output_, 
+            y_input_BCT, Ymu_, Ystd_, NN_output_);
+    calculate_RR<<<blocks_per_grid, threads_per_block, 0, stream>>>(inputsize_, num_cells_, num_species_, 1e-6, 
+            d_reactCellIndex, rho, y, p, NN_output_, RR);
+
+#ifdef STREAM_ALLOCATOR
+    checkCudaErrors(cudaFreeAsync(init_input_, dataBase_.stream));
+    checkCudaErrors(cudaFreeAsync(y_input_BCT, dataBase_.stream));
+    checkCudaErrors(cudaFreeAsync(NN_output_, dataBase_.stream));
+    checkCudaErrors(cudaFreeAsync(d_reactCellIndex, dataBase_.stream)); 
+#else
+    cudaFree(init_input_);
+    cudaFree(y_input_BCT);
+    cudaFree(NN_output_);
+    cudaFree(d_reactCellIndex);
+#endif
+    init_input_ = y_input_BCT = NN_output_ = nullptr; d_reactCellIndex = nullptr; // per-call buffers: leave no dangling pointers
+}
+
+void dfChemistrySolver::sync() {
+    checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); // host waits for the work queued on dataBase_.stream
+}
\ No newline at end of file
diff --git a/src_gpu/dfEEqn.H b/src_gpu/dfEEqn.H
index 61b2612cf..6fac22965 100644
--- a/src_gpu/dfEEqn.H
+++ b/src_gpu/dfEEqn.H
@@ -3,70 +3,108 @@
 #include "AmgXSolver.H"
 #include <amgx_c.h>
 #include "dfMatrixDataBase.H"
+#include "dfMatrixOpBase.H"
+#include "dfThermo.H"
 
 class dfEEqn
 {
-private:
     dfMatrixDataBase &dataBase_;
+    dfThermo &thermo_;
+
+    // cuda resource
     cudaStream_t stream;
-    //cudaEvent_t event;
+#ifdef USE_GRAPH
+    // one graph for one eqn before using self-developed solver
+    cudaGraph_t graph_pre, graph_post;
+    cudaGraphExec_t graph_instance_pre, graph_instance_post;
+    bool pre_graph_created=false;
+    bool post_graph_created=false;
+#endif
+
+	// constant values -- basic
+	std::string mode_string;
+	std::string setting_path;
 
+	// constant values -- amgx solvers
     AmgXSolver *ESolver = nullptr;
-    int num_iteration;
-
-    // common variables
-    int num_cells, cell_bytes, num_faces, num_surfaces, cell_vec_bytes, csr_value_vec_bytes, num_boundary_cells;
-    int num_boundary_faces, boundary_face_bytes;
-    int *d_A_csr_row_index, *d_A_csr_diag_index, *d_A_csr_col_index;
-
-    // Matrix variables
-    double *d_A_csr, *d_b = nullptr;
-    double *h_A_csr, *h_b = nullptr;
-    double *d_he_old = nullptr;
-    double *h_he_new = nullptr;
-
-    // fields used by EEqn
-    double *d_alphaEff = nullptr;
-    double *d_K = nullptr;
-    double *d_K_old = nullptr;
+    int num_iteration = 0;
+
+    // constant fields - boundary
+	std::vector<int> patch_type_he;
+    std::vector<int> patch_type_k;
+
+    // non-constant fields - internal
+    // dpdt
     double *d_dpdt = nullptr;
-    double *d_boundary_K_init = nullptr;
-    double *d_boundary_K = nullptr;
-    double *d_boundary_alphaEff_init = nullptr;
-    double *d_boundary_alphaEff = nullptr;
-    double *d_value_internal_coeffs_init = nullptr;
-    double *d_value_boundary_coeffs_init = nullptr;
-    double *d_gradient_internal_coeffs_init = nullptr;
-    double *d_gradient_boundary_coeffs_init = nullptr;
+    double *h_dpdt = nullptr;
+    
+    // non-constant fields - boundary
+    // gradient
+    double *d_boundary_heGradient = nullptr;
+    double *h_boundary_heGradient = nullptr;
+    int num_gradientEnergy_boundary_surfaces = 0;
+    // boundary coeff fields
     double *d_value_internal_coeffs = nullptr;
-    double *d_value_boundary_coeffs = nullptr;
-    double *d_gradient_internal_coeffs = nullptr;
-    double *d_gradient_boundary_coeffs = nullptr;
-    double *d_boundary_gradient_init = nullptr;
-    double *d_boundary_gradient = nullptr;
+	double *d_value_boundary_coeffs= nullptr;
+	double *d_gradient_internal_coeffs= nullptr;
+	double *d_gradient_boundary_coeffs= nullptr;
+
+    // non-constant fields - ldu (d_lower/d_diag/d_upper/d_extern are offsets into d_ldu, set in createNonConstantLduAndCsrFields)
+    double *d_ldu = nullptr; // base of the single LDU allocation
+	double *d_lower = nullptr;
+	double *d_upper = nullptr;
+    double *d_extern = nullptr;
+	double *d_diag = nullptr;
+	double *d_source = nullptr;
+	double *d_internal_coeffs = nullptr;
+	double *d_boundary_coeffs = nullptr;
+
+    // non-constant fields - csr
+	double *d_A = nullptr;
+	double *d_b = nullptr; // TODO: possibly redundant - confirm whether this separate buffer is still needed
+
+    // field pointer map: merged name (e.g. "h_dpdt", "h_boundary_heGradient") -> buffer, queried by getFieldPointer
+    std::unordered_map<std::string, double*> fieldPointerMap;
 
 public:
-    dfEEqn();
-    dfEEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile);
-    ~dfEEqn();
+    // constructor
+    dfEEqn(dfMatrixDataBase &dataBase, dfThermo & thermo)
+        : dataBase_(dataBase), thermo_(thermo) {}
 
-    void prepare_data(const double *he_old, const double *K, const double *K_old, const double *alphaEff,
-            const double *dpdt, const double *boundary_K, const double *boundary_alphaEff, const double *boundary_gradient);
+	// destructor
+	~dfEEqn(){}
 
-    void initializeTimeStep();
+    // member function
 
-    void fvm_ddt();
-    void fvm_div();
-    void fvm_laplacian();
+    // getter function
+    double* getFieldPointer(const char* fieldAlias, location loc, position pos);
 
-    void fvc_ddt();
-    void fvc_div_phi_scalar();
-    void fvc_div_vector();
-    void add_to_source();
+	// initialization
+	void setConstantValues(const std::string &mode_string, const std::string &setting_path); 
+	void setConstantFields(const std::vector<int> patch_type_he, const std::vector<int> patch_type_k);
+    void createNonConstantFieldsInternal();
+    void createNonConstantFieldsBoundary();
+    void createNonConstantLduAndCsrFields();
+    void initNonConstantFields(const double *he, const double *boundary_he);
 
-    void solve();
-    void checkValue(bool print);
-    void updatePsi(double *Psi);
+    void cleanCudaResources();
+
+    // run equation
+    // volScalarField diffAlphaD, volVectorField hDiffCorrFlux, 
+    void preProcessForYEqn();
+    void preProcess(const double *h_he, const double *h_k, const double *h_k_old, const double *h_dpdt, const double *h_boundary_k, const double *h_boundary_heGradient);
+	void process();
+	void postProcess(double *h_he, double *h_boundary_he);
 
+    void eeqn_calculate_energy_gradient(dfThermo& GPUThermo, int num_cells, int num_species, 
+            int num_boundary_surfaces, const int *face2Cells, double *T, double *p, double *y,
+            int num_patches, const int *patch_size, const int *patch_type,
+            const double *boundary_delta_coeffs, const double *boundary_p, const double* boundary_y, 
+            double *boundary_thermo_gradient);
+
+    void solve();
+    void compareResult(const double *lower, const double *upper, const double *diag, const double *source, const double *internal_coeffs, const double *boundary_coeffs, bool printFlag);
+    void compareHe(const double *he, const double *boundary_he, bool printFlag);
     void sync();
+
 };
diff --git a/src_gpu/dfEEqn.cu b/src_gpu/dfEEqn.cu
index d96bf0109..0c8892541 100644
--- a/src_gpu/dfEEqn.cu
+++ b/src_gpu/dfEEqn.cu
@@ -1,757 +1,363 @@
 #include "dfEEqn.H"
 
-// kernel functions
-
-__global__ void eeqn_fvm_ddt_kernel(int num_cells, const double rdelta_t,
-                                    const int *csr_row_index, const int *csr_diag_index,
-                                    const double *rho_old, const double *rho_new, const double *volume, const double *he_old,
-                                    const double sign, const double *A_csr_input, const double *b_input,
-                                    double *A_csr_output, double *b_output)
-{
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_cells)
-        return;
-
-    // A_csr has one more element in each row: itself
-    int row_index = csr_row_index[index];
-    int diag_index = csr_diag_index[index];
-    int csr_index = row_index + diag_index;
-    double ddt_diag = rdelta_t * rho_new[index] * volume[index];
-    A_csr_output[csr_index] = A_csr_input[csr_index] + ddt_diag * sign;
-
-    double ddt_part_term = rdelta_t * rho_old[index] * volume[index];
-    b_output[index] = b_input[index] + ddt_part_term * he_old[index] * sign;
-}
-
-__global__ void eeqn_fvm_div_internal(int num_cells,
-                                      const int *csr_row_index, const int *csr_diag_index,
-                                      const double *weight, const double *phi,
-                                      const double sign, const double *A_csr_input, const double *b_input,
-                                      double *A_csr_output, double *b_output)
-{
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_cells)
-        return;
-
-    // A_csr has one more element in each row: itself
-    int row_index = csr_row_index[index];
-    int next_row_index = csr_row_index[index + 1];
-    int diag_index = csr_diag_index[index];
-    int neighbor_offset = csr_row_index[index] - index;
-
-    double div_diag = 0;
-    for (int i = row_index; i < next_row_index; i++)
-    {
-        int inner_index = i - row_index;
-        // lower
-        if (inner_index < diag_index)
-        {
-            int neighbor_index = neighbor_offset + inner_index;
-            double w = weight[neighbor_index];
-            double f = phi[neighbor_index];
-            A_csr_output[i] = A_csr_input[i] + (-w) * f * sign;
-            // lower neighbors contribute to sum of -1
-            div_diag += (w - 1) * f;
-        }
-        // upper
-        if (inner_index > diag_index)
-        {
-            // upper, index - 1, consider of diag
-            int neighbor_index = neighbor_offset + inner_index - 1;
-            double w = weight[neighbor_index];
-            double f = phi[neighbor_index];
-            A_csr_output[i] = A_csr_input[i] + (1 - w) * f * sign;
-            // upper neighbors contribute to sum of 1
-            div_diag += w * f;
-        }
+double* dfEEqn::getFieldPointer(const char* fieldAlias, location loc, position pos) { // builds "<h|d>_[boundary_]<alias>" and looks it up in fieldPointerMap
+    char mergedName[256] = ""; // zero-initialised: an unhandled `pos` gives an empty key instead of indeterminate bytes
+    if (pos == position::internal) {
+        snprintf(mergedName, sizeof(mergedName), "%s_%s", (loc == location::cpu) ? "h" : "d", fieldAlias); // bounded write: an over-long alias is truncated, never overflows
+    } else if (pos == position::boundary) {
+        snprintf(mergedName, sizeof(mergedName), "%s_boundary_%s", (loc == location::cpu) ? "h" : "d", fieldAlias);
     }
-    A_csr_output[row_index + diag_index] = A_csr_input[row_index + diag_index] + div_diag * sign; // diag
-}
 
-__global__ void eeqn_fvm_div_boundary(int num_boundary_cells,
-                                      const int *csr_row_index, const int *csr_diag_index,
-                                      const int *boundary_cell_offset, const int *boundary_cell_id,
-                                      const double *value_internal_coeffs, const double *value_boundary_coeffs,
-                                      const double sign, const double *A_csr_input, const double *b_input,
-                                      double *A_csr_output, double *b_output)
-{
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_boundary_cells)
-        return;
-
-    int cell_offset = boundary_cell_offset[index];
-    int cell_index = boundary_cell_id[cell_offset];
-    int loop_size = boundary_cell_offset[index + 1] - cell_offset;
-
-    int row_index = csr_row_index[cell_index];
-    int diag_index = csr_diag_index[cell_index];
-    int csr_index = row_index + diag_index;
-
-    // construct internalCoeffs & boundaryCoeffs
-    double internal_coeffs = 0;
-    double boundary_coeffs = 0;
-    for (int i = 0; i < loop_size; i++)
-    {
-        internal_coeffs += value_internal_coeffs[cell_offset + i];
-        boundary_coeffs += value_boundary_coeffs[cell_offset + i];
+    double *pointer = nullptr;
+    if (fieldPointerMap.find(std::string(mergedName)) != fieldPointerMap.end()) {
+        pointer = fieldPointerMap[std::string(mergedName)];
     }
-    A_csr_output[csr_index] = A_csr_input[csr_index] + internal_coeffs * sign;
-    b_output[cell_index] = b_input[cell_index] + boundary_coeffs * sign;
-}
-
-__global__ void eeqn_fvm_laplacian_uncorrected_internal(int num_cells,
-                                                        const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index,
-                                                        const double *alphaEff, const double *weight,
-                                                        const double *magsf, const double *distance,
-                                                        const double sign, const double *A_csr_input, double *A_csr_output)
-{
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_cells)
-        return;
-
-    // A_csr has one more element in each row: itself
-    int row_index = csr_row_index[index];
-    int row_elements = csr_row_index[index + 1] - row_index;
-    int diag_index = csr_diag_index[index];
-    int neighbor_offset = csr_row_index[index] - index;
-
-    double own_alphaEff = alphaEff[index];
-    // fvm.upper() = deltaCoeffs.primitiveField()*gammaMagSf.primitiveField();
-    // fvm.negSumDiag();
-    double sum_diag = 0;
-    // lower
-    for (int i = 0; i < diag_index; i++)
-    {
-        int neighbor_index = neighbor_offset + i;
-        int neighbor_cell_id = csr_col_index[i + row_index];
-        double w = weight[neighbor_index];
-        double nei_alphaEff = alphaEff[neighbor_cell_id];
-        double gamma = w * (nei_alphaEff - own_alphaEff) + own_alphaEff;
-        double gamma_magsf = gamma * magsf[neighbor_index];
-        double coeff = gamma_magsf * distance[neighbor_index];
-        A_csr_output[row_index + i] = A_csr_input[row_index + i] + coeff * sign;
-        sum_diag += (-coeff);
-    }
-    // upper
-    for (int i = diag_index + 1; i < row_elements; i++)
-    {
-        int neighbor_index = neighbor_offset + i - 1;
-        int neighbor_cell_id = csr_col_index[i + row_index];
-        double w = weight[neighbor_index];
-        double nei_alphaEff = alphaEff[neighbor_cell_id];
-        double gamma = w * (nei_alphaEff - own_alphaEff) + own_alphaEff;
-        double gamma_magsf = gamma * magsf[neighbor_index];
-        double coeff = gamma_magsf * distance[neighbor_index];
-        A_csr_output[row_index + i] = A_csr_input[row_index + i] + coeff * sign;
-        sum_diag += (-coeff);
+    if (pointer == nullptr) {
+        fprintf(stderr, "Warning! getFieldPointer of %s returns nullptr!\n", mergedName);
     }
-    A_csr_output[row_index + diag_index] = A_csr_input[row_index + diag_index] + sum_diag * sign; // diag
-}
-
-__global__ void eeqn_fvm_laplacian_uncorrected_boundary(int num_boundary_cells,
-                                                        const int *csr_row_index, const int *csr_diag_index,
-                                                        const int *boundary_cell_offset, const int *boundary_cell_id,
-                                                        const double *boundary_alphaEff, const double *boundary_magsf,
-                                                        const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs,
-                                                        const double sign, const double *A_csr_input, const double *b_input,
-                                                        double *A_csr_output, double *b_output)
-{
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_boundary_cells)
-        return;
-
-    int cell_offset = boundary_cell_offset[index];
-    int next_cell_offset = boundary_cell_offset[index + 1];
-    int cell_index = boundary_cell_id[cell_offset];
-
-    int row_index = csr_row_index[cell_index];
-    int diag_index = csr_diag_index[cell_index];
-    int csr_index = row_index + diag_index;
-
-    // OpenFoam code
-    // if (pvf.coupled())
-    // {
-    //     fvm.internalCoeffs()[patchi] =
-    //         pGamma*pvf.gradientInternalCoeffs(pDeltaCoeffs);
-    //     fvm.boundaryCoeffs()[patchi] =
-    //         -pGamma*pvf.gradientBoundaryCoeffs(pDeltaCoeffs);
-    // }
-    // else
-    // {
-    //     fvm.internalCoeffs()[patchi] = pGamma*pvf.gradientInternalCoeffs();
-    //     fvm.boundaryCoeffs()[patchi] = -
-    //         pGamma*pvf.gradientBoundaryCoeffs();
-    // }
-    double internal_coeffs = 0;
-    double boundary_coeffs = 0;
-    for (int i = cell_offset; i < next_cell_offset; i++)
-    {
-        double gamma = boundary_alphaEff[i];
-        double gamma_magsf = gamma * boundary_magsf[i];
-        internal_coeffs += gamma_magsf * gradient_internal_coeffs[i];
-        boundary_coeffs -= gamma_magsf * gradient_boundary_coeffs[i];
-    }
-
-    A_csr_output[csr_index] = A_csr_input[csr_index] + internal_coeffs * sign;
-    b_output[cell_index] = b_input[cell_index] + boundary_coeffs * sign;
-}
+    //fprintf(stderr, "fieldAlias: %s, mergedName: %s, pointer: %p\n", fieldAlias, mergedName, pointer);
 
-__global__ void eeqn_fvc_ddt_kernel(int num_cells, const double rdelta_t,
-                                    const double *rho_old, const double *rho_new,
-                                    const double *K_old, const double *K,
-                                    const double *volume,
-                                    const double sign, const double *b_input, double *b_output)
-{
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_cells)
-        return;
-
-    double fvc_ddt_term = rdelta_t * (rho_new[index] * K[index] - rho_old[index] * K_old[index]) * volume[index];
-    b_output[index] = b_input[index] + fvc_ddt_term * sign;
-}
-
-__global__ void eeqn_fvc_div_vector_internal(int num_cells,
-                                             const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index,
-                                             const double *sf, const double *vf, const double *tlambdas,
-                                             const double sign, const double *b_input, double *b_output)
-{
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_cells)
-        return;
-
-    // A_csr has one more element in each row: itself
-    int row_index = csr_row_index[index];
-    int row_elements = csr_row_index[index + 1] - row_index;
-    int diag_index = csr_diag_index[index];
-    int neighbor_offset = csr_row_index[index] - index;
-
-    double own_vf_x = vf[index * 3 + 0];
-    double own_vf_y = vf[index * 3 + 1];
-    double own_vf_z = vf[index * 3 + 2];
-    double sum = 0;
-    // lower
-    for (int i = 0; i < diag_index; i++)
-    {
-        int neighbor_index = neighbor_offset + i;
-        int neighbor_cell_id = csr_col_index[row_index + i];
-        double w = tlambdas[neighbor_index];
-        double sf_x = sf[neighbor_index * 3 + 0];
-        double sf_y = sf[neighbor_index * 3 + 1];
-        double sf_z = sf[neighbor_index * 3 + 2];
-        double neighbor_vf_x = vf[neighbor_cell_id * 3 + 0];
-        double neighbor_vf_y = vf[neighbor_cell_id * 3 + 1];
-        double neighbor_vf_z = vf[neighbor_cell_id * 3 + 2];
-        double face_x = (1 - w) * own_vf_x + w * neighbor_vf_x;
-        double face_y = (1 - w) * own_vf_y + w * neighbor_vf_y;
-        double face_z = (1 - w) * own_vf_z + w * neighbor_vf_z;
-        sum -= sf_x * face_x + sf_y * face_y + sf_z * face_z;
-    }
-    // upper
-    for (int i = diag_index + 1; i < row_elements; i++)
-    {
-        int neighbor_index = neighbor_offset + i - 1;
-        int neighbor_cell_id = csr_col_index[row_index + i];
-        double w = tlambdas[neighbor_index];
-        double sf_x = sf[neighbor_index * 3 + 0];
-        double sf_y = sf[neighbor_index * 3 + 1];
-        double sf_z = sf[neighbor_index * 3 + 2];
-        double neighbor_vf_x = vf[neighbor_cell_id * 3 + 0];
-        double neighbor_vf_y = vf[neighbor_cell_id * 3 + 1];
-        double neighbor_vf_z = vf[neighbor_cell_id * 3 + 2];
-        double face_x = w * own_vf_x + (1 - w) * neighbor_vf_x;
-        double face_y = w * own_vf_y + (1 - w) * neighbor_vf_y;
-        double face_z = w * own_vf_z + (1 - w) * neighbor_vf_z;
-        sum += sf_x * face_x + sf_y * face_y + sf_z * face_z;
-    }
-    b_output[index] = b_input[index] + sum * sign;
+    return pointer;
 }
 
-__global__ void eeqn_fvc_div_vector_boundary(int num_boundary_cells,
-                                             const int *boundary_cell_offset, const int *boundary_cell_id,
-                                             const double *boundary_sf, const double *boundary_vf,
-                                             const double sign, const double *b_input, double *b_output)
-{
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_boundary_cells)
-        return;
-
-    int cell_offset = boundary_cell_offset[index];
-    int next_cell_offset = boundary_cell_offset[index + 1];
-    int cell_index = boundary_cell_id[cell_offset];
-
-    // OpenFoam code
-    // Foam::surfaceInterpolationScheme<Type>::dotInterpolate
-    // if (vf.boundaryField()[pi].coupled())
-    // {
-    //     psf =
-    //         pSf
-    //         & (
-    //                 pLambda*vf.boundaryField()[pi].patchInternalField()
-    //                 + (1.0 - pLambda)*vf.boundaryField()[pi].patchNeighbourField()
-    //           );
-    // }
-    // else
-    // {
-    //     psf = pSf & vf.boundaryField()[pi];
-    // }
-    // tmp<GeometricField<Type, fvPatchField, volMesh>> surfaceIntegrate
-    // forAll(mesh.boundary()[patchi], facei)
-    // {
-    //     ivf[pFaceCells[facei]] += pssf[facei];
-    // }
-    double sum = 0;
-    for (int i = cell_offset; i < next_cell_offset; i++)
-    {
-        double sf_x = boundary_sf[i * 3 + 0];
-        double sf_y = boundary_sf[i * 3 + 1];
-        double sf_z = boundary_sf[i * 3 + 2];
-        double face_x = boundary_vf[i * 3 + 0];
-        double face_y = boundary_vf[i * 3 + 1];
-        double face_z = boundary_vf[i * 3 + 2];
-
-        // if not coupled
-        sum += (sf_x * face_x + sf_y * face_y + sf_z * face_z);
-    }
-    b_output[cell_index] = b_input[cell_index] + sum * sign;
+void dfEEqn::setConstantValues(const std::string &mode_string, const std::string &setting_path) { // parameters shadow the members of the same name, hence this->
+    this->stream = dataBase_.stream; // reuse the database's stream rather than creating a separate one
+    this->mode_string = mode_string;
+    this->setting_path = setting_path;
+    ESolver = new AmgXSolver(mode_string, setting_path, dataBase_.localRank); // raw owning pointer; NOTE(review): a second call would overwrite it without delete - confirm this runs once
 }
 
-__global__ void eeqn_fvc_div_phi_scalar_internal(int num_cells,
-                                                 const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index,
-                                                 const double *weight, const double *phi, const double *K,
-                                                 const double sign, const double *b_input, double *b_output)
-{
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_cells)
-        return;
-
-    // A_csr has one more element in each row: itself
-    int row_index = csr_row_index[index];
-    int next_row_index = csr_row_index[index + 1];
-    int diag_index = csr_diag_index[index];
-    int neighbor_offset = csr_row_index[index] - index;
-
-    double own_cell_k = K[index];
-    double interp = 0;
-    for (int i = row_index; i < next_row_index; i++)
-    {
-        int inner_index = i - row_index;
-        // lower
-        if (inner_index < diag_index)
-        {
-            int neighbor_index = neighbor_offset + inner_index;
-            double w = weight[neighbor_index];
-            double p = phi[neighbor_index];
-            int neighbor_cell_id = csr_col_index[row_index + inner_index];
-            double neighbor_cell_k = K[neighbor_cell_id];
-            double face_k = (1 - w) * own_cell_k + w * neighbor_cell_k;
-            interp -= p * face_k;
-        }
-        // upper
-        if (inner_index > diag_index)
-        {
-            int neighbor_index = neighbor_offset + inner_index - 1;
-            double w = weight[neighbor_index];
-            double p = phi[neighbor_index];
-            int neighbor_cell_id = csr_col_index[row_index + inner_index];
-            double neighbor_cell_k = K[neighbor_cell_id];
-            double face_k = w * own_cell_k + (1 - w) * neighbor_cell_k;
-            interp += p * face_k;
+void dfEEqn::setConstantFields(const std::vector<int> patch_type_he, const std::vector<int> patch_type_k) { // store per-patch boundary types and count gradientEnergy faces
+    this->patch_type_he = patch_type_he;
+    this->patch_type_k = patch_type_k;
+    num_gradientEnergy_boundary_surfaces = 0; // recount from scratch so a repeated call cannot double the total used to size the gradient buffers
+    for (int i = 0; i < dataBase_.num_patches; i++) {
+        if (patch_type_he[i] == boundaryConditions::gradientEnergy) {
+            num_gradientEnergy_boundary_surfaces += dataBase_.patch_size[i];
         }
     }
-    b_output[index] = b_input[index] + interp * sign;
 }
 
-__global__ void eeqn_fvc_div_phi_scalar_boundary(int num_boundary_cells,
-                                                 const int *boundary_cell_offset, const int *boundary_cell_id,
-                                                 const double *boundary_phi, const double *boundary_K,
-                                                 const double sign, const double *b_input, double *b_output)
-{
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_boundary_cells)
-        return;
-
-    int cell_offset = boundary_cell_offset[index];
-    int next_cell_offset = boundary_cell_offset[index + 1];
-    int cell_index = boundary_cell_id[cell_offset];
-
-    // boundary interplate
-    double interp = 0;
-    for (int i = cell_offset; i < next_cell_offset; i++)
-    {
-        interp += boundary_phi[i] * boundary_K[i];
-    }
-
-    b_output[cell_index] = b_input[cell_index] + interp * sign;
+void dfEEqn::createNonConstantFieldsInternal() { // Allocate the dpdt buffers (device + host) and the four boundary-coefficient device arrays.
+#ifndef STREAM_ALLOCATOR
+    // thermophysical fields (device buffers are allocated here only when STREAM_ALLOCATOR is not defined)
+    checkCudaErrors(cudaMalloc((void**)&d_dpdt, dataBase_.cell_value_bytes));
+    // boundary coeffs, each sized boundary_surface_value_bytes
+    checkCudaErrors(cudaMalloc((void**)&d_value_internal_coeffs, dataBase_.boundary_surface_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_value_boundary_coeffs, dataBase_.boundary_surface_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_gradient_internal_coeffs, dataBase_.boundary_surface_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_gradient_boundary_coeffs, dataBase_.boundary_surface_value_bytes));
+#endif
+    // dpdt is computed on the CPU and used on the GPU, so it is staged in page-locked host memory for the H2D copy
+    checkCudaErrors(cudaMallocHost((void**)&h_dpdt, dataBase_.cell_value_bytes));
+
+    // register h_dpdt so getFieldPointer("dpdt", cpu, internal) can return it
+    fieldPointerMap["h_dpdt"] = h_dpdt;
 }
 
-__global__ void eeqn_add_to_source_kernel(int num_cells,
-                                          const double sign_dpdt, const double *dpdt,
-                                          const double sign_diffAlphaD, const double *diffAlphaD,
-                                          const double *volume,
-                                          const double *b_input, double *b_output)
-{
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_cells)
-        return;
+void dfEEqn::createNonConstantFieldsBoundary() { // Allocate the he-gradient buffers, sized by the gradientEnergy face count.
+#ifndef STREAM_ALLOCATOR
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_heGradient, sizeof(double) * num_gradientEnergy_boundary_surfaces));
+#endif
+    // computed on the CPU and used on the GPU, so it is staged in page-locked host memory for the H2D copy
+    checkCudaErrors(cudaMallocHost((void**)&h_boundary_heGradient, sizeof(double) * num_gradientEnergy_boundary_surfaces)); // the count comes from setConstantFields and is 0 until that has run
 
-    b_output[index] = b_input[index] + sign_dpdt * dpdt[index] * volume[index] + sign_diffAlphaD * diffAlphaD[index] * volume[index];
+    // getter for h_boundary_heGradient
+    fieldPointerMap["h_boundary_heGradient"] = h_boundary_heGradient;
 }
 
-__global__ void eeqn_boundaryPermutation(const int num_boundary_faces, const int *bouPermedIndex,
-                                         const double *boundary_K_init,
-                                        //  const double *boundary_alphaEff_init, 
-                                         const double *boundary_gradient_init,
-                                         double *boundary_K,
-                                        //  double *boundary_alphaEff,
-                                         double *boundary_gradient)
-{
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_boundary_faces)
-        return;
-
-    int p = bouPermedIndex[index];
 
-    boundary_K[index] = boundary_K_init[p];
-    // boundary_alphaEff[index] = boundary_alphaEff_init[p];
-    boundary_gradient[index] = boundary_gradient_init[p];
+void dfEEqn::createNonConstantLduAndCsrFields() { // Allocate the LDU coefficient storage and, without STREAM_ALLOCATOR, the source/coeff/CSR buffers.
+    checkCudaErrors(cudaMalloc((void**)&d_ldu, dataBase_.csr_value_bytes)); // one buffer, allocated regardless of STREAM_ALLOCATOR; the four pointers below are offsets into it
+    d_lower = d_ldu; // offset 0
+    d_diag = d_ldu + dataBase_.num_surfaces; // offset num_surfaces
+    d_upper = d_ldu + dataBase_.num_cells + dataBase_.num_surfaces; // offset num_surfaces + num_cells
+    d_extern = d_ldu + dataBase_.num_cells + 2 * dataBase_.num_surfaces; // NOTE(review): assumes csr_value_bytes extends beyond num_cells + 2*num_surfaces doubles - confirm
+#ifndef STREAM_ALLOCATOR
+    checkCudaErrors(cudaMalloc((void**)&d_source, dataBase_.cell_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, dataBase_.boundary_surface_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, dataBase_.boundary_surface_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_A, dataBase_.csr_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_b, dataBase_.cell_value_bytes));
+#endif
 }
 
-__global__ void eeqn_update_BoundaryCoeffs_kernel(int num_boundary_faces, const double *boundary_phi,
-                                                  double *gradient, const double *boundary_deltaCoeffs,
-                                                  double *internal_coeffs,
-                                                  double *boundary_coeffs, double *laplac_internal_coeffs,
-                                                  double *laplac_boundary_coeffs)
+void dfEEqn::initNonConstantFields(const double *he, const double *boundary_he)
 {
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_boundary_faces)
-        return;
-
-    double grad = gradient[index];
-    // energyGradient
-    double valueInternalCoeffs = 1.;
-    double valueBoundaryCoeffs = grad / boundary_deltaCoeffs[index];
-    double gradientInternalCoeffs = 0.;
-    double gradientBoundaryCoeffs = grad;
-
-    internal_coeffs[index] = boundary_phi[index] * valueInternalCoeffs;
-    boundary_coeffs[index] = -boundary_phi[index] * valueBoundaryCoeffs;
-    laplac_internal_coeffs[index] = gradientInternalCoeffs;
-    laplac_boundary_coeffs[index] = gradientBoundaryCoeffs;
+    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_he, he, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); // upload the initial internal he field
+    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_he, boundary_he, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); // same stream as above (was the default stream) so both uploads are ordered with later work on dataBase_.stream
 }
 
-// constructor
-dfEEqn::dfEEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile)
-    : dataBase_(dataBase)
-{
-    ESolver = new AmgXSolver(modeStr, cfgFile);
-
-    stream = dataBase_.stream;
-    // checkCudaErrors(cudaEventCreate(&event));
-
-    num_cells = dataBase_.num_cells;
-    cell_bytes = dataBase_.cell_bytes;
-    num_faces = dataBase_.num_faces;
-    cell_vec_bytes = dataBase_.cell_vec_bytes;
-    csr_value_vec_bytes = dataBase_.csr_value_vec_bytes;
-    num_boundary_cells = dataBase_.num_boundary_cells;
-    num_surfaces = dataBase_.num_surfaces;
-    num_boundary_faces = dataBase_.num_boundary_faces;
-    boundary_face_bytes = dataBase_.boundary_face_bytes;
-
-    d_A_csr_row_index = dataBase_.d_A_csr_row_index;
-    d_A_csr_diag_index = dataBase_.d_A_csr_diag_index;
-    d_A_csr_col_index = dataBase_.d_A_csr_col_index;
-
-    h_A_csr = new double[(num_cells + num_faces) * 3];
-    h_b = new double[num_cells * 3];
-    cudaMallocHost(&h_he_new, cell_bytes);
-
-    checkCudaErrors(cudaMalloc((void **)&d_A_csr, csr_value_vec_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_b, cell_vec_bytes));
-
-    checkCudaErrors(cudaMalloc((void **)&d_he_old, cell_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_K, cell_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_K_old, cell_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_alphaEff, cell_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_dpdt, cell_bytes));
-
-    checkCudaErrors(cudaMalloc((void **)&d_boundary_K_init, boundary_face_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_boundary_K, boundary_face_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_boundary_alphaEff_init, boundary_face_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_boundary_alphaEff, boundary_face_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_boundary_gradient_init, boundary_face_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_boundary_gradient, boundary_face_bytes));
-
-    checkCudaErrors(cudaMalloc((void **)&d_value_internal_coeffs_init, boundary_face_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_value_boundary_coeffs_init, boundary_face_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_gradient_internal_coeffs_init, boundary_face_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_gradient_boundary_coeffs_init, boundary_face_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_value_internal_coeffs, boundary_face_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_value_boundary_coeffs, boundary_face_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_gradient_internal_coeffs, boundary_face_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_gradient_boundary_coeffs, boundary_face_bytes));
-}
-
-void dfEEqn::prepare_data(const double *he_old, const double *K, const double *K_old, const double *alphaEff,
-                          const double *dpdt, const double *boundary_K, const double *boundary_alphaEff, const double *boundary_gradient)
-{
-    // TODO not real async copy now, because some host array are not in pinned memory.
-
-    // copy the host input array in host memory to the device input array in device memory
-    checkCudaErrors(cudaMemcpyAsync(d_he_old, he_old, cell_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_K, K, cell_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_K_old, K_old, cell_bytes, cudaMemcpyHostToDevice, stream));
-    // checkCudaErrors(cudaMemcpyAsync(d_alphaEff, alphaEff, cell_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_dpdt, dpdt, cell_bytes, cudaMemcpyHostToDevice, stream));
-
-    // copy and permutate boundary variable
-    checkCudaErrors(cudaMemcpyAsync(d_boundary_K_init, boundary_K, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
-    // checkCudaErrors(cudaMemcpyAsync(d_boundary_alphaEff_init, boundary_alphaEff, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(d_boundary_gradient_init, boundary_gradient, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
-
-    // UnityLewis
-    d_alphaEff = dataBase_.d_alpha;
-    d_boundary_alphaEff = dataBase_.d_boundary_alpha;
-
-    size_t threads_per_block = 1024;
-    size_t blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block;
-    eeqn_boundaryPermutation<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_faces, dataBase_.d_bouPermedIndex,
-                                                                                d_boundary_K_init, d_boundary_gradient_init,
-                                                                                d_boundary_K, d_boundary_gradient);
+void dfEEqn::cleanCudaResources() { // Destroys the CUDA graphs recorded by process(); without USE_GRAPH the body is empty.
+#ifdef USE_GRAPH
+    if (pre_graph_created) { // only destroy the assembly graph if it was actually captured and instantiated
+        checkCudaErrors(cudaGraphExecDestroy(graph_instance_pre));
+        checkCudaErrors(cudaGraphDestroy(graph_pre));
+    } // NOTE(review): pre_graph_created is not reset, so calling this twice would destroy the same handles again
+    if (post_graph_created) { // same for the post-processing graph
+        checkCudaErrors(cudaGraphExecDestroy(graph_instance_post));
+        checkCudaErrors(cudaGraphDestroy(graph_post));
+    } // NOTE(review): post_graph_created is not reset either
+#endif
 }
 
-void dfEEqn::initializeTimeStep()
+void dfEEqn::preProcess(const double *h_he, const double *h_k, const double *h_k_old, const double *h_dpdt, const double *h_boundary_k, const double *h_boundary_heGradient) // Empty stub: none of the host pointers are read.
 {
-    // initialize matrix value
-    checkCudaErrors(cudaMemsetAsync(d_A_csr, 0, csr_value_vec_bytes, stream));
-    checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_vec_bytes, stream));
-    // initialize boundary coeffs
-    size_t threads_per_block = 1024;
-    size_t blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block;
-    eeqn_update_BoundaryCoeffs_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(dataBase_.num_boundary_faces, dataBase_.d_boundary_phi,
-                                                                                         d_boundary_gradient, dataBase_.d_boundary_deltaCoeffs,
-                                                                                         d_value_internal_coeffs, d_value_boundary_coeffs,
-                                                                                         d_gradient_internal_coeffs, d_gradient_boundary_coeffs);
 }
 
-void dfEEqn::fvm_ddt()
-{
-    size_t threads_per_block = 1024;
-    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    eeqn_fvm_ddt_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, dataBase_.rdelta_t,
-                                                                           d_A_csr_row_index, d_A_csr_diag_index,
-                                                                           dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, d_he_old,
-                                                                           1., d_A_csr, d_b, d_A_csr, d_b);
+void dfEEqn::process() { // Assembles the he equation on dataBase_.stream, solves it, re-applies the he boundary conditions and finally waits for the stream.
+    TICK_INIT_EVENT;
+    TICK_START_EVENT;
+#ifdef USE_GRAPH
+    if(!pre_graph_created) { // first call only: the assembly work below is recorded into graph_pre via stream capture
+        DEBUG_TRACE;
+        checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal));
+#endif
+    // With STREAM_ALLOCATOR the work buffers are allocated on the stream here and freed again in the post-processing stage below.
+#ifdef STREAM_ALLOCATOR
+    // thermophysical fields
+    checkCudaErrors(cudaMallocAsync((void**)&d_dpdt, dataBase_.cell_value_bytes, dataBase_.stream));
+    // fiv weight fields
+    //checkCudaErrors(cudaMallocAsync((void**)&d_phi_special_weight, dataBase_.cell_value_bytes, dataBase_.stream));
+    // boundary coeffs
+    checkCudaErrors(cudaMallocAsync((void**)&d_value_internal_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream));
+    checkCudaErrors(cudaMallocAsync((void**)&d_value_boundary_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream));
+    checkCudaErrors(cudaMallocAsync((void**)&d_gradient_internal_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream));
+    checkCudaErrors(cudaMallocAsync((void**)&d_gradient_boundary_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream));
+ 
+    checkCudaErrors(cudaMallocAsync((void**)&d_boundary_heGradient, sizeof(double) * num_gradientEnergy_boundary_surfaces, dataBase_.stream));
+
+    checkCudaErrors(cudaMallocAsync((void**)&d_source, dataBase_.cell_value_bytes, dataBase_.stream));
+    checkCudaErrors(cudaMallocAsync((void**)&d_internal_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream));
+    checkCudaErrors(cudaMallocAsync((void**)&d_boundary_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream));
+    checkCudaErrors(cudaMallocAsync((void**)&d_A, dataBase_.csr_value_bytes, dataBase_.stream));
+    checkCudaErrors(cudaMallocAsync((void**)&d_b, dataBase_.cell_value_bytes, dataBase_.stream));
+#endif
+    // checkCudaErrors(cudaMemcpyAsync(dataBase_.d_k, dataBase_.h_k, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream));
+    // checkCudaErrors(cudaMemcpyAsync(dataBase_.d_k_old, dataBase_.h_k_old, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream));
+    // checkCudaErrors(cudaMemcpyAsync(d_dpdt, h_dpdt, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream));
+    // checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_k, dataBase_.h_boundary_k, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream));
+    // Zero the LDU/CSR matrix storage, the source/rhs vectors and the combined boundary coefficients before assembly.
+    checkCudaErrors(cudaMemsetAsync(d_ldu, 0, dataBase_.csr_value_bytes, dataBase_.stream)); // d_ldu contains d_lower, d_diag, and d_upper
+    checkCudaErrors(cudaMemsetAsync(d_source, 0, dataBase_.cell_value_bytes, dataBase_.stream));
+    checkCudaErrors(cudaMemsetAsync(d_internal_coeffs, 0, dataBase_.boundary_surface_value_bytes, dataBase_.stream));
+    checkCudaErrors(cudaMemsetAsync(d_boundary_coeffs, 0, dataBase_.boundary_surface_value_bytes, dataBase_.stream));
+    checkCudaErrors(cudaMemsetAsync(d_A, 0, dataBase_.csr_value_bytes, dataBase_.stream));
+    checkCudaErrors(cudaMemsetAsync(d_b, 0, dataBase_.cell_value_bytes, dataBase_.stream));
+    // After the three boundary-related calls, the operator calls presumably mirror the CPU EEqn built under DEBUG_ in EEqn.H term by term — TODO confirm.
+    eeqn_calculate_energy_gradient(thermo_, dataBase_.num_cells, dataBase_.num_species, dataBase_.num_boundary_surfaces, 
+            dataBase_.d_boundary_face_cell, dataBase_.d_T, dataBase_.d_p, dataBase_.d_y,
+            dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_he.data(),
+            dataBase_.d_boundary_delta_coeffs, dataBase_.d_boundary_p, dataBase_.d_boundary_y,
+            d_boundary_heGradient);
+    correct_boundary_conditions_scalar(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(),
+            dataBase_.num_boundary_surfaces, dataBase_.num_patches, dataBase_.patch_size.data(),
+            patch_type_he.data(), dataBase_.d_boundary_delta_coeffs, dataBase_.d_boundary_face_cell,
+            dataBase_.d_he, dataBase_.d_boundary_he, dataBase_.cyclicNeighbor.data(), 
+            dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_weight,
+            dataBase_.d_boundary_T, dataBase_.d_boundary_y, d_boundary_heGradient, &thermo_);
+    update_boundary_coeffs_scalar(dataBase_.stream,
+            dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_he.data(),
+            dataBase_.d_boundary_delta_coeffs, dataBase_.d_boundary_he, dataBase_.d_boundary_weight,
+            d_value_internal_coeffs, d_value_boundary_coeffs,
+            d_gradient_internal_coeffs, d_gradient_boundary_coeffs, d_boundary_heGradient);
+    fvm_ddt_vol_scalar_vol_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.rdelta_t, dataBase_.d_rho, dataBase_.d_rho_old, 
+            dataBase_.d_he, dataBase_.d_volume, d_diag, d_source);
+    // NOTE: fvm_div_scalar uses d_phi_weight, which is computed in YEqn_GPU by compute_upwind_weight(),
+    // so YEqn_GPU has to run first (the original note said "before UEqn_GPU"; presumably the same holds for this equation — TODO confirm)
+    fvm_div_scalar(dataBase_.stream, dataBase_.num_surfaces, dataBase_.d_owner, dataBase_.d_neighbor,
+            dataBase_.d_phi, dataBase_.d_phi_weight,
+            d_lower, d_upper, d_diag, // end for internal
+            dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_he.data(),
+            dataBase_.d_boundary_phi,
+            d_value_internal_coeffs, d_value_boundary_coeffs,
+            d_internal_coeffs, d_boundary_coeffs, 1.);
+    fvc_ddt_vol_scalar_vol_scalar(dataBase_.stream, dataBase_.num_cells,
+            dataBase_.rdelta_t, dataBase_.d_rho, dataBase_.d_rho_old, dataBase_.d_k,
+            dataBase_.d_k_old, dataBase_.d_volume, d_source, -1.); // sign -1: in the CPU equation this K term sits left of "==", so it is subtracted from the source
+    fvc_div_surface_scalar_vol_scalar(dataBase_.stream, dataBase_.num_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, dataBase_.d_weight, 
+            dataBase_.d_k, dataBase_.d_phi, d_source, // end for internal
+            dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_k.data(), 
+            dataBase_.d_boundary_face_cell, dataBase_.d_boundary_k, dataBase_.d_boundary_phi, -1); // uses the K patch types, not the he ones
+    fvm_laplacian_scalar(dataBase_.stream, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.d_owner, dataBase_.d_neighbor,
+            dataBase_.d_weight, dataBase_.d_mag_sf, dataBase_.d_delta_coeffs, dataBase_.d_thermo_alpha, 
+            d_lower, d_upper, d_diag, // end for internal
+            dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_he.data(), dataBase_.d_boundary_mag_sf, dataBase_.d_boundary_thermo_alpha,
+            d_gradient_internal_coeffs, d_gradient_boundary_coeffs, d_internal_coeffs, d_boundary_coeffs, -1);
+    fvc_div_cell_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, 
+            dataBase_.d_owner, dataBase_.d_neighbor, 
+            dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_hDiff_corr_flux, d_source,
+            dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_he.data(), dataBase_.d_boundary_face_cell,
+            dataBase_.d_boundary_weight, dataBase_.d_boundary_hDiff_corr_flux, dataBase_.d_boundary_sf, dataBase_.d_volume);
+    fvc_to_source_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.d_volume, dataBase_.d_dpdt, d_source); // reads dataBase_.d_dpdt, not the member d_dpdt allocated above
+    fvc_to_source_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.d_volume, dataBase_.d_diff_alphaD, d_source, -1);
+#ifndef DEBUG_CHECK_LDU
+    ldu_to_csr_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces,
+            dataBase_.num_Nz, dataBase_.d_boundary_face_cell, dataBase_.d_ldu_to_csr_index, dataBase_.num_patches,
+            dataBase_.patch_size.data(), patch_type_he.data(), d_ldu, d_source, d_internal_coeffs, d_boundary_coeffs, d_A);
+#endif
+#ifdef USE_GRAPH
+        checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_pre));
+        checkCudaErrors(cudaGraphInstantiate(&graph_instance_pre, graph_pre, NULL, NULL, 0));
+        pre_graph_created = true;
+    }
+    DEBUG_TRACE;
+    checkCudaErrors(cudaGraphLaunch(graph_instance_pre, dataBase_.stream)); // every call, including the first, launches the recorded assembly graph
+#endif
+    TICK_END_EVENT(EEqn assembly);
+
+    TICK_START_EVENT;
+#ifndef DEBUG_CHECK_LDU
+    solve(); // skipped under DEBUG_CHECK_LDU, where no CSR matrix is built either
+#endif
+    TICK_END_EVENT(EEqn solve);
+
+#ifdef USE_GRAPH
+    if(!post_graph_created) { // first call only: record the post-processing work into graph_post
+        checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal));
+#endif
+
+        TICK_START_EVENT;
+        correct_boundary_conditions_scalar(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(),
+                dataBase_.num_boundary_surfaces, dataBase_.num_patches, dataBase_.patch_size.data(),
+                patch_type_he.data(), dataBase_.d_boundary_delta_coeffs, dataBase_.d_boundary_face_cell,
+                dataBase_.d_he, dataBase_.d_boundary_he, dataBase_.cyclicNeighbor.data(), 
+                dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_weight,
+                dataBase_.d_boundary_T, dataBase_.d_boundary_y, d_boundary_heGradient, &thermo_);
+        TICK_END_EVENT(EEqn post process correctBC);
+
+        TICK_START_EVENT;
+        // copy he to host (currently disabled: both copies below are commented out)
+        // checkCudaErrors(cudaMemcpyAsync(dataBase_.h_he, dataBase_.d_he, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream));
+        // checkCudaErrors(cudaMemcpyAsync(dataBase_.h_boundary_he, dataBase_.d_boundary_he, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream));
+        TICK_END_EVENT(EEqn post process copy back);
+
+        TICK_START_EVENT;
+#ifdef STREAM_ALLOCATOR
+        // thermophysical fields
+        checkCudaErrors(cudaFreeAsync(d_dpdt, dataBase_.stream));
+        // weight fields (d_phi_special_weight is currently not allocated, so its free stays disabled)
+        //checkCudaErrors(cudaFreeAsync(d_phi_special_weight, dataBase_.stream));
+        // boundary coeffs
+        checkCudaErrors(cudaFreeAsync(d_value_internal_coeffs, dataBase_.stream));
+        checkCudaErrors(cudaFreeAsync(d_value_boundary_coeffs, dataBase_.stream));
+        checkCudaErrors(cudaFreeAsync(d_gradient_internal_coeffs, dataBase_.stream));
+        checkCudaErrors(cudaFreeAsync(d_gradient_boundary_coeffs, dataBase_.stream));
+     
+        checkCudaErrors(cudaFreeAsync(d_boundary_heGradient, dataBase_.stream));
+    
+        checkCudaErrors(cudaFreeAsync(d_source, dataBase_.stream));
+        checkCudaErrors(cudaFreeAsync(d_internal_coeffs, dataBase_.stream));
+        checkCudaErrors(cudaFreeAsync(d_boundary_coeffs, dataBase_.stream));
+        checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream));
+        checkCudaErrors(cudaFreeAsync(d_b, dataBase_.stream));
+#endif
+        TICK_END_EVENT(EEqn post process free);
+#ifdef USE_GRAPH
+        checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_post));
+        checkCudaErrors(cudaGraphInstantiate(&graph_instance_post, graph_post, NULL, NULL, 0));
+        post_graph_created = true;
+    }
+    checkCudaErrors(cudaGraphLaunch(graph_instance_post, dataBase_.stream));
+#endif
+    sync(); // block the host until everything queued on dataBase_.stream has finished
 }
 
-void dfEEqn::fvm_div()
+void dfEEqn::eeqn_calculate_energy_gradient(dfThermo& GPUThermo, int num_cells, int num_species, 
+        int num_boundary_surfaces, const int *face2Cells, double *T, double *p, double *y,
+        int num_patches, const int *patch_size, const int *patch_type,
+        const double *boundary_delta_coeffs, const double *boundary_p, const double* boundary_y, 
+        double *boundary_thermo_gradient)
 {
-    size_t threads_per_block = 1024;
-    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    eeqn_fvm_div_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
-                                                                             d_A_csr_row_index, d_A_csr_diag_index,
-                                                                             dataBase_.d_weight, dataBase_.d_phi,
-                                                                             1., d_A_csr, d_b, d_A_csr, d_b);
-    blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
-    eeqn_fvm_div_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_cells,
-                                                                             d_A_csr_row_index, d_A_csr_diag_index,
-                                                                             dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
-                                                                             d_value_internal_coeffs, d_value_boundary_coeffs,
-                                                                             1., d_A_csr, d_b, d_A_csr, d_b);
+    // Calls GPUThermo.calculateEnergyGradient once per non-empty gradientEnergy patch, handing it that patch's two offsets.
+    int boundaryStart = 0, gradientStart = 0; // patch start in the full boundary arrays / among the gradientEnergy faces only
+    for (int patch = 0; patch < num_patches; ++patch) {
+        const int faces = patch_size[patch];
+        if (faces == 0) continue; // empty patches advance neither offset
+        const int type = patch_type[patch];
+        if (type == boundaryConditions::gradientEnergy) {
+            GPUThermo.calculateEnergyGradient(faces, num_cells, num_species, num_boundary_surfaces, boundaryStart, gradientStart,
+                    face2Cells, T, p, y, boundary_delta_coeffs, boundary_p, boundary_y, boundary_thermo_gradient);
+            gradientStart += faces;
+        }
+        // processor and processorCyclic patches advance the boundary offset by twice their face count
+        const bool doubled = (type == boundaryConditions::processor || type == boundaryConditions::processorCyclic);
+        boundaryStart += doubled ? 2 * faces : faces;
+    }
 }
 
-void dfEEqn::fvm_laplacian()
+// #if defined DEBUG_
+void dfEEqn::compareResult(const double *lower, const double *upper, const double *diag, 
+        const double *source, const double *internal_coeffs, const double *boundary_coeffs, bool printFlag) // Debug check of the GPU LDU data against the given host arrays.
 {
-    size_t threads_per_block = 1024;
-    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    eeqn_fvm_laplacian_uncorrected_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
-                                                                                               d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, d_alphaEff, dataBase_.d_weight,
-                                                                                               dataBase_.d_face, dataBase_.d_deltaCoeffs,
-                                                                                               -1., d_A_csr, d_A_csr);
-    blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
-    eeqn_fvm_laplacian_uncorrected_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_cells,
-                                                                                               d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
-                                                                                               d_boundary_alphaEff, dataBase_.d_boundary_face, d_gradient_internal_coeffs, d_gradient_boundary_coeffs,
-                                                                                               -1., d_A_csr, d_b, d_A_csr, d_b);
+    DEBUG_TRACE;
+    // lower: num_surfaces values copied back from d_lower
+    std::vector<double> gpuLower(dataBase_.num_surfaces);
+    checkCudaErrors(cudaMemcpy(gpuLower.data(), d_lower, dataBase_.surface_value_bytes, cudaMemcpyDeviceToHost));
+    fprintf(stderr, "check h_lower\n");
+    checkVectorEqual(dataBase_.num_surfaces, lower, gpuLower.data(), 1e-14, printFlag);
+    DEBUG_TRACE;
+
+    // upper: num_surfaces values copied back from d_upper
+    std::vector<double> gpuUpper(dataBase_.num_surfaces);
+    checkCudaErrors(cudaMemcpy(gpuUpper.data(), d_upper, dataBase_.surface_value_bytes, cudaMemcpyDeviceToHost));
+    fprintf(stderr, "check h_upper\n");
+    checkVectorEqual(dataBase_.num_surfaces, upper, gpuUpper.data(), 1e-14, printFlag);
+    DEBUG_TRACE;
+
+    // diag: num_cells values copied back from d_diag
+    std::vector<double> gpuDiag(dataBase_.num_cells);
+    checkCudaErrors(cudaMemcpy(gpuDiag.data(), d_diag, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost));
+    fprintf(stderr, "check h_diag\n");
+    checkVectorEqual(dataBase_.num_cells, diag, gpuDiag.data(), 1e-14, printFlag);
+    DEBUG_TRACE;
+
+    // source: num_cells values copied back from d_source
+    std::vector<double> gpuSource(dataBase_.num_cells);
+    checkCudaErrors(cudaMemcpy(gpuSource.data(), d_source, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost));
+    fprintf(stderr, "check h_source\n");
+    checkVectorEqual(dataBase_.num_cells, source, gpuSource.data(), 1e-14, printFlag);
+    DEBUG_TRACE;
+
+    // internal coeffs: num_boundary_surfaces values copied back from d_internal_coeffs
+    std::vector<double> gpuInternalCoeffs(dataBase_.num_boundary_surfaces);
+    checkCudaErrors(cudaMemcpy(gpuInternalCoeffs.data(), d_internal_coeffs, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost));
+    fprintf(stderr, "check h_internal_coeffs\n");
+    checkVectorEqual(dataBase_.num_boundary_surfaces, internal_coeffs, gpuInternalCoeffs.data(), 1e-14, printFlag);
+    DEBUG_TRACE;
+
+    // boundary coeffs: num_boundary_surfaces values copied back from d_boundary_coeffs
+    std::vector<double> gpuBoundaryCoeffs(dataBase_.num_boundary_surfaces);
+    checkCudaErrors(cudaMemcpy(gpuBoundaryCoeffs.data(), d_boundary_coeffs, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost));
+    fprintf(stderr, "check h_boundary_coeffs\n");
+    checkVectorEqual(dataBase_.num_boundary_surfaces, boundary_coeffs, gpuBoundaryCoeffs.data(), 1e-14, printFlag);
+    DEBUG_TRACE;
 }
 
-void dfEEqn::fvc_ddt()
+void dfEEqn::compareHe(const double *he, const double *boundary_he, bool printFlag) // Debug check: copies d_he and d_boundary_he to the host and compares them with he / boundary_he (tolerance 1e-14).
 {
-    // " + fvc::ddt(rho，K)" is on the left side of "==", thus should minus from source.
-    size_t threads_per_block = 1024;
-    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    eeqn_fvc_ddt_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, dataBase_.rdelta_t,
-                                                                           dataBase_.d_rho_old, dataBase_.d_rho_new, d_K_old, d_K, dataBase_.d_volume,
-                                                                           -1., d_b, d_b);
-}
+    std::vector<double> h_he(dataBase_.num_cells); // vectors instead of new[]: the staging buffers are released on return (they used to leak)
+    std::vector<double> h_boundary_he(dataBase_.num_boundary_surfaces);
 
-void dfEEqn::fvc_div_vector()
-{
-    // " + fvc::div(hDiffCorrFlux)" is on the right side of "==", thus should add to source.
-    size_t threads_per_block = 512;
-    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    eeqn_fvc_div_vector_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
-                                                                                    d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
-                                                                                    dataBase_.d_face_vector, dataBase_.d_hDiffCorrFlux, dataBase_.d_weight,
-                                                                                    1., d_b, d_b);
-    blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
-    eeqn_fvc_div_vector_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_cells,
-                                                                                    dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
-                                                                                    dataBase_.d_boundary_face_vector, dataBase_.d_boundary_hDiffCorrFlux,
-                                                                                    1., d_b, d_b);
-}
+    checkCudaErrors(cudaMemcpy(h_he.data(), dataBase_.d_he, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost)); // blocking device-to-host copies
+    checkCudaErrors(cudaMemcpy(h_boundary_he.data(), dataBase_.d_boundary_he, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost));
 
-void dfEEqn::fvc_div_phi_scalar()
-{
-    // " + fvc::div(phi，K)" is on the left side of "==", thus should minus from source.
-    size_t threads_per_block = 1024;
-    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    eeqn_fvc_div_phi_scalar_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
-                                                                                        d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
-                                                                                        dataBase_.d_weight, dataBase_.d_phi, d_K,
-                                                                                        -1., d_b, d_b);
-    blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
-    eeqn_fvc_div_phi_scalar_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_cells,
-                                                                                        dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
-                                                                                        dataBase_.d_boundary_phi, d_boundary_K,
-                                                                                        -1., d_b, d_b);
+    fprintf(stderr, "check h_he\n");
+    checkVectorEqual(dataBase_.num_cells, he, h_he.data(), 1e-14, printFlag); // he / boundary_he are the reference ("cpu data") side
+    fprintf(stderr, "check h_boundary_he\n");
+    checkVectorEqual(dataBase_.num_boundary_surfaces, boundary_he, h_boundary_he.data(), 1e-14, printFlag);
 }
+// #endif
 
-void dfEEqn::add_to_source()
+void dfEEqn::sync() // Blocks the calling host thread until all work queued on dataBase_.stream has completed.
 {
-    // " - dpdt" is on the left side of "==", thus should add to source.
-    // "+ diffAlphaD" is on the left side of "==", thus should minus from source.
-    size_t threads_per_block = 1024;
-    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    // " + fvc::ddt(rho，K)" is on the left side of "==", thus should minus from source.
-    eeqn_add_to_source_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
-                                                                                 // 1., d_dpdt, -1., d_diffAlphaD, dataBase_.d_volume, d_b, d_b);
-                                                                                 1., d_dpdt, -1., dataBase_.d_diffAlphaD, dataBase_.d_volume, d_b, d_b);
-}
-
-void dfEEqn::checkValue(bool print)
-{
-    checkCudaErrors(cudaMemcpyAsync(h_A_csr, d_A_csr, (num_faces + num_cells) * sizeof(double), cudaMemcpyDeviceToHost, stream));
-    checkCudaErrors(cudaMemcpyAsync(h_b, d_b, num_cells * sizeof(double), cudaMemcpyDeviceToHost, stream));
-
-    // Synchronize stream
-    checkCudaErrors(cudaStreamSynchronize(stream));
-    if (print)
-    {
-        for (int i = 0; i < (num_faces + num_cells); i++)
-            fprintf(stderr, "h_A_csr[%d]: %.16lf\n", i, h_A_csr[i]);
-        for (int i = 0; i < num_cells; i++)
-            fprintf(stderr, "h_b[%d]: %.16lf\n", i, h_b[i]);
-    }
-
-    char *input_file = "of_output_E.txt";
-    FILE *fp = fopen(input_file, "rb+");
-    if (fp == NULL)
-    {
-        fprintf(stderr, "Failed to open input file: %s!\n", input_file);
-    }
-    int readfile = 0;
-    double *of_b = new double[num_cells];
-    double *of_A = new double[num_faces + num_cells];
-    readfile = fread(of_b, num_cells * sizeof(double), 1, fp);
-    readfile = fread(of_A, (num_faces + num_cells) * sizeof(double), 1, fp);
-
-    std::vector<double> h_A_of_init_vec(num_cells + num_faces);
-    std::copy(of_A, of_A + num_cells + num_faces, h_A_of_init_vec.begin());
-
-    std::vector<double> h_A_of_vec_1mtx(num_faces + num_cells, 0);
-    for (int i = 0; i < num_faces + num_cells; i++)
-    {
-        h_A_of_vec_1mtx[i] = h_A_of_init_vec[dataBase_.tmpPermutatedList[i]];
-    }
-
-    // b
-    std::vector<double> h_b_of_vec(num_cells);
-    std::copy(of_b, of_b + num_cells, h_b_of_vec.begin());
-
-    if (print)
-    {
-        for (int i = 0; i < (num_faces + num_cells); i++)
-            printf("h_A_of_vec_1mtx[%d]: %.16lf\n", i, h_A_of_vec_1mtx[i]);
-        for (int i = 0; i < num_cells; i++)
-            printf("h_b_of_vec[%d]: %.16lf\n", i, h_b_of_vec[i]);
-    }
-
-    // check
-    fprintf(stderr, "check of h_A_csr\n");
-    checkVectorEqual(num_faces + num_cells, h_A_of_vec_1mtx.data(), h_A_csr, 1e-6);
-    fprintf(stderr, "check of h_b\n");
-    checkVectorEqual(num_cells, h_b_of_vec.data(), h_b, 1e-6);
+    checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); // the stream now lives in the shared dataBase_ rather than in dfEEqn
 }
 
 void dfEEqn::solve()
 {
-    checkCudaErrors(cudaStreamSynchronize(stream));
-    // nvtxRangePush("solve");
-
-    int nNz = num_cells + num_faces; // matrix entries
-    if (num_iteration == 0)          // first interation
-    {
-        printf("Initializing AmgX Linear Solver\n");
-        ESolver->setOperator(num_cells, nNz, d_A_csr_row_index, d_A_csr_col_index, d_A_csr);
-    }
-    else
-    {
-        ESolver->updateOperator(num_cells, nNz, d_A_csr);
-    }
-    ESolver->solve(num_cells, d_he_old, d_b);
+    dataBase_.solve(num_iteration, AMGXSetting::u_setting, d_A, dataBase_.d_he, d_source); // delegates to the shared solver with matrix d_A, field d_he and d_source as right-hand side; presumably d_he is updated in place — TODO confirm
     num_iteration++;
-
-    checkCudaErrors(cudaMemcpyAsync(h_he_new, d_he_old, cell_bytes, cudaMemcpyDeviceToHost, stream));
-    // checkCudaErrors(cudaEventRecord(event, stream));
-    //  checkCudaErrors(cudaStreamSynchronize(stream));
-    //  for (size_t i = 0; i < num_cells; i++)
-    //      fprintf(stderr, "h_he_after[%d]: %.16lf\n", i, h_he_new[i]);
 }
 
-void dfEEqn::sync()
-{
-    checkCudaErrors(cudaStreamSynchronize(stream));
-}
-
-void dfEEqn::updatePsi(double *Psi)
-{
-    checkCudaErrors(cudaStreamSynchronize(stream));
-    // checkCudaErrors(cudaEventSynchronize(event));
-    memcpy(Psi, h_he_new, cell_bytes);
-}
-
-dfEEqn::~dfEEqn()
-{
-    delete h_A_csr;
-    delete h_b;
-
-    checkCudaErrors(cudaFreeHost(h_he_new));
-
-    checkCudaErrors(cudaFree(d_A_csr));
-    checkCudaErrors(cudaFree(d_b));
-
-    checkCudaErrors(cudaFree(d_he_old));
-    checkCudaErrors(cudaFree(d_K));
-    checkCudaErrors(cudaFree(d_K_old));
-    // checkCudaErrors(cudaFree(d_alphaEff));
-    checkCudaErrors(cudaFree(d_dpdt));
-
-    checkCudaErrors(cudaFree(d_boundary_K_init));
-    checkCudaErrors(cudaFree(d_boundary_K));
-    checkCudaErrors(cudaFree(d_boundary_alphaEff_init));
-    checkCudaErrors(cudaFree(d_boundary_alphaEff));
-    checkCudaErrors(cudaFree(d_boundary_gradient_init));
-    checkCudaErrors(cudaFree(d_boundary_gradient));
-
-    checkCudaErrors(cudaFree(d_value_internal_coeffs_init));
-    checkCudaErrors(cudaFree(d_value_boundary_coeffs_init));
-    checkCudaErrors(cudaFree(d_gradient_internal_coeffs_init));
-    checkCudaErrors(cudaFree(d_gradient_boundary_coeffs_init));
-    checkCudaErrors(cudaFree(d_value_internal_coeffs));
-    checkCudaErrors(cudaFree(d_value_boundary_coeffs));
-    checkCudaErrors(cudaFree(d_gradient_internal_coeffs));
-    checkCudaErrors(cudaFree(d_gradient_boundary_coeffs));
-
-    // checkCudaErrors(cudaEventDestroy(event));
-}
+void dfEEqn::postProcess(double *h_he, double *h_boundary_he) {} // Empty stub: neither output pointer is written.
diff --git a/src_gpu/dfMatrixDataBase.H b/src_gpu/dfMatrixDataBase.H
index 8efb4bf62..13d6fd39f 100644
--- a/src_gpu/dfMatrixDataBase.H
+++ b/src_gpu/dfMatrixDataBase.H
@@ -4,6 +4,8 @@
 #include <unistd.h>
 #include "cuda_profiler_api.h"
 #include <cuda_runtime.h>
+#include "nccl.h"
+#include "mpi.h"
 #include "nvtx3/nvToolsExt.h"
 #include <vector>
 #include <numeric>
@@ -12,7 +14,24 @@
 #include <iostream>
 #include <ctime>
 #include <cmath>
+#include <unordered_map>
 
+#include "AmgXSolver.H"
+#include <amgx_c.h>
+
+//#define DEBUG_            // enable to compile the CPU reference path guarded by `#if defined DEBUG_` (see EEqn.H)
+//#define DEBUG_CHECK_LDU   // enable to skip the LDU-to-CSR conversion and the solve in dfEEqn::process()
+// Declared here, defined in another translation unit; presumably this process's MPI rank. Used to tag and filter debug output.
+extern int myRank;
+// GPU_DEBUG_ is defined unconditionally, so DEBUG_TRACE always prints "myRank[r] <file> <line>" to stderr. The macro body already ends in ';'.
+#define GPU_DEBUG_
+#ifdef GPU_DEBUG_
+   #define DEBUG_TRACE fprintf(stderr, "myRank[%d] %s %d\n", myRank, __FILE__, __LINE__);
+#else
+   #define DEBUG_TRACE
+#endif
+// Machine epsilon of double.
+const double SMALL = std::numeric_limits<double>::epsilon();
 
 static const char *_cudaGetErrorEnum(cudaError_t error) {
   return cudaGetErrorName(error);
@@ -30,612 +49,277 @@ void check(T result, char const *const func, const char *const file,
 
 #define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
 
-inline void checkVectorEqual(int count, const double* basevec, double* vec, double max_relative_error) {
-    for (size_t i = 0; i < count; ++i)
+inline void checkVectorEqual(int count, const double* basevec, const double* vec, double max_relative_error, bool print = false) {
+    for (int i = 0; i < count; ++i) // element-wise comparison; basevec is printed as "cpu data", vec as "gpu data"
     {
         double abs_diff = fabs(basevec[i] - vec[i]);
         double rel_diff = fabs(basevec[i] - vec[i]) / fabs(basevec[i]);
+        if (print && myRank == 0) // verbose mode: dump every entry, on rank 0 only
+            fprintf(stderr, "myRank[%d] index %d, cpu data: %.24lf, gpu data: %.24lf, relative error: %.24lf\n",
+                    myRank, i, basevec[i], vec[i], rel_diff);
         // if (abs_diff > 1e-12 && rel_diff > max_relative_error && !std::isinf(rel_diff))
-        if (abs_diff > 1e-15 && rel_diff > max_relative_error)
-            fprintf(stderr, "mismatch index %d, cpu data: %.16lf, gpu data: %.16lf, relative error: %.16lf\n", i, basevec[i], vec[i], rel_diff);
-    }
+        if (abs_diff > 1e-15 && rel_diff > max_relative_error && myRank == 0) // report only when both the absolute and the relative gap exceed their thresholds
+            fprintf(stderr, "myRank[%d] mismatch index %d, cpu data: %.30lf, gpu data: %.30lf, relative error: %.16lf\n", myRank, i, basevec[i], vec[i], rel_diff);
+    }
 }
 
+enum AMGXSetting { // NOTE(review): presumably selects between u_setting_solver and p_setting_solver of dfMatrixDataBase - confirm
+    u_setting,
+    p_setting
+};
+
+enum location { // selector passed to getFieldPointer, written as location::cpu at call sites
+    cpu,
+    gpu
+};
+
+enum position { // selector passed to getFieldPointer: internal values vs. boundary values of a field
+    internal,
+    boundary
+};
+
 enum boundaryConditions{
     zeroGradient,
     fixedValue,
     coupled,
-    empty
+    empty,
+    gradientEnergy, // NOTE(review): new kinds are appended after the original four, so those keep their numeric values
+    calculated,
+    cyclic,
+    processor,
+    extrapolated,
+    fixedEnergy,
+    processorCyclic
 };
 
-void constructBoundarySelector(std::vector<int>& patchTypeSelector, const std::string& patchTypeStr, const int patchSize);
+void constructBoundarySelectorPerPatch(int *patchTypeSelector, const std::string& patchTypeStr); // NOTE(review): presumably stores the boundaryConditions value named by patchTypeStr into *patchTypeSelector - confirm against the definition
 
 struct dfMatrixDataBase
 {
-    // - cuda resource
+    // cuda resource
     cudaStream_t stream;
 
-    // - number of cell size
-    int num_cells;
-    // - number of face size
-    int num_surfaces;
-    // - number of offdiagnal entry size (2*num_surfaces)
-    int num_faces;
-    // - number of boundary cells
-    int num_boundary_cells;
-    // - number of boundary faces
-    int num_boundary_faces;
-
-    int num_species;
-
-    // - mesh variables
-    // - csr_row_index
-    int *h_A_csr_row_index=nullptr, *d_A_csr_row_index=nullptr;
-    // - csr_col_index
-    int *h_A_csr_col_index=nullptr, *d_A_csr_col_index=nullptr;
-    // - csr_diag_index
-    int *h_A_csr_diag_index=nullptr, *d_A_csr_diag_index=nullptr;
-
-    // - the pre-permutated and post-permutated interpolation weight list
-    std::vector<double> h_weight_vec_init, h_weight_vec;
-    // - the pre-permutated and post-permutated flux (phi) list
-    std::vector<double> h_phi_vec_init, h_phi_vec;
-    // - the pre-permutated and post-permutated cell face vector list
-    std::vector<double> h_face_vector_vec_init, h_face_vector_vec;
-    std::vector<double> h_face_vec_init, h_face_vec;
-    std::vector<double> h_deltaCoeffs_vec_init, h_deltaCoeffs_vec;
-    // - the host pointer to rho_new, rho_old, velocity_old, pressure and volume list
-    double *h_rho_new = nullptr, *h_rho_old = nullptr, *h_velocity_old = nullptr, 
-    *h_pressure = nullptr;
-    const double *h_volume = nullptr;
-    // - the host pointer to the pre-permutated and post-permutated interpolation weight list
-    double *h_weight_init = nullptr, *h_weight = nullptr;
-    // - the host pointer to the pre-permutated and post-permutated flux (phi) list
-    double *h_phi_init = nullptr, *h_phi = nullptr;
-    // - the host pointer to the pre-permutated and post-permutated cell face vector list
-    double *h_face_vector_init = nullptr, *h_face_vector = nullptr;
-    double *h_face_init = nullptr, *h_face = nullptr;
-    double *h_deltaCoeffs_init = nullptr, *h_deltaCoeffs = nullptr;
-    // - the device pointer to rho_new, rho_old, velocity_old, pressure and volume list
-    double *d_rho_new = nullptr, *d_rho_old = nullptr, *d_velocity_old = nullptr, 
-    *d_pressure = nullptr, *d_volume = nullptr;
-    // - the device pointer to Y(vector Yi)
-    //std::vector<double*> d_Y;
-    double *d_Y = nullptr;
-    // - the device pointer to the pre-permutated and post-permutated interpolation weight list
-    double *d_weight_init = nullptr, *d_weight = nullptr;
-    double *d_weight_upwind = nullptr;
-    // - the device pointer to the pre-permutated and post-permutated flux (phi) list
-    double *d_phi_init = nullptr, *d_phi = nullptr;
-    // - the device pointer to the pre-permutated and post-permutated cell face vector list
-    double *d_face_vector_init = nullptr, *d_face_vector = nullptr;
-    double *d_face_init = nullptr, *d_face = nullptr;
-    double *d_deltaCoeffs_init = nullptr, *d_deltaCoeffs = nullptr;
-    std::vector<double*> d_rhoD_vector;
-
-    double *d_hDiffCorrFlux = nullptr;
-    double *d_diffAlphaD = nullptr;
-    double *d_rhoD = nullptr;
-    double *d_alpha = nullptr;
-
-    double rdelta_t = 1/1e-6;
-
-    /**
-     * @brief boundary related variables
-     */
-    int *h_boundary_cell_offset = nullptr, *d_boundary_cell_offset=nullptr;
-    int *h_boundary_cell_id = nullptr, *d_boundary_cell_id = nullptr;
-    double *h_internal_coeffs = nullptr, *h_boundary_coeffs = nullptr,
-    *h_boundary_pressure = nullptr, *h_boundary_face_vector = nullptr,
-    *h_boundary_face = nullptr, *d_boundary_face = nullptr,
-    *h_boundary_deltaCoeffs = nullptr, *d_boundary_deltaCoeffs = nullptr, 
-    *d_internal_coeffs = nullptr, *d_boundary_coeffs = nullptr,
-    *d_internal_coeffs_init = nullptr, *d_boundary_coeffs_init = nullptr,
-    *d_laplac_internal_coeffs = nullptr, *d_laplac_boundary_coeffs = nullptr,
-    *d_laplac_internal_coeffs_init = nullptr, *d_laplac_boundary_coeffs_init = nullptr,
-    *d_boundary_pressure = nullptr, *d_boundary_face_vector = nullptr,
-    *d_boundary_pressure_init = nullptr,
-    *d_boundary_phi = nullptr, *d_boundary_phi_init = nullptr, 
-    *d_boundary_velocity = nullptr, *d_boundary_velocity_init = nullptr,
-    *d_boundary_nuEff = nullptr, *d_boundary_nuEff_init = nullptr,
-    *d_boundary_rho = nullptr, *d_boundary_rho_init = nullptr;
-    std::vector<double*> d_boundary_Y_vector;
-    std::vector<double*> d_boundary_Y_init_vector;
-    std::vector<double*> d_internal_coeffs_Y_vector;
-    std::vector<double*> d_boundary_coeffs_Y_vector;
-    std::vector<double*> d_laplac_internal_coeffs_Y_vector;
-    std::vector<double*> d_laplac_boundary_coeffs_Y_vector;
-    double *d_internal_coeffs_Y = nullptr;
-    double *d_boundary_coeffs_Y = nullptr;
-    double *d_laplac_internal_coeffs_Y = nullptr;
-    double *d_laplac_boundary_coeffs_Y = nullptr;
-    std::vector<double*> d_boundary_rhoD_vector;
-    double *d_boundary_mut_sct = nullptr;
-    double *d_boundary_rhoD = nullptr;
-    double *d_boundary_alpha = nullptr;
-
-    double *d_boundary_hDiffCorrFlux = nullptr;
-    int *d_boundary_UpatchType = nullptr;
-    int *d_boundary_YpatchType = nullptr;
-
-    std::vector<int> boundPermutationList;
-    std::vector<double> ueqn_internalCoeffs, ueqn_boundaryCoeffs;
-    std::vector<double> boundary_face_vector;
-    std::vector<double> boundary_pressure;
-    std::vector<double> boundary_face;
-    std::vector<double> boundary_deltaCoeffs;
-    std::vector<std::vector<int>> patch_type_init;
-    std::vector<std::vector<int>> patch_type;
-
-    // - the device pointer to the permutated index list
-    std::vector<int> permedIndex;
-    int *d_permedIndex=nullptr;
-    int *d_bouPermedIndex = nullptr;
-
-
-    // bytesize
-    // - bytes of diagnal entries
-    size_t cell_bytes;
-    // - bytes of diagnal entries (vector)
-    size_t cell_vec_bytes;
-    // - bytes of diagnal index
-    size_t cell_index_bytes;
-     // - bytes of diagnal index
-    size_t face_bytes;
-    size_t face_vec_bytes;
-    size_t face_index_bytes;
-
-    size_t boundary_cell_bytes;
-    size_t boundary_cell_vec_bytes;
-    size_t boundary_cell_index_bytes;
-
-    size_t boundary_face_bytes;
-    size_t boundary_face_vec_bytes;
-    size_t boundary_face_index_bytes;
-
-    // A_csr has one more element in each row: itself
-    size_t csr_row_index_bytes;
-    size_t csr_col_index_bytes;
-    size_t csr_value_bytes;
-    size_t csr_value_vec_bytes;
-
-    // extra matrix information
-    double *d_turbSrc_A = nullptr, *d_turbSrc_b = nullptr, *d_turbSrc_A_init = nullptr;
-    std::vector<double> h_turbSrc_init_mtx_vec, h_turbSrc_init_1mtx;
-    std::vector<double> h_turbSrc_init_src_vec, h_turbSrc_src_vec;
-    std::vector<int> tmpPermutatedList;
-    int * d_tmpPermutatedList = nullptr;
-
-    // double *h_A_csr = nullptr, *h_b = nullptr, *h_psi = nullptr;
-    // double *d_A_csr = nullptr, *d_b = nullptr, *d_psi = nullptr;
-
-    int num_iteration;
-
-    double time_monitor_CPU;
-    double time_monitor_GPU_kernel, time_monitor_GPU_memcpy, time_monitor_GPU_memcpy_test;
-
-    double* d_grad = nullptr; 
-    double* d_grad_boundary = nullptr, *d_grad_boundary_init = nullptr;
-    double* d_nuEff = nullptr;
+    // communication info
+    MPI_Comm mpi_comm;
+    ncclComm_t nccl_comm;
+    ncclUniqueId nccl_id;
+    int nRanks;
+    int myRank;
+    int localRank;
+    std::vector<int> neighbProcNo;
+
+    // cyclic info
+    std::vector<int> cyclicNeighbor;
+    std::vector<int> patchSizeOffset;
+
+    // constant values -- basic
+    int num_cells = 0;
+    int num_total_cells = 0;
+    int num_surfaces = 0;
+    int num_boundary_surfaces = 0;
+    int num_proc_surfaces = 0;
+    int num_Nz = 0;
+    int num_patches = 0;
+    int num_species = 0;
+    std::vector<int> patch_size;
+    std::vector<int> patch_type_calculated;
+    std::vector<int> patch_type_extropolated;
+    std::vector<int> patch_type_T;
+    double rdelta_t = 0;
+    std::vector<int> lduCSRIndex;
+
+    // constant values -- ldu bytesize
+    size_t cell_value_bytes = 0;
+    size_t cell_value_vec_bytes = 0;
+    size_t cell_value_tsr_bytes = 0;
+    size_t cell_index_bytes = 0;
+    size_t surface_value_bytes = 0;
+    size_t surface_index_bytes = 0;
+    size_t surface_value_vec_bytes = 0;
+    size_t boundary_surface_value_bytes = 0;
+    size_t boundary_surface_value_vec_bytes = 0;
+    size_t boundary_surface_value_tsr_bytes = 0;
+    size_t boundary_surface_index_bytes = 0;
+
+    // constant values -- csr bytesize
+    size_t csr_row_index_bytes = 0;
+    size_t csr_col_index_bytes = 0;
+    size_t csr_value_bytes = 0;
+    size_t csr_value_vec_bytes = 0;
+
+    // constant indexes
+    int *d_owner = nullptr;
+    int *d_neighbor = nullptr;
+    int *d_diag_to_csr_index= nullptr;
+    int *d_ldu_to_csr_index = nullptr;
+    int *d_csr_row_index= nullptr;
+    int *d_csr_col_index= nullptr;
+
+    // amgx solvers
+	AmgXSolver *u_setting_solver = nullptr;
+	AmgXSolver *p_setting_solver = nullptr;
+
+    // constant fields - internal
+    double *d_sf = nullptr;
+    double *d_mesh_dis = nullptr;
+    double *d_mag_sf = nullptr;
+    double *d_weight = nullptr;
+    double *d_phi_weight = nullptr; // weight for mvConvection->fvmDiv
+    double *d_delta_coeffs = nullptr;
+    double *d_volume = nullptr;
+    
+    double *h_sf = nullptr;
+    double *h_mesh_dis = nullptr;
+
+    // constant fields - boundary
+    double *d_boundary_sf = nullptr;
+    double *d_boundary_mag_sf = nullptr;
+    double *d_boundary_weight = nullptr;
+    double *d_boundary_delta_coeffs = nullptr;
+    int *d_boundary_face_cell = nullptr;
+
+    double *h_boundary_sf = nullptr;
+
+    // non-constant fields - internal 
+    // TODO: further estimate
+    // fields solved by eqns - new
+    double *d_rho = nullptr;
+    double *d_u = nullptr;
+    double *d_y = nullptr;
+    double *d_he = nullptr;
+    double *d_p = nullptr;
+    double *d_k = nullptr;
+    // fields solved by eqns - old 
+    // TODO: not all fields need to store oldTime
+    double *d_rho_old = nullptr;
+    double *d_k_old = nullptr;
+    double *d_u_old = nullptr;
+    double *d_p_old = nullptr;
+    double *d_u_old_host_order = nullptr; // tmp
+    //double *d_y_old = nullptr;
+    //double *d_he_old = nullptr;
+    //double *d_p_old = nullptr;
+    // other shared fields between eqns
+    double *d_phi = nullptr;
+    double *d_phi_old = nullptr;
+    // other shared fields between eqns - thermophysical
+    double *d_thermo_psi = nullptr;
+    double *d_hDiff_corr_flux = nullptr;
+    double *d_diff_alphaD = nullptr;
+    double *d_dpdt = nullptr;
+
+    double *d_T = nullptr;
+    double *h_T = nullptr;
+    double *d_mu = nullptr;
+    double *d_thermo_alpha = nullptr;
+    double *d_thermo_rhoD = nullptr;
+
+    // computed on GPU, used on CPU, need memcpyd2h - host
+    double *h_rho = nullptr;
+    double *h_rho_old = nullptr;
+    double *h_u= nullptr;
+    double *h_u_old= nullptr;
+    double *h_y= nullptr;
+    double *h_he= nullptr;
+    double *h_k= nullptr;
+    double *h_k_old = nullptr;
+    // computed on CPU, used on GPU, need memcpyh2d - host
+    double *h_p= nullptr;
+    double *h_p_old = nullptr;
+    double *h_phi= nullptr;
+    // internal fields used between eqns
+    double *d_rAU = nullptr;
+    double *d_HbyA = nullptr;
+    // turbulence fields
+    double *d_turbulence_k = nullptr;
+    double *d_turbulence_epsilon = nullptr;
+
+    // non-constant fields - boundary
+    // TODO: further estimate
+    // fields solved by eqns - new
+    double *d_boundary_rho = nullptr;
+    double *d_boundary_u = nullptr;
+    double *d_boundary_y = nullptr;
+    double *d_boundary_he = nullptr;
+    double *d_boundary_p = nullptr;
+    double *d_boundary_p_old = nullptr;
+    double *d_boundary_k = nullptr;
+    // fields solved by eqns - old
+    double *d_boundary_rho_old = nullptr;
+    double *d_boundary_u_old = nullptr;
+    double *d_boundary_u_old_host_order = nullptr; // tmp
+    //double *d_boundary_y_old = nullptr;
+    //double *d_boundary_he_old = nullptr;
+    //double *d_boundary_p_old = nullptr;
+    // other shared fields between eqns
+    double *d_boundary_phi = nullptr;
+    double *d_boundary_phi_old = nullptr;
+    // other shared fields between eqns - thermophysical
+    double *d_boundary_thermo_psi = nullptr;
+    double *d_boundary_hDiff_corr_flux = nullptr;
+    double *d_boundary_diff_alphaD = nullptr;
+
+    double *d_boundary_T = nullptr;
+    double *d_boundary_mu = nullptr;
+    double *d_boundary_thermo_alpha = nullptr;
+    double *d_boundary_thermo_rhoD = nullptr;
+    // boundary fields used between eqns
+    double *d_boundary_rAU = nullptr;
+    double *d_boundary_HbyA = nullptr;
+    // computed on GPU, used on CPU, need memcpyd2h - host
+    double *h_boundary_rho = nullptr;
+    double *h_boundary_rho_old = nullptr;
+    double *h_boundary_u= nullptr;
+    double *h_boundary_u_old= nullptr;
+    double *h_boundary_y= nullptr;
+    double *h_boundary_he= nullptr;
+    double *h_boundary_k = nullptr;
+    // computed on CPU, used on GPU, need memcpyh2d - host
+    double *h_boundary_p= nullptr;
+    double *h_boundary_p_old= nullptr;
+    double *h_boundary_phi= nullptr;
+    double *h_boundary_phi_old= nullptr;
+
+    std::unordered_map<std::string, double*> fieldPointerMap;
 
     // constructor
     dfMatrixDataBase();
-    dfMatrixDataBase(int num_surfaces, int num_cells, int num_boundary_faces, int num_species, int & num_boundary_cells_output,
-        const int *neighbour, const int *owner, const double* volume, const double* weight, const double* face_vector, const double* face, 
-        const double* deltaCoeffs, std::vector<double> boundary_face_vector_init, std::vector<double> boundary_face_init, 
-        std::vector<double> boundary_deltaCoeffs_init, std::vector<int> boundary_cell_id_init, std::vector<std::vector<int>> patch_type_init)
-    : num_cells(num_cells), num_faces(num_surfaces*2), num_surfaces(num_surfaces), num_species(num_species), num_iteration(0),
-      num_boundary_faces(num_boundary_faces), h_volume(volume), patch_type_init(patch_type_init)
-    {
-        // create cuda stream
-        checkCudaErrors(cudaStreamCreate(&stream));
-
-        // allocate field pointer in pin memory
-        cudaMallocHost(&h_phi_init, num_faces * sizeof(double));
-        cudaMallocHost(&h_rho_old, num_cells * sizeof(double));
-
-        h_weight_vec_init.resize(num_faces);
-        h_weight_vec.resize(num_faces);
-        h_face_vector_vec_init.resize(num_faces*3);
-        h_face_vector_vec.resize(num_faces*3);
-        h_face_vec_init.resize(num_faces);
-        h_face_vec.resize(num_faces);
-        h_deltaCoeffs_vec_init.resize(num_faces);
-        h_deltaCoeffs_vec.resize(num_faces);
-        h_turbSrc_init_mtx_vec.resize(num_faces + num_cells);
-        h_turbSrc_init_1mtx.resize(num_faces + num_cells);
-        h_turbSrc_init_src_vec.resize(3*num_cells);
-        h_turbSrc_src_vec.resize(3*num_cells);
-
-        // byte sizes
-        cell_bytes = num_cells * sizeof(double);
-        cell_vec_bytes = num_cells * 3 * sizeof(double);
-        cell_index_bytes = num_cells * sizeof(int);
-
-        face_bytes = num_faces * sizeof(double);
-        face_vec_bytes = num_faces * 3 * sizeof(double);
-        face_index_bytes = num_faces * sizeof(int);
-
-        // A_csr has one more element in each row: itself
-        csr_row_index_bytes = (num_cells + 1) * sizeof(int);
-        csr_col_index_bytes = (num_cells + num_faces) * sizeof(int);
-        csr_value_bytes = (num_cells + num_faces) * sizeof(double);
-        csr_value_vec_bytes = (num_cells + num_faces) * 3 * sizeof(double);
-
-        /************************construct mesh variables****************************/
-        /**
-         * 1. h_csr_row_index & h_csr_diag_index
-        */
-        std::vector<int> h_mtxEntry_perRow_vec(num_cells);
-        std::vector<int> h_csr_diag_index_vec(num_cells);
-        std::vector<int> h_csr_row_index_vec(num_cells + 1, 0);
-
-        for (int faceI = 0; faceI < num_surfaces; faceI++)
-        {
-            h_csr_diag_index_vec[neighbour[faceI]]++;
-            h_mtxEntry_perRow_vec[neighbour[faceI]]++;
-            h_mtxEntry_perRow_vec[owner[faceI]]++;
-        }
-
-        // - consider diagnal element in each row
-        std::transform(h_mtxEntry_perRow_vec.begin(), h_mtxEntry_perRow_vec.end(), h_mtxEntry_perRow_vec.begin(), [](int n)
-            {return n + 1;});
-        // - construct h_csr_row_index & h_csr_diag_index
-        std::partial_sum(h_mtxEntry_perRow_vec.begin(), h_mtxEntry_perRow_vec.end(), h_csr_row_index_vec.begin()+1);
-        // - assign h_csr_row_index & h_csr_diag_index
-        h_A_csr_row_index = h_csr_row_index_vec.data();
-        h_A_csr_diag_index = h_csr_diag_index_vec.data();
-
-        /**
-         * 2. h_csr_col_index
-        */
-        std::vector<int> rowIndex(num_faces + num_cells), colIndex(num_faces + num_cells), diagIndex(num_cells);
-        std::iota(diagIndex.begin(), diagIndex.end(), 0);
-
-        // initialize the RowIndex (rowIndex of lower + upper + diagnal)
-        std::copy(neighbour, neighbour + num_surfaces, rowIndex.begin());
-        std::copy(owner, owner + num_surfaces, rowIndex.begin() + num_surfaces);
-        std::copy(diagIndex.begin(), diagIndex.end(), rowIndex.begin() + num_faces);
-        // initialize the ColIndex (colIndex of lower + upper + diagnal)
-        std::copy(owner, owner + num_surfaces, colIndex.begin());
-        std::copy(neighbour, neighbour + num_surfaces, colIndex.begin() + num_surfaces);
-        std::copy(diagIndex.begin(), diagIndex.end(), colIndex.begin() + num_faces);
-
-        // - construct hashTable for sorting
-        std::multimap<int,int> rowColPair;
-        for (int i = 0; i < 2*num_surfaces+num_cells; i++)
-        {
-            rowColPair.insert(std::make_pair(rowIndex[i], colIndex[i]));
-        }
-        // - sort
-        std::vector<std::pair<int, int>> globalPerm(rowColPair.begin(), rowColPair.end());
-        std::sort(globalPerm.begin(), globalPerm.end(), []
-        (const std::pair<int, int>& pair1, const std::pair<int, int>& pair2){
-        if (pair1.first != pair2.first) {
-            return pair1.first < pair2.first;
-        } else {
-            return pair1.second < pair2.second;
-        }
-        });
-
-        std::vector<int> h_csr_col_index_vec;
-        std::transform(globalPerm.begin(), globalPerm.end(), std::back_inserter(h_csr_col_index_vec), []
-            (const std::pair<int, int>& pair) {
-            return pair.second;
-        });
-        h_A_csr_col_index = h_csr_col_index_vec.data();
-        
-        // construct a tmp permutated List for add fvMatrix
-        std::vector<int> tmp_permutation(2*num_surfaces + num_cells);
-        std::vector<int> tmp_rowIndex(2*num_surfaces + num_cells);
-        std::iota(tmp_permutation.begin(), tmp_permutation.end(), 0);
-        std::copy(neighbour, neighbour + num_surfaces, tmp_rowIndex.begin());
-        std::copy(diagIndex.begin(), diagIndex.end(), tmp_rowIndex.begin() + num_surfaces);
-        std::copy(owner, owner + num_surfaces, tmp_rowIndex.begin() + num_surfaces + num_cells);
-        std::multimap<int,int> tmpPair;
-        for (int i = 0; i < 2*num_surfaces+num_cells; i++)
-        {
-            tmpPair.insert(std::make_pair(tmp_rowIndex[i], tmp_permutation[i]));
-        }
-        std::vector<std::pair<int, int>> tmpPerm(tmpPair.begin(), tmpPair.end());
-        std::sort(tmpPerm.begin(), tmpPerm.end(), []
-        (const std::pair<int, int>& pair1, const std::pair<int, int>& pair2){
-        if (pair1.first != pair2.first) {
-            return pair1.first < pair2.first;
-        } else {
-            return pair1.second < pair2.second;
-        }
-        });
-        std::transform(tmpPerm.begin(), tmpPerm.end(), std::back_inserter(tmpPermutatedList), []
-            (const std::pair<int, int>& pair) {
-            return pair.second;
-        });
-
-        /**
-         * 3. boundary imformations
-        */
-        // get boundPermutation and offset lists
-        std::vector<int> boundPermutationListInit(num_boundary_faces);
-        std::vector<int> boundOffsetList;
-        std::iota(boundPermutationListInit.begin(), boundPermutationListInit.end(), 0);
-
-        // - construct hashTable for sorting
-        std::multimap<int,int> boundPermutation;
-        for (int i = 0; i < num_boundary_faces; i++)
-        {
-            boundPermutation.insert(std::make_pair(boundary_cell_id_init[i], boundPermutationListInit[i]));
-        }
-
-        // - sort 
-        std::vector<std::pair<int, int>> boundPermPair(boundPermutation.begin(), boundPermutation.end());
-        std::sort(boundPermPair.begin(), boundPermPair.end(), []
-        (const std::pair<int, int>& pair1, const std::pair<int, int>& pair2){
-            if (pair1.first != pair2.first) {
-                return pair1.first < pair2.first;
-            } else {
-                return pair1.second < pair2.second;
-            }
-        });
-
-        // - construct boundPermedIndex and boundary_cell_id
-        std::vector<int> boundary_cell_id;
-        boundPermutationList.clear();
-        std::transform(boundPermPair.begin(), boundPermPair.end(), std::back_inserter(boundary_cell_id), []
-            (const std::pair<int, int>& pair) {
-            return pair.first;
-        });
-        std::transform(boundPermPair.begin(), boundPermPair.end(), std::back_inserter(boundPermutationList), []
-            (const std::pair<int, int>& pair) {
-            return pair.second;
-        });
-
-        // construct boundary_cell_offset
-        std::map<int, int> countMap;
-        std::vector<int> boundaryCellcount;
-        for (const auto& cellIndex : boundary_cell_id)
-            ++ countMap[cellIndex];
-        for (const auto& [cellIndex, count] : countMap)
-            boundaryCellcount.push_back(count);
-
-        num_boundary_cells = boundaryCellcount.size();
-        num_boundary_cells_output = num_boundary_cells;
-
-        std::vector<int> boundary_cell_offset(boundaryCellcount.size() + 1, 0);
-        std::partial_sum(boundaryCellcount.begin(), boundaryCellcount.end(), boundary_cell_offset.begin()+1);
-        
-        // assign h_boundary_cell_offset & h_boundary_cell_id
-        h_boundary_cell_offset = boundary_cell_offset.data();
-        h_boundary_cell_id = boundary_cell_id.data();
-
-        // 
-        boundary_cell_bytes = num_boundary_cells * sizeof(double);
-        boundary_cell_vec_bytes = num_boundary_cells * 3 * sizeof(double);
-        boundary_cell_index_bytes = num_boundary_cells * sizeof(int);
-
-        boundary_face_bytes = num_boundary_faces * sizeof(double);
-        boundary_face_vec_bytes = num_boundary_faces * 3 * sizeof(double);
-        boundary_face_index_bytes = num_boundary_faces * sizeof(int);
-
-        ueqn_internalCoeffs.resize(3*num_boundary_faces);
-        ueqn_boundaryCoeffs.resize(3*num_boundary_faces);
-
-        boundary_face_vector.resize(3*num_boundary_faces);
-        boundary_pressure.resize(num_boundary_faces);
-        boundary_face.resize(num_boundary_faces);
-        boundary_deltaCoeffs.resize(num_boundary_faces);
-
-        patch_type.resize(2);
-        patch_type[0].resize(num_boundary_faces);
-        patch_type[1].resize(num_boundary_faces);
-
-        /**
-         * 4. permutation list for field variables
-        */
-        std::vector<int> offdiagRowIndex(2*num_surfaces), permIndex(2*num_surfaces);
-        // - initialize the offdiagRowIndex (rowIndex of lower + rowIndex of upper)
-        std::copy(neighbour, neighbour + num_surfaces, offdiagRowIndex.begin());
-        std::copy(owner, owner + num_surfaces, offdiagRowIndex.begin() + num_surfaces);
-
-        // - initialize the permIndex (0, 1, ..., 2*num_surfaces)
-        std::iota(permIndex.begin(), permIndex.end(), 0);
-
-        // - construct hashTable for sorting
-        std::multimap<int,int> permutation;
-        for (int i = 0; i < 2*num_surfaces; i++)
-        {
-            permutation.insert(std::make_pair(offdiagRowIndex[i], permIndex[i]));
-        }
-        // - sort 
-        std::vector<std::pair<int, int>> permPair(permutation.begin(), permutation.end());
-        std::sort(permPair.begin(), permPair.end(), []
-        (const std::pair<int, int>& pair1, const std::pair<int, int>& pair2){
-            if (pair1.first != pair2.first) {
-                return pair1.first < pair2.first;
-            } else {
-                return pair1.second < pair2.second;
-            }
-        });
-        // - form permedIndex list
-        std::transform(permPair.begin(), permPair.end(), std::back_inserter(permedIndex), []
-            (const std::pair<int, int>& pair) {
-            return pair.second;
-        });
-
-        // copy and permutate cell variables
-        std::copy(weight, weight + num_surfaces, h_weight_vec_init.begin());
-        std::copy(weight, weight + num_surfaces, h_weight_vec_init.begin() + num_surfaces);
-        std::copy(face_vector, face_vector + 3*num_surfaces, h_face_vector_vec_init.begin());
-        std::copy(face_vector, face_vector + 3*num_surfaces, h_face_vector_vec_init.begin() + 3*num_surfaces);
-        std::copy(face, face + num_surfaces, h_face_vec_init.begin());
-        std::copy(face, face + num_surfaces, h_face_vec_init.begin() + num_surfaces);
-        std::copy(deltaCoeffs, deltaCoeffs + num_surfaces, h_deltaCoeffs_vec_init.begin());
-        std::copy(deltaCoeffs, deltaCoeffs + num_surfaces, h_deltaCoeffs_vec_init.begin() + num_surfaces);
-        for (int i = 0; i < num_faces; i++)
-        {
-            h_weight_vec[i] = h_weight_vec_init[permedIndex[i]];
-            h_face_vec[i] = h_face_vec_init[permedIndex[i]];
-            h_deltaCoeffs_vec[i] = h_deltaCoeffs_vec_init[permedIndex[i]];
-            h_face_vector_vec[i*3] = h_face_vector_vec_init[3*permedIndex[i]];
-            h_face_vector_vec[i*3+1] = h_face_vector_vec_init[3*permedIndex[i]+1];
-            h_face_vector_vec[i*3+2] = h_face_vector_vec_init[3*permedIndex[i]+2];
-        }
-        h_weight = h_weight_vec.data();
-        h_face_vector = h_face_vector_vec.data();
-        h_face = h_face_vec.data();
-        h_deltaCoeffs = h_deltaCoeffs_vec.data();
-
-        for (int i = 0; i < num_boundary_faces; i++)
-        {
-            boundary_face_vector[3*i] = boundary_face_vector_init[3*boundPermutationList[i]];
-            boundary_face_vector[3*i+1] = boundary_face_vector_init[3*boundPermutationList[i]+1];
-            boundary_face_vector[3*i+2] = boundary_face_vector_init[3*boundPermutationList[i]+2];
-            boundary_face[i] = boundary_face_init[boundPermutationList[i]];
-            boundary_deltaCoeffs[i] = boundary_deltaCoeffs_init[boundPermutationList[i]];
-            patch_type[0][i] = patch_type_init[0][boundPermutationList[i]];
-            patch_type[1][i] = patch_type_init[1][boundPermutationList[i]];
-        }
-        h_boundary_face_vector = boundary_face_vector.data();
-        h_boundary_face = boundary_face.data();
-        h_boundary_deltaCoeffs = boundary_deltaCoeffs.data();
-
-        /************************allocate memory on device****************************/
-        int total_bytes = 0;
-
-        checkCudaErrors(cudaMalloc((void**)&d_A_csr_row_index, csr_row_index_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_A_csr_col_index, csr_col_index_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_A_csr_diag_index, cell_index_bytes));
-        total_bytes += (csr_row_index_bytes + csr_col_index_bytes + cell_index_bytes);
-
-        //d_Y.resize(num_species);
-        d_rhoD_vector.resize(num_species);
-        d_boundary_Y_vector.resize(num_species);
-        d_boundary_Y_init_vector.resize(num_species);
-        d_internal_coeffs_Y_vector.resize(num_species);
-        d_boundary_coeffs_Y_vector.resize(num_species);
-        d_laplac_internal_coeffs_Y_vector.resize(num_species);
-        d_laplac_boundary_coeffs_Y_vector.resize(num_species);
-        d_boundary_rhoD_vector.resize(num_species);
-
-        for (size_t i = 0; i < num_species; ++i){
-            //checkCudaErrors(cudaMalloc((void**)&d_Y[i], cell_bytes));
-            checkCudaErrors(cudaMalloc((void**)&d_rhoD_vector[i], cell_bytes));
-        }
-        checkCudaErrors(cudaMalloc((void**)&d_Y, cell_bytes * num_species));
-        checkCudaErrors(cudaMalloc((void**)&d_rho_old, cell_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_rho_new, cell_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_volume, cell_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_pressure, cell_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_velocity_old, cell_vec_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_weight, face_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_weight_upwind, face_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_face, face_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_deltaCoeffs, face_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_phi, face_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_phi_init, face_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_face_vector, face_vec_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_nuEff, cell_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_rhoD, cell_bytes * num_species));
-        checkCudaErrors(cudaMalloc((void**)&d_alpha, cell_bytes));
-        total_bytes += (cell_bytes * (5 + 2*num_species) + face_bytes * 6 + cell_vec_bytes + face_vec_bytes);
-
-        checkCudaErrors(cudaMalloc((void**)&d_boundary_cell_offset, (num_boundary_cells+1) * sizeof(int)));
-        checkCudaErrors(cudaMalloc((void**)&d_boundary_cell_id, boundary_face_index_bytes));
-        total_bytes += (boundary_face_index_bytes + (num_boundary_cells+1) * sizeof(int));
-
-        checkCudaErrors(cudaMalloc((void**)&d_boundary_pressure_init, boundary_face_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_boundary_pressure, boundary_face_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_boundary_phi_init, boundary_face_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_boundary_phi, boundary_face_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_boundary_velocity_init, boundary_face_vec_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_boundary_velocity, boundary_face_vec_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_boundary_face_vector, boundary_face_vec_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_boundary_face, boundary_face_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_boundary_deltaCoeffs, boundary_face_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs_init, boundary_face_vec_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs_init, boundary_face_vec_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs_init, boundary_face_vec_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs_init, boundary_face_vec_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, boundary_face_vec_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, boundary_face_vec_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs, boundary_face_vec_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs, boundary_face_vec_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_boundary_nuEff, boundary_face_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_boundary_nuEff_init, boundary_face_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_boundary_rho, boundary_face_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_boundary_rho_init, boundary_face_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_boundary_mut_sct, boundary_face_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_boundary_UpatchType, boundary_face_index_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_boundary_YpatchType, boundary_face_index_bytes));
-        for (size_t i = 0; i < num_species; ++i){
-            checkCudaErrors(cudaMalloc((void**)&d_boundary_Y_vector[i], boundary_face_bytes));
-            checkCudaErrors(cudaMalloc((void**)&d_boundary_Y_init_vector[i], boundary_face_bytes));
-            checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs_Y_vector[i], boundary_face_bytes));
-            checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs_Y_vector[i], boundary_face_bytes));
-            checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs_Y_vector[i], boundary_face_bytes));
-            checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs_Y_vector[i], boundary_face_bytes));
-            checkCudaErrors(cudaMalloc((void**)&d_boundary_rhoD_vector[i], boundary_face_bytes));
-        }
-        checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs_Y, boundary_face_bytes * num_species));
-        checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs_Y, boundary_face_bytes * num_species));
-        checkCudaErrors(cudaMalloc((void**)&d_laplac_internal_coeffs_Y, boundary_face_bytes * num_species));
-        checkCudaErrors(cudaMalloc((void**)&d_laplac_boundary_coeffs_Y, boundary_face_bytes * num_species));
-        checkCudaErrors(cudaMalloc((void**)&d_boundary_rhoD, boundary_face_bytes * num_species));
-        checkCudaErrors(cudaMalloc((void**)&d_boundary_alpha, boundary_face_bytes));
-        
-        total_bytes += (boundary_face_bytes*10 + boundary_face_vec_bytes * 11);
-
-        // checkCudaErrors(cudaMalloc((void**)&d_A_csr, csr_value_vec_bytes));
-        // checkCudaErrors(cudaMalloc((void**)&d_b, cell_vec_bytes));
-        // checkCudaErrors(cudaMalloc((void**)&d_psi, cell_vec_bytes));
-        total_bytes += (boundary_face_bytes + boundary_face_vec_bytes * 3);
-
-        checkCudaErrors(cudaMalloc((void**)&d_turbSrc_A, csr_value_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_turbSrc_A_init, csr_value_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_turbSrc_b, cell_vec_bytes));
-        total_bytes += (2*csr_value_bytes + cell_vec_bytes);
-
-        checkCudaErrors(cudaMalloc((void**)&d_permedIndex, face_index_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_bouPermedIndex, boundary_face_index_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_tmpPermutatedList, csr_col_index_bytes));
-        total_bytes += (face_index_bytes + boundary_face_index_bytes + csr_col_index_bytes);
-
-        checkCudaErrors(cudaMalloc((void**)&d_grad, num_cells * 9 * sizeof(double)));
-        checkCudaErrors(cudaMalloc((void**)&d_grad_boundary, boundary_face_bytes * 9));
-        checkCudaErrors(cudaMalloc((void**)&d_grad_boundary_init, boundary_cell_bytes * 9));
-        total_bytes += (num_cells * 9 * sizeof(double) + boundary_face_bytes * 9 + boundary_cell_bytes * 9); // FIXME: rename
-
-        checkCudaErrors(cudaMalloc((void**)&d_hDiffCorrFlux, 3 * cell_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_diffAlphaD, cell_bytes));
-        checkCudaErrors(cudaMalloc((void**)&d_boundary_hDiffCorrFlux, 3 * boundary_face_bytes));
-
-        fprintf(stderr, "Total bytes malloc on GPU: %.2fMB\n", total_bytes * 1.0 / 1024 / 1024);
-
-        checkCudaErrors(cudaMemcpyAsync(d_A_csr_row_index, h_A_csr_row_index, csr_row_index_bytes, cudaMemcpyHostToDevice, stream));
-        checkCudaErrors(cudaMemcpyAsync(d_A_csr_col_index, h_A_csr_col_index, csr_col_index_bytes, cudaMemcpyHostToDevice, stream));
-        checkCudaErrors(cudaMemcpyAsync(d_A_csr_diag_index, h_A_csr_diag_index, cell_index_bytes, cudaMemcpyHostToDevice, stream));
-        checkCudaErrors(cudaMemcpyAsync(d_boundary_cell_offset, h_boundary_cell_offset, (num_boundary_cells+1) * sizeof(int), cudaMemcpyHostToDevice, stream));
-        checkCudaErrors(cudaMemcpyAsync(d_boundary_cell_id, h_boundary_cell_id, boundary_face_index_bytes, cudaMemcpyHostToDevice, stream));
-
-        checkCudaErrors(cudaMemcpyAsync(d_volume, h_volume, cell_bytes, cudaMemcpyHostToDevice, stream));
-        checkCudaErrors(cudaMemcpyAsync(d_weight, h_weight, face_bytes, cudaMemcpyHostToDevice, stream));
-        checkCudaErrors(cudaMemcpyAsync(d_face, h_face, face_bytes, cudaMemcpyHostToDevice, stream));
-        checkCudaErrors(cudaMemcpyAsync(d_deltaCoeffs, h_deltaCoeffs, face_bytes, cudaMemcpyHostToDevice, stream));
-        checkCudaErrors(cudaMemcpyAsync(d_face_vector, h_face_vector, face_vec_bytes, cudaMemcpyHostToDevice, stream));
-        checkCudaErrors(cudaMemcpyAsync(d_boundary_face_vector, h_boundary_face_vector, boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream));
-        checkCudaErrors(cudaMemcpyAsync(d_boundary_face, h_boundary_face, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
-        checkCudaErrors(cudaMemcpyAsync(d_boundary_deltaCoeffs, h_boundary_deltaCoeffs, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
-        checkCudaErrors(cudaMemcpyAsync(d_boundary_UpatchType, patch_type[0].data(), boundary_face_index_bytes, cudaMemcpyHostToDevice, stream));
-        checkCudaErrors(cudaMemcpyAsync(d_boundary_YpatchType, patch_type[1].data(), boundary_face_index_bytes, cudaMemcpyHostToDevice, stream));
-
-        checkCudaErrors(cudaMemcpyAsync(d_permedIndex, permedIndex.data(), face_index_bytes, cudaMemcpyHostToDevice, stream));
-        checkCudaErrors(cudaMemcpyAsync(d_bouPermedIndex, boundPermutationList.data(), boundary_face_index_bytes, cudaMemcpyHostToDevice, stream));
-        checkCudaErrors(cudaMemcpyAsync(d_tmpPermutatedList, tmpPermutatedList.data(), csr_col_index_bytes, cudaMemcpyHostToDevice, stream));
-    };
-
-    ~dfMatrixDataBase(){
-        std::cout << "Destructor called." << std::endl;
-        // TODO: free pointers
-        
-    };
+
+    // destructor
+    ~dfMatrixDataBase();
+
+    // member function
+    void prepareCudaResources();
+    void cleanCudaResources();
+    void setCommInfo(MPI_Comm mpi_comm, ncclComm_t nccl_comm, ncclUniqueId nccl_id,
+            int nRanks, int myRank, int localRank, std::vector<int> &neighbProcNo);
+    void setConstantValues(int num_cells, int num_total_cells, int num_surfaces, 
+        int num_boundary_surfaces, int num_patches, int num_proc_surfaces, 
+        std::vector<int> patch_size, int num_species, double rdelta_t);             
+    void setConstantIndexes(const int *owner, const int *neighbor, const int *procRows, 
+            const int *procCols, int globalOffset);
+    void setAmgxSolvers(const std::string &mode_string, const std::string &u_setting_path, const std::string &p_setting_path);
+    void resetAmgxSolvers();
+    void solve(int num_iteration, AMGXSetting setting, double *d_A, double *d_x, double *d_b);
+    void setCyclicInfo(std::vector<int> &cyclicNeighbor); // when use cyclic boundary
+
+    void createConstantFieldsInternal();
+    void createConstantFieldsBoundary();
+    void initConstantFieldsInternal(const double *sf, const double *mag_sf, 
+            const double *weight, const double *delta_coeffs, const double *volume, const double *mesh_distance);
+    void initConstantFieldsBoundary(const double *boundary_sf, const double *boundary_mag_sf, 
+            const double *boundary_delta_coeffs, const double *boundary_weight, const int *boundary_face_cell, 
+            std::vector<int> &patch_type_calculated, std::vector<int> &patch_type_extropolated);
+
+    void createNonConstantFieldsInternal();
+    void createNonConstantFieldsBoundary();
+    void initNonConstantFieldsInternal();
+    void initNonConstantFieldsBoundary();
+
+    void preTimeStep();
+    void postTimeStep();
+
+    // getter
+    double* getFieldPointer(const char* fieldAlias, location loc, position pos);
 };
 
diff --git a/src_gpu/dfMatrixDataBase.cu b/src_gpu/dfMatrixDataBase.cu
index d4f5a7ab0..24db4b698 100644
--- a/src_gpu/dfMatrixDataBase.cu
+++ b/src_gpu/dfMatrixDataBase.cu
@@ -1,8 +1,8 @@
 #include "dfMatrixDataBase.H"
+#include "dfMatrixOpBase.H"
+#include "dfNcclBase.H"
 
-
-void constructBoundarySelector(std::vector<int>& patchTypeSelector, const std::string& patchTypeStr,
-    const int patchSize)
+void constructBoundarySelectorPerPatch(int *patchTypeSelector, const std::string& patchTypeStr)
 {
     boundaryConditions patchCondition;
     std::vector<int> tmpSelector;
@@ -10,7 +10,14 @@ void constructBoundarySelector(std::vector<int>& patchTypeSelector, const std::s
         {"zeroGradient", zeroGradient},
         {"fixedValue", fixedValue},
         {"empty", empty},
-        {"coupled", coupled}
+        {"gradientEnergy", gradientEnergy},
+        {"calculated", calculated},
+        {"coupled", coupled},
+        {"cyclic", cyclic},
+        {"processor", processor},
+        {"extrapolated", extrapolated},
+        {"fixedEnergy", fixedEnergy},
+        {"processorCyclic", processorCyclic}
     };
     auto iter = BCMap.find(patchTypeStr);
     if (iter != BCMap.end()) {
@@ -22,27 +29,511 @@ void constructBoundarySelector(std::vector<int>& patchTypeSelector, const std::s
     switch (patchCondition){
         case zeroGradient:
         {
-            tmpSelector.resize(patchSize, 0);
-            patchTypeSelector.insert(patchTypeSelector.end(), tmpSelector.begin(), tmpSelector.end());
+            *patchTypeSelector = 0;
             break;
         }
         case fixedValue:
         {
-            tmpSelector.resize(patchSize, 1);
-            patchTypeSelector.insert(patchTypeSelector.end(), tmpSelector.begin(), tmpSelector.end());
+            *patchTypeSelector = 1;
+            break;
+        }
+        case coupled:
+        {
+            *patchTypeSelector = 2;
             break;
         }
         case empty:
         {
-            tmpSelector.resize(patchSize, 2);
-            patchTypeSelector.insert(patchTypeSelector.end(), tmpSelector.begin(), tmpSelector.end());
+            *patchTypeSelector = 3;
             break;
         }
-        case coupled:
+        case gradientEnergy:
+        {
+            *patchTypeSelector = 4;
+            break;
+        }
+        case calculated:
         {
-            tmpSelector.resize(patchSize, 3);
-            patchTypeSelector.insert(patchTypeSelector.end(), tmpSelector.begin(), tmpSelector.end());
+            *patchTypeSelector = 5;
+            break;
+        }
+        case cyclic:
+        {
+            *patchTypeSelector = 6;
+            break;
+        }
+        case processor:
+        {
+            *patchTypeSelector = 7;
+            break;
+        }
+        case extrapolated:
+        {
+            *patchTypeSelector = 8;
+            break;
+        }
+        case fixedEnergy:
+        {
+            *patchTypeSelector = 9;
+            break;
+        }
+        case processorCyclic:
+        {
+            *patchTypeSelector = 10;
             break;
         }
     }
 }
+
+dfMatrixDataBase::dfMatrixDataBase() {} // empty body: construction runs no statements here
+
+dfMatrixDataBase::~dfMatrixDataBase() {} // empty body: frees nothing; the stream is destroyed in cleanCudaResources()
+
+void dfMatrixDataBase::setCommInfo(MPI_Comm mpi_comm, ncclComm_t nccl_comm, ncclUniqueId nccl_id,
+        int nRanks, int myRank, int localRank, std::vector<int> &neighbProcNo) { // stores every argument in the member of the same name
+    this->mpi_comm = mpi_comm;
+    this->nccl_comm = nccl_comm;
+    this->nccl_id = nccl_id;
+    this->nRanks = nRanks;
+    this->myRank = myRank;
+    this->localRank = localRank; // read later by setAmgxSolvers() when it constructs the AmgXSolver objects
+    this->neighbProcNo = neighbProcNo; // presumably a vector copy (member type is declared outside this file section -- TODO confirm)
+}
+ 
+void dfMatrixDataBase::prepareCudaResources() { // creates the CUDA stream held in the `stream` member
+    checkCudaErrors(cudaStreamCreate(&stream));
+}
+
+void dfMatrixDataBase::cleanCudaResources() {
+    // destroy the CUDA stream that prepareCudaResources() created
+    checkCudaErrors(cudaStreamDestroy(stream));
+    //ncclDestroy(nccl_comm);
+    // TODO: free the cudaMalloc/cudaMallocHost buffers obtained by the set*/create* members; nothing in this function releases them
+}
+
+void dfMatrixDataBase::setConstantValues(int num_cells, int num_total_cells, int num_surfaces, 
+        int num_boundary_surfaces, int num_patches, int num_proc_surfaces, 
+        std::vector<int> patch_size, int num_species, double rdelta_t) {
+    // constant values -- basic: each argument is stored in the member of the same name
+    this->num_cells = num_cells;
+    this->num_total_cells = num_total_cells;
+    this->num_surfaces = num_surfaces;
+    this->num_boundary_surfaces = num_boundary_surfaces;
+    this->num_patches = num_patches;
+    this->num_proc_surfaces = num_proc_surfaces;
+    this->patch_size = patch_size;
+    this->num_species = num_species;
+    this->rdelta_t = rdelta_t;
+    this->num_Nz = num_cells + 2 * num_surfaces + num_proc_surfaces; // one entry per cell, two per surface, one per proc surface
+
+    // constant values -- ldu bytesize (vec = 3 doubles per element, tsr = 9 doubles per element)
+    cell_value_bytes = num_cells * sizeof(double);
+    cell_value_vec_bytes = num_cells * 3 * sizeof(double);
+    cell_value_tsr_bytes = num_cells * 9 * sizeof(double);
+    cell_index_bytes = num_cells * sizeof(int);
+    surface_value_bytes = num_surfaces * sizeof(double);
+    surface_index_bytes = num_surfaces * sizeof(int);
+    surface_value_vec_bytes = num_surfaces * 3 * sizeof(double);
+    boundary_surface_value_bytes = num_boundary_surfaces * sizeof(double);
+    boundary_surface_value_vec_bytes = num_boundary_surfaces * 3 * sizeof(double);
+    boundary_surface_value_tsr_bytes = num_boundary_surfaces * 9 * sizeof(double);
+    boundary_surface_index_bytes = num_boundary_surfaces * sizeof(int);
+
+    // constant values -- csr bytesize (row index has num_cells + 1 entries; the others are sized by num_Nz)
+    csr_row_index_bytes = (num_cells + 1) * sizeof(int);
+    csr_col_index_bytes = num_Nz * sizeof(int);
+    csr_value_bytes = num_Nz * sizeof(double);
+    csr_value_vec_bytes = num_Nz * 3 * sizeof(double);
+}
+
+void dfMatrixDataBase::setAmgxSolvers(const std::string &mode_string, const std::string &u_setting_path, const std::string &p_setting_path) {
+    // amgx solvers: one AmgXSolver per settings path, both built with the same mode_string and localRank
+    u_setting_solver = new AmgXSolver(mode_string, u_setting_path, localRank); // NOTE(review): a solver already held here is overwritten without delete
+    p_setting_solver = new AmgXSolver(mode_string, p_setting_path, localRank); // released by resetAmgxSolvers()
+}
+
+void dfMatrixDataBase::resetAmgxSolvers() { // deletes both AmgXSolver objects, if set, and nulls their pointers
+    if (u_setting_solver) {
+        delete u_setting_solver;
+        u_setting_solver = nullptr; // nulled so a repeated call skips the delete
+    }
+    if (p_setting_solver) {
+        delete p_setting_solver;
+        p_setting_solver = nullptr;
+    }
+}
+    
+void dfMatrixDataBase::solve(int num_iteration, AMGXSetting setting, double *d_A, double *d_x, double *d_b) { // hands d_A to the selected AmgX solver, then calls its solve() with d_x and d_b
+    AmgXSolver *solver = (setting == AMGXSetting::u_setting) ? u_setting_solver : p_setting_solver; // any setting other than u_setting picks p_setting_solver
+    if (num_iteration == 0)                                     // first iteration: pass the CSR row/column index arrays along with the values
+    {
+        solver->setOperator(num_cells, num_total_cells, num_Nz, d_csr_row_index, d_csr_col_index, d_A);
+    }
+    else // later iterations: only the values in d_A are passed
+    {
+        solver->updateOperator(num_cells, num_Nz, d_A);
+    }
+    solver->solve(num_cells, d_x, d_b);
+}
+
+void dfMatrixDataBase::setCyclicInfo(std::vector<int> &cyclicNeighbor)
+{ // stores the caller's cyclicNeighbor list in the member of the same name
+    this->cyclicNeighbor = cyclicNeighbor;
+}
+
+void dfMatrixDataBase::setConstantIndexes(const int *owner, const int *neighbor, const int *procRows, 
+        const int *procCols, int globalOffset) {
+    // allocate d_owner and d_neighbor, then queue host-to-device copies of owner/neighbor on stream
+    checkCudaErrors(cudaMalloc((void**)&d_owner, surface_index_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_neighbor, surface_index_bytes));
+    checkCudaErrors(cudaMemcpyAsync(d_owner, owner, surface_index_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_neighbor, neighbor, surface_index_bytes, cudaMemcpyHostToDevice, stream));
+    DEBUG_TRACE;
+
+    // permTmp is the identity 0..num_Nz-1: the position of each entry in the [lower | diag | upper | proc] layout used below
+    std::vector<int> permTmp(num_Nz);
+    std::iota(permTmp.begin(), permTmp.end(), 0);
+
+    // rowIndex of: low, diag, upp, proc
+    std::vector<int> rowIndicesTmp(num_Nz);
+    std::copy(neighbor, neighbor + num_surfaces, rowIndicesTmp.begin()); // row index of lower entry
+    std::iota(rowIndicesTmp.begin() + num_surfaces, rowIndicesTmp.begin() + num_cells + num_surfaces, 0); // row index of diag entry
+    std::copy(owner, owner + num_surfaces, rowIndicesTmp.begin() + num_cells + num_surfaces); // row index of upper entry
+    std::copy(procRows, procRows + num_proc_surfaces, rowIndicesTmp.begin() + num_cells + 2 * num_surfaces); // row index of proc entry
+
+    // colIndex of: low, diag, upp, proc
+    std::vector<int> colIndicesTmp(num_Nz);
+    std::copy(owner, owner + num_surfaces, colIndicesTmp.begin()); // col index of lower entry
+    std::iota(colIndicesTmp.begin() + num_surfaces, colIndicesTmp.begin() + num_cells + num_surfaces, 0); // col index of diag entry
+    std::copy(neighbor, neighbor + num_surfaces, colIndicesTmp.begin() + num_cells + num_surfaces); // col index of upper entry
+    std::copy(procCols, procCols + num_proc_surfaces, colIndicesTmp.begin() + num_cells + 2 * num_surfaces); // col index of proc entry
+
+    // permute rowIndicesTmp, get CSRRowIndex and lduCSRIndex (entries ordered by row, ties broken by original position)
+    std::multimap<int,int> rowIndicesPermutation;
+    for (int i = 0; i < num_Nz; ++i){
+        rowIndicesPermutation.insert(std::make_pair(rowIndicesTmp[i], permTmp[i]));
+    }
+    std::vector<std::pair<int, int>> rowIndicesPermPair(rowIndicesPermutation.begin(), rowIndicesPermutation.end());
+    
+    std::sort(rowIndicesPermPair.begin(), rowIndicesPermPair.end(), []
+    (const std::pair<int, int>& pair1, const std::pair<int, int>& pair2){
+        if (pair1.first != pair2.first) {
+            return pair1.first < pair2.first;
+        } else {
+            return pair1.second < pair2.second;
+        }
+    });
+    std::vector<int> permRowIndex; // row index of every entry, in the sorted order
+    std::transform(rowIndicesPermPair.begin(), rowIndicesPermPair.end(), std::back_inserter(permRowIndex), []
+        (const std::pair<int, int>& pair) {
+        return pair.first;
+    });
+    std::vector<int> CSRRowIndex(num_cells + 1, 0);
+    for (int i = 0; i < num_Nz; i++) {
+        CSRRowIndex[permRowIndex[i] + 1]++; // slot r + 1 counts the entries of row r
+    }
+    std::partial_sum(CSRRowIndex.begin(), CSRRowIndex.end(), CSRRowIndex.begin()); // running sum turns the counts into row start offsets
+
+    std::transform(rowIndicesPermPair.begin(), rowIndicesPermPair.end(), std::back_inserter(lduCSRIndex), [] // NOTE(review): appends to lduCSRIndex, which is not declared in this function -- assumes it is empty on entry
+        (const std::pair<int, int>& pair) {
+        return pair.second;
+    });
+
+    // get diagCSRIndex: for each cell i, the position j in lduCSRIndex that holds its diagonal entry
+    std::vector<int> diagCSRIndex(num_cells);
+    int startIndex = 0;
+    for (int i = 0; i < num_cells; i++) {
+        int diagIndex = i + num_surfaces; // index of diag entry in permTmp
+        for (int j = startIndex; j < num_Nz; j++) {
+            if (lduCSRIndex[j] == diagIndex) {
+                diagCSRIndex[i] = j;
+                startIndex = j + 1; // the next cell's scan resumes right after this hit
+                break;
+            }
+        }
+    }
+
+    // get CSRColIndex
+    // localToGlobalColIndices: add globalOffset to the lower/diag/upper part of colIndicesTmp; the trailing proc entries are left as given
+    std::transform(colIndicesTmp.begin(), colIndicesTmp.begin() + num_cells + 2 * num_surfaces, colIndicesTmp.begin(), 
+        [globalOffset](int i){return i + globalOffset;});
+    
+    // permute colIndicesTmp into the same order as lduCSRIndex
+    std::vector<int> CSRColIndex(num_Nz);
+    for (int i = 0; i < num_Nz; ++i){
+        CSRColIndex[i] = colIndicesTmp[lduCSRIndex[i]];
+    }
+
+    checkCudaErrors(cudaMalloc((void**)&d_ldu_to_csr_index, csr_col_index_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_diag_to_csr_index, cell_index_bytes));
+    checkCudaErrors(cudaMemcpy(d_ldu_to_csr_index, lduCSRIndex.data(), csr_col_index_bytes, cudaMemcpyHostToDevice)); // cudaMemcpy here, unlike the cudaMemcpyAsync-on-stream copies at the top
+    checkCudaErrors(cudaMemcpy(d_diag_to_csr_index, diagCSRIndex.data(), cell_index_bytes, cudaMemcpyHostToDevice));
+
+    // build d_csr_row_index, d_csr_col_index (the arrays solve() passes to setOperator)
+    checkCudaErrors(cudaMalloc((void**)&d_csr_row_index, csr_row_index_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_csr_col_index, csr_col_index_bytes));
+    checkCudaErrors(cudaMemcpy(d_csr_row_index, CSRRowIndex.data(), csr_row_index_bytes, cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(d_csr_col_index, CSRColIndex.data(), csr_col_index_bytes, cudaMemcpyHostToDevice));
+}
+
+void dfMatrixDataBase::createConstantFieldsInternal() { // allocates the constant internal-field buffers and registers each in fieldPointerMap under its variable name
+    checkCudaErrors(cudaMalloc((void**)&d_sf, surface_value_vec_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_mesh_dis, surface_value_vec_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_mag_sf, surface_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_weight, surface_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_phi_weight, surface_value_bytes)); // allocated here; initConstantFieldsInternal() does not copy anything into it
+    checkCudaErrors(cudaMalloc((void**)&d_delta_coeffs, surface_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_volume, cell_value_bytes));
+    fieldPointerMap["d_sf"] = d_sf;
+    fieldPointerMap["d_mesh_dis"] = d_mesh_dis;
+    fieldPointerMap["d_mag_sf"] = d_mag_sf;
+    fieldPointerMap["d_weight"] = d_weight;
+    fieldPointerMap["d_phi_weight"] = d_phi_weight;
+    fieldPointerMap["d_delta_coeffs"] = d_delta_coeffs;
+    fieldPointerMap["d_volume"] = d_volume;
+
+    checkCudaErrors(cudaMallocHost((void**)&h_sf, surface_value_vec_bytes)); // host buffers from cudaMallocHost; initConstantFieldsInternal() fills them before the upload
+    checkCudaErrors(cudaMallocHost((void**)&h_mesh_dis, surface_value_vec_bytes));
+    fieldPointerMap["h_sf"] = h_sf;
+    fieldPointerMap["h_mesh_dis"] = h_mesh_dis;
+}
+
+void dfMatrixDataBase::createConstantFieldsBoundary() { // allocates the constant boundary-field buffers and registers the double-valued ones in fieldPointerMap
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_sf, boundary_surface_value_vec_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_mag_sf, boundary_surface_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_delta_coeffs, boundary_surface_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_weight, boundary_surface_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_face_cell, boundary_surface_index_bytes)); // sized with the index byte count; not added to fieldPointerMap below
+    fieldPointerMap["d_boundary_sf"] = d_boundary_sf;
+    fieldPointerMap["d_boundary_mag_sf"] = d_boundary_mag_sf;
+    fieldPointerMap["d_boundary_delta_coeffs"] = d_boundary_delta_coeffs;
+    fieldPointerMap["d_boundary_weight"] = d_boundary_weight;
+
+    checkCudaErrors(cudaMallocHost((void**)&h_boundary_sf, boundary_surface_value_vec_bytes)); // host buffer that initConstantFieldsBoundary() fills before the upload
+    fieldPointerMap["h_boundary_sf"] = h_boundary_sf;
+}
+
+void dfMatrixDataBase::initConstantFieldsInternal(const double *sf, const double *mag_sf, 
+        const double *weight, const double *delta_coeffs, const double *volume, const double *mesh_distance) {
+    // reorder sf and mesh_distance from per-face (0,1,2) triples into three consecutive component planes of num_surfaces values each
+    for (int i = 0; i < num_surfaces; i++) {
+        h_sf[num_surfaces * 0 + i] = sf[i * 3 + 0];
+        h_sf[num_surfaces * 1 + i] = sf[i * 3 + 1];
+        h_sf[num_surfaces * 2 + i] = sf[i * 3 + 2];
+        h_mesh_dis[num_surfaces * 0 + i] = mesh_distance[i * 3 + 0];
+        h_mesh_dis[num_surfaces * 1 + i] = mesh_distance[i * 3 + 1];
+        h_mesh_dis[num_surfaces * 2 + i] = mesh_distance[i * 3 + 2];
+    }
+    checkCudaErrors(cudaMemcpyAsync(d_sf, h_sf, surface_value_vec_bytes, cudaMemcpyHostToDevice, stream)); // reordered copies go up from the h_ buffers
+    checkCudaErrors(cudaMemcpyAsync(d_mesh_dis, h_mesh_dis, surface_value_vec_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_mag_sf, mag_sf, surface_value_bytes, cudaMemcpyHostToDevice, stream)); // the remaining fields are copied straight from the caller's arrays
+    checkCudaErrors(cudaMemcpyAsync(d_weight, weight, surface_value_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_delta_coeffs, delta_coeffs, surface_value_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_volume, volume, cell_value_bytes, cudaMemcpyHostToDevice, stream));
+}
+
+void dfMatrixDataBase::initConstantFieldsBoundary(const double *boundary_sf, const double *boundary_mag_sf, 
+        const double *boundary_delta_coeffs, const double *boundary_weight, const int *boundary_face_cell, std::vector<int>& patch_type_calculated,
+        std::vector<int>& patch_type_extropolated) {
+    this->patch_type_calculated = patch_type_calculated; // both patch-type lists are kept on the object; this function does not upload them
+    this->patch_type_extropolated = patch_type_extropolated;
+    // reorder boundary_sf from per-face (0,1,2) triples into three consecutive component planes of num_boundary_surfaces values each
+    for (int i = 0; i < num_boundary_surfaces; i++) {
+        h_boundary_sf[num_boundary_surfaces * 0 + i] = boundary_sf[i * 3 + 0];
+        h_boundary_sf[num_boundary_surfaces * 1 + i] = boundary_sf[i * 3 + 1];
+        h_boundary_sf[num_boundary_surfaces * 2 + i] = boundary_sf[i * 3 + 2];
+    }
+    checkCudaErrors(cudaMemcpyAsync(d_boundary_sf, h_boundary_sf, boundary_surface_value_vec_bytes, cudaMemcpyHostToDevice, stream)); // reordered copy goes up from h_boundary_sf
+    checkCudaErrors(cudaMemcpyAsync(d_boundary_mag_sf, boundary_mag_sf, boundary_surface_value_bytes, cudaMemcpyHostToDevice, stream)); // the remaining fields are copied straight from the caller's arrays
+    checkCudaErrors(cudaMemcpyAsync(d_boundary_delta_coeffs, boundary_delta_coeffs, boundary_surface_value_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_boundary_weight, boundary_weight, boundary_surface_value_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_boundary_face_cell, boundary_face_cell, boundary_surface_index_bytes, cudaMemcpyHostToDevice, stream));  
+}
+
+void dfMatrixDataBase::createNonConstantFieldsInternal() { // allocates the per-cell and per-surface device/host buffers; only some are registered in fieldPointerMap
+    checkCudaErrors(cudaMalloc((void**)&d_rho, cell_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_u, cell_value_vec_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_u_old, cell_value_vec_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_u_old_host_order, cell_value_vec_bytes)); // allocated but not registered in fieldPointerMap below
+    checkCudaErrors(cudaMalloc((void**)&d_y, cell_value_bytes * num_species)); // one cell-sized slice per species
+    checkCudaErrors(cudaMalloc((void**)&d_he, cell_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_p, cell_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_k, cell_value_bytes));
+    fieldPointerMap["d_rho"] = d_rho;
+    fieldPointerMap["d_u"] = d_u;
+    fieldPointerMap["d_u_old"] = d_u_old;
+    fieldPointerMap["d_y"] = d_y;
+    fieldPointerMap["d_he"] = d_he;
+    fieldPointerMap["d_p"] = d_p;
+    fieldPointerMap["d_k"] = d_k;
+    
+    checkCudaErrors(cudaMalloc((void**)&d_rho_old, cell_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_k_old, cell_value_bytes));
+    fieldPointerMap["d_rho_old"] = d_rho_old;
+    fieldPointerMap["d_k_old"] = d_k_old;
+    // checkCudaErrors(cudaMalloc((void**)&d_u_old, cell_value_vec_bytes));
+    // checkCudaErrors(cudaMalloc((void**)&d_y_old, cell_value_bytes * num_species));
+    // checkCudaErrors(cudaMalloc((void**)&d_he_old, cell_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_p_old, cell_value_bytes));
+    fieldPointerMap["d_p_old"] = d_p_old;
+    
+    checkCudaErrors(cudaMalloc((void**)&d_phi, surface_value_bytes)); // phi buffers use the per-surface byte count, not the per-cell one
+    checkCudaErrors(cudaMalloc((void**)&d_phi_old, surface_value_bytes));
+    fieldPointerMap["d_phi"] = d_phi;
+    fieldPointerMap["d_phi_old"] = d_phi_old;
+
+    // thermophysical fields (like the turbulence and rAU/HbyA buffers below, not added to fieldPointerMap in this function)
+    checkCudaErrors(cudaMalloc((void**)&d_thermo_psi, cell_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_thermo_alpha, cell_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_thermo_rhoD, num_species * cell_value_bytes));
+
+    checkCudaErrors(cudaMalloc((void**)&d_hDiff_corr_flux, cell_value_vec_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_diff_alphaD, cell_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_dpdt, cell_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_T, cell_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_mu, cell_value_bytes));
+
+    // turbulence fields
+    checkCudaErrors(cudaMalloc((void**)&d_turbulence_k, cell_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_turbulence_epsilon, cell_value_bytes));
+
+    // internal fields used between eqns
+    checkCudaErrors(cudaMalloc((void**)&d_rAU, cell_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_HbyA, cell_value_vec_bytes));
+
+    // computed on GPU, used on CPU; these host buffers receive device-to-host copies
+    checkCudaErrors(cudaMallocHost((void**)&h_T, cell_value_bytes));
+    checkCudaErrors(cudaMallocHost((void**)&h_rho, cell_value_bytes));
+    checkCudaErrors(cudaMallocHost((void**)&h_rho_old, cell_value_bytes));
+    checkCudaErrors(cudaMallocHost((void**)&h_u, cell_value_vec_bytes));
+    checkCudaErrors(cudaMallocHost((void**)&h_u_old, cell_value_vec_bytes));
+    checkCudaErrors(cudaMallocHost((void**)&h_y, cell_value_bytes * num_species));
+    checkCudaErrors(cudaMallocHost((void**)&h_he, cell_value_bytes));
+    checkCudaErrors(cudaMallocHost((void**)&h_k, cell_value_bytes));
+    checkCudaErrors(cudaMallocHost((void**)&h_k_old, cell_value_bytes));
+    fieldPointerMap["h_T"] = h_T;
+    fieldPointerMap["h_rho"] = h_rho;
+    fieldPointerMap["h_rho_old"] = h_rho_old;
+    fieldPointerMap["h_u"] = h_u;
+    fieldPointerMap["h_u_old"] = h_u_old;
+    fieldPointerMap["h_y"] = h_y;
+    fieldPointerMap["h_he"] = h_he;
+    fieldPointerMap["h_k"] = h_k;
+    fieldPointerMap["h_k_old"] = h_k_old;
+
+    // computed on CPU, used on GPU; these host buffers are the source of host-to-device copies
+    checkCudaErrors(cudaMallocHost((void**)&h_p, cell_value_bytes));
+    checkCudaErrors(cudaMallocHost((void**)&h_p_old, cell_value_bytes));
+    checkCudaErrors(cudaMallocHost((void**)&h_phi, surface_value_bytes));
+    fieldPointerMap["h_p"] = h_p;
+    fieldPointerMap["h_p_old"] = h_p_old;
+    fieldPointerMap["h_phi"] = h_phi;
+}
+
+void dfMatrixDataBase::createNonConstantFieldsBoundary() { // allocates the boundary-field buffers: device memory via cudaMalloc, page-locked host memory via cudaMallocHost
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_rho, boundary_surface_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_u, boundary_surface_value_vec_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_u_old, boundary_surface_value_vec_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_u_old_host_order, boundary_surface_value_vec_bytes)); // allocated only; not stored in fieldPointerMap by this function
+    // only pointers stored in fieldPointerMap below can be returned by getFieldPointer(alias, loc, position::boundary)
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_y, boundary_surface_value_bytes * num_species)); // num_species times boundary_surface_value_bytes
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_he, boundary_surface_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_p, boundary_surface_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_k, boundary_surface_value_bytes));
+    fieldPointerMap["d_boundary_rho"] = d_boundary_rho;
+    fieldPointerMap["d_boundary_u"] = d_boundary_u;
+    fieldPointerMap["d_boundary_u_old"] = d_boundary_u_old;
+    fieldPointerMap["d_boundary_y"] = d_boundary_y;
+    fieldPointerMap["d_boundary_he"] = d_boundary_he;
+    fieldPointerMap["d_boundary_p"] = d_boundary_p;
+    fieldPointerMap["d_boundary_k"] = d_boundary_k;
+    // *_old buffers: destinations of the device-to-device copies issued in preTimeStep()
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_rho_old, boundary_surface_value_bytes));
+    fieldPointerMap["d_boundary_rho_old"] = d_boundary_rho_old;
+    // (d_boundary_u_old is already allocated at the top of this function)
+    // checkCudaErrors(cudaMalloc((void**)&d_boundary_y_old, boundary_surface_value_bytes * num_species));
+    // checkCudaErrors(cudaMalloc((void**)&d_boundary_he_old, boundary_surface_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_p_old, boundary_surface_value_bytes)); // not stored in fieldPointerMap here, unlike h_boundary_p_old below
+
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_phi, boundary_surface_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_phi_old, boundary_surface_value_bytes));
+    fieldPointerMap["d_boundary_phi"] = d_boundary_phi;
+    fieldPointerMap["d_boundary_phi_old"] = d_boundary_phi_old;
+
+    // thermophysical fields (allocated only; none of them is stored in fieldPointerMap by this function)
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_thermo_psi, boundary_surface_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_thermo_alpha, boundary_surface_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_thermo_rhoD, num_species * boundary_surface_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_hDiff_corr_flux, boundary_surface_value_vec_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_diff_alphaD, boundary_surface_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_T, boundary_surface_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_mu, boundary_surface_value_bytes));
+
+    // intermediate fields shared between equations (allocated only, not stored in fieldPointerMap here)
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_rAU, boundary_surface_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_HbyA, boundary_surface_value_vec_bytes));
+
+    // host buffers for fields computed on the GPU and used on the CPU (filled by a device-to-host copy)
+    checkCudaErrors(cudaMallocHost((void**)&h_boundary_rho, boundary_surface_value_bytes));
+    checkCudaErrors(cudaMallocHost((void**)&h_boundary_rho_old, boundary_surface_value_bytes));
+    checkCudaErrors(cudaMallocHost((void**)&h_boundary_u, boundary_surface_value_vec_bytes));
+    checkCudaErrors(cudaMallocHost((void**)&h_boundary_u_old, boundary_surface_value_vec_bytes));
+    checkCudaErrors(cudaMallocHost((void**)&h_boundary_y, boundary_surface_value_bytes * num_species));
+    checkCudaErrors(cudaMallocHost((void**)&h_boundary_he, boundary_surface_value_bytes));
+    checkCudaErrors(cudaMallocHost((void**)&h_boundary_k, boundary_surface_value_bytes));
+    fieldPointerMap["h_boundary_rho"] = h_boundary_rho;
+    fieldPointerMap["h_boundary_rho_old"] = h_boundary_rho_old;
+    fieldPointerMap["h_boundary_u"] = h_boundary_u;
+    fieldPointerMap["h_boundary_u_old"] = h_boundary_u_old;
+    fieldPointerMap["h_boundary_y"] = h_boundary_y;
+    fieldPointerMap["h_boundary_he"] = h_boundary_he;
+    fieldPointerMap["h_boundary_k"] = h_boundary_k;
+
+    // host buffers for fields computed on the CPU and used on the GPU (consumed by a host-to-device copy)
+    checkCudaErrors(cudaMallocHost((void**)&h_boundary_p, boundary_surface_value_bytes));
+    checkCudaErrors(cudaMallocHost((void**)&h_boundary_p_old, boundary_surface_value_bytes));
+    checkCudaErrors(cudaMallocHost((void**)&h_boundary_phi, boundary_surface_value_bytes));
+    checkCudaErrors(cudaMallocHost((void**)&h_boundary_phi_old, boundary_surface_value_bytes));
+    fieldPointerMap["h_boundary_p"] = h_boundary_p;
+    fieldPointerMap["h_boundary_p_old"] = h_boundary_p_old;
+    fieldPointerMap["h_boundary_phi"] = h_boundary_phi;
+    fieldPointerMap["h_boundary_phi_old"] = h_boundary_phi_old;
+}
+
+void dfMatrixDataBase::preTimeStep() { // enqueues device-to-device copies of the current fields into their *_old buffers
+    checkCudaErrors(cudaMemcpyAsync(d_rho_old, d_rho, cell_value_bytes, cudaMemcpyDeviceToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_boundary_rho_old, d_boundary_rho, boundary_surface_value_bytes, cudaMemcpyDeviceToDevice, stream));
+    // every copy here is issued asynchronously on `stream`; this function does not wait for them to finish
+    checkCudaErrors(cudaMemcpyAsync(d_phi_old, d_phi, surface_value_bytes, cudaMemcpyDeviceToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_boundary_phi_old, d_boundary_phi, boundary_surface_value_bytes, cudaMemcpyDeviceToDevice, stream));
+
+    checkCudaErrors(cudaMemcpyAsync(d_u_old, d_u, cell_value_vec_bytes, cudaMemcpyDeviceToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_boundary_u_old, d_boundary_u, boundary_surface_value_vec_bytes, cudaMemcpyDeviceToDevice, stream));
+    // k: only the internal buffer is copied; no boundary copy is made in this function
+    checkCudaErrors(cudaMemcpyAsync(d_k_old, d_k, cell_value_bytes, cudaMemcpyDeviceToDevice, stream));
+
+    checkCudaErrors(cudaMemcpyAsync(d_p_old, d_p, cell_value_bytes, cudaMemcpyDeviceToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_boundary_p_old, d_boundary_p, boundary_surface_value_bytes, cudaMemcpyDeviceToDevice, stream));
+}
+
+void dfMatrixDataBase::postTimeStep() {} // currently a no-op: the body is empty
+
+// Returns the buffer registered in fieldPointerMap under "<h|d>_<alias>" (position::internal)
+// or "<h|d>_boundary_<alias>" (position::boundary); "h" for location::cpu, "d" otherwise.
+// Prints a warning to stderr and returns nullptr when no non-null pointer is registered there.
+double* dfMatrixDataBase::getFieldPointer(const char* fieldAlias, location loc, position pos) {
+    // A std::string key cannot overflow on a long alias (a fixed char buffer could) and is
+    // always initialized, even for an unexpected pos or a null alias (both simply miss the map).
+    std::string mergedName = (loc == location::cpu) ? "h_" : "d_";
+    if (pos != position::internal) {
+        mergedName += (pos == position::boundary) ? "boundary_" : "unknownPosition_";
+    }
+    mergedName += (fieldAlias != nullptr) ? fieldAlias : "(null)";
+    // Single find(): no second lookup and no chance of inserting an entry through operator[].
+    const auto entry = fieldPointerMap.find(mergedName);
+    double *pointer = (entry != fieldPointerMap.end()) ? entry->second : nullptr;
+    if (pointer == nullptr) {
+        fprintf(stderr, "Warning! getFieldPointer of %s returns nullptr!\n", mergedName.c_str());
+    }
+    return pointer;
+}
diff --git a/src_gpu/dfMatrixOpBase.H b/src_gpu/dfMatrixOpBase.H
new file mode 100644
index 000000000..c27090303
--- /dev/null
+++ b/src_gpu/dfMatrixOpBase.H
@@ -0,0 +1,343 @@
+#pragma once
+#include <cuda_runtime.h>
+#include <nccl.h>
+#include "dfThermo.H"
+class dfThermo;
+
+// The feature switches below are order-dependent: keep the steps in this sequence.
+
+// step 0: STREAM_ALLOCATOR is switched on here unless the build already defined it
+#ifndef STREAM_ALLOCATOR
+    #define STREAM_ALLOCATOR
+#endif
+
+// step 0: USE_GRAPH is switched on here unless already defined (step 1 may switch it off again)
+#ifndef USE_GRAPH
+    #define USE_GRAPH
+#endif
+
+// step 0: TIME_GPU is switched on here unless already defined
+#ifndef TIME_GPU
+    #define TIME_GPU
+#endif
+
+// step 1: TIME_GPU and USE_GRAPH must not both be on; TIME_GPU wins. NOTE(review): step 0 always leaves TIME_GPU defined, so USE_GRAPH is always undefined after this point - confirm that is intended.
+#if (defined TIME_GPU) && (defined USE_GRAPH)
+    #undef USE_GRAPH
+#endif
+
+// step 2: USE_GRAPH requires STREAM_ALLOCATOR (step 0 already guarantees it, so this only acts as a safety net)
+#if (defined USE_GRAPH) && (!defined STREAM_ALLOCATOR)
+    #define STREAM_ALLOCATOR
+#endif
+
+extern int myRank; // declared here, defined elsewhere; printed as the rank label by the macros in this header
+// PRINT_PTR(x): debug helper - writes rank, file, line, the spelling of x and its pointer value to stderr
+#define PRINT_PTR(x) { \
+    fprintf(stderr, "rank[%d], %s %d, print ptr %s: %p\n", myRank, __FILE__, __LINE__, #x, x); \
+}
+
+extern __global__ void warmup(); // kernel launched by WARM_UP; only declared in this header
+// Timing macros: real implementations when TIME_GPU is defined, empty expansions otherwise. They use a variable named `stream` from the calling scope.
+#ifdef TIME_GPU
+    #define WARM_UP \
+        warmup<<<10, 1024, 0, stream>>>();
+    // declares time_elapsed_kernel, start_kernel and stop_kernel in the calling scope and creates both events
+    #define TICK_INIT_EVENT \
+        float time_elapsed_kernel=0;\
+        cudaEvent_t start_kernel, stop_kernel;\
+        checkCudaErrors(cudaEventCreate(&start_kernel));\
+        checkCudaErrors(cudaEventCreate(&stop_kernel));
+    // records start_kernel on `stream`
+    #define TICK_START_EVENT \
+        checkCudaErrors(cudaEventRecord(start_kernel,stream));
+    // records stop_kernel on `stream`, synchronizes on both events, then prints the elapsed time (ms) to stderr labelled with the text of `prefix`
+    #define TICK_END_EVENT(prefix) \
+        checkCudaErrors(cudaEventRecord(stop_kernel,stream));\
+        checkCudaErrors(cudaEventSynchronize(start_kernel));\
+        checkCudaErrors(cudaEventSynchronize(stop_kernel));\
+        checkCudaErrors(cudaEventElapsedTime(&time_elapsed_kernel,start_kernel,stop_kernel));\
+        fprintf(stderr, "rank[%d], name: %s, time: %lf(ms)\n", myRank, #prefix, time_elapsed_kernel);
+
+/*
+    // usage:
+    // WARM_UP is only needed when the very first kernel is being profiled;
+    // in that case place it before TICK_INIT_EVENT, otherwise leave it out.
+    WARM_UP;
+    // init event (it declares local variables, so use it once per scope; these macros never destroy the events)
+    TICK_INIT_EVENT;
+    // start event
+    TICK_START_EVENT;
+    // call your kernel, or kernels, or wrapper functions, e.g.:
+    my_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num, input, output);
+    // end event with the name to print next to the measured time, e.g.:
+    TICK_END_EVENT(my_kernel);
+*/
+
+#else
+    #define WARM_UP
+    #define TICK_INIT_EVENT
+    #define TICK_START_EVENT
+    #define TICK_END_EVENT(prefix)
+#endif
+
+// tools
+void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output);
+void permute_vector_h2d(cudaStream_t stream, int num_cells, const double *input, double *output);
+
+void field_add_scalar(cudaStream_t stream,
+        int num, const double *input1, const double *input2, double *output,
+        int num_boundary_surfaces, const double *boundary_input1, const double *boundary_input2, double *boundary_output);
+
+void field_add_vector(cudaStream_t stream,
+        int num_cells, const double *input1, const double *input2, double *output,
+        int num_boundary_surfaces, const double *boundary_input1, const double *boundary_input2, double *boundary_output, double sign = 1.);
+
+void field_add_vector(cudaStream_t stream,
+        int num_cells, const double *input1, const double *input2, double *output, double sign = 1.);
+
+void field_multiply_scalar(cudaStream_t stream,
+        int num_cells, const double *input1, const double *input2, double *output,
+        int num_boundary_surfaces, const double *boundary_input1, const double *boundary_input2, double *boundary_output);
+
+void vector_half_mag_square(cudaStream_t stream, int num_cells, const double *vec_input, double *scalar_output,
+        int num_boundary_surfaces, const double *boundary_vec_input, double *boundary_scalar_output);
+
+void scalar_field_multiply_vector_field(cudaStream_t stream,
+        int num_cells, const double *input1, const double *input2, double *output,
+        int num_boundary_surfaces, const double *boundary_input1, const double *boundary_input2, double *boundary_output, double sign = 1.);
+
+void scalar_field_multiply_vector_field(cudaStream_t stream,
+        int num_cells, const double *input1, const double *input2, double *output, double sign = 1.);
+
+void fvc_to_source_vector(cudaStream_t stream, int num_cells, const double *volume, const double *fvc_output, double *source);
+
+void fvc_to_source_scalar(cudaStream_t stream, int num_cells, const double *volume, const double *fvc_output, double *source, double sign = 1.);
+
+void ldu_to_csr_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surface, int num_Nz, 
+        const int* boundary_cell_face, const int *ldu_to_csr_index,
+        int num_patches, const int *patch_size, const int *patch_type,
+        double* ldu, double *source, // b = source
+        const double *internal_coeffs, const double *boundary_coeffs, double *A);
+
+void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surface,
+        const int* boundary_cell_face, const int *ldu_to_csr_index, const int *diag_to_csr_index,
+        const double *ldu, const double *internal_coeffs, const double *boundary_coeffs, double *source, double *A);
+
+void update_boundary_coeffs_scalar(cudaStream_t stream,
+        int num_patches, const int *patch_size, const int *patch_type,
+        const double *boundary_delta_coeffs, const double *boundary_vf, const double *boundary_weight, 
+        double *value_internal_coeffs, double *value_boundary_coeffs,
+        double *gradient_internal_coeffs, double *gradient_boundary_coeffs, const double *energy_gradient = nullptr);
+
+void update_boundary_coeffs_vector(cudaStream_t stream, int num_boundary_surfaces, int num_patches,
+        const int *patch_size, const int *patch_type, const double *boundary_vf, 
+        const double *boundary_deltaCoeffs, const double *boundary_weight,
+        double *value_internal_coeffs, double *value_boundary_coeffs,
+        double *gradient_internal_coeffs, double *gradient_boundary_coeffs);
+
+void correct_boundary_conditions_processor_scalar(cudaStream_t stream, ncclComm_t comm,
+        int peer, int num, int offset,
+        const double *vf, const int *boundary_cell_face, double *vf_boundary);
+
+void correct_boundary_conditions_scalar(cudaStream_t stream, ncclComm_t comm,
+        const int *neighbor_peer, int num_boundary_surfaces, int num_patches,
+        const int *patch_size, const int *patch_type, const double *boundary_delta_coeffs, 
+        const int *boundary_cell_face, const double *vf, double *boundary_vf,
+        const int *cyclicNeighbor, const int *patchSizeOffset, const double *boundary_weight,
+        const double *boundary_T = nullptr, const double *boundary_y = nullptr,
+        const double *thermo_gradient = nullptr, dfThermo *GPUThermo = nullptr);
+
+void correct_boundary_conditions_vector(cudaStream_t stream, ncclComm_t comm,
+        const int *neighbor_peer, int num_boundary_surfaces, int num_cells, int num_patches,
+        const int *patch_size, const int *patch_type, const double *boundary_weight, 
+        const int *boundary_cell_face, const double *vf, double *boundary_vf,
+        const int *cyclicNeighbor, const int *patchSizeOffset);
+
+void compute_upwind_weight(cudaStream_t stream, int num_surfaces, const double *phi, double *weight);
+
+void compute_limitedLinear_weight(cudaStream_t stream, ncclComm_t comm, const int *neighbor_peer, 
+        int num_surfaces, int num_cells, int num_boundary_surfaces,
+        const int *lowerAddr, const int *upperAddr, const double *mesh_distance, 
+        const double *weight, const double *Sf, const double *vf, const double *phi,  double *output, // end for internal
+        int num_patches, const int *patch_size, const int *patch_type, const double *boundary_weight,
+        const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf,
+        const double *volume, const double *boundary_mag_Sf, const double *boundary_phi, 
+        // const double *boundary_distance, double *boundary_output, 
+        const int *cyclicNeighbor, const int *patchSizeOffset,
+        const double *boundary_deltaCoeffs);
+
+// fvm ops
+
+void fvm_ddt_vol_scalar_vol_scalar(cudaStream_t stream, int num_cells, double rDeltaT,
+        const double *rho, const double *rho_old, const double *vf, const double *volume,
+        double *diag, double *source, double sign = 1.);
+
+void fvm_ddt_scalar(cudaStream_t stream, int num_cells, double rDeltaT, 
+        const double *vf_old, const double *volume, 
+        double *diag, double *source, double sign = 1.);
+
+void fvm_ddt_vector(cudaStream_t stream, int num_cells, double rDeltaT,
+        const double *rho, const double *rho_old, const double *vf, const double *volume,
+        double *diag, double *source, double sign = 1.);
+
+void fvm_div_scalar(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr,
+        const double *phi, const double *weight,
+        double *lower, double *upper, double *diag, // end for internal
+        int num_patches, const int *patch_size, const int *patch_type,
+        const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs,
+        double *internal_coeffs, double *boundary_coeffs, double sign = 1.);
+
+// Vector counterpart of fvm_div_scalar: same parameter list plus num_boundary_surfaces.
+void fvm_div_vector(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces,
+        const int *lowerAddr, const int *upperAddr, const double *phi, const double *weight,
+        double *lower, double *upper, double *diag, // end for internal
+        int num_patches, const int *patch_size, const int *patch_type,
+        const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs,
+        double *internal_coeffs, double *boundary_coeffs, double sign = 1.);
+
+void fvm_laplacian_scalar(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces, 
+        const int *lowerAddr, const int *upperAddr,
+        const double *weight, const double *mag_sf, const double *delta_coeffs, const double *gamma,
+        double *lower, double *upper, double *diag, // end for internal
+        int num_patches, const int *patch_size, const int *patch_type,
+        const double *boundary_mag_sf, const double *boundary_gamma,
+        const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs,
+        double *internal_coeffs, double *boundary_coeffs, double sign = 1.);
+
+void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces, 
+        const int *lowerAddr, const int *upperAddr,
+        const double *weight, const double *mag_sf, const double *delta_coeffs, const double *gamma,
+        double *lower, double *upper, double *diag, // end for internal
+        int num_patches, const int *patch_size, const int *patch_type,
+        const double *boundary_mag_sf, const double *boundary_gamma,
+        const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs,
+        double *internal_coeffs, double *boundary_coeffs, double sign = 1.);
+
+void fvm_laplacian_surface_scalar_vol_scalar(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces, 
+        const int *lowerAddr, const int *upperAddr,
+        const double *mag_sf, const double *delta_coeffs, const double *gamma,
+        double *lower, double *upper, double *diag, // end for internal
+        int num_patches, const int *patch_size, const int *patch_type,
+        const double *boundary_mag_sf, const double *boundary_gamma,
+        const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs,
+        double *internal_coeffs, double *boundary_coeffs, double sign = 1.);
+
+// fvc ops
+// fvc_ddt doesn't consider to add fvc_output to source yet, which needs (fvc_output * volume * sign).
+void fvc_ddt_vol_scalar_vol_scalar(cudaStream_t stream, int num_cells, double rDeltaT,
+        const double *rho, const double *rho_old, const double *vf, const double *vf_old, const double *volume, 
+        double *output, double sign = 1.);
+
+void fvc_ddt_scalar(cudaStream_t stream, int num_cells, double rDeltaT,
+        const double *vf, const double *vf_old, const double *volume, double *source, double sign);
+
+void fvc_ddt_scalar_field(cudaStream_t stream, int num_cells, double rDeltaT,
+        const double *vf, const double *vf_old, const double *volume, double *source, double sign = 1.);
+
+void fvc_grad_vector(cudaStream_t stream, ncclComm_t comm, 
+        int num_cells, int num_surfaces, int num_boundary_surfaces,
+        const int *neighbor_peer, const int *lowerAddr, const int *upperAddr, 
+        const double *weight, const double *Sf, const double *vf, double *output, // end for internal
+        int num_patches, const int *patch_size, const int *patch_type,
+        const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *boundary_weight, 
+        const double *volume, const double *boundary_mag_Sf, double *boundary_output,
+        const int *cyclicNeighbor, const int *patchSizeOffset,
+        const double *boundary_deltaCoeffs, double sign = 1.);
+
+void fvc_div_surface_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces,
+        const int *lowerAddr, const int *upperAddr, const double *ssf, const int *boundary_cell_face,
+        int num_patches, const int *patch_size, const int *patch_type,
+        const double *boundary_ssf, const double *volume, double *output, double sign = 1.);
+
+void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces,
+        const int *lowerAddr, const int *upperAddr,
+        const double *weight, const double *Sf, const double *vf, double *output, // end for internal
+        int num_patches, const int *patch_size, const int *patch_type, const int *boundary_cell_face,
+        const double *boundary_weight, const double *boundary_vf, const double *boundary_Sf,
+        const double *volume, double sign = 1.);
+
+void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, 
+        const int *lowerAddr, const int *upperAddr, 
+        const double *weight, const double *Sf, const double *vf, double *output, // end for internal
+        int num_patches, const int *patch_size, const int *patch_type, const double *boundary_weight, 
+        const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf,
+        const double *volume, double sign = 1.);
+
+void fvc_div_surface_scalar_vol_scalar(cudaStream_t stream, int num_surfaces, 
+        const int *lowerAddr, const int *upperAddr,
+        const double *weight, const double *vf, const double *ssf, double *output, // end for internal
+        int num_patches, const int *patch_size, const int *patch_type,
+        const int *boundary_cell_face, const double *boundary_vf, const double *boundary_ssf, 
+        double sign = 1.);
+
+void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, 
+        const int *lowerAddr, const int *upperAddr, 
+        const double *weight, const double *Sf, const double *vf, double *output, // end for internal
+        int num_patches, const int *patch_size, const int *patch_type, const double *boundary_weight,
+        const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume, double sign = 1.);
+
+void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, 
+        const int *lowerAddr, const int *upperAddr, 
+        const double *weight, const double *Sf, const double *vf, double *output, // end for internal
+        int num_patches, const int *patch_size, const int *patch_type, const double *boundary_weight,
+        const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume, 
+        bool dividVol, double sign = 1.);
+
+void fvc_grad_cell_scalar_withBC(cudaStream_t stream, ncclComm_t comm, const int *neighbor_peer,
+        int num_cells, int num_surfaces, int num_boundary_surfaces,
+        const int *lowerAddr, const int *upperAddr,
+        const double *weight, const double *Sf, const double *vf, double *output, // end for internal
+        int num_patches, const int *patch_size, const int *patch_type, const double *boundary_weight,
+        const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf,
+        const double *volume, const double *boundary_mag_Sf, double *boundary_output,
+        const int *cyclicNeighbor, const int *patchSizeOffset,
+        const double *boundary_deltaCoeffs);
+
+void fvc_laplacian_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces,
+        const int *lowerAddr, const int *upperAddr,
+        const double *weight, const double *mag_sf, const double *delta_coeffs, const double *volume,
+        const double *gamma, const double *vf, double *output, // end for internal
+        int num_patches, const int *patch_size, const int *patch_type,
+        const int *boundary_cell_face,
+        const double *boundary_mag_sf, const double *boundary_delta_coeffs,
+        const double *boundary_gamma, const double *boundary_vf, double sign = 1.);
+
+void fvc_flux(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces,
+        const int *lowerAddr, const int *upperAddr, 
+        const double *weight, const double *Sf, const double *vf, double *output, // end for internal
+        int num_patches, const int *patch_size, const int *patch_type,
+        const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, 
+        double *boundary_output, double sign);
+
+void fvc_interpolate(cudaStream_t stream, int num_cells, int num_surfaces,
+        const int *lowerAddr, const int *upperAddr, 
+        const double *weight, const double *vf, double *output, // end for internal
+        int num_patches, const int *patch_size, const int *patch_type,
+        const double *boundary_vf, double *boundary_output, double sign);
+
+// others
+void scale_dev2T_tensor(cudaStream_t stream, int num_cells, const double *vf1, double *vf2,
+        int num_boundary_surfaces, const double *boundary_vf1, double *boundary_vf2);
+
+void fvMtx_A(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, 
+        const int *boundary_cell_face, const double *internal_coeffs, const double *volume, const double *diag, 
+        double *A_pEqn);
+
+// Every pointer argument is const except the two non-const ones, H_pEqn and H_pEqn_perm.
+void fvMtx_H(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces,
+        const int *lowerAddr, const int *upperAddr, const double *volume,
+        int num_patches, const int *patch_size, const int *patch_type,
+        const int *boundary_cell_face, const double *internal_coeffs, const double *boundary_coeffs,
+        const double *lower, const double *upper, const double *source, const double *psi, double *H_pEqn, double *H_pEqn_perm);
+
+void fvMtx_flux(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces, 
+        const int *lowerAddr, const int *upperAddr, const double *lower, const double *upper,
+        const double *psi, double *output, //end for internal
+        int num_patches, const int *patch_size, const int *patch_type,
+        const int *boundary_cell_face, const double *internal_coeffs, const double *boundary_coeffs, 
+        const int *cyclicNeighbor, const int *patchSizeOffset, const double *boundary_psi, double *boundary_output);
+
+void solve_explicit_scalar(cudaStream_t stream, int num_cells, const double *diag, const double *source,
+        double *psi);
+
diff --git a/src_gpu/dfMatrixOpBase.cu b/src_gpu/dfMatrixOpBase.cu
new file mode 100644
index 000000000..679418df7
--- /dev/null
+++ b/src_gpu/dfMatrixOpBase.cu
@@ -0,0 +1,3349 @@
+#include "dfMatrixOpBase.H"
+#include "dfMatrixDataBase.H"
+#include "dfNcclBase.H"
+
+#include <cuda_runtime.h>
+#include "cuda_profiler_api.h"
+
+using std::min;
+using std::max;
+
+// Empty kernel: derives the global thread index and returns; it reads and writes no memory.
+__global__ void warmup()
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= 10240) return;
+}
+
+// Repacks a 3-component field: input is component-major (num_cells * c + cell), output is cell-major (cell * 3 + c).
+__global__ void permute_vector_d2h_kernel(int num_cells, const double *input, double *output)
+{
+    const int cell = blockDim.x * blockIdx.x + threadIdx.x;
+    if (cell < num_cells) {
+        for (int c = 0; c < 3; ++c) {
+            output[cell * 3 + c] = input[num_cells * c + cell];
+        }
+    }
+}
+
+// Repacks a 3-component field: input is cell-major (cell * 3 + c), output is component-major (num_cells * c + cell).
+__global__ void permute_vector_h2d_kernel(int num_cells, const double *input, double *output)
+{
+    const int cell = blockDim.x * blockIdx.x + threadIdx.x;
+    if (cell < num_cells) {
+        for (int c = 0; c < 3; ++c) {
+            output[num_cells * c + cell] = input[cell * 3 + c];
+        }
+    }
+}
+
+// Element-wise sum of two scalar fields; one thread index serves both the cell arrays and the boundary-face arrays.
+__global__ void field_add_scalar_kernel(int num_cells, int num_boundary_surfaces,
+        const double *input1, const double *input2, double *output,
+        const double *boundary_input1, const double *boundary_input2, double *boundary_output)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // cell values
+    if (tid < num_cells) output[tid] = input1[tid] + input2[tid];
+    // boundary-face values
+    if (tid < num_boundary_surfaces) boundary_output[tid] = boundary_input1[tid] + boundary_input2[tid];
+}
+
+// output = input1 + input2 * sign for 3-component fields stored component-major, on cells and on boundary faces.
+__global__ void field_add_vector_kernel(int num_cells, int num_boundary_surfaces,
+        const double *input1, const double *input2, double *output,
+        const double *boundary_input1, const double *boundary_input2, double *boundary_output, double sign)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (tid < num_cells) {
+        for (int c = 0; c < 3; ++c)
+            output[num_cells * c + tid] = input1[num_cells * c + tid] + input2[num_cells * c + tid] * sign;
+    }
+    if (tid < num_boundary_surfaces) {
+        for (int c = 0; c < 3; ++c)
+            boundary_output[num_boundary_surfaces * c + tid] = boundary_input1[num_boundary_surfaces * c + tid] + boundary_input2[num_boundary_surfaces * c + tid] * sign;
+    }
+}
+
+// Cell-only variant: output = input1 + input2 * sign for a 3-component field stored component-major.
+__global__ void field_add_vector_kernel_internal(int num_cells,
+        const double *input1, const double *input2, double *output, double sign)
+{
+    const int cell = blockDim.x * blockIdx.x + threadIdx.x;
+    if (cell >= num_cells) return;
+
+    for (int c = 0; c < 3; ++c)
+        output[num_cells * c + cell] = input1[num_cells * c + cell] + input2[num_cells * c + cell] * sign;
+}
+
+// Element-wise product of two scalar fields; one thread index serves both the cell arrays and the boundary-face arrays.
+__global__ void field_multiply_scalar_kernel(int num_cells, int num_boundary_surfaces,
+        const double *input1, const double *input2, double *output,
+        const double *boundary_input1, const double *boundary_input2, double *boundary_output)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+
+    // cell values
+    if (tid < num_cells) output[tid] = input1[tid] * input2[tid];
+    // boundary-face values
+    if (tid < num_boundary_surfaces) boundary_output[tid] = boundary_input1[tid] * boundary_input2[tid];
+}
+
+// Writes 0.5 * (x*x + y*y + z*z) of a component-major 3-component field, for cells and for boundary faces.
+__global__ void vector_half_magSqr_kernal(int num_cells, int num_boundary_surfaces,
+        const double *vec_input, double *scalar_output,
+        const double *boundary_vec_input, double *boundary_scalar_output)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (tid < num_cells) {
+        const double *v = vec_input + tid; // the three components sit num_cells apart
+        scalar_output[tid] = 0.5 * (v[0] * v[0] + v[num_cells] * v[num_cells] + v[num_cells * 2] * v[num_cells * 2]);
+    }
+    if (tid < num_boundary_surfaces) {
+        const double *bv = boundary_vec_input + tid; // the three components sit num_boundary_surfaces apart
+        boundary_scalar_output[tid] = 0.5 * (bv[0] * bv[0] + bv[num_boundary_surfaces] * bv[num_boundary_surfaces] + bv[num_boundary_surfaces * 2] * bv[num_boundary_surfaces * 2]);
+    }
+}
+
+// Scales every component of a component-major 3-component field by a per-element scalar, on cells and boundary faces.
+__global__ void scalar_multiply_vector_kernel(int num_cells, int num_boundary_surfaces,
+        const double *scalar_input, const double *vector_input, double *output,
+        const double *scalar_boundary_input, const double *vector_boundary_input, double *boundary_output)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (tid < num_cells) {
+        for (int c = 0; c < 3; ++c)
+            output[num_cells * c + tid] = scalar_input[tid] * vector_input[num_cells * c + tid];
+    }
+    if (tid < num_boundary_surfaces) {
+        for (int c = 0; c < 3; ++c)
+            boundary_output[num_boundary_surfaces * c + tid] = scalar_boundary_input[tid] * vector_boundary_input[num_boundary_surfaces * c + tid];
+    }
+}
+
+// Cell-only variant: scales every component of a component-major 3-component field by scalar_input of the same cell.
+__global__ void scalar_multiply_vector_internal_kernel(int num_cells,
+        const double *scalar_input, const double *vector_input, double *output)
+{
+    const int cell = blockDim.x * blockIdx.x + threadIdx.x;
+    if (cell >= num_cells) return;
+
+    for (int c = 0; c < 3; ++c)
+        output[num_cells * c + cell] = scalar_input[cell] * vector_input[num_cells * c + cell];
+}
+
+// Accumulates fvc_output into source, three consecutive values per cell (cell-major indexing: cell * 3 + c).
+// A volume-weighted form (fvc_output * volume[cell]) is not applied here, so `volume` is currently unused.
+__global__ void fvc_to_source_vector_kernel(int num_cells, const double *volume, const double *fvc_output, double *source)
+{
+    const int cell = blockDim.x * blockIdx.x + threadIdx.x;
+    if (cell >= num_cells) return;
+
+    const double *src = fvc_output + cell * 3;
+    double *dst = source + cell * 3;
+    for (int c = 0; c < 3; ++c) {
+        dst[c] += src[c];
+    }
+}
+
+// Per cell: source += fvc_output * volume * sign.
+__global__ void fvc_to_source_scalar_kernel(int num_cells, const double *volume, const double *fvc_output, double *source, double sign)
+{
+    const int cell = blockDim.x * blockIdx.x + threadIdx.x;
+    if (cell < num_cells) {
+        source[cell] += fvc_output[cell] * volume[cell] * sign;
+    }
+}
+
+// Per-face weight derived from the sign of the flux phi:
+// 1 when phi >= 0 (zero flux included), 0 otherwise.
+__global__ void compute_upwind_weight_internal(int num_faces, const double *phi, double *weight)
+{
+    const int face = blockDim.x * blockIdx.x + threadIdx.x;
+    if (face >= num_faces) return;
+
+    // a NaN flux fails the comparison and therefore yields 0
+    weight[face] = (phi[face] >= 0) ? 1. : 0.;
+}
+
+__device__ int sign(double x)
+{   // +1 for x >= 0 (zero counts as positive); -1 otherwise, which includes NaN
+    return !(x >= 0) ? -1 : 1;
+}
+
+__device__ int pos0(double x)
+{   // 1 for x >= 0 (zero included); 0 otherwise, which includes NaN
+    return static_cast<int>(x >= 0);
+}
+
+// Limiter value for each internal face, written as max(min(r, 1), 0).
+// r compares the face difference of vf with the gradient gradc (component-major, stride num_cells) of the
+// flux-selected adjacent cell, projected on mesh_distance (component-major, stride num_surfaces).
+// mesh_weights is accepted but never read here.
+__global__ void compute_limiter_phi_internal(int num_cells, int num_surfaces, const double *vf,
+        const int *lower_index, const int *upper_index, const double *mesh_distance,
+        const double *phi, const double *mesh_weights, const double *gradc,
+        double *limiter)
+{
+    const int face = blockDim.x * blockIdx.x + threadIdx.x;
+    if (face >= num_surfaces) return;
+
+    const int own = lower_index[face];
+    const int nei = upper_index[face];
+    const double delta_f = vf[nei] - vf[own]; // difference across the face
+
+    // LimiterFunc::r -- a strictly positive flux uses the gradient of the lower-index cell, otherwise the upper one
+    const int donor = (phi[face] > 0) ? own : nei;
+    const double proj = mesh_distance[face] * gradc[donor] +
+            mesh_distance[num_surfaces + face] * gradc[num_cells + donor] +
+            mesh_distance[num_surfaces * 2 + face] * gradc[num_cells * 2 + donor];
+
+    // When |proj| is at least 1000 * |delta_f| (this includes delta_f == 0) the ratio is replaced by
+    // +-1000 carrying the product of the two signs, so the division is skipped.
+    double r;
+    if (fabs(proj) >= 1000 * fabs(delta_f)) {
+        r = 2*1000*sign(proj)*sign(delta_f) - 1;
+    } else {
+        r = 2 * (proj / delta_f) - 1;
+    }
+
+    limiter[face] = max(min(r, 1.), 0.); // now twoByk_ = 1, fvScheme: limitedLinear 1;
+}
+
+// Boundary counterpart of the face limiter for one patch of `num` faces starting at `offset`.
+// Slots [offset, offset + num) and [offset + num, offset + 2 * num) of boundary_vf / boundary_gradc are paired
+// per face; below they are called nbr and own. Distance, flux and the output use the nbr slot only.
+// boundary_weight is accepted but never read here.
+// NOTE(review): the difference below is own - nbr, whereas compute_limiter_phi_internal uses
+// vf[neighbor] - vf[owner]; confirm that this sign convention is intended.
+__global__ void compute_limiter_phi_boundary(int num, int offset, int num_boundary_surfaces,
+        const double *boundary_weight, const double *boundary_vf, const double *boundary_gradc,
+        const double *boundary_distance, const double *boundary_phi, double *boundary_limiter)
+{
+    const int face = blockDim.x * blockIdx.x + threadIdx.x;
+    if (face >= num) return;
+
+    const int nbr = offset + face;
+    const int own = offset + face + num;
+    const double delta_f = boundary_vf[own] - boundary_vf[nbr];
+
+    // LimiterFunc::r -- a strictly positive flux uses the gradient stored in the own slot, otherwise the nbr slot
+    const int donor = (boundary_phi[nbr] > 0) ? own : nbr;
+    const double proj = boundary_distance[nbr] * boundary_gradc[donor] +
+            boundary_distance[num_boundary_surfaces + nbr] * boundary_gradc[num_boundary_surfaces + donor] +
+            boundary_distance[num_boundary_surfaces * 2 + nbr] * boundary_gradc[num_boundary_surfaces * 2 + donor];
+
+    // |proj| >= 1000 * |delta_f| (including delta_f == 0) replaces the ratio by +-1000 and skips the division
+    double r;
+    if (fabs(proj) >= 1000 * fabs(delta_f)) {
+        r = 2*1000*sign(proj)*sign(delta_f) - 1;
+    } else {
+        r = 2 * (proj / delta_f) - 1;
+    }
+
+    boundary_limiter[nbr] = max(min(r, 1.), 0.); // now twoByk_ = 1, fvScheme: limitedLinear 1;
+}
+
+
+// Per internal face: output = L * mesh_weights + (1 - L) * pos0(phi), with L = limiter_weights of that face.
+// num_cells is accepted but never read here.
+__global__ void compute_limiter_weight_internal(int num_cells, int num_surfaces,
+        const double *phi, const double *mesh_weights, const double *limiter_weights, double *output_weights)
+{
+    const int face = blockDim.x * blockIdx.x + threadIdx.x;
+    if (face >= num_surfaces) return;
+
+    const double lim = limiter_weights[face];
+    output_weights[face] = lim * mesh_weights[face] + (1. - lim) * pos0(phi[face]);
+}
+
+// Per face of one patch (`num` faces starting at `offset`):
+// output = L * boundary_weight + (1 - L) * pos0(boundary_phi). num_boundary_surfaces is accepted but never read.
+__global__ void compute_limiter_weight_boundary(int num, int offset, int num_boundary_surfaces,
+        const double *boundary_weight, const double *boundary_phi,
+        const double *boundary_limiter_weights, double *boundary_output_weights)
+{
+    const int local = blockDim.x * blockIdx.x + threadIdx.x;
+    if (local >= num) return;
+
+    const int face = offset + local;
+    const double lim = boundary_limiter_weights[face];
+    boundary_output_weights[face] = lim * boundary_weight[face] +
+            (1. - lim) * pos0(boundary_phi[face]);
+}
+
+// zeroGradient coefficients for the `num` faces starting at `offset`:
+//   valueInternalCoeffs = 1, valueBoundaryCoeffs = 0,
+//   gradientInternalCoeffs = 0, gradientBoundaryCoeffs = 0.
+__global__ void update_boundary_coeffs_zeroGradient_scalar(int num, int offset,
+        double *value_internal_coeffs, double *value_boundary_coeffs,
+        double *gradient_internal_coeffs, double *gradient_boundary_coeffs)
+{
+    const int local = blockDim.x * blockIdx.x + threadIdx.x;
+    if (local >= num) return;
+
+    const int face = offset + local;
+    // value coefficients
+    value_internal_coeffs[face] = 1;
+    value_boundary_coeffs[face] = 0;
+    // gradient coefficients
+    gradient_internal_coeffs[face] = 0;
+    gradient_boundary_coeffs[face] = 0;
+}
+
+// zeroGradient update of a component-major 3-component field: each of the `num` faces starting at `offset`
+// takes the value of its adjacent cell (face2Cells).
+__global__ void correct_boundary_conditions_zeroGradient_vector(int num, int offset,
+        int num_boundary_surfaces, int num_cells,
+        const double *vf_internal, const int *face2Cells, double *vf_boundary)
+{
+    const int local = blockDim.x * blockIdx.x + threadIdx.x;
+    if (local >= num) return;
+
+    const int face = offset + local;
+    const int cell = face2Cells[face];
+    for (int c = 0; c < 3; ++c) {
+        vf_boundary[num_boundary_surfaces * c + face] = vf_internal[num_cells * c + cell];
+    }
+}
+
+// Cyclic update of a component-major 3-component field. Face i of the range starting at internal_offset is
+// paired with face i of the range starting at neighbor_offset, and receives
+//   boundary = w * (own adjacent cell) + (1 - w) * (paired face's adjacent cell),
+// where w is boundary_weight of the face being written. Only the internal_offset range is written.
+__global__ void correct_boundary_conditions_cyclic_vector(int num, int internal_offset,
+        int neighbor_offset, int num_boundary_surfaces, int num_cells,
+        const double *boundary_weight, const double *vf_internal, const int *face2Cells, double *vf_boundary)
+{
+    const int local = blockDim.x * blockIdx.x + threadIdx.x;
+    if (local >= num) return;
+
+    const int this_face = internal_offset + local;
+    const int other_face = neighbor_offset + local;
+    const int this_cell = face2Cells[this_face];
+    const int other_cell = face2Cells[other_face];
+    const double w = boundary_weight[this_face];
+
+    // same blend for each of the three components
+    for (int c = 0; c < 3; ++c) {
+        vf_boundary[num_boundary_surfaces * c + this_face] = w * vf_internal[num_cells * c + this_cell] +
+                (1 - w) * vf_internal[num_cells * c + other_cell];
+    }
+}
+
+// zeroGradient update of a scalar field: each of the `num` faces starting at `offset`
+// takes the value of its adjacent cell.
+__global__ void correct_boundary_conditions_zeroGradient_scalar(int num, int offset,
+        const double *vf_internal, const int *face2Cells, double *vf_boundary)
+{
+    const int local = blockDim.x * blockIdx.x + threadIdx.x;
+    if (local >= num) return;
+
+    const int face = offset + local;
+    // face2Cells maps the boundary face to the cell it belongs to
+    vf_boundary[face] = vf_internal[face2Cells[face]];
+}
+
+// Per face: boundary = adjacent cell value + thermo_gradient / delta_coeffs.
+// thermo_gradient is indexed from gradient_offset; every other per-face array is indexed from bou_offset.
+__global__ void correct_boundary_conditions_gradientEnergy_scalar(int num, int bou_offset,
+        int gradient_offset, const double *vf_internal, const int *face2Cells,
+        const double *thermo_gradient, const double *delta_coeffs, double *vf_boundary)
+{
+    const int local = blockDim.x * blockIdx.x + threadIdx.x;
+    if (local >= num) return;
+
+    const int face = bou_offset + local;
+    const int grad_slot = gradient_offset + local;
+    const int cell = face2Cells[face];
+    vf_boundary[face] = vf_internal[cell] +
+            thermo_gradient[grad_slot] / delta_coeffs[face];
+}
+
+// Cyclic update of a scalar field. Face i of the range starting at internal_offset is paired with face i of
+// the range starting at neighbor_offset and receives w * (own adjacent cell) + (1 - w) * (paired cell),
+// where w is boundary_weight of the face being written. Only the internal_offset range is written.
+__global__ void correct_boundary_conditions_cyclic_scalar(int num, int internal_offset,
+        int neighbor_offset, const double *vf_internal, const int *face2Cells,
+        const double *boundary_weight, double *vf_boundary)
+{
+    const int local = blockDim.x * blockIdx.x + threadIdx.x;
+    if (local >= num) return;
+
+    const int this_face = internal_offset + local;
+    const int other_face = neighbor_offset + local;
+    const double w = boundary_weight[this_face];
+
+    const double own_val = vf_internal[face2Cells[this_face]];
+    const double other_val = vf_internal[face2Cells[other_face]];
+
+    vf_boundary[this_face] = w * own_val + (1 - w) * other_val;
+}
+
+// For a patch occupying 2 * num slots from `offset`: copies the value of the cell adjacent to face i
+// (face2Cells[offset + i]) into slot offset + num + i of vf_boundary.
+// Slots [offset, offset + num) are not written here.
+__global__ void correct_internal_boundary_field_scalar(int num, int offset,
+        const double *vf_internal, const int *face2Cells, double *vf_boundary)
+{
+    const int local = blockDim.x * blockIdx.x + threadIdx.x;
+    if (local >= num) return;
+
+    const int src_face = offset + local;       // index used for the face-to-cell lookup
+    const int dst_slot = offset + num + local; // destination in the second half
+    vf_boundary[dst_slot] = vf_internal[face2Cells[src_face]];
+}
+
+// 3-component (component-major) version: copies the adjacent-cell values of face i (face2Cells[offset + i])
+// into slot offset + num + i of every component plane of vf_boundary.
+__global__ void correct_internal_boundary_field_vector(int num, int offset,
+        int num_boundary_surfaces, int num_cells,
+        const double *vf_internal, const int *face2Cells, double *vf_boundary)
+{
+    const int local = blockDim.x * blockIdx.x + threadIdx.x;
+    if (local >= num) return;
+
+    const int cell = face2Cells[offset + local];
+    const int dst_slot = offset + num + local;
+
+    for (int c = 0; c < 3; ++c) {
+        vf_boundary[num_boundary_surfaces * c + dst_slot] = vf_internal[num_cells * c + cell];
+    }
+}
+
+// Same staging as the scalar/vector variants, for a 9-component field stored component-major:
+// for each of the `num` faces starting at `offset`, the values of the adjacent cell (face2Cells[offset + i])
+// are copied into slot offset + num + i of every component plane of vf_boundary.
+// Slots [offset, offset + num) are not written here.
+__global__ void correct_internal_boundary_field_tensor(int num, int offset,
+        int num_boundary_surfaces, int num_cells,
+        const double *vf_internal, const int *face2Cells, double *vf_boundary)
+{
+    const int local = blockDim.x * blockIdx.x + threadIdx.x;
+    if (local >= num) return;
+
+    const int num_components = 9;
+    // the face-to-cell lookup uses the first-half index
+    const int cell = face2Cells[offset + local];
+    // the copy lands in the second half
+    const int dst_slot = offset + num + local;
+
+    // plane c of vf_internal has stride num_cells, plane c of vf_boundary has stride num_boundary_surfaces
+    for (int c = 0; c < num_components; ++c) {
+        vf_boundary[num_boundary_surfaces * c + dst_slot] = vf_internal[num_cells * c + cell];
+    }
+}
+// Processor-patch update of a scalar field: stages the local patch-adjacent cell values, then swaps `num` doubles with rank `peer`.
+void correct_boundary_conditions_processor_scalar(cudaStream_t stream, ncclComm_t comm,
+        int peer, int num, int offset, 
+        const double *vf, const int *boundary_cell_face, double *vf_boundary)
+{
+    int neighbor_start_index = offset;       // receive region: [offset, offset + num)
+    int internal_start_index = offset + num; // send region:    [offset + num, offset + 2 * num)
+
+    size_t threads_per_block = 32; // NOTE(review): the vector variant launches with 256 -- confirm 32 is intended
+    size_t blocks_per_grid = (num + threads_per_block - 1) / threads_per_block;
+    correct_internal_boundary_field_scalar<<<blocks_per_grid, threads_per_block, 0, stream>>>(num, offset, 
+            vf, boundary_cell_face, vf_boundary); // fills the send region from vf
+
+    TICK_INIT_EVENT; // TICK_* macros are defined outside this view; presumably event-based timing -- TODO confirm
+    TICK_START_EVENT;
+    checkNcclErrors(ncclGroupStart()); // send and receive are issued as one NCCL group on `stream`
+    checkNcclErrors(ncclSend(vf_boundary + internal_start_index, num, ncclDouble, peer, comm, stream));
+    checkNcclErrors(ncclRecv(vf_boundary + neighbor_start_index, num, ncclDouble, peer, comm, stream));
+    checkNcclErrors(ncclGroupEnd());
+    TICK_END_EVENT(nccl scalar);
+    //checkCudaErrors(cudaStreamSynchronize(stream)); -- left disabled: this function returns without synchronizing the stream
+}
+// Processor-patch update of a 3-component field: stages the local patch-adjacent cell values, then swaps each component plane with rank `peer`.
+void correct_boundary_conditions_processor_vector(cudaStream_t stream, ncclComm_t comm,
+        int peer, int num, int offset, int num_boundary_surfaces, int num_cells,
+        const double *vf, const int *boundary_cell_face, double *vf_boundary)
+{
+    int neighbor_start_index = offset;       // receive region of each plane: [offset, offset + num)
+    int internal_start_index = offset + num; // send region of each plane:    [offset + num, offset + 2 * num)
+
+    size_t threads_per_block = 256;
+    size_t blocks_per_grid = (num + threads_per_block - 1) / threads_per_block;
+    correct_internal_boundary_field_vector<<<blocks_per_grid, threads_per_block, 0, stream>>>(num, offset, 
+            num_boundary_surfaces, num_cells, vf, boundary_cell_face, vf_boundary); // fills the send region of all three planes
+
+    TICK_INIT_EVENT; // TICK_* macros are defined outside this view; presumably event-based timing -- TODO confirm
+    TICK_START_EVENT;
+    checkNcclErrors(ncclGroupStart()); // all six transfers are issued as one NCCL group on `stream`
+    for (int i = 0; i < 3; i++) { // one send/receive pair per component plane (plane stride: num_boundary_surfaces)
+        checkNcclErrors(ncclSend(vf_boundary + num_boundary_surfaces * i + internal_start_index, num, ncclDouble, peer, comm, stream));
+        checkNcclErrors(ncclRecv(vf_boundary + num_boundary_surfaces * i + neighbor_start_index, num, ncclDouble, peer, comm, stream));   
+    }
+    checkNcclErrors(ncclGroupEnd());
+    TICK_END_EVENT(nccl vector);
+    //checkCudaErrors(cudaStreamSynchronize(stream)); -- left disabled: this function returns without synchronizing the stream
+}
+
+// fixedValue coefficients per face: value (0, boundary value); gradient (-deltaCoeffs, boundary value * deltaCoeffs).
+__global__ void update_boundary_coeffs_fixedValue_scalar(int num, int offset,
+        const double *boundary_vf, const double *boundary_deltaCoeffs,
+        double *value_internal_coeffs, double *value_boundary_coeffs,
+        double *gradient_internal_coeffs, double *gradient_boundary_coeffs)
+{
+    const int local = blockDim.x * blockIdx.x + threadIdx.x;
+    if (local >= num) return;
+    const int face = offset + local;
+    const double delta = boundary_deltaCoeffs[face];
+    value_internal_coeffs[face] = 0.;
+    value_boundary_coeffs[face] = boundary_vf[face];
+    gradient_internal_coeffs[face] = -1 * delta;
+    gradient_boundary_coeffs[face] = boundary_vf[face] * delta;
+}
+
+// Prescribed-gradient coefficients per face, with g = gradient[gradient_offset + i]:
+//   value    (internal, boundary) = (1, g / deltaCoeffs)
+//   gradient (internal, boundary) = (0, g)
+__global__ void update_boundary_coeffs_gradientEnergy_scalar(int num, int offset, int gradient_offset,
+        const double *gradient, const double *boundary_deltaCoeffs,
+        double *value_internal_coeffs, double *value_boundary_coeffs,
+        double *gradient_internal_coeffs, double *gradient_boundary_coeffs)
+{
+    const int local = blockDim.x * blockIdx.x + threadIdx.x;
+    if (local >= num) return;
+
+    const int face = offset + local;
+    const double g = gradient[gradient_offset + local];
+
+    value_internal_coeffs[face] = 1.;
+    value_boundary_coeffs[face] = g / boundary_deltaCoeffs[face];
+    gradient_internal_coeffs[face] = 0.;
+    gradient_boundary_coeffs[face] = g;
+}
+
+// Processor-patch coefficients per face, with w = boundary_weight and d = boundary_deltaCoeffs:
+// value (w, 1 - w); gradient (-d, d).
+__global__ void update_boundary_coeffs_processor_scalar(int num, int offset,
+        const double *boundary_weight, const double *boundary_deltaCoeffs,
+        double *value_internal_coeffs, double *value_boundary_coeffs,
+        double *gradient_internal_coeffs, double *gradient_boundary_coeffs)
+{
+    const int local = blockDim.x * blockIdx.x + threadIdx.x;
+    if (local >= num) return;
+
+    const int face = offset + local;
+    const double w = boundary_weight[face];
+    const double d = boundary_deltaCoeffs[face];
+
+    value_internal_coeffs[face] = w;
+    value_boundary_coeffs[face] = 1 - w;
+    gradient_internal_coeffs[face] = -1 * d;
+    gradient_boundary_coeffs[face] = d;
+}
+
+// zeroGradient coefficients for a 3-component field (component planes num_boundary_surfaces apart),
+// for the `num` faces starting at `offset`. Every component receives the same constants:
+//   valueInternalCoeffs = 1, valueBoundaryCoeffs = 0,
+//   gradientInternalCoeffs = 0, gradientBoundaryCoeffs = 0.
+__global__ void update_boundary_coeffs_zeroGradient_vector(int num_boundary_surfaces, int num, int offset,
+        double *value_internal_coeffs, double *value_boundary_coeffs,
+        double *gradient_internal_coeffs, double *gradient_boundary_coeffs)
+{
+    const int local = blockDim.x * blockIdx.x + threadIdx.x;
+    if (local >= num) return;
+
+    const int face = offset + local;
+
+    for (int c = 0; c < 3; ++c) {
+        const int k = num_boundary_surfaces * c + face;
+        // valueInternalCoeffs = 1
+        value_internal_coeffs[k] = 1;
+        // valueBoundaryCoeffs = 0
+        value_boundary_coeffs[k] = 0;
+        // gradientInternalCoeffs = 0
+        gradient_internal_coeffs[k] = 0;
+        // gradientBoundaryCoeffs = 0
+        gradient_boundary_coeffs[k] = 0;
+    }
+
+}
+
+// fixedValue coefficients for a 3-component field (component planes num_boundary_surfaces apart),
+// for the `num` faces starting at `offset`. boundary_deltaCoeffs holds one value per face, shared by
+// all three components:
+//   value (0, boundary value); gradient (-deltaCoeffs, deltaCoeffs * boundary value).
+__global__ void update_boundary_coeffs_fixedValue_vector(int num_boundary_surfaces, int num, int offset,
+        const double *boundary_vf, const double *boundary_deltaCoeffs,
+        double *value_internal_coeffs, double *value_boundary_coeffs,
+        double *gradient_internal_coeffs, double *gradient_boundary_coeffs)
+{
+    const int local = blockDim.x * blockIdx.x + threadIdx.x;
+    if (local >= num) return;
+
+    const int face = offset + local;
+    const double delta = boundary_deltaCoeffs[face];
+
+    for (int c = 0; c < 3; ++c) {
+        const int k = num_boundary_surfaces * c + face;
+        const double value = boundary_vf[k];
+
+        value_internal_coeffs[k] = 0.;               // valueInternalCoeffs = 0.
+        value_boundary_coeffs[k] = value;            // valueBoundaryCoeffs = boundaryValue
+        gradient_internal_coeffs[k] = -1 * delta;    // gradientInternalCoeffs = -1 * boundaryDeltaCoeffs
+        gradient_boundary_coeffs[k] = delta * value; // gradientBoundaryCoeffs = boundaryDeltaCoeffs * boundaryValue
+    }
+}
+
+// Processor-patch coefficients for a 3-component field (component planes num_boundary_surfaces apart).
+// boundary_weight and boundary_deltaCoeffs hold one value per face, shared by all three components.
+__global__ void update_boundary_coeffs_processor_vector(int num_boundary_surfaces, int num, int offset,
+        const double *boundary_weight, const double *boundary_deltaCoeffs,
+        double *value_internal_coeffs, double *value_boundary_coeffs,
+        double *gradient_internal_coeffs, double *gradient_boundary_coeffs)
+{
+    const int local = blockDim.x * blockIdx.x + threadIdx.x;
+    if (local >= num) return;
+
+    const int face = offset + local;
+    const double w = boundary_weight[face];
+    const double d = boundary_deltaCoeffs[face];
+
+    for (int c = 0; c < 3; ++c) {
+        const int k = num_boundary_surfaces * c + face;
+        // valueInternalCoeffs = Type(pTraits<Type>::one)*w
+        value_internal_coeffs[k] = w;
+        // valueBoundaryCoeffs = Type(pTraits<Type>::one)*(1.0 - w)
+        value_boundary_coeffs[k] = 1 - w;
+        // gradientInternalCoeffs = -Type(pTraits<Type>::one)*deltaCoeffs
+        gradient_internal_coeffs[k] = -1 * d;
+        // gradientBoundaryCoeffs = -this->gradientInternalCoeffs(deltaCoeffs)
+        gradient_boundary_coeffs[k] = d;
+    }
+}
+
+// In place, per element i of `num`: loads the nine components T[c] = vf2[num * c + i] (index 1 is xy, 3 is yx),
+// then stores vf1[i] * (transpose(T) - (2/3) * trace(T) * I) back into the same nine slots.
+__global__ void scale_dev2t_tensor_kernel(int num, const double *vf1, double *vf2)
+{
+    const int i = blockDim.x * blockIdx.x + threadIdx.x;
+    if (i >= num) return;
+
+    const double scale = vf1[i];
+
+    // Snapshot every component before writing: the update is in place and output (row, col) reads input (col, row).
+    double t[9];
+    for (int c = 0; c < 9; ++c) {
+        t[c] = vf2[num * c + i];
+    }
+    // (2/3) * trace, summed in xx, yy, zz order
+    const double trace_coeff = (2. / 3.) * (t[0] + t[4] + t[8]);
+
+    for (int row = 0; row < 3; ++row) {
+        for (int col = 0; col < 3; ++col) {
+            const double transposed = t[col * 3 + row];
+            // only the diagonal carries the trace term
+            vf2[num * (row * 3 + col) + i] = (row == col) ? scale * (transposed - trace_coeff)
+                                                         : scale * transposed;
+        }
+    }
+
+    // debug print, kept disabled:
+    // if (i == 0)
+    // {
+    //     printf("bou_grad_U = (%.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e, %.5e)", vf2[0], vf2[1], vf2[2],
+    //             vf2[3], vf2[4], vf2[5], vf2[6], vf2[7], vf2[8]);
+    // }
+
+}
+
+// Per cell: diag += rDeltaT * rho * volume * sign; source += rDeltaT * rho_old * vf * volume * sign.
+__global__ void fvm_ddt_vol_scalar_vol_scalar_kernel(int num_cells, double rDeltaT,
+        const double *rho, const double *rho_old, const double *vf, const double *volume,
+        double *diag, double *source, double sign)
+{
+    const int cell = blockDim.x * blockIdx.x + threadIdx.x;
+    if (cell >= num_cells) return;
+    const double vol = volume[cell];
+    diag[cell] += rDeltaT * rho[cell] * vol * sign;
+    // TODO: skip moving
+    source[cell] += rDeltaT * rho_old[cell] * vf[cell] * vol * sign;
+}
+
+// Density-free variant, per cell:
+// diag += rDeltaT * volume * sign; source += rDeltaT * vf_old * volume * sign.
+__global__ void fvm_ddt_scalar_kernel(int num_cells, double rDeltaT,
+        const double *vf_old, const double *volume,
+        double *diag, double *source, double sign)
+{
+    const int cell = blockDim.x * blockIdx.x + threadIdx.x;
+    if (cell >= num_cells) return;
+
+    // both contributions use the same cell volume
+    diag[cell] += rDeltaT * volume[cell] * sign;
+    source[cell] += rDeltaT * vf_old[cell] * volume[cell] * sign;
+}
+
+// Implicit ddt term weighted by density for a 3-component field, one cell per thread.
+// Component c of cell i is stored at [num_cells * c + i]; diag is shared by all components.
+__global__ void fvm_ddt_vector_kernel(int num_cells, double rDeltaT,
+        const double *rho, const double *rho_old, const double *vf, const double *volume,
+        double *diag, double *source, double sign)
+{
+    const int cell = blockDim.x * blockIdx.x + threadIdx.x;
+    if (cell >= num_cells) return;
+
+    const double cell_volume = volume[cell];
+    const double density_old = rho_old[cell];
+
+    diag[cell] += rDeltaT * rho[cell] * cell_volume * sign;
+    // TODO: skip moving
+    for (int c = 0; c < 3; ++c)
+        source[num_cells * c + cell] += rDeltaT * density_old * vf[num_cells * c + cell] * cell_volume * sign;
+}
+
+// Convection (fvm::div) coefficients for internal faces, one face per thread.
+// Same body as fvm_div_vector_internal: lower += -w * phi * sign, upper += (1 - w) * phi * sign,
+// and the negated values are accumulated into diag of the lower/upper cell.
+__global__ void fvm_div_scalar_internal(int num_surfaces,
+        const int *lower_index, const int *upper_index,
+        const double *phi, const double *weight,
+        double *lower, double *upper, double *diag, double sign)
+{
+    const int face = blockDim.x * blockIdx.x + threadIdx.x;
+    if (face >= num_surfaces) return;
+
+    const double face_weight = weight[face];
+    const double face_flux = phi[face];
+
+    const double lower_coeff = (-face_weight) * face_flux * sign;
+    const double upper_coeff = (1 - face_weight) * face_flux * sign;
+    lower[face] += lower_coeff;
+    upper[face] += upper_coeff;
+
+    // atomic because other faces (threads) may update the same diag entry
+    atomicAdd(&(diag[lower_index[face]]), -lower_coeff);
+    atomicAdd(&(diag[upper_index[face]]), -upper_coeff);
+}
+
+// Convection (fvm::div) boundary coefficients for the boundary faces [offset, offset + num):
+// internal_coeffs += phi * value_internal_coeffs * sign; boundary_coeffs -= phi * value_boundary_coeffs * sign.
+__global__ void fvm_div_scalar_boundary(int num, int offset,
+        const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs,
+        double *internal_coeffs, double *boundary_coeffs, double sign)
+{
+    const int local_face = blockDim.x * blockIdx.x + threadIdx.x;
+    if (local_face >= num) return;
+    const int face = offset + local_face;
+    const double face_flux = boundary_phi[face];
+    internal_coeffs[face] += face_flux * value_internal_coeffs[face] * sign;
+    boundary_coeffs[face] -= face_flux * value_boundary_coeffs[face] * sign;
+}
+
+// Convection (fvm::div) coefficients for internal faces, one face per thread:
+// lower += -w * phi * sign, upper += (1 - w) * phi * sign; diag receives the negated values.
+__global__ void fvm_div_vector_internal(int num_surfaces,
+        const int *lower_index, const int *upper_index,
+        const double *phi, const double *weight,
+        double *lower, double *upper, double *diag, double sign)
+{
+    const int face = blockDim.x * blockIdx.x + threadIdx.x;
+    if (face >= num_surfaces) return;
+
+    const double face_weight = weight[face];
+    const double face_flux = phi[face];
+
+    const double lower_coeff = (-face_weight) * face_flux * sign;
+    const double upper_coeff = (1 - face_weight) * face_flux * sign;
+    lower[face] += lower_coeff;
+    upper[face] += upper_coeff;
+
+    // atomic because other faces (threads) may update the same diag entry
+    atomicAdd(&(diag[lower_index[face]]), -lower_coeff);
+    atomicAdd(&(diag[upper_index[face]]), -upper_coeff);
+}
+
+// TODO: modify the data structure of internal and boundary coeffs
+// Convection (fvm::div) boundary coefficients of a 3-component field for the boundary faces
+// [offset, offset + num); component c of face i is stored at [num_boundary_surfaces * c + i].
+__global__ void fvm_div_vector_boundary(int num_boundary_surfaces, int num, int offset,
+        const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs,
+        double *internal_coeffs, double *boundary_coeffs, double sign)
+{
+    const int local_face = blockDim.x * blockIdx.x + threadIdx.x;
+    if (local_face >= num) return;
+
+    const int face = offset + local_face;
+    const double face_flux = boundary_phi[face];
+
+    for (int c = 0; c < 3; ++c)
+        internal_coeffs[num_boundary_surfaces * c + face] += face_flux * value_internal_coeffs[num_boundary_surfaces * c + face] * sign;
+    for (int c = 0; c < 3; ++c)
+        boundary_coeffs[num_boundary_surfaces * c + face] -= face_flux * value_boundary_coeffs[num_boundary_surfaces * c + face] * sign;
+}
+
+// Laplacian coefficients for internal faces when the diffusivity `gamma` is stored
+// per cell: it is interpolated to the face with `weight`, then the symmetric
+// off-diagonal coefficient is added to lower/upper and subtracted from diag.
+__global__ void fvm_laplacian_internal(int num_surfaces,
+        const int *lower_index, const int *upper_index,
+        const double *weight, const double *mag_sf, const double *delta_coeffs, const double *gamma,
+        double *lower, double *upper, double *diag, double sign)
+{
+    const int face = blockDim.x * blockIdx.x + threadIdx.x;
+    if (face >= num_surfaces) return;
+
+    const int owner = lower_index[face];
+    const int neighbor = upper_index[face];
+
+    // face diffusivity: w * gamma[owner] + (1 - w) * gamma[neighbor]
+    const double w = weight[face];
+    const double face_gamma = w * gamma[owner] + (1 - w) * gamma[neighbor];
+
+    // for fvm::laplacian, lower = upper, so a single signed value serves both
+    const double coeff = face_gamma * mag_sf[face] * delta_coeffs[face] * sign;
+
+    lower[face] += coeff;
+    upper[face] += coeff;
+
+    // atomic because other faces (threads) may update the same diag entry
+    atomicAdd(&(diag[owner]), -coeff);
+    atomicAdd(&(diag[neighbor]), -coeff);
+}
+
+// Laplacian coefficients for internal faces when the diffusivity `gamma` is read
+// per face (indexed by the face index, no interpolation). The symmetric
+// off-diagonal coefficient is added to lower/upper and subtracted from diag.
+__global__ void fvm_laplacian_surface_scalar_internal(int num_surfaces,
+        const int *lower_index, const int *upper_index,
+        const double *mag_sf, const double *delta_coeffs, const double *gamma,
+        double *lower, double *upper, double *diag, double sign)
+{
+    const int face = blockDim.x * blockIdx.x + threadIdx.x;
+    if (face >= num_surfaces) return;
+
+    const int owner = lower_index[face];
+    const int neighbor = upper_index[face];
+
+    // diffusivity taken directly at the face
+    const double face_gamma = gamma[face];
+    // for fvm::laplacian, lower = upper, so a single signed value serves both
+    const double coeff = face_gamma * mag_sf[face] * delta_coeffs[face] * sign;
+
+    // off-diagonal entries
+    lower[face] += coeff;
+    upper[face] += coeff;
+
+    // atomic because other faces (threads) may update the same diag entry
+    atomicAdd(&(diag[owner]), -coeff);
+    atomicAdd(&(diag[neighbor]), -coeff);
+}
+
+// Laplacian boundary coefficients for the boundary faces [offset, offset + num). With g = boundary_gamma * boundary_mag_sf:
+// internal_coeffs += g * gradient_internal_coeffs * sign; boundary_coeffs -= g * gradient_boundary_coeffs * sign.
+__global__ void fvm_laplacian_scalar_boundary(int num, int offset,
+        const double *boundary_mag_sf, const double *boundary_gamma,
+        const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs,
+        double *internal_coeffs, double *boundary_coeffs, double sign)
+{
+    const int local_face = blockDim.x * blockIdx.x + threadIdx.x;
+    if (local_face >= num) return;
+    const int face = offset + local_face;
+    const double gamma_mag_sf = boundary_gamma[face] * boundary_mag_sf[face];
+    internal_coeffs[face] += gamma_mag_sf * gradient_internal_coeffs[face] * sign;
+    boundary_coeffs[face] -= gamma_mag_sf * gradient_boundary_coeffs[face] * sign;
+}
+
+// Laplacian boundary coefficients for the boundary faces [offset, offset + num). With g = boundary_gamma * boundary_mag_sf:
+// internal_coeffs += g * gradient_internal_coeffs * sign; boundary_coeffs -= g * gradient_boundary_coeffs * sign.
+__global__ void fvm_laplacian_surface_scalar_boundary(int num, int offset,
+        const double *boundary_mag_sf, const double *boundary_gamma,
+        const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs,
+        double *internal_coeffs, double *boundary_coeffs, double sign)
+{
+    const int local_face = blockDim.x * blockIdx.x + threadIdx.x;
+    if (local_face >= num) return;
+    const int face = offset + local_face;
+    const double gamma_mag_sf = boundary_gamma[face] * boundary_mag_sf[face];
+    internal_coeffs[face] += gamma_mag_sf * gradient_internal_coeffs[face] * sign;
+    boundary_coeffs[face] -= gamma_mag_sf * gradient_boundary_coeffs[face] * sign;
+}
+
+// Laplacian boundary coefficients of a 3-component field for the boundary faces [offset, offset + num).
+// gamma and mag_sf are per face; coefficient component c of face i is stored at [num_boundary_surfaces * c + i].
+__global__ void fvm_laplacian_vector_boundary(int num_boundary_surfaces, int num, int offset,
+        const double *boundary_mag_sf, const double *boundary_gamma,
+        const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs,
+        double *internal_coeffs, double *boundary_coeffs, double sign)
+{
+    const int local_face = blockDim.x * blockIdx.x + threadIdx.x;
+    if (local_face >= num) return;
+
+    const int face = offset + local_face;
+    const double gamma_mag_sf = boundary_gamma[face] * boundary_mag_sf[face];
+
+    for (int c = 0; c < 3; ++c)
+        internal_coeffs[num_boundary_surfaces * c + face] += gamma_mag_sf * gradient_internal_coeffs[num_boundary_surfaces * c + face] * sign;
+    for (int c = 0; c < 3; ++c)
+        boundary_coeffs[num_boundary_surfaces * c + face] -= gamma_mag_sf * gradient_boundary_coeffs[num_boundary_surfaces * c + face] * sign;
+}
+
+// Laplacian boundary coefficients of a 3-component field for the boundary faces [offset, offset + num).
+// gamma and mag_sf are per face; coefficient component c of face i is stored at [num_boundary_surfaces * c + i].
+// The arithmetic is currently the same as in fvm_laplacian_vector_boundary.
+__global__ void fvm_laplacian_vector_boundary_tmp(int num_boundary_surfaces, int num, int offset,
+        const double *boundary_mag_sf, const double *boundary_gamma,
+        const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs,
+        double *internal_coeffs, double *boundary_coeffs, double sign)
+{
+    const int local_face = blockDim.x * blockIdx.x + threadIdx.x;
+    if (local_face >= num) return;
+
+    const int face = offset + local_face;
+    const double gamma_mag_sf = boundary_gamma[face] * boundary_mag_sf[face];
+
+    for (int c = 0; c < 3; ++c)
+        internal_coeffs[num_boundary_surfaces * c + face] += gamma_mag_sf * gradient_internal_coeffs[num_boundary_surfaces * c + face] * sign;
+    for (int c = 0; c < 3; ++c)
+        boundary_coeffs[num_boundary_surfaces * c + face] -= gamma_mag_sf * gradient_boundary_coeffs[num_boundary_surfaces * c + face] * sign;
+}
+
+// Explicit ddt term, one cell per thread:
+// output += rDeltaT * (rho * vf - rho_old * vf_old) * volume * sign.
+__global__ void fvc_ddt_vol_scalar_vol_scalar_kernel(int num_cells, double rDeltaT,
+        const double *rho, const double *rho_old, const double *vf, const double *vf_old, const double *volume, 
+        double *output, double sign)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_cells)
+        return;
+    /*
+    // workaround way1 (use printf):
+    double val_new = rho[index] * vf[index];
+    double val_old = rho_old[index] * vf_old[index];
+    // TODO: skip moving
+    // TODO: wyr
+    // When rho == rho_old and vf == vf_old the expected output is exactly zero, but the GPU
+    // produced a tiny non-zero (val_new - val_old), smaller than 1e-16, which rDeltaT (1e6) then
+    // enlarged. The comparison of cpu and gpu results then failed with relative error inf, e.g.:
+    // cpu data: 0.0000000000000000, gpu data: 0.0000000000298050, relative error: inf
+    // Printing the intermediate variables val_new and val_old makes the problem disappear; the
+    // print line seems to change the compiler behavior, maybe by avoiding its fma optimization.
+    if (index == -1) printf("index = 0, val_new: %.40lf, val_old: %.40lf\n", val_new, val_old);
+    output[index] += rDeltaT * (val_new - val_old);
+    */
+    /*
+    // workaround way2 (use volatile):
+    // volatile also changes the compiler behavior, maybe by avoiding its fma optimization.
+    volatile double val_new = rho[index] * vf[index];
+    volatile double val_old = rho_old[index] * vf_old[index];
+    output[index] += rDeltaT * (val_new - val_old);
+    */
+    // workaround way3 (the one in use): build with the nvcc option -fmad=false
+    output[index] += rDeltaT * (rho[index] * vf[index] - rho_old[index] * vf_old[index]) * volume[index] * sign;
+}
+
+// Internal-face part of the gradient of a 3-component field, one face per thread: the outer product
+// Sf (x) ssf (ssf = field interpolated to the face) is added to the owner cell and subtracted from the neighbour.
+__global__ void fvc_grad_vector_internal(int num_cells, int num_surfaces, 
+        const int *lower_index, const int *upper_index, const double *face_vector,
+        const double *weight, const double *field_vector, 
+        double *output)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_surfaces)
+        return;
+    // interpolation weight and face-vector components (component c of face i at [num_surfaces * c + i])
+    double w = weight[index];
+    double Sfx = face_vector[num_surfaces * 0 + index];
+    double Sfy = face_vector[num_surfaces * 1 + index];
+    double Sfz = face_vector[num_surfaces * 2 + index];
+    int owner = lower_index[index];
+    int neighbor = upper_index[index];
+    // face value per component: w * (owner - neighbor) + neighbor (component c of cell i at [num_cells * c + i])
+    double ssfx = (w * (field_vector[num_cells * 0 + owner] - field_vector[num_cells * 0 + neighbor]) + field_vector[num_cells * 0 + neighbor]);
+    double ssfy = (w * (field_vector[num_cells * 1 + owner] - field_vector[num_cells * 1 + neighbor]) + field_vector[num_cells * 1 + neighbor]);
+    double ssfz = (w * (field_vector[num_cells * 2 + owner] - field_vector[num_cells * 2 + neighbor]) + field_vector[num_cells * 2 + neighbor]);    
+    // grad_ab = Sf_a * ssf_b
+    double grad_xx = Sfx * ssfx;
+    double grad_xy = Sfx * ssfy;
+    double grad_xz = Sfx * ssfz;
+    double grad_yx = Sfy * ssfx;
+    double grad_yy = Sfy * ssfy;
+    double grad_yz = Sfy * ssfz;
+    double grad_zx = Sfz * ssfx;
+    double grad_zy = Sfz * ssfy;
+    double grad_zz = Sfz * ssfz;
+
+    // // owner (disabled alternative ordering: all owner updates first, then all neighbour updates)
+    // atomicAdd(&(output[num_cells * 0 + owner]), grad_xx);
+    // atomicAdd(&(output[num_cells * 1 + owner]), grad_xy);
+    // atomicAdd(&(output[num_cells * 2 + owner]), grad_xz);
+    // atomicAdd(&(output[num_cells * 3 + owner]), grad_yx);
+    // atomicAdd(&(output[num_cells * 4 + owner]), grad_yy);
+    // atomicAdd(&(output[num_cells * 5 + owner]), grad_yz);
+    // atomicAdd(&(output[num_cells * 6 + owner]), grad_zx);
+    // atomicAdd(&(output[num_cells * 7 + owner]), grad_zy);
+    // atomicAdd(&(output[num_cells * 8 + owner]), grad_zz);
+    // // neighbour
+    // atomicAdd(&(output[num_cells * 0 + neighbor]), -grad_xx);
+    // atomicAdd(&(output[num_cells * 1 + neighbor]), -grad_xy);
+    // atomicAdd(&(output[num_cells * 2 + neighbor]), -grad_xz);
+    // atomicAdd(&(output[num_cells * 3 + neighbor]), -grad_yx);
+    // atomicAdd(&(output[num_cells * 4 + neighbor]), -grad_yy);
+    // atomicAdd(&(output[num_cells * 5 + neighbor]), -grad_yz);
+    // atomicAdd(&(output[num_cells * 6 + neighbor]), -grad_zx);
+    // atomicAdd(&(output[num_cells * 7 + neighbor]), -grad_zy);
+    // atomicAdd(&(output[num_cells * 8 + neighbor]), -grad_zz);
+    // active ordering: owner and neighbour updates interleaved per tensor slot k = 0..8 (xx, xy, ..., zz) at [num_cells * k + cell]
+    atomicAdd(&(output[num_cells * 0 + owner]), grad_xx);
+    atomicAdd(&(output[num_cells * 0 + neighbor]), -grad_xx);
+    atomicAdd(&(output[num_cells * 1 + owner]), grad_xy);
+    atomicAdd(&(output[num_cells * 1 + neighbor]), -grad_xy);
+    atomicAdd(&(output[num_cells * 2 + owner]), grad_xz);
+    atomicAdd(&(output[num_cells * 2 + neighbor]), -grad_xz);
+    atomicAdd(&(output[num_cells * 3 + owner]), grad_yx);
+    atomicAdd(&(output[num_cells * 3 + neighbor]), -grad_yx);
+    atomicAdd(&(output[num_cells * 4 + owner]), grad_yy);
+    atomicAdd(&(output[num_cells * 4 + neighbor]), -grad_yy);
+    atomicAdd(&(output[num_cells * 5 + owner]), grad_yz);
+    atomicAdd(&(output[num_cells * 5 + neighbor]), -grad_yz);
+    atomicAdd(&(output[num_cells * 6 + owner]), grad_zx);
+    atomicAdd(&(output[num_cells * 6 + neighbor]), -grad_zx);
+    atomicAdd(&(output[num_cells * 7 + owner]), grad_zy);
+    atomicAdd(&(output[num_cells * 7 + neighbor]), -grad_zy);
+    atomicAdd(&(output[num_cells * 8 + owner]), grad_zz);
+    atomicAdd(&(output[num_cells * 8 + neighbor]), -grad_zz);
+}
+
+// Adds the boundary-face contributions to the cell gradient tensor for the faces
+// [offset, offset + num): the outer product of the boundary face vector and the
+// stored boundary field value is accumulated into the cell given by face2Cells.
+// TODO: this function is implemented for uncoupled boundary conditions
+//       so it should use a more specific function name
+__global__ void fvc_grad_vector_boundary_zeroGradient(int num_boundary_surfaces, int num_cells, int num,
+        int offset, const int *face2Cells, const double *boundary_face_vector,
+        const double *boundary_field_vector, double *output)
+{
+    const int local_face = blockDim.x * blockIdx.x + threadIdx.x;
+    if (local_face >= num) return;
+    const int face = offset + local_face;
+    const int cell = face2Cells[face];
+
+    // boundary face vector; component c of face i is stored at [num_boundary_surfaces * c + i]
+    const double sf_x = boundary_face_vector[num_boundary_surfaces * 0 + face];
+    const double sf_y = boundary_face_vector[num_boundary_surfaces * 1 + face];
+    const double sf_z = boundary_face_vector[num_boundary_surfaces * 2 + face];
+
+    // boundary field value, same layout
+    const double ub_x = boundary_field_vector[num_boundary_surfaces * 0 + face];
+    const double ub_y = boundary_field_vector[num_boundary_surfaces * 1 + face];
+    const double ub_z = boundary_field_vector[num_boundary_surfaces * 2 + face];
+
+    const double g_xx = sf_x * ub_x;
+    const double g_xy = sf_x * ub_y;
+    const double g_xz = sf_x * ub_z;
+    const double g_yx = sf_y * ub_x;
+    const double g_yy = sf_y * ub_y;
+    const double g_yz = sf_y * ub_z;
+    const double g_zx = sf_z * ub_x;
+    const double g_zy = sf_z * ub_y;
+    const double g_zz = sf_z * ub_z;
+
+    atomicAdd(&(output[num_cells * 0 + cell]), g_xx);
+    atomicAdd(&(output[num_cells * 1 + cell]), g_xy);
+    atomicAdd(&(output[num_cells * 2 + cell]), g_xz);
+    atomicAdd(&(output[num_cells * 3 + cell]), g_yx);
+    atomicAdd(&(output[num_cells * 4 + cell]), g_yy);
+    atomicAdd(&(output[num_cells * 5 + cell]), g_yz);
+    atomicAdd(&(output[num_cells * 6 + cell]), g_zx);
+    atomicAdd(&(output[num_cells * 7 + cell]), g_zy);
+    atomicAdd(&(output[num_cells * 8 + cell]), g_zz);
+}
+
+// Gradient-tensor contributions from the boundary faces [offset, offset + num) of a processor-type patch.
+// The face value blends two entries of boundary_field_vector, (1 - w) * [offset + i] + w * [offset + num + i],
+// and the outer product of the boundary face vector with that value is added to the cell given by face2Cells.
+__global__ void fvc_grad_vector_boundary_processor(int num_boundary_surfaces, int num_cells, int num,
+        int offset, const int *face2Cells, const double *boundary_face_vector,
+        const double *boundary_weight, const double *boundary_field_vector, double *output)
+{
+    const int local_face = blockDim.x * blockIdx.x + threadIdx.x;
+    if (local_face >= num) return;
+
+    const int neighbor_face = offset + local_face;
+    const int internal_face = offset + num + local_face;
+    const double w = boundary_weight[neighbor_face];
+
+    const double sf_x = boundary_face_vector[num_boundary_surfaces * 0 + neighbor_face];
+    const double sf_y = boundary_face_vector[num_boundary_surfaces * 1 + neighbor_face];
+    const double sf_z = boundary_face_vector[num_boundary_surfaces * 2 + neighbor_face];
+
+    // weighted face value per component (component c of entry i at [num_boundary_surfaces * c + i])
+    const double uf_x = (1 - w) * boundary_field_vector[num_boundary_surfaces * 0 + neighbor_face] + w * boundary_field_vector[num_boundary_surfaces * 0 + internal_face];
+    const double uf_y = (1 - w) * boundary_field_vector[num_boundary_surfaces * 1 + neighbor_face] + w * boundary_field_vector[num_boundary_surfaces * 1 + internal_face];
+    const double uf_z = (1 - w) * boundary_field_vector[num_boundary_surfaces * 2 + neighbor_face] + w * boundary_field_vector[num_boundary_surfaces * 2 + internal_face];
+
+    const int cell = face2Cells[neighbor_face];
+
+    const double g_xx = sf_x * uf_x;
+    const double g_xy = sf_x * uf_y;
+    const double g_xz = sf_x * uf_z;
+    const double g_yx = sf_y * uf_x;
+    const double g_yy = sf_y * uf_y;
+    const double g_yz = sf_y * uf_z;
+    const double g_zx = sf_z * uf_x;
+    const double g_zy = sf_z * uf_y;
+    const double g_zz = sf_z * uf_z;
+
+    // tensor slot k = 0..8 (xx, xy, ..., zz) of a cell is stored at output[num_cells * k + cell]
+    atomicAdd(&(output[num_cells * 0 + cell]), g_xx);
+    atomicAdd(&(output[num_cells * 1 + cell]), g_xy);
+    atomicAdd(&(output[num_cells * 2 + cell]), g_xz);
+    atomicAdd(&(output[num_cells * 3 + cell]), g_yx);
+    atomicAdd(&(output[num_cells * 4 + cell]), g_yy);
+    atomicAdd(&(output[num_cells * 5 + cell]), g_yz);
+    atomicAdd(&(output[num_cells * 6 + cell]), g_zx);
+    atomicAdd(&(output[num_cells * 7 + cell]), g_zy);
+    atomicAdd(&(output[num_cells * 8 + cell]), g_zz);
+}
+
+// Internal-face part of the gradient of a scalar field, one face per thread:
+// the face vector times the interpolated face value (times sign) is added to the
+// owner cell and subtracted from the neighbour cell.
+__global__ void fvc_grad_scalar_internal(int num_cells, int num_surfaces,
+        const int *lower_index, const int *upper_index, const double *face_vector,
+        const double *weight, const double *vf, double *output, double sign)
+{
+    const int face = blockDim.x * blockIdx.x + threadIdx.x;
+    if (face >= num_surfaces) return;
+
+    const int owner = lower_index[face];
+    const int neighbor = upper_index[face];
+
+    // face value: w * (owner - neighbor) + neighbor
+    const double w = weight[face];
+    const double face_value = (w * (vf[owner] - vf[neighbor]) + vf[neighbor]);
+
+    // face vector is stored component-major: component c of face i at [num_surfaces * c + i]
+    const double grad_x = face_vector[num_surfaces * 0 + face] * face_value * sign;
+    const double grad_y = face_vector[num_surfaces * 1 + face] * face_value * sign;
+    const double grad_z = face_vector[num_surfaces * 2 + face] * face_value * sign;
+
+    // owner receives +grad
+    atomicAdd(&(output[num_cells * 0 + owner]), grad_x);
+    atomicAdd(&(output[num_cells * 1 + owner]), grad_y);
+    atomicAdd(&(output[num_cells * 2 + owner]), grad_z);
+
+    // neighbour receives -grad
+    atomicAdd(&(output[num_cells * 0 + neighbor]), -grad_x);
+    atomicAdd(&(output[num_cells * 1 + neighbor]), -grad_y);
+    atomicAdd(&(output[num_cells * 2 + neighbor]), -grad_z);
+}
+
+// Adds the boundary-face contributions to a scalar-field gradient for the faces
+// [offset, offset + num): boundary face vector times the stored boundary value (times sign)
+// is accumulated into the cell given by face2Cells.
+__global__ void fvc_grad_scalar_boundary_zeroGradient(int num_boundary_surfaces, int num_cells, int num, int offset, const int *face2Cells,
+        const double *boundary_face_vector, const double *boundary_vf, double *output, double sign)
+{
+    const int local_face = blockDim.x * blockIdx.x + threadIdx.x;
+    if (local_face >= num) return;
+
+    const int face = offset + local_face;
+    const int cell = face2Cells[face];
+
+    const double face_value = boundary_vf[face];
+
+    // face vector is stored component-major: component c of face i at [num_boundary_surfaces * c + i]
+    const double grad_x = boundary_face_vector[num_boundary_surfaces * 0 + face] * face_value;
+    const double grad_y = boundary_face_vector[num_boundary_surfaces * 1 + face] * face_value;
+    const double grad_z = boundary_face_vector[num_boundary_surfaces * 2 + face] * face_value;
+
+    // sign is applied last, as in the product (Sf * value) * sign
+    atomicAdd(&(output[num_cells * 0 + cell]), grad_x * sign);
+    atomicAdd(&(output[num_cells * 1 + cell]), grad_y * sign);
+    atomicAdd(&(output[num_cells * 2 + cell]), grad_z * sign);
+}
+
+// Scalar-field gradient contributions from the boundary faces [offset, offset + num) of a
+// processor-type patch. The face value blends two entries of boundary_vf,
+// (1 - w) * [offset + i] + w * [offset + num + i]; the boundary face vector times that value
+// (times sign) is accumulated into the cell given by face2Cells.
+__global__ void fvc_grad_scalar_boundary_processor(int num_boundary_surfaces, int num_cells, int num,
+        int offset, const int *face2Cells, const double *boundary_face_vector,
+        const double *boundary_weight, const double *boundary_vf, double *output, double sign)
+{
+    const int local_face = blockDim.x * blockIdx.x + threadIdx.x;
+    if (local_face >= num) return;
+
+    const int neighbor_face = offset + local_face;
+    const int internal_face = offset + num + local_face;
+    const int cell = face2Cells[neighbor_face];
+
+    // weighted face value
+    const double w = boundary_weight[neighbor_face];
+    const double face_value = (1 - w) * boundary_vf[neighbor_face] + w * boundary_vf[internal_face];
+
+    // face vector is stored component-major: component c of face i at [num_boundary_surfaces * c + i]
+    const double grad_x = boundary_face_vector[num_boundary_surfaces * 0 + neighbor_face] * face_value;
+    const double grad_y = boundary_face_vector[num_boundary_surfaces * 1 + neighbor_face] * face_value;
+    const double grad_z = boundary_face_vector[num_boundary_surfaces * 2 + neighbor_face] * face_value;
+
+    // sign is applied last, as in the product (Sf * value) * sign
+    atomicAdd(&(output[num_cells * 0 + cell]), grad_x * sign);
+    atomicAdd(&(output[num_cells * 1 + cell]), grad_y * sign);
+    atomicAdd(&(output[num_cells * 2 + cell]), grad_z * sign);
+}
+
+// Divides the nine components of a per-cell tensor field by the cell volume, in place.
+// Component k of cell i is stored at output[num_cells * k + i].
+__global__ void divide_cell_volume_tsr(int num_cells, const double* volume, double *output)
+{
+    const int cell = blockDim.x * blockIdx.x + threadIdx.x;
+    if (cell >= num_cells) return;
+    const double cell_volume = volume[cell];
+    output[num_cells * 0 + cell] /= cell_volume;
+    output[num_cells * 1 + cell] /= cell_volume;
+    output[num_cells * 2 + cell] /= cell_volume;
+    output[num_cells * 3 + cell] /= cell_volume;
+    output[num_cells * 4 + cell] /= cell_volume;
+    output[num_cells * 5 + cell] /= cell_volume;
+    output[num_cells * 6 + cell] /= cell_volume;
+    output[num_cells * 7 + cell] /= cell_volume;
+    output[num_cells * 8 + cell] /= cell_volume;
+}
+
+// Divides the three components of a per-cell vector field by the cell volume, in place.
+// Component c of cell i is stored at output[num_cells * c + i].
+__global__ void divide_cell_volume_vec(int num_cells, const double* volume, double *output)
+{
+    const int cell = blockDim.x * blockIdx.x + threadIdx.x;
+    if (cell >= num_cells) return;
+
+    const double cell_volume = volume[cell];
+    output[num_cells * 0 + cell] /= cell_volume;
+    output[num_cells * 1 + cell] /= cell_volume;
+    output[num_cells * 2 + cell] /= cell_volume;
+}
+
+// Divides a per-cell scalar field by the cell volume, in place:
+// output[i] = output[i] / volume[i] for every cell i < num_cells.
+__global__ void divide_cell_volume_scalar(int num_cells, const double* volume, double *output)
+{
+    const int cell = blockDim.x * blockIdx.x + threadIdx.x;
+    if (cell >= num_cells) return;
+
+    const double cell_volume = volume[cell];
+    output[cell] /= cell_volume;
+}
+
+// Boundary gradient for the faces [offset, offset + num) with zero normal gradient: takes the gradient tensor of the
+// cell given by face2Cells and removes its face-normal part, boundary_grad_ab = grad_ab - n_a * (n . grad)_b.
+__global__ void fvc_grad_vector_correctBC_zeroGradient(int num_cells, int num_boundary_surfaces,
+        int num, int offset, const int *face2Cells,
+        const double *internal_grad, const double *boundary_vf, const double *boundary_sf,
+        const double *boundary_mag_sf, double *boundary_grad)
+{
+    const int local_face = blockDim.x * blockIdx.x + threadIdx.x;
+    if (local_face >= num) return;
+    const int face = offset + local_face;
+    const int cell = face2Cells[face];
+    const double grad_xx = internal_grad[num_cells * 0 + cell];
+    const double grad_xy = internal_grad[num_cells * 1 + cell];
+    const double grad_xz = internal_grad[num_cells * 2 + cell];
+    const double grad_yx = internal_grad[num_cells * 3 + cell];
+    const double grad_yy = internal_grad[num_cells * 4 + cell];
+    const double grad_yz = internal_grad[num_cells * 5 + cell];
+    const double grad_zx = internal_grad[num_cells * 6 + cell];
+    const double grad_zy = internal_grad[num_cells * 7 + cell];
+    const double grad_zz = internal_grad[num_cells * 8 + cell];
+
+    // unit face normal n = Sf / |Sf|
+    const double nx = boundary_sf[num_boundary_surfaces * 0 + face] / boundary_mag_sf[face];
+    const double ny = boundary_sf[num_boundary_surfaces * 1 + face] / boundary_mag_sf[face];
+    const double nz = boundary_sf[num_boundary_surfaces * 2 + face] / boundary_mag_sf[face];
+
+    // the prescribed normal gradient is zero, so the correction is just -(n . grad); boundary_vf is not read
+    const double corr_x = - (nx * grad_xx + ny * grad_yx + nz * grad_zx);
+    const double corr_y = - (nx * grad_xy + ny * grad_yy + nz * grad_zy);
+    const double corr_z = - (nx * grad_xz + ny * grad_yz + nz * grad_zz);
+
+    boundary_grad[num_boundary_surfaces * 0 + face] = grad_xx + nx * corr_x;
+    boundary_grad[num_boundary_surfaces * 1 + face] = grad_xy + nx * corr_y;
+    boundary_grad[num_boundary_surfaces * 2 + face] = grad_xz + nx * corr_z;
+    boundary_grad[num_boundary_surfaces * 3 + face] = grad_yx + ny * corr_x;
+    boundary_grad[num_boundary_surfaces * 4 + face] = grad_yy + ny * corr_y;
+    boundary_grad[num_boundary_surfaces * 5 + face] = grad_yz + ny * corr_z;
+    boundary_grad[num_boundary_surfaces * 6 + face] = grad_zx + nz * corr_x;
+    boundary_grad[num_boundary_surfaces * 7 + face] = grad_zy + nz * corr_y;
+    boundary_grad[num_boundary_surfaces * 8 + face] = grad_zz + nz * corr_z;
+}
+
+// Boundary gradient for the faces [offset, offset + num) of a fixed-value patch: takes the gradient tensor of the cell given
+// by face2Cells and replaces its face-normal part by the one-sided normal gradient deltaCoeffs * (boundary value - cell value).
+__global__ void fvc_grad_vector_correctBC_fixedValue(int num_cells, int num_boundary_surfaces,
+        int num, int offset, const int *face2Cells,
+        const double *internal_grad, const double *vf, const double *boundary_sf,
+        const double *boundary_mag_sf, double *boundary_grad,
+        const double *boundary_deltaCoeffs, const double *boundary_vf)
+{
+    const int local_face = blockDim.x * blockIdx.x + threadIdx.x;
+    if (local_face >= num) return;
+    const int face = offset + local_face;
+    const int cell = face2Cells[face];
+    const double grad_xx = internal_grad[num_cells * 0 + cell];
+    const double grad_xy = internal_grad[num_cells * 1 + cell];
+    const double grad_xz = internal_grad[num_cells * 2 + cell];
+    const double grad_yx = internal_grad[num_cells * 3 + cell];
+    const double grad_yy = internal_grad[num_cells * 4 + cell];
+    const double grad_yz = internal_grad[num_cells * 5 + cell];
+    const double grad_zx = internal_grad[num_cells * 6 + cell];
+    const double grad_zy = internal_grad[num_cells * 7 + cell];
+    const double grad_zz = internal_grad[num_cells * 8 + cell];
+
+    // unit face normal n = Sf / |Sf|
+    const double nx = boundary_sf[num_boundary_surfaces * 0 + face] / boundary_mag_sf[face];
+    const double ny = boundary_sf[num_boundary_surfaces * 1 + face] / boundary_mag_sf[face];
+    const double nz = boundary_sf[num_boundary_surfaces * 2 + face] / boundary_mag_sf[face];
+
+    // sn_grad: surface-normal gradient according to the fixedValue BC
+    const double sn_grad_x = boundary_deltaCoeffs[face] * (boundary_vf[num_boundary_surfaces * 0 + face] - vf[num_cells * 0 + cell]);
+    const double sn_grad_y = boundary_deltaCoeffs[face] * (boundary_vf[num_boundary_surfaces * 1 + face] - vf[num_cells * 1 + cell]);
+    const double sn_grad_z = boundary_deltaCoeffs[face] * (boundary_vf[num_boundary_surfaces * 2 + face] - vf[num_cells * 2 + cell]);
+
+    // correction = prescribed normal gradient minus the normal part of the cell gradient, (n . grad)
+    const double corr_x = sn_grad_x - (nx * grad_xx + ny * grad_yx + nz * grad_zx);
+    const double corr_y = sn_grad_y - (nx * grad_xy + ny * grad_yy + nz * grad_zy);
+    const double corr_z = sn_grad_z - (nx * grad_xz + ny * grad_yz + nz * grad_zz);
+
+    boundary_grad[num_boundary_surfaces * 0 + face] = grad_xx + nx * corr_x;
+    boundary_grad[num_boundary_surfaces * 1 + face] = grad_xy + nx * corr_y;
+    boundary_grad[num_boundary_surfaces * 2 + face] = grad_xz + nx * corr_z;
+    boundary_grad[num_boundary_surfaces * 3 + face] = grad_yx + ny * corr_x;
+    boundary_grad[num_boundary_surfaces * 4 + face] = grad_yy + ny * corr_y;
+    boundary_grad[num_boundary_surfaces * 5 + face] = grad_yz + ny * corr_z;
+    boundary_grad[num_boundary_surfaces * 6 + face] = grad_zx + nz * corr_x;
+    boundary_grad[num_boundary_surfaces * 7 + face] = grad_zy + nz * corr_y;
+    boundary_grad[num_boundary_surfaces * 8 + face] = grad_zz + nz * corr_z;
+}
+
+// Boundary-gradient correction for one side of a coupled patch pair (the
+// kernel name indicates a cyclic patch): each face receives, for all nine
+// tensor components, a weighted blend of the gradients stored in the two
+// cells that face2Cells associates with the face and with its partner.
+//
+// One thread handles one face; threads with index >= num do nothing.
+//   num_cells             : component stride of internal_grad
+//   num_boundary_surfaces : component stride of boundary_grad
+//   num                   : number of faces to process
+//   internal_offset       : boundary index of the first face written
+//   neighbor_offset       : boundary index of the first partner face
+//   face2Cells            : boundary face -> cell index lookup
+//   boundary_weight       : blend weight, read at the written face
+//   internal_grad         : cell gradients, component-major, 9 components
+//   boundary_grad         : output, component-major, 9 components
+__global__ void fvc_grad_vector_correctBC_cyclic(int num_cells, int num_boundary_surfaces,
+        int num, int internal_offset, int neighbor_offset, const int *face2Cells,
+        const double *boundary_weight, const double *internal_grad, double *boundary_grad)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= num)
+        return;
+
+    const int own_face = internal_offset + tid;
+    const int nbr_face = neighbor_offset + tid;
+    const int own_cell = face2Cells[own_face];
+    const int nbr_cell = face2Cells[nbr_face];
+    const double w = boundary_weight[own_face];
+
+    // Same blend for every component:
+    //   w * (cell behind this face) + (1 - w) * (cell behind the partner face)
+    for (int c = 0; c < 9; ++c) {
+        boundary_grad[num_boundary_surfaces * c + own_face] = w * internal_grad[num_cells * c + own_cell]
+                + (1 - w) * internal_grad[num_cells * c + nbr_cell];
+    }
+}
+
+void fvc_grad_vector_correctBC_processor(cudaStream_t stream, ncclComm_t comm,
+        int peer, int num, int offset, int num_cells, int num_boundary_surfaces,
+        const int *face2Cells, const double *internal_grad, double *boundary_grad)
+{ // Host side: enqueue the tensor fill kernel on stream, then swap 9 components with peer over NCCL.
+    int neighbor_start_index = offset; // receive region: num entries starting at offset
+    int internal_start_index = offset + num; // send region: num entries starting at offset + num
+    // Launch shape for the fill kernel: 32 threads per block, block count rounded up.
+    size_t threads_per_block = 32;
+    size_t blocks_per_grid = (num + threads_per_block - 1) / threads_per_block;
+    // Kernel is defined outside this section; presumably it fills the send region -- TODO confirm.
+    correct_internal_boundary_field_tensor<<<blocks_per_grid, threads_per_block, 0, stream>>>(num, offset, 
+            num_boundary_surfaces, num_cells, internal_grad, face2Cells, boundary_grad);
+    // Grouped exchange: for each of the 9 components, send num doubles to peer and receive num from it.
+    TICK_INIT_EVENT;
+    TICK_START_EVENT;
+    checkNcclErrors(ncclGroupStart());
+    for (int i = 0; i < 9; i++) {
+        checkNcclErrors(ncclSend(boundary_grad + num_boundary_surfaces * i + internal_start_index, num, ncclDouble, peer, comm, stream));
+        checkNcclErrors(ncclRecv(boundary_grad + num_boundary_surfaces * i + neighbor_start_index, num, ncclDouble, peer, comm, stream));   
+    }
+    checkNcclErrors(ncclGroupEnd());
+    TICK_END_EVENT(nccl tensor);
+    //checkCudaErrors(cudaStreamSynchronize(stream));
+}
+
+// zeroGradient treatment of a cell-gradient field on boundary faces: copy the
+// adjacent cell's gradient and remove its component along the face normal.
+__global__ void fvc_grad_cell_scalar_correctBC_zeroGradient(int num_cells, int num_boundary_surfaces,
+        int num, int offset, const int *face2Cells,
+        const double *internal_grad, const double *boundary_sf,
+        const double *boundary_mag_sf, double *boundary_grad)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= num)
+        return;
+    const int face = offset + tid;
+    const int cell = face2Cells[face];
+    // Unit normal = face-area vector / its magnitude.
+    const double nx = boundary_sf[num_boundary_surfaces * 0 + face] / boundary_mag_sf[face];
+    const double ny = boundary_sf[num_boundary_surfaces * 1 + face] / boundary_mag_sf[face];
+    const double nz = boundary_sf[num_boundary_surfaces * 2 + face] / boundary_mag_sf[face];
+    const double gx = internal_grad[num_cells * 0 + cell];
+    const double gy = internal_grad[num_cells * 1 + cell];
+    const double gz = internal_grad[num_cells * 2 + cell];
+
+    // The prescribed normal gradient is zero, so the correction is -(n . g).
+    const double corr = -(nx * gx + ny * gy + nz * gz);
+    boundary_grad[num_boundary_surfaces * 0 + face] = gx + nx * corr;
+    boundary_grad[num_boundary_surfaces * 1 + face] = gy + ny * corr;
+    boundary_grad[num_boundary_surfaces * 2 + face] = gz + nz * corr;
+}
+
+// fixedValue treatment of a cell-gradient field on boundary faces: replace the
+// normal part of the adjacent cell's gradient by the face-to-cell difference.
+__global__ void fvc_grad_cell_scalar_correctBC_fixedValue(int num_cells, int num_boundary_surfaces,
+        int num, int offset, const int *face2Cells,
+        const double *internal_grad, const double *vf, const double *boundary_sf,
+        const double *boundary_mag_sf, double *boundary_grad,
+        const double *boundary_deltaCoeffs, const double *boundary_vf)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= num)
+        return;
+    const int face = offset + tid;
+    const int cell = face2Cells[face];
+    // Unit normal = face-area vector / its magnitude.
+    const double nx = boundary_sf[num_boundary_surfaces * 0 + face] / boundary_mag_sf[face];
+    const double ny = boundary_sf[num_boundary_surfaces * 1 + face] / boundary_mag_sf[face];
+    const double nz = boundary_sf[num_boundary_surfaces * 2 + face] / boundary_mag_sf[face];
+    const double gx = internal_grad[num_cells * 0 + cell];
+    const double gy = internal_grad[num_cells * 1 + cell];
+    const double gz = internal_grad[num_cells * 2 + cell];
+
+    // Surface-normal gradient implied by the boundary value of vf.
+    const double sn = boundary_deltaCoeffs[face] * (boundary_vf[face] - vf[cell]);
+    const double corr = sn - (nx * gx + ny * gy + nz * gz);
+
+    boundary_grad[num_boundary_surfaces * 0 + face] = gx + nx * corr;
+    boundary_grad[num_boundary_surfaces * 1 + face] = gy + ny * corr;
+    boundary_grad[num_boundary_surfaces * 2 + face] = gz + nz * corr;
+}
+
+// Three-component counterpart of the coupled-patch gradient blend: each face
+// on the written side gets w * grad(cell behind it) + (1 - w) * grad(cell
+// behind its partner face), with w read from boundary_weight at the written
+// face. One thread per face; threads with index >= num exit.
+__global__ void fvc_grad_cell_scalar_correctBC_cyclic(int num_cells, int num_boundary_surfaces,
+        int num, int internal_offset, int neighbor_offset, const int *face2Cells,
+        const double *boundary_weight, const double *internal_grad, double *boundary_grad)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= num)
+        return;
+
+    const int own_face = internal_offset + tid;
+    const int nbr_face = neighbor_offset + tid;
+    const int own_cell = face2Cells[own_face];
+    const int nbr_cell = face2Cells[nbr_face];
+    const double w = boundary_weight[own_face];
+
+    for (int c = 0; c < 3; ++c) {
+        boundary_grad[num_boundary_surfaces * c + own_face] = w * internal_grad[num_cells * c + own_cell]
+                + (1 - w) * internal_grad[num_cells * c + nbr_cell];
+    }
+}
+
+// Face-to-cell accumulation of a surface scalar over internal faces: the face
+// value times sign is added to the lower_index cell and subtracted from the
+// upper_index cell. Accumulation is atomic; output is not cleared here.
+__global__ void fvc_div_surface_scalar_internal(int num_surfaces,
+        const int *lower_index, const int *upper_index, const double *ssf,
+        double *output, double sign)
+{
+    const int face = blockDim.x * blockIdx.x + threadIdx.x;
+    if (face >= num_surfaces)
+        return;
+
+    const double flux = ssf[face];
+    const int own = lower_index[face];
+    const int nbr = upper_index[face];
+
+    // Equal and opposite contributions on the two sides of the face.
+    atomicAdd(output + own, flux * sign);
+    atomicAdd(output + nbr, -flux * sign);
+}
+
+// Internal-face part of div(faceFlux, vf): vf is linearly interpolated to the
+// face with weight w, multiplied by faceFlux, and scattered with opposite signs.
+__global__ void fvc_div_surface_scalar_vol_scalar_internal(int num_surfaces,
+        const int *lower_index, const int *upper_index, const double *weight,
+        const double *faceFlux, const double *vf, double *output, double sign)
+{
+    const int face = blockDim.x * blockIdx.x + threadIdx.x;
+    if (face >= num_surfaces)
+        return;
+    const int own = lower_index[face];
+    const int nbr = upper_index[face];
+    const double vf_own = vf[own];
+    const double vf_nbr = vf[nbr];
+
+    // w * vf_own + (1 - w) * vf_nbr, written in the single-multiply form.
+    const double face_val = weight[face] * (vf_own - vf_nbr) + vf_nbr;
+    const double flux = faceFlux[face] * face_val;
+    atomicAdd(output + own, flux * sign);
+    atomicAdd(output + nbr, -flux * sign);
+}
+
+// Boundary-face part of the surface-scalar divergence: for each face in
+// [offset, offset + num) the boundary flux times sign is atomically added to
+// the cell given by face2Cells.
+__global__ void fvc_div_surface_scalar_boundary(int num, int offset, const int *face2Cells,
+        const double *boundary_ssf, double *output, double sign)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid < num) {
+        const int face = offset + tid;
+        // Only the cell looked up through face2Cells receives a contribution.
+        atomicAdd(output + face2Cells[face], boundary_ssf[face] * sign);
+    }
+}
+
+// Boundary-face part of div(flux, vf): for each face in [offset, offset + num)
+// the product boundary_ssf * boundary_vf * sign is atomically added to the
+// cell given by face2Cells.
+__global__ void fvc_div_surface_scalar_vol_scalar_boundary(int num, int offset, const int *face2Cells,
+        const double *boundary_vf, const double *boundary_ssf, double *output, double sign)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid < num) {
+        const int face = offset + tid;
+        const double flux = boundary_ssf[face] * boundary_vf[face];
+        atomicAdd(output + face2Cells[face], flux * sign);
+    }
+}
+
+// Internal-face part of the divergence of a cell vector field: the field is
+// linearly interpolated to each face, dotted with the face vector, and the
+// result times sign is scattered with opposite signs to the two cells.
+__global__ void fvc_div_cell_vector_internal(int num_cells, int num_surfaces,
+        const int *lower_index, const int *upper_index,
+        const double *field_vector, const double *weight, const double *face_vector,
+        double *output, double sign)
+{
+    const int face = blockDim.x * blockIdx.x + threadIdx.x;
+    if (face >= num_surfaces)
+        return;
+
+    const int own = lower_index[face];
+    const int nbr = upper_index[face];
+    const double w = weight[face];
+
+    // Face value per component: w * own + (1 - w) * nbr.
+    const double ux = (w * (field_vector[num_cells * 0 + own] - field_vector[num_cells * 0 + nbr]) + field_vector[num_cells * 0 + nbr]);
+    const double uy = (w * (field_vector[num_cells * 1 + own] - field_vector[num_cells * 1 + nbr]) + field_vector[num_cells * 1 + nbr]);
+    const double uz = (w * (field_vector[num_cells * 2 + own] - field_vector[num_cells * 2 + nbr]) + field_vector[num_cells * 2 + nbr]);
+
+    // Face vector components (component stride num_surfaces).
+    const double sx = face_vector[num_surfaces * 0 + face];
+    const double sy = face_vector[num_surfaces * 1 + face];
+    const double sz = face_vector[num_surfaces * 2 + face];
+    const double flux = sx * ux + sy * uy + sz * uz;
+    atomicAdd(output + own, flux * sign);
+    atomicAdd(output + nbr, -flux * sign);
+}
+
+// Boundary-face part of the divergence of a vector field: dot the boundary
+// field value with the boundary face vector and atomically add the result
+// times sign to the cell given by face2Cells.
+__global__ void fvc_div_cell_vector_boundary(int num_boundary_surfaces, int num, int offset, const int *face2Cells,
+        const double *boundary_face_vector, const double *boundary_field_vector, double *output, double sign)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= num)
+        return;
+
+    const int face = offset + tid;
+    const int cell = face2Cells[face];
+    // Both arrays are component-major with stride num_boundary_surfaces.
+    const double ux = boundary_field_vector[num_boundary_surfaces * 0 + face];
+    const double uy = boundary_field_vector[num_boundary_surfaces * 1 + face];
+    const double uz = boundary_field_vector[num_boundary_surfaces * 2 + face];
+    const double sx = boundary_face_vector[num_boundary_surfaces * 0 + face];
+    const double sy = boundary_face_vector[num_boundary_surfaces * 1 + face];
+    const double sz = boundary_face_vector[num_boundary_surfaces * 2 + face];
+    const double flux = sx * ux + sy * uy + sz * uz;
+
+    atomicAdd(output + cell, flux * sign);
+}
+
+// Divergence contribution of a vector field on a coupled (processor-type, per
+// the kernel name) patch. boundary_vf holds two values per face, num entries
+// apart; they are blended with the patch weight before the dot product.
+__global__ void fvc_div_cell_vector_boundary_processor(int num_boundary_surfaces, int num, int offset, const int *face2Cells,
+        const double *boundary_weight, const double *boundary_face_vector, const double *boundary_vf, double *output, double sign)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= num)
+        return;
+
+    const int nbr_slot = offset + tid;        // also indexes weight, face vector and face2Cells
+    const int own_slot = offset + num + tid;
+    const int cell = face2Cells[nbr_slot];
+    const double w = boundary_weight[nbr_slot];
+
+    // Face value per component: (1 - w) * first slot + w * second slot.
+    const double ux = (1 - w) * boundary_vf[num_boundary_surfaces * 0 + nbr_slot] +
+        w * boundary_vf[num_boundary_surfaces * 0 + own_slot];
+    const double uy = (1 - w) * boundary_vf[num_boundary_surfaces * 1 + nbr_slot] +
+        w * boundary_vf[num_boundary_surfaces * 1 + own_slot];
+    const double uz = (1 - w) * boundary_vf[num_boundary_surfaces * 2 + nbr_slot] +
+        w * boundary_vf[num_boundary_surfaces * 2 + own_slot];
+    const double sx = boundary_face_vector[num_boundary_surfaces * 0 + nbr_slot];
+    const double sy = boundary_face_vector[num_boundary_surfaces * 1 + nbr_slot];
+    const double sz = boundary_face_vector[num_boundary_surfaces * 2 + nbr_slot];
+    const double flux = sx * ux + sy * uy + sz * uz;
+    atomicAdd(output + cell, flux * sign);
+}
+
+
+// Internal-face part of the divergence of a cell tensor field (9 components,
+// component-major with stride num_cells). The tensor is linearly interpolated
+// to the face and contracted with the face vector on its first index, giving
+// a 3-vector that is scattered to the two cells with opposite signs.
+// output holds 3 components with stride num_cells and is accumulated atomically.
+__global__ void fvc_div_cell_tensor_internal(int num_cells, int num_surfaces,
+        const int *lower_index, const int *upper_index,
+        const double *vf, const double *weight, const double *face_vector,
+        double *output, double sign)
+{
+    const int face = blockDim.x * blockIdx.x + threadIdx.x;
+    if (face >= num_surfaces)
+        return;
+
+    const int own = lower_index[face];
+    const int nbr = upper_index[face];
+    const double w = weight[face];
+    const double sx = face_vector[num_surfaces * 0 + face];
+    const double sy = face_vector[num_surfaces * 1 + face];
+    const double sz = face_vector[num_surfaces * 2 + face];
+
+    // Column j of the result uses tensor components j, 3 + j and 6 + j:
+    //   div[j] = (Sf_x * T_xj + Sf_y * T_yj + Sf_z * T_zj) * sign
+    double div[3];
+    for (int j = 0; j < 3; ++j) {
+        const double t_xj = (w * (vf[num_cells * (0 + j) + own] - vf[num_cells * (0 + j) + nbr]) + vf[num_cells * (0 + j) + nbr]);
+        const double t_yj = (w * (vf[num_cells * (3 + j) + own] - vf[num_cells * (3 + j) + nbr]) + vf[num_cells * (3 + j) + nbr]);
+        const double t_zj = (w * (vf[num_cells * (6 + j) + own] - vf[num_cells * (6 + j) + nbr]) + vf[num_cells * (6 + j) + nbr]);
+        div[j] = (sx * t_xj + sy * t_yj + sz * t_zj) * sign;
+    }
+
+    // owner side
+    for (int j = 0; j < 3; ++j)
+        atomicAdd(&(output[num_cells * j + own]), div[j]);
+
+    // neighbour side
+    for (int j = 0; j < 3; ++j)
+        atomicAdd(&(output[num_cells * j + nbr]), -div[j]);
+}
+
+// Boundary-face part of the divergence of a tensor field for patches whose
+// boundary tensor values are supplied ready-made in boundary_vf (the kernel
+// name ties it to zeroGradient patches).
+//   num_cells          : component stride of output (3 components)
+//   num_boundary_faces : component stride of boundary_face_vector (3) and
+//                        boundary_vf (9)
+//   num, offset        : faces [offset, offset + num) are processed
+//   face2Cells         : boundary face -> cell receiving the contribution
+//   sign               : factor applied to every contribution
+__global__ void fvc_div_cell_tensor_boundary_zeroGradient(int num_cells, int num_boundary_faces, int num, int offset, const int *face2Cells,
+        const double *boundary_face_vector, const double *boundary_vf, double *output, double sign)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= num)
+        return;
+
+    const int face = offset + tid;
+    const int cell = face2Cells[face];
+    const double sx = boundary_face_vector[num_boundary_faces * 0 + face];
+    const double sy = boundary_face_vector[num_boundary_faces * 1 + face];
+    const double sz = boundary_face_vector[num_boundary_faces * 2 + face];
+
+    // Column j of the result contracts the face vector with tensor
+    // components j, 3 + j and 6 + j.
+    for (int j = 0; j < 3; ++j) {
+        const double t_xj = boundary_vf[num_boundary_faces * (0 + j) + face];
+        const double t_yj = boundary_vf[num_boundary_faces * (3 + j) + face];
+        const double t_zj = boundary_vf[num_boundary_faces * (6 + j) + face];
+        const double d = (sx * t_xj + sy * t_yj + sz * t_zj) * sign;
+        atomicAdd(&(output[num_cells * j + cell]), d);
+    }
+}
+
+// Boundary-face part of the divergence of a tensor field on a coupled patch
+// (processor-type, per the kernel name).
+//
+// boundary_vf stores two tensors per patch face, num entries apart: the one at
+// offset + i is weighted by (1 - w) and the one at offset + num + i by w,
+// where w = boundary_weight[offset + i]. The blended tensor is contracted
+// with the face vector on its first index and the resulting 3-vector, times
+// sign, is atomically added to the cell face2Cells[offset + i].
+//
+//   num_cells             : component stride of output (3 components)
+//   num_boundary_surfaces : component stride of boundary_face_vector (3) and
+//                           boundary_vf (9)
+//   num, offset           : patch size and index of its first entry
+//   boundary_weight       : per-face blend weight
+//   output                : accumulated in place, not cleared here
+//   sign                  : factor applied to every contribution
+__global__ void fvc_div_cell_tensor_boundary_processor(int num_cells, int num_boundary_surfaces,
+        int num, int offset, const int *face2Cells, const double *boundary_weight,
+        const double *boundary_face_vector, const double *boundary_vf, double *output, double sign)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= num)
+        return;
+
+    const int nbr_slot = offset + tid;
+    const int own_slot = offset + num + tid;
+    const int cell = face2Cells[nbr_slot];
+    const double w = boundary_weight[nbr_slot];
+    const double sx = boundary_face_vector[num_boundary_surfaces * 0 + nbr_slot];
+    const double sy = boundary_face_vector[num_boundary_surfaces * 1 + nbr_slot];
+    const double sz = boundary_face_vector[num_boundary_surfaces * 2 + nbr_slot];
+
+    // Column j of the result contracts the face vector with the blended
+    // tensor components j, 3 + j and 6 + j.
+    for (int j = 0; j < 3; ++j) {
+        const double t_xj = (1 - w) * boundary_vf[num_boundary_surfaces * (0 + j) + nbr_slot] +
+                w * boundary_vf[num_boundary_surfaces * (0 + j) + own_slot];
+        const double t_yj = (1 - w) * boundary_vf[num_boundary_surfaces * (3 + j) + nbr_slot] +
+                w * boundary_vf[num_boundary_surfaces * (3 + j) + own_slot];
+        const double t_zj = (1 - w) * boundary_vf[num_boundary_surfaces * (6 + j) + nbr_slot] +
+                w * boundary_vf[num_boundary_surfaces * (6 + j) + own_slot];
+        const double d = (sx * t_xj + sy * t_yj + sz * t_zj) * sign;
+        atomicAdd(&(output[num_cells * j + cell]), d);
+    }
+}
+
+// Internal-face part of the explicit laplacian of a scalar: the face flux
+// gamma * delta_coeffs * (vf[upper] - vf[lower]) * mag_sf * sign is added to
+// the lower_index cell and subtracted from the upper_index cell.
+__global__ void fvc_laplacian_scalar_internal(int num_surfaces,
+        const int *lower_index, const int *upper_index,
+        const double *mag_sf, const double *delta_coeffs,
+        const double *gamma, const double *vf, double *output, double sign)
+{
+    const int face = blockDim.x * blockIdx.x + threadIdx.x;
+    if (face >= num_surfaces)
+        return;
+
+    const int own = lower_index[face];
+    const int nbr = upper_index[face];
+
+    // Surface-normal gradient from the two-cell difference.
+    const double sn_grad = delta_coeffs[face] * (vf[nbr] - vf[own]);
+    const double flux = gamma[face] * sn_grad * mag_sf[face] * sign;
+    atomicAdd(output + own, flux);
+    atomicAdd(output + nbr, -flux);
+}
+
+// Explicit laplacian contribution of boundary faces whose value is prescribed
+// in boundary_vf: gamma * deltaCoeff * (boundary value - cell value) * mag_sf
+// * sign is atomically added to the cell adjacent to each face.
+__global__ void fvc_laplacian_scalar_boundary_fixedValue(int num, int offset, const int *face2Cells,
+        const double *boundary_mag_sf, const double *boundary_delta_coeffs,
+        const double *boundary_gamma, const double *vf, const double *boundary_vf, double *output, double sign)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num)
+        return;
+    int start_index = offset + index;
+    int cellIndex = face2Cells[start_index]; // patch-global index, like every other per-face array here
+    // sn_grad: solving according to fixedValue BC
+    double boundary_sngrad = boundary_delta_coeffs[start_index] * (boundary_vf[start_index] - vf[cellIndex]);
+    double boundary_ssf = boundary_gamma[start_index] * boundary_sngrad * boundary_mag_sf[start_index] * sign;
+    atomicAdd(&(output[cellIndex]), boundary_ssf);
+}
+
+// Face flux of a cell vector field on internal faces: interpolate linearly to
+// the face and dot with the face vector. output is overwritten; sign is unused.
+__global__ void fvc_flux_internal_kernel(int num_cells, int num_surfaces,
+        const int *lower_index, const int *upper_index,
+        const double *field_vector, const double *weight, const double *face_vector,
+        double *output, double sign)
+{
+    const int face = blockDim.x * blockIdx.x + threadIdx.x;
+    if (face >= num_surfaces)
+        return;
+
+    const int own = lower_index[face];
+    const int nbr = upper_index[face];
+    const double w = weight[face];
+    // Face value per component: w * own + (1 - w) * nbr.
+    const double ux = (w * (field_vector[num_cells * 0 + own] - field_vector[num_cells * 0 + nbr]) + field_vector[num_cells * 0 + nbr]);
+    const double uy = (w * (field_vector[num_cells * 1 + own] - field_vector[num_cells * 1 + nbr]) + field_vector[num_cells * 1 + nbr]);
+    const double uz = (w * (field_vector[num_cells * 2 + own] - field_vector[num_cells * 2 + nbr]) + field_vector[num_cells * 2 + nbr]);
+    // Dot product with the face vector (component stride num_surfaces).
+    output[face] = face_vector[num_surfaces * 0 + face] * ux
+            + face_vector[num_surfaces * 1 + face] * uy
+            + face_vector[num_surfaces * 2 + face] * uz;
+}
+
+// Linear interpolation of a cell scalar to internal faces:
+// output[f] = w * vf[lower] + (1 - w) * vf[upper], evaluated as
+// w * (vf[lower] - vf[upper]) + vf[upper]. sign is unused.
+__global__ void fvc_interpolate_internal_kernel(int num_surfaces, const int *lower_index, const int *upper_index,
+        const double *vf, const double *weight, double *output, double sign)
+{
+    const int face = blockDim.x * blockIdx.x + threadIdx.x;
+    if (face < num_surfaces) {
+        const double v_own = vf[lower_index[face]];
+        const double v_nbr = vf[upper_index[face]];
+        const double w = weight[face];
+        output[face] = (w * (v_own - v_nbr) + v_nbr);
+    }
+}
+
+// Face flux on boundary faces: dot product of the boundary field value with
+// the boundary face vector, written to output at the same boundary index.
+// face2Cells and sign are accepted but not used.
+__global__ void fvc_flux_boundary_kernel(int num_boundary_surfaces, int num, int offset, const int *face2Cells,
+        const double *boundary_face_vector, const double *boundary_field_vector, double *output, double sign)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= num)
+        return;
+
+    const int face = offset + tid;
+    const double ux = boundary_field_vector[num_boundary_surfaces * 0 + face];
+    const double uy = boundary_field_vector[num_boundary_surfaces * 1 + face];
+    const double uz = boundary_field_vector[num_boundary_surfaces * 2 + face];
+
+    output[face] = boundary_face_vector[num_boundary_surfaces * 0 + face] * ux
+            + boundary_face_vector[num_boundary_surfaces * 1 + face] * uy
+            + boundary_face_vector[num_boundary_surfaces * 2 + face] * uz;
+}
+
+// Copies boundary_vf to output for the faces [offset, offset + num); the face
+// value is simply the stored boundary value. sign is unused.
+__global__ void fvc_interpolate_boundary_kernel_upCouple(int num, int offset,
+        const double *boundary_vf, double *output, double sign)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid < num) {
+        output[offset + tid] = boundary_vf[offset + tid];
+    }
+}
+
+// Adds the explicit time-derivative term (vf - vf_old) * rDeltaT * volume * sign to source.
+__global__ void fvc_ddt_scalar_kernel(int num_cells, const double *vf, const double *vf_old,
+        const double rDeltaT, const double *volume, double *source, double sign)
+{
+    const int cell = blockDim.x * blockIdx.x + threadIdx.x;
+    if (cell < num_cells) {
+        source[cell] = source[cell] + (vf[cell] - vf_old[cell]) * rDeltaT * volume[cell] * sign;
+    }
+}
+
+// Overwrites source with (vf - vf_old) * rDeltaT * sign; volume is not used.
+__global__ void fvc_ddt_scalar_field_kernel(int num_cells, const double *vf, const double *vf_old,
+        const double rDeltaT, const double *volume, double *source, double sign)
+{
+    const int cell = blockDim.x * blockIdx.x + threadIdx.x;
+    if (cell < num_cells) {
+        source[cell] = (vf[cell] - vf_old[cell]) * rDeltaT * sign;
+    }
+}
+
+// Copies the negated boundary coefficients of one patch into the external
+// array: external[external_offset + i] = -boundary_coeffs[bou_offset + i].
+__global__ void add_external_entry_kernal(int num, int bou_offset,
+        int external_offset, const double *boundary_coeffs, double *external)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= num)
+        return;
+    // Source and destination use independent offsets.
+    external[external_offset + tid] = - boundary_coeffs[bou_offset + tid];
+}
+
+// Coupled-patch contribution to a scalar system: only the diagonal is touched.
+// For each face in [offset, offset + num) the internal coefficient is
+// atomically added to diag at the cell given by face2Cells.
+__global__ void addBoundaryDiagSrc_scalar_couple(int num, int offset, const int *face2Cells,
+        const double *internal_coeffs, double *diag)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= num)
+        return;
+
+    const int face = offset + tid;
+    // No source-term update in this variant.
+    atomicAdd(&diag[face2Cells[face]], internal_coeffs[face]);
+}
+
+// Diagonal/source contribution for the processorCyclic variant: the internal
+// coefficient is added to diag and twice the boundary coefficient to source,
+// both at the cell given by face2Cells.
+__global__ void addBoundaryDiagSrc_scalar_couple_processorCyclic(int num, int offset, const int *face2Cells,
+        const double *internal_coeffs, const double *boundary_coeffs, double *diag, double *source)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= num)
+        return;
+
+    const int face = offset + tid;
+    const int cell = face2Cells[face];
+    // The source term carries a factor of 2 on the boundary coefficient.
+    atomicAdd(&diag[cell], internal_coeffs[face]);
+    atomicAdd(&source[cell], 2 * boundary_coeffs[face]);
+}
+
+// Uncoupled-patch contribution to a scalar system: for each face in
+// [offset, offset + num) the internal coefficient is added to diag and the
+// boundary coefficient to source, both at the cell given by face2Cells.
+__global__ void addBoundaryDiagSrc_scalar_unCouple(int num, int offset, const int *face2Cells,
+        const double *internal_coeffs, const double *boundary_coeffs,
+        double *diag, double *source)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= num)
+        return;
+
+    const int face = offset + tid;
+    const int cell = face2Cells[face];
+    // Atomic accumulation into both arrays.
+    atomicAdd(&diag[cell], internal_coeffs[face]);
+    atomicAdd(&source[cell], boundary_coeffs[face]);
+}
+
+// Adds boundary coefficients of a 3-component equation to the assembled
+// system, one thread per boundary face over all num_boundary_surfaces faces.
+//   A_csr : three value arrays of length nNz = num_cells + 2 * num_surfaces
+//           stored back to back; internal_coeffs go to the diagonal entry
+//           diagCSRIndex[cell] of each
+//   b     : three right-hand sides with stride num_cells; boundary_coeffs
+//           are added at the cell's row
+// Coefficient arrays are component-major with stride num_boundary_surfaces.
+__global__ void addBoundaryDiagSrc(int num_cells, int num_surfaces, int num_boundary_surfaces, const int *face2Cells,
+        const double *internal_coeffs, const double *boundary_coeffs, const int *diagCSRIndex,
+        double *A_csr, double *b)
+{
+    const int face = blockDim.x * blockIdx.x + threadIdx.x;
+    if (face >= num_boundary_surfaces)
+        return;
+
+    const int cell = face2Cells[face];
+    const int diag_pos = diagCSRIndex[cell];
+    const int nnz = num_cells + 2 * num_surfaces;
+
+    // Diagonal entries, one per component.
+    for (int c = 0; c < 3; ++c)
+        atomicAdd(&A_csr[nnz * c + diag_pos], internal_coeffs[num_boundary_surfaces * c + face]);
+
+    // Right-hand side, one per component.
+    for (int c = 0; c < 3; ++c)
+        atomicAdd(&b[num_cells * c + cell], boundary_coeffs[num_boundary_surfaces * c + face]);
+}
+
+// Gathers matrix values from the ldu array into CSR order for a scalar
+// system: A_csr[i] = ldu[ldu_to_csr_index[i]] for i in [0, nNz).
+__global__ void ldu_to_csr_scalar_kernel(int nNz, const int *ldu_to_csr_index,
+        const double *ldu, double *A_csr)
+{
+    const int pos = blockDim.x * blockIdx.x + threadIdx.x;
+    if (pos < nNz) {
+        // One CSR entry per thread, overwritten (not accumulated).
+        A_csr[pos] = ldu[ldu_to_csr_index[pos]];
+    }
+}
+
+// Gathers matrix values from the ldu array into CSR order and replicates
+// them into three consecutive value arrays of length nNz each:
+// A_csr[nNz * c + i] = ldu[ldu_to_csr_index[i]] for c = 0, 1, 2.
+__global__ void ldu_to_csr_kernel(int nNz, const int *ldu_to_csr_index,
+        const double *ldu, double *A_csr)
+{
+    const int pos = blockDim.x * blockIdx.x + threadIdx.x;
+    if (pos >= nNz)
+        return;
+    const double value = ldu[ldu_to_csr_index[pos]];
+    for (int c = 0; c < 3; ++c)
+        A_csr[nNz * c + pos] = value;
+}
+
+// Adds the component average of the three internal coefficients of every
+// boundary face to A_pEqn at the cell given by face2Cells. internal_coeffs is
+// component-major with stride num_boundary_surfaces; num_cells is unused.
+__global__ void addAveInternaltoDiag(int num_cells, int num_boundary_surfaces, const int *face2Cells,
+        const double *internal_coeffs, double *A_pEqn)
+{
+    const int face = blockDim.x * blockIdx.x + threadIdx.x;
+    if (face >= num_boundary_surfaces)
+        return;
+
+    const double cx = internal_coeffs[num_boundary_surfaces * 0 + face];
+    const double cy = internal_coeffs[num_boundary_surfaces * 1 + face];
+    const double cz = internal_coeffs[num_boundary_surfaces * 2 + face];
+
+    // Mean over the x, y and z coefficients.
+    atomicAdd(&A_pEqn[face2Cells[face]], (cx + cy + cz) / 3);
+}
+
+// Boundary-diagonal contribution to H for a 3-component equation. Per face the
+// three internal coefficients are averaged, and for each component
+// (average - coefficient) * psi of the adjacent cell is added to H_pEqn.
+// H_pEqn and psi are component-major with stride num_cells.
+__global__ void addBoundaryDiag(int num_cells, int num_boundary_surfaces, const int *face2Cells,
+        const double *internal_coeffs, const double *psi, double *H_pEqn)
+{
+    const int face = blockDim.x * blockIdx.x + threadIdx.x;
+    if (face >= num_boundary_surfaces)
+        return;
+
+    const int cell = face2Cells[face];
+    const double cx = internal_coeffs[num_boundary_surfaces * 0 + face];
+    const double cy = internal_coeffs[num_boundary_surfaces * 1 + face];
+    const double cz = internal_coeffs[num_boundary_surfaces * 2 + face];
+    const double mean = (cx + cy + cz) / 3;
+
+    // Negated per-component coefficient plus the component average; H is not
+    // permuted any more, so it is addressed directly by cell index.
+    atomicAdd(&H_pEqn[num_cells * 0 + cell], (-cx + mean) * psi[num_cells * 0 + cell]);
+    atomicAdd(&H_pEqn[num_cells * 1 + cell], (-cy + mean) * psi[num_cells * 1 + cell]);
+    atomicAdd(&H_pEqn[num_cells * 2 + cell], (-cz + mean) * psi[num_cells * 2 + cell]);
+}
+
+// Kernel: off-diagonal part of H for a three-component field, one thread per
+// internal face. It is the parallel form of the serial loop
+//
+//     for (label face=0; face<nFaces; face++)
+//     {
+//         HpsiPtr[uPtr[face]] -= lowerPtr[face]*psiPtr[lPtr[face]];
+//         HpsiPtr[lPtr[face]] -= upperPtr[face]*psiPtr[uPtr[face]];
+//     }
+//
+// applied to each of the three component planes (stride num_cells) of psi and
+// H_pEqn. Updates use atomicAdd because different faces can touch the same cell.
+//   num_cells    - stride between component planes of psi / H_pEqn
+//   num_surfaces - number of faces processed
+//   lower_index / upper_index - cell indices l and u of each face
+//   lower / upper             - per-face coefficients
+__global__ void lduMatrix_H(int num_cells, int num_surfaces,
+        const int *lower_index, const int *upper_index, const double *lower, const double *upper,
+        const double *psi, double *H_pEqn)
+{
+    const int face = threadIdx.x + blockIdx.x * blockDim.x;
+    if (face >= num_surfaces)
+        return;
+
+    const int l = lower_index[face];
+    const int u = upper_index[face];
+
+    // H[u] -= lower * psi[l], component by component
+    for (int cmpt = 0; cmpt < 3; ++cmpt) {
+        atomicAdd(&H_pEqn[num_cells * cmpt + u], -lower[face] * psi[num_cells * cmpt + l]);
+    }
+    // H[l] -= upper * psi[u], component by component
+    for (int cmpt = 0; cmpt < 3; ++cmpt) {
+        atomicAdd(&H_pEqn[num_cells * cmpt + l], -upper[face] * psi[num_cells * cmpt + u]);
+    }
+}
+
+// Kernel: add the boundary coefficients of one patch to H. Thread i handles
+// boundary face (offset + i) and atomically adds the three components of
+// boundary_coeffs for that face to the matching component planes of H_pEqn
+// at cell face2Cells[offset + i].
+//   num_cells             - stride between component planes of H_pEqn
+//   num                   - number of faces handled by this launch
+//   offset                - index of the first face of the patch
+//   num_boundary_surfaces - stride between component planes of boundary_coeffs
+//   face2Cells            - cell index for each boundary face
+//   boundary_coeffs       - three planes of num_boundary_surfaces values
+//   H_pEqn                - three planes of num_cells values (accumulated,
+//                           kept in component-plane layout, not permuted)
+__global__ void addBoundarySrc_unCoupled(int num_cells, int num, int offset, 
+        int num_boundary_surfaces, const int *face2Cells, const double *boundary_coeffs, double *H_pEqn)
+{
+    const int local_face = threadIdx.x + blockIdx.x * blockDim.x;
+    if (local_face < num) {
+        const int face = offset + local_face;
+
+        const double src_x = boundary_coeffs[num_boundary_surfaces * 0 + face];
+        const double src_y = boundary_coeffs[num_boundary_surfaces * 1 + face];
+        const double src_z = boundary_coeffs[num_boundary_surfaces * 2 + face];
+
+        const int cell = face2Cells[face];
+
+        atomicAdd(&H_pEqn[num_cells * 0 + cell], src_x);
+        atomicAdd(&H_pEqn[num_cells * 1 + cell], src_y);
+        atomicAdd(&H_pEqn[num_cells * 2 + cell], src_z);
+    }
+}
+
+// Kernel: divide each component of H_pEqn by the cell volume and store the
+// result with the three components of a cell next to each other.
+//   input  layout: H_pEqn[cmpt * num_cells + cell]
+//   output layout: H_pEqn_perm[cell * 3 + cmpt]
+//   num_cells   - number of cells (one thread per cell)
+//   volume      - per-cell divisor
+//   H_pEqn      - only read by this kernel
+//   H_pEqn_perm - written
+__global__ void divideVol_permute_vec(int num_cells, const double *volume, double *H_pEqn, double *H_pEqn_perm)
+{
+    const int cell = threadIdx.x + blockIdx.x * blockDim.x;
+    if (cell < num_cells) {
+        const double cell_volume = volume[cell];
+        for (int cmpt = 0; cmpt < 3; ++cmpt) {
+            H_pEqn_perm[cell * 3 + cmpt] = H_pEqn[num_cells * cmpt + cell] / cell_volume;
+        }
+    }
+}
+
+// Kernel: per-cell division psi[i] = source[i] / diag[i] for i < num_cells.
+// No check is made for a zero diag entry.
+__global__ void solve_explicit_scalar_kernel(int num_cells, const double *diag, const double *source, double *psi)
+{
+    const int cell = threadIdx.x + blockIdx.x * blockDim.x;
+    if (cell < num_cells) {
+        psi[cell] = source[cell] / diag[cell];
+    }
+}
+
+// Kernel: per-face value
+//     output[f] = upper[f] * psi[upper_index[f]] - lower[f] * psi[lower_index[f]]
+// for every face f < num_surfaces (one thread per face, scalar psi).
+__global__ void lduMatrix_faceH(int num_surfaces,
+        const int *lower_index, const int *upper_index, const double *lower, const double *upper,
+        const double *psi, double *output)
+{
+    const int face = threadIdx.x + blockIdx.x * blockDim.x;
+    if (face < num_surfaces) {
+        const int l = lower_index[face];
+        const int u = upper_index[face];
+        output[face] = upper[face] * psi[u] - lower[face] * psi[l];
+    }
+}
+
+// Kernel: boundary flux for one patch that occupies 2 * num consecutive slots
+// of boundary_psi. Thread i reads
+//     boundary_psi[offset + i]        (first half of the patch's slots)
+//     boundary_psi[offset + num + i]  (second half of the patch's slots)
+// and writes
+//     boundary_output[offset + i] =
+//         boundary_psi[offset + num + i] * internal_coeffs[offset + i]
+//       - boundary_psi[offset + i]       * boundary_coeffs[offset + i].
+// The local names suggest the first half holds neighbour values and the second
+// half patch-internal values -- confirm against the code that fills boundary_psi.
+//   face2cells - not used by this kernel
+__global__ void boundary_flux_couple_process(int num, int offset, const int *face2cells, 
+        const double *boundary_psi, const double *internal_coeffs, 
+        const double *boundary_coeffs, double *boundary_output)
+{
+    const int local_face = threadIdx.x + blockIdx.x * blockDim.x;
+    if (local_face < num) {
+        const int neighbor_slot = offset + local_face;
+        const int internal_slot = offset + num + local_face;
+
+        const double from_internal = boundary_psi[internal_slot] * internal_coeffs[neighbor_slot];
+        const double from_neighbor = boundary_psi[neighbor_slot] * boundary_coeffs[neighbor_slot];
+
+        boundary_output[neighbor_slot] = from_internal - from_neighbor;
+    }
+}
+
+// Kernel: boundary flux for a pair of patches addressed by two offsets.
+// Thread i uses face (internal_offset + i) of one patch and face
+// (neighbor_offset + i) of the other, looks up their cells through
+// face2cells, and computes
+//     psi[cell_internal] * internal_coeffs[internal_offset + i]
+//   - psi[cell_neighbor] * boundary_coeffs[internal_offset + i].
+// NOTE(review): the result is stored at boundary_output[neighbor_offset + i],
+// while both coefficients are read at internal_offset + i. Confirm that
+// writing to the neighbour patch's slot (rather than this patch's) is intended.
+__global__ void boundary_flux_couple_cyclic(int num, int internal_offset, 
+        int neighbor_offset, const int *face2cells, 
+        const double *psi, const double *internal_coeffs, 
+        const double *boundary_coeffs, double *boundary_output)
+{
+    const int local_face = threadIdx.x + blockIdx.x * blockDim.x;
+    if (local_face < num) {
+        const int internal_face = internal_offset + local_face;
+        const int neighbor_face = neighbor_offset + local_face;
+
+        const int internal_cell = face2cells[internal_face];
+        const int neighbor_cell = face2cells[neighbor_face];
+
+        const double from_internal = psi[internal_cell] * internal_coeffs[internal_face];
+        const double from_neighbor = psi[neighbor_cell] * boundary_coeffs[internal_face];
+
+        boundary_output[neighbor_face] = from_internal - from_neighbor;
+    }
+}
+
+// Kernel: boundary flux for one patch, using only the internal coefficient.
+// Thread i handles face f = offset + i and writes
+//     boundary_output[f] = psi[face2cells[f]] * internal_coeffs[f]
+// (internalCoeffs_ * patchInternalField).
+__global__ void boundary_flux_unCouple(int num, int offset, const int *face2cells, 
+        const double *psi, const double *internal_coeffs, double *boundary_output)
+{
+    const int local_face = threadIdx.x + blockIdx.x * blockDim.x;
+    if (local_face < num) {
+        const int face = offset + local_face;
+        const int cell = face2cells[face];
+        boundary_output[face] = psi[cell] * internal_coeffs[face];
+    }
+}
+
+
+// Host wrapper: launches permute_vector_d2h_kernel on `stream` with
+// 256-thread blocks and ceil(num_cells / 256) blocks. The launch is
+// asynchronous; nothing is synchronized or error-checked here.
+void permute_vector_d2h(cudaStream_t stream, int num_cells, const double *input, double *output)
+{
+    const size_t block_size = 256;
+    const size_t grid_size = (num_cells + block_size - 1) / block_size;
+    permute_vector_d2h_kernel<<<grid_size, block_size, 0, stream>>>(num_cells, input, output);
+}
+
+// Host wrapper: launches permute_vector_h2d_kernel on `stream` with
+// 256-thread blocks and ceil(num_cells / 256) blocks. The launch is
+// asynchronous; nothing is synchronized or error-checked here.
+void permute_vector_h2d(cudaStream_t stream, int num_cells, const double *input, double *output)
+{
+    const size_t block_size = 256;
+    const size_t grid_size = (num_cells + block_size - 1) / block_size;
+    permute_vector_h2d_kernel<<<grid_size, block_size, 0, stream>>>(num_cells, input, output);
+}
+
+// Host wrapper: launches field_add_scalar_kernel on `stream` for an internal
+// field of `num` values and a boundary field of `num_boundary_surfaces`
+// values. One launch covers both, so the grid is sized for the larger of
+// the two counts (256 threads per block).
+void field_add_scalar(cudaStream_t stream,
+        int num, const double *input1, const double *input2, double *output,
+        int num_boundary_surfaces, const double *boundary_input1, const double *boundary_input2, double *boundary_output)
+{
+    const size_t block_size = 256;
+    const size_t grid_size = (std::max(num, num_boundary_surfaces) + block_size - 1) / block_size;
+    field_add_scalar_kernel<<<grid_size, block_size, 0, stream>>>(
+            num, num_boundary_surfaces,
+            input1, input2, output,
+            boundary_input1, boundary_input2, boundary_output);
+}
+
+// Host wrapper (internal + boundary overload): launches
+// field_add_vector_kernel on `stream`. One launch covers the internal field
+// (num_cells) and the boundary field (num_boundary_surfaces), so the grid is
+// sized for the larger count (256 threads per block). `sign` is forwarded
+// to the kernel unchanged.
+void field_add_vector(cudaStream_t stream,
+        int num_cells, const double *input1, const double *input2, double *output,
+        int num_boundary_surfaces, const double *boundary_input1, const double *boundary_input2, double *boundary_output, double sign)
+{
+    const size_t block_size = 256;
+    const size_t grid_size = (std::max(num_cells, num_boundary_surfaces) + block_size - 1) / block_size;
+    field_add_vector_kernel<<<grid_size, block_size, 0, stream>>>(
+            num_cells, num_boundary_surfaces,
+            input1, input2, output,
+            boundary_input1, boundary_input2, boundary_output, sign);
+}
+
+// Host wrapper (internal-field-only overload): launches
+// field_add_vector_kernel_internal on `stream` with 256-thread blocks and
+// ceil(num_cells / 256) blocks. `sign` is forwarded to the kernel unchanged.
+void field_add_vector(cudaStream_t stream,
+        int num_cells, const double *input1, const double *input2, double *output, double sign)
+{
+    const size_t block_size = 256;
+    const size_t grid_size = (num_cells + block_size - 1) / block_size;
+    field_add_vector_kernel_internal<<<grid_size, block_size, 0, stream>>>(
+            num_cells, input1, input2, output, sign);
+}
+
+// Host wrapper: launches field_multiply_scalar_kernel on `stream` for an
+// internal field (num_cells) and a boundary field (num_boundary_surfaces).
+// The grid is sized for the larger of the two counts (256 threads per block).
+void field_multiply_scalar(cudaStream_t stream,
+        int num_cells, const double *input1, const double *input2, double *output,
+        int num_boundary_surfaces, const double *boundary_input1, const double *boundary_input2, double *boundary_output)
+{
+    const size_t block_size = 256;
+    const size_t grid_size = (std::max(num_cells, num_boundary_surfaces) + block_size - 1) / block_size;
+    field_multiply_scalar_kernel<<<grid_size, block_size, 0, stream>>>(
+            num_cells, num_boundary_surfaces,
+            input1, input2, output,
+            boundary_input1, boundary_input2, boundary_output);
+}
+
+// Host wrapper: launches vector_half_magSqr_kernal on `stream` for an internal
+// vector field (num_cells) and its boundary field (num_boundary_surfaces).
+// Uses 1024-thread blocks; the grid is sized for the larger of the two counts.
+void vector_half_mag_square(cudaStream_t stream, int num_cells, const double *vec_input, double *scalar_output,
+        int num_boundary_surfaces, const double *boundary_vec_input, double *boundary_scalar_output)
+{
+    const size_t block_size = 1024;
+    const size_t grid_size = (std::max(num_cells, num_boundary_surfaces) + block_size - 1) / block_size;
+    vector_half_magSqr_kernal<<<grid_size, block_size, 0, stream>>>(
+            num_cells, num_boundary_surfaces,
+            vec_input, scalar_output,
+            boundary_vec_input, boundary_scalar_output);
+}
+
+// Host wrapper (internal + boundary overload): launches
+// scalar_multiply_vector_kernel on `stream`. The grid is sized for the larger
+// of num_cells and num_boundary_surfaces (256 threads per block).
+// NOTE(review): `sign` is accepted but never passed to the kernel, so it has
+// no effect on the result here -- confirm whether callers rely on it.
+void scalar_field_multiply_vector_field(cudaStream_t stream,
+        int num_cells, const double *scalar_input, const double *vector_input, double *output,
+        int num_boundary_surfaces, const double *scalar_boundary_input, const double *vector_boundary_input, double *boundary_output, double sign)
+{
+    const size_t block_size = 256;
+    const size_t grid_size = (std::max(num_cells, num_boundary_surfaces) + block_size - 1) / block_size;
+    scalar_multiply_vector_kernel<<<grid_size, block_size, 0, stream>>>(
+            num_cells, num_boundary_surfaces,
+            scalar_input, vector_input, output,
+            scalar_boundary_input, vector_boundary_input, boundary_output);
+}
+
+// Host wrapper (internal-field-only overload): launches
+// scalar_multiply_vector_internal_kernel on `stream` with 256-thread blocks
+// and ceil(num_cells / 256) blocks.
+// NOTE(review): `sign` is accepted but never passed to the kernel, so it has
+// no effect on the result here -- confirm whether callers rely on it.
+void scalar_field_multiply_vector_field(cudaStream_t stream,
+        int num_cells, const double *scalar_input, const double *vector_input, double *output, double sign)
+{
+    const size_t block_size = 256;
+    const size_t grid_size = (num_cells + block_size - 1) / block_size;
+    scalar_multiply_vector_internal_kernel<<<grid_size, block_size, 0, stream>>>(
+            num_cells, scalar_input, vector_input, output);
+}
+
+// Host wrapper: launches fvc_to_source_vector_kernel on `stream` with
+// 256-thread blocks and ceil(num_cells / 256) blocks, forwarding the cell
+// volumes, the fvc result and the source array.
+void fvc_to_source_vector(cudaStream_t stream, int num_cells, const double *volume, const double *fvc_output, double *source)
+{
+    const size_t block_size = 256;
+    const size_t grid_size = (num_cells + block_size - 1) / block_size;
+    fvc_to_source_vector_kernel<<<grid_size, block_size, 0, stream>>>(
+            num_cells, volume, fvc_output, source);
+}
+
+// Host wrapper: launches fvc_to_source_scalar_kernel on `stream` with
+// 256-thread blocks and ceil(num_cells / 256) blocks, forwarding the cell
+// volumes, the fvc result, the source array and `sign`.
+void fvc_to_source_scalar(cudaStream_t stream, int num_cells, const double *volume, const double *fvc_output, double *source, double sign)
+{
+    const size_t block_size = 256;
+    const size_t grid_size = (num_cells + block_size - 1) / block_size;
+    fvc_to_source_scalar_kernel<<<grid_size, block_size, 0, stream>>>(
+            num_cells, volume, fvc_output, source, sign);
+}
+
+// Host routine: fold the boundary coefficients into a scalar ldu matrix and
+// then gather it into the value array A (all work is queued on `stream`).
+//
+// Steps:
+//   1. For processor / processorCyclic / cyclic patches, launch
+//      add_external_entry_kernal to fill the "external" section of ldu
+//      from boundary_coeffs.
+//   2. For every patch, add the internal coefficients to the diagonal
+//      (coupled patches), or to the diagonal and the source (all others).
+//   3. Launch ldu_to_csr_scalar_kernel to write num_Nz entries of A.
+//
+// Sections of `ldu` addressed here: the diagonal starts at ldu + num_surfaces
+// and the external entries at ldu + num_cells + 2 * num_surfaces.
+// Processor and processorCyclic patches advance the boundary offset by twice
+// the patch size; every other patch advances it by the patch size.
+//   num_boundary_surface - not used by this routine
+void ldu_to_csr_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surface, int num_Nz, 
+        const int* boundary_cell_face, const int *ldu_to_csr_index,
+        int num_patches, const int *patch_size, const int *patch_type,
+        double* ldu, double *source, // b = source
+        const double *internal_coeffs, const double *boundary_coeffs, double *A)
+{
+    double *diag = ldu + num_surfaces;
+    double *external = ldu + num_cells + 2 * num_surfaces;
+
+    size_t threads_per_block, blocks_per_grid;
+
+    // step 1: external entries of the coupled patches
+    int bou_offset = 0;
+    int ext_offset = 0;
+    for (int patch = 0; patch < num_patches; ++patch) {
+        const int n_faces = patch_size[patch];
+        if (n_faces == 0)
+            continue;
+
+        const bool is_processor = patch_type[patch] == boundaryConditions::processor
+                || patch_type[patch] == boundaryConditions::processorCyclic;
+        const bool is_coupled = is_processor || patch_type[patch] == boundaryConditions::cyclic;
+
+        if (is_coupled) {
+            threads_per_block = 64;
+            blocks_per_grid = (n_faces + threads_per_block - 1) / threads_per_block;
+            add_external_entry_kernal<<<blocks_per_grid, threads_per_block, 0, stream>>>(n_faces, bou_offset,
+                    ext_offset, boundary_coeffs, external);
+            ext_offset += n_faces;
+        }
+        bou_offset += is_processor ? n_faces * 2 : n_faces;
+    }
+
+    // step 2: boundary contributions to the diagonal and the source
+    bou_offset = 0;
+    for (int patch = 0; patch < num_patches; ++patch) {
+        const int n_faces = patch_size[patch];
+        if (n_faces == 0)
+            continue;
+
+        threads_per_block = 64;
+        blocks_per_grid = (n_faces + threads_per_block - 1) / threads_per_block;
+
+        const bool is_processor = patch_type[patch] == boundaryConditions::processor
+                || patch_type[patch] == boundaryConditions::processorCyclic;
+
+        if (is_processor || patch_type[patch] == boundaryConditions::cyclic) {
+            addBoundaryDiagSrc_scalar_couple<<<blocks_per_grid, threads_per_block, 0, stream>>>(n_faces, bou_offset,
+                    boundary_cell_face, internal_coeffs, diag);
+        } else {
+            addBoundaryDiagSrc_scalar_unCouple<<<blocks_per_grid, threads_per_block, 0, stream>>>(n_faces, bou_offset,
+                    boundary_cell_face, internal_coeffs, boundary_coeffs, diag, source);
+        }
+        bou_offset += is_processor ? n_faces * 2 : n_faces;
+    }
+
+    // step 3: gather ldu into A
+    threads_per_block = 1024;
+    blocks_per_grid = (num_Nz + threads_per_block - 1) / threads_per_block;
+    ldu_to_csr_scalar_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_Nz, ldu_to_csr_index, ldu, A);
+}
+
+// Host routine: gather an ldu matrix into the value array A (replicated for
+// three components by ldu_to_csr_kernel) and then add the boundary
+// coefficients to A and `source` with addBoundaryDiagSrc. Both kernels are
+// queued on `stream` with 1024-thread blocks.
+//
+//   num_cells, num_surfaces - matrix has num_cells + 2 * num_surfaces entries
+//   num_boundary_surface    - number of boundary faces
+//   boundary_cell_face      - cell index for each boundary face
+//   ldu_to_csr_index        - gather index used by ldu_to_csr_kernel
+//   diag_to_csr_index       - forwarded to addBoundaryDiagSrc
+//
+// A launch with zero blocks is an invalid CUDA launch configuration, so a
+// launch is skipped when it has no work (empty matrix, or no boundary
+// faces). This avoids leaving a spurious launch error for a later
+// cudaGetLastError() check.
+void ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surface,
+        const int* boundary_cell_face, const int *ldu_to_csr_index, const int *diag_to_csr_index,
+        const double *ldu, const double *internal_coeffs, const double *boundary_coeffs, double *source, double *A)
+{
+    const size_t threads_per_block = 1024;
+
+    // gather the matrix entries
+    const int nNz = num_cells + 2 * num_surfaces;
+    if (nNz > 0) {
+        const size_t blocks_per_grid = (nNz + threads_per_block - 1) / threads_per_block;
+        ldu_to_csr_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(nNz, ldu_to_csr_index, ldu, A);
+    }
+
+    // add boundary coefficients to source and diagonal
+    if (num_boundary_surface > 0) {
+        const size_t blocks_per_grid = (num_boundary_surface + threads_per_block - 1) / threads_per_block;
+        addBoundaryDiagSrc<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_surfaces, num_boundary_surface,
+                boundary_cell_face, internal_coeffs, boundary_coeffs, diag_to_csr_index, A, source);
+    }
+}
+
+// Host routine: update the value/gradient internal and boundary coefficients
+// of a scalar field, patch by patch, by launching the kernel that matches each
+// patch type on `stream` (256 threads per block).
+//
+// Handled patch types:
+//   zeroGradient                       -> update_boundary_coeffs_zeroGradient_scalar
+//   fixedValue, fixedEnergy            -> update_boundary_coeffs_fixedValue_scalar
+//   gradientEnergy                     -> update_boundary_coeffs_gradientEnergy_scalar
+//   cyclic, processor, processorCyclic -> update_boundary_coeffs_processor_scalar
+// Any other type is reported on stderr together with its numeric patch type
+// and skipped (its faces still advance the offset).
+//
+// Empty patches are skipped. `offset` is the index of the patch's first face
+// in the boundary arrays; processor and processorCyclic patches advance it by
+// twice the patch size, all others by the patch size. `gradient_offset`
+// indexes energy_gradient and only advances over gradientEnergy patches.
+void update_boundary_coeffs_scalar(cudaStream_t stream,
+        int num_patches, const int *patch_size, const int *patch_type,
+        const double *boundary_delta_coeffs, const double *boundary_vf, const double *boundary_weight,
+        double *value_internal_coeffs, double *value_boundary_coeffs,
+        double *gradient_internal_coeffs, double *gradient_boundary_coeffs, const double *energy_gradient)
+{
+    const size_t threads_per_block = 256;
+
+    int offset = 0;
+    int gradient_offset = 0;
+    for (int i = 0; i < num_patches; i++) {
+        if (patch_size[i] == 0) continue;
+        const size_t blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block;
+
+        const bool is_processor = patch_type[i] == boundaryConditions::processor
+                || patch_type[i] == boundaryConditions::processorCyclic;
+
+        // TODO: just basic patch type now
+        if (patch_type[i] == boundaryConditions::zeroGradient) {
+            update_boundary_coeffs_zeroGradient_scalar<<<blocks_per_grid, threads_per_block, 0, stream>>>(patch_size[i], offset,
+                    value_internal_coeffs, value_boundary_coeffs, gradient_internal_coeffs, gradient_boundary_coeffs);
+        } else if (patch_type[i] == boundaryConditions::fixedValue
+                    || patch_type[i] == boundaryConditions::fixedEnergy) {
+            update_boundary_coeffs_fixedValue_scalar<<<blocks_per_grid, threads_per_block, 0, stream>>>(patch_size[i], offset,
+                    boundary_vf, boundary_delta_coeffs, value_internal_coeffs, value_boundary_coeffs, gradient_internal_coeffs, gradient_boundary_coeffs);
+        } else if (patch_type[i] == boundaryConditions::gradientEnergy) {
+            update_boundary_coeffs_gradientEnergy_scalar<<<blocks_per_grid, threads_per_block, 0, stream>>>(patch_size[i], offset, gradient_offset,
+                    energy_gradient, boundary_delta_coeffs, value_internal_coeffs, value_boundary_coeffs, gradient_internal_coeffs, gradient_boundary_coeffs);
+            gradient_offset += patch_size[i];
+        } else if (patch_type[i] == boundaryConditions::cyclic || is_processor) {
+            // cyclic and processor-type patches share the same coefficient kernel
+            update_boundary_coeffs_processor_scalar<<<blocks_per_grid, threads_per_block, 0, stream>>>(patch_size[i], offset,
+                    boundary_weight, boundary_delta_coeffs, value_internal_coeffs, value_boundary_coeffs, gradient_internal_coeffs, gradient_boundary_coeffs);
+        } else {
+            // report the actual unsupported type instead of a generic message
+            fprintf(stderr, "%s %d, boundaryConditions %d are not support yet!\n", __FILE__, __LINE__, patch_type[i]);
+        }
+
+        // processor-type patches occupy two slots per face in the boundary arrays
+        offset += is_processor ? 2 * patch_size[i] : patch_size[i];
+    }
+}
+
+// Host routine: refresh the boundary values of a scalar field patch by patch.
+// All device work is queued on `stream` with 256 threads per block.
+//
+// Per patch type:
+//   zeroGradient, extrapolated -> correct_boundary_conditions_zeroGradient_scalar
+//   fixedValue, calculated     -> nothing to do
+//   processor, processorCyclic -> correct_boundary_conditions_processor_scalar
+//                                 (host helper; receives `comm` and the peer)
+//   gradientEnergy             -> correct_boundary_conditions_gradientEnergy_scalar
+//   fixedEnergy                -> GPUThermo->calculateEnthalpyGPU(...)
+//   cyclic                     -> correct_boundary_conditions_cyclic_scalar, with the
+//                                 neighbour patch's offset taken from patchSizeOffset
+//   anything else              -> message on stderr, patch skipped
+//
+// Empty patches are skipped. `offset` is the patch's first face in the
+// boundary arrays; processor-type patches advance it by twice the patch size
+// (patchNeighbourFields and patchInternalFields), all others by the patch
+// size. `gradient_offset` indexes thermo_gradient and only advances over
+// gradientEnergy patches.
+void correct_boundary_conditions_scalar(cudaStream_t stream, ncclComm_t comm,
+        const int *neighbor_peer, int num_boundary_surfaces, int num_patches,
+        const int *patch_size, const int *patch_type, const double *boundary_delta_coeffs, 
+        const int *boundary_cell_face, const double *vf, double *boundary_vf,
+        const int *cyclicNeighbor, const int *patchSizeOffset, const double *boundary_weight,
+        const double *boundary_T, const double *boundary_y, 
+        const double *thermo_gradient, dfThermo *GPUThermo)
+{
+    size_t threads_per_block = 256;
+    size_t blocks_per_grid = 1;
+
+    int offset = 0;
+    int gradient_offset = 0;
+    for (int patch = 0; patch < num_patches; ++patch) {
+        if (patch_size[patch] == 0)
+            continue;
+
+        blocks_per_grid = (patch_size[patch] + threads_per_block - 1) / threads_per_block;
+
+        const bool is_processor = patch_type[patch] == boundaryConditions::processor
+                || patch_type[patch] == boundaryConditions::processorCyclic;
+
+        if (is_processor) {
+            correct_boundary_conditions_processor_scalar(stream, comm, neighbor_peer[patch], patch_size[patch], offset,
+                    vf, boundary_cell_face, boundary_vf);
+            offset += 2 * patch_size[patch]; // patchNeighbourFields and patchInternalFields
+            continue;
+        }
+
+        if (patch_type[patch] == boundaryConditions::zeroGradient
+                || patch_type[patch] == boundaryConditions::extrapolated) {
+            correct_boundary_conditions_zeroGradient_scalar<<<blocks_per_grid, threads_per_block, 0, stream>>>(patch_size[patch], offset,
+                    vf, boundary_cell_face, boundary_vf);
+        } else if (patch_type[patch] == boundaryConditions::fixedValue
+                || patch_type[patch] == boundaryConditions::calculated) {
+            // boundary values are left untouched for these types
+        } else if (patch_type[patch] == boundaryConditions::gradientEnergy) {
+            correct_boundary_conditions_gradientEnergy_scalar<<<blocks_per_grid, threads_per_block, 0, stream>>>(patch_size[patch], offset,
+                    gradient_offset, vf, boundary_cell_face, thermo_gradient, boundary_delta_coeffs, boundary_vf);
+            gradient_offset += patch_size[patch];
+        } else if (patch_type[patch] == boundaryConditions::fixedEnergy) {
+            GPUThermo->calculateEnthalpyGPU(threads_per_block, patch_size[patch], num_boundary_surfaces, boundary_T, boundary_vf, boundary_y, offset);
+        } else if (patch_type[patch] == boundaryConditions::cyclic) {
+            correct_boundary_conditions_cyclic_scalar<<<blocks_per_grid, threads_per_block, 0, stream>>>(patch_size[patch], offset,
+                    patchSizeOffset[cyclicNeighbor[patch]], vf, boundary_cell_face, boundary_weight, boundary_vf);
+        } else {
+            fprintf(stderr, "%s %d, boundaryConditions %d are not support yet!\n", __FILE__, __LINE__, patch_type[patch]);
+        }
+        offset += patch_size[patch];
+    }
+}
+
+// Host routine: refresh the boundary values of a vector field patch by patch.
+// All device work is queued on `stream` with 256 threads per block.
+//
+// Per patch type:
+//   zeroGradient, extrapolated -> correct_boundary_conditions_zeroGradient_vector
+//   fixedValue, calculated     -> nothing to do
+//   processor, processorCyclic -> correct_boundary_conditions_processor_vector
+//                                 (host helper; receives `comm` and the peer)
+//   cyclic                     -> correct_boundary_conditions_cyclic_vector, with the
+//                                 neighbour patch's offset taken from patchSizeOffset
+//   anything else              -> reported on stderr with its numeric patch
+//                                 type, patch skipped
+//
+// Empty patches are skipped. `offset` is the patch's first face in the
+// boundary arrays; processor-type patches advance it by twice the patch size
+// (patchNeighbourFields and patchInternalFields), all others by the patch size.
+void correct_boundary_conditions_vector(cudaStream_t stream, ncclComm_t comm,
+        const int *neighbor_peer, int num_boundary_surfaces, int num_cells, int num_patches,
+        const int *patch_size, const int *patch_type, const double *boundary_weight, 
+        const int *boundary_cell_face, const double *vf, double *boundary_vf,
+        const int *cyclicNeighbor, const int *patchSizeOffset)
+{
+    const size_t threads_per_block = 256;
+
+    int offset = 0;
+    for (int i = 0; i < num_patches; i++) {
+        if (patch_size[i] == 0) continue;
+        const size_t blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block;
+
+        const bool is_processor = patch_type[i] == boundaryConditions::processor
+                || patch_type[i] == boundaryConditions::processorCyclic;
+
+        if (patch_type[i] == boundaryConditions::zeroGradient
+                || patch_type[i] == boundaryConditions::extrapolated) {
+            correct_boundary_conditions_zeroGradient_vector<<<blocks_per_grid, threads_per_block, 0, stream>>>(patch_size[i], offset,
+                    num_boundary_surfaces, num_cells, vf, boundary_cell_face, boundary_vf);
+        } else if (patch_type[i] == boundaryConditions::fixedValue
+                    || patch_type[i] == boundaryConditions::calculated) {
+            // No operation needed in this condition
+        } else if (is_processor) {
+            correct_boundary_conditions_processor_vector(stream, comm, neighbor_peer[i], patch_size[i], offset,
+                    num_boundary_surfaces, num_cells, vf, boundary_cell_face, boundary_vf);
+        } else if (patch_type[i] == boundaryConditions::cyclic) {
+            correct_boundary_conditions_cyclic_vector<<<blocks_per_grid, threads_per_block, 0, stream>>>(patch_size[i], offset,
+                    patchSizeOffset[cyclicNeighbor[i]], num_boundary_surfaces, num_cells, boundary_weight, vf, boundary_cell_face, boundary_vf);
+        } else {
+            // report the actual unsupported type instead of a generic message
+            fprintf(stderr, "%s %d, boundaryConditions %d are not support yet!\n", __FILE__, __LINE__, patch_type[i]);
+        }
+
+        // processor-type patches store patchNeighbourFields and patchInternalFields
+        offset += is_processor ? 2 * patch_size[i] : patch_size[i];
+    }
+}
+
+// Host routine: update the value/gradient internal and boundary coefficients
+// of a vector field, patch by patch, by launching the kernel that matches each
+// patch type on `stream` (256 threads per block).
+//
+// Handled patch types:
+//   zeroGradient                       -> update_boundary_coeffs_zeroGradient_vector
+//   fixedValue                         -> update_boundary_coeffs_fixedValue_vector
+//   cyclic, processor, processorCyclic -> update_boundary_coeffs_processor_vector
+// Any other type is reported on stderr together with its numeric patch type
+// and skipped (its faces still advance the offset).
+//
+// Empty patches are skipped. `offset` is the index of the patch's first face
+// in the boundary arrays; processor and processorCyclic patches advance it by
+// twice the patch size, all others by the patch size.
+void update_boundary_coeffs_vector(cudaStream_t stream, int num_boundary_surfaces, int num_patches,
+        const int *patch_size, const int *patch_type, const double *boundary_vf, 
+        const double *boundary_deltaCoeffs, const double *boundary_weight,
+        double *value_internal_coeffs, double *value_boundary_coeffs,
+        double *gradient_internal_coeffs, double *gradient_boundary_coeffs)
+{
+    const size_t threads_per_block = 256;
+
+    int offset = 0;
+    for (int i = 0; i < num_patches; i++) {
+        if (patch_size[i] == 0) continue;
+        const size_t blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block;
+
+        const bool is_processor = patch_type[i] == boundaryConditions::processor
+                || patch_type[i] == boundaryConditions::processorCyclic;
+
+        // TODO: just basic patch type now
+        // TODO: just vector version now
+        if (patch_type[i] == boundaryConditions::zeroGradient) {
+            update_boundary_coeffs_zeroGradient_vector<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_surfaces, patch_size[i], offset,
+                    value_internal_coeffs, value_boundary_coeffs, gradient_internal_coeffs, gradient_boundary_coeffs);
+        } else if (patch_type[i] == boundaryConditions::fixedValue) {
+            update_boundary_coeffs_fixedValue_vector<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_surfaces, patch_size[i], offset,
+                    boundary_vf, boundary_deltaCoeffs, value_internal_coeffs, value_boundary_coeffs, gradient_internal_coeffs, gradient_boundary_coeffs);
+        } else if (patch_type[i] == boundaryConditions::cyclic || is_processor) {
+            // cyclic and processor-type patches share the same coefficient kernel
+            update_boundary_coeffs_processor_vector<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_surfaces, patch_size[i], offset,
+                    boundary_weight, boundary_deltaCoeffs, value_internal_coeffs, value_boundary_coeffs, gradient_internal_coeffs, gradient_boundary_coeffs);
+        } else {
+            // report the actual unsupported type instead of a generic message
+            fprintf(stderr, "%s %d, boundaryConditions %d are not support yet!\n", __FILE__, __LINE__, patch_type[i]);
+        }
+
+        // processor-type patches occupy two slots per face in the boundary arrays
+        offset += is_processor ? 2 * patch_size[i] : patch_size[i];
+    }
+}
+
+// Host wrapper: launches compute_upwind_weight_internal on `stream` with
+// 1024-thread blocks and ceil(num_surfaces / 1024) blocks. Only the
+// internal-face weights are computed; no boundary kernel is launched.
+void compute_upwind_weight(cudaStream_t stream, int num_surfaces, const double *phi, double *weight)
+{
+    const size_t block_size = 1024;
+    const size_t grid_size = (num_surfaces + block_size - 1) / block_size;
+    compute_upwind_weight_internal<<<grid_size, block_size, 0, stream>>>(num_surfaces, phi, weight);
+}
+
+// Host routine: two-stage computation queued on `stream`.
+//   1. fvc_grad_cell_scalar(...) is called with `output` as its destination,
+//      so `output` first holds the cell gradient of vf.
+//   2. compute_limiter_phi_internal is launched over the internal faces
+//      (1024 threads per block) with `output` passed for both of its last
+//      two arguments, so the gradient buffer is overwritten in place by the
+//      limiter result.
+//
+// The boundary limiter and the final weight computation are currently
+// disabled; the disabled code is kept below for reference.
+// Parameters that are not used by the active code: comm, neighbor_peer,
+// boundary_mag_Sf, boundary_phi, cyclicNeighbor, patchSizeOffset,
+// boundary_deltaCoeffs.
+void compute_limitedLinear_weight(cudaStream_t stream, ncclComm_t comm, const int *neighbor_peer, 
+        int num_surfaces, int num_cells, int num_boundary_surfaces,
+        const int *lowerAddr, const int *upperAddr, const double *mesh_distance, 
+        const double *weight, const double *Sf, const double *vf, const double *phi,  double *output, // end for internal
+        int num_patches, const int *patch_size, const int *patch_type, const double *boundary_weight,
+        const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf,
+        const double *volume, const double *boundary_mag_Sf, const double *boundary_phi, 
+        // const double *boundary_distance, double *boundary_output, 
+        const int *cyclicNeighbor, const int *patchSizeOffset,
+        const double *boundary_deltaCoeffs)
+{
+    // stage 1: fvc::grad(vf) into output
+    // (disabled alternative that also handled boundary conditions:)
+    // fvc_grad_cell_scalar_withBC(stream, comm, neighbor_peer, num_cells, num_surfaces, num_boundary_surfaces,
+    //         lowerAddr, upperAddr, weight, Sf, vf, output, num_patches, patch_size, patch_type, boundary_weight,
+    //         boundary_cell_face, boundary_vf, boundary_Sf, volume, boundary_mag_Sf, boundary_output,
+    //         cyclicNeighbor, patchSizeOffset, boundary_deltaCoeffs);
+    fvc_grad_cell_scalar(stream, num_cells, num_surfaces, num_boundary_surfaces, lowerAddr, upperAddr,
+            weight, Sf, vf, output, num_patches, patch_size, patch_type, boundary_weight,
+            boundary_cell_face, boundary_vf, boundary_Sf, volume, true);
+
+    // stage 2: limiter on internal faces, written over output
+    const size_t block_size = 1024;
+    const size_t grid_size = (num_surfaces + block_size - 1) / block_size;
+    compute_limiter_phi_internal<<<grid_size, block_size, 0, stream>>>(num_cells, num_surfaces, vf,
+            lowerAddr, upperAddr, mesh_distance, phi, weight, output, output);
+
+    // ---- disabled: boundary limiter ----
+    // NOTE(review): if this is re-enabled, cudaMemset fills bytes, so the
+    // call below would not set the doubles to 1.0.
+    // int offset = 0;
+    // for (int i = 0; i < num_patches; i++) {
+    //     if (patch_size[i] == 0) continue;
+    //     if (patch_type[i] == boundaryConditions::processor
+    //         || patch_type[i] == boundaryConditions::processorCyclic) {
+    //         threads_per_block = 256;
+    //         blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block;
+    //         compute_limiter_phi_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(patch_size[i], offset,
+    //                 num_boundary_surfaces, boundary_weight, boundary_vf, boundary_output, boundary_distance, 
+    //                 boundary_phi, boundary_output);
+    //         offset += 2 * patch_size[i];
+    //     } else {
+    //         cudaMemset(boundary_output + offset, 1., patch_size[i] * sizeof(double));
+    //         offset += patch_size[i];
+    //     }
+    // }
+    // ---- disabled: calculate weight ----
+    // threads_per_block = 1024;
+    // blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block;
+    // compute_limiter_weight_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_surfaces, phi, 
+    //         weight, output, output);
+    // offset = 0;
+    // for (int i = 0; i < num_patches; i++) {
+    //     if (patch_size[i] == 0) continue;
+    //     threads_per_block = 256;
+    //     blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block;
+    //     compute_limiter_weight_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(patch_size[i], offset,
+    //             num_boundary_surfaces, boundary_weight, boundary_phi, boundary_output, boundary_output);
+    //     if (patch_type[i] == boundaryConditions::processor
+    //         || patch_type[i] == boundaryConditions::processorCyclic) {
+    //         offset += 2 * patch_size[i];
+    //     } else {
+    //         offset += patch_size[i];
+    //     }
+    // }
+}
+
+// Host launcher for fvm_ddt_vol_scalar_vol_scalar_kernel.
+// Enqueues the kernel on `stream` with 1024-thread blocks, using enough blocks to
+// cover num_cells threads; every argument is forwarded to the kernel unchanged.
+void fvm_ddt_vol_scalar_vol_scalar(cudaStream_t stream, int num_cells, double rDeltaT,
+        const double *rho, const double *rho_old, const double *vf, const double *volume,
+        double *diag, double *source, double sign)
+{
+    const size_t block_size = 1024;
+    const size_t grid_size = (num_cells + block_size - 1) / block_size; // ceil(num_cells / block_size)
+    fvm_ddt_vol_scalar_vol_scalar_kernel<<<grid_size, block_size, 0, stream>>>(
+            num_cells, rDeltaT, rho, rho_old, vf, volume, diag, source, sign);
+}
+
+// Host launcher for fvm_ddt_scalar_kernel.
+// Enqueues the kernel on `stream` with 1024-thread blocks, using enough blocks to
+// cover num_cells threads; every argument is forwarded to the kernel unchanged.
+void fvm_ddt_scalar(cudaStream_t stream, int num_cells, double rDeltaT,
+        const double *vf_old, const double *volume,
+        double *diag, double *source, double sign)
+{
+    const size_t block_size = 1024;
+    const size_t grid_size = (num_cells + block_size - 1) / block_size; // ceil(num_cells / block_size)
+    fvm_ddt_scalar_kernel<<<grid_size, block_size, 0, stream>>>(
+            num_cells, rDeltaT, vf_old, volume, diag, source, sign);
+}
+
+// Host launcher for fvm_ddt_vector_kernel.
+// Enqueues the kernel on `stream` with 64-thread blocks, using enough blocks to
+// cover num_cells threads; every argument is forwarded to the kernel unchanged.
+void fvm_ddt_vector(cudaStream_t stream, int num_cells, double rDeltaT,
+        const double *rho, const double *rho_old, const double *vf, const double *volume,
+        double *diag, double *source, double sign)
+{
+    const size_t block_size = 64;
+    const size_t grid_size = (num_cells + block_size - 1) / block_size; // ceil(num_cells / block_size)
+    fvm_ddt_vector_kernel<<<grid_size, block_size, 0, stream>>>(
+            num_cells, rDeltaT, rho, rho_old, vf, volume, diag, source, sign);
+}
+
+// Host launcher for the fvm_div_scalar kernels, all enqueued on `stream`.
+//   1. fvm_div_scalar_internal over num_surfaces (1024-thread blocks).
+//   2. fvm_div_scalar_boundary once per non-empty patch (256-thread blocks).
+// patch_size / patch_type are read on the host. The running boundary offset grows by
+// 2 * patch_size for processor / processorCyclic patches and by patch_size otherwise.
+void fvm_div_scalar(cudaStream_t stream, int num_surfaces, const int *lowerAddr, const int *upperAddr,
+        const double *phi, const double *weight,
+        double *lower, double *upper, double *diag, // end for internal
+        int num_patches, const int *patch_size, const int *patch_type,
+        const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs,
+        double *internal_coeffs, double *boundary_coeffs, double sign)
+{
+    const size_t internal_block = 1024;
+    const size_t internal_grid = (num_surfaces + internal_block - 1) / internal_block;
+    fvm_div_scalar_internal<<<internal_grid, internal_block, 0, stream>>>(
+            num_surfaces, lowerAddr, upperAddr, phi, weight, lower, upper, diag, sign);
+
+    const size_t patch_block = 256;
+    int patch_start = 0; // offset of the current patch inside the boundary arrays
+    for (int p = 0; p < num_patches; ++p) {
+        const int faces = patch_size[p];
+        if (faces == 0) {
+            continue; // empty patch: nothing to launch, offset unchanged
+        }
+        const size_t patch_grid = (faces + patch_block - 1) / patch_block;
+        // TODO: just basic patch type now
+        fvm_div_scalar_boundary<<<patch_grid, patch_block, 0, stream>>>(faces, patch_start,
+                boundary_phi, value_internal_coeffs, value_boundary_coeffs,
+                internal_coeffs, boundary_coeffs, sign);
+
+        const bool is_processor = patch_type[p] == boundaryConditions::processor
+                || patch_type[p] == boundaryConditions::processorCyclic;
+        patch_start += is_processor ? 2 * faces : faces;
+    }
+}
+
+// Host launcher for the fvm_div_vector kernels, all enqueued on `stream`.
+//   1. fvm_div_vector_internal over num_surfaces (256-thread blocks).
+//   2. fvm_div_vector_boundary once per non-empty patch (64-thread blocks).
+// patch_size / patch_type are read on the host. The running boundary offset grows by
+// 2 * patch_size for processor / processorCyclic patches and by patch_size otherwise.
+void fvm_div_vector(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces,
+        const int *lowerAddr, const int *upperAddr,
+        const double *phi, const double *weight,
+        double *lower, double *upper, double *diag, // end for internal
+        int num_patches, const int *patch_size, const int *patch_type,
+        const double *boundary_phi, const double *value_internal_coeffs, const double *value_boundary_coeffs,
+        double *internal_coeffs, double *boundary_coeffs, double sign)
+{
+    const size_t internal_block = 256;
+    const size_t internal_grid = (num_surfaces + internal_block - 1) / internal_block;
+    fvm_div_vector_internal<<<internal_grid, internal_block, 0, stream>>>(
+            num_surfaces, lowerAddr, upperAddr, phi, weight, lower, upper, diag, sign);
+
+    const size_t patch_block = 64;
+    int patch_start = 0; // offset of the current patch inside the boundary arrays
+    for (int p = 0; p < num_patches; ++p) {
+        const int faces = patch_size[p];
+        if (faces == 0) {
+            continue; // empty patch: nothing to launch, offset unchanged
+        }
+        const size_t patch_grid = (faces + patch_block - 1) / patch_block;
+        fvm_div_vector_boundary<<<patch_grid, patch_block, 0, stream>>>(
+                num_boundary_surfaces, faces, patch_start,
+                boundary_phi, value_internal_coeffs, value_boundary_coeffs,
+                internal_coeffs, boundary_coeffs, sign);
+
+        if (patch_type[p] == boundaryConditions::processor
+                || patch_type[p] == boundaryConditions::processorCyclic) {
+            patch_start += 2 * faces;
+        } else {
+            patch_start += faces;
+        }
+    }
+}
+
+// Host launcher for the scalar fvm laplacian kernels, all enqueued on `stream`.
+//   1. fvm_laplacian_internal over num_surfaces (1024-thread blocks).
+//   2. fvm_laplacian_scalar_boundary once per non-empty patch (256-thread blocks).
+// The running boundary offset grows by 2 * patch_size for processor / processorCyclic
+// patches and by patch_size otherwise. num_boundary_surfaces is not referenced here.
+void fvm_laplacian_scalar(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces,
+        const int *lowerAddr, const int *upperAddr,
+        const double *weight, const double *mag_sf, const double *delta_coeffs, const double *gamma,
+        double *lower, double *upper, double *diag, // end for internal
+        int num_patches, const int *patch_size, const int *patch_type,
+        const double *boundary_mag_sf, const double *boundary_gamma,
+        const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs,
+        double *internal_coeffs, double *boundary_coeffs, double sign)
+{
+    const size_t internal_block = 1024;
+    const size_t internal_grid = (num_surfaces + internal_block - 1) / internal_block;
+    fvm_laplacian_internal<<<internal_grid, internal_block, 0, stream>>>(
+            num_surfaces, lowerAddr, upperAddr,
+            weight, mag_sf, delta_coeffs, gamma, lower, upper, diag, sign);
+
+    const size_t patch_block = 256;
+    int patch_start = 0; // offset of the current patch inside the boundary arrays
+    for (int p = 0; p < num_patches; ++p) {
+        const int faces = patch_size[p];
+        if (faces == 0) {
+            continue; // empty patch: nothing to launch, offset unchanged
+        }
+        const size_t patch_grid = (faces + patch_block - 1) / patch_block;
+        fvm_laplacian_scalar_boundary<<<patch_grid, patch_block, 0, stream>>>(faces, patch_start,
+                boundary_mag_sf, boundary_gamma, gradient_internal_coeffs, gradient_boundary_coeffs,
+                internal_coeffs, boundary_coeffs, sign);
+
+        if (patch_type[p] == boundaryConditions::processor
+                || patch_type[p] == boundaryConditions::processorCyclic) {
+            patch_start += 2 * faces;
+        } else {
+            patch_start += faces;
+        }
+    }
+}
+
+// Host launcher for the vector fvm laplacian kernels, all enqueued on `stream`.
+//   1. fvm_laplacian_internal over num_surfaces (1024-thread blocks).
+//   2. fvm_laplacian_vector_boundary once per non-empty patch (256-thread blocks).
+// The running boundary offset grows by 2 * patch_size for processor / processorCyclic
+// patches and by patch_size otherwise.
+void fvm_laplacian_vector(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces,
+        const int *lowerAddr, const int *upperAddr,
+        const double *weight, const double *mag_sf, const double *delta_coeffs, const double *gamma,
+        double *lower, double *upper, double *diag, // end for internal
+        int num_patches, const int *patch_size, const int *patch_type,
+        const double *boundary_mag_sf, const double *boundary_gamma,
+        const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs,
+        double *internal_coeffs, double *boundary_coeffs, double sign)
+{
+    const size_t internal_block = 1024;
+    const size_t internal_grid = (num_surfaces + internal_block - 1) / internal_block;
+    fvm_laplacian_internal<<<internal_grid, internal_block, 0, stream>>>(
+            num_surfaces, lowerAddr, upperAddr,
+            weight, mag_sf, delta_coeffs, gamma, lower, upper, diag, sign);
+
+    const size_t patch_block = 256;
+    int patch_start = 0; // offset of the current patch inside the boundary arrays
+    for (int p = 0; p < num_patches; ++p) {
+        const int faces = patch_size[p];
+        if (faces == 0) {
+            continue; // empty patch: nothing to launch, offset unchanged
+        }
+        const size_t patch_grid = (faces + patch_block - 1) / patch_block;
+        // TODO: just basic patch type now
+        fvm_laplacian_vector_boundary<<<patch_grid, patch_block, 0, stream>>>(
+                num_boundary_surfaces, faces, patch_start,
+                boundary_mag_sf, boundary_gamma, gradient_internal_coeffs, gradient_boundary_coeffs,
+                internal_coeffs, boundary_coeffs, sign);
+
+        if (patch_type[p] == boundaryConditions::processor
+                || patch_type[p] == boundaryConditions::processorCyclic) {
+            patch_start += 2 * faces;
+        } else {
+            patch_start += faces;
+        }
+    }
+}
+
+// Host launcher for the fvm laplacian kernels that take a surface-scalar gamma,
+// all enqueued on `stream`.
+//   1. fvm_laplacian_surface_scalar_internal over num_surfaces (1024-thread blocks).
+//   2. fvm_laplacian_surface_scalar_boundary once per non-empty patch (256-thread blocks).
+// The running boundary offset grows by 2 * patch_size for processor / processorCyclic
+// patches and by patch_size otherwise. num_boundary_surfaces is not referenced here.
+void fvm_laplacian_surface_scalar_vol_scalar(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces,
+        const int *lowerAddr, const int *upperAddr,
+        const double *mag_sf, const double *delta_coeffs, const double *gamma,
+        double *lower, double *upper, double *diag, // end for internal
+        int num_patches, const int *patch_size, const int *patch_type,
+        const double *boundary_mag_sf, const double *boundary_gamma,
+        const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs,
+        double *internal_coeffs, double *boundary_coeffs, double sign)
+{
+    const size_t internal_block = 1024;
+    const size_t internal_grid = (num_surfaces + internal_block - 1) / internal_block;
+    fvm_laplacian_surface_scalar_internal<<<internal_grid, internal_block, 0, stream>>>(
+            num_surfaces, lowerAddr, upperAddr,
+            mag_sf, delta_coeffs, gamma, lower, upper, diag, sign);
+
+    const size_t patch_block = 256;
+    int patch_start = 0; // offset of the current patch inside the boundary arrays
+    for (int p = 0; p < num_patches; ++p) {
+        const int faces = patch_size[p];
+        if (faces == 0) {
+            continue; // empty patch: nothing to launch, offset unchanged
+        }
+        const size_t patch_grid = (faces + patch_block - 1) / patch_block;
+        fvm_laplacian_surface_scalar_boundary<<<patch_grid, patch_block, 0, stream>>>(faces, patch_start,
+                boundary_mag_sf, boundary_gamma, gradient_internal_coeffs, gradient_boundary_coeffs,
+                internal_coeffs, boundary_coeffs, sign);
+
+        const bool is_processor = patch_type[p] == boundaryConditions::processor
+                || patch_type[p] == boundaryConditions::processorCyclic;
+        patch_start += is_processor ? 2 * faces : faces;
+    }
+}
+
+// Host launcher for fvc_ddt_vol_scalar_vol_scalar_kernel.
+// Enqueues the kernel on `stream` with 1024-thread blocks, using enough blocks to
+// cover num_cells threads; every argument is forwarded to the kernel unchanged.
+void fvc_ddt_vol_scalar_vol_scalar(cudaStream_t stream, int num_cells, double rDeltaT,
+        const double *rho, const double *rho_old, const double *vf, const double *vf_old, const double *volume,
+        double *output, double sign)
+{
+    const size_t block_size = 1024;
+    const size_t grid_size = (num_cells + block_size - 1) / block_size; // ceil(num_cells / block_size)
+    fvc_ddt_vol_scalar_vol_scalar_kernel<<<grid_size, block_size, 0, stream>>>(
+            num_cells, rDeltaT, rho, rho_old, vf, vf_old, volume, output, sign);
+}
+
+// Host-side driver for the vector-gradient kernels; all work is enqueued on `stream`.
+// Steps, in order:
+//   1. fvc_grad_vector_internal over num_surfaces;
+//   2. one boundary kernel per non-empty patch, selected by patch_type;
+//   3. divide_cell_volume_tsr over num_cells;
+//   4. one boundary-condition correction per non-empty patch, writing boundary_output
+//      (processor / processorCyclic patches go through fvc_grad_vector_correctBC_processor,
+//      which receives `comm` and neighbor_peer[i]).
+// patch_size, patch_type, neighbor_peer, cyclicNeighbor and patchSizeOffset are indexed on
+// the host. The running `offset` advances by 2 * patch_size for processor-type patches and
+// by patch_size for every other patch. `sign` is not referenced in this function body.
+void fvc_grad_vector(cudaStream_t stream, ncclComm_t comm, 
+        int num_cells, int num_surfaces, int num_boundary_surfaces,
+        const int *neighbor_peer, const int *lowerAddr, const int *upperAddr, 
+        const double *weight, const double *Sf, const double *vf, double *output, // end for internal
+        int num_patches, const int *patch_size, const int *patch_type,
+        const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *boundary_weight, 
+        const double *volume, const double *boundary_mag_Sf, double *boundary_output,
+        const int *cyclicNeighbor, const int *patchSizeOffset,
+        const double *boundary_deltaCoeffs, double sign)
+{
+    // internal faces: 32-thread blocks, enough blocks to cover num_surfaces threads
+    size_t threads_per_block = 32;
+    size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block;
+    fvc_grad_vector_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_surfaces, lowerAddr, upperAddr,
+            Sf, weight, vf, output);
+    
+    int offset = 0;
+    // finish constructing the grad field, except for dividing by cell volume
+    for (int i = 0; i < num_patches; i++) {
+        if (patch_size[i] == 0) continue;
+        threads_per_block = 64;
+        blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block;
+        // TODO: just basic patch type now
+        if (patch_type[i] == boundaryConditions::zeroGradient
+                || patch_type[i] == boundaryConditions::fixedValue
+                || patch_type[i] == boundaryConditions::calculated
+                || patch_type[i] == boundaryConditions::cyclic) {
+            // TODO: just vector version now
+            fvc_grad_vector_boundary_zeroGradient<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_surfaces, num_cells, 
+                    patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_vf, output);
+        } else if (patch_type[i] == boundaryConditions::processor
+                    || patch_type[i] == boundaryConditions::processorCyclic) {
+            fvc_grad_vector_boundary_processor<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_surfaces, num_cells, 
+                    patch_size[i], offset, boundary_cell_face, boundary_Sf, boundary_weight, boundary_vf, output);
+            offset += 2 * patch_size[i]; // patchNeighbourFields and patchInternalFields
+            continue;
+        } else {
+            // unsupported patch type: report it, launch nothing, and still advance the offset below
+            fprintf(stderr, "%s %d, boundaryConditions other than zeroGradient are not support yet!\n", __FILE__, __LINE__);
+        }
+        offset += patch_size[i];
+    }
+
+    // divide cell volume
+    threads_per_block = 512;
+    blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    divide_cell_volume_tsr<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, volume, output);
+
+    // correct boundary conditions
+    // NOTE(review): `calculated` patches are accepted by the first loop above but have no
+    // branch here, so they reach the error branch below -- confirm this is intended.
+    offset = 0;
+    for (int i = 0; i < num_patches; i++) {
+        if (patch_size[i] == 0) continue;
+        threads_per_block = 64;
+        blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block;
+        // TODO: just basic patch type now
+        if (patch_type[i] == boundaryConditions::zeroGradient) {
+            // TODO: just vector version now
+            fvc_grad_vector_correctBC_zeroGradient<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_boundary_surfaces,
+                    patch_size[i], offset, boundary_cell_face,
+                    output, boundary_vf, boundary_Sf, boundary_mag_Sf, boundary_output);
+        } else if (patch_type[i] == boundaryConditions::fixedValue) {
+            // TODO: implement fixedValue version
+            fvc_grad_vector_correctBC_fixedValue<<<blocks_per_grid, threads_per_block, 0, stream>>>(
+                    num_cells, num_boundary_surfaces,
+                    patch_size[i], offset, boundary_cell_face,
+                    output, vf, boundary_Sf, boundary_mag_Sf, boundary_output, boundary_deltaCoeffs, boundary_vf);
+        } else if (patch_type[i] == boundaryConditions::processor) {
+            // host function call (not a direct kernel launch); it is handed the communicator and the peer id
+            fvc_grad_vector_correctBC_processor(stream, comm, neighbor_peer[i], patch_size[i], offset, num_cells, 
+                    num_boundary_surfaces, boundary_cell_face, output, boundary_output);
+            offset += 2 * patch_size[i]; // patchNeighbourFields and patchInternalFields
+            continue;
+        } else if (patch_type[i] == boundaryConditions::processorCyclic) {
+            // same call as the processor branch above
+            fvc_grad_vector_correctBC_processor(stream, comm, neighbor_peer[i], patch_size[i], offset, num_cells, 
+                    num_boundary_surfaces, boundary_cell_face, output, boundary_output);
+            offset += 2 * patch_size[i]; // patchNeighbourFields and patchInternalFields
+            continue;
+        } else if (patch_type[i] == boundaryConditions::cyclic) {
+            // the extra offset argument is looked up through the patch's cyclic neighbour index
+            fvc_grad_vector_correctBC_cyclic<<<blocks_per_grid, threads_per_block, 0, stream>>>(
+                num_cells, num_boundary_surfaces, patch_size[i], offset, patchSizeOffset[cyclicNeighbor[i]],
+                boundary_cell_face, boundary_weight, output, boundary_output);
+        } else {
+            // unsupported patch type: report it, launch nothing, and still advance the offset below
+            fprintf(stderr, "%s %d, boundaryConditions other than zeroGradient are not support yet!\n", __FILE__, __LINE__);
+        }
+        offset += patch_size[i];
+    }
+}
+
+// Host launcher that runs scale_dev2t_tensor_kernel twice on `stream`:
+// once over the num_cells internal entries (vf1, vf2) and once over the
+// num_boundary_surfaces boundary entries (boundary_vf1, boundary_vf2).
+// Both launches use 1024-thread blocks.
+void scale_dev2T_tensor(cudaStream_t stream, int num_cells, const double *vf1, double *vf2,
+        int num_boundary_surfaces, const double *boundary_vf1, double *boundary_vf2)
+{
+    const size_t block_size = 1024;
+
+    const size_t cell_grid = (num_cells + block_size - 1) / block_size;
+    scale_dev2t_tensor_kernel<<<cell_grid, block_size, 0, stream>>>(num_cells, vf1, vf2);
+
+    const size_t boundary_grid = (num_boundary_surfaces + block_size - 1) / block_size;
+    scale_dev2t_tensor_kernel<<<boundary_grid, block_size, 0, stream>>>(
+            num_boundary_surfaces, boundary_vf1, boundary_vf2);
+}
+
+// Host launcher for the fvc div kernels on a surface-scalar field, all enqueued on `stream`.
+//   1. fvc_div_surface_scalar_internal over num_surfaces (1024-thread blocks).
+//   2. fvc_div_surface_scalar_boundary once per non-empty patch (256-thread blocks).
+// The running boundary offset grows by 2 * patch_size for processor / processorCyclic
+// patches and by patch_size otherwise.
+// num_cells, num_boundary_surfaces and volume are not referenced in this function body.
+void fvc_div_surface_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces,
+        const int *lowerAddr, const int *upperAddr, const double *ssf, const int *boundary_cell_face,
+        int num_patches, const int *patch_size, const int *patch_type,
+        const double *boundary_ssf, const double *volume, double *output, double sign)
+{
+    const size_t internal_block = 1024;
+    const size_t internal_grid = (num_surfaces + internal_block - 1) / internal_block;
+    fvc_div_surface_scalar_internal<<<internal_grid, internal_block, 0, stream>>>(
+            num_surfaces, lowerAddr, upperAddr, ssf, output, sign);
+
+    const size_t patch_block = 256;
+    int patch_start = 0; // offset of the current patch inside the boundary arrays
+    for (int p = 0; p < num_patches; ++p) {
+        const int faces = patch_size[p];
+        if (faces == 0) {
+            continue; // empty patch: nothing to launch, offset unchanged
+        }
+        const size_t patch_grid = (faces + patch_block - 1) / patch_block;
+        fvc_div_surface_scalar_boundary<<<patch_grid, patch_block, 0, stream>>>(
+                faces, patch_start, boundary_cell_face, boundary_ssf, output, sign);
+
+        const bool is_processor = patch_type[p] == boundaryConditions::processor
+                || patch_type[p] == boundaryConditions::processorCyclic;
+        patch_start += is_processor ? 2 * faces : faces;
+    }
+}
+
+// Host launcher for the fvc div kernels on a cell vector field, all enqueued on `stream`.
+//   1. fvc_div_cell_vector_internal over num_surfaces (1024-thread blocks).
+//   2. Per non-empty patch (256-thread blocks):
+//        processor / processorCyclic -> fvc_div_cell_vector_boundary_processor, offset += 2 * size
+//        any other type              -> fvc_div_cell_vector_boundary,           offset += size
+// volume is not referenced in this function body.
+void fvc_div_cell_vector(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces,
+        const int *lowerAddr, const int *upperAddr,
+        const double *weight, const double *Sf, const double *vf, double *output, // end for internal
+        int num_patches, const int *patch_size, const int *patch_type, const int *boundary_cell_face,
+        const double *boundary_weight, const double *boundary_vf, const double *boundary_Sf,
+        const double *volume, double sign)
+{
+    const size_t internal_block = 1024;
+    const size_t internal_grid = (num_surfaces + internal_block - 1) / internal_block;
+    fvc_div_cell_vector_internal<<<internal_grid, internal_block, 0, stream>>>(
+            num_cells, num_surfaces, lowerAddr, upperAddr, vf, weight, Sf, output, sign);
+
+    const size_t patch_block = 256;
+    int patch_start = 0; // offset of the current patch inside the boundary arrays
+    for (int p = 0; p < num_patches; ++p) {
+        const int faces = patch_size[p];
+        if (faces == 0) {
+            continue; // empty patch: nothing to launch, offset unchanged
+        }
+        const size_t patch_grid = (faces + patch_block - 1) / patch_block;
+        const bool is_processor = patch_type[p] == boundaryConditions::processor
+                || patch_type[p] == boundaryConditions::processorCyclic;
+        // TODO: just basic patch type now
+        if (!is_processor) {
+            fvc_div_cell_vector_boundary<<<patch_grid, patch_block, 0, stream>>>(
+                    num_boundary_surfaces, faces, patch_start,
+                    boundary_cell_face, boundary_Sf, boundary_vf, output, sign);
+            patch_start += faces;
+        } else {
+            fvc_div_cell_vector_boundary_processor<<<patch_grid, patch_block, 0, stream>>>(
+                    num_boundary_surfaces, faces, patch_start,
+                    boundary_cell_face, boundary_weight, boundary_Sf, boundary_vf, output, sign);
+            patch_start += 2 * faces;
+        }
+    }
+}
+
+// Host launcher for the fvc div kernels on a cell tensor field, all enqueued on `stream`.
+//   1. fvc_div_cell_tensor_internal over num_surfaces (1024-thread blocks).
+//   2. Per non-empty patch (64-thread blocks):
+//        zeroGradient / fixedValue / calculated / cyclic
+//            -> fvc_div_cell_tensor_boundary_zeroGradient, offset += size
+//        processor / processorCyclic
+//            -> fvc_div_cell_tensor_boundary_processor,    offset += 2 * size
+//        anything else
+//            -> message on stderr, no launch,              offset += size
+// volume is not referenced in this function body.
+void fvc_div_cell_tensor(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces,
+        const int *lowerAddr, const int *upperAddr,
+        const double *weight, const double *Sf, const double *vf, double *output, // end for internal
+        int num_patches, const int *patch_size, const int *patch_type, const double *boundary_weight,
+        const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf,
+        const double *volume, double sign)
+{
+    const size_t internal_block = 1024;
+    const size_t internal_grid = (num_surfaces + internal_block - 1) / internal_block;
+    fvc_div_cell_tensor_internal<<<internal_grid, internal_block, 0, stream>>>(
+            num_cells, num_surfaces, lowerAddr, upperAddr, vf, weight, Sf, output, sign);
+
+    const size_t patch_block = 64;
+    int patch_start = 0; // offset of the current patch inside the boundary arrays
+    for (int p = 0; p < num_patches; ++p) {
+        const int faces = patch_size[p];
+        if (faces == 0) {
+            continue; // empty patch: nothing to launch, offset unchanged
+        }
+        const size_t patch_grid = (faces + patch_block - 1) / patch_block;
+
+        const bool is_basic = patch_type[p] == boundaryConditions::zeroGradient
+                || patch_type[p] == boundaryConditions::fixedValue
+                || patch_type[p] == boundaryConditions::calculated
+                || patch_type[p] == boundaryConditions::cyclic;
+        const bool is_processor = patch_type[p] == boundaryConditions::processor
+                || patch_type[p] == boundaryConditions::processorCyclic;
+
+        // TODO: just basic patch type now
+        if (is_basic) {
+            // TODO: just vector version now
+            fvc_div_cell_tensor_boundary_zeroGradient<<<patch_grid, patch_block, 0, stream>>>(
+                    num_cells, num_boundary_surfaces, faces, patch_start, boundary_cell_face,
+                    boundary_Sf, boundary_vf, output, sign);
+            patch_start += faces;
+        } else if (is_processor) {
+            fvc_div_cell_tensor_boundary_processor<<<patch_grid, patch_block, 0, stream>>>(
+                    num_cells, num_boundary_surfaces, faces, patch_start, boundary_cell_face,
+                    boundary_weight, boundary_Sf, boundary_vf, output, sign);
+            patch_start += 2 * faces;
+        } else {
+            // unsupported patch type: report and skip, but keep the offset moving
+            fprintf(stderr, "%s %d, boundaryConditions other than zeroGradient are not support yet!\n", __FILE__, __LINE__);
+            patch_start += faces;
+        }
+    }
+}
+
+// Host launcher for the fvc div kernels taking a surface-scalar field (ssf) together
+// with a vol-scalar field (vf); all work is enqueued on `stream`.
+//   1. fvc_div_surface_scalar_vol_scalar_internal over num_surfaces (1024-thread blocks).
+//   2. fvc_div_surface_scalar_vol_scalar_boundary once per non-empty patch (64-thread blocks).
+// The running boundary offset grows by 2 * patch_size for processor / processorCyclic
+// patches and by patch_size otherwise.
+void fvc_div_surface_scalar_vol_scalar(cudaStream_t stream, int num_surfaces,
+        const int *lowerAddr, const int *upperAddr,
+        const double *weight, const double *vf, const double *ssf, double *output, // end for internal
+        int num_patches, const int *patch_size, const int *patch_type,
+        const int *boundary_cell_face, const double *boundary_vf, const double *boundary_ssf,
+        double sign)
+{
+    const size_t internal_block = 1024;
+    const size_t internal_grid = (num_surfaces + internal_block - 1) / internal_block;
+    fvc_div_surface_scalar_vol_scalar_internal<<<internal_grid, internal_block, 0, stream>>>(
+            num_surfaces, lowerAddr, upperAddr, weight, ssf, vf, output, sign);
+
+    const size_t patch_block = 64;
+    int patch_start = 0; // offset of the current patch inside the boundary arrays
+    for (int p = 0; p < num_patches; ++p) {
+        const int faces = patch_size[p];
+        if (faces == 0) {
+            continue; // empty patch: nothing to launch, offset unchanged
+        }
+        const size_t patch_grid = (faces + patch_block - 1) / patch_block;
+        fvc_div_surface_scalar_vol_scalar_boundary<<<patch_grid, patch_block, 0, stream>>>(
+                faces, patch_start, boundary_cell_face,
+                boundary_vf, boundary_ssf, output, sign);
+
+        if (patch_type[p] == boundaryConditions::processor
+                || patch_type[p] == boundaryConditions::processorCyclic) {
+            patch_start += 2 * faces;
+        } else {
+            patch_start += faces;
+        }
+    }
+}
+
+// Host launcher for the scalar-gradient kernels (overload without the dividVol flag);
+// all work is enqueued on `stream`.
+//   1. fvc_grad_scalar_internal over num_surfaces (1024-thread blocks).
+//   2. Per non-empty patch (64-thread blocks):
+//        zeroGradient / fixedValue / calculated / cyclic
+//            -> fvc_grad_scalar_boundary_zeroGradient, offset += size
+//        processor / processorCyclic
+//            -> fvc_grad_scalar_boundary_processor,    offset += 2 * size
+//        anything else
+//            -> message on stderr, no launch,          offset += size
+// volume is not referenced in this overload.
+void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces,
+        const int *lowerAddr, const int *upperAddr,
+        const double *weight, const double *Sf, const double *vf, double *output, // end for internal
+        int num_patches, const int *patch_size, const int *patch_type, const double *boundary_weight,
+        const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume, double sign)
+{
+    const size_t internal_block = 1024;
+    const size_t internal_grid = (num_surfaces + internal_block - 1) / internal_block;
+    fvc_grad_scalar_internal<<<internal_grid, internal_block, 0, stream>>>(
+            num_cells, num_surfaces, lowerAddr, upperAddr, Sf, weight, vf, output, sign);
+
+    const size_t patch_block = 64;
+    int patch_start = 0; // offset of the current patch inside the boundary arrays
+    for (int p = 0; p < num_patches; ++p) {
+        const int faces = patch_size[p];
+        if (faces == 0) {
+            continue; // empty patch: nothing to launch, offset unchanged
+        }
+        const size_t patch_grid = (faces + patch_block - 1) / patch_block;
+
+        const bool is_basic = patch_type[p] == boundaryConditions::zeroGradient
+                || patch_type[p] == boundaryConditions::fixedValue
+                || patch_type[p] == boundaryConditions::calculated
+                || patch_type[p] == boundaryConditions::cyclic;
+        const bool is_processor = patch_type[p] == boundaryConditions::processor
+                || patch_type[p] == boundaryConditions::processorCyclic;
+
+        // TODO: just non-coupled patch type now
+        if (is_basic) {
+            fvc_grad_scalar_boundary_zeroGradient<<<patch_grid, patch_block, 0, stream>>>(
+                    num_boundary_surfaces, num_cells, faces, patch_start, boundary_cell_face,
+                    boundary_Sf, boundary_vf, output, sign);
+            patch_start += faces;
+        } else if (is_processor) {
+            fvc_grad_scalar_boundary_processor<<<patch_grid, patch_block, 0, stream>>>(
+                    num_boundary_surfaces, num_cells, faces, patch_start, boundary_cell_face,
+                    boundary_Sf, boundary_weight, boundary_vf, output, sign);
+            patch_start += 2 * faces;
+        } else {
+            // unsupported patch type: report and skip, but keep the offset moving
+            fprintf(stderr, "%s %d, boundaryConditions other than zeroGradient are not support yet!\n", __FILE__, __LINE__);
+            patch_start += faces;
+        }
+    }
+}
+
+// Host launcher for the scalar-gradient kernels (overload with the dividVol flag);
+// all work is enqueued on `stream`.
+//   1. fvc_grad_scalar_internal over num_surfaces (1024-thread blocks).
+//   2. Per non-empty patch (64-thread blocks):
+//        zeroGradient / fixedValue / calculated / cyclic
+//            -> fvc_grad_scalar_boundary_zeroGradient, offset += size
+//        processor / processorCyclic
+//            -> fvc_grad_scalar_boundary_processor,    offset += 2 * size
+//        anything else
+//            -> message on stderr, no launch,          offset += size
+//   3. Only when dividVol is true: divide_cell_volume_vec over num_cells (1024-thread blocks).
+void fvc_grad_cell_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces,
+        const int *lowerAddr, const int *upperAddr,
+        const double *weight, const double *Sf, const double *vf, double *output, // end for internal
+        int num_patches, const int *patch_size, const int *patch_type, const double *boundary_weight,
+        const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, const double *volume,
+        bool dividVol, double sign)
+{
+    const size_t wide_block = 1024; // used for the internal-face and the per-cell launches
+    const size_t internal_grid = (num_surfaces + wide_block - 1) / wide_block;
+    fvc_grad_scalar_internal<<<internal_grid, wide_block, 0, stream>>>(
+            num_cells, num_surfaces, lowerAddr, upperAddr, Sf, weight, vf, output, sign);
+
+    const size_t patch_block = 64;
+    int patch_start = 0; // offset of the current patch inside the boundary arrays
+    for (int p = 0; p < num_patches; ++p) {
+        const int faces = patch_size[p];
+        if (faces == 0) {
+            continue; // empty patch: nothing to launch, offset unchanged
+        }
+        const size_t patch_grid = (faces + patch_block - 1) / patch_block;
+
+        const bool is_basic = patch_type[p] == boundaryConditions::zeroGradient
+                || patch_type[p] == boundaryConditions::fixedValue
+                || patch_type[p] == boundaryConditions::calculated
+                || patch_type[p] == boundaryConditions::cyclic;
+        const bool is_processor = patch_type[p] == boundaryConditions::processor
+                || patch_type[p] == boundaryConditions::processorCyclic;
+
+        // TODO: just non-coupled patch type now
+        if (is_basic) {
+            fvc_grad_scalar_boundary_zeroGradient<<<patch_grid, patch_block, 0, stream>>>(
+                    num_boundary_surfaces, num_cells, faces, patch_start, boundary_cell_face,
+                    boundary_Sf, boundary_vf, output, sign);
+            patch_start += faces;
+        } else if (is_processor) {
+            fvc_grad_scalar_boundary_processor<<<patch_grid, patch_block, 0, stream>>>(
+                    num_boundary_surfaces, num_cells, faces, patch_start, boundary_cell_face,
+                    boundary_Sf, boundary_weight, boundary_vf, output, sign);
+            patch_start += 2 * faces;
+        } else {
+            // unsupported patch type: report and skip, but keep the offset moving
+            fprintf(stderr, "%s %d, boundaryConditions other than zeroGradient are not support yet!\n", __FILE__, __LINE__);
+            patch_start += faces;
+        }
+    }
+
+    if (!dividVol) {
+        return; // caller did not ask for the per-cell volume division
+    }
+    const size_t cell_grid = (num_cells + wide_block - 1) / wide_block;
+    divide_cell_volume_vec<<<cell_grid, wide_block, 0, stream>>>(num_cells, volume, output);
+}
+
+// Host-side driver: scalar gradient followed by boundary-condition correction; all work is
+// enqueued on `stream`.
+// Steps, in order:
+//   1. fvc_grad_cell_scalar (the overload without a dividVol flag) with sign = 1.;
+//   2. divide_cell_volume_vec over num_cells;
+//   3. one boundary-condition correction per non-empty patch, writing boundary_output
+//      (processor / processorCyclic patches go through
+//      correct_boundary_conditions_processor_vector, which receives `comm` and neighbor_peer[i]).
+// patch_size, patch_type, neighbor_peer, cyclicNeighbor and patchSizeOffset are indexed on
+// the host. The running `offset` advances by 2 * patch_size for processor-type patches and
+// by patch_size for every other patch.
+void fvc_grad_cell_scalar_withBC(cudaStream_t stream, ncclComm_t comm, const int *neighbor_peer,
+        int num_cells, int num_surfaces, int num_boundary_surfaces,
+        const int *lowerAddr, const int *upperAddr,
+        const double *weight, const double *Sf, const double *vf, double *output, // end for internal
+        int num_patches, const int *patch_size, const int *patch_type, const double *boundary_weight,
+        const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf,
+        const double *volume, const double *boundary_mag_Sf, double *boundary_output,
+        const int *cyclicNeighbor, const int *patchSizeOffset,
+        const double *boundary_deltaCoeffs)
+{
+    fvc_grad_cell_scalar(stream, num_cells, num_surfaces, num_boundary_surfaces, lowerAddr, upperAddr, weight, Sf, vf, output,
+            num_patches, patch_size, patch_type, boundary_weight, boundary_cell_face, boundary_vf, boundary_Sf, volume, 1.); // volume is not used by this overload
+
+    // the volume division is done here, after the call above has queued its kernels
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
+    divide_cell_volume_vec<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, volume, output);
+
+    // correct boundary conditions
+    int offset = 0;
+    for (int i = 0; i < num_patches; i++) {
+        if (patch_size[i] == 0) continue;
+        threads_per_block = 256;
+        blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block;
+        // TODO: just basic patch type now
+        if (patch_type[i] == boundaryConditions::zeroGradient) {
+            // TODO: just vector version now
+            fvc_grad_cell_scalar_correctBC_zeroGradient<<<blocks_per_grid, threads_per_block, 0, stream>>>(
+                    num_cells, num_boundary_surfaces,
+                    patch_size[i], offset, boundary_cell_face,
+                    output, boundary_Sf, boundary_mag_Sf, boundary_output);
+        } else if (patch_type[i] == boundaryConditions::fixedValue
+                || patch_type[i] == boundaryConditions::calculated) {
+            // calculated patches share the fixedValue correction kernel
+            fvc_grad_cell_scalar_correctBC_fixedValue<<<blocks_per_grid, threads_per_block, 0, stream>>>(
+                    num_cells, num_boundary_surfaces,
+                    patch_size[i], offset, boundary_cell_face,
+                    output, vf, boundary_Sf, boundary_mag_Sf, boundary_output, boundary_deltaCoeffs, boundary_vf);
+        } else if (patch_type[i] == boundaryConditions::processor) {
+            // host function call (not a direct kernel launch); it is handed the communicator and the peer id
+            correct_boundary_conditions_processor_vector(stream, comm, neighbor_peer[i], patch_size[i], offset, 
+                    num_boundary_surfaces, num_cells, output, boundary_cell_face, boundary_output);
+            offset += 2 * patch_size[i]; // patchNeighbourFields and patchInternalFields
+            continue;
+        } else if (patch_type[i] == boundaryConditions::processorCyclic) {
+            // same call as the processor branch above
+            correct_boundary_conditions_processor_vector(stream, comm, neighbor_peer[i], patch_size[i], offset, 
+                    num_boundary_surfaces, num_cells, output, boundary_cell_face, boundary_output);
+            offset += 2 * patch_size[i]; // patchNeighbourFields and patchInternalFields
+            continue;
+        } else if (patch_type[i] == boundaryConditions::cyclic) {
+            // the extra offset argument is looked up through the patch's cyclic neighbour index
+            fvc_grad_cell_scalar_correctBC_cyclic<<<blocks_per_grid, threads_per_block, 0, stream>>>(
+                    num_cells, num_boundary_surfaces, patch_size[i], offset, patchSizeOffset[cyclicNeighbor[i]],
+                    boundary_cell_face, boundary_weight, output, boundary_output);
+        } else {
+            // unsupported patch type: report it, launch nothing, and still advance the offset below
+            fprintf(stderr, "%s %d, boundaryConditions other than zeroGradient are not support yet!\n", __FILE__, __LINE__);
+        }
+        offset += patch_size[i];
+    }
+
+}
+
+// Host launcher for the explicit scalar laplacian kernels; all work is enqueued on `stream`.
+//   1. fvc_laplacian_scalar_internal over num_surfaces (1024-thread blocks).
+//   2. Per non-empty patch (256-thread blocks):
+//        zeroGradient  -> nothing is launched (boundary snGrad is 0, so output += 0);
+//        fixedValue    -> fvc_laplacian_scalar_boundary_fixedValue;
+//        anything else -> message on stderr, nothing is launched.
+//
+// The running boundary offset advances by 2 * patch_size for processor / processorCyclic
+// patches and by patch_size otherwise, matching the other launchers in this file
+// (e.g. fvc_div_surface_scalar). Processor patches are still unsupported here, but
+// skipping them by the right amount keeps later patches correctly addressed.
+//
+// num_cells, num_boundary_surfaces, weight and volume are not referenced in this function body.
+void fvc_laplacian_scalar(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces,
+        const int *lowerAddr, const int *upperAddr,
+        const double *weight, const double *mag_sf, const double *delta_coeffs, const double *volume,
+        const double *gamma, const double *vf, double *output, // end for internal
+        int num_patches, const int *patch_size, const int *patch_type,
+        const int *boundary_cell_face,
+        const double *boundary_mag_sf, const double *boundary_delta_coeffs,
+        const double *boundary_gamma, const double *boundary_vf, double sign)
+{
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block;
+    fvc_laplacian_scalar_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_surfaces, lowerAddr, upperAddr,
+            mag_sf, delta_coeffs, gamma, vf, output, sign);
+
+    int offset = 0;
+    for (int i = 0; i < num_patches; i++) {
+        if (patch_size[i] == 0) continue;
+        threads_per_block = 256;
+        blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block;
+        // TODO: just basic patch type now
+        if (patch_type[i] == boundaryConditions::zeroGradient) {
+            // for zeroGradient, boundary_snGrad = 0, thus output += 0
+        } else if (patch_type[i] == boundaryConditions::fixedValue) {
+            // TODO: just vector version now
+            fvc_laplacian_scalar_boundary_fixedValue<<<blocks_per_grid, threads_per_block, 0, stream>>>(
+                    patch_size[i], offset, boundary_cell_face,
+                    boundary_mag_sf, boundary_delta_coeffs, boundary_gamma, vf, boundary_vf, output, sign);
+        } else {
+            // unsupported patch type (this includes processor patches): report and skip
+            fprintf(stderr, "%s %d, boundaryConditions other than zeroGradient are not support yet!\n", __FILE__, __LINE__);
+        }
+        // Processor-type patches take two slots per face in the boundary arrays; advancing
+        // by only patch_size would misplace every patch that follows one.
+        if (patch_type[i] == boundaryConditions::processor
+                || patch_type[i] == boundaryConditions::processorCyclic) {
+            offset += 2 * patch_size[i];
+        } else {
+            offset += patch_size[i];
+        }
+    }
+}
+
+void fvc_flux(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces,
+        const int *lowerAddr, const int *upperAddr, 
+        const double *weight, const double *Sf, const double *vf, double *output, // end of the internal-face arguments
+        int num_patches, const int *patch_size, const int *patch_type,
+        const int *boundary_cell_face, const double *boundary_vf, const double *boundary_Sf, 
+        double *boundary_output, double sign)
+{ // Host-side launcher: fvc_flux_internal_kernel over internal faces, then fvc_flux_boundary_kernel per supported patch.
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; // ceil(num_surfaces / 1024)
+    fvc_flux_internal_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_surfaces,
+            lowerAddr, upperAddr, vf, weight, Sf, output, sign);
+
+    int offset = 0; // running sum of the sizes of the patches already visited; handed to the boundary kernel
+    for (int i = 0; i < num_patches; i++) {
+        if (patch_size[i] == 0) continue; // patch_size and patch_type are dereferenced here, on the host
+        threads_per_block = 256;
+        blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block;
+        // TODO: the per-patch loop may be unnecessary, since all supported types launch the same kernel
+        if (patch_type[i] == boundaryConditions::zeroGradient
+                || patch_type[i] == boundaryConditions::fixedValue
+                || patch_type[i] == boundaryConditions::gradientEnergy) {
+            fvc_flux_boundary_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_surfaces, patch_size[i], offset, boundary_cell_face,
+                    boundary_Sf, boundary_vf, boundary_output, sign);
+        } else {
+            // Unsupported patch type: only a warning is printed and no kernel runs. NOTE(review): the text names only zeroGradient although three types are handled.
+            fprintf(stderr, "%s %d, boundaryConditions other than zeroGradient are not support yet!\n", __FILE__, __LINE__);
+        }
+        offset += patch_size[i];
+    }
+}
+
+void fvc_interpolate(cudaStream_t stream, int num_cells, int num_surfaces,
+        const int *lowerAddr, const int *upperAddr, 
+        const double *weight, const double *vf, double *output, // end of the internal-face arguments
+        int num_patches, const int *patch_size, const int *patch_type,
+        const double *boundary_vf, double *boundary_output, double sign)
+{ // Host-side launcher: internal-face interpolation kernel, then one boundary kernel per supported patch; num_cells is unused here.
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; // ceil(num_surfaces / 1024)
+    fvc_interpolate_internal_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_surfaces,
+            lowerAddr, upperAddr, vf, weight, output, sign);
+    
+    int offset = 0; // running sum of the sizes of the patches already visited; handed to the boundary kernel
+    for (int i = 0; i < num_patches; i++) {
+        if (patch_size[i] == 0) continue; // patch_size and patch_type are dereferenced here, on the host
+        threads_per_block = 256;
+        blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block;
+        // TODO: the per-patch loop may be unnecessary, since all supported types launch the same kernel
+        if (patch_type[i] == boundaryConditions::zeroGradient
+                || patch_type[i] == boundaryConditions::fixedValue
+                || patch_type[i] == boundaryConditions::gradientEnergy) {
+            fvc_interpolate_boundary_kernel_upCouple<<<blocks_per_grid, threads_per_block, 0, stream>>>(patch_size[i], offset,
+                    boundary_vf, boundary_output, sign);
+        } else {
+            // Unsupported patch type: only a warning is printed and no kernel runs. NOTE(review): the text names only zeroGradient although three types are handled.
+            fprintf(stderr, "%s %d, boundaryConditions other than zeroGradient are not support yet!\n", __FILE__, __LINE__);
+        }
+        offset += patch_size[i];
+    }
+}
+
+void fvc_ddt_scalar(cudaStream_t stream, int num_cells, double rDeltaT,
+        const double *vf, const double *vf_old, const double *volume, double *source, double sign)
+{ // Launches fvc_ddt_scalar_kernel on `stream` with enough 1024-thread blocks to cover num_cells.
+    const size_t block_dim = 1024;
+    const size_t grid_dim = (num_cells + block_dim - 1) / block_dim;
+    fvc_ddt_scalar_kernel<<<grid_dim, block_dim, 0, stream>>>(num_cells, vf, vf_old, rDeltaT, volume, source, sign);
+}
+
+void fvc_ddt_scalar_field(cudaStream_t stream, int num_cells, double rDeltaT,
+        const double *vf, const double *vf_old, const double *volume, double *source, double sign)
+{ // Launches fvc_ddt_scalar_field_kernel on `stream` with enough 1024-thread blocks to cover num_cells.
+    const size_t block_dim = 1024;
+    const size_t grid_dim = (num_cells + block_dim - 1) / block_dim;
+    fvc_ddt_scalar_field_kernel<<<grid_dim, block_dim, 0, stream>>>(num_cells, vf, vf_old, rDeltaT, volume, source, sign);
+}
+
+void fvMtx_A(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces,
+        const int *boundary_cell_face, const double *internal_coeffs, const double *volume, const double *diag, 
+        double *A_pEqn)
+{ // Queues on `stream`: copy diag into A_pEqn, addAveInternaltoDiag over boundary faces, divide_cell_volume_scalar over cells.
+    const size_t block_dim = 1024;
+    const size_t boundary_grid = (num_boundary_surfaces + block_dim - 1) / block_dim;
+    const size_t cell_grid = (num_cells + block_dim - 1) / block_dim;
+
+    // Seed the result with the matrix diagonal (asynchronous device-to-device copy); num_surfaces is unused here.
+    checkCudaErrors(cudaMemcpyAsync(A_pEqn, diag, num_cells * sizeof(double), cudaMemcpyDeviceToDevice, stream));
+    addAveInternaltoDiag<<<boundary_grid, block_dim, 0, stream>>>(
+            num_cells, num_boundary_surfaces, boundary_cell_face, internal_coeffs, A_pEqn);
+    divide_cell_volume_scalar<<<cell_grid, block_dim, 0, stream>>>(num_cells, volume, A_pEqn);
+}
+
+void fvMtx_H(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, 
+        const int *lowerAddr, const int *upperAddr, const double *volume,
+        int num_patches, const int *patch_size, const int *patch_type,
+        const int *boundary_cell_face, const double *internal_coffs, const double *boundary_coeffs, 
+        const double *lower, const double *upper, const double *source, const double *psi, 
+        double *H_pEqn, double *H_pEqn_perm)
+{ // Host-side launcher that builds H_pEqn from source, boundary and off-diagonal terms, then writes H_pEqn_perm; all work is queued on `stream`.
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_boundary_surfaces + threads_per_block - 1) / threads_per_block;
+
+    checkCudaErrors(cudaMemcpyAsync(H_pEqn, source, num_cells * 3 * sizeof(double), cudaMemcpyDeviceToDevice, stream)); // 3 doubles per cell
+    addBoundaryDiag<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_boundary_surfaces, boundary_cell_face, 
+            internal_coffs, psi, H_pEqn);
+    
+    blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block;
+    lduMatrix_H<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_surfaces, lowerAddr, upperAddr, 
+            lower, upper, psi, H_pEqn);
+
+    int offset = 0; // running sum of the sizes of the patches already visited; handed to the boundary kernel
+    for (int i = 0; i < num_patches; i++) {
+        if (patch_size[i] == 0) continue; // patch_size and patch_type are dereferenced here, on the host
+        threads_per_block = 64; // this value is still in effect for the final launch after the loop
+        blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block;
+        // TODO: only the non-coupled zeroGradient and fixedValue patch types are handled so far
+        if (patch_type[i] == boundaryConditions::zeroGradient
+                || patch_type[i] == boundaryConditions::fixedValue) {
+            addBoundarySrc_unCoupled<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, patch_size[i], offset, 
+                    num_boundary_surfaces, boundary_cell_face, boundary_coeffs, H_pEqn);
+        } else {
+            fprintf(stderr, "%s %d, boundaryConditions other than zeroGradient are not support yet!\n", __FILE__, __LINE__); // warn only; no kernel runs
+        }
+        offset += patch_size[i];
+    }
+    blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; // block size is 64 if any patch was non-empty, else 1024
+    divideVol_permute_vec<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, volume, H_pEqn, H_pEqn_perm);
+}
+
+void fvMtx_flux(cudaStream_t stream, int num_surfaces, int num_boundary_surfaces, 
+        const int *lowerAddr, const int *upperAddr, const double *lower, const double *upper,
+        const double *psi, double *output, // end of the internal-face arguments
+        int num_patches, const int *patch_size, const int *patch_type,
+        const int *boundary_cell_face, const double *internal_coeffs, const double *boundary_coeffs, 
+        const int *cyclicNeighbor, const int *patchSizeOffset, const double *boundary_psi, double *boundary_output)
+{ // Host-side launcher: lduMatrix_faceH over internal faces, then one boundary-flux kernel per non-empty patch; num_boundary_surfaces is unused here.
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; // ceil(num_surfaces / 1024)
+    lduMatrix_faceH<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_surfaces, lowerAddr, upperAddr, lower, upper, psi, output);
+    
+    int offset = 0; // start position of the current patch, advanced per branch below
+    for (int i = 0; i < num_patches; i++) {
+        if (patch_size[i] == 0) continue; // patch_size, patch_type, cyclicNeighbor and patchSizeOffset are read on the host
+        threads_per_block = 64;
+        blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block;
+        if (patch_type[i] == boundaryConditions::processor
+            || patch_type[i] == boundaryConditions::processorCyclic) {
+            boundary_flux_couple_process<<<blocks_per_grid, threads_per_block, 0, stream>>>(patch_size[i], offset, boundary_cell_face, boundary_psi,
+                    internal_coeffs, boundary_coeffs, boundary_output);
+            offset += 2 * patch_size[i]; // processor patches advance the offset by twice their size
+        } else if (patch_type[i] == boundaryConditions::cyclic) {
+            boundary_flux_couple_cyclic<<<blocks_per_grid, threads_per_block, 0, stream>>>(patch_size[i], offset, patchSizeOffset[cyclicNeighbor[i]],
+                    boundary_cell_face, psi, internal_coeffs, boundary_coeffs, boundary_output);
+            offset += patch_size[i];
+        } else {
+            boundary_flux_unCouple<<<blocks_per_grid, threads_per_block, 0, stream>>>(patch_size[i], offset, boundary_cell_face, psi, 
+                    internal_coeffs, boundary_output); // every other patch type takes the uncoupled path
+            offset += patch_size[i];
+        }
+    }
+}
+
+void solve_explicit_scalar(cudaStream_t stream, int num_cells, const double *diag, const double *source,
+        double *psi)
+{
+    // Launch solve_explicit_scalar_kernel on `stream` with enough 1024-thread blocks to cover num_cells.
+    const size_t block_dim = 1024;
+    const size_t grid_dim = (num_cells + block_dim - 1) / block_dim;
+    solve_explicit_scalar_kernel<<<grid_dim, block_dim, 0, stream>>>(num_cells, diag, source, psi);
+}
diff --git a/src_gpu/dfNcclBase.H b/src_gpu/dfNcclBase.H
new file mode 100644
index 000000000..11325569a
--- /dev/null
+++ b/src_gpu/dfNcclBase.H
@@ -0,0 +1,34 @@
+#pragma once
+#include <stdio.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <cuda_runtime.h>
+#include "nccl.h"
+#include "mpi.h"
+
+#define checkMpiErrors(cmd) do { /* abort with file:line unless cmd returns MPI_SUCCESS */ \
+  const int e = (cmd); /* evaluate cmd exactly once */   \
+  if( e != MPI_SUCCESS ) {                          \
+    fprintf(stderr, "Failed: MPI error %s:%d '%d'\n", \
+        __FILE__,__LINE__, e);   \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+#define checkNcclErrors(cmd) do { /* abort with file:line unless cmd returns ncclSuccess */ \
+  const ncclResult_t r = (cmd); /* evaluate cmd exactly once */ \
+  if (r!= ncclSuccess) {                            \
+    fprintf(stderr, "Failed, NCCL error %s:%d '%s'\n",    \
+        __FILE__,__LINE__,ncclGetErrorString(r));   \
+    exit(EXIT_FAILURE);                             \
+  }                                                 \
+} while(0)
+
+void ncclInit(MPI_Comm mpi_comm, ncclComm_t& nccl_comm, ncclUniqueId& nccl_id,
+		int *nRanks, int *myRank, int *localRank, int *mpi_init_flag);
+
+void ncclDestroy(ncclComm_t nccl_comm);
+
+// TODO: temporary NCCL all-reduce smoke test
+void ncclTest(ncclComm_t nccl_comm);
diff --git a/src_gpu/dfNcclBase.cu b/src_gpu/dfNcclBase.cu
new file mode 100644
index 000000000..66c664aa9
--- /dev/null
+++ b/src_gpu/dfNcclBase.cu
@@ -0,0 +1,97 @@
+#include "dfNcclBase.H"
+#include "dfMatrixDataBase.H"
+
+static uint64_t getHostHash(const char* string) {
+    // DJB2a-style hash: seed 5381, then hash = (hash * 33) XOR character, for each character up to the NUL.
+    uint64_t hash = 5381;
+    for (const char* cursor = string; *cursor != '\0'; ++cursor) {
+        hash = ((hash << 5) + hash) ^ *cursor; // (hash << 5) + hash == hash * 33
+    }
+    return hash;
+}
+
+static void getHostName(char* hostname, int maxlen) {
+    // Writes the host name into hostname[0..maxlen) and cuts it at the first '.', if any.
+    if (maxlen <= 0) return;
+    if (gethostname(hostname, maxlen) != 0) hostname[0] = '\0'; // on failure, leave an empty name rather than garbage
+    hostname[maxlen - 1] = '\0'; // a truncated name is not guaranteed to be NUL-terminated
+    for (int i = 0; hostname[i] != '\0'; i++) { // stop at the terminator instead of scanning the whole buffer
+        if (hostname[i] == '.') { hostname[i] = '\0'; return; }
+    }
+}
+
+void ncclInit(MPI_Comm mpi_comm, ncclComm_t& nccl_comm, ncclUniqueId& nccl_id,
+        int *pnRanks, int *pmyRank, int *plocalRank, int *p_mpi_init_flag)
+{ // Creates nccl_comm over the ranks of mpi_comm, selects CUDA device `localRank`, and reports rank info via the out-pointers.
+    // MPI must already be initialized by the caller; otherwise the process exits.
+    int mpi_init_flag;
+    checkMpiErrors(MPI_Initialized(&mpi_init_flag));
+    if(mpi_init_flag) checkMpiErrors(MPI_Barrier(mpi_comm));
+    else {
+        fprintf(stderr, "MPI is not yet initialized!\n");
+        exit(EXIT_FAILURE);
+    }
+
+    // Rank and size come from mpi_comm so that they match the communicator used by the Allgather and Bcast below.
+    int nRanks, myRank, localRank = 0;
+    checkMpiErrors(MPI_Comm_rank(mpi_comm, &myRank));
+    checkMpiErrors(MPI_Comm_size(mpi_comm, &nRanks));
+
+    //calculating localRank based on hostname which is used in selecting a GPU
+    uint64_t hostHashs[nRanks]; // NOTE(review): variable-length array, which is a compiler extension in C++
+    char hostname[1024];
+    getHostName(hostname, 1024);
+    hostHashs[myRank] = getHostHash(hostname);
+    checkMpiErrors(MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, mpi_comm));
+    for (int p=0; p<nRanks; p++) { // localRank = number of lower-numbered ranks whose host hash equals ours
+        if (p == myRank) break;
+        if (hostHashs[p] == hostHashs[myRank]) localRank++;
+    }
+
+    //get NCCL unique ID at rank 0 and broadcast it to all others
+    if (myRank == 0) checkNcclErrors(ncclGetUniqueId(&nccl_id));
+    checkMpiErrors(MPI_Bcast((void *)&nccl_id, sizeof(nccl_id), MPI_BYTE, 0, mpi_comm));
+
+    //picking a GPU based on localRank
+    checkCudaErrors(cudaSetDevice(localRank));
+
+    //initializing NCCL
+    checkNcclErrors(ncclCommInitRank(&nccl_comm, nRanks, nccl_id, myRank));
+
+    *pnRanks = nRanks;
+    *pmyRank = myRank;
+    *plocalRank = localRank;
+    *p_mpi_init_flag = mpi_init_flag;
+}
+
+void ncclDestroy(ncclComm_t nccl_comm)
+{
+    // Destroy the NCCL communicator. NOTE(review): the ncclResult_t return value is discarded, unlike the checkNcclErrors-wrapped calls in this file.
+    ncclCommDestroy(nccl_comm);
+}
+
+void ncclTest(ncclComm_t nccl_comm)
+{ // Runs a single float sum all-reduce on nccl_comm using freshly allocated device buffers and a private stream.
+    int size = 32*1024*1024; // element count (floats) of each buffer
+
+    // create buf and stream
+    float *sendbuff, *recvbuff;
+    cudaStream_t s;
+    checkCudaErrors(cudaMalloc(&sendbuff, size * sizeof(float))); // NOTE(review): sendbuff is never written before being reduced
+    checkCudaErrors(cudaMalloc(&recvbuff, size * sizeof(float)));
+    checkCudaErrors(cudaStreamCreate(&s));
+
+    //communicating using NCCL
+    checkNcclErrors(ncclAllReduce((const void*)sendbuff, (void*)recvbuff, size, ncclFloat, ncclSum,
+                nccl_comm, s));
+
+    //completing NCCL operation by synchronizing on the CUDA stream
+    checkCudaErrors(cudaStreamSynchronize(s));
+    usleep(3 * 1000 * 1000); // sleeps 3,000,000 microseconds after the reduction completes; purpose not evident from this file
+
+    //free device buffers
+    checkCudaErrors(cudaFree(sendbuff));
+    checkCudaErrors(cudaFree(recvbuff));
+    checkCudaErrors(cudaStreamDestroy(s));
+}
+
diff --git a/src_gpu/dfRhoEqn.H b/src_gpu/dfRhoEqn.H
index 419072dc6..c38347977 100644
--- a/src_gpu/dfRhoEqn.H
+++ b/src_gpu/dfRhoEqn.H
@@ -1,48 +1,61 @@
 #pragma once
-#include "dfMatrixDataBase.H"
-
-/*
-    fvScalarMatrix rhoEqn
-    (
-        fvm::ddt(rho)
-        + fvc::div(phi)
-    );
 
-    rhoEqn.solve();
-*/
+#include "dfMatrixDataBase.H"
+#include "dfMatrixOpBase.H"
 
 class dfRhoEqn
 {
 private:
-    dfMatrixDataBase& dataBase_;
-    cudaStream_t stream;
+	dfMatrixDataBase &dataBase_; // shared data base, held by reference (not owned)
 
-    int num_iteration;
-    double time_monitor_CPU;
-    double time_monitor_GPU_kernel, time_monitor_GPU_memcpy, time_monitor_GPU_memcpy_test;
+    // CUDA resources
+    cudaStream_t stream; // not set by the constructor; assigned in setConstantValues()
+#ifdef USE_GRAPH
+    // one CUDA graph per equation (original note: "before using self-developed solver")
+    cudaGraph_t graph;
+    cudaGraphExec_t graph_instance;
+    bool graph_created=false;
+#endif
 
-    // common variables
-    int num_cells, cell_bytes, num_surfaces, num_boundary_cells;
-    int *d_A_csr_row_index, *d_A_csr_diag_index;
+    // constant fields - boundary
+	std::vector<int> patch_type; // one boundary-condition type id per patch
 
-    // Matrix variables
-    double *d_b, *d_psi = nullptr;
-    double *h_b, *h_psi = nullptr;
+    // non-constant fields - ldu (device buffers)
+    double *d_diag = nullptr;
+    double *d_source = nullptr;
 
 public:
-    dfRhoEqn();
-    dfRhoEqn(dfMatrixDataBase& dataBase);
-    ~dfRhoEqn();
+    // constructor: only binds the data base reference
+    dfRhoEqn(dfMatrixDataBase &dataBase)
+        : dataBase_(dataBase) {}
 
-    void initializeTimeStep();
+	// destructor: releases nothing itself; see cleanCudaResources()
+	~dfRhoEqn(){}
 
-    void checkValue(bool print);
+    // member functions
 
-    void fvc_div(double *phi, double *boundary_phi_init);
+    // initialization
+    void setConstantValues();
+    void setConstantFields(const std::vector<int> patch_type);
+    void initNonConstantFields(const double *rho, const double *phi, 
+            const double *boundary_rho, const double *boundary_phi);
+    void createNonConstantLduAndCsrFields();
 
-    void fvm_ddt(double *rho_old);
+    // getter function
+    double* getFieldPointer(const char* fieldAlias, location loc, position pos);
 
-    void sync();
+    void cleanCudaResources();
+
+    // run equations
+    void preProcess();
+    void process();
+    void postProcess(double *h_rho);
 
-    void updatePsi(double* Psi);
+    void solve();
+
+    // other functions: debugging comparisons, stream synchronisation and host-to-device rho upload
+    void compareResult(const double *diag, const double *source, bool printFlag);
+    void compareRho(const double *rho, const double *boundary_rho, bool printFlag);
+    void sync();
+    void correctPsi(const double *rho, const double *boundary_rho);
 };
diff --git a/src_gpu/dfRhoEqn.cu b/src_gpu/dfRhoEqn.cu
index 7c60f3bea..b9eeef762 100644
--- a/src_gpu/dfRhoEqn.cu
+++ b/src_gpu/dfRhoEqn.cu
@@ -1,144 +1,145 @@
 #include "dfRhoEqn.H"
 
-// kernel functions
-__global__ void fvc_div_internal_rho(int num_cells, const int *csr_row_index,
-                                     const int *csr_diag_index, const int *permedIndex, const double *phi_init,
-                                     double *phi_out, const double sign, const double *b_input, double *b_output)
+void dfRhoEqn::createNonConstantLduAndCsrFields() // allocates the device buffers d_source and d_diag unless STREAM_ALLOCATOR is defined
 {
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_cells)
-        return;
-
-    // A_csr has one more element in each row: itself
-    int row_index = csr_row_index[index];
-    int row_elements = csr_row_index[index + 1] - row_index;
-    int diag_index = csr_diag_index[index];
-    int neighbor_offset = csr_row_index[index] - index;
-
-    double sum = 0;
-
-    // lower
-    for (int i = 0; i < diag_index; i++)
-    {
-        int neighbor_index = neighbor_offset + i;
-        int permute_index = permedIndex[neighbor_index];
-        double phi = phi_init[permute_index];
-        phi_out[neighbor_index] = phi;
-        sum -= phi;
-    }
-    // upper
-    for (int i = diag_index + 1; i < row_elements; i++)
-    {
-        int neighbor_index = neighbor_offset + i - 1;
-        int permute_index = permedIndex[neighbor_index];
-        double phi = phi_init[permute_index];
-        phi_out[neighbor_index] = phi;
-        sum += phi;
-    }
-
-    b_output[index] = b_input[index] + sum * sign;
+#ifndef STREAM_ALLOCATOR // with STREAM_ALLOCATOR the buffers are allocated in process() instead
+    checkCudaErrors(cudaMalloc((void**)&d_source, dataBase_.cell_value_bytes)); // NOTE(review): no matching cudaFree is visible in this file section
+    checkCudaErrors(cudaMalloc((void**)&d_diag, dataBase_.cell_value_bytes));
+    DEBUG_TRACE;
+#endif
 }
 
-__global__ void fvc_div_boundary_rho(int num_cells, int num_boundary_cells, const int *boundary_cell_offset,
-                                     const int *boundary_cell_id, const int *bouPermedIndex, const double *boundary_phi_init,
-                                     double *boundary_phi, const double sign, const double *b_input, double *b_output)
-{
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_boundary_cells)
-        return;
-
-    int cell_offset = boundary_cell_offset[index];
-    int next_cell_offset = boundary_cell_offset[index + 1];
-    int cell_index = boundary_cell_id[cell_offset];
-
-    double sum = 0;
-
-    for (int i = cell_offset; i < next_cell_offset; i++)
-    {
-        int permute_index = bouPermedIndex[i];
-        double phi = boundary_phi_init[permute_index];
-        boundary_phi[i] = phi;
-        sum += phi;
-    }
-
-    b_output[cell_index] = b_input[cell_index] + sum * sign;
+void dfRhoEqn::setConstantValues() { // caches the data base's CUDA stream handle in this object
+    this->stream = dataBase_.stream; // note: the other member functions below use dataBase_.stream directly
 }
 
-__global__ void fvm_ddt_rho(int num_cells, const double rdelta_t,
-                            const double *rho_old, double *rho_new, const double *volume, const double *b)
-{
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_cells)
-        return;
+void dfRhoEqn::setConstantFields(const std::vector<int> patch_type) { // stores the per-patch boundary-condition type ids
+  this->patch_type = patch_type; // the argument is taken by value, so the vector is copied on the call and again here
+}
 
-    double ddt_diag = rdelta_t * volume[index];
-    double ddt_source = rdelta_t * rho_old[index] * volume[index];
-    double source_sum = ddt_source - b[index];
+void dfRhoEqn::initNonConstantFields(const double *rho, const double *phi, 
+            const double *boundary_rho, const double *boundary_phi) { // uploads the initial rho/phi fields from host to device
+    checkCudaErrors(cudaMemcpy(dataBase_.d_rho, rho, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice)); // blocking copies, not stream-ordered
+    checkCudaErrors(cudaMemcpy(dataBase_.d_phi, phi, dataBase_.surface_value_bytes, cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(dataBase_.d_boundary_rho, boundary_rho, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(dataBase_.d_boundary_phi, boundary_phi, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice));
+}
 
-    rho_new[index] = source_sum / ddt_diag;
+void dfRhoEqn::cleanCudaResources() { // destroys the CUDA graph objects if one was created; a no-op without USE_GRAPH
+#ifdef USE_GRAPH
+    if (graph_created) { // set to true in process() after the first capture
+        checkCudaErrors(cudaGraphExecDestroy(graph_instance));
+        checkCudaErrors(cudaGraphDestroy(graph));
+    } // NOTE(review): graph_created is not reset and d_diag/d_source are not freed here
+#endif
 }
 
-// constructor
-dfRhoEqn::dfRhoEqn(dfMatrixDataBase &dataBase)
-    : dataBase_(dataBase)
+void dfRhoEqn::preProcess() // empty body: currently a no-op
 {
-    stream = dataBase_.stream;
-    num_cells = dataBase_.num_cells;
-    cell_bytes = dataBase_.cell_bytes;
-    num_surfaces = dataBase_.num_surfaces;
-    num_boundary_cells = dataBase_.num_boundary_cells;
-
-    d_A_csr_row_index = dataBase_.d_A_csr_row_index;
-    d_A_csr_diag_index = dataBase_.d_A_csr_diag_index;
-
-    cudaMallocHost(&h_psi, cell_bytes);
-
-    checkCudaErrors(cudaMalloc((void **)&d_b, cell_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_psi, cell_bytes));
 }
 
-void dfRhoEqn::initializeTimeStep()
+void dfRhoEqn::process() // assembles and explicitly solves the rho equation on dataBase_.stream, then blocks until the stream is idle
 {
-    // initialize matrix value
-    checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_bytes, stream));
+    TICK_INIT_EVENT;
+    TICK_START_EVENT;
+#ifdef USE_GRAPH
+    if(!graph_created) { // first call only: capture the work below into a CUDA graph
+        DEBUG_TRACE;
+        checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal));
+#endif
+
+#ifdef STREAM_ALLOCATOR
+    checkCudaErrors(cudaMallocAsync((void**)&d_source, dataBase_.cell_value_bytes, dataBase_.stream)); // NOTE(review): with USE_GRAPH this sits inside the capture-once branch while the frees below run every call; confirm
+    checkCudaErrors(cudaMallocAsync((void**)&d_diag, dataBase_.cell_value_bytes, dataBase_.stream));
+#endif
+    // checkCudaErrors(cudaMemcpyAsync(dataBase_.d_phi, dataBase_.h_phi, dataBase_.surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream));
+    // checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_phi, dataBase_.h_boundary_phi, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream));
+    // checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_rho, dataBase_.h_boundary_rho, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream));
+
+    checkCudaErrors(cudaMemsetAsync(d_diag, 0, dataBase_.cell_value_bytes, dataBase_.stream)); // start from zeroed diag and source
+    checkCudaErrors(cudaMemsetAsync(d_source, 0, dataBase_.cell_value_bytes, dataBase_.stream));
+
+    fvm_ddt_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.rdelta_t, dataBase_.d_rho_old, dataBase_.d_volume, 
+            d_diag, d_source);
+    fvc_div_surface_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.d_owner,
+            dataBase_.d_neighbor, dataBase_.d_phi, dataBase_.d_boundary_face_cell, 
+            dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(),
+            dataBase_.d_boundary_phi, dataBase_.d_volume, d_source, -1); // last argument -1 is the sign passed for the div(phi) term
+    solve(); // writes dataBase_.d_rho from d_diag and d_source
+    correct_boundary_conditions_scalar(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(),
+            dataBase_.num_boundary_surfaces, dataBase_.num_patches, dataBase_.patch_size.data(),
+            patch_type.data(), dataBase_.d_boundary_delta_coeffs, dataBase_.d_boundary_face_cell, dataBase_.d_rho, dataBase_.d_boundary_rho,
+            dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_weight);
+#ifdef USE_GRAPH
+        checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph));
+        checkCudaErrors(cudaGraphInstantiate(&graph_instance, graph, NULL, NULL, 0));
+        graph_created = true;
+    }
+    checkCudaErrors(cudaGraphLaunch(graph_instance, dataBase_.stream)); // every call replays the instantiated graph
+#endif
+    TICK_END_EVENT(rhoEqn process);
+
+    TICK_START_EVENT;
+#ifdef STREAM_ALLOCATOR
+    checkCudaErrors(cudaFreeAsync(d_source, dataBase_.stream)); // after this, d_source and d_diag no longer refer to live allocations
+    checkCudaErrors(cudaFreeAsync(d_diag, dataBase_.stream));
+#endif
+    TICK_END_EVENT(rhoEqn post process free);
+    TICK_START_EVENT;
+    // checkCudaErrors(cudaMemcpyAsync(dataBase_.h_rho, dataBase_.d_rho, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream));
+    TICK_END_EVENT(rhoEqn post process copy back);
+    sync(); // wait for everything queued on dataBase_.stream
 }
 
-void dfRhoEqn::fvc_div(double *phi, double *boundary_phi_init)
+void dfRhoEqn::sync() // blocks the host until all work queued on dataBase_.stream has finished
 {
-    memcpy(dataBase_.h_phi_init, phi, num_surfaces * sizeof(double));
-
-    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_phi_init, dataBase_.h_phi_init, num_surfaces * sizeof(double), cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_phi_init + num_surfaces, dataBase_.d_phi_init, num_surfaces * sizeof(double), cudaMemcpyDeviceToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_phi_init, boundary_phi_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream));
-
-    size_t threads_per_block = 1024;
-    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    fvc_div_internal_rho<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_permedIndex,
-                                                                            dataBase_.d_phi_init, dataBase_.d_phi, 1., d_b, d_b);
+    checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); // uses the data base's stream, not the cached `stream` member
+}
 
-    blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
-    fvc_div_boundary_rho<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_boundary_cells, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
-                                                                            dataBase_.d_bouPermedIndex, dataBase_.d_boundary_phi_init, dataBase_.d_boundary_phi, 1., d_b, d_b);
+void dfRhoEqn::solve() // queues the explicit scalar solve on dataBase_.stream
+{
+    solve_explicit_scalar(dataBase_.stream, dataBase_.num_cells, d_diag, d_source, dataBase_.d_rho); // inputs d_diag/d_source, output dataBase_.d_rho
 }
 
-void dfRhoEqn::fvm_ddt(double *rho_old)
+void dfRhoEqn::postProcess(double *h_rho) {} // empty body: h_rho is currently ignored
+
+void dfRhoEqn::compareResult(const double *diag, const double *source, bool printFlag) // debug aid: checks device diag/source against host reference arrays
 {
-    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_rho_old, rho_old, cell_bytes, cudaMemcpyHostToDevice, stream));
-    size_t threads_per_block = 1024;
-    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    fvm_ddt_rho<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, dataBase_.rdelta_t, dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, d_b);
-    checkCudaErrors(cudaMemcpyAsync(h_psi, dataBase_.d_rho_new, cell_bytes, cudaMemcpyDeviceToHost, stream));
+    std::vector<double> h_diag; // host staging copy of d_diag
+    h_diag.resize(dataBase_.num_cells);
+    checkCudaErrors(cudaMemcpy(h_diag.data(), d_diag, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost)); // NOTE(review): under STREAM_ALLOCATOR process() has already freed d_diag/d_source
+    fprintf(stderr, "check h_diag\n");
+    checkVectorEqual(dataBase_.num_cells, diag, h_diag.data(), 1e-14, printFlag); // 1e-14 is the tolerance argument
+    DEBUG_TRACE;
+
+    std::vector<double> h_source; // host staging copy of d_source
+    h_source.resize(dataBase_.num_cells);
+    checkCudaErrors(cudaMemcpy(h_source.data(), d_source, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost));
+    fprintf(stderr, "check h_source\n");
+    checkVectorEqual(dataBase_.num_cells, source, h_source.data(), 1e-14, printFlag);
+    DEBUG_TRACE;
 }
 
-void dfRhoEqn::sync()
+void dfRhoEqn::compareRho(const double *rho, const double *boundary_rho, bool printFlag) // debug aid: checks device rho (internal and boundary) against host references
 {
-    checkCudaErrors(cudaStreamSynchronize(stream));
+    std::vector<double> h_rho; // host staging copy of dataBase_.d_rho
+    h_rho.resize(dataBase_.num_cells);
+    checkCudaErrors(cudaMemcpy(h_rho.data(), dataBase_.d_rho, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost));
+    fprintf(stderr, "check h_rho\n");
+    checkVectorEqual(dataBase_.num_cells, rho, h_rho.data(), 1e-14, printFlag); // 1e-14 is the tolerance argument
+    DEBUG_TRACE;
+
+    std::vector<double> h_boundary_rho; // host staging copy of dataBase_.d_boundary_rho
+    h_boundary_rho.resize(dataBase_.num_boundary_surfaces);
+    checkCudaErrors(cudaMemcpy(h_boundary_rho.data(), dataBase_.d_boundary_rho, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost));
+    fprintf(stderr, "check h_boundary_rho\n");
+    checkVectorEqual(dataBase_.num_boundary_surfaces, boundary_rho, h_boundary_rho.data(), 1e-14, printFlag);
+    DEBUG_TRACE;
 }
 
-void dfRhoEqn::updatePsi(double *Psi)
+void dfRhoEqn::correctPsi(const double *rho, const double *boundary_rho) // overwrites the device rho fields with host-provided values
 {
-    checkCudaErrors(cudaStreamSynchronize(stream));
-    for (size_t i = 0; i < num_cells; i++)
-        Psi[i] = h_psi[i];
+    checkCudaErrors(cudaMemcpy(dataBase_.d_rho, rho, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice)); // blocking copies, not stream-ordered
+    checkCudaErrors(cudaMemcpy(dataBase_.d_boundary_rho, boundary_rho, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice));
 }
-dfRhoEqn::~dfRhoEqn(){}
+
diff --git a/src_gpu/dfThermo.H b/src_gpu/dfThermo.H
new file mode 100644
index 000000000..37160cab6
--- /dev/null
+++ b/src_gpu/dfThermo.H
@@ -0,0 +1,123 @@
+#pragma once
+
+#include "dfMatrixDataBase.H"
+#include "dfMatrixOpBase.H"
+
+void init_const_coeff_ptr(std::vector<std::vector<double>>& nasa_coeffs, std::vector<std::vector<double>>& viscosity_coeffs,
+        std::vector<std::vector<double>>& thermal_conductivity_coeffs, std::vector<std::vector<double>>& binary_diffusion_coeffs,
+        std::vector<double>& molecular_weights);
+
+class dfThermo
+{
+    // GPU-side thermophysical property calculations working on the fields owned by the shared dfMatrixDataBase.
+    dfMatrixDataBase &dataBase_;
+    // private data members
+    std::string mechanism_file;
+    std::string thermo_coeff_file;
+
+    // private member functions
+    void readCoeffsBinary(FILE* fp, int dimension, std::vector<std::vector<double>>& coeffs);
+    void initCoeffsfromBinaryFile(FILE* fp);
+
+public:
+    // cuda resource (value-initialised here; setConstantValue() copies the data base's stream into it)
+    cudaStream_t stream{};
+
+    // public data members (zero until setConstantValue() has run)
+    int num_species = 0;
+    int num_cells = 0;
+
+    // thermo coeffs, one row per species, filled from the binary coefficient file
+    std::vector<std::vector<double>> nasa_coeffs;
+    std::vector<std::vector<double>> viscosity_coeffs;
+    std::vector<std::vector<double>> thermal_conductivity_coeffs;
+    std::vector<std::vector<double>> binary_diffusion_coeffs;
+    std::vector<double> molecular_weights;
+
+    // species info
+    std::vector<double> mass_fraction;
+    std::vector<double> mole_fraction;
+    double meanMolecularWeight = 0.;
+
+    double *d_mole_fraction = nullptr, *d_mean_mole_weight = nullptr;
+    double *d_boundary_mole_fraction = nullptr, *d_boundary_mean_mole_weight = nullptr;
+
+    // intermediate variables (device pointers stay nullptr until they are allocated)
+    std::vector<double> T_poly;
+
+    double *d_T_poly = nullptr, *d_boundary_T_poly = nullptr;
+    double *d_species_viscosities = nullptr, *d_boundary_species_viscosities = nullptr;
+    double *d_species_thermal_conductivities = nullptr, *d_boundary_species_thermal_conductivities = nullptr;
+    double *d_psip0 = nullptr, *d_boundary_psip0 = nullptr;
+
+    // constructor: only binds the shared data base; all real setup happens in setConstantValue()
+    dfThermo(dfMatrixDataBase &dataBase)
+        : dataBase_(dataBase) {}
+
+    // destructor: releases nothing itself
+    ~dfThermo() {}
+
+    void cleanCudaResources();
+
+    // public member functions
+    void setConstantValue(std::string mechanism_file, int num_cells, int num_species);
+    void setConstantFields(const std::vector<int> patch_type);
+    void initNonConstantFields(const double *h_T, const double *h_he, const double *h_psi, const double *h_alpha, 
+            const double *h_mu, const double *h_k, const double *h_dpdt, const double *h_rhoD, const double *h_boundary_T, 
+            const double *h_boundary_he, const double *h_boundary_psi, const double *h_boundary_alpha, const double *h_boundary_mu, 
+            const double *h_boundary_k, const double *h_boundary_rhoD);
+
+    // set mass fraction
+    void setMassFraction(const double *d_y, const double *d_boundary_y);
+
+    // *** GPU functions ***
+    void calculateTPolyGPU(int threads_per_block, int num_thread, int num_total, const double *T, double *T_poly, int offset = 0);
+    void calculatePsiGPU(int threads_per_block, int num_thread, const double *T, const double *mean_mole_weight, 
+            double *d_psi, int offset = 0);
+    void calculateRhoGPU(int thread_per_block, int num_thread, const double *p, const double *psi, double *rho, int offset = 0);
+    void calculateViscosityGPU(int num_thread, int num_total, const double *T, const double *mole_fraction,
+            const double *T_poly, double *species_viscosities, double *viscosity, int offset = 0);
+    void calculateThermoConductivityGPU(int thread_per_block, int num_thread, int num_total, const double *T, const double *T_poly,
+            const double *d_y, const double *mole_fraction, double *species_thermal_conductivities,
+            double *thermal_conductivity, int offset = 0);
+    void calculateRhoDGPU(int threads_per_block, int num_thread, int num_total, const double *T, 
+            const double *T_poly, const double *p, const double *mole_fraction, 
+            const double *mean_mole_weight, const double *rho, double *rhoD, int offset = 0);
+    void calculateEnthalpyGPU(int thread_per_block, int num_thread, int num_total, const double *T, double *enthalpy, const double *d_mass_fraction, int offset = 0);
+    void calculateTemperatureGPU(int thread_per_block, int num_thread, int num_total, const double *T_init, const double *target_h, 
+            double *T, const double *d_mass_fraction, int offset = 0,
+            double atol = 1e-7, double rtol = 1e-7, int max_iter = 20);
+    void calculateEnergyGradient(int num_thread, int num_cells, int num_species, 
+            int num_boundary_surfaces, int bou_offset, int gradient_offset, const int *face2Cells, 
+            const double *T, const double *p, const double *y, const double *boundary_delta_coeffs,
+            const double *boundary_p, const double *boundary_y, double *boundary_thermo_gradient);
+    void setPsip0(int thread_per_block, int num_thread, const double *p, const double *psi, double *psip0, int offset = 0);
+    void addPsipRho(int thread_per_block, int num_thread, const double *p, const double *psi, const double *psip0, 
+            double *rho, int offset = 0);
+    void updateCPUT(double *h_T, double *h_boundary_T);
+
+    void compareT(const double *T, const double *boundary_T, bool printFlag);
+    void compareRho(const double *rho, const double *boundary_rho, bool printFlag);
+    void comparePsi(const double *psi, const double *boundary_psi, bool printFlag);
+    void compareMu(const double *mu, const double *boundary_mu, bool printFlag);
+    void compareAlpha(const double *alpha, const double *boundary_alpha, bool printFlag);
+    void compareHe(const double *he, const double *boundary_he, bool printFlag);
+    void compareRhoD(const double *rhoD, const double *boundary_rhoD, int species_index, bool printFlag);
+
+    void correctHe(const double *he, const double *boundary_he);
+    void correctPsi(const double *psi, const double *boundary_psi);
+    void correctAlpha(const double *alpha, const double *boundary_alpha);
+    void correctMu(const double *mu, const double *boundary_mu);
+    void correctRho(const double *rho, const double *boundary_rho);
+
+    void sync();
+
+    // outer API
+    void updateEnergy();
+    void correctThermo();
+    void updateRho();
+    void psip0();
+    void correctPsipRho();
+
+    // getter functions
+};
diff --git a/src_gpu/dfThermo.cu b/src_gpu/dfThermo.cu
new file mode 100644
index 000000000..6d8dae14d
--- /dev/null
+++ b/src_gpu/dfThermo.cu
@@ -0,0 +1,878 @@
+#include "dfThermo.H"
+#include <filesystem>
+#include <cmath>
+#include <numeric>
+#include <cassert>
+#include <cstring>
+#include "device_launch_parameters.h"
+
+#define GAS_CANSTANT 8314.46261815324 // numerically 1000 x 8.31446261815324, i.e. the universal gas constant per kmol (units presumed J/(kmol K) -- confirm)
+#define SQRT8 2.8284271247461903 // sqrt(8)
+#define NUM_SPECIES 7 // compile-time species count that sizes every __constant__ table below
+
+// constant memory: coefficient tables filled by init_const_coeff_ptr(), flattened with one row per species
+__constant__ __device__ double d_nasa_coeffs[NUM_SPECIES*15]; // 15 values per species
+__constant__ __device__ double d_viscosity_coeffs[NUM_SPECIES*5]; // 5 values per species
+__constant__ __device__ double d_thermal_conductivity_coeffs[NUM_SPECIES*5]; // 5 values per species
+__constant__ __device__ double d_binary_diffusion_coeffs[NUM_SPECIES*NUM_SPECIES*5]; // 5 values per species pair
+__constant__ __device__ double d_molecular_weights[NUM_SPECIES];
+__constant__ __device__ double d_viscosity_conatant1[NUM_SPECIES*NUM_SPECIES]; // (1 + W_i / W_j)^(-1/2) at [i * NUM_SPECIES + j]
+__constant__ __device__ double d_viscosity_conatant2[NUM_SPECIES*NUM_SPECIES]; // (W_j / W_i)^(1/4) at [i * NUM_SPECIES + j]
+
+void init_const_coeff_ptr(std::vector<std::vector<double>>& nasa_coeffs, std::vector<std::vector<double>>& viscosity_coeffs,
+        std::vector<std::vector<double>>& thermal_conductivity_coeffs, std::vector<std::vector<double>>& binary_diffusion_coeffs,
+        std::vector<double>& molecular_weights)
+{
+    // Flatten the per-species tables and upload them to the __constant__ arrays, which hold exactly NUM_SPECIES species.
+    assert(nasa_coeffs.size() == NUM_SPECIES && viscosity_coeffs.size() == NUM_SPECIES && thermal_conductivity_coeffs.size() == NUM_SPECIES);
+    assert(binary_diffusion_coeffs.size() == NUM_SPECIES && molecular_weights.size() == NUM_SPECIES); // any other size would index out of bounds below
+    double nasa_coeffs_tmp[NUM_SPECIES*15];
+    double viscosity_coeffs_tmp[NUM_SPECIES*5];
+    double thermal_conductivity_coeffs_tmp[NUM_SPECIES*5];
+    double binary_diffusion_coeffs_tmp[NUM_SPECIES*NUM_SPECIES*5];
+    double viscosity_conatant1_tmp[NUM_SPECIES*NUM_SPECIES];
+    double viscosity_conatant2_tmp[NUM_SPECIES*NUM_SPECIES];
+    for (int i = 0; i < NUM_SPECIES; i++) { // row i of every table starts at i * <row length>
+        std::copy(nasa_coeffs[i].begin(), nasa_coeffs[i].end(), nasa_coeffs_tmp + i * 15);
+        std::copy(viscosity_coeffs[i].begin(), viscosity_coeffs[i].end(), viscosity_coeffs_tmp + i * 5);
+        std::copy(thermal_conductivity_coeffs[i].begin(), thermal_conductivity_coeffs[i].end(), thermal_conductivity_coeffs_tmp + i * 5);
+        std::copy(binary_diffusion_coeffs[i].begin(), binary_diffusion_coeffs[i].end(), binary_diffusion_coeffs_tmp + i * 5 * NUM_SPECIES);
+        for (int j = 0; j < NUM_SPECIES; j++) { // temperature-independent factors used by calculate_viscosity_kernel
+            viscosity_conatant1_tmp[i * NUM_SPECIES + j] = pow((1 + molecular_weights[i] / molecular_weights[j]), -0.5);
+            viscosity_conatant2_tmp[i * NUM_SPECIES + j] = pow(molecular_weights[j] / molecular_weights[i], 0.25);
+        }
+    }
+    checkCudaErrors(cudaMemcpyToSymbol(d_nasa_coeffs, nasa_coeffs_tmp, sizeof(double) * 15 * NUM_SPECIES));
+    checkCudaErrors(cudaMemcpyToSymbol(d_viscosity_coeffs, viscosity_coeffs_tmp, sizeof(double) * 5 * NUM_SPECIES));
+    checkCudaErrors(cudaMemcpyToSymbol(d_thermal_conductivity_coeffs, thermal_conductivity_coeffs_tmp, sizeof(double) * 5 * NUM_SPECIES));
+    checkCudaErrors(cudaMemcpyToSymbol(d_binary_diffusion_coeffs, binary_diffusion_coeffs_tmp, sizeof(double) * 5 * NUM_SPECIES * NUM_SPECIES));
+    checkCudaErrors(cudaMemcpyToSymbol(d_molecular_weights, molecular_weights.data(), sizeof(double) * NUM_SPECIES));
+    checkCudaErrors(cudaMemcpyToSymbol(d_viscosity_conatant1, viscosity_conatant1_tmp, sizeof(double) * NUM_SPECIES * NUM_SPECIES));
+    checkCudaErrors(cudaMemcpyToSymbol(d_viscosity_conatant2, viscosity_conatant2_tmp, sizeof(double) * NUM_SPECIES * NUM_SPECIES));
+}
+
+__global__ void get_mole_fraction_mean_mole_weight(int num_cells, int num_species, const double *d_Y,
+        double *mole_fraction, double *mean_mole_weight)
+{
+    // One thread per element; threads past num_cells have nothing to do.
+    const int cell = blockDim.x * blockIdx.x + threadIdx.x;
+    if (cell >= num_cells) return;
+    // Sum of Y_k / W_k over the species (arrays are species-major: k * num_cells + cell).
+    double inv_weight_sum = 0.;
+    for (int k = 0; k < num_species; ++k)
+        inv_weight_sum += d_Y[k * num_cells + cell] / d_molecular_weights[k];
+    double weight_acc = 0.;
+    for (int k = 0; k < num_species; ++k) {
+        const double x_k = d_Y[k * num_cells + cell] / (d_molecular_weights[k] * inv_weight_sum);
+        mole_fraction[k * num_cells + cell] = x_k;
+        weight_acc += x_k * d_molecular_weights[k];
+    }
+    mean_mole_weight[cell] = weight_acc;
+}
+
+__global__ void calculate_TPoly_kernel(int num_thread, int num_total, const double *T, double *d_T_poly, int offset)
+{
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= num_thread) return;
+    const int elem = tid + offset;
+    // Rows 0..4 of d_T_poly (row stride num_total) receive 1, lnT, lnT^2, lnT^3, lnT^4.
+    d_T_poly[num_total * 0 + elem] = 1.0;
+    const double lnT = log(T[elem]);
+    const double lnT2 = lnT * lnT;
+    d_T_poly[num_total * 1 + elem] = lnT;
+    d_T_poly[num_total * 2 + elem] = lnT2;
+    d_T_poly[num_total * 3 + elem] = lnT * lnT2;
+    d_T_poly[num_total * 4 + elem] = lnT2 * lnT2;
+}
+
+__global__ void calculate_psi_kernel(int num_cells, int offset, const double *T, const double *mean_mole_weight,
+        double *psi)
+{
+    // psi = W_mean / (GAS_CANSTANT * T) for the element at (thread id + offset).
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= num_cells) return;
+    const int elem = tid + offset;
+    const double local_weight = mean_mole_weight[elem];
+    const double local_T = T[elem];
+    psi[elem] = local_weight / (GAS_CANSTANT * local_T);
+}
+
+__global__ void calculate_rho_kernel(int num_thread, int offset, const double *p, const double *psi, double *rho)
+{
+    // rho = p * psi for the element at (thread id + offset).
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= num_thread) return;
+    const int elem = tid + offset;
+    const double local_p = p[elem];
+    const double local_psi = psi[elem];
+    rho[elem] = local_p * local_psi;
+}
+
+__global__ void calculate_viscosity_kernel(int num_thread, int num_total, int num_species, int offset,
+        const double *T_poly, const double *T, const double *mole_fraction,
+        double *species_viscosities, double *mu)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_thread)
+        return;
+    // Mixture viscosity of element (index + offset). NOTE(review): species_viscosities is never read or written here.
+    extern __shared__ double sdata[];
+    double* sv = sdata;
+    double* mf = &sdata[blockDim.x * num_species];
+    // sdata is split in two parts of blockDim.x * num_species doubles: per-species terms (sv), then mole fractions (mf).
+    int startIndex = index + offset;
+
+    double sqrt_local_T = sqrt(T[startIndex]);
+
+    double poly[5];
+    for (int j = 0; j < 5; j++) {
+        poly[j] = T_poly[num_total * j + startIndex];
+    }
+    // sv_i = sum_j d_viscosity_coeffs[i * 5 + j] * poly[j]; each thread owns num_species consecutive slots.
+    for (int i = 0; i < num_species; i++) {
+        double dot_product = 0.;
+        for (int j = 0; j < 5; j++) {
+            dot_product += d_viscosity_coeffs[i * 5 + j] * poly[j];
+        }
+        sv[threadIdx.x * num_species + i] = dot_product;
+        mf[threadIdx.x * num_species + i] = mole_fraction[num_total * i + startIndex];
+    }
+    double mu_mix = 0.; // accumulates sum_i X_i * sv_i^2 / sum_i; looks like a Wilke-type mixing rule -- confirm
+    for (int i = 0; i < num_species; i++) {
+        double sum = 0.;
+        for (int j = 0; j < num_species; j++) { // NOTE(review): tables below use a NUM_SPECIES stride, so num_species must equal NUM_SPECIES
+            double temp = 1.0 + (sv[threadIdx.x * num_species + i] / sv[threadIdx.x * num_species + j]) *
+                          d_viscosity_conatant2[i * NUM_SPECIES + j];
+            sum += mf[threadIdx.x * num_species + j] / SQRT8 * d_viscosity_conatant1[i * NUM_SPECIES + j] * (temp * temp);
+        }
+        mu_mix += mf[threadIdx.x * num_species + i] * (sv[threadIdx.x * num_species + i] * sv[threadIdx.x * num_species + i]) / sum;
+    }
+    mu[startIndex] = mu_mix * sqrt_local_T; // the sqrt(T) factor is applied once to the mixture sum
+}
+
+__device__ double calculate_cp_device_kernel(int num_total, int num_species, int index, 
+        const double local_T, const double *mass_fraction)
+{
+    double cp = 0.;
+    // Mass-fraction-weighted sum over species of (c0 + c1 T + c2 T^2 + c3 T^3 + c4 T^4) * GAS_CANSTANT / W_i.
+    for (int i = 0; i < num_species; i++) {
+        if (local_T > d_nasa_coeffs[i * 15 + 0]) { // entry 0 is the switch temperature; entries 1..5 are used above it
+            cp += mass_fraction[i * num_total + index] * (d_nasa_coeffs[i * 15 + 1] + d_nasa_coeffs[i * 15 + 2] * local_T + d_nasa_coeffs[i * 15 + 3] * local_T * local_T + 
+                    d_nasa_coeffs[i * 15 + 4] * local_T * local_T * local_T + 
+                    d_nasa_coeffs[i * 15 + 5] * local_T * local_T * local_T * local_T) * GAS_CANSTANT / d_molecular_weights[i];
+        } else { // entries 8..12 are used at or below the switch temperature
+            cp += mass_fraction[i * num_total + index] * (d_nasa_coeffs[i * 15 + 8] + d_nasa_coeffs[i * 15 + 9] * local_T + d_nasa_coeffs[i * 15 + 10] * local_T * local_T + 
+                    d_nasa_coeffs[i * 15 + 11] * local_T * local_T * local_T + 
+                    d_nasa_coeffs[i * 15 + 12] * local_T * local_T * local_T * local_T) * GAS_CANSTANT / d_molecular_weights[i];
+        }
+    }
+    return cp;
+}
+
+__global__ void calculate_thermoConductivity_kernel(int num_thread, int num_total, int num_species, 
+        int offset, const double *nasa_coeffs, const double *mass_fraction,
+        const double *T_poly, const double *T, const double *mole_fraction,
+        double *species_thermal_conductivities, double *thermal_conductivity)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_thread)
+        return;
+    // Mixture conductivity divided by cp for element (offset + index). NOTE(review): nasa_coeffs is never read here.
+    int startIndex = offset + index;
+
+    double dot_product;
+    double local_T = T[startIndex];
+    // Per-species value = (sum_j coeff[i * 5 + j] * T_poly row j) * sqrt(T), stored species-major.
+    for (int i = 0; i < num_species; i++) {
+        dot_product = 0.;
+        for (int j = 0; j < 5; j++) {
+            dot_product += d_thermal_conductivity_coeffs[i * 5 + j] * T_poly[num_total * j + startIndex];
+        }
+        species_thermal_conductivities[i * num_total + startIndex] = dot_product * sqrt(local_T);
+    }
+
+    double sum_conductivity = 0.;
+    double sum_inv_conductivity = 0.;
+    // Mixture value: mean of the mole-weighted arithmetic sum and the reciprocal of the mole-weighted inverse sum.
+    for (int i = 0; i < num_species; i++) {
+        sum_conductivity += mole_fraction[num_total * i + startIndex] * species_thermal_conductivities[i * num_total + startIndex];
+        sum_inv_conductivity += mole_fraction[num_total * i + startIndex] / species_thermal_conductivities[i * num_total + startIndex];
+    }
+    double lambda_mix = 0.5 * (sum_conductivity + 1.0 / sum_inv_conductivity);
+
+    double cp = calculate_cp_device_kernel(num_total, num_species, startIndex, local_T, mass_fraction);
+    // The stored result is lambda_mix / cp, not lambda_mix itself.
+    thermal_conductivity[startIndex] = lambda_mix / cp;
+}
+
+__global__ void calculate_diffusion_kernel(int num_thread, int num_total, int num_species,
+        int offset, const double *T_poly, const double *mole_fraction, const double *p,
+        const double *mean_mole_weight, const double *rho, const double *T, double *d)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_thread)
+        return;
+    // Writes one diffusion value per species (species-major in d) for element (offset + index).
+    extern __shared__ double shared_data[];
+    double *mole_fraction_shared = shared_data;
+    // Shared layout: species i of this thread lives at i * blockDim.x + threadIdx.x.
+    int startIndex = offset + index;
+
+    for (int i = 0; i < num_species; i++) {
+        mole_fraction_shared[i * blockDim.x + threadIdx.x] = mole_fraction[i * num_total + startIndex];
+    }
+
+    double poly[5];
+    for (int j = 0; j < 5; j++) {
+        poly[j] = T_poly[num_total * j + startIndex];
+    }
+    // powT = T^1.5
+    double powT = T[startIndex] * sqrt(T[startIndex]);
+
+    double local_mean_mole_weight = mean_mole_weight[startIndex];
+    double local_rho_div_p = rho[startIndex] / p[startIndex];
+    for (int i = 0; i < num_species; i++) {
+        if (mole_fraction_shared[i * blockDim.x + threadIdx.x] + 1e-10 > 1.) { // species i is (numerically) the only one present: store 0
+            d[num_total * i + startIndex] = 0.;
+            continue;
+        }
+        double sum1 = 0.;
+        double sum2 = 0.;
+        for (int j = 0; j < num_species; j++) {
+            if (i == j) continue;
+            // pair (i, j) value = (sum_k coeff * poly[k]) * T^1.5. NOTE(review): row stride is num_species * 5 here but NUM_SPECIES * 5 in init_const_coeff_ptr
+            double tmp = 0.;
+            for (int k = 0; k < 5; k++)
+                tmp += (d_binary_diffusion_coeffs[i * num_species * 5 + j * 5 + k] * poly[k]);
+            double local_D = tmp * powT;
+            sum1 += mole_fraction_shared[j * blockDim.x + threadIdx.x] / local_D;
+            sum2 += mole_fraction_shared[j * blockDim.x + threadIdx.x] * d_molecular_weights[j] / local_D;
+        }
+        sum2 *= mole_fraction_shared[i * blockDim.x + threadIdx.x] / 
+                (local_mean_mole_weight - mole_fraction_shared[i * blockDim.x + threadIdx.x] * d_molecular_weights[i]);
+        d[num_total * i + startIndex] = 1 / (sum1 + sum2) * local_rho_div_p; // scaled by rho / p before it is stored
+    }
+}
+
+__device__ double calculate_enthalpy_device_kernel(int num_total, int num_species, int index, const double local_T,
+        const double *mass_fraction)
+{
+    double h = 0.;
+    // Sum over species of (c0 + c1 T/2 + c2 T^2/3 + c3 T^3/4 + c4 T^4/5 + c5/T) * GAS_CANSTANT * T / W_i * Y_i.
+    for (int i = 0; i < num_species; i++) {
+        if (local_T > d_nasa_coeffs[i * 15 + 0]) { // entry 0 is the switch temperature; entries 1..6 are used above it
+            h += (d_nasa_coeffs[i * 15 + 1] + d_nasa_coeffs[i * 15 + 2] * local_T / 2 + d_nasa_coeffs[i * 15 + 3] * local_T * local_T / 3 + 
+                    d_nasa_coeffs[i * 15 + 4] * local_T * local_T * local_T / 4 + d_nasa_coeffs[i * 15 + 5] * local_T * local_T * local_T * local_T / 5 + 
+                    d_nasa_coeffs[i * 15 + 6] / local_T) * GAS_CANSTANT * local_T / d_molecular_weights[i] * mass_fraction[i * num_total + index];
+        } else { // entries 8..13 are used at or below the switch temperature
+            h += (d_nasa_coeffs[i * 15 + 8] + d_nasa_coeffs[i * 15 + 9] * local_T / 2 + d_nasa_coeffs[i * 15 + 10] * local_T * local_T / 3 + 
+                    d_nasa_coeffs[i * 15 + 11] * local_T * local_T * local_T / 4 + d_nasa_coeffs[i * 15 + 12] * local_T * local_T * local_T * local_T / 5 + 
+                    d_nasa_coeffs[i * 15 + 13] / local_T) * GAS_CANSTANT * local_T / d_molecular_weights[i] * mass_fraction[i * num_total + index];
+        }
+    }
+    return h;
+}
+
+__global__ void calculate_energy_gradient_kernel(int num_thread, int num_cells, int num_species, 
+        int num_boundary_surfaces, int bou_offset, int gradient_offset,
+        const int *face2Cells, const double *T, const double *p, const double *y,
+        const double *boundary_p, const double *boundary_y, const double *boundary_delta_coeffs,
+        double *boundary_energy_gradient)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_thread)
+        return;
+    // Per boundary face: (h_boundary - h_internal) * delta coefficient, both enthalpies evaluated at the adjacent cell's T.
+    int bou_start_index = index + bou_offset;
+    int gradient_start_index = index + gradient_offset;
+    int cellIndex = face2Cells[bou_start_index];
+    // NOTE(review): p and boundary_p are accepted but never read in this kernel.
+    double local_T = T[cellIndex];
+    double h_internal = calculate_enthalpy_device_kernel(num_cells, num_species, cellIndex, local_T, y);
+    double h_boundary = calculate_enthalpy_device_kernel(num_boundary_surfaces, num_species, bou_start_index, local_T, boundary_y);
+    boundary_energy_gradient[gradient_start_index] = (h_boundary - h_internal) * boundary_delta_coeffs[bou_start_index];
+}
+
+__global__ void calculate_temperature_kernel(int num_thread, int num_total, int num_species, int offset,
+        const double *T_init, const double *h_target, const double *mass_fraction,
+        double *T_est, double atol, double rtol, int max_iter)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_thread)
+        return;
+    // Newton-style iteration on T so that the mixture enthalpy matches h_target; cp is used as the slope.
+    int startIndex = index + offset;
+    // Start from T_init and apply at most max_iter updates.
+    double local_T = T_init[startIndex];
+    double local_h_target = h_target[startIndex];
+    double h, cp, delta_T;
+    for (int n = 0; n < max_iter; ++n) {
+        h = calculate_enthalpy_device_kernel(num_total, num_species, startIndex, local_T, mass_fraction);
+        cp = calculate_cp_device_kernel(num_total, num_species, startIndex, local_T, mass_fraction);
+        delta_T = (h - local_h_target) / cp;
+        local_T -= delta_T;
+        if (fabs(h - local_h_target) < atol || fabs(delta_T / local_T) < rtol) { // residual (taken before the update) below atol, or relative step below rtol
+            break;
+        }
+    }
+    // The last iterate is stored even when neither tolerance was met.
+    T_est[startIndex] = local_T;
+}
+
+extern void __global__ correct_internal_boundary_field_scalar(int num, int offset,
+        const double *vf_internal, const int *face2Cells, double *vf_boundary);
+
+__global__ void calculate_enthalpy_kernel(int num_thread, int offset, int num_total, int num_species,
+        const double *T, const double *mass_fraction, double *enthalpy)
+{
+    // One thread per element: enthalpy[e] = mixture enthalpy evaluated at T[e], with e = thread id + offset.
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= num_thread) return;
+    const int elem = tid + offset;
+    // num_total is the stride between consecutive species rows of mass_fraction.
+    const double local_T = T[elem];
+    enthalpy[elem] = calculate_enthalpy_device_kernel(num_total, num_species, elem, local_T, mass_fraction);
+}
+
+__global__ void calculate_psip0_kernel(int num_thread, int offset, const double *p, const double *psi, double *psip0)
+{
+    // psip0 = p * psi for the element at (thread id + offset).
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= num_thread) return;
+    const int elem = tid + offset;
+    const double local_p = p[elem];
+    const double local_psi = psi[elem];
+    psip0[elem] = local_p * local_psi;
+}
+
+__global__ void add_psip_rho_kernel(int num_thread, int offset, const double *p, const double *psi, const double *psip0, double *rho)
+{
+    // Adds (p * psi - psip0) to rho for the element at (thread id + offset).
+    const int tid = blockDim.x * blockIdx.x + threadIdx.x;
+    if (tid >= num_thread)
+        return;
+    const int elem = tid + offset;
+    // psip0 holds the p * psi product stored earlier by calculate_psip0_kernel.
+    rho[elem] += p[elem] * psi[elem] - psip0[elem];
+}
+
+void dfThermo::cleanCudaResources() {} // NOTE(review): currently a no-op; the buffers cudaMalloc'd in setConstantValue() are not freed here
+
+void dfThermo::setConstantValue(std::string mechanism_file, int num_cells, int num_species)
+{
+    // Load the thermo coefficient tables that belong to `mechanism_file` and allocate the device work buffers.
+    this->mechanism_file = mechanism_file;
+    this->num_cells = num_cells;
+    // coefficient file is "thermo_<mechanism stem>.txt"; a relative name, so it is looked up in the working directory
+    thermo_coeff_file = "thermo_" + std::filesystem::path(mechanism_file).stem().string() + ".txt";
+
+    // check if thermo_coeff_file exists
+    if (!std::filesystem::exists(thermo_coeff_file))
+    {
+        std::cout << "Thermo coefficient file does not exist!" << std::endl;
+        exit(1);
+    }
+
+    // The file is only read, so open it read-only ("rb+" would fail on a write-protected file).
+    FILE *fp = fopen(thermo_coeff_file.c_str(), "rb");
+    if (fp == NULL) {
+        fprintf(stderr, "Failed to open input file: %s!\n", thermo_coeff_file.c_str());
+        exit(EXIT_FAILURE);
+    }
+
+    // File header: species count, then one molecular weight per species. The count stored in the file replaces
+    // the argument, and the member follows it because readCoeffsBinary() sizes its reads with the member.
+    if (fread(&num_species, sizeof(int), 1, fp) != 1 || num_species <= 0) {
+        fprintf(stderr, "Failed to read species count from: %s!\n", thermo_coeff_file.c_str());
+        exit(EXIT_FAILURE);
+    }
+    this->num_species = num_species;
+    molecular_weights.resize(num_species);
+    if (fread(molecular_weights.data(), sizeof(double), num_species, fp) != static_cast<size_t>(num_species)) {
+        fprintf(stderr, "Failed to read molecular weights from: %s!\n", thermo_coeff_file.c_str());
+        exit(EXIT_FAILURE);
+    }
+    mass_fraction.resize(num_species);
+    mole_fraction.resize(num_species);
+
+    initCoeffsfromBinaryFile(fp);
+    fclose(fp); // all tables are in memory now; the handle was previously never closed
+    stream = dataBase_.stream;
+#ifndef STREAM_ALLOCATOR
+    checkCudaErrors(cudaMalloc((void**)&d_mole_fraction, sizeof(double) * num_species * num_cells));
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_mole_fraction, sizeof(double) * num_species * dataBase_.num_boundary_surfaces));
+    checkCudaErrors(cudaMalloc((void**)&d_mean_mole_weight, sizeof(double) * num_cells));
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_mean_mole_weight, sizeof(double) * dataBase_.num_boundary_surfaces));
+    checkCudaErrors(cudaMalloc((void**)&d_T_poly, sizeof(double) * 5 * num_cells));
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_T_poly, sizeof(double) * 5 * dataBase_.num_boundary_surfaces));
+
+    checkCudaErrors(cudaMalloc((void**)&d_species_viscosities, sizeof(double) * num_species * num_cells));
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_species_viscosities, sizeof(double) * num_species * dataBase_.num_boundary_surfaces));
+    checkCudaErrors(cudaMalloc((void**)&d_species_thermal_conductivities, sizeof(double) * num_species * num_cells));
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_species_thermal_conductivities, sizeof(double) * num_species * dataBase_.num_boundary_surfaces));
+#endif
+    // the psip0 buffers are allocated here regardless of STREAM_ALLOCATOR
+    checkCudaErrors(cudaMalloc((void**)&d_psip0, sizeof(double) * num_cells));
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_psip0, sizeof(double) * dataBase_.num_boundary_surfaces));
+    std::cout << "dfThermo initialized" << std::endl;
+}
+
+void dfThermo::readCoeffsBinary(FILE* fp, int dimension, std::vector<std::vector<double>>& coeffs)
+{
+    // Fill coeffs with num_species rows of `dimension` doubles read from fp; abort on a short read (truncated file).
+    coeffs.assign(num_species, std::vector<double>(dimension));
+    size_t num_read = 0;
+    for (auto &row : coeffs) num_read += fread(row.data(), sizeof(double), dimension, fp);
+    if (num_read != coeffs.size() * static_cast<size_t>(dimension)) { fprintf(stderr, "Failed to read thermo coefficients!\n"); exit(EXIT_FAILURE); }
+}
+
+void dfThermo::initCoeffsfromBinaryFile(FILE* fp)
+{
+    readCoeffsBinary(fp, 15, nasa_coeffs); // tables are read in this fixed order from the current file position: 15 values per species
+    readCoeffsBinary(fp, 5, viscosity_coeffs); // 5 values per species
+    readCoeffsBinary(fp, 5, thermal_conductivity_coeffs); // 5 values per species
+    readCoeffsBinary(fp, num_species * 5, binary_diffusion_coeffs); // 5 values per species pair
+}
+
+void dfThermo::sync()
+{
+    checkCudaErrors(cudaStreamSynchronize(stream)); // wait until all work queued on this object's stream has finished
+}
+
+void dfThermo::setMassFraction(const double *d_y, const double *d_boundary_y)
+{
+    // Derive mole fractions and mean molecular weight from the mass fractions: first the cells, then the boundary faces.
+    const size_t block_size = 1024;
+    const size_t cell_blocks = (num_cells + block_size - 1) / block_size;
+    get_mole_fraction_mean_mole_weight<<<cell_blocks, block_size, 0, stream>>>(num_cells, num_species,
+            d_y, d_mole_fraction, d_mean_mole_weight);
+    // Same kernel on the boundary arrays, sized by the number of boundary faces.
+    const size_t face_blocks = (dataBase_.num_boundary_surfaces + block_size - 1) / block_size;
+    get_mole_fraction_mean_mole_weight<<<face_blocks, block_size, 0, stream>>>(dataBase_.num_boundary_surfaces, num_species,
+            d_boundary_y, d_boundary_mole_fraction, d_boundary_mean_mole_weight);
+}
+
+void dfThermo::setConstantFields(const std::vector<int> patch_type) 
+{
+    dataBase_.patch_type_T = patch_type; // store the per-patch type codes for T in the shared data base
+}
+
+void dfThermo::initNonConstantFields(const double *h_T, const double *h_he, const double *h_psi, const double *h_alpha, 
+        const double *h_mu, const double *h_k, const double *h_dpdt, const double *h_rhoD, const double *h_boundary_T, 
+        const double *h_boundary_he, const double *h_boundary_psi, const double *h_boundary_alpha, const double *h_boundary_mu, 
+        const double *h_boundary_k, const double *h_boundary_rhoD)
+{
+    checkCudaErrors(cudaMemcpy(dataBase_.d_T, h_T, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice)); // internal (cell) fields: host -> device
+    checkCudaErrors(cudaMemcpy(dataBase_.d_he, h_he, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(dataBase_.d_thermo_psi, h_psi, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(dataBase_.d_thermo_alpha, h_alpha, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(dataBase_.d_mu, h_mu, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(dataBase_.d_k, h_k, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(dataBase_.d_dpdt, h_dpdt, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(dataBase_.d_thermo_rhoD, h_rhoD, dataBase_.cell_value_bytes * dataBase_.num_species, cudaMemcpyHostToDevice)); // rhoD holds one field per species
+    // Boundary-face values of the same fields (no boundary dpdt is uploaded).
+    checkCudaErrors(cudaMemcpy(dataBase_.d_boundary_T, h_boundary_T, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(dataBase_.d_boundary_he, h_boundary_he, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(dataBase_.d_boundary_thermo_psi, h_boundary_psi, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(dataBase_.d_boundary_thermo_alpha, h_boundary_alpha, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(dataBase_.d_boundary_mu, h_boundary_mu, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(dataBase_.d_boundary_k, h_boundary_k, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(dataBase_.d_boundary_thermo_rhoD, h_boundary_rhoD, dataBase_.boundary_surface_value_bytes * dataBase_.num_species, cudaMemcpyHostToDevice));
+}
+
+void dfThermo::calculateTPolyGPU(int threads_per_block, int num_thread, int num_total, const double *T, double *T_poly, int offset)
+{
+    // Launch enough blocks to give each of the num_thread elements its own thread (ceiling division).
+    const size_t num_blocks = (num_thread + threads_per_block - 1) / threads_per_block;
+    calculate_TPoly_kernel<<<num_blocks, threads_per_block, 0, stream>>>(num_thread, num_total, T, T_poly, offset);
+}
+
+void dfThermo::calculatePsiGPU(int threads_per_block, int num_thread, const double *T, const double *mean_mole_weight,
+        double *d_psi, int offset)
+{
+    const size_t num_blocks = (num_thread + threads_per_block - 1) / threads_per_block; // ceiling division: one thread per element
+    calculate_psi_kernel<<<num_blocks, threads_per_block, 0, stream>>>(num_thread, offset, T, mean_mole_weight, d_psi);
+}
+
+void dfThermo::calculateRhoGPU(int threads_per_block, int num_thread, const double *p, const double *psi, double *rho, int offset)
+{
+    const size_t num_blocks = (num_thread + threads_per_block - 1) / threads_per_block; // ceiling division: one thread per element
+    calculate_rho_kernel<<<num_blocks, threads_per_block, 0, stream>>>(num_thread, offset, p, psi, rho);
+}
+
+void dfThermo::calculateViscosityGPU(int num_thread, int num_total, const double *T, const double *mole_fraction,
+        const double *T_poly, double *species_viscosities, double *viscosity, int offset)
+{
+    TICK_INIT_EVENT;
+    TICK_START_EVENT;
+    int threads_per_block = 32; // block size is fixed here rather than taken from the caller
+    size_t blocks_per_grid = (num_thread + threads_per_block - 1) / threads_per_block; // ceiling division
+    size_t num_bytes_dyn_shm = threads_per_block * num_species * 2 * sizeof(double); // two per-thread arrays of num_species doubles (sv and mf in the kernel)
+    calculate_viscosity_kernel<<<blocks_per_grid, threads_per_block, num_bytes_dyn_shm, stream>>>(num_thread, num_total, num_species, offset,
+            T_poly, T, mole_fraction, species_viscosities, viscosity);
+    TICK_END_EVENT(calculate_viscosity_kernel); // NOTE(review): calculateRhoDGPU passes a quoted string to this macro -- confirm which form it expects
+}
+
+void dfThermo::calculateThermoConductivityGPU(int threads_per_block, int num_thread, int num_total, const double *T, 
+        const double *T_poly, const double *d_y, const double *mole_fraction, double *species_thermal_conductivities, double *thermal_conductivity, int offset)
+{
+    // The kernel never reads nasa_coeffs; in host code d_nasa_coeffs is only the symbol's host shadow, not a device pointer, so pass nullptr.
+    size_t blocks_per_grid = (num_thread + threads_per_block - 1) / threads_per_block;
+    calculate_thermoConductivity_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_thread, num_total, num_species, 
+            offset, nullptr, d_y, T_poly, T, mole_fraction, species_thermal_conductivities, thermal_conductivity);
+}
+
+// Launches calculate_diffusion_kernel on `stream` for num_thread entries,
+// writing into rhoD.
+// NOTE: the threads_per_block argument is ignored — it is overwritten with 32
+// on the first line, so whatever block size the caller passes has no effect.
+// NOTE(review): TICK_INIT_EVENT / TICK_START_EVENT / TICK_END_EVENT are macros
+// defined elsewhere — presumably event-based timing around the launch; confirm.
+void dfThermo::calculateRhoDGPU(int threads_per_block, int num_thread, int num_total, const double *T,
+        const double *T_poly, const double *p, const double *mole_fraction,
+        const double *mean_mole_weight, const double *rho, double *rhoD, int offset)
+{
+    // Caller-supplied block size is discarded; the launch always uses 32 threads per block.
+    threads_per_block = 32;
+    // ceil(num_thread / threads_per_block)
+    size_t blocks_per_grid = (num_thread + threads_per_block - 1) / threads_per_block;
+    // Dynamic shared memory request: num_species doubles for each thread of a block.
+    size_t sharedMemSize = sizeof(double) * threads_per_block * num_species;
+
+    TICK_INIT_EVENT;
+    TICK_START_EVENT;
+    calculate_diffusion_kernel<<<blocks_per_grid, threads_per_block, sharedMemSize, stream>>>(num_thread, num_total, num_species, offset,
+            T_poly, mole_fraction, p, mean_mole_weight, rho, T, rhoD);
+    TICK_END_EVENT("calculate_diffusion_kernel");
+}
+
+// Launches calculate_temperature_kernel on `stream`, covering num_thread entries
+// with blocks of threads_per_block threads. T_init and target_h are the kernel
+// inputs, T is the output buffer; atol, rtol and max_iter are forwarded to the
+// kernel unchanged.
+void dfThermo::calculateTemperatureGPU(int threads_per_block, int num_thread, int num_total,
+        const double *T_init, const double *target_h, double *T,
+        const double *d_mass_fraction, int offset, double atol, double rtol, int max_iter)
+{
+    // ceil(num_thread / threads_per_block)
+    const size_t grid_dim = (num_thread + threads_per_block - 1) / threads_per_block;
+    calculate_temperature_kernel<<<grid_dim, threads_per_block, 0, stream>>>(
+            num_thread, num_total, num_species, offset,
+            T_init, target_h, d_mass_fraction, T, atol, rtol, max_iter);
+}
+
+// Launches calculate_enthalpy_kernel on `stream`, covering num_thread entries
+// with blocks of threads_per_block threads. T and d_mass_fraction are the
+// kernel inputs and enthalpy is the output buffer.
+// Note the kernel takes offset before num_total, unlike the other kernels here.
+void dfThermo::calculateEnthalpyGPU(int threads_per_block, int num_thread, int num_total,
+        const double *T, double *enthalpy, const double *d_mass_fraction, int offset)
+{
+    // ceil(num_thread / threads_per_block)
+    const size_t grid_dim = (num_thread + threads_per_block - 1) / threads_per_block;
+    calculate_enthalpy_kernel<<<grid_dim, threads_per_block, 0, stream>>>(
+            num_thread, offset, num_total, num_species,
+            T, d_mass_fraction, enthalpy);
+}
+
+// Recomputes the internal-field enthalpy dataBase_.d_he from dataBase_.d_T and
+// dataBase_.d_y via calculateEnthalpyGPU, using 1024 threads per block.
+// Boundary values are not touched by this function.
+void dfThermo::updateEnergy()
+{
+    const int threads_per_block = 1024;
+    calculateEnthalpyGPU(threads_per_block, num_cells, num_cells,
+            dataBase_.d_T, dataBase_.d_he, dataBase_.d_y);
+
+    // Disabled per-patch boundary update, kept for reference. The call below
+    // does not match the current calculateEnthalpyGPU parameter list and
+    // would need adapting before it could be re-enabled.
+    // int offset = 0;
+    // for (int i = 0; i < dataBase_.num_patches; i++) {
+    //     calculateEnthalpyGPU(dataBase_.patch_size[i], dataBase_.d_boundary_T + offset, dataBase_.d_he + offset, dataBase_.d_y, offset);
+    //     if (dataBase_.patch_type_T[i] == boundaryConditions::processor) {
+    //         offset += 2 * dataBase_.patch_size[i];
+    //     } else {
+    //         offset += dataBase_.patch_size[i];
+    //     }
+    // }
+}
+
+// Recomputes the thermophysical fields on the GPU, all on `stream`:
+//   1. internal field: T from he, then T_poly, psi, rho, mu, alpha and rhoD;
+//   2. each non-empty boundary patch: on fixedValue-T patches he is computed
+//      from the given T, on every other patch T is computed from he; the
+//      remaining properties are then evaluated as for the internal field;
+//   3. processor / processorCyclic patches additionally run
+//      correct_internal_boundary_field_scalar for T, he, psi, alpha, mu, rho
+//      and every species' rhoD, and occupy 2 * patch_size entries of the
+//      boundary arrays (other patches occupy patch_size).
+// With STREAM_ALLOCATOR defined, the scratch buffers are allocated at the start
+// and released at the end with the stream-ordered allocator.
+void dfThermo::correctThermo()
+{
+#ifdef STREAM_ALLOCATOR
+    // mole fraction / mean molecular weight / temperature polynomial scratch
+    checkCudaErrors(cudaMallocAsync((void**)&d_mole_fraction, sizeof(double) * num_species * num_cells, stream));
+    checkCudaErrors(cudaMallocAsync((void**)&d_boundary_mole_fraction, sizeof(double) * num_species * dataBase_.num_boundary_surfaces, stream));
+    checkCudaErrors(cudaMallocAsync((void**)&d_mean_mole_weight, sizeof(double) * num_cells, stream));
+    checkCudaErrors(cudaMallocAsync((void**)&d_boundary_mean_mole_weight, sizeof(double) * dataBase_.num_boundary_surfaces, stream));
+    checkCudaErrors(cudaMallocAsync((void**)&d_T_poly, sizeof(double) * 5 * num_cells, stream));
+    checkCudaErrors(cudaMallocAsync((void**)&d_boundary_T_poly, sizeof(double) * 5 * dataBase_.num_boundary_surfaces, stream));
+
+    // per-species transport property scratch
+    checkCudaErrors(cudaMallocAsync((void**)&d_species_viscosities, sizeof(double) * num_species * num_cells, stream));
+    checkCudaErrors(cudaMallocAsync((void**)&d_boundary_species_viscosities, sizeof(double) * num_species * dataBase_.num_boundary_surfaces, stream));
+    checkCudaErrors(cudaMallocAsync((void**)&d_species_thermal_conductivities, sizeof(double) * num_species * num_cells, stream));
+    checkCudaErrors(cudaMallocAsync((void**)&d_boundary_species_thermal_conductivities, sizeof(double) * num_species * dataBase_.num_boundary_surfaces, stream));
+#endif
+
+    setMassFraction(dataBase_.d_y, dataBase_.d_boundary_y);
+
+    const int cell_block = 512;   // threads per block for internal-field launches
+    const int patch_block = 32;   // threads per block for per-patch launches
+    const auto n_cells = dataBase_.num_cells;
+    const auto n_boundary = dataBase_.num_boundary_surfaces;
+
+    // ---------------- internal field ----------------
+    fprintf(stderr, "\n\n");
+    // temperature from enthalpy (d_T is both initial guess and output)
+    calculateTemperatureGPU(cell_block, n_cells, n_cells,
+            dataBase_.d_T, dataBase_.d_he, dataBase_.d_T, dataBase_.d_y);
+    // temperature polynomial
+    calculateTPolyGPU(cell_block, n_cells, n_cells, dataBase_.d_T, d_T_poly);
+    // compressibility
+    calculatePsiGPU(cell_block, n_cells, dataBase_.d_T, d_mean_mole_weight, dataBase_.d_thermo_psi);
+    // density
+    calculateRhoGPU(cell_block, n_cells, dataBase_.d_p, dataBase_.d_thermo_psi, dataBase_.d_rho);
+    // viscosity
+    calculateViscosityGPU(n_cells, n_cells, dataBase_.d_T, d_mole_fraction,
+            d_T_poly, d_species_viscosities, dataBase_.d_mu);
+    // thermal conductivity
+    calculateThermoConductivityGPU(cell_block, n_cells, n_cells, dataBase_.d_T, d_T_poly,
+            dataBase_.d_y, d_mole_fraction, d_species_thermal_conductivities, dataBase_.d_thermo_alpha);
+    // species diffusivity
+    calculateRhoDGPU(cell_block, n_cells, n_cells, dataBase_.d_T, d_T_poly, dataBase_.d_p,
+            d_mole_fraction, d_mean_mole_weight, dataBase_.d_rho, dataBase_.d_thermo_rhoD);
+    fprintf(stderr, "\n\n");
+
+    // ---------------- boundary field ----------------
+    int offset = 0; // start of the current patch inside the boundary arrays
+    for (int i = 0; i < dataBase_.num_patches; i++) {
+        const auto patch_faces = dataBase_.patch_size[i];
+        if (patch_faces == 0) continue;
+
+        const bool temperature_is_fixed =
+                (dataBase_.patch_type_T[i] == boundaryConditions::fixedValue);
+
+        // Non-fixedValue patches first recover T from he ...
+        if (!temperature_is_fixed) {
+            calculateTemperatureGPU(patch_block, patch_faces, n_boundary,
+                    dataBase_.d_boundary_T, dataBase_.d_boundary_he,
+                    dataBase_.d_boundary_T, dataBase_.d_boundary_y, offset);
+        }
+        calculateTPolyGPU(patch_block, patch_faces, n_boundary,
+                dataBase_.d_boundary_T, d_boundary_T_poly, offset);
+        // ... whereas fixedValue patches derive he from the prescribed T.
+        if (temperature_is_fixed) {
+            calculateEnthalpyGPU(patch_block, patch_faces, n_boundary,
+                    dataBase_.d_boundary_T, dataBase_.d_boundary_he,
+                    dataBase_.d_boundary_y, offset);
+        }
+
+        // Remaining properties are evaluated identically for every patch type.
+        calculatePsiGPU(patch_block, patch_faces, dataBase_.d_boundary_T,
+                d_boundary_mean_mole_weight, dataBase_.d_boundary_thermo_psi, offset);
+        calculateRhoGPU(patch_block, patch_faces, dataBase_.d_boundary_p,
+                dataBase_.d_boundary_thermo_psi, dataBase_.d_boundary_rho, offset);
+        calculateViscosityGPU(patch_faces, n_boundary, dataBase_.d_boundary_T, d_boundary_mole_fraction,
+                d_boundary_T_poly, d_boundary_species_viscosities, dataBase_.d_boundary_mu, offset);
+        calculateThermoConductivityGPU(patch_block, patch_faces, n_boundary,
+                dataBase_.d_boundary_T, d_boundary_T_poly, dataBase_.d_boundary_y,
+                d_boundary_mole_fraction, d_boundary_species_thermal_conductivities,
+                dataBase_.d_boundary_thermo_alpha, offset);
+        calculateRhoDGPU(patch_block, patch_faces, n_boundary,
+                dataBase_.d_boundary_T, d_boundary_T_poly, dataBase_.d_boundary_p,
+                d_boundary_mole_fraction, d_boundary_mean_mole_weight,
+                dataBase_.d_boundary_rho, dataBase_.d_boundary_thermo_rhoD, offset);
+
+        const bool is_processor_patch =
+                dataBase_.patch_type_T[i] == boundaryConditions::processor
+                || dataBase_.patch_type_T[i] == boundaryConditions::processorCyclic;
+        if (!is_processor_patch) {
+            offset += patch_faces;
+            continue;
+        }
+
+        // correct internal field of processor boundary
+        const size_t block_dim = 32;
+        const size_t grid_dim = (patch_faces + block_dim - 1) / block_dim;
+        // One launch per (internal field, boundary field) pair, in the order listed below.
+        auto correctInternalField = [&](double *internal_field, double *boundary_field) {
+            correct_internal_boundary_field_scalar<<<grid_dim, block_dim, 0, stream>>>(patch_faces, offset,
+                    internal_field, dataBase_.d_boundary_face_cell, boundary_field);
+        };
+        correctInternalField(dataBase_.d_T, dataBase_.d_boundary_T);
+        correctInternalField(dataBase_.d_he, dataBase_.d_boundary_he);
+        correctInternalField(dataBase_.d_thermo_psi, dataBase_.d_boundary_thermo_psi);
+        correctInternalField(dataBase_.d_thermo_alpha, dataBase_.d_boundary_thermo_alpha);
+        correctInternalField(dataBase_.d_mu, dataBase_.d_boundary_mu);
+        correctInternalField(dataBase_.d_rho, dataBase_.d_boundary_rho);
+        for (int j = 0; j < num_species; j++) {
+            correctInternalField(dataBase_.d_thermo_rhoD + j * n_cells,
+                    dataBase_.d_boundary_thermo_rhoD + j * n_boundary);
+        }
+
+        // processor patches take two slots per face in the boundary arrays
+        offset += 2 * patch_faces;
+    }
+#ifdef STREAM_ALLOCATOR
+    // Release the scratch buffers in stream order.
+    checkCudaErrors(cudaFreeAsync(d_mole_fraction, stream));
+    checkCudaErrors(cudaFreeAsync(d_boundary_mole_fraction, stream));
+    checkCudaErrors(cudaFreeAsync(d_mean_mole_weight, stream));
+    checkCudaErrors(cudaFreeAsync(d_boundary_mean_mole_weight, stream));
+    checkCudaErrors(cudaFreeAsync(d_T_poly, stream));
+    checkCudaErrors(cudaFreeAsync(d_boundary_T_poly, stream));
+
+    checkCudaErrors(cudaFreeAsync(d_species_viscosities, stream));
+    checkCudaErrors(cudaFreeAsync(d_boundary_species_viscosities, stream));
+    checkCudaErrors(cudaFreeAsync(d_species_thermal_conductivities, stream));
+    checkCudaErrors(cudaFreeAsync(d_boundary_species_thermal_conductivities, stream));
+#endif
+}
+
+// Recomputes density from pressure and psi via calculateRhoGPU, first for the
+// internal field and then for the whole boundary array, using 1024 threads per
+// block for both launches.
+void dfThermo::updateRho()
+{
+    const int threads_per_block = 1024;
+
+    // internal field
+    calculateRhoGPU(threads_per_block, dataBase_.num_cells,
+            dataBase_.d_p, dataBase_.d_thermo_psi, dataBase_.d_rho);
+    // boundary field
+    calculateRhoGPU(threads_per_block, dataBase_.num_boundary_surfaces,
+            dataBase_.d_boundary_p, dataBase_.d_boundary_thermo_psi, dataBase_.d_boundary_rho);
+}
+
+// Fills d_psip0 / d_boundary_psip0 from the current pressure and psi through
+// setPsip0, for the internal field and then the whole boundary array
+// (1024 threads per block for both launches).
+void dfThermo::psip0()
+{
+    const int threads_per_block = 1024;
+
+    // internal field
+    setPsip0(threads_per_block, dataBase_.num_cells,
+            dataBase_.d_p, dataBase_.d_thermo_psi, d_psip0);
+    // boundary field
+    setPsip0(threads_per_block, dataBase_.num_boundary_surfaces,
+            dataBase_.d_boundary_p, dataBase_.d_boundary_thermo_psi, d_boundary_psip0);
+}
+
+// Updates rho through addPsipRho using pressure, psi and the stored psip0
+// buffers, for the internal field and then the whole boundary array
+// (1024 threads per block for both launches).
+void dfThermo::correctPsipRho()
+{
+    const int threads_per_block = 1024;
+
+    // internal field
+    addPsipRho(threads_per_block, dataBase_.num_cells,
+            dataBase_.d_p, dataBase_.d_thermo_psi, d_psip0, dataBase_.d_rho);
+    // boundary field
+    addPsipRho(threads_per_block, dataBase_.num_boundary_surfaces,
+            dataBase_.d_boundary_p, dataBase_.d_boundary_thermo_psi,
+            d_boundary_psip0, dataBase_.d_boundary_rho);
+}
+
+// Launches calculate_energy_gradient_kernel on `stream` for num_thread entries
+// with a fixed block size of 256 threads; the result is written to
+// boundary_thermo_gradient.
+// Note: the num_cells and num_species parameters are what the kernel receives,
+// and the kernel takes boundary_p / boundary_y before boundary_delta_coeffs,
+// which differs from this function's parameter order.
+void dfThermo::calculateEnergyGradient(int num_thread, int num_cells, int num_species,
+        int num_boundary_surfaces, int bou_offset, int gradient_offset, const int *face2Cells,
+        const double *T, const double *p, const double *y, const double *boundary_delta_coeffs,
+        const double *boundary_p, const double *boundary_y, double *boundary_thermo_gradient)
+{
+    const size_t block_dim = 256;
+    // ceil(num_thread / block_dim)
+    const size_t grid_dim = (num_thread + block_dim - 1) / block_dim;
+    calculate_energy_gradient_kernel<<<grid_dim, block_dim, 0, stream>>>(
+            num_thread, num_cells, num_species, num_boundary_surfaces,
+            bou_offset, gradient_offset, face2Cells, T, p, y,
+            boundary_p, boundary_y, boundary_delta_coeffs, boundary_thermo_gradient);
+}
+
+// Launches calculate_psip0_kernel on `stream`, covering num_thread entries with
+// blocks of thread_per_block threads. p and psi are inputs, psip0 is the output
+// buffer and offset is forwarded as given.
+void dfThermo::setPsip0(int thread_per_block, int num_thread,
+        const double *p, const double *psi, double *psip0, int offset)
+{
+    // ceil(num_thread / thread_per_block)
+    const size_t grid_dim = (num_thread + thread_per_block - 1) / thread_per_block;
+    calculate_psip0_kernel<<<grid_dim, thread_per_block, 0, stream>>>(
+            num_thread, offset, p, psi, psip0);
+}
+
+// Launches add_psip_rho_kernel on `stream`, covering num_thread entries with
+// blocks of thread_per_block threads. p, psi and psip0 are inputs; rho is the
+// buffer the kernel updates.
+void dfThermo::addPsipRho(int thread_per_block, int num_thread,
+        const double *p, const double *psi, const double *psip0, double *rho, int offset)
+{
+    // ceil(num_thread / thread_per_block)
+    const size_t grid_dim = (num_thread + thread_per_block - 1) / thread_per_block;
+    add_psip_rho_kernel<<<grid_dim, thread_per_block, 0, stream>>>(
+            num_thread, offset, p, psi, psip0, rho);
+}
+
+// Copies the device temperature field back to the host: dataBase_.d_T into h_T
+// and dataBase_.d_boundary_T into h_boundary_T. Both copies are queued
+// asynchronously on dataBase_.stream, then sync() is called before returning.
+void dfThermo::updateCPUT(double *h_T, double *h_boundary_T)
+{
+    // internal field
+    checkCudaErrors(cudaMemcpyAsync(h_T, dataBase_.d_T,
+            dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream));
+    // boundary field
+    checkCudaErrors(cudaMemcpyAsync(h_boundary_T, dataBase_.d_boundary_T,
+            dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream));
+
+    sync();
+}
+
+// Debug helper: copies the device temperature field (dataBase_.d_T and
+// dataBase_.d_boundary_T) into temporary host buffers and checks it against
+// the reference arrays with checkVectorEqual.
+//   T           reference internal field, dataBase_.num_cells values
+//   boundary_T  reference boundary field, dataBase_.num_boundary_surfaces values
+//   printFlag   forwarded to checkVectorEqual
+// 1e-10 is passed to checkVectorEqual (presumably the comparison tolerance).
+// NOTE(review): the copies use blocking cudaMemcpy without an explicit stream
+// sync — confirm pending work on `stream` has finished before calling.
+void dfThermo::compareT(const double *T, const double *boundary_T, bool printFlag)
+{
+    double *h_T = new double[dataBase_.num_cells];
+    double *h_boundary_T = new double[dataBase_.num_boundary_surfaces];
+
+    checkCudaErrors(cudaMemcpy(h_T, dataBase_.d_T, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(h_boundary_T, dataBase_.d_boundary_T, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost));
+
+    fprintf(stderr, "check h_T\n");
+    checkVectorEqual(dataBase_.num_cells, T, h_T, 1e-10, printFlag);
+    fprintf(stderr, "check h_boundary_T\n");
+    checkVectorEqual(dataBase_.num_boundary_surfaces, boundary_T, h_boundary_T, 1e-10, printFlag);
+
+    // Allocated with new[], so the array form of delete is required
+    // (scalar delete on an array is undefined behaviour).
+    delete[] h_T;
+    delete[] h_boundary_T;
+}
+
+// Debug helper: copies the device enthalpy field (dataBase_.d_he and
+// dataBase_.d_boundary_he) into temporary host buffers and checks it against
+// the reference arrays with checkVectorEqual.
+//   he           reference internal field, dataBase_.num_cells values
+//   boundary_he  reference boundary field, dataBase_.num_boundary_surfaces values
+//   printFlag    forwarded to checkVectorEqual
+// 1e-10 is passed to checkVectorEqual (presumably the comparison tolerance).
+// NOTE(review): the copies use blocking cudaMemcpy without an explicit stream
+// sync — confirm pending work on `stream` has finished before calling.
+void dfThermo::compareHe(const double *he, const double *boundary_he, bool printFlag)
+{
+    double *h_he = new double[dataBase_.num_cells];
+    double *h_boundary_he = new double[dataBase_.num_boundary_surfaces];
+
+    checkCudaErrors(cudaMemcpy(h_he, dataBase_.d_he, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(h_boundary_he, dataBase_.d_boundary_he, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost));
+
+    fprintf(stderr, "check h_he\n");
+    checkVectorEqual(dataBase_.num_cells, he, h_he, 1e-10, printFlag);
+    fprintf(stderr, "check h_boundary_he\n");
+    checkVectorEqual(dataBase_.num_boundary_surfaces, boundary_he, h_boundary_he, 1e-10, printFlag);
+
+    // Allocated with new[], so the array form of delete is required
+    // (scalar delete on an array is undefined behaviour).
+    delete[] h_he;
+    delete[] h_boundary_he;
+}
+
+// Debug helper: copies the device density field (dataBase_.d_rho and
+// dataBase_.d_boundary_rho) into temporary host buffers and checks it against
+// the reference arrays with checkVectorEqual.
+//   rho           reference internal field, dataBase_.num_cells values
+//   boundary_rho  reference boundary field, dataBase_.num_boundary_surfaces values
+//   printFlag     forwarded to checkVectorEqual
+// 1e-10 is passed to checkVectorEqual (presumably the comparison tolerance).
+// NOTE(review): the copies use blocking cudaMemcpy without an explicit stream
+// sync — confirm pending work on `stream` has finished before calling.
+void dfThermo::compareRho(const double *rho, const double *boundary_rho, bool printFlag)
+{
+    double *h_rho = new double[dataBase_.num_cells];
+    double *h_boundary_rho = new double[dataBase_.num_boundary_surfaces];
+
+    checkCudaErrors(cudaMemcpy(h_rho, dataBase_.d_rho, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(h_boundary_rho, dataBase_.d_boundary_rho, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost));
+
+    fprintf(stderr, "check h_thermo_rho\n");
+    checkVectorEqual(dataBase_.num_cells, rho, h_rho, 1e-10, printFlag);
+    fprintf(stderr, "check h_thermo_boundary_rho\n");
+    checkVectorEqual(dataBase_.num_boundary_surfaces, boundary_rho, h_boundary_rho, 1e-10, printFlag);
+
+    // Allocated with new[], so the array form of delete is required
+    // (scalar delete on an array is undefined behaviour).
+    delete[] h_rho;
+    delete[] h_boundary_rho;
+}
+
+// Debug helper: copies the device psi field (dataBase_.d_thermo_psi and
+// dataBase_.d_boundary_thermo_psi) into temporary host buffers and checks it
+// against the reference arrays with checkVectorEqual.
+//   psi           reference internal field, dataBase_.num_cells values
+//   boundary_psi  reference boundary field, dataBase_.num_boundary_surfaces values
+//   printFlag     forwarded to checkVectorEqual
+// 1e-10 is passed to checkVectorEqual (presumably the comparison tolerance).
+// NOTE(review): the copies use blocking cudaMemcpy without an explicit stream
+// sync — confirm pending work on `stream` has finished before calling.
+void dfThermo::comparePsi(const double *psi, const double *boundary_psi, bool printFlag)
+{
+    double *h_psi = new double[dataBase_.num_cells];
+    double *h_boundary_psi = new double[dataBase_.num_boundary_surfaces];
+
+    checkCudaErrors(cudaMemcpy(h_psi, dataBase_.d_thermo_psi, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(h_boundary_psi, dataBase_.d_boundary_thermo_psi, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost));
+
+    fprintf(stderr, "check h_thermo_psi\n");
+    checkVectorEqual(dataBase_.num_cells, psi, h_psi, 1e-10, printFlag);
+    fprintf(stderr, "check h_thermo_boundary_psi\n");
+    checkVectorEqual(dataBase_.num_boundary_surfaces, boundary_psi, h_boundary_psi, 1e-10, printFlag);
+
+    // Allocated with new[], so the array form of delete is required
+    // (scalar delete on an array is undefined behaviour).
+    delete[] h_psi;
+    delete[] h_boundary_psi;
+}
+
+// Debug helper: copies the device viscosity field (dataBase_.d_mu and
+// dataBase_.d_boundary_mu) into temporary host buffers and checks it against
+// the reference arrays with checkVectorEqual.
+//   mu           reference internal field, dataBase_.num_cells values
+//   boundary_mu  reference boundary field, dataBase_.num_boundary_surfaces values
+//   printFlag    forwarded to checkVectorEqual
+// 1e-10 is passed to checkVectorEqual (presumably the comparison tolerance).
+// NOTE(review): the copies use blocking cudaMemcpy without an explicit stream
+// sync — confirm pending work on `stream` has finished before calling.
+void dfThermo::compareMu(const double *mu, const double *boundary_mu, bool printFlag)
+{
+    double *h_mu = new double[dataBase_.num_cells];
+    double *h_boundary_mu = new double[dataBase_.num_boundary_surfaces];
+
+    checkCudaErrors(cudaMemcpy(h_mu, dataBase_.d_mu, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(h_boundary_mu, dataBase_.d_boundary_mu, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost));
+
+    fprintf(stderr, "check h_thermo_mu\n");
+    checkVectorEqual(dataBase_.num_cells, mu, h_mu, 1e-10, printFlag);
+    fprintf(stderr, "check h_thermo_boundary_mu\n");
+    checkVectorEqual(dataBase_.num_boundary_surfaces, boundary_mu, h_boundary_mu, 1e-10, printFlag);
+
+    // Allocated with new[], so the array form of delete is required
+    // (scalar delete on an array is undefined behaviour).
+    delete[] h_mu;
+    delete[] h_boundary_mu;
+}
+
+// Debug helper: copies the device alpha field (dataBase_.d_thermo_alpha and
+// dataBase_.d_boundary_thermo_alpha) into temporary host buffers and checks it
+// against the reference arrays with checkVectorEqual.
+//   alpha           reference internal field, dataBase_.num_cells values
+//   boundary_alpha  reference boundary field, dataBase_.num_boundary_surfaces values
+//   printFlag       forwarded to checkVectorEqual
+// 1e-10 is passed to checkVectorEqual (presumably the comparison tolerance).
+// NOTE(review): the copies use blocking cudaMemcpy without an explicit stream
+// sync — confirm pending work on `stream` has finished before calling.
+void dfThermo::compareAlpha(const double *alpha, const double *boundary_alpha, bool printFlag)
+{
+    double *h_alpha = new double[dataBase_.num_cells];
+    double *h_boundary_alpha = new double[dataBase_.num_boundary_surfaces];
+
+    checkCudaErrors(cudaMemcpy(h_alpha, dataBase_.d_thermo_alpha, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(h_boundary_alpha, dataBase_.d_boundary_thermo_alpha, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost));
+
+    fprintf(stderr, "check h_thermo_alpha\n");
+    checkVectorEqual(dataBase_.num_cells, alpha, h_alpha, 1e-10, printFlag);
+    fprintf(stderr, "check h_thermo_boundary_alpha\n");
+    checkVectorEqual(dataBase_.num_boundary_surfaces, boundary_alpha, h_boundary_alpha, 1e-10, printFlag);
+
+    // Allocated with new[], so the array form of delete is required
+    // (scalar delete on an array is undefined behaviour).
+    delete[] h_alpha;
+    delete[] h_boundary_alpha;
+}
+
+// Debug helper: copies one species' slice of the device rhoD field into
+// temporary host buffers and checks it against the reference arrays with
+// checkVectorEqual. The slice starts at species_index * num_cells in
+// dataBase_.d_thermo_rhoD and at species_index * num_boundary_surfaces in
+// dataBase_.d_boundary_thermo_rhoD.
+//   rhoD           reference internal field, dataBase_.num_cells values
+//   boundary_rhoD  reference boundary field, dataBase_.num_boundary_surfaces values
+//   species_index  which species slice to compare
+//   printFlag      forwarded to checkVectorEqual
+// 1e-10 is passed to checkVectorEqual (presumably the comparison tolerance).
+// NOTE(review): the copies use blocking cudaMemcpy without an explicit stream
+// sync — confirm pending work on `stream` has finished before calling.
+void dfThermo::compareRhoD(const double *rhoD, const double *boundary_rhoD, int species_index, bool printFlag)
+{
+    double *h_rhoD = new double[dataBase_.num_cells];
+    double *h_boundary_rhoD = new double[dataBase_.num_boundary_surfaces];
+
+    checkCudaErrors(cudaMemcpy(h_rhoD, dataBase_.d_thermo_rhoD + species_index * dataBase_.num_cells, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(h_boundary_rhoD, dataBase_.d_boundary_thermo_rhoD + species_index * dataBase_.num_boundary_surfaces, 
+                dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost));
+
+    fprintf(stderr, "check h_thermo_rhoD\n");
+    checkVectorEqual(dataBase_.num_cells, rhoD, h_rhoD, 1e-10, printFlag);
+    fprintf(stderr, "check h_thermo_boundary_rhoD\n");
+    checkVectorEqual(dataBase_.num_boundary_surfaces, boundary_rhoD, h_boundary_rhoD, 1e-10, printFlag);
+
+    // Allocated with new[], so the array form of delete is required
+    // (scalar delete on an array is undefined behaviour).
+    delete[] h_rhoD;
+    delete[] h_boundary_rhoD;
+}
+
+// Overwrites the device enthalpy field with host data: he goes to
+// dataBase_.d_he and boundary_he to dataBase_.d_boundary_he, both via
+// blocking cudaMemcpy.
+void dfThermo::correctHe(const double *he, const double *boundary_he)
+{
+    // internal field
+    checkCudaErrors(cudaMemcpy(dataBase_.d_he, he,
+            dataBase_.cell_value_bytes, cudaMemcpyHostToDevice));
+    // boundary field
+    checkCudaErrors(cudaMemcpy(dataBase_.d_boundary_he, boundary_he,
+            dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice));
+}
+
+// Overwrites the device density field with host data: rho goes to
+// dataBase_.d_rho and boundary_rho to dataBase_.d_boundary_rho, both via
+// blocking cudaMemcpy.
+void dfThermo::correctRho(const double *rho, const double *boundary_rho)
+{
+    // internal field
+    checkCudaErrors(cudaMemcpy(dataBase_.d_rho, rho,
+            dataBase_.cell_value_bytes, cudaMemcpyHostToDevice));
+    // boundary field
+    checkCudaErrors(cudaMemcpy(dataBase_.d_boundary_rho, boundary_rho,
+            dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice));
+}
+
+// Overwrites the device psi field with host data: psi goes to
+// dataBase_.d_thermo_psi and boundary_psi to dataBase_.d_boundary_thermo_psi,
+// both via blocking cudaMemcpy.
+void dfThermo::correctPsi(const double *psi, const double *boundary_psi)
+{
+    // internal field
+    checkCudaErrors(cudaMemcpy(dataBase_.d_thermo_psi, psi,
+            dataBase_.cell_value_bytes, cudaMemcpyHostToDevice));
+    // boundary field
+    checkCudaErrors(cudaMemcpy(dataBase_.d_boundary_thermo_psi, boundary_psi,
+            dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice));
+}
+
+// Overwrites the device viscosity field with host data: mu goes to
+// dataBase_.d_mu and boundary_mu to dataBase_.d_boundary_mu, both via
+// blocking cudaMemcpy.
+void dfThermo::correctMu(const double *mu, const double *boundary_mu)
+{
+    // internal field
+    checkCudaErrors(cudaMemcpy(dataBase_.d_mu, mu,
+            dataBase_.cell_value_bytes, cudaMemcpyHostToDevice));
+    // boundary field
+    checkCudaErrors(cudaMemcpy(dataBase_.d_boundary_mu, boundary_mu,
+            dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice));
+}
+
+// Overwrites the device alpha field with host data: alpha goes to
+// dataBase_.d_thermo_alpha and boundary_alpha to
+// dataBase_.d_boundary_thermo_alpha, both via blocking cudaMemcpy.
+void dfThermo::correctAlpha(const double *alpha, const double *boundary_alpha)
+{
+    // internal field
+    checkCudaErrors(cudaMemcpy(dataBase_.d_thermo_alpha, alpha,
+            dataBase_.cell_value_bytes, cudaMemcpyHostToDevice));
+    // boundary field
+    checkCudaErrors(cudaMemcpy(dataBase_.d_boundary_thermo_alpha, boundary_alpha,
+            dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice));
+}
diff --git a/src_gpu/dfUEqn.H b/src_gpu/dfUEqn.H
index ec739db5e..71f8cede5 100644
--- a/src_gpu/dfUEqn.H
+++ b/src_gpu/dfUEqn.H
@@ -3,60 +3,161 @@
 #include "AmgXSolver.H"
 #include <amgx_c.h>
 #include "dfMatrixDataBase.H"
+#include "dfMatrixOpBase.H"
 
 class dfUEqn
 {
 private:
-    dfMatrixDataBase &dataBase_;
-    cudaStream_t stream;
-    AmgXSolver *UxSolver, *UySolver, *UzSolver = nullptr;
-    int num_iteration;
-
-    // common variables
-    int num_cells, cell_bytes, num_faces, num_surfaces, cell_vec_bytes, csr_value_vec_bytes, num_boundary_cells;
-    int *d_A_csr_row_index, *d_A_csr_diag_index, *d_A_csr_col_index;
-
-    // Matrix variables
-    double *d_A_csr, *d_b, *d_psi, *d_psi_permute, *d_H, *d_H_permute, *d_A;
-    double *h_A_csr, *h_b, *h_psi, *h_H, *h_A = nullptr;
+	dfMatrixDataBase &dataBase_;
 
-    double *d_ueqn_internal_coeffs, *d_ueqn_boundary_coeffs= nullptr;
+    // cuda resource
+    cudaStream_t stream;
+#ifdef USE_GRAPH
+    // one graph for one eqn before using self-developed solver
+    cudaGraph_t graph_pre, graph_post;
+    cudaGraphExec_t graph_instance_pre, graph_instance_post;
+    bool pre_graph_created=false;
+    bool post_graph_created=false;
+#endif
+
+	// constant values -- basic
+	std::string mode_string;
+	std::string setting_path;
+
+	// constant values -- amgx solvers
+	AmgXSolver *UxSolver = nullptr;
+	AmgXSolver *UySolver = nullptr;
+	AmgXSolver *UzSolver = nullptr;
+    int num_iteration = 0;
+
+	// constant fields - internal
+	// none
+
+	// constant fields - boundary
+	std::vector<int> patch_type;
+
+	// non-constant fields - internal
+	// thermophysical fields
+	double *d_nu_eff = nullptr;
+	// computed on CPU, used on GPU, need memcpyh2d - host
+	double *h_nu_eff = nullptr;
+    double *h_A_pEqn = nullptr;
+    double *h_H_pEqn = nullptr;
+	// intermediate fields
+	double *d_grad_u = nullptr;
+    double *d_delta = nullptr;
+
+	double *d_rho_nueff = nullptr;
+	double *d_u_host_order = nullptr;
+    double *d_fvc_output = nullptr; // TODO: no need anymore
+
+	// non-constant fields - boundary
+	// thermophysical fields
+	double *d_boundary_nu_eff = nullptr;
+	// computed on CPU, used on GPU, need memcpyh2d - host
+	double *h_boundary_nu_eff = nullptr;
+	// intermediate fields
+	double *d_boundary_grad_u = nullptr;
+	double *d_boundary_rho_nueff = nullptr;
+    // boundary coeff fields
+	double *d_value_internal_coeffs = nullptr;
+	double *d_value_boundary_coeffs= nullptr;
+	double *d_gradient_internal_coeffs= nullptr;
+	double *d_gradient_boundary_coeffs= nullptr;
+    // intermediate fields
+    double *d_boundary_u_host_order = nullptr;
+
+	// non-constant fields - ldu
+    double *d_ldu = nullptr;
+	double *d_lower = nullptr;
+	double *d_upper = nullptr;
+	double *d_diag = nullptr;
+    double *d_extern = nullptr;
+	double *d_source = nullptr;
+	double *d_internal_coeffs = nullptr;
+	double *d_boundary_coeffs = nullptr;
+    double *d_diag_vector = nullptr;
+    double *d_A_pEqn = nullptr;
+    double *d_H_pEqn = nullptr;
+    double *d_H_pEqn_perm = nullptr;
+
+	// non-constant fields - csr
+	double *d_A = nullptr;
+	double *d_b = nullptr; // TODO: needless
+    double *d_ldu_solve = nullptr;
+    double *d_extern_solve = nullptr;
+	double *d_source_solve = nullptr;
+	double *d_internal_coeffs_solve = nullptr;
+	double *d_boundary_coeffs_solve = nullptr;
+
+    // field pointer map
+    std::unordered_map<std::string, double*> fieldPointerMap;
 
 public:
-    dfUEqn();
-    dfUEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile);
-    ~dfUEqn();
-
-    void checkValue(bool print);
-
-    void fvm_ddt(double *vector_old);
+	// constructor
+    dfUEqn(dfMatrixDataBase &dataBase)
+        : dataBase_(dataBase) {}
 
-    void fvm_div(double *boundary_pressure_init, double *boundary_velocity_init,
-                 double *boundary_nuEff_init, double *boundary_rho_init);
+	// destructor
+	~dfUEqn(){}
 
-    void fvc_grad(double *pressure);
+	// member function
 
-    void fvc_grad_vector();
+    // getter function
+    double* getFieldPointer(const char* fieldAlias, location loc, position pos);
 
-    void dev2T();
+	// initialization
+	void setConstantValues(const std::string &mode_string, const std::string &setting_path); 
+	void setConstantFields(const std::vector<int> patch_type);
+	void createNonConstantFieldsInternal();
+	void createNonConstantFieldsBoundary();
+	void createNonConstantLduAndCsrFields();
+	// dfUEqn has no internal non-constant fields to be init
+	void initNonConstantFieldsInternal(const double *u, const double *boundary_u);
+	void initNonConstantFieldsBoundary();
 
-    void fvc_div_tensor(const double *nuEff);
+    void cleanCudaResources();
 
-    void fvm_laplacian();
+	// run equation
+    void preProcessForRhoEqn(const double *h_rho, const double *h_phi, const double *h_boundary_phi);
+	void preProcess(const double *h_u, const double *h_boundary_u, const double *h_p, const double *h_boundary_p, const double *h_nu_eff, const double *h_boundary_nu_eff);
+	void process();
+	void postProcess();
 
     void A(double *Psi);
-
     void H(double *Psi);
+    void getrAU(cudaStream_t stream, ncclComm_t comm, int num_cells, int num_surfaces, int num_boundary_surfaces, 
+        const int *neighbor_peer, int num_patches, const int *patch_size, const int *patch_type,
+        const int *boundary_cell_face, const double *boundary_delta_coeffs, const double *internal_coeffs, const double *volume, 
+        const double *diag, double *rAU, double *boundary_rAU);
+    void UEqnGetHbyA(cudaStream_t stream, ncclComm_t comm, const int *neighbor_peer, 
+        int num_cells, int num_surfaces, int num_boundary_surfaces, 
+        const int *lowerAddr, const int *upperAddr, const double *volume, const double *u,
+        int num_patches, const int *patch_size, const int *patch_type, const int *patch_type_U,
+        const int *boundary_cell_face, const double *internal_coffs, const double *boundary_coeffs, const double *boundary_weight,
+        const double *lower, const double *upper, const double *source, const double *psi, 
+        const double *rAU, const double *boundary_rAU, const double *boundary_u,
+        const int *cyclicNeighbor, const int *patchSizeOffset,
+        double *HbyA, double *boundary_HbyA);
+    void getHbyA();
+    void getTurbulenceKEpsilon_Smagorinsky(cudaStream_t stream, int num_cells, int num_boundary_surfaces, 
+        const double *grad_U_tsr, const double *volume, double *delta, double *turbulence_k, double *turbulence_epsilon);
+    void correctPsi(double *Psi, double *boundary_psi);
+    void ueqn_ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surface, int num_Nz, 
+        const int* boundary_cell_face, const int *ldu_to_csr_index, const int *diag_to_csr_index,
+        int num_patches, const int *patch_size, const int *patch_type, const double *vf, const double *boundary_vf, 
+        const double *ldu, double *external, const double *source, const double *internal_coeffs, const double *boundary_coeffs,
+        const int *cyclicNeighbor, const int *patchSizeOffset, double *A, double *b);
 
     void solve();
-
     void sync();
 
-    void updatePsi(double *Psi);
-
-    void correctBoundaryConditions();
-
-    void correctPsi(double *Psi);
-
-    void initializeTimeStep();
+// #if defined DEBUG_
+    void compareResult(const double *lower, const double *upper, const double *diag, const double *source, const double *internal_coeffs, const double *boundary_coeffs, 
+    // const double *tmpVal, const double *boundary_val,
+    bool printFlag);
+// #endif 
+    void compareHbyA(const double *HbyA, const double *boundary_HbyA, bool printFlag);
+    void comparerAU(const double *rAU, const double *boundary_rAU, bool printFlag);
+    void compareU(const double *U, const double *boundary_U, bool printFlag);
 };
diff --git a/src_gpu/dfUEqn.cu b/src_gpu/dfUEqn.cu
index 56983e038..4b828e170 100644
--- a/src_gpu/dfUEqn.cu
+++ b/src_gpu/dfUEqn.cu
@@ -1,1481 +1,1112 @@
 #include "dfUEqn.H"
 
-// kernel functions
-__global__ void fvm_ddt_kernel(int num_cells, int num_faces, const double rdelta_t,
-                               const int *csr_row_index, const int *csr_diag_index,
-                               const double *rho_old, const double *rho_new, const double *volume, const double *velocity_old,
-                               const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output, double *psi)
+__global__ void addAveInternaltoDiagUeqn(int num_cells, int num_boundary_surfaces, const int *face2Cells, // one thread per boundary face; num_cells is not used in the body
+        const double *internal_coeffs, double *A_pEqn) // adds the mean of the three internal_coeffs components of each face to A_pEqn[face2Cells[face]]
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_cells)
+    if (index >= num_boundary_surfaces) // threads past the last boundary face do nothing
         return;
 
-    // A_csr has one more element in each row: itself
-    int row_index = csr_row_index[index];
-    int diag_index = csr_diag_index[index];
-
-    int csr_dim = num_cells + num_faces;
-    int csr_index = row_index + diag_index;
-    double ddt_diag = rdelta_t * rho_new[index] * volume[index];
-    A_csr_output[csr_dim * 0 + csr_index] = A_csr_input[csr_dim * 0 + csr_index] + ddt_diag;
-    A_csr_output[csr_dim * 1 + csr_index] = A_csr_input[csr_dim * 1 + csr_index] + ddt_diag;
-    A_csr_output[csr_dim * 2 + csr_index] = A_csr_input[csr_dim * 2 + csr_index] + ddt_diag;
-
-    double ddt_part_term = rdelta_t * rho_old[index] * volume[index];
-    b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] + ddt_part_term * velocity_old[index * 3 + 0];
-    b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] + ddt_part_term * velocity_old[index * 3 + 1];
-    b_output[num_cells * 2 + index] = b_input[num_cells * 2 + index] + ddt_part_term * velocity_old[index * 3 + 2];
-
-    psi[num_cells * 0 + index] = velocity_old[index * 3 + 0];
-    psi[num_cells * 1 + index] = velocity_old[index * 3 + 1];
-    psi[num_cells * 2 + index] = velocity_old[index * 3 + 2];
-}
+    int cellIndex = face2Cells[index]; // cell index looked up for this boundary face
 
-__global__ void fvm_div_internal(int num_cells, int num_faces,
-                                 const int *csr_row_index, const int *csr_diag_index,
-                                 const double *weight, const double *phi,
-                                 const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output)
-{
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_cells)
-        return;
+    double internal_x = internal_coeffs[num_boundary_surfaces * 0 + index]; // component-major layout: component c of face i sits at c * num_boundary_surfaces + i
+    double internal_y = internal_coeffs[num_boundary_surfaces * 1 + index];
+    double internal_z = internal_coeffs[num_boundary_surfaces * 2 + index];
 
-    // A_csr has one more element in each row: itself
-    int row_index = csr_row_index[index];
-    int next_row_index = csr_row_index[index + 1];
-    int diag_index = csr_diag_index[index];
-    int neighbor_offset = csr_row_index[index] - index;
-    int csr_dim = num_cells + num_faces;
+    double ave_internal = (internal_x + internal_y + internal_z) / 3; // arithmetic mean of the x, y and z coefficients
 
-    double div_diag = 0;
-    for (int i = row_index; i < next_row_index; i++)
-    {
-        int inner_index = i - row_index;
-        // lower
-        if (inner_index < diag_index)
-        {
-            int neighbor_index = neighbor_offset + inner_index;
-            double w = weight[neighbor_index];
-            double f = phi[neighbor_index];
-            A_csr_output[csr_dim * 0 + i] = A_csr_input[csr_dim * 0 + i] + (-w) * f;
-            A_csr_output[csr_dim * 1 + i] = A_csr_input[csr_dim * 1 + i] + (-w) * f;
-            A_csr_output[csr_dim * 2 + i] = A_csr_input[csr_dim * 2 + i] + (-w) * f;
-            // lower neighbors contribute to sum of -1
-            div_diag += (w - 1) * f;
-        }
-        // upper
-        if (inner_index > diag_index)
-        {
-            // upper, index - 1, consider of diag
-            int neighbor_index = neighbor_offset + inner_index - 1;
-            double w = weight[neighbor_index];
-            double f = phi[neighbor_index];
-            A_csr_output[csr_dim * 0 + i] = A_csr_input[csr_dim * 0 + i] + (1 - w) * f;
-            A_csr_output[csr_dim * 1 + i] = A_csr_input[csr_dim * 1 + i] + (1 - w) * f;
-            A_csr_output[csr_dim * 2 + i] = A_csr_input[csr_dim * 2 + i] + (1 - w) * f;
-            // upper neighbors contribute to sum of 1
-            div_diag += w * f;
-        }
-    }
-    A_csr_output[csr_dim * 0 + row_index + diag_index] = A_csr_input[csr_dim * 0 + row_index + diag_index] + div_diag; // diag
-    A_csr_output[csr_dim * 1 + row_index + diag_index] = A_csr_input[csr_dim * 1 + row_index + diag_index] + div_diag; // diag
-    A_csr_output[csr_dim * 2 + row_index + diag_index] = A_csr_input[csr_dim * 2 + row_index + diag_index] + div_diag; // diag
+    atomicAdd(&A_pEqn[cellIndex], ave_internal); // accumulate atomically; presumably several boundary faces can map to the same cell - confirm
 }
 
-__global__ void fvm_div_boundary(int num_cells, int num_faces, int num_boundary_cells,
-                                 const int *csr_row_index, const int *csr_diag_index,
-                                 const int *boundary_cell_offset, const int *boundary_cell_id,
-                                 const double *internal_coeffs, const double *boundary_coeffs,
-                                 const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output,
-                                 double *ueqn_internal_coeffs, double *ueqn_boundary_coeffs)
+__global__ void divide_cell_volume_scalar_reverse(int num_cells, const double* volume, double *output) // one thread per cell; overwrites output[i] with the reciprocal of (output[i] / volume[i])
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_boundary_cells)
+    if (index >= num_cells) // threads past the last cell do nothing
         return;
+    
+    double vol = volume[index]; // volume entry for this cell
 
-    int cell_offset = boundary_cell_offset[index];
-    int cell_index = boundary_cell_id[cell_offset];
-    int loop_size = boundary_cell_offset[index + 1] - cell_offset;
-
-    int row_index = csr_row_index[cell_index];
-    int diag_index = csr_diag_index[cell_index];
-    int csr_dim = num_cells + num_faces;
-    int csr_index = row_index + diag_index;
-
-    // construct internalCoeffs & boundaryCoeffs
-    double internal_coeffs_x = 0;
-    double internal_coeffs_y = 0;
-    double internal_coeffs_z = 0;
-    double boundary_coeffs_x = 0;
-    double boundary_coeffs_y = 0;
-    double boundary_coeffs_z = 0;
-    for (int i = 0; i < loop_size; i++)
-    {
-        internal_coeffs_x += internal_coeffs[(cell_offset + i) * 3 + 0];
-        internal_coeffs_y += internal_coeffs[(cell_offset + i) * 3 + 1];
-        internal_coeffs_z += internal_coeffs[(cell_offset + i) * 3 + 2];
-        boundary_coeffs_x += boundary_coeffs[(cell_offset + i) * 3 + 0];
-        boundary_coeffs_y += boundary_coeffs[(cell_offset + i) * 3 + 1];
-        boundary_coeffs_z += boundary_coeffs[(cell_offset + i) * 3 + 2];
-    }
-    ueqn_internal_coeffs[cell_index * 3 + 0] = internal_coeffs_x;
-    ueqn_internal_coeffs[cell_index * 3 + 1] = internal_coeffs_y;
-    ueqn_internal_coeffs[cell_index * 3 + 2] = internal_coeffs_z;
-    ueqn_boundary_coeffs[cell_index * 3 + 0] = boundary_coeffs_x;
-    ueqn_boundary_coeffs[cell_index * 3 + 1] = boundary_coeffs_y;
-    ueqn_boundary_coeffs[cell_index * 3 + 2] = boundary_coeffs_z;
-
-    A_csr_output[csr_dim * 0 + csr_index] = A_csr_input[csr_dim * 0 + csr_index] + internal_coeffs_x;
-    A_csr_output[csr_dim * 1 + csr_index] = A_csr_input[csr_dim * 1 + csr_index] + internal_coeffs_y;
-    A_csr_output[csr_dim * 2 + csr_index] = A_csr_input[csr_dim * 2 + csr_index] + internal_coeffs_z;
-    b_output[num_cells * 0 + cell_index] = b_input[num_cells * 0 + cell_index] + boundary_coeffs_x;
-    b_output[num_cells * 1 + cell_index] = b_input[num_cells * 1 + cell_index] + boundary_coeffs_y;
-    b_output[num_cells * 2 + cell_index] = b_input[num_cells * 2 + cell_index] + boundary_coeffs_z;
+    output[index] = 1/ (output[index] / vol); // in place; there is no guard against output[index] or vol being zero
 }
 
-__global__ void fvc_grad_internal_face(int num_cells,
-                                       const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index,
-                                       const double *face_vector, const double *weight, const double *pressure,
-                                       const double *b_input, double *b_output)
+__global__ void get_calculated_field_boundary(int num_boundary_surfaces, const double* output, // one thread per boundary face
+        const int *face2Cells, double *boundary_output) // boundary_output[face] receives output[face2Cells[face]]
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_cells)
+    if (index >= num_boundary_surfaces) // threads past the last boundary face do nothing
         return;
+    
+    int cellIndex = face2Cells[index]; // cell index looked up for this boundary face
 
-    // A_csr has one more element in each row: itself
-    int row_index = csr_row_index[index];
-    int next_row_index = csr_row_index[index + 1];
-    int diag_index = csr_diag_index[index];
-    int neighbor_offset = csr_row_index[index] - index;
-
-    double own_cell_p = pressure[index];
-    double grad_bx = 0;
-    double grad_by = 0;
-    double grad_bz = 0;
-    double grad_bx_low = 0;
-    double grad_bx_upp = 0;
-    double grad_by_low = 0;
-    double grad_by_upp = 0;
-    double grad_bz_low = 0;
-    double grad_bz_upp = 0;
-    for (int i = row_index; i < next_row_index; i++)
-    {
-        int inner_index = i - row_index;
-        // lower
-        if (inner_index < diag_index)
-        {
-            int neighbor_index = neighbor_offset + inner_index;
-            double w = weight[neighbor_index];
-            double sfx = face_vector[neighbor_index * 3 + 0];
-            double sfy = face_vector[neighbor_index * 3 + 1];
-            double sfz = face_vector[neighbor_index * 3 + 2];
-            int neighbor_cell_id = csr_col_index[row_index + inner_index];
-            double neighbor_cell_p = pressure[neighbor_cell_id];
-            double face_p = (1 - w) * own_cell_p + w * neighbor_cell_p;
-            grad_bx_low -= face_p * sfx;
-            grad_by_low -= face_p * sfy;
-            grad_bz_low -= face_p * sfz;
-        }
-        // upper
-        if (inner_index > diag_index)
-        {
-            int neighbor_index = neighbor_offset + inner_index - 1;
-            double w = weight[neighbor_index];
-            double sfx = face_vector[neighbor_index * 3 + 0];
-            double sfy = face_vector[neighbor_index * 3 + 1];
-            double sfz = face_vector[neighbor_index * 3 + 2];
-            int neighbor_cell_id = csr_col_index[row_index + inner_index];
-            double neighbor_cell_p = pressure[neighbor_cell_id];
-            double face_p = w * own_cell_p + (1 - w) * neighbor_cell_p;
-            grad_bx_upp += face_p * sfx;
-            grad_by_upp += face_p * sfy;
-            grad_bz_upp += face_p * sfz;
-        }
-    }
-    b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] - grad_bx_low - grad_bx_upp;
-    b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] - grad_by_low - grad_by_upp;
-    b_output[num_cells * 2 + index] = b_input[num_cells * 2 + index] - grad_bz_low - grad_bz_upp;
+    boundary_output[index] = output[cellIndex]; // plain copy of the cell value onto the face; each thread writes its own slot
 }
 
-__global__ void fvc_grad_boundary_face(int num_cells, int num_boundary_cells,
-                                       const int *boundary_cell_offset, const int *boundary_cell_id,
-                                       const double *boundary_face_vector, const double *boundary_pressure,
-                                       const double *b_input, double *b_output)
+__global__ void ueqn_addBoundaryDiag(int num_cells, int num_boundary_surfaces, const int *face2Cells, // one thread per boundary face
+        const double *internal_coeffs, const double *psi, double *H_pEqn) // per component c: H_pEqn[c][cell] += (mean(internal) - internal_c) * psi[c][cell]
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_boundary_cells)
+    if (index >= num_boundary_surfaces) // threads past the last boundary face do nothing
         return;
+    
+    // addBoundaryDiag(boundaryDiagCmpt, cmpt); // add internal coeffs
+    // boundaryDiagCmpt.negate();
+    double internal_x = internal_coeffs[num_boundary_surfaces * 0 + index]; // component-major layout, stride num_boundary_surfaces
+    double internal_y = internal_coeffs[num_boundary_surfaces * 1 + index];
+    double internal_z = internal_coeffs[num_boundary_surfaces * 2 + index];
 
-    int cell_offset = boundary_cell_offset[index];
-    int next_cell_offset = boundary_cell_offset[index + 1];
-    int cell_index = boundary_cell_id[cell_offset];
+    // addCmptAvBoundaryDiag(boundaryDiagCmpt);
+    double ave_internal = (internal_x + internal_y + internal_z) / 3; // arithmetic mean of the three components
 
-    // compute boundary gradient
-    double grad_bx = 0;
-    double grad_by = 0;
-    double grad_bz = 0;
-    for (int i = cell_offset; i < next_cell_offset; i++)
-    {
-        double sfx = boundary_face_vector[i * 3 + 0];
-        double sfy = boundary_face_vector[i * 3 + 1];
-        double sfz = boundary_face_vector[i * 3 + 2];
-        double face_p = boundary_pressure[i];
-        grad_bx += face_p * sfx;
-        grad_by += face_p * sfy;
-        grad_bz += face_p * sfz;
-    }
+    int cellIndex = face2Cells[index]; // cell index looked up for this boundary face
 
-    //// correct the boundary gradient
-    // double nx = boundary_face_vector[face_index * 3 + 0] / magSf[face_index];
-    // double ny = boundary_face_vector[face_index * 3 + 1] / magSf[face_index];
-    // double nz = boundary_face_vector[face_index * 3 + 2] / magSf[face_index];
-    // double sn_grad = 0;
-    // double grad_correction = sn_grad * volume[cell_index] - (nx * grad_bx + ny * grad_by + nz * grad_bz);
-    // grad_bx += nx * grad_correction;
-    // grad_by += ny * grad_correction;
-    // grad_bz += nz * grad_correction;
-
-    b_output[num_cells * 0 + cell_index] = b_input[num_cells * 0 + cell_index] - grad_bx;
-    b_output[num_cells * 1 + cell_index] = b_input[num_cells * 1 + cell_index] - grad_by;
-    b_output[num_cells * 2 + cell_index] = b_input[num_cells * 2 + cell_index] - grad_bz;
+    // if (index == 0)
+    // {
+    //     printf("gpu H_pEqn[8680] = %.20e\n", H_pEqn[8680]);
+    // }
+
+    // do not permute H anymore
+    atomicAdd(&H_pEqn[num_cells * 0 + cellIndex], (-internal_x + ave_internal) * psi[num_cells * 0 + cellIndex]); // psi and H_pEqn are component-major with stride num_cells
+    atomicAdd(&H_pEqn[num_cells * 1 + cellIndex], (-internal_y + ave_internal) * psi[num_cells * 1 + cellIndex]);
+    atomicAdd(&H_pEqn[num_cells * 2 + cellIndex], (-internal_z + ave_internal) * psi[num_cells * 2 + cellIndex]);
 }
 
-__global__ void add_fvMatrix_kernel(int num_cells, int num_faces,
-                                    const int *csr_row_index,
-                                    const double *turbSrc_A, const double *turbSrc_b,
-                                    const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output)
+__global__ void ueqn_lduMatrix_H(int num_cells, int num_surfaces, // one thread per face, for faces 0 .. num_surfaces-1
+        const int *lower_index, const int *upper_index, const double *lower, const double *upper,
+        const double *psi, double *H_pEqn) // accumulates the off-diagonal products into H_pEqn for all three components
 {
+    /* Serial reference loop that this kernel carries out with one thread per face:
+    for (label face=0; face<nFaces; face++)
+    {
+        HpsiPtr[uPtr[face]] -= lowerPtr[face]*psiPtr[lPtr[face]];
+        HpsiPtr[lPtr[face]] -= upperPtr[face]*psiPtr[uPtr[face]];
+    }*/
     int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_cells)
+    if (index >= num_surfaces) // threads past the last face do nothing
         return;
+    
+    int l = lower_index[index]; // the two cell indices attached to this face
+    int u = upper_index[index];
 
-    int row_index = csr_row_index[index];
-    int next_row_index = csr_row_index[index + 1];
-    int csr_dim = num_cells + num_faces;
-    double A_entry;
+    atomicAdd(&H_pEqn[num_cells * 0 + u], -lower[index] * psi[num_cells * 0 + l]); // the same scalar lower/upper coefficient is applied to each of the three components
+    atomicAdd(&H_pEqn[num_cells * 1 + u], -lower[index] * psi[num_cells * 1 + l]);
+    atomicAdd(&H_pEqn[num_cells * 2 + u], -lower[index] * psi[num_cells * 2 + l]);
+    atomicAdd(&H_pEqn[num_cells * 0 + l], -upper[index] * psi[num_cells * 0 + u]); // psi and H_pEqn are component-major with stride num_cells
+    atomicAdd(&H_pEqn[num_cells * 1 + l], -upper[index] * psi[num_cells * 1 + u]);
+    atomicAdd(&H_pEqn[num_cells * 2 + l], -upper[index] * psi[num_cells * 2 + u]);
 
-    for (int i = row_index; i < next_row_index; i++)
-    {
-        A_entry = turbSrc_A[i];
-        A_csr_output[csr_dim * 0 + i] = A_csr_input[csr_dim * 0 + i] + A_entry;
-        A_csr_output[csr_dim * 1 + i] = A_csr_input[csr_dim * 1 + i] + A_entry;
-        A_csr_output[csr_dim * 2 + i] = A_csr_input[csr_dim * 2 + i] + A_entry;
-    }
-    b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] + turbSrc_b[index * 3 + 0];
-    b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] + turbSrc_b[index * 3 + 1];
-    b_output[num_cells * 2 + index] = b_input[num_cells * 2 + index] + turbSrc_b[index * 3 + 2];
 }
 
-__global__ void offdiagPermutation(const int num_faces, const int *permedIndex,
-                                   const double *d_phi_init, double *d_phi)
+__global__ void ueqn_addBoundarySrc_unCoupled(int num_cells, int num, int offset, // one thread per face in the range [offset, offset + num); presumably one patch - confirm with the launcher
+        int num_boundary_surfaces, const int *face2Cells, const double *boundary_coeffs, double *H_pEqn) // adds each face's boundary_coeffs components to H_pEqn at the face's cell
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_faces)
+    if (index >= num) // threads past the end of the range do nothing
         return;
 
-    int p = permedIndex[index];
-    d_phi[index] = d_phi_init[p];
-}
+    int start_index = offset + index; // position of this face within the boundary arrays
 
-__global__ void boundaryPermutation(const int num_boundary_faces, const int *bouPermedIndex,
-                                    const double *boundary_pressure_init, const double *boundary_velocity_init,
-                                    double *boundary_pressure, double *boundary_velocity,
-                                    double *boundary_nuEff_init, double *boundary_nuEff,
-                                    double *boundary_rho_init, double *boundary_rho)
-{
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_boundary_faces)
-        return;
+    // boundary coefficients of this face, stored component-major with
+    // stride num_boundary_surfaces; the internal coeffs are not used here
+    double boundary_x = boundary_coeffs[num_boundary_surfaces * 0 + start_index];
+    double boundary_y = boundary_coeffs[num_boundary_surfaces * 1 + start_index];
+    double boundary_z = boundary_coeffs[num_boundary_surfaces * 2 + start_index];
 
-    int p = bouPermedIndex[index];
-    boundary_velocity[3 * index] = boundary_velocity_init[3 * p];
-    boundary_velocity[3 * index + 1] = boundary_velocity_init[3 * p + 1];
-    boundary_velocity[3 * index + 2] = boundary_velocity_init[3 * p + 2];
-    boundary_pressure[index] = boundary_pressure_init[p];
-    boundary_rho[index] = boundary_rho_init[p];
-    boundary_nuEff[index] = boundary_nuEff_init[p];
-}
+    int cellIndex = face2Cells[start_index]; // cell index looked up for this boundary face
 
-__global__ void fvc_grad_vector_internal(int num_cells,
-                                         const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index,
-                                         const double *sf, const double *vf, const double *tlambdas, const double *volume,
-                                         double *grad)
-{
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_cells)
-        return;
+    // do not permute H anymore
+    atomicAdd(&H_pEqn[num_cells * 0 + cellIndex], boundary_x); // H_pEqn is component-major with stride num_cells
+    atomicAdd(&H_pEqn[num_cells * 1 + cellIndex], boundary_y);
+    atomicAdd(&H_pEqn[num_cells * 2 + cellIndex], boundary_z);
 
-    // A_csr has one more element in each row: itself
-    int row_index = csr_row_index[index];
-    int row_elements = csr_row_index[index + 1] - row_index;
-    int diag_index = csr_diag_index[index];
-    int neighbor_offset = csr_row_index[index] - index;
-
-    double own_vf_x = vf[index * 3 + 0];
-    double own_vf_y = vf[index * 3 + 1];
-    double own_vf_z = vf[index * 3 + 2];
-    double grad_xx = 0;
-    double grad_xy = 0;
-    double grad_xz = 0;
-    double grad_yx = 0;
-    double grad_yy = 0;
-    double grad_yz = 0;
-    double grad_zx = 0;
-    double grad_zy = 0;
-    double grad_zz = 0;
-    // lower
-    for (int i = 0; i < diag_index; i++)
-    {
-        int neighbor_index = neighbor_offset + i;
-        int neighbor_cell_id = csr_col_index[row_index + i];
-        double w = tlambdas[neighbor_index];
-        double sf_x = sf[neighbor_index * 3 + 0];
-        double sf_y = sf[neighbor_index * 3 + 1];
-        double sf_z = sf[neighbor_index * 3 + 2];
-        double neighbor_vf_x = vf[neighbor_cell_id * 3 + 0];
-        double neighbor_vf_y = vf[neighbor_cell_id * 3 + 1];
-        double neighbor_vf_z = vf[neighbor_cell_id * 3 + 2];
-        double face_x = (1 - w) * own_vf_x + w * neighbor_vf_x;
-        double face_y = (1 - w) * own_vf_y + w * neighbor_vf_y;
-        double face_z = (1 - w) * own_vf_z + w * neighbor_vf_z;
-        grad_xx -= sf_x * face_x;
-        grad_xy -= sf_x * face_y;
-        grad_xz -= sf_x * face_z;
-        grad_yx -= sf_y * face_x;
-        grad_yy -= sf_y * face_y;
-        grad_yz -= sf_y * face_z;
-        grad_zx -= sf_z * face_x;
-        grad_zy -= sf_z * face_y;
-        grad_zz -= sf_z * face_z;
-    }
-    // upper
-    for (int i = diag_index + 1; i < row_elements; i++)
-    {
-        int neighbor_index = neighbor_offset + i - 1;
-        int neighbor_cell_id = csr_col_index[row_index + i];
-        double w = tlambdas[neighbor_index];
-        double sf_x = sf[neighbor_index * 3 + 0];
-        double sf_y = sf[neighbor_index * 3 + 1];
-        double sf_z = sf[neighbor_index * 3 + 2];
-        double neighbor_vf_x = vf[neighbor_cell_id * 3 + 0];
-        double neighbor_vf_y = vf[neighbor_cell_id * 3 + 1];
-        double neighbor_vf_z = vf[neighbor_cell_id * 3 + 2];
-        double face_x = w * own_vf_x + (1 - w) * neighbor_vf_x;
-        double face_y = w * own_vf_y + (1 - w) * neighbor_vf_y;
-        double face_z = w * own_vf_z + (1 - w) * neighbor_vf_z;
-        grad_xx += sf_x * face_x;
-        grad_xy += sf_x * face_y;
-        grad_xz += sf_x * face_z;
-        grad_yx += sf_y * face_x;
-        grad_yy += sf_y * face_y;
-        grad_yz += sf_y * face_z;
-        grad_zx += sf_z * face_x;
-        grad_zy += sf_z * face_y;
-        grad_zz += sf_z * face_z;
-        // if (index == 0)
-        // {
-        //     printf("grad_xx = %.20lf\n", grad_xx);
-        //     // printf("sf_x = %.20lf\n", sf_x);
-        //     // printf("face_x = %.20lf\n", face_x);
-        // }
-    }
-    double vol = volume[index];
-    grad[index * 9 + 0] = grad_xx / vol;
-    grad[index * 9 + 1] = grad_xy / vol;
-    grad[index * 9 + 2] = grad_xz / vol;
-    grad[index * 9 + 3] = grad_yx / vol;
-    grad[index * 9 + 4] = grad_yy / vol;
-    grad[index * 9 + 5] = grad_yz / vol;
-    grad[index * 9 + 6] = grad_zx / vol;
-    grad[index * 9 + 7] = grad_zy / vol;
-    grad[index * 9 + 8] = grad_zz / vol;
 }
 
-__global__ void fvc_grad_vector_boundary(int num_cells, int num_boundary_cells,
-                                         const int *boundary_cell_offset, const int *boundary_cell_id,
-                                         const double *boundary_sf, const double *boundary_vf, const double *volume,
-                                         double *grad, double *grad_boundary_init)
+__global__ void ueqn_addBoundarySrc_processor(int num_cells, int num, int offset, // one thread per face in the range [offset, offset + num); presumably one processor patch - confirm with the launcher
+        int num_boundary_surfaces, const int *face2Cells, const double *boundary_coeffs, 
+        const double *vf_boundary, double *H_pEqn) // per component c: H_pEqn[c][cell] += boundary_coeffs[c][face] * vf_boundary[c][face]
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_boundary_cells)
+    if (index >= num) // threads past the end of the range do nothing
         return;
 
-    int cell_offset = boundary_cell_offset[index];
-    int next_cell_offset = boundary_cell_offset[index + 1];
-    int cell_index = boundary_cell_id[cell_offset];
-
-    double grad_xx = 0;
-    double grad_xy = 0;
-    double grad_xz = 0;
-    double grad_yx = 0;
-    double grad_yy = 0;
-    double grad_yz = 0;
-    double grad_zx = 0;
-    double grad_zy = 0;
-    double grad_zz = 0;
-    for (int i = cell_offset; i < next_cell_offset; i++)
-    {
-        double sf_x = boundary_sf[i * 3 + 0];
-        double sf_y = boundary_sf[i * 3 + 1];
-        double sf_z = boundary_sf[i * 3 + 2];
-        double vf_x = boundary_vf[i * 3 + 0];
-        double vf_y = boundary_vf[i * 3 + 1];
-        double vf_z = boundary_vf[i * 3 + 2];
-        grad_xx += sf_x * vf_x;
-        grad_xy += sf_x * vf_y;
-        grad_xz += sf_x * vf_z;
-        grad_yx += sf_y * vf_x;
-        grad_yy += sf_y * vf_y;
-        grad_yz += sf_y * vf_z;
-        grad_zx += sf_z * vf_x;
-        grad_zy += sf_z * vf_y;
-        grad_zz += sf_z * vf_z;
-    }
+    int neighbor_start_index = offset + index; // position of this face within the boundary arrays
 
-    double vol = volume[cell_index];
-
-    grad[cell_index * 9 + 0] += grad_xx / vol;
-    grad[cell_index * 9 + 1] += grad_xy / vol;
-    grad[cell_index * 9 + 2] += grad_xz / vol;
-    grad[cell_index * 9 + 3] += grad_yx / vol;
-    grad[cell_index * 9 + 4] += grad_yy / vol;
-    grad[cell_index * 9 + 5] += grad_yz / vol;
-    grad[cell_index * 9 + 6] += grad_zx / vol;
-    grad[cell_index * 9 + 7] += grad_zy / vol;
-    grad[cell_index * 9 + 8] += grad_zz / vol;
-
-    grad_boundary_init[index * 9 + 0] = grad[cell_index * 9 + 0];
-    grad_boundary_init[index * 9 + 1] = grad[cell_index * 9 + 1];
-    grad_boundary_init[index * 9 + 2] = grad[cell_index * 9 + 2];
-    grad_boundary_init[index * 9 + 3] = grad[cell_index * 9 + 3];
-    grad_boundary_init[index * 9 + 4] = grad[cell_index * 9 + 4];
-    grad_boundary_init[index * 9 + 5] = grad[cell_index * 9 + 5];
-    grad_boundary_init[index * 9 + 6] = grad[cell_index * 9 + 6];
-    grad_boundary_init[index * 9 + 7] = grad[cell_index * 9 + 7];
-    grad_boundary_init[index * 9 + 8] = grad[cell_index * 9 + 8];
-    // if (index == 1)
-    // {
-    //     printf("grad[1] = (%lf, %lf, %lf, %lf, %lf, %lf, %lf, %lf, %lf)\n", grad[index * 9 + 0], grad[index * 9 + 1], grad[index * 9 + 2],
-    //             grad[index * 9 + 3], grad[index * 9 + 4], grad[index * 9 + 5], grad[index * 9 + 6], grad[index * 9 + 7], grad[index * 9 + 8]);
-    // }
-}
+    double boundary_x = boundary_coeffs[num_boundary_surfaces * 0 + neighbor_start_index]; // both boundary arrays are component-major with stride num_boundary_surfaces
+    double boundary_y = boundary_coeffs[num_boundary_surfaces * 1 + neighbor_start_index];
+    double boundary_z = boundary_coeffs[num_boundary_surfaces * 2 + neighbor_start_index];
+    double boundary_vf_x = vf_boundary[num_boundary_surfaces * 0 + neighbor_start_index]; // NOTE(review): presumably the neighbour-side field value on this face - confirm
+    double boundary_vf_y = vf_boundary[num_boundary_surfaces * 1 + neighbor_start_index];
+    double boundary_vf_z = vf_boundary[num_boundary_surfaces * 2 + neighbor_start_index];
 
-__global__ void correct_boundary_conditions(int num_boundary_cells,
-                                            const int *boundary_cell_offset, const int *boundary_cell_id,
-                                            const double *boundary_sf, const double *mag_sf,
-                                            double *boundary_grad_init, double *boundary_grad, const double *boundary_deltaCoeffs,
-                                            const double *internal_velocity, const double *boundary_velocity, const int *U_patch_type)
-{
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_boundary_cells)
-        return;
+    int cellIndex = face2Cells[neighbor_start_index]; // cell index looked up for this boundary face
 
-    int cell_offset = boundary_cell_offset[index];
-    int next_cell_offset = boundary_cell_offset[index + 1];
-    int cell_index = boundary_cell_id[cell_offset];
-
-    // initialize boundary_grad
-    double grad_xx = boundary_grad_init[index * 9 + 0];
-    double grad_xy = boundary_grad_init[index * 9 + 1];
-    double grad_xz = boundary_grad_init[index * 9 + 2];
-    double grad_yx = boundary_grad_init[index * 9 + 3];
-    double grad_yy = boundary_grad_init[index * 9 + 4];
-    double grad_yz = boundary_grad_init[index * 9 + 5];
-    double grad_zx = boundary_grad_init[index * 9 + 6];
-    double grad_zy = boundary_grad_init[index * 9 + 7];
-    double grad_zz = boundary_grad_init[index * 9 + 8];
-
-    double internal_U_x = internal_velocity[cell_index * 3 + 0];
-    double internal_U_y = internal_velocity[cell_index * 3 + 1];
-    double internal_U_z = internal_velocity[cell_index * 3 + 2];
-
-    for (int i = cell_offset; i < next_cell_offset; i++)
-    {
-        // OpenFoam code
-        // const vectorField n
-        //     (
-        //      vsf.mesh().Sf().boundaryField()[patchi]
-        //      / vsf.mesh().magSf().boundaryField()[patchi]
-        //     );
-        // gGradbf[patchi] += n *
-        //     (
-        //      vsf.boundaryField()[patchi].snGrad()
-        //      - (n & gGradbf[patchi])
-        //     );
-        // template<class Type> // fixedValue
-        // Foam::tmp<Foam::Field<Type>> Foam::fvPatchField<Type>::snGrad() const
-        // {
-        //     return patch_.deltaCoeffs()*(*this - patchInternalField());
-        // }
-
-        double n_x = boundary_sf[i * 3 + 0] / mag_sf[i];
-        double n_y = boundary_sf[i * 3 + 1] / mag_sf[i];
-        double n_z = boundary_sf[i * 3 + 2] / mag_sf[i];
-
-        double sn_grad_x, sn_grad_y, sn_grad_z;
-        int patchIndex = U_patch_type[i];
-        if (patchIndex == 0) { // zeroGradient
-            sn_grad_x = 0;
-            sn_grad_y = 0;
-            sn_grad_z = 0;
-        } else if (patchIndex == 1) { // fixedValue
-            sn_grad_x = boundary_deltaCoeffs[i] * (boundary_velocity[i * 3 + 0] - internal_velocity[cell_index * 3 + 0]);
-            sn_grad_y = boundary_deltaCoeffs[i] * (boundary_velocity[i * 3 + 1] - internal_velocity[cell_index * 3 + 1]);
-            sn_grad_z = boundary_deltaCoeffs[i] * (boundary_velocity[i * 3 + 2] - internal_velocity[cell_index * 3 + 2]);
-            // if (index == 1)
-            // {
-            //     printf("cell_index = %d\n", cell_index);
-            //     printf("boundary_velocity = %e\n", boundary_velocity[i * 3 + 1]);
-            //     printf("internal_velocity = %e\n", internal_velocity[cell_index * 3 + 0]);
-            // }
-            
-        }
-        // TODO: implement other BCs
-        double grad_correction_x = sn_grad_x - (n_x * grad_xx + n_y * grad_yx + n_z * grad_zx);
-        double grad_correction_y = sn_grad_y - (n_x * grad_xy + n_y * grad_yy + n_z * grad_zy);
-        double grad_correction_z = sn_grad_z - (n_x * grad_xz + n_y * grad_yz + n_z * grad_zz);
-        boundary_grad[i * 9 + 0] = grad_xx + n_x * grad_correction_x;
-        boundary_grad[i * 9 + 1] = grad_xy + n_x * grad_correction_y;
-        boundary_grad[i * 9 + 2] = grad_xz + n_x * grad_correction_z;
-        boundary_grad[i * 9 + 3] = grad_yx + n_y * grad_correction_x;
-        boundary_grad[i * 9 + 4] = grad_yy + n_y * grad_correction_y;
-        boundary_grad[i * 9 + 5] = grad_yz + n_y * grad_correction_z;
-        boundary_grad[i * 9 + 6] = grad_zx + n_z * grad_correction_x;
-        boundary_grad[i * 9 + 7] = grad_zy + n_z * grad_correction_y;
-        boundary_grad[i * 9 + 8] = grad_zz + n_z * grad_correction_z;
-        // if (index == 1)
-        // {
-        //     printf("boundary_grad = (%lf, %lf, %lf, %lf, %lf, %lf, %lf, %lf, %lf)\n", boundary_grad[i * 9 + 0], boundary_grad[i * 9 + 1], boundary_grad[i * 9 + 2],
-        //         boundary_grad[i * 9 + 3], boundary_grad[i * 9 + 4], boundary_grad[i * 9 + 5], boundary_grad[i * 9 + 6], boundary_grad[i * 9 + 7], boundary_grad[i * 9 + 8]);
-        // }
-        
-    }
+    atomicAdd(&H_pEqn[num_cells * 0 + cellIndex], boundary_x * boundary_vf_x); // H_pEqn is component-major with stride num_cells
+    atomicAdd(&H_pEqn[num_cells * 1 + cellIndex], boundary_y * boundary_vf_y);
+    atomicAdd(&H_pEqn[num_cells * 2 + cellIndex], boundary_z * boundary_vf_z);
 }
 
-__global__ void dev2_t_tensor(int num, double *tensor)
+__global__ void ueqn_addBoundarySrc_cyclic(int num_cells, int num, int internal_offset, // Adds boundary_coeffs * vf(paired cell) into H_pEqn; one thread per entry, `num` entries in total.
+        int neighbor_offset, int num_boundary_surfaces, const int *face2Cells, // internal_offset / neighbor_offset select the two sides inside the boundary-indexed arrays.
+        const double *boundary_coeffs, const double *vf, double *H_pEqn) // vf and H_pEqn: component c starts at c * num_cells; boundary_coeffs: component c starts at c * num_boundary_surfaces.
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
     if (index >= num)
         return;
 
-    double t_xx = tensor[index * 9 + 0];
-    double t_xy = tensor[index * 9 + 1];
-    double t_xz = tensor[index * 9 + 2];
-    double t_yx = tensor[index * 9 + 3];
-    double t_yy = tensor[index * 9 + 4];
-    double t_yz = tensor[index * 9 + 5];
-    double t_zx = tensor[index * 9 + 6];
-    double t_zy = tensor[index * 9 + 7];
-    double t_zz = tensor[index * 9 + 8];
-    double trace_coeff = (2. / 3.) * (t_xx + t_yy + t_zz);
-    tensor[index * 9 + 0] = t_xx - trace_coeff;
-    tensor[index * 9 + 1] = t_yx;
-    tensor[index * 9 + 2] = t_zx;
-    tensor[index * 9 + 3] = t_xy;
-    tensor[index * 9 + 4] = t_yy - trace_coeff;
-    tensor[index * 9 + 5] = t_zy;
-    tensor[index * 9 + 6] = t_xz;
-    tensor[index * 9 + 7] = t_yz;
-    tensor[index * 9 + 8] = t_zz - trace_coeff;
-}
-
-__global__ void fvc_div_tensor_internal(int num_cells,
-                                        const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index,
-                                        const double *scalar0, const double *scalar1,
-                                        const double *sf, const double *vf, const double *tlambdas, const double *volume,
-                                        const double sign, const double *b_input, double *b_output)
-{
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_cells)
-        return;
-
-    // A_csr has one more element in each row: itself
-    int row_index = csr_row_index[index];
-    int row_elements = csr_row_index[index + 1] - row_index;
-    int diag_index = csr_diag_index[index];
-    int neighbor_offset = csr_row_index[index] - index;
-
-    double coeff_own = scalar0[index] * scalar1[index];
-
-    double own_vf_xx = vf[index * 9 + 0];
-    double own_vf_xy = vf[index * 9 + 1];
-    double own_vf_xz = vf[index * 9 + 2];
-    double own_vf_yx = vf[index * 9 + 3];
-    double own_vf_yy = vf[index * 9 + 4];
-    double own_vf_yz = vf[index * 9 + 5];
-    double own_vf_zx = vf[index * 9 + 6];
-    double own_vf_zy = vf[index * 9 + 7];
-    double own_vf_zz = vf[index * 9 + 8];
-    double sum_x = 0;
-    double sum_y = 0;
-    double sum_z = 0;
-
-    // lower
-    for (int i = 0; i < diag_index; i++)
-    {
-        int neighbor_index = neighbor_offset + i;
-        int neighbor_cell_id = csr_col_index[row_index + i];
-        double coeff_nei = scalar0[neighbor_cell_id] * scalar1[neighbor_cell_id];
-        double w = tlambdas[neighbor_index];
-        double sf_x = sf[neighbor_index * 3 + 0];
-        double sf_y = sf[neighbor_index * 3 + 1];
-        double sf_z = sf[neighbor_index * 3 + 2];
-        double neighbor_vf_xx = vf[neighbor_cell_id * 9 + 0];
-        double neighbor_vf_xy = vf[neighbor_cell_id * 9 + 1];
-        double neighbor_vf_xz = vf[neighbor_cell_id * 9 + 2];
-        double neighbor_vf_yx = vf[neighbor_cell_id * 9 + 3];
-        double neighbor_vf_yy = vf[neighbor_cell_id * 9 + 4];
-        double neighbor_vf_yz = vf[neighbor_cell_id * 9 + 5];
-        double neighbor_vf_zx = vf[neighbor_cell_id * 9 + 6];
-        double neighbor_vf_zy = vf[neighbor_cell_id * 9 + 7];
-        double neighbor_vf_zz = vf[neighbor_cell_id * 9 + 8];
-        double face_xx = (1 - w) * own_vf_xx * coeff_own + w * neighbor_vf_xx * coeff_nei;
-        double face_xy = (1 - w) * own_vf_xy * coeff_own + w * neighbor_vf_xy * coeff_nei;
-        double face_xz = (1 - w) * own_vf_xz * coeff_own + w * neighbor_vf_xz * coeff_nei;
-        double face_yx = (1 - w) * own_vf_yx * coeff_own + w * neighbor_vf_yx * coeff_nei;
-        double face_yy = (1 - w) * own_vf_yy * coeff_own + w * neighbor_vf_yy * coeff_nei;
-        double face_yz = (1 - w) * own_vf_yz * coeff_own + w * neighbor_vf_yz * coeff_nei;
-        double face_zx = (1 - w) * own_vf_zx * coeff_own + w * neighbor_vf_zx * coeff_nei;
-        double face_zy = (1 - w) * own_vf_zy * coeff_own + w * neighbor_vf_zy * coeff_nei;
-        double face_zz = (1 - w) * own_vf_zz * coeff_own + w * neighbor_vf_zz * coeff_nei;
-        sum_x -= sf_x * face_xx + sf_y * face_yx + sf_z * face_zx;
-        sum_y -= sf_x * face_xy + sf_y * face_yy + sf_z * face_zy;
-        sum_z -= sf_x * face_xz + sf_y * face_yz + sf_z * face_zz;
-    }
-    // upper
-    for (int i = diag_index + 1; i < row_elements; i++)
-    {
-        int neighbor_index = neighbor_offset + i - 1;
-        int neighbor_cell_id = csr_col_index[row_index + i];
-        double coeff_nei = scalar0[neighbor_cell_id] * scalar1[neighbor_cell_id];
-        double w = tlambdas[neighbor_index];
-        double sf_x = sf[neighbor_index * 3 + 0];
-        double sf_y = sf[neighbor_index * 3 + 1];
-        double sf_z = sf[neighbor_index * 3 + 2];
-        double neighbor_vf_xx = vf[neighbor_cell_id * 9 + 0];
-        double neighbor_vf_xy = vf[neighbor_cell_id * 9 + 1];
-        double neighbor_vf_xz = vf[neighbor_cell_id * 9 + 2];
-        double neighbor_vf_yx = vf[neighbor_cell_id * 9 + 3];
-        double neighbor_vf_yy = vf[neighbor_cell_id * 9 + 4];
-        double neighbor_vf_yz = vf[neighbor_cell_id * 9 + 5];
-        double neighbor_vf_zx = vf[neighbor_cell_id * 9 + 6];
-        double neighbor_vf_zy = vf[neighbor_cell_id * 9 + 7];
-        double neighbor_vf_zz = vf[neighbor_cell_id * 9 + 8];
-        double face_xx = w * own_vf_xx * coeff_own + (1 - w) * neighbor_vf_xx * coeff_nei;
-        double face_xy = w * own_vf_xy * coeff_own + (1 - w) * neighbor_vf_xy * coeff_nei;
-        double face_xz = w * own_vf_xz * coeff_own + (1 - w) * neighbor_vf_xz * coeff_nei;
-        double face_yx = w * own_vf_yx * coeff_own + (1 - w) * neighbor_vf_yx * coeff_nei;
-        double face_yy = w * own_vf_yy * coeff_own + (1 - w) * neighbor_vf_yy * coeff_nei;
-        double face_yz = w * own_vf_yz * coeff_own + (1 - w) * neighbor_vf_yz * coeff_nei;
-        double face_zx = w * own_vf_zx * coeff_own + (1 - w) * neighbor_vf_zx * coeff_nei;
-        double face_zy = w * own_vf_zy * coeff_own + (1 - w) * neighbor_vf_zy * coeff_nei;
-        double face_zz = w * own_vf_zz * coeff_own + (1 - w) * neighbor_vf_zz * coeff_nei;
-        sum_x += sf_x * face_xx + sf_y * face_yx + sf_z * face_zx;
-        sum_y += sf_x * face_xy + sf_y * face_yy + sf_z * face_zy;
-        sum_z += sf_x * face_xz + sf_y * face_yz + sf_z * face_zz;
-    }
-    double vol = volume[index];
-    b_output[num_cells * 0 + index] = b_input[num_cells * 0 + index] + sum_x * sign;
-    b_output[num_cells * 1 + index] = b_input[num_cells * 1 + index] + sum_y * sign;
-    b_output[num_cells * 2 + index] = b_input[num_cells * 2 + index] + sum_z * sign;
-}
+    int internal_start_index = internal_offset + index; // this side's slot in face2Cells / boundary_coeffs
+    int neighbor_start_index = neighbor_offset + index; // paired side's slot in face2Cells
 
-__global__ void fvc_div_tensor_boundary(int num_cells, int num_boundary_cells,
-                                        const int *boundary_cell_offset, const int *boundary_cell_id,
-                                        const double *boundary_scalar0, const double *boundary_scalar1,
-                                        const double *boundary_sf, const double *boundary_vf, const double *volume,
-                                        const double sign, const double *b_input, double *b_output)
-{
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_boundary_cells)
-        return;
+    int internal_cellIndex = face2Cells[internal_start_index]; // cell that receives the contribution
+    int neighbor_cellIndex = face2Cells[neighbor_start_index]; // cell whose vf value is read
 
-    int cell_offset = boundary_cell_offset[index];
-    int next_cell_offset = boundary_cell_offset[index + 1];
-    int cell_index = boundary_cell_id[cell_offset];
+    double boundary_x = boundary_coeffs[num_boundary_surfaces * 0 + internal_start_index]; // coefficients are taken from this side's slot
+    double boundary_y = boundary_coeffs[num_boundary_surfaces * 1 + internal_start_index];
+    double boundary_z = boundary_coeffs[num_boundary_surfaces * 2 + internal_start_index];
+    double boundary_vf_x = vf[num_cells * 0 + neighbor_cellIndex]; // field values are taken from the paired cell
+    double boundary_vf_y = vf[num_cells * 1 + neighbor_cellIndex];
+    double boundary_vf_z = vf[num_cells * 2 + neighbor_cellIndex];
 
-    // OpenFoam code
-    // Foam::surfaceInterpolationScheme<Type>::dotInterpolate
-    // if (vf.boundaryField()[pi].coupled())
-    // {
-    //     psf =
-    //         pSf
-    //         & (
-    //                 pLambda*vf.boundaryField()[pi].patchInternalField()
-    //                 + (1.0 - pLambda)*vf.boundaryField()[pi].patchNeighbourField()
-    //           );
-    // }
-    // else
-    // {
-    //     psf = pSf & vf.boundaryField()[pi];
-    // }
-    // tmp<GeometricField<Type, fvPatchField, volMesh>> surfaceIntegrate
-    // forAll(mesh.boundary()[patchi], facei)
-    // {
-    //     ivf[pFaceCells[facei]] += pssf[facei];
-    // }
-    double sum_x = 0;
-    double sum_y = 0;
-    double sum_z = 0;
-    for (int i = cell_offset; i < next_cell_offset; i++)
-    {
-        double sf_x = boundary_sf[i * 3 + 0];
-        double sf_y = boundary_sf[i * 3 + 1];
-        double sf_z = boundary_sf[i * 3 + 2];
-        double face_xx = boundary_vf[i * 9 + 0];
-        double face_xy = boundary_vf[i * 9 + 1];
-        double face_xz = boundary_vf[i * 9 + 2];
-        double face_yx = boundary_vf[i * 9 + 3];
-        double face_yy = boundary_vf[i * 9 + 4];
-        double face_yz = boundary_vf[i * 9 + 5];
-        double face_zx = boundary_vf[i * 9 + 6];
-        double face_zy = boundary_vf[i * 9 + 7];
-        double face_zz = boundary_vf[i * 9 + 8];
-
-        // if not coupled
-        double coeff = boundary_scalar0[i] * boundary_scalar1[i];
-        sum_x += (sf_x * face_xx + sf_y * face_yx + sf_z * face_zx) * coeff;
-        sum_y += (sf_x * face_xy + sf_y * face_yy + sf_z * face_zy) * coeff;
-        sum_z += (sf_x * face_xz + sf_y * face_yz + sf_z * face_zz) * coeff;
-    }
-    double vol = volume[cell_index];
-    b_output[num_cells * 0 + cell_index] = b_input[num_cells * 0 + cell_index] + sum_x * sign;
-    b_output[num_cells * 1 + cell_index] = b_input[num_cells * 1 + cell_index] + sum_y * sign;
-    b_output[num_cells * 2 + cell_index] = b_input[num_cells * 2 + cell_index] + sum_z * sign;
+    atomicAdd(&H_pEqn[num_cells * 0 + internal_cellIndex], boundary_x * boundary_vf_x); // atomic accumulate; NOTE(review): presumably because several entries can map to the same cell - confirm
+    atomicAdd(&H_pEqn[num_cells * 1 + internal_cellIndex], boundary_y * boundary_vf_y);
+    atomicAdd(&H_pEqn[num_cells * 2 + internal_cellIndex], boundary_z * boundary_vf_z);
 }
 
-__global__ void fvm_laplacian_uncorrected_vector_internal(int num_cells, int num_faces,
-                                                          const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index,
-                                                          const double *scalar0, const double *scalar1, const double *weight,
-                                                          const double *magsf, const double *distance,
-                                                          const double sign, const double *A_csr_input, double *A_csr_output)
+__global__ void divide_vol_multi_rAU(int num_cells, const double *rAU, const double *volume, double *output) // In place, per cell and per component: output = output / volume * rAU.
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
     if (index >= num_cells)
         return;
 
-    // A_csr has one more element in each row: itself
-    int row_index = csr_row_index[index];
-    int row_elements = csr_row_index[index + 1] - row_index;
-    int diag_index = csr_diag_index[index];
-    int neighbor_offset = csr_row_index[index] - index;
-    int csr_dim = num_cells + num_faces;
-
-    double own_scalar0 = scalar0[index];
-    double own_scalar1 = scalar1[index];
-    double own_coeff = own_scalar0 * own_scalar1;
-
-    // fvm.upper() = deltaCoeffs.primitiveField()*gammaMagSf.primitiveField();
-    // fvm.negSumDiag();
-    double sum_diag = 0;
-    // lower
-    for (int i = 0; i < diag_index; i++)
-    {
-        int neighbor_index = neighbor_offset + i;
-        int neighbor_cell_id = csr_col_index[i + row_index];
-        double w = weight[neighbor_index];
-        double nei_scalar0 = scalar0[neighbor_cell_id];
-        double nei_scalar1 = scalar1[neighbor_cell_id];
-        double nei_coeff = nei_scalar0 * nei_scalar1;
-        double gamma = w * (nei_coeff - own_coeff) + own_coeff;
-        double gamma_magsf = gamma * magsf[neighbor_index];
-        double coeff = gamma_magsf * distance[neighbor_index];
-        A_csr_output[csr_dim * 0 + row_index + i] = A_csr_input[csr_dim * 0 + row_index + i] + coeff * sign;
-        A_csr_output[csr_dim * 1 + row_index + i] = A_csr_input[csr_dim * 1 + row_index + i] + coeff * sign;
-        A_csr_output[csr_dim * 2 + row_index + i] = A_csr_input[csr_dim * 2 + row_index + i] + coeff * sign;
-
-        sum_diag += (-coeff);
-    }
-    // upper
-    for (int i = diag_index + 1; i < row_elements; i++)
-    {
-        int neighbor_index = neighbor_offset + i - 1;
-        int neighbor_cell_id = csr_col_index[i + row_index];
-        double w = weight[neighbor_index];
-        double nei_scalar0 = scalar0[neighbor_cell_id];
-        double nei_scalar1 = scalar1[neighbor_cell_id];
-        double nei_coeff = nei_scalar0 * nei_scalar1;
-        double gamma = w * (own_coeff - nei_coeff) + nei_coeff;
-        double gamma_magsf = gamma * magsf[neighbor_index];
-        double coeff = gamma_magsf * distance[neighbor_index];
-        A_csr_output[csr_dim * 0 + row_index + i] = A_csr_input[csr_dim * 0 + row_index + i] + coeff * sign;
-        A_csr_output[csr_dim * 1 + row_index + i] = A_csr_input[csr_dim * 1 + row_index + i] + coeff * sign;
-        A_csr_output[csr_dim * 2 + row_index + i] = A_csr_input[csr_dim * 2 + row_index + i] + coeff * sign;
-        sum_diag += (-coeff);
-    }
-    A_csr_output[csr_dim * 0 + row_index + diag_index] = A_csr_input[csr_dim * 0 + row_index + diag_index] + sum_diag * sign; // diag
-    A_csr_output[csr_dim * 1 + row_index + diag_index] = A_csr_input[csr_dim * 1 + row_index + diag_index] + sum_diag * sign; // diag
-    A_csr_output[csr_dim * 2 + row_index + diag_index] = A_csr_input[csr_dim * 2 + row_index + diag_index] + sum_diag * sign; // diag
-}
-
-__global__ void fvm_laplacian_uncorrected_vector_boundary(int num_cells, int num_faces, int num_boundary_cells,
-                                                          const int *csr_row_index, const int *csr_diag_index,
-                                                          const int *boundary_cell_offset, const int *boundary_cell_id,
-                                                          const double *boundary_scalar0, const double *boundary_scalar1,
-                                                          const double *boundary_magsf, const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs,
-                                                          const double sign, const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output,
-                                                          double *ueqn_internal_coeffs, double *ueqn_boundary_coeffs)
-{
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_boundary_cells)
-        return;
-
-    int cell_offset = boundary_cell_offset[index];
-    int next_cell_offset = boundary_cell_offset[index + 1];
-    int cell_index = boundary_cell_id[cell_offset];
-
-    int row_index = csr_row_index[cell_index];
-    int diag_index = csr_diag_index[cell_index];
-    int csr_dim = num_cells + num_faces;
-    int csr_index = row_index + diag_index;
-
-    // OpenFoam code
-    // if (pvf.coupled())
-    // {
-    //     fvm.internalCoeffs()[patchi] =
-    //         pGamma*pvf.gradientInternalCoeffs(pDeltaCoeffs);
-    //     fvm.boundaryCoeffs()[patchi] =
-    //         -pGamma*pvf.gradientBoundaryCoeffs(pDeltaCoeffs);
-    // }
-    // else
-    // {
-    //     fvm.internalCoeffs()[patchi] = pGamma*pvf.gradientInternalCoeffs();
-    //     fvm.boundaryCoeffs()[patchi] = -
-    //         pGamma*pvf.gradientBoundaryCoeffs();
-    // }
-    double internal_coeffs_x = 0;
-    double internal_coeffs_y = 0;
-    double internal_coeffs_z = 0;
-    double boundary_coeffs_x = 0;
-    double boundary_coeffs_y = 0;
-    double boundary_coeffs_z = 0;
-    for (int i = cell_offset; i < next_cell_offset; i++)
-    {
-        double gamma = boundary_scalar0[i] * boundary_scalar1[i];
-        double gamma_magsf = gamma * boundary_magsf[i];
-        internal_coeffs_x += gamma_magsf * gradient_internal_coeffs[i * 3 + 0];
-        internal_coeffs_y += gamma_magsf * gradient_internal_coeffs[i * 3 + 1];
-        internal_coeffs_z += gamma_magsf * gradient_internal_coeffs[i * 3 + 2];
-        boundary_coeffs_x -= gamma_magsf * gradient_boundary_coeffs[i * 3 + 0];
-        boundary_coeffs_y -= gamma_magsf * gradient_boundary_coeffs[i * 3 + 1];
-        boundary_coeffs_z -= gamma_magsf * gradient_boundary_coeffs[i * 3 + 2];
-    }
+    // load this cell's divisor (volume) and multiplier (rAU); both are one value per cell
+    double vol = volume[index];
+    double rAU_scalar = rAU[index];
 
-    ueqn_internal_coeffs[cell_index * 3 + 0] += internal_coeffs_x * sign;
-    ueqn_internal_coeffs[cell_index * 3 + 1] += internal_coeffs_y * sign;
-    ueqn_internal_coeffs[cell_index * 3 + 2] += internal_coeffs_z * sign;
-    ueqn_boundary_coeffs[cell_index * 3 + 0] += boundary_coeffs_x * sign;
-    ueqn_boundary_coeffs[cell_index * 3 + 1] += boundary_coeffs_y * sign;
-    ueqn_boundary_coeffs[cell_index * 3 + 2] += boundary_coeffs_z * sign;
-
-    A_csr_output[csr_dim * 0 + csr_index] = A_csr_input[csr_dim * 0 + csr_index] + internal_coeffs_x * sign;
-    A_csr_output[csr_dim * 1 + csr_index] = A_csr_input[csr_dim * 1 + csr_index] + internal_coeffs_y * sign;
-    A_csr_output[csr_dim * 2 + csr_index] = A_csr_input[csr_dim * 2 + csr_index] + internal_coeffs_z * sign;
-    b_output[num_cells * 0 + cell_index] = b_input[num_cells * 0 + cell_index] + boundary_coeffs_x * sign;
-    b_output[num_cells * 1 + cell_index] = b_input[num_cells * 1 + cell_index] + boundary_coeffs_y * sign;
-    b_output[num_cells * 2 + cell_index] = b_input[num_cells * 2 + cell_index] + boundary_coeffs_z * sign;
+    // rescale the three components in place (component c starts at c * num_cells): divide by volume, then multiply by rAU
+    output[num_cells * 0 + index] = output[num_cells * 0 + index] / vol * rAU_scalar;
+    output[num_cells * 1 + index] = output[num_cells * 1 + index] / vol * rAU_scalar;
+    output[num_cells * 2 + index] = output[num_cells * 2 + index] / vol * rAU_scalar;
 }
 
-__global__ void addBoundaryDiag(int num_cells, int num_boundary_cells,
-                                const int *csr_row_index, const int *csr_diag_index,
-                                const int *boundary_cell_offset, const int *boundary_cell_id,
-                                const double *ueqn_internal_coeffs, const double *ueqn_boundary_coeffs,
-                                const double *psi, double *H)
+__global__ void correctBoundary_HbyA_fixedValueU(int num_boundary_surfaces, int num, int offset, // Copies boundary_vf into boundary_output for the `num` entries starting at `offset`.
+        const double *boundary_vf, double *boundary_output) // both arrays: component c starts at c * num_boundary_surfaces
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_boundary_cells)
+    if (index >= num) // one thread per entry; surplus threads exit
         return;
 
-    int cell_offset = boundary_cell_offset[index];
-    int next_cell_offset = boundary_cell_offset[index + 1];
-    int cell_index = boundary_cell_id[cell_offset];
-
-    // addBoundaryDiag(boundaryDiagCmpt, cmpt); // add internal coeffs
-    // boundaryDiagCmpt.negate();
-    double internal_x = ueqn_internal_coeffs[cell_index * 3 + 0];
-    double internal_y = ueqn_internal_coeffs[cell_index * 3 + 1];
-    double internal_z = ueqn_internal_coeffs[cell_index * 3 + 2];
-
-    // addCmptAvBoundaryDiag(boundaryDiagCmpt);
-    double ave_internal = (internal_x + internal_y + internal_z) / 3;
+    int start_index = offset + index; // position of this entry within the boundary-indexed arrays
 
-    H[num_cells * 0 + cell_index] = (-internal_x + ave_internal) * psi[num_cells * 0 + cell_index];
-    H[num_cells * 1 + cell_index] = (-internal_y + ave_internal) * psi[num_cells * 1 + cell_index];
-    H[num_cells * 2 + cell_index] = (-internal_z + ave_internal) * psi[num_cells * 2 + cell_index];
+    boundary_output[num_boundary_surfaces * 0 + start_index] = boundary_vf[num_boundary_surfaces * 0 + start_index]; // plain copy of each of the three components
+    boundary_output[num_boundary_surfaces * 1 + start_index] = boundary_vf[num_boundary_surfaces * 1 + start_index];
+    boundary_output[num_boundary_surfaces * 2 + start_index] = boundary_vf[num_boundary_surfaces * 2 + start_index];
 }
 
-__global__ void permute_psi_d2h(int num_cells, const double *input, double *output)
+__global__ void ueqn_add_external_entry_kernal(int num, int bou_offset, // Writes the negated boundary coefficient of each of `num` entries into `external`.
+        int external_offset, const double *boundary_coeffs, double *external) // bou_offset / external_offset are the start positions in the source and destination arrays
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_cells)
+    if (index >= num) // one thread per entry; surplus threads exit
         return;
 
-    output[index * 3 + 0] = input[num_cells * 0 + index];
-    output[index * 3 + 1] = input[num_cells * 1 + index];
-    output[index * 3 + 2] = input[num_cells * 2 + index];
+    int bou_start_index = bou_offset + index; // read position in boundary_coeffs
+    int external_start_index = external_offset + index; // write position in external
+    external[external_start_index] = - boundary_coeffs[bou_start_index]; // sign is flipped here; the _processCyclic variant copies without negation
 }
 
-__global__ void permute_psi_h2d(int num_cells, const double *input, double *output)
+__global__ void ueqn_add_external_entry_kernal_processCyclic(int num, int bou_offset, // Copies the boundary coefficient of each of `num` entries into `external`, sign unchanged.
+        int external_offset, const double *boundary_coeffs, double *external) // bou_offset / external_offset are the start positions in the source and destination arrays
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_cells)
+    if (index >= num) // one thread per entry; surplus threads exit
         return;
 
-    output[num_cells * 0 + index] = input[index * 3 + 0];
-    output[num_cells * 1 + index] = input[index * 3 + 1];
-    output[num_cells * 2 + index] = input[index * 3 + 2];
+    int bou_start_index = bou_offset + index; // read position in boundary_coeffs
+    int external_start_index = external_offset + index; // write position in external
+    external[external_start_index] = boundary_coeffs[bou_start_index]; // no negation, unlike ueqn_add_external_entry_kernal
 }
 
-__global__ void lduMatrix_H(int num_cells,
-                            const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index,
-                            const double *volume, const double *psi, const double *A_csr, const double *b,
-                            const double *ueqn_boundary_coeffs, double *H)
+__global__ void ueqn_ldu_to_csr_kernel(int nNz, const int *ldu_to_csr_index, // Fills three consecutive nNz-sized copies of A_csr from a single ldu array via an index map.
+        const double *ldu, double *A_csr) // ldu_to_csr_index[i] is the ldu position that supplies CSR entry i
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_cells)
+    if (index >= nNz) // one thread per CSR entry; surplus threads exit
         return;
-
-    // A_csr has one more element in each row: itself
-    int row_index = csr_row_index[index];
-    int row_elements = csr_row_index[index + 1] - row_index;
-    int diag_index = csr_diag_index[index];
-    int neighbor_offset = csr_row_index[index] - index;
-
-    double APsi_x = 0.;
-    double APsi_y = 0.;
-    double APsi_z = 0.;
-    // lower
-    for (int i = 0; i < diag_index; i++)
-    {
-        int neighbor_cell_id = csr_col_index[i + row_index];
-        APsi_x += A_csr[row_index + i] * psi[num_cells * 0 + neighbor_cell_id];
-        APsi_y += A_csr[row_index + i] * psi[num_cells * 1 + neighbor_cell_id];
-        APsi_z += A_csr[row_index + i] * psi[num_cells * 2 + neighbor_cell_id];
-    }
-    // upper
-    for (int i = diag_index + 1; i < row_elements; i++)
-    {
-        int neighbor_cell_id = csr_col_index[i + row_index];
-        APsi_x += A_csr[row_index + i] * psi[num_cells * 0 + neighbor_cell_id];
-        APsi_y += A_csr[row_index + i] * psi[num_cells * 1 + neighbor_cell_id];
-        APsi_z += A_csr[row_index + i] * psi[num_cells * 2 + neighbor_cell_id];
-    }
-
-    H[num_cells * 0 + index] = H[num_cells * 0 + index] - APsi_x + b[num_cells * 0 + index];
-    H[num_cells * 1 + index] = H[num_cells * 1 + index] - APsi_y + b[num_cells * 1 + index];
-    H[num_cells * 2 + index] = H[num_cells * 2 + index] - APsi_z + b[num_cells * 2 + index];
-
-    double vol = volume[index];
-    H[num_cells * 0 + index] = H[num_cells * 0 + index] / vol;
-    H[num_cells * 1 + index] = H[num_cells * 1 + index] / vol;
-    H[num_cells * 2 + index] = H[num_cells * 2 + index] / vol;
+    // gather the value once, then write it to all three copies
+    int lduIndex = ldu_to_csr_index[index];
+    double csrVal = ldu[lduIndex];
+    A_csr[nNz * 0 + index] = csrVal; // the same value goes into each of the three nNz-sized sections
+    A_csr[nNz * 1 + index] = csrVal;
+    A_csr[nNz * 2 + index] = csrVal;
 }
 
-__global__ void addBoundarySource(int num_cells, int num_boundary_cells,
-                                  const int *csr_row_index, const int *csr_diag_index,
-                                  const int *boundary_cell_offset, const int *boundary_cell_id,
-                                  const double *ueqn_internal_coeffs, const double *ueqn_boundary_coeffs,
-                                  const double *volume, double *H)
+__global__ void ueqn_add_boundary_diag_src_unCouple(int num_cells, int num_Nz, int num_boundary_surfaces, // Adds internal_coeffs to the CSR diagonal and boundary_coeffs to b for `num` entries starting at `offset`.
+        int num, int offset, const int *face2Cells, // face2Cells[entry] gives the cell the entry contributes to
+        const double *internal_coeffs, const double *boundary_coeffs, const int *diagCSRIndex, // diagCSRIndex[cell] gives the position of that cell's diagonal inside one A_csr section
+        double *A_csr, double *b) // A_csr: component c starts at c * num_Nz; b: component c starts at c * num_cells
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_boundary_cells)
+    if (index >= num) // one thread per entry; surplus threads exit
         return;
-
-    int cell_offset = boundary_cell_offset[index];
-    int cell_index = boundary_cell_id[cell_offset];
-
-    double vol = volume[index];
-
-    H[num_cells * 0 + index] = H[num_cells * 0 + index] + ueqn_boundary_coeffs[cell_index * 3 + 0] / vol;
-    H[num_cells * 1 + index] = H[num_cells * 1 + index] + ueqn_boundary_coeffs[cell_index * 3 + 1] / vol;
-    H[num_cells * 2 + index] = H[num_cells * 2 + index] + ueqn_boundary_coeffs[cell_index * 3 + 2] / vol;
+    // resolve the entry, its cell, and the cell's diagonal slot
+    int startIndex = offset + index;
+    int cellIndex = face2Cells[startIndex];
+    int diagIndex = diagCSRIndex[cellIndex];
+
+    double internalCoeffx = internal_coeffs[num_boundary_surfaces * 0 + startIndex]; // coefficient arrays: component c starts at c * num_boundary_surfaces
+    double internalCoeffy = internal_coeffs[num_boundary_surfaces * 1 + startIndex];
+    double internalCoeffz = internal_coeffs[num_boundary_surfaces * 2 + startIndex];
+
+    double boundaryCoeffx = boundary_coeffs[num_boundary_surfaces * 0 + startIndex];
+    double boundaryCoeffy = boundary_coeffs[num_boundary_surfaces * 1 + startIndex];
+    double boundaryCoeffz = boundary_coeffs[num_boundary_surfaces * 2 + startIndex];
+
+    atomicAdd(&A_csr[num_Nz * 0 + diagIndex], internalCoeffx); // atomic accumulate; NOTE(review): presumably because several entries can map to the same cell - confirm
+    atomicAdd(&A_csr[num_Nz * 1 + diagIndex], internalCoeffy);
+    atomicAdd(&A_csr[num_Nz * 2 + diagIndex], internalCoeffz);
+
+    atomicAdd(&b[num_cells * 0 + cellIndex], boundaryCoeffx); // source-term contribution for the same cell
+    atomicAdd(&b[num_cells * 1 + cellIndex], boundaryCoeffy);
+    atomicAdd(&b[num_cells * 2 + cellIndex], boundaryCoeffz);
 }
 
-__global__ void addAveInternaltoDiag(int num_cells, int num_boundary_cells,
-                                     const int *csr_row_index, const int *csr_diag_index,
-                                     const int *boundary_cell_offset, const int *boundary_cell_id,
-                                     const double *ueqn_internal_coeffs, const double *ueqn_boundary_coeffs, double *A)
+__global__ void ueqn_add_boundary_diag_src_couple(int num_cells, int num_Nz, int num_boundary_surfaces, // Adds internal_coeffs to the CSR diagonal for `num` entries starting at `offset`; num_cells is not read in this body.
+        int num, int offset, const int *face2Cells, const double *internal_coeffs, // face2Cells[entry] gives the cell the entry contributes to
+        const int *diagCSRIndex, double *A_csr) // diagCSRIndex[cell] gives the diagonal position; A_csr: component c starts at c * num_Nz
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_boundary_cells)
+    if (index >= num) // one thread per entry; surplus threads exit
         return;
-
-    int cell_offset = boundary_cell_offset[index];
-    int next_cell_offset = boundary_cell_offset[index + 1];
-    int cell_index = boundary_cell_id[cell_offset];
-
-    double internal_x = ueqn_internal_coeffs[cell_index * 3 + 0];
-    double internal_y = ueqn_internal_coeffs[cell_index * 3 + 1];
-    double internal_z = ueqn_internal_coeffs[cell_index * 3 + 2];
-
-    double ave_internal = (internal_x + internal_y + internal_z) / 3;
-
-    A[cell_index] = ave_internal;
+    // resolve the entry, its cell, and the cell's diagonal slot
+    int startIndex = offset + index;
+    int cellIndex = face2Cells[startIndex];
+    int diagIndex = diagCSRIndex[cellIndex];
+
+    double internalCoeffx = internal_coeffs[num_boundary_surfaces * 0 + startIndex]; // internal_coeffs: component c starts at c * num_boundary_surfaces
+    double internalCoeffy = internal_coeffs[num_boundary_surfaces * 1 + startIndex];
+    double internalCoeffz = internal_coeffs[num_boundary_surfaces * 2 + startIndex];
+
+    atomicAdd(&A_csr[num_Nz * 0 + diagIndex], internalCoeffx); // only the diagonal is updated here; unlike the unCouple variant, no source vector is touched
+    atomicAdd(&A_csr[num_Nz * 1 + diagIndex], internalCoeffy);
+    atomicAdd(&A_csr[num_Nz * 2 + diagIndex], internalCoeffz);
 }
 
-__global__ void addDiagDivVolume(int num_cells, const int *csr_row_index,
-                                 const int *csr_diag_index, const double *A_csr, const double *volume,
-                                 double *ueqn_internal_coeffs, const double *A_input, double *A_output)
+__global__ void ueqn_divide_cell_volume_vec(int num_cells, const double* volume, double *output) // In place, per cell: divides each of the three components of output by volume[cell].
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
     if (index >= num_cells)
         return;
-
-    int row_index = csr_row_index[index];
-    int diag_index = csr_diag_index[index];
-    int csr_index = row_index + diag_index;
-
+    // one volume value per cell, shared by all three components
     double vol = volume[index];
 
-    A_output[index] = (A_input[index] + A_csr[csr_index] - ueqn_internal_coeffs[index * 3]) / vol;
+    output[num_cells * 0 + index] = output[num_cells * 0 + index] / vol; // component c starts at c * num_cells
+    output[num_cells * 1 + index] = output[num_cells * 1 + index] / vol;
+    output[num_cells * 2 + index] = output[num_cells * 2 + index] / vol;
 }
 
-__global__ void ueqn_update_BoundaryCoeffs_kernel(int num_boundary_faces, const double *boundary_phi, double *internal_coeffs,
-                                                  double *boundary_coeffs, double *laplac_internal_coeffs,
-                                                  double *laplac_boundary_coeffs, const int *U_patch_type,
-                                                  const double *boundary_velocity, const double *boundary_deltaCoeffs)
+// Per-cell turbulence k for the Smagorinsky model; each thread with index < num_cells handles one cell.
+// grad_U_tsr is read as 9 tensor components stored component-major (component c of cell i at num_cells * c + i).
+// Writes delta[i] = volume[i]^(1/3) and output[i] = s^2, where s = (-b + sqrt(b^2 + 4ac)) / (2a) solves
+// a*s^2 + b*s - c = 0 with a = Ce/delta, b = (2/3)*tr(D), c = 2*Ck*delta*(dev(D):D) and D = 0.5*(T+T^T).
+__global__ void ueqn_calculate_turbulence_k_Smagorinsky(int num_cells,
+        const double *grad_U_tsr, const double *volume, double Ce, double Ck,
+        double *delta, double *output)
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_boundary_faces)
+    if (index >= num_cells)
         return;
-
-    int patchIndex = U_patch_type[index];
-    if (patchIndex == 0) { // zeroGradient
-        double bouPhi = boundary_phi[index];
-        internal_coeffs[index * 3 + 0] = bouPhi * 1.; // valueInternalCoeffs = 1.
-        internal_coeffs[index * 3 + 1] = bouPhi * 1.;
-        internal_coeffs[index * 3 + 2] = bouPhi * 1.;
-        boundary_coeffs[index * 3 + 0] = -bouPhi * 0.; // valueBoundaryCoeffs = 0.
-        boundary_coeffs[index * 3 + 1] = -bouPhi * 0.;
-        boundary_coeffs[index * 3 + 2] = -bouPhi * 0.;
-        laplac_internal_coeffs[index * 3 + 0] = 0.; // gradientInternalCoeffs = 0.
-        laplac_internal_coeffs[index * 3 + 1] = 0.;
-        laplac_internal_coeffs[index * 3 + 2] = 0.;
-        laplac_boundary_coeffs[index * 3 + 0] = 0.; // gradientBoundaryCoeffs = 0.
-        laplac_boundary_coeffs[index * 3 + 1] = 0.;
-        laplac_boundary_coeffs[index * 3 + 2] = 0.;
-    } else if (patchIndex == 1) { // fixedValue
-        double bouDeltaCoeffs = boundary_deltaCoeffs[index];
-        double bouPhi = boundary_phi[index];
-        internal_coeffs[index * 3 + 0] = bouPhi * 0.; // valueInternalCoeffs = 0.
-        internal_coeffs[index * 3 + 1] = bouPhi * 0.;
-        internal_coeffs[index * 3 + 2] = bouPhi * 0.;
-        boundary_coeffs[index * 3 + 0] = -bouPhi * boundary_velocity[index * 3 + 0]; // valueBoundaryCoeffs = boundaryValue
-        boundary_coeffs[index * 3 + 1] = -bouPhi * boundary_velocity[index * 3 + 1];
-        boundary_coeffs[index * 3 + 2] = -bouPhi * boundary_velocity[index * 3 + 2];
-        laplac_internal_coeffs[index * 3 + 0] = -1 * bouDeltaCoeffs; // gradientInternalCoeffs = -1 * boundaryDeltaCoeffs
-        laplac_internal_coeffs[index * 3 + 1] = -1 * bouDeltaCoeffs;
-        laplac_internal_coeffs[index * 3 + 2] = -1 * bouDeltaCoeffs;
-        laplac_boundary_coeffs[index * 3 + 0] = bouDeltaCoeffs * boundary_velocity[index * 3 + 0]; // gradientBoundaryCoeffs = boundaryDeltaCoeffs * boundaryValue
-        laplac_boundary_coeffs[index * 3 + 1] = bouDeltaCoeffs * boundary_velocity[index * 3 + 1];
-        laplac_boundary_coeffs[index * 3 + 2] = bouDeltaCoeffs * boundary_velocity[index * 3 + 2];
-    } else if (patchIndex == 2) { // empty
-        double bouPhi = boundary_phi[index];
-        internal_coeffs[index * 3 + 0] = bouPhi * 0.; // valueInternalCoeffs = 0.
-        internal_coeffs[index * 3 + 1] = bouPhi * 0.;
-        internal_coeffs[index * 3 + 2] = bouPhi * 0.;
-        boundary_coeffs[index * 3 + 0] = -bouPhi * 0.; // valueBoundaryCoeffs = 0.
-        boundary_coeffs[index * 3 + 1] = -bouPhi * 0.;
-        boundary_coeffs[index * 3 + 2] = -bouPhi * 0.;
-        laplac_internal_coeffs[index * 3 + 0] = 0.; // gradientInternalCoeffs = 0.
-        laplac_internal_coeffs[index * 3 + 1] = 0.;
-        laplac_internal_coeffs[index * 3 + 2] = 0.;
-        laplac_boundary_coeffs[index * 3 + 0] = 0.; // gradientBoundaryCoeffs = 0.
-        laplac_boundary_coeffs[index * 3 + 1] = 0.;
-        laplac_boundary_coeffs[index * 3 + 2] = 0.;
-    }
-    // TODO implement coupled conditions
+    // del = volume^(1/3); it is also written to delta[] at the end of the kernel.
+    const double oneThird = 1. / 3.;
+    const double del = pow(volume[index], oneThird);
+
+    // D = 0.5*(T+T^T): only the six independent entries of the symmetrised tensor are formed.
+    double D_xx = grad_U_tsr[num_cells * 0 + index];
+    double D_xy = 0.5 * (grad_U_tsr[num_cells * 1 + index] + grad_U_tsr[num_cells * 3 + index]);
+    double D_xz = 0.5 * (grad_U_tsr[num_cells * 2 + index] + grad_U_tsr[num_cells * 6 + index]);
+    double D_yy = grad_U_tsr[num_cells * 4 + index];
+    double D_yz = 0.5 * (grad_U_tsr[num_cells * 5 + index] + grad_U_tsr[num_cells * 7 + index]);
+    double D_zz = grad_U_tsr[num_cells * 8 + index];
+
+    // dev(D): one third of the trace is removed from the diagonal; off-diagonal entries are unchanged.
+    double trace = D_xx + D_yy + D_zz;
+    double dev_D_xx = D_xx - oneThird * trace;
+    double dev_D_yy = D_yy - oneThird * trace;
+    double dev_D_zz = D_zz - oneThird * trace;
+
+    // Coefficients of a*s^2 + b*s - c = 0; each off-diagonal product appears twice in dev(D):D by symmetry.
+    double a = Ce / del;
+    double b = 2 * oneThird * trace;
+    double c = 2 * Ck * del * (dev_D_xx * D_xx + dev_D_yy * D_yy + dev_D_zz * D_zz
+                                    + D_xy * D_xy * 2 + D_xz * D_xz * 2 + D_yz * D_yz * 2);
+
+    const double sqrt_k = (-b + sqrt(b * b + 4 * a * c)) / (2 * a);
+    output[index] = sqrt_k * sqrt_k;
+    delta[index] = del;
 }
 
-__global__ void ueqn_correct_BoundaryConditions_kernel(int num_cells, int num_boundary_cells,
-                                                       const int *boundary_cell_offset, const int *boundary_cell_id,
-                                                       const double *velocity, double *boundary_velocity, const int *U_patch_type)
+// Smagorinsky epsilon per cell: output[i] = Ce * k[i]^(3/2) / delta[i], with k^(3/2) evaluated as k * sqrt(k).
+__global__ void ueqn_calculate_turbulence_epsilon_Smagorinsky(int num_cells,
+        const double *k, const double *delta, double Ce, double *output)
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_boundary_cells)
+    if (index >= num_cells)
         return;
+    output[index] = Ce * (k[index] * sqrt(k[index])) / delta[index];
+}
 
-    int cell_offset = boundary_cell_offset[index];
-    int next_cell_offset = boundary_cell_offset[index + 1];
-    int cell_index = boundary_cell_id[cell_offset];
+// Stores the database stream and the solver mode/config strings, then creates the Ux, Uy and Uz solvers from them.
+void dfUEqn::setConstantValues(const std::string &mode_string, const std::string &setting_path) {
+  stream = dataBase_.stream;
+  this->mode_string = mode_string;
+  this->setting_path = setting_path;
+  for (auto *solver : {&UxSolver, &UySolver, &UzSolver})
+    *solver = new AmgXSolver(mode_string, setting_path, dataBase_.localRank);
+}
 
-    for (int i = cell_offset; i < next_cell_offset; i++)
-    {
-        int patchIndex = U_patch_type[i];
-        switch (patchIndex)
-        {
-            case 0: // zeroGradient
-            {
-                boundary_velocity[i * 3 + 0] = velocity[cell_index];
-                boundary_velocity[i * 3 + 1] = velocity[num_cells * 1 + cell_index];
-                boundary_velocity[i * 3 + 2] = velocity[num_cells * 2 + cell_index];
-                break;
-            }
-            case 1:
-                break;
-            case 2:
-                break;
-            // TODO implement coupled conditions
+// Stores the per-patch boundary types and appends each patch's starting offset to dataBase_.patchSizeOffset.
+// processor / processorCyclic patches advance the running offset by twice their size, all others by their size.
+void dfUEqn::setConstantFields(const std::vector<int> patch_type) {
+    this->patch_type = patch_type;
+
+    int next_offset = 0;
+    for (int patchi = 0; patchi < dataBase_.num_patches; patchi++) {
+        dataBase_.patchSizeOffset.push_back(next_offset);
+        const int type = patch_type[patchi];
+        if (type != boundaryConditions::processor && type != boundaryConditions::processorCyclic) {
+            next_offset += dataBase_.patch_size[patchi];
+        } else {
+            next_offset += dataBase_.patch_size[patchi] * 2;
         }
     }
 }
 
-// constructor
-dfUEqn::dfUEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile)
-    : dataBase_(dataBase)
-{
-    stream = dataBase_.stream;
-
-    UxSolver = new AmgXSolver(modeStr, cfgFile);
-    UySolver = new AmgXSolver(modeStr, cfgFile);
-    UzSolver = new AmgXSolver(modeStr, cfgFile);
-
-    num_cells = dataBase_.num_cells;
-    cell_bytes = dataBase_.cell_bytes;
-    num_faces = dataBase_.num_faces;
-    cell_vec_bytes = dataBase_.cell_vec_bytes;
-    csr_value_vec_bytes = dataBase_.csr_value_vec_bytes;
-    num_boundary_cells = dataBase_.num_boundary_cells;
-    num_surfaces = dataBase_.num_surfaces;
-
-    d_A_csr_row_index = dataBase_.d_A_csr_row_index;
-    d_A_csr_diag_index = dataBase_.d_A_csr_diag_index;
-    d_A_csr_col_index = dataBase_.d_A_csr_col_index;
-
-    h_A_csr = new double[(num_cells + num_faces) * 3];
-    h_b = new double[num_cells * 3];
-    cudaMallocHost(&h_psi, cell_vec_bytes);
-    cudaMallocHost(&h_H, cell_vec_bytes);
-    cudaMallocHost(&h_A, cell_bytes);
-
-    checkCudaErrors(cudaMalloc((void **)&d_A_csr, csr_value_vec_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_b, cell_vec_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_psi, cell_vec_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_psi_permute, cell_vec_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_H, cell_vec_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_H_permute, cell_vec_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_A, cell_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_ueqn_internal_coeffs, cell_vec_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_ueqn_boundary_coeffs, cell_vec_bytes));
+// Allocates the per-cell buffers owned by the U equation.
+// The device fields inside #ifndef STREAM_ALLOCATOR are allocated here only when that macro is undefined;
+// with STREAM_ALLOCATOR defined, process() allocates the same pointers with cudaMallocAsync instead.
+void dfUEqn::createNonConstantFieldsInternal() {
+#ifndef STREAM_ALLOCATOR
+  checkCudaErrors(cudaMalloc((void**)&d_nu_eff, dataBase_.cell_value_bytes)); // thermophysical field
+  checkCudaErrors(cudaMalloc((void**)&d_grad_u, dataBase_.cell_value_tsr_bytes)); // intermediate fields from here to #endif
+  checkCudaErrors(cudaMalloc((void**)&d_delta, dataBase_.cell_value_bytes));
+  checkCudaErrors(cudaMalloc((void**)&d_rho_nueff, dataBase_.cell_value_bytes));
+  checkCudaErrors(cudaMalloc((void**)&d_fvc_output, dataBase_.cell_value_vec_bytes));
+#endif
+  // Device staging buffer: initNonConstantFieldsInternal() copies u here before permuting it into dataBase_.d_u.
+  checkCudaErrors(cudaMalloc((void**)&d_u_host_order, dataBase_.cell_value_vec_bytes));
+  // Host buffers obtained with cudaMallocHost (original note: computed on CPU, used on GPU, need memcpyh2d).
+  checkCudaErrors(cudaMallocHost((void**)&h_nu_eff , dataBase_.cell_value_bytes));
+  checkCudaErrors(cudaMallocHost((void**)&h_A_pEqn , dataBase_.cell_value_bytes));
+  checkCudaErrors(cudaMallocHost((void**)&h_H_pEqn , dataBase_.cell_value_vec_bytes));
+  // Register h_nu_eff under the key "h_nu_eff" so it can be looked up by name.
+  fieldPointerMap["h_nu_eff"] = h_nu_eff;
+}
+        
+// Allocates the boundary-surface buffers owned by the U equation.
+// The device fields inside #ifndef STREAM_ALLOCATOR are allocated here only when that macro is undefined;
+// with STREAM_ALLOCATOR defined, process() allocates the same pointers with cudaMallocAsync instead.
+void dfUEqn::createNonConstantFieldsBoundary() {
+#ifndef STREAM_ALLOCATOR
+  checkCudaErrors(cudaMalloc((void**)&d_boundary_nu_eff, dataBase_.boundary_surface_value_bytes)); // thermophysical field
+  // intermediate fields
+  checkCudaErrors(cudaMalloc((void**)&d_boundary_grad_u, dataBase_.boundary_surface_value_tsr_bytes));
+  checkCudaErrors(cudaMalloc((void**)&d_boundary_rho_nueff, dataBase_.boundary_surface_value_bytes));
+  // boundary coeff fields: value / gradient, each with an internal and a boundary part (vector-sized)
+  checkCudaErrors(cudaMalloc((void**)&d_value_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes));
+  checkCudaErrors(cudaMalloc((void**)&d_value_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes));
+  checkCudaErrors(cudaMalloc((void**)&d_gradient_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes));
+  checkCudaErrors(cudaMalloc((void**)&d_gradient_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes));
+#endif
+  // Device staging buffer: initNonConstantFieldsInternal() copies boundary_u here before permuting it.
+  checkCudaErrors(cudaMalloc((void**)&d_boundary_u_host_order, dataBase_.boundary_surface_value_vec_bytes));
+  // Host buffer from cudaMallocHost (original note: computed on CPU, used on GPU, need memcpyh2d); registered by name below.
+  checkCudaErrors(cudaMallocHost((void**)&h_boundary_nu_eff, dataBase_.boundary_surface_value_bytes));
+  fieldPointerMap["h_boundary_nu_eff"] = h_boundary_nu_eff;
 }
 
-void dfUEqn::fvm_ddt(double *vector_old)
-{
-    // Copy the host input array in host memory to the device input array in device memory
-    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_velocity_old, vector_old, cell_vec_bytes, cudaMemcpyHostToDevice, stream));
-    size_t threads_per_block = 1024;
-    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    fvm_ddt_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_faces, dataBase_.rdelta_t,
-            d_A_csr_row_index, d_A_csr_diag_index,
-            dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, dataBase_.d_velocity_old, d_A_csr, d_b, d_A_csr, d_b, d_psi);
+void dfUEqn::createNonConstantLduAndCsrFields() { // allocates the LDU coefficient block, source/coefficient arrays and the d_A_pEqn / d_H_pEqn buffers
+  checkCudaErrors(cudaMalloc((void**)&d_ldu, dataBase_.csr_value_bytes)); // single allocation, sub-divided by the four pointers below
+  d_lower = d_ldu; // lower part starts at element 0
+  d_diag = d_ldu + dataBase_.num_surfaces; // diag part starts after num_surfaces lower entries
+  d_upper = d_ldu + dataBase_.num_cells + dataBase_.num_surfaces; // upper part starts after num_cells diag entries
+  d_extern = d_ldu + dataBase_.num_cells + 2 * dataBase_.num_surfaces; // extern part starts after num_surfaces upper entries
+  checkCudaErrors(cudaMalloc((void**)&d_source, dataBase_.cell_value_vec_bytes));
+  checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes));
+  checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes));
+#ifndef STREAM_ALLOCATOR
+  checkCudaErrors(cudaMalloc((void**)&d_A, dataBase_.csr_value_vec_bytes)); // with STREAM_ALLOCATOR defined, process() allocates this group via cudaMallocAsync
+  checkCudaErrors(cudaMalloc((void**)&d_b, dataBase_.cell_value_vec_bytes));
+  checkCudaErrors(cudaMalloc((void**)&d_ldu_solve, dataBase_.csr_value_bytes));
+  d_extern_solve = d_ldu_solve + dataBase_.num_cells + 2 * dataBase_.num_surfaces; // same offset as d_extern, but inside d_ldu_solve
+  checkCudaErrors(cudaMalloc((void**)&d_source_solve, dataBase_.cell_value_vec_bytes));
+  checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs_solve, dataBase_.boundary_surface_value_vec_bytes));
+  checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs_solve, dataBase_.boundary_surface_value_vec_bytes));
+#endif
+  checkCudaErrors(cudaMalloc((void**)&d_A_pEqn, dataBase_.cell_value_bytes)); // TODO: delete redundant variables
+  checkCudaErrors(cudaMalloc((void**)&d_H_pEqn, dataBase_.cell_value_vec_bytes));
+  checkCudaErrors(cudaMalloc((void**)&d_H_pEqn_perm, dataBase_.cell_value_vec_bytes));
 }
 
-void dfUEqn::fvm_div(double *boundary_pressure_init, double *boundary_velocity_init,
-                     double *boundary_nuEff_init, double *boundary_rho_init)
+void dfUEqn::initNonConstantFieldsInternal(const double *u, const double *boundary_u) // uploads u / boundary_u to the staging buffers, then permutes them into dataBase_.d_u / d_boundary_u
 {
-    // copy and permutate boundary variable
-    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_velocity_init, boundary_velocity_init, dataBase_.boundary_face_vec_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_pressure_init, boundary_pressure_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_nuEff_init, boundary_nuEff_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream));
-    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_rho_init, boundary_rho_init, dataBase_.boundary_face_bytes, cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMemcpyAsync(d_u_host_order, u, dataBase_.cell_value_vec_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); // asynchronous H2D copy on dataBase_.stream
+    checkCudaErrors(cudaMemcpyAsync(d_boundary_u_host_order, boundary_u, dataBase_.boundary_surface_value_vec_bytes, cudaMemcpyHostToDevice, dataBase_.stream));
+    permute_vector_h2d(dataBase_.stream, dataBase_.num_cells, d_u_host_order, dataBase_.d_u); // same stream, so it is ordered after the copies above
+    permute_vector_h2d(dataBase_.stream, dataBase_.num_boundary_surfaces, d_boundary_u_host_order, dataBase_.d_boundary_u);
+}
 
-    size_t threads_per_block = 1024;
-    size_t blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block;
-    boundaryPermutation<<<blocks_per_grid, threads_per_block, 0, stream>>>(dataBase_.num_boundary_faces, dataBase_.d_bouPermedIndex, dataBase_.d_boundary_pressure_init,
-            dataBase_.d_boundary_velocity_init, dataBase_.d_boundary_pressure, dataBase_.d_boundary_velocity, 
-            dataBase_.d_boundary_nuEff_init, dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho_init, dataBase_.d_boundary_rho);
+void dfUEqn::initNonConstantFieldsBoundary() {
+    // Currently a no-op: the body holds only the disabled call sketched below, whose argument list was
+    // never completed. process() calls update_boundary_coeffs_vector() itself before assembling the matrix.
+    //   update_boundary_coeffs_vector(dataBase_.stream, dataBase_.num_boundary_surfaces, dataBase_.num_patches,
+    //          dataBase_.patch_size.data(), patch_type.data(), dataBase_.d_boundary_u, dataBase_.d_boundary_delta_coeffs,
+    //          d_value_internal_coeffs, d_value_boundary_coeffs, d_gradient_internal_coeffs, ...);
+}
 
-    // initialize boundary coeffs (must after the update of d_boundary_velocity)
-    threads_per_block = 1024;
-    blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block;
-    ueqn_update_BoundaryCoeffs_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(dataBase_.num_boundary_faces, dataBase_.d_boundary_phi,
-                                                                                         dataBase_.d_internal_coeffs, dataBase_.d_boundary_coeffs,
-                                                                                         dataBase_.d_laplac_internal_coeffs, dataBase_.d_laplac_boundary_coeffs,
-                                                                                         dataBase_.d_boundary_UpatchType, dataBase_.d_boundary_velocity, dataBase_.d_boundary_deltaCoeffs);
+void dfUEqn::cleanCudaResources() { // destroys the CUDA graphs created by process(); the body is empty unless USE_GRAPH is defined
+#ifdef USE_GRAPH
+    if (pre_graph_created) { // graph captured in process() before solve()
+        checkCudaErrors(cudaGraphExecDestroy(graph_instance_pre));
+        checkCudaErrors(cudaGraphDestroy(graph_pre));
+    } // NOTE(review): pre_graph_created is not reset here, so a second call or a later process() would use destroyed handles - confirm this runs only at teardown
+    if (post_graph_created) { // graph captured in process() after solve()
+        checkCudaErrors(cudaGraphExecDestroy(graph_instance_post));
+        checkCudaErrors(cudaGraphDestroy(graph_post));
+    } // NOTE(review): same concern for post_graph_created
+#endif
+}
 
-    blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    fvm_div_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_faces,
-                                                                        d_A_csr_row_index, d_A_csr_diag_index,
-                                                                        dataBase_.d_weight, dataBase_.d_phi, d_A_csr, d_b, d_A_csr, d_b);
-    blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
-    fvm_div_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_faces, num_boundary_cells,
-                                                                        d_A_csr_row_index, d_A_csr_diag_index,
-                                                                        dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
-                                                                        dataBase_.d_internal_coeffs, dataBase_.d_boundary_coeffs, d_A_csr, d_b, d_A_csr, d_b,
-                                                                        d_ueqn_internal_coeffs, d_ueqn_boundary_coeffs);
+void dfUEqn::preProcessForRhoEqn(const double *h_rho, const double *h_phi, const double *h_boundary_phi) { // enqueues host-to-device copies of rho, phi and boundary phi on dataBase_.stream
+  checkCudaErrors(cudaMemcpyAsync(dataBase_.d_rho, h_rho, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream)); // asynchronous: nothing in this function waits for completion
+  checkCudaErrors(cudaMemcpyAsync(dataBase_.d_phi, h_phi, dataBase_.surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream));
+  checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_phi, h_boundary_phi, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream));
 }
 
-void dfUEqn::fvc_grad(double *pressure)
-{
-    // Copy the host input array in host memory to the device input array in device memory
-    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_pressure, pressure, cell_bytes, cudaMemcpyHostToDevice, stream));
+void dfUEqn::preProcess(const double *h_u, const double *h_boundary_u, const double *h_p, const double *h_boundary_p, 
+        const double *h_nu_eff, const double *h_boundary_nu_eff) { // currently a no-op: the body is empty and none of the arguments is read
 
-    // launch cuda kernel
-    size_t threads_per_block = 1024;
-    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    fvc_grad_internal_face<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
-                                                                              d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
-                                                                              dataBase_.d_face_vector, dataBase_.d_weight, dataBase_.d_pressure, d_b, d_b);
-    blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
-    fvc_grad_boundary_face<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_boundary_cells,
-                                                                              dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
-                                                                              dataBase_.d_boundary_face_vector, dataBase_.d_boundary_pressure, d_b, d_b);
 }
 
-void dfUEqn::fvc_grad_vector()
-{
-    size_t threads_per_block = 512;
-    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    fvc_grad_vector_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
-                                                                                d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
-                                                                                dataBase_.d_face_vector, dataBase_.d_velocity_old, dataBase_.d_weight, dataBase_.d_volume, dataBase_.d_grad);
-
-    blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
-    fvc_grad_vector_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_boundary_cells,
-                                                                                dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, dataBase_.d_boundary_face_vector, dataBase_.d_boundary_velocity,
-                                                                                dataBase_.d_volume, dataBase_.d_grad, dataBase_.d_grad_boundary_init);
-
-    correct_boundary_conditions<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_cells,
-                                                                                   dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, dataBase_.d_boundary_face_vector, dataBase_.d_boundary_face,
-                                                                                   dataBase_.d_grad_boundary_init, dataBase_.d_grad_boundary, dataBase_.d_boundary_deltaCoeffs, dataBase_.d_velocity_old,
-                                                                                   dataBase_.d_boundary_velocity, dataBase_.d_boundary_UpatchType);
+void dfUEqn::process() {
+    TICK_INIT_EVENT;
+    TICK_START_EVENT;
+#ifdef USE_GRAPH
+    if(!pre_graph_created) {
+        DEBUG_TRACE;
+        checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal));
+#endif
+
+#ifdef STREAM_ALLOCATOR
+        // thermophysical fields
+        checkCudaErrors(cudaMallocAsync((void**)&d_nu_eff, dataBase_.cell_value_bytes, dataBase_.stream));
+        // intermediate fields
+        checkCudaErrors(cudaMallocAsync((void**)&d_grad_u, dataBase_.cell_value_tsr_bytes, dataBase_.stream));
+        checkCudaErrors(cudaMallocAsync((void**)&d_delta, dataBase_.cell_value_bytes, dataBase_.stream));
+
+        checkCudaErrors(cudaMallocAsync((void**)&d_rho_nueff, dataBase_.cell_value_bytes, dataBase_.stream));
+        checkCudaErrors(cudaMallocAsync((void**)&d_fvc_output, dataBase_.cell_value_vec_bytes, dataBase_.stream));
+
+        // thermophysical fields
+        checkCudaErrors(cudaMallocAsync((void**)&d_boundary_nu_eff, dataBase_.boundary_surface_value_bytes, dataBase_.stream));
+        // intermediate fields
+        checkCudaErrors(cudaMallocAsync((void**)&d_boundary_grad_u, dataBase_.boundary_surface_value_tsr_bytes, dataBase_.stream));
+        checkCudaErrors(cudaMallocAsync((void**)&d_boundary_rho_nueff, dataBase_.boundary_surface_value_bytes, dataBase_.stream));
+        // boundary coeff fields
+        checkCudaErrors(cudaMallocAsync((void**)&d_value_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream));
+        checkCudaErrors(cudaMallocAsync((void**)&d_value_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream));
+        checkCudaErrors(cudaMallocAsync((void**)&d_gradient_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream));
+        checkCudaErrors(cudaMallocAsync((void**)&d_gradient_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream));
+
+        checkCudaErrors(cudaMallocAsync((void**)&d_A, dataBase_.csr_value_vec_bytes, dataBase_.stream));
+        checkCudaErrors(cudaMallocAsync((void**)&d_b, dataBase_.cell_value_vec_bytes, dataBase_.stream));
+        checkCudaErrors(cudaMallocAsync((void**)&d_ldu_solve, dataBase_.csr_value_bytes, dataBase_.stream));
+        d_extern_solve = d_ldu_solve + dataBase_.num_cells + 2 * dataBase_.num_surfaces;
+        checkCudaErrors(cudaMallocAsync((void**)&d_source_solve, dataBase_.cell_value_vec_bytes, dataBase_.stream));
+        checkCudaErrors(cudaMallocAsync((void**)&d_internal_coeffs_solve, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream));
+        checkCudaErrors(cudaMallocAsync((void**)&d_boundary_coeffs_solve, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream));
+
+#endif
+
+        // checkCudaErrors(cudaMemcpyAsync(d_u_host_order, dataBase_.h_u, dataBase_.cell_value_vec_bytes, cudaMemcpyHostToDevice, dataBase_.stream));
+        // checkCudaErrors(cudaMemcpyAsync(d_boundary_u_host_order, dataBase_.h_boundary_u, dataBase_.boundary_surface_value_vec_bytes, cudaMemcpyHostToDevice, dataBase_.stream));
+        // checkCudaErrors(cudaMemcpyAsync(dataBase_.d_p, dataBase_.h_p, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream));
+        // checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_p, dataBase_.h_boundary_p, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream));
+        // checkCudaErrors(cudaMemcpyAsync(d_nu_eff, h_nu_eff, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream));
+        // checkCudaErrors(cudaMemcpyAsync(d_boundary_nu_eff, h_boundary_nu_eff, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream));
+
+        checkCudaErrors(cudaMemsetAsync(d_ldu, 0, dataBase_.csr_value_bytes, dataBase_.stream)); // d_ldu contains d_lower, d_diag, and d_upper
+        checkCudaErrors(cudaMemsetAsync(d_source, 0, dataBase_.cell_value_vec_bytes, dataBase_.stream));
+        checkCudaErrors(cudaMemsetAsync(d_internal_coeffs, 0, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream));
+        checkCudaErrors(cudaMemsetAsync(d_boundary_coeffs, 0, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream));
+        checkCudaErrors(cudaMemsetAsync(d_A, 0, dataBase_.csr_value_vec_bytes, dataBase_.stream));
+        checkCudaErrors(cudaMemsetAsync(d_b, 0, dataBase_.cell_value_vec_bytes, dataBase_.stream));
+        checkCudaErrors(cudaMemsetAsync(d_A_pEqn, 0, dataBase_.cell_value_bytes, dataBase_.stream));
+        checkCudaErrors(cudaMemsetAsync(d_H_pEqn, 0, dataBase_.cell_value_vec_bytes, dataBase_.stream));
+
+        checkCudaErrors(cudaMemsetAsync(d_grad_u, 0, dataBase_.cell_value_tsr_bytes, dataBase_.stream));
+        checkCudaErrors(cudaMemsetAsync(d_boundary_grad_u, 0, dataBase_.boundary_surface_value_tsr_bytes, dataBase_.stream));
+
+        checkCudaErrors(cudaMemsetAsync(d_delta, 0, dataBase_.cell_value_bytes, dataBase_.stream));
+
+        // permute_vector_h2d(dataBase_.stream, dataBase_.num_cells, d_u_host_order, dataBase_.d_u);
+        // permute_vector_h2d(dataBase_.stream, dataBase_.num_boundary_surfaces, d_boundary_u_host_order, dataBase_.d_boundary_u);
+        update_boundary_coeffs_vector(dataBase_.stream, dataBase_.num_boundary_surfaces, dataBase_.num_patches,
+            dataBase_.patch_size.data(), patch_type.data(), dataBase_.d_boundary_u, dataBase_.d_boundary_delta_coeffs, dataBase_.d_boundary_weight,
+            d_value_internal_coeffs, d_value_boundary_coeffs,
+            d_gradient_internal_coeffs, d_gradient_boundary_coeffs);
+        fvm_ddt_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.rdelta_t,
+                dataBase_.d_rho, dataBase_.d_rho_old, dataBase_.d_u, dataBase_.d_volume,
+                d_diag, d_source, 1.);
+        fvm_div_vector(dataBase_.stream, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, 
+                dataBase_.d_owner, dataBase_.d_neighbor,
+                dataBase_.d_phi, dataBase_.d_weight,
+                d_lower, d_upper, d_diag, // end for internal
+                dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(),
+                dataBase_.d_boundary_phi, d_value_internal_coeffs, d_value_boundary_coeffs,
+                d_internal_coeffs, d_boundary_coeffs, 1.);
+        // field_multiply_scalar(dataBase_.stream,
+        //        dataBase_.num_cells, dataBase_.d_rho, d_nu_eff, d_rho_nueff, // end for internal
+        //        dataBase_.num_boundary_surfaces, dataBase_.d_boundary_rho, d_boundary_nu_eff, d_boundary_rho_nueff);
+        fvm_laplacian_vector(dataBase_.stream, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, 
+               dataBase_.d_owner, dataBase_.d_neighbor,
+               dataBase_.d_weight, dataBase_.d_mag_sf, dataBase_.d_delta_coeffs, dataBase_.d_mu,
+               d_lower, d_upper, d_diag, // end for internal
+               dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(),
+               dataBase_.d_boundary_mag_sf, dataBase_.d_boundary_mu,
+               d_gradient_internal_coeffs, d_gradient_boundary_coeffs,
+               d_internal_coeffs, d_boundary_coeffs, -1);
+        fvc_grad_vector(dataBase_.stream, dataBase_.nccl_comm,
+                dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces,
+                dataBase_.neighbProcNo.data(), dataBase_.d_owner, dataBase_.d_neighbor,
+                dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_u, d_grad_u,
+                dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(),
+                dataBase_.d_boundary_face_cell, dataBase_.d_boundary_u, dataBase_.d_boundary_sf, dataBase_.d_boundary_weight, 
+                dataBase_.d_volume, dataBase_.d_boundary_mag_sf, d_boundary_grad_u, dataBase_.cyclicNeighbor.data(),
+                dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_delta_coeffs);
+
+        // **if use turbulence model**
+        // calculate k & epsilon
+        getTurbulenceKEpsilon_Smagorinsky(dataBase_.stream, dataBase_.num_cells, dataBase_.num_boundary_surfaces, d_grad_u, dataBase_.d_volume, 
+                d_delta, dataBase_.d_turbulence_k, dataBase_.d_turbulence_epsilon);
+        // calculate nut
+        // **end use turbulence model**
+
+        scale_dev2T_tensor(dataBase_.stream, dataBase_.num_cells, dataBase_.d_mu, d_grad_u, // end for internal
+                dataBase_.num_boundary_surfaces, dataBase_.d_boundary_mu, d_boundary_grad_u);
+        fvc_div_cell_tensor(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces,
+                dataBase_.d_owner, dataBase_.d_neighbor,
+                dataBase_.d_weight, dataBase_.d_sf, d_grad_u, d_source, // end for internal
+                dataBase_.num_patches, dataBase_.patch_size.data(), dataBase_.patch_type_calculated.data(), dataBase_.d_boundary_weight,
+                dataBase_.d_boundary_face_cell, d_boundary_grad_u, dataBase_.d_boundary_sf, dataBase_.d_volume);
+        
+        checkCudaErrors(cudaMemcpyAsync(d_ldu_solve, d_ldu, dataBase_.csr_value_bytes, cudaMemcpyDeviceToDevice, dataBase_.stream));
+        checkCudaErrors(cudaMemcpyAsync(d_source_solve, d_source, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToDevice, dataBase_.stream));
+        checkCudaErrors(cudaMemcpyAsync(d_internal_coeffs_solve, d_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToDevice, dataBase_.stream));
+        checkCudaErrors(cudaMemcpyAsync(d_boundary_coeffs_solve, d_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToDevice, dataBase_.stream));
+
+        fvc_grad_cell_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces,
+                dataBase_.d_owner, dataBase_.d_neighbor,
+                dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, d_source_solve,
+                dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), dataBase_.d_boundary_weight,
+                dataBase_.d_boundary_face_cell, dataBase_.d_boundary_p, dataBase_.d_boundary_sf, dataBase_.d_volume, -1.);
+        getrAU(dataBase_.stream, dataBase_.nccl_comm, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, 
+                dataBase_.neighbProcNo.data(), dataBase_.num_patches, dataBase_.patch_size.data(), dataBase_.patch_type_extropolated.data(),
+                dataBase_.d_boundary_face_cell, dataBase_.d_boundary_delta_coeffs, d_internal_coeffs, dataBase_.d_volume, d_diag, 
+                dataBase_.d_rAU, dataBase_.d_boundary_rAU);
+#ifndef DEBUG_CHECK_LDU   
+        ueqn_ldu_to_csr(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.num_Nz,
+            dataBase_.d_boundary_face_cell, dataBase_.d_ldu_to_csr_index, dataBase_.d_diag_to_csr_index, 
+            dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), dataBase_.d_u, dataBase_.d_boundary_u,
+            d_ldu_solve, d_extern_solve, d_source_solve, d_internal_coeffs_solve, d_boundary_coeffs_solve, dataBase_.cyclicNeighbor.data(), 
+            dataBase_.patchSizeOffset.data(), d_A, d_b);
+#endif
+#ifdef USE_GRAPH
+        checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_pre));
+        checkCudaErrors(cudaGraphInstantiate(&graph_instance_pre, graph_pre, NULL, NULL, 0));
+        pre_graph_created = true;
+    }
+    DEBUG_TRACE;
+    checkCudaErrors(cudaGraphLaunch(graph_instance_pre, dataBase_.stream));
+#endif
+    TICK_END_EVENT(UEqn assembly);
+
+    TICK_START_EVENT;
+#ifndef DEBUG_CHECK_LDU
+    solve();
+#endif
+    TICK_END_EVENT(UEqn solve);
+
+#ifdef USE_GRAPH
+    if(!post_graph_created) {
+        checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal));
+#endif
+
+        TICK_START_EVENT;
+        correct_boundary_conditions_vector(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), dataBase_.num_boundary_surfaces, 
+                dataBase_.num_cells, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), dataBase_.d_boundary_weight,
+                dataBase_.d_boundary_face_cell, dataBase_.d_u, dataBase_.d_boundary_u, 
+                dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data());
+        vector_half_mag_square(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, dataBase_.d_k, dataBase_.num_boundary_surfaces, 
+                dataBase_.d_boundary_u, dataBase_.d_boundary_k);
+        TICK_END_EVENT(UEqn post process correctBC);
+
+        TICK_START_EVENT;
+#ifdef STREAM_ALLOCATOR
+        // free
+        // thermophysical fields
+        checkCudaErrors(cudaFreeAsync(d_nu_eff, dataBase_.stream));
+        // intermediate fields
+        checkCudaErrors(cudaFreeAsync(d_grad_u, dataBase_.stream));
+        checkCudaErrors(cudaFreeAsync(d_delta, dataBase_.stream));
+        checkCudaErrors(cudaFreeAsync(d_rho_nueff, dataBase_.stream));
+        checkCudaErrors(cudaFreeAsync(d_fvc_output, dataBase_.stream));
+    
+        // thermophysical fields
+        checkCudaErrors(cudaFreeAsync(d_boundary_nu_eff, dataBase_.stream));
+        // intermediate fields
+        checkCudaErrors(cudaFreeAsync(d_boundary_grad_u, dataBase_.stream));
+        checkCudaErrors(cudaFreeAsync(d_boundary_rho_nueff, dataBase_.stream));
+        // boundary coeff fields
+        checkCudaErrors(cudaFreeAsync(d_value_internal_coeffs, dataBase_.stream));
+        checkCudaErrors(cudaFreeAsync(d_value_boundary_coeffs, dataBase_.stream));
+        checkCudaErrors(cudaFreeAsync(d_gradient_internal_coeffs, dataBase_.stream));
+        checkCudaErrors(cudaFreeAsync(d_gradient_boundary_coeffs, dataBase_.stream));
+    
+        checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream));
+        checkCudaErrors(cudaFreeAsync(d_b, dataBase_.stream));
+        checkCudaErrors(cudaFreeAsync(d_ldu_solve, dataBase_.stream));
+        checkCudaErrors(cudaFreeAsync(d_source_solve, dataBase_.stream));
+        checkCudaErrors(cudaFreeAsync(d_internal_coeffs_solve, dataBase_.stream));
+        checkCudaErrors(cudaFreeAsync(d_boundary_coeffs_solve, dataBase_.stream));
+#endif
+        TICK_END_EVENT(UEqn post process free);
+
+#ifdef USE_GRAPH
+        checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_post));
+        checkCudaErrors(cudaGraphInstantiate(&graph_instance_post, graph_post, NULL, NULL, 0));
+        post_graph_created = true;
+    }
+    checkCudaErrors(cudaGraphLaunch(graph_instance_post, dataBase_.stream));
+#endif
+    sync();
 }
 
-void dfUEqn::dev2T()
+void dfUEqn::sync() // Block the calling host thread until every operation queued on dataBase_.stream has completed.
 {
-    size_t threads_per_block = 1024;
-    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    dev2_t_tensor<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, dataBase_.d_grad);
+    checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); // the return code of the synchronize call is passed to checkCudaErrors
+}
 
-    blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block;
-    dev2_t_tensor<<<blocks_per_grid, threads_per_block, 0, stream>>>(dataBase_.num_boundary_faces, dataBase_.d_grad_boundary);
+void dfUEqn::solve() { // Run dataBase_.solve() three times, once per block of the matrix d_A / right-hand side d_b, writing into dataBase_.d_u.
+    dataBase_.solve(num_iteration, AMGXSetting::u_setting, d_A, dataBase_.d_u, d_b); // block 0 (presumably the x velocity component -- TODO confirm)
+    dataBase_.solve(num_iteration, AMGXSetting::u_setting, d_A + dataBase_.num_Nz, dataBase_.d_u + dataBase_.num_cells, d_b + dataBase_.num_cells); // block 1: matrix shifted by num_Nz entries, vectors by num_cells
+    dataBase_.solve(num_iteration, AMGXSetting::u_setting, d_A + 2 * dataBase_.num_Nz, dataBase_.d_u + 2 * dataBase_.num_cells, d_b + 2 * dataBase_.num_cells); // block 2: shifted by twice those amounts
+    num_iteration++; // call counter handed to dataBase_.solve(); NOTE(review): presumably lets the solver detect its first use -- confirm
 }
 
-void dfUEqn::fvc_div_tensor(const double *nuEff)
-{
-    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_nuEff, nuEff, cell_bytes, cudaMemcpyHostToDevice, stream));
-    size_t threads_per_block = 512;
-    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    fvc_div_tensor_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
-                                                                               d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
-                                                                               dataBase_.d_nuEff, dataBase_.d_rho_new, dataBase_.d_face_vector, dataBase_.d_grad, dataBase_.d_weight,
-                                                                               dataBase_.d_volume, 1., d_b, d_b);
-
-    blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
-    fvc_div_tensor_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_boundary_cells,
-                                                                               dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
-                                                                               dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho, dataBase_.d_boundary_face_vector, dataBase_.d_grad_boundary,
-                                                                               dataBase_.d_volume, 1., d_b, d_b);
+void dfUEqn::postProcess() { // Copy the device velocity field dataBase_.d_u back into the host buffer dataBase_.h_u.
+    // postProcess of dfUEqn cannot be moved to the end of dfUEqn::process(),
+    // because dataBase_.d_u is modified again in dfpEqn and only the result of the last change is needed.
+    // Copy u to the host (only the internal field is transferred here; boundary_u is not copied).
+    permute_vector_d2h(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, d_u_host_order); // NOTE(review): presumably reorders the device layout into the host layout -- confirm
+    checkCudaErrors(cudaMemcpyAsync(dataBase_.h_u, d_u_host_order, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); // asynchronous device-to-host copy on the same stream
+    sync(); // wait for the copy so that dataBase_.h_u is complete when this function returns
 }
 
-void dfUEqn::fvm_laplacian()
-{
-    size_t threads_per_block = 1024;
-    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    fvm_laplacian_uncorrected_vector_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_faces,
-                                                                                                 d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index, dataBase_.d_rho_new, dataBase_.d_nuEff, dataBase_.d_weight,
-                                                                                                 dataBase_.d_face, dataBase_.d_deltaCoeffs, -1., d_A_csr, d_A_csr);
-
-    blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
-    fvm_laplacian_uncorrected_vector_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_faces, num_boundary_cells,
-                                                                                                 d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
-                                                                                                 dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho, dataBase_.d_boundary_face, dataBase_.d_laplac_internal_coeffs,
-                                                                                                 dataBase_.d_laplac_boundary_coeffs, -1., d_A_csr, d_b, d_A_csr, d_b, d_ueqn_internal_coeffs, d_ueqn_boundary_coeffs);
+void dfUEqn::A(double *Psi) { // Run fvMtx_A into d_A_pEqn and copy dataBase_.cell_value_bytes bytes of the result into the host buffer Psi.
+    fvMtx_A(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, 
+            dataBase_.d_boundary_face_cell, d_internal_coeffs, dataBase_.d_volume, d_diag, d_A_pEqn);
+    checkCudaErrors(cudaMemcpyAsync(h_A_pEqn, d_A_pEqn, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); // stage the result in the host buffer h_A_pEqn
+    sync(); // the asynchronous copy must finish before h_A_pEqn is read below
+    // TODO: correct boundary conditions (only the internal cell values are returned for now)
+    memcpy(Psi, h_A_pEqn, dataBase_.cell_value_bytes); // Psi must provide room for at least dataBase_.cell_value_bytes bytes
 }
 
-void dfUEqn::A(double *Psi)
+void dfUEqn::getrAU(cudaStream_t stream, ncclComm_t comm, int num_cells, int num_surfaces, int num_boundary_surfaces, // Builds rAU from diag, internal_coeffs and volume on `stream`, then fills boundary_rAU; num_surfaces is not used in this body.
+        const int *neighbor_peer, int num_patches, const int *patch_size, const int *patch_type,
+        const int *boundary_cell_face, const double *boundary_delta_coeffs, const double *internal_coeffs, const double *volume, 
+        const double *diag, double *rAU, double *boundary_rAU) // rAU and boundary_rAU are the outputs; rAU must hold num_cells doubles
 {
-    checkCudaErrors(cudaMemsetAsync(d_A, 0, cell_bytes, stream));
+    checkCudaErrors(cudaMemcpyAsync(rAU, diag, num_cells * sizeof(double), cudaMemcpyDeviceToDevice, stream)); // start from a device-to-device copy of diag
 
     size_t threads_per_block = 1024;
-    size_t blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
-    addAveInternaltoDiag<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_boundary_cells, d_A_csr_row_index, d_A_csr_diag_index,
-                                                                            dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
-                                                                            d_ueqn_internal_coeffs, d_ueqn_boundary_coeffs, d_A);
+    size_t blocks_per_grid = (num_boundary_surfaces + threads_per_block - 1) / threads_per_block; // enough blocks to cover every boundary face
+    addAveInternaltoDiagUeqn<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_boundary_surfaces, boundary_cell_face, 
+            internal_coeffs, rAU); // NOTE(review): by its name, adds averaged internal coefficients onto rAU -- confirm in the kernel
+    
     blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    addDiagDivVolume<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, d_A_csr_row_index, d_A_csr_diag_index, d_A_csr,
-                                                                        dataBase_.d_volume, d_ueqn_internal_coeffs, d_A, d_A);
+    divide_cell_volume_scalar_reverse<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, volume, rAU); // NOTE(review): presumably replaces each value by volume / value -- confirm in the kernel
 
-    checkCudaErrors(cudaMemcpyAsync(h_A, d_A, cell_bytes, cudaMemcpyDeviceToHost, stream));
-    checkCudaErrors(cudaStreamSynchronize(stream));
-    memcpy(Psi, h_A, cell_bytes);
+    correct_boundary_conditions_scalar(stream, comm, neighbor_peer, num_boundary_surfaces, num_patches, patch_size, patch_type, boundary_delta_coeffs,
+            boundary_cell_face, rAU, boundary_rAU, dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_weight); // cyclic and weight data are taken from dataBase_ here, not from the parameter list
 }
 
-void dfUEqn::H(double *Psi)
+void dfUEqn::getTurbulenceKEpsilon_Smagorinsky(cudaStream_t stream, int num_cells, int num_boundary_surfaces, // Launches the k kernel and then the epsilon kernel on `stream`; num_boundary_surfaces is not used in this body.
+        const double *grad_U_tsr, const double *volume, 
+        double *delta, double *turbulence_k, double *turbulence_epsilon) // delta, turbulence_k and turbulence_epsilon are passed as writable buffers
 {
-    checkCudaErrors(cudaMemsetAsync(d_H, 0, cell_bytes * 3, stream));
     size_t threads_per_block = 1024;
-    size_t blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
-    addBoundaryDiag<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_boundary_cells, d_A_csr_row_index, d_A_csr_diag_index,
-                                                                       dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
-                                                                       d_ueqn_internal_coeffs, d_ueqn_boundary_coeffs,
-                                                                       d_psi, d_H);
+    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; // enough blocks to cover every cell; reused for both launches
+    ueqn_calculate_turbulence_k_Smagorinsky<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, grad_U_tsr, volume,
+            1.048, 0.094, delta, turbulence_k); // NOTE(review): 1.048 and 0.094 are presumably the Smagorinsky model constants Ce and Ck -- confirm against the kernel signature
+    
+    ueqn_calculate_turbulence_epsilon_Smagorinsky<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, turbulence_k, 
+            delta, 1.048, turbulence_epsilon); // runs after the k kernel on the same stream and takes turbulence_k and delta as arguments
+}
 
-    blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    lduMatrix_H<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
-                                                                   dataBase_.d_volume, d_psi, d_A_csr, d_b, d_ueqn_boundary_coeffs, d_H);
+void dfUEqn::UEqnGetHbyA(cudaStream_t stream, ncclComm_t comm, const int *neighbor_peer, // Assembles HbyA and boundary_HbyA on `stream` from the LDU coefficients, source, psi and rAU.
+        int num_cells, int num_surfaces, int num_boundary_surfaces, 
+        const int *lowerAddr, const int *upperAddr, const double *volume, const double *u,
+        int num_patches, const int *patch_size, const int *patch_type, const int *patch_type_U, // patch_type drives the source/boundary handling, patch_type_U the final fixedValue constraint
+        const int *boundary_cell_face, const double *internal_coffs, const double *boundary_coeffs, const double *boundary_weight,
+        const double *lower, const double *upper, const double *source, const double *psi, 
+        const double *rAU, const double *boundary_rAU, const double *boundary_u,
+        const int *cyclicNeighbor, const int *patchSizeOffset,
+        double *HbyA, double *boundary_HbyA) // outputs; HbyA must hold num_cells * 3 doubles
+{
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_boundary_surfaces + threads_per_block - 1) / threads_per_block;
+    checkCudaErrors(cudaMemcpyAsync(HbyA, source, num_cells * 3 * sizeof(double), cudaMemcpyDeviceToDevice, stream)); // HbyA starts as a copy of the source term
+
+    ueqn_addBoundaryDiag<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_boundary_surfaces, boundary_cell_face, 
+            internal_coffs, psi, HbyA);
+
+    blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block;
+    ueqn_lduMatrix_H<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_surfaces, lowerAddr, upperAddr, 
+            lower, upper, psi, HbyA);
+    
+    int offset = 0; // running start index of the current patch inside the flat boundary arrays
+    for (int i = 0; i < num_patches; i++) {
+        if (patch_size[i] == 0) continue;
+        threads_per_block = 64;
+        blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block;
+        // TODO: just non-coupled patch type now
+        if (patch_type[i] == boundaryConditions::extrapolated) {
+            ueqn_addBoundarySrc_unCoupled<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, patch_size[i], offset, 
+                    num_boundary_surfaces, boundary_cell_face, boundary_coeffs, HbyA);
+        } else if (patch_type[i] == boundaryConditions::processor
+                    || patch_type[i] == boundaryConditions::processorCyclic) {
+            ueqn_addBoundarySrc_processor<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, patch_size[i], offset, 
+                    num_boundary_surfaces, boundary_cell_face, boundary_coeffs, boundary_u, HbyA);
+            offset += patch_size[i] * 2; // processor patches take two slots per face in the boundary arrays
+            continue;
+        } else if (patch_type[i] == boundaryConditions::cyclic) {
+            ueqn_addBoundarySrc_cyclic<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, patch_size[i], offset,
+                    patchSizeOffset[cyclicNeighbor[i]], num_boundary_surfaces, boundary_cell_face, boundary_coeffs, u, HbyA);
+        } else {
+            fprintf(stderr, "%s %d, boundaryConditions other than zeroGradient are not support yet!\n", __FILE__, __LINE__);
+        }
+        offset += patch_size[i];
+    }
 
+    // divide volume and correct boundary conditions
     blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    permute_psi_d2h<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, d_H, d_H_permute);
-
-    checkCudaErrors(cudaMemcpyAsync(h_H, d_H_permute, cell_vec_bytes, cudaMemcpyDeviceToHost, stream));
-    checkCudaErrors(cudaStreamSynchronize(stream));
-
-    memcpy(Psi, h_H, cell_vec_bytes);
+    ueqn_divide_cell_volume_vec<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, volume, HbyA);
+    correct_boundary_conditions_vector(stream, comm, neighbor_peer, num_boundary_surfaces, num_cells, num_patches, 
+            patch_size, patch_type, boundary_weight, boundary_cell_face, HbyA, boundary_HbyA,
+            cyclicNeighbor, patchSizeOffset);
+
+    // multiply by rAU (in place, internal and boundary fields)
+    scalar_field_multiply_vector_field(stream, num_cells, rAU, HbyA, HbyA, num_boundary_surfaces, boundary_rAU, 
+            boundary_HbyA, boundary_HbyA);
+
+    // constrainHbyA
+    offset = 0;
+    for (int i = 0; i < num_patches; i++) {
+        if (patch_size[i] == 0) continue;
+        threads_per_block = 64;
+        blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block;
+        // TODO: just non-coupled patch type now
+        if (patch_type_U[i] == boundaryConditions::fixedValue) {
+            correctBoundary_HbyA_fixedValueU<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_surfaces, patch_size[i], offset, 
+                    boundary_u, boundary_HbyA);
+        }
+        offset += (patch_type[i] == boundaryConditions::processor || patch_type[i] == boundaryConditions::processorCyclic) ? patch_size[i] * 2 : patch_size[i]; // keep the same per-patch stride as the source loop above so later patches are addressed correctly
+    }
 }
 
-void dfUEqn::initializeTimeStep()
+void dfUEqn::getHbyA() // Member-data wrapper: calls UEqnGetHbyA with this equation's coefficients and writes dataBase_.d_HbyA / d_boundary_HbyA.
 {
-    // initialize matrix value
-    checkCudaErrors(cudaMemsetAsync(d_A_csr, 0, csr_value_vec_bytes, stream));
-    checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_vec_bytes, stream));
+    UEqnGetHbyA(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(),
+            dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces,
+            dataBase_.d_owner, dataBase_.d_neighbor, dataBase_.d_volume, dataBase_.d_u, 
+            dataBase_.num_patches, dataBase_.patch_size.data(), dataBase_.patch_type_extropolated.data(), // passed as patch_type
+            patch_type.data(),dataBase_.d_boundary_face_cell, d_internal_coeffs, d_boundary_coeffs, dataBase_.d_boundary_weight, // this equation's own patch_type is passed as patch_type_U
+            d_lower, d_upper, d_source, dataBase_.d_u, dataBase_.d_rAU, dataBase_.d_boundary_rAU, // dataBase_.d_u is passed both as u (above) and as psi (here)
+            dataBase_.d_boundary_u, dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data(),
+            dataBase_.d_HbyA, dataBase_.d_boundary_HbyA); // output buffers
 }
 
-void dfUEqn::checkValue(bool print)
+void dfUEqn::ueqn_ldu_to_csr(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surface, int num_Nz, // Fills the CSR matrix A and right-hand side b on `stream` from the LDU data, source and boundary coefficients.
+        const int* boundary_cell_face, const int *ldu_to_csr_index, const int *diag_to_csr_index,
+        int num_patches, const int *patch_size, const int *patch_type, const double *vf, const double *boundary_vf, // num_surfaces, vf and boundary_vf are not used in this body
+        const double *ldu, double *external, const double *source, const double *internal_coeffs, const double *boundary_coeffs,
+        const int *cyclicNeighbor, const int *patchSizeOffset, double *A, double *b) // cyclicNeighbor and patchSizeOffset are not used in this body; A and b are the outputs
 {
-    checkCudaErrors(cudaMemcpyAsync(h_A_csr, d_A_csr, csr_value_vec_bytes, cudaMemcpyDeviceToHost, stream));
-    checkCudaErrors(cudaMemcpyAsync(h_b, d_b, cell_vec_bytes, cudaMemcpyDeviceToHost, stream));
-
-    // Synchronize stream
-    checkCudaErrors(cudaStreamSynchronize(stream));
-    if (print)
-    {
-        for (int i = 0; i < (num_faces + num_cells); i++)
-            fprintf(stderr, "h_A_csr[%d]: (%.10lf, %.10lf, %.10lf)\n", i, h_A_csr[i], h_A_csr[i + (num_faces + num_cells)], h_A_csr[i + 2 * (num_faces + num_cells)]);
-        for (int i = 0; i < num_cells; i++)
-            fprintf(stderr, "h_b[%d]: (%.10lf, %.10lf, %.10lf)\n", i, h_b[i], h_b[i + num_cells], h_b[i + 2 * num_cells]);
+    // add external entries (coupled patches only) to the ldu data
+    int bou_offset = 0, ext_offset = 0; // bou_offset indexes boundary_coeffs, ext_offset indexes external
+    size_t threads_per_block, blocks_per_grid;
+    for (int i = 0; i < num_patches; i++) {
+        if (patch_size[i] == 0) continue;
+        if (patch_type[i] == boundaryConditions::processor
+            || patch_type[i] == boundaryConditions::processorCyclic) {
+            threads_per_block = 64;
+            blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block;
+            ueqn_add_external_entry_kernal<<<blocks_per_grid, threads_per_block, 0, stream>>>(patch_size[i], bou_offset, 
+                    ext_offset, boundary_coeffs, external);
+            bou_offset += patch_size[i] * 2; // processor patches advance the boundary offset by two slots per face
+            ext_offset += patch_size[i];
+        } else if (patch_type[i] == boundaryConditions::cyclic) {
+            threads_per_block = 64;
+            blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block;
+            ueqn_add_external_entry_kernal<<<blocks_per_grid, threads_per_block, 0, stream>>>(patch_size[i], bou_offset, 
+                    ext_offset, boundary_coeffs, external);
+            bou_offset += patch_size[i];
+            ext_offset += patch_size[i];
+        } else {
+            bou_offset += patch_size[i]; // non-coupled patch: nothing goes into external, only skip its boundary slots
+        }
     }
-
-    char *input_file = "of_output.txt";
-    FILE *fp = fopen(input_file, "rb+");
-    if (fp == NULL)
-    {
-        fprintf(stderr, "Failed to open input file: %s!\n", input_file);
+    
+    // construct csr matrix and RHS vec
+    threads_per_block = 1024;
+    blocks_per_grid = (num_Nz + threads_per_block - 1) / threads_per_block;
+    checkCudaErrors(cudaMemcpyAsync(b, source, num_cells * 3 * sizeof(double), cudaMemcpyDeviceToDevice, stream)); // b starts as a copy of the source (num_cells * 3 doubles)
+    ueqn_ldu_to_csr_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_Nz, ldu_to_csr_index, ldu, A);
+
+    // add boundary coefficients to the source and the diagonal
+    bou_offset = 0;
+    for (int i = 0; i < num_patches; i++) {
+        if (patch_size[i] == 0) continue;
+        threads_per_block = 64;
+        blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block;
+        if (patch_type[i] == boundaryConditions::processor
+            || patch_type[i] == boundaryConditions::processorCyclic) {
+            ueqn_add_boundary_diag_src_couple<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_Nz, num_boundary_surface, 
+                    patch_size[i], bou_offset, boundary_cell_face, internal_coeffs, diag_to_csr_index, A); // coupled patches: only A is passed, b is not
+            bou_offset += patch_size[i] * 2;
+        } else if (patch_type[i] == boundaryConditions::cyclic) {
+            ueqn_add_boundary_diag_src_couple<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_Nz, num_boundary_surface, 
+                    patch_size[i], bou_offset, boundary_cell_face, internal_coeffs, diag_to_csr_index, A);
+            bou_offset += patch_size[i];
+        } else {
+            ueqn_add_boundary_diag_src_unCouple<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_Nz, num_boundary_surface, 
+                    patch_size[i], bou_offset, boundary_cell_face, internal_coeffs, boundary_coeffs, diag_to_csr_index, A, b); // non-coupled patches: both A and b are passed
+            bou_offset += patch_size[i];
+        }
     }
-    int readfile = 0;
-    double *of_b = new double[3 * num_cells];
-    double *of_A = new double[3 * (num_faces + num_cells)];
-    readfile = fread(of_b, num_cells * 3 * sizeof(double), 1, fp);
-    readfile = fread(of_A, (num_faces + num_cells) * sizeof(double) * 3, 1, fp);
+}
 
-    std::vector<double> h_A_of_init_vec(3 * (num_cells + num_faces));
-    std::copy(of_A, of_A + (num_cells + num_faces) * 3, h_A_of_init_vec.begin());
+void dfUEqn::H(double *Psi) { // Run fvMtx_H on the device and copy dataBase_.cell_value_vec_bytes bytes of the permuted result into the host buffer Psi.
+    fvMtx_H(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, 
+            dataBase_.d_owner, dataBase_.d_neighbor, dataBase_.d_volume,
+            dataBase_.num_patches, dataBase_.patch_size.data(), dataBase_.patch_type_extropolated.data(),
+            dataBase_.d_boundary_face_cell, d_internal_coeffs, d_boundary_coeffs,
+            d_lower, d_upper, d_source, dataBase_.d_u, d_H_pEqn, d_H_pEqn_perm); // d_H_pEqn_perm is the buffer copied to the host below
+    checkCudaErrors(cudaMemcpyAsync(h_H_pEqn, d_H_pEqn_perm, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost, dataBase_.stream)); // stage the result in the host buffer h_H_pEqn
+    sync(); // the asynchronous copy must finish before h_H_pEqn is read below
+    // TODO: correct boundary conditions (only the internal cell values are returned for now)
+    memcpy(Psi, h_H_pEqn, dataBase_.cell_value_vec_bytes); // Psi must provide room for at least dataBase_.cell_value_vec_bytes bytes
+}
 
-    std::vector<double> h_A_of_vec_perm(3 * (num_faces + num_cells), 0);
-    for (int i = 0; i < num_faces + num_cells; i++)
-    {
-        h_A_of_vec_perm[i] = h_A_of_init_vec[dataBase_.tmpPermutatedList[i]];
-        h_A_of_vec_perm[i + num_faces + num_cells] = h_A_of_init_vec[dataBase_.tmpPermutatedList[i] + num_faces + num_cells];
-        h_A_of_vec_perm[i + 2 * (num_faces + num_cells)] = h_A_of_init_vec[dataBase_.tmpPermutatedList[i] + 2 * (num_faces + num_cells)];
-    }
+void dfUEqn::correctPsi(double *Psi, double *boundary_psi) { // Upload host values Psi / boundary_psi and permute them into dataBase_.d_u / dataBase_.d_boundary_u.
+    checkCudaErrors(cudaMemcpy(d_u_host_order, Psi, dataBase_.cell_value_vec_bytes, cudaMemcpyHostToDevice)); // plain cudaMemcpy (no stream argument), unlike the cudaMemcpyAsync calls used elsewhere in this file
+    checkCudaErrors(cudaMemcpy(d_boundary_u_host_order, boundary_psi, dataBase_.boundary_surface_value_vec_bytes, cudaMemcpyHostToDevice));
+    permute_vector_h2d(dataBase_.stream, dataBase_.num_cells, d_u_host_order, dataBase_.d_u); // NOTE(review): presumably reorders the host layout into the device layout -- confirm
+    permute_vector_h2d(dataBase_.stream, dataBase_.num_boundary_surfaces, d_boundary_u_host_order, dataBase_.d_boundary_u); // no sync() follows, so this work may still be pending on dataBase_.stream at return
+}
 
-    // b
-    std::vector<double> h_b_of_init_vec(3 * num_cells);
-    std::copy(of_b, of_b + 3 * num_cells, h_b_of_init_vec.begin());
-    std::vector<double> h_b_of_vec;
-    for (int i = 0; i < 3 * num_cells; i += 3)
-    {
-        h_b_of_vec.push_back(h_b_of_init_vec[i]);
-    }
-    // fill RHS_y
-    for (int i = 1; i < 3 * num_cells; i += 3)
-    {
-        h_b_of_vec.push_back(h_b_of_init_vec[i]);
-    }
-    // fill RHS_z
-    for (int i = 2; i < 3 * num_cells; i += 3)
-    {
-        h_b_of_vec.push_back(h_b_of_init_vec[i]);
+double* dfUEqn::getFieldPointer(const char* fieldAlias, location loc, position pos) { // Looks up "<h|d>_[boundary_]<fieldAlias>" in fieldPointerMap; returns nullptr (after a warning on stderr) when no non-null entry is found.
+    char mergedName[256] = ""; // start empty so the lookup below never reads uninitialized bytes when pos matches neither branch
+    if (pos == position::internal) {
+        snprintf(mergedName, sizeof(mergedName), "%s_%s", (loc == location::cpu) ? "h" : "d", fieldAlias); // bounded write: an over-long alias is truncated instead of overflowing the stack buffer
+    } else if (pos == position::boundary) {
+        snprintf(mergedName, sizeof(mergedName), "%s_boundary_%s", (loc == location::cpu) ? "h" : "d", fieldAlias); // "h" prefix for location::cpu, "d" otherwise
     }
 
-    if (print)
-    {
-        for (int i = 0; i < (num_faces + num_cells); i++)
-            printf("h_A_of_vec[%d]:(%.10lf, %.10lf, %.10lf)\n", i, h_A_of_vec_perm[i], h_A_of_vec_perm[i + (num_faces + num_cells)], h_A_of_vec_perm[i + (num_faces + num_cells) * 2]);
-        for (int i = 0; i < num_cells; i++)
-            printf("h_b_of_vec[%d]: (%.10lf, %.10lf, %.10lf)\n", i, of_b[i * 3], of_b[i * 3 + 1], of_b[i * 3 + 2]);
+    double *pointer = nullptr;
+    if (fieldPointerMap.find(std::string(mergedName)) != fieldPointerMap.end()) { // check with find() first so a missing key is not inserted by operator[]
+        pointer = fieldPointerMap[std::string(mergedName)];
+    }
+    if (pointer == nullptr) { // covers both a missing key and a key that maps to a null pointer
+        fprintf(stderr, "Warning! getFieldPointer of %s returns nullptr!\n", mergedName);
     }
 
-    // check
-    // fprintf(stderr, "check of h_A_csr\n");
-    // checkVectorEqual(num_faces + num_cells, h_A_of_vec_1mtx.data(), h_A_csr, 1e-5);
-    // fprintf(stderr, "check of h_b\n");
-    // checkVectorEqual(3 * num_cells, h_b_of_vec.data(), h_b, 1e-5);
+    return pointer;
 }
 
-void dfUEqn::solve()
+// #if defined DEBUG_
+void dfUEqn::compareResult(const double *lower, const double *upper, const double *diag, // Debug check: host reference arrays compared against the device buffers d_lower / d_upper / d_diag.
+        const double *source, const double *internal_coeffs, const double *boundary_coeffs, // host references read as three interleaved values per entry (index i * 3 + c)
+        // const double *tmpVal, const double *boundary_val,
+        bool printFlag) // forwarded unchanged to every checkVectorEqual call
 {
-    // for (size_t i = 0; i < num_cells; i++)
-    //     fprintf(stderr, "h_velocity_old[%d]: (%.15lf, %.15lf, %.15lf)\n", i, h_velocity_old[3*i],
-    //     h_velocity_old[3*i + 1], h_velocity_old[3*i + 2]);
-    // constructor AmgXSolver at first interation
-    // Synchronize stream
-    // checkCudaErrors(cudaMemcpyAsync(h_b, d_b, cell_vec_bytes, cudaMemcpyDeviceToHost, stream));
-    // checkCudaErrors(cudaMemcpyAsync(h_psi, d_psi, cell_vec_bytes, cudaMemcpyDeviceToHost, stream));
-
-    checkCudaErrors(cudaStreamSynchronize(stream));
-
-    // nvtxRangePush("solve");
+    DEBUG_TRACE;
+    std::vector<double> h_lower;
+    h_lower.resize(dataBase_.num_surfaces);
+    checkCudaErrors(cudaMemcpy(h_lower.data(), d_lower, dataBase_.surface_value_bytes, cudaMemcpyDeviceToHost)); // download the device values into a temporary host vector
+    fprintf(stderr, "check h_lower\n");
+    checkVectorEqual(dataBase_.num_surfaces, lower, h_lower.data(), 1e-14, printFlag); // 1e-14 is the tolerance argument used for every comparison here
+    DEBUG_TRACE;
+
+    std::vector<double> h_upper;
+    h_upper.resize(dataBase_.num_surfaces);
+    checkCudaErrors(cudaMemcpy(h_upper.data(), d_upper, dataBase_.surface_value_bytes, cudaMemcpyDeviceToHost));
+    fprintf(stderr, "check h_upper\n");
+    checkVectorEqual(dataBase_.num_surfaces, upper, h_upper.data(), 1e-14, printFlag);
+    DEBUG_TRACE;
+
+    std::vector<double> h_diag;
+    h_diag.resize(dataBase_.num_cells);
+    checkCudaErrors(cudaMemcpy(h_diag.data(), d_diag, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost));
+    fprintf(stderr, "check h_diag\n");
+    checkVectorEqual(dataBase_.num_cells, diag, h_diag.data(), 1e-14, printFlag);
+    DEBUG_TRACE;
+
+    std::vector<double> h_source, h_source_ref;
+    h_source.resize(dataBase_.num_cells * 3);
+    h_source_ref.resize(dataBase_.num_cells * 3);
+    for (int i = 0; i < dataBase_.num_cells; i++) { // regroup the interleaved reference into three consecutive blocks of num_cells values before comparing
+        h_source_ref[0 * dataBase_.num_cells + i] = source[i * 3 + 0];
+        h_source_ref[1 * dataBase_.num_cells + i] = source[i * 3 + 1];
+        h_source_ref[2 * dataBase_.num_cells + i] = source[i * 3 + 2];
+    }
+    checkCudaErrors(cudaMemcpy(h_source.data(), d_source, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost));
+    fprintf(stderr, "check h_source\n");
+    checkVectorEqual(dataBase_.num_cells * 3, h_source_ref.data(), h_source.data(), 1e-14, printFlag);
+    DEBUG_TRACE;
+
+    std::vector<double> h_internal_coeffs, h_internal_coeffs_ref;
+    h_internal_coeffs.resize(dataBase_.num_boundary_surfaces * 3);
+    h_internal_coeffs_ref.resize(dataBase_.num_boundary_surfaces * 3);
+    for (int i = 0; i < dataBase_.num_boundary_surfaces; i++) { // same regrouping, with blocks of num_boundary_surfaces values
+        h_internal_coeffs_ref[0 * dataBase_.num_boundary_surfaces + i] = internal_coeffs[i * 3 + 0];
+        h_internal_coeffs_ref[1 * dataBase_.num_boundary_surfaces + i] = internal_coeffs[i * 3 + 1];
+        h_internal_coeffs_ref[2 * dataBase_.num_boundary_surfaces + i] = internal_coeffs[i * 3 + 2];
+    }
+    checkCudaErrors(cudaMemcpy(h_internal_coeffs.data(), d_internal_coeffs, dataBase_.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost));
+    fprintf(stderr, "check h_internal_coeffs\n");
+    checkVectorEqual(dataBase_.num_boundary_surfaces * 3, h_internal_coeffs_ref.data(), h_internal_coeffs.data(), 1e-14, printFlag);
+    DEBUG_TRACE;
+
+    std::vector<double> h_boundary_coeffs, h_boundary_coeffs_ref;
+    h_boundary_coeffs.resize(dataBase_.num_boundary_surfaces * 3);
+    h_boundary_coeffs_ref.resize(dataBase_.num_boundary_surfaces * 3);
+    for (int i = 0; i < dataBase_.num_boundary_surfaces; i++) { // same regrouping as for internal_coeffs
+        h_boundary_coeffs_ref[0 * dataBase_.num_boundary_surfaces + i] = boundary_coeffs[i * 3 + 0];
+        h_boundary_coeffs_ref[1 * dataBase_.num_boundary_surfaces + i] = boundary_coeffs[i * 3 + 1];
+        h_boundary_coeffs_ref[2 * dataBase_.num_boundary_surfaces + i] = boundary_coeffs[i * 3 + 2];
+    }
+    checkCudaErrors(cudaMemcpy(h_boundary_coeffs.data(), d_boundary_coeffs, dataBase_.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost));
+    fprintf(stderr, "check h_boundary_coeffs\n");
+    checkVectorEqual(dataBase_.num_boundary_surfaces * 3, h_boundary_coeffs_ref.data(), h_boundary_coeffs.data(), 1e-14, printFlag);
+    DEBUG_TRACE;
+
+    // std::vector<double> h_tmpVal, h_tmpVal_ref, h_boundary_val, h_boundary_val_ref;
+    // h_tmpVal.resize(dataBase_.num_cells * 9);
+    // h_tmpVal_ref.resize(dataBase_.num_cells * 9);
+    // h_boundary_val.resize(dataBase_.num_boundary_surfaces * 9);
+    // h_boundary_val_ref.resize(dataBase_.num_boundary_surfaces * 9);
+    // for (int i = 0; i < dataBase_.num_cells; i++) {
+    //     h_tmpVal_ref[0 * dataBase_.num_cells + i] = tmpVal[i * 9 + 0];
+    //     h_tmpVal_ref[1 * dataBase_.num_cells + i] = tmpVal[i * 9 + 1];
+    //     h_tmpVal_ref[2 * dataBase_.num_cells + i] = tmpVal[i * 9 + 2];
+    //     h_tmpVal_ref[3 * dataBase_.num_cells + i] = tmpVal[i * 9 + 3];
+    //     h_tmpVal_ref[4 * dataBase_.num_cells + i] = tmpVal[i * 9 + 4];
+    //     h_tmpVal_ref[5 * dataBase_.num_cells + i] = tmpVal[i * 9 + 5];
+    //     h_tmpVal_ref[6 * dataBase_.num_cells + i] = tmpVal[i * 9 + 6];
+    //     h_tmpVal_ref[7 * dataBase_.num_cells + i] = tmpVal[i * 9 + 7];
+    //     h_tmpVal_ref[8 * dataBase_.num_cells + i] = tmpVal[i * 9 + 8];
+    // }
+    // for (int i = 0; i < dataBase_.num_boundary_surfaces; i++){
+    //     h_boundary_val_ref[0 * dataBase_.num_boundary_surfaces + i] = boundary_val[i * 9 + 0];
+    //     h_boundary_val_ref[1 * dataBase_.num_boundary_surfaces + i] = boundary_val[i * 9 + 1];
+    //     h_boundary_val_ref[2 * dataBase_.num_boundary_surfaces + i] = boundary_val[i * 9 + 2];
+    //     h_boundary_val_ref[3 * dataBase_.num_boundary_surfaces + i] = boundary_val[i * 9 + 3];
+    //     h_boundary_val_ref[4 * dataBase_.num_boundary_surfaces + i] = boundary_val[i * 9 + 4];
+    //     h_boundary_val_ref[5 * dataBase_.num_boundary_surfaces + i] = boundary_val[i * 9 + 5];
+    //     h_boundary_val_ref[6 * dataBase_.num_boundary_surfaces + i] = boundary_val[i * 9 + 6];
+    //     h_boundary_val_ref[7 * dataBase_.num_boundary_surfaces + i] = boundary_val[i * 9 + 7];
+    //     h_boundary_val_ref[8 * dataBase_.num_boundary_surfaces + i] = boundary_val[i * 9 + 8];
+    // }
+    // checkCudaErrors(cudaMemcpy(h_tmpVal.data(), d_grad_u, dataBase_.cell_value_tsr_bytes, cudaMemcpyDeviceToHost));
+    // checkCudaErrors(cudaMemcpy(h_boundary_val.data(), d_boundary_grad_u, dataBase_.boundary_surface_value_tsr_bytes, cudaMemcpyDeviceToHost));
+    // fprintf(stderr, "check h_grad_U\n");
+    // checkVectorEqual(dataBase_.num_cells * 9, h_tmpVal_ref.data(), h_tmpVal.data(), 1e-14, printFlag);
+    // fprintf(stderr, "check h_boundary_grad_U\n");
+    // checkVectorEqual(dataBase_.num_boundary_surfaces * 9, h_boundary_val_ref.data(), h_boundary_val.data(), 1e-14, printFlag);
+    // DEBUG_TRACE;
+}
+// #endif
+// Debug check: compares host reference HbyA / boundary_HbyA (x,y,z interleaved per entry) with dataBase_.d_HbyA / d_boundary_HbyA copied back from the device.
+void dfUEqn::compareHbyA(const double *HbyA, const double *boundary_HbyA, bool printFlag)
+{
+    // Host staging buffers; std::vector releases them automatically when the function returns.
+    std::vector<double> h_HbyA(dataBase_.num_cells * 3), h_HbyA_ref(dataBase_.num_cells * 3);
+    std::vector<double> h_boundary_HbyA(dataBase_.num_boundary_surfaces * 3), h_boundary_HbyA_ref(dataBase_.num_boundary_surfaces * 3);
 
-    int nNz = num_cells + num_faces; // matrix entries
-    if (num_iteration == 0)          // first interation
+    // permute the reference from interleaved (x,y,z per entry) to component-major (all x, then all y, then all z)
+    for (int i = 0; i < dataBase_.num_cells; i++)
     {
-        printf("Initializing AmgX Linear Solver\n");
-        UxSolver->setOperator(num_cells, nNz, d_A_csr_row_index, d_A_csr_col_index, d_A_csr);
-        UySolver->setOperator(num_cells, nNz, d_A_csr_row_index, d_A_csr_col_index, d_A_csr + nNz);
-        UzSolver->setOperator(num_cells, nNz, d_A_csr_row_index, d_A_csr_col_index, d_A_csr + 2 * nNz);
+        h_HbyA_ref[dataBase_.num_cells * 0 + i] = HbyA[i * 3 + 0];
+        h_HbyA_ref[dataBase_.num_cells * 1 + i] = HbyA[i * 3 + 1];
+        h_HbyA_ref[dataBase_.num_cells * 2 + i] = HbyA[i * 3 + 2];
     }
-    else
+    for (int i = 0; i < dataBase_.num_boundary_surfaces; i++)
     {
-        UxSolver->updateOperator(num_cells, nNz, d_A_csr);
-        UySolver->updateOperator(num_cells, nNz, d_A_csr + nNz);
-        UzSolver->updateOperator(num_cells, nNz, d_A_csr + 2 * nNz);
+        h_boundary_HbyA_ref[dataBase_.num_boundary_surfaces * 0 + i] = boundary_HbyA[i * 3 + 0];
+        h_boundary_HbyA_ref[dataBase_.num_boundary_surfaces * 1 + i] = boundary_HbyA[i * 3 + 1];
+        h_boundary_HbyA_ref[dataBase_.num_boundary_surfaces * 2 + i] = boundary_HbyA[i * 3 + 2];
     }
-    UxSolver->solve(num_cells, d_psi, d_b);
-    UySolver->solve(num_cells, d_psi + num_cells, d_b + num_cells);
-    UzSolver->solve(num_cells, d_psi + 2 * num_cells, d_b + 2 * num_cells);
-    num_iteration++;
-
-    size_t threads_per_block = 1024;
-    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    permute_psi_d2h<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, d_psi, d_psi_permute);
-    checkCudaErrors(cudaMemcpyAsync(h_psi, d_psi_permute, cell_vec_bytes, cudaMemcpyDeviceToHost, stream));
-    // for (size_t i = 0; i < num_cells; i++)
-    //     fprintf(stderr, "h_velocity_after[%d]: (%.15lf, %.15lf, %.15lf)\n", i, h_psi[i],
-    //     h_psi[num_cells + i], h_psi[num_cells*2 + i]);
+    checkCudaErrors(cudaMemcpy(h_HbyA.data(), dataBase_.d_HbyA, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(h_boundary_HbyA.data(), dataBase_.d_boundary_HbyA, dataBase_.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost));
+
+    // check result: permuted reference vs. device copy, tolerance 1e-10
+    fprintf(stderr, "check h_HbyA\n");
+    checkVectorEqual(dataBase_.num_cells * 3, h_HbyA_ref.data(), h_HbyA.data(), 1e-10, printFlag);
+    fprintf(stderr, "check h_boundary_HbyA\n");
+    checkVectorEqual(dataBase_.num_boundary_surfaces * 3, h_boundary_HbyA_ref.data(), h_boundary_HbyA.data(), 1e-10, printFlag);
 }
 
-void dfUEqn::sync()
+// Debug check: compares host reference U / boundary_U (x,y,z interleaved per entry) with dataBase_.d_u / d_boundary_u copied back from the device.
+void dfUEqn::compareU(const double *U, const double *boundary_U, bool printFlag)
 {
-    checkCudaErrors(cudaStreamSynchronize(stream));
-}
+    // Host staging buffers; std::vector releases them automatically when the function returns.
+    std::vector<double> h_u(dataBase_.num_cells * 3), h_u_ref(dataBase_.num_cells * 3);
+    std::vector<double> h_boundary_u(dataBase_.num_boundary_surfaces * 3), h_boundary_u_ref(dataBase_.num_boundary_surfaces * 3);
 
-void dfUEqn::updatePsi(double *Psi)
-{
-    checkCudaErrors(cudaStreamSynchronize(stream));
-    memcpy(Psi, h_psi, cell_vec_bytes);
+    // permute the reference from interleaved (x,y,z per entry) to component-major (all x, then all y, then all z)
+    for (int i = 0; i < dataBase_.num_cells; i++)
+    {
+        h_u_ref[dataBase_.num_cells * 0 + i] = U[i * 3 + 0];
+        h_u_ref[dataBase_.num_cells * 1 + i] = U[i * 3 + 1];
+        h_u_ref[dataBase_.num_cells * 2 + i] = U[i * 3 + 2];
+    }
+    for (int i = 0; i < dataBase_.num_boundary_surfaces; i++)
+    {
+        h_boundary_u_ref[dataBase_.num_boundary_surfaces * 0 + i] = boundary_U[i * 3 + 0];
+        h_boundary_u_ref[dataBase_.num_boundary_surfaces * 1 + i] = boundary_U[i * 3 + 1];
+        h_boundary_u_ref[dataBase_.num_boundary_surfaces * 2 + i] = boundary_U[i * 3 + 2];
+    }
+    checkCudaErrors(cudaMemcpy(h_u.data(), dataBase_.d_u, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(h_boundary_u.data(), dataBase_.d_boundary_u, dataBase_.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost));
+
+    // check result: permuted reference vs. device copy, tolerance 1e-10
+    fprintf(stderr, "check h_u\n");
+    checkVectorEqual(dataBase_.num_cells * 3, h_u_ref.data(), h_u.data(), 1e-10, printFlag);
+    fprintf(stderr, "check h_boundary_u\n");
+    checkVectorEqual(dataBase_.num_boundary_surfaces * 3, h_boundary_u_ref.data(), h_boundary_u.data(), 1e-10, printFlag);
 }
 
-void dfUEqn::correctBoundaryConditions()
+// Debug check: compares host reference rAU / boundary_rAU with dataBase_.d_rAU / d_boundary_rAU copied back from the device (scalar fields, no permutation).
+void dfUEqn::comparerAU(const double *rAU, const double *boundary_rAU, bool printFlag)
 {
-    size_t threads_per_block = 1024;
-    size_t blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
-    ueqn_correct_BoundaryConditions_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_boundary_cells, 
-                                                                                              dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
-                                                                                              d_psi, dataBase_.d_boundary_velocity, dataBase_.d_boundary_UpatchType);
-}
+    // Host staging buffers; std::vector owns the arrays and frees them on return, so no manual delete[] is needed.
+    std::vector<double> h_rAU(dataBase_.num_cells);
+    std::vector<double> h_boundary_rAU(dataBase_.num_boundary_surfaces);
 
-// correct volecity in pEqn
-void dfUEqn::correctPsi(double *Psi)
-{
-    memcpy(h_psi, Psi, cell_vec_bytes);
-    checkCudaErrors(cudaMemcpyAsync(d_psi_permute, h_psi, cell_vec_bytes, cudaMemcpyDeviceToHost, stream));
+    checkCudaErrors(cudaMemcpy(h_rAU.data(), dataBase_.d_rAU, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(h_boundary_rAU.data(), dataBase_.d_boundary_rAU, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost));
 
-    size_t threads_per_block = 1024;
-    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    permute_psi_h2d<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, d_psi_permute, d_psi);
-}
+    fprintf(stderr, "check h_rAU\n");
+    checkVectorEqual(dataBase_.num_cells, rAU, h_rAU.data(), 1e-10, printFlag);
+    fprintf(stderr, "check h_boundary_rAU\n");
+    checkVectorEqual(dataBase_.num_boundary_surfaces, boundary_rAU, h_boundary_rAU.data(), 1e-10, printFlag);
 
-dfUEqn::~dfUEqn()
-{
 }
diff --git a/src_gpu/dfYEqn.H b/src_gpu/dfYEqn.H
index 751cb32fe..b3715efc4 100644
--- a/src_gpu/dfYEqn.H
+++ b/src_gpu/dfYEqn.H
@@ -3,65 +3,167 @@
 #include "AmgXSolver.H"
 #include <amgx_c.h>
 #include "dfMatrixDataBase.H"
+#include "dfMatrixOpBase.H"
+#include "dfChemistrySolver.H"
 
 class dfYEqn
 {
 private:
-    dfMatrixDataBase &dataBase_;
-    cudaStream_t stream;
+	dfMatrixDataBase &dataBase_; // reference bound in the constructor (not owned by this class)
+    dfChemistrySolver &chemistrySolver_; // reference bound in the constructor (not owned by this class)
 
+    // cuda resource
+    cudaStream_t stream;
+#ifdef USE_GRAPH
+    // one graph for one eqn before using self-developed solver
+    cudaGraph_t graph;
+    cudaGraphExec_t graph_instance;
+    bool graph_created=false;
+#endif
+
+	// constant values -- basic
+	std::string mode_string;
+	std::string setting_path;
+    int inertIndex;
+
+	// constant values -- amgx solvers
     std::vector<AmgXSolver *> YSolverSet;
     int num_iteration = 0;
 
-    // common variables
-    int num_cells, cell_bytes, num_faces, num_surfaces, num_boundary_cells, num_boundary_faces, num_species, boundary_face_bytes, inertIndex;
-    int *d_A_csr_row_index, *d_A_csr_diag_index, *d_A_csr_col_index;
-
-    // Matrix variables
-    double *d_A_csr, *d_b, *d_psi = nullptr;
-    double *h_A_csr, *h_b, *h_psi = nullptr;
-
-    double *d_alpha = nullptr;
-    double *d_mut_Sct, *d_boundary_mut_sct = nullptr;
-    double *d_hai, *d_boundary_hai = nullptr;
-    double *d_rhoD, *d_boundary_rhoD = nullptr;
-    double *d_sum_hai_rhoD_grady, *d_sum_boundary_hai_rhoD_grady = nullptr; 
-    double *d_sum_rhoD_grady, *d_sum_boundary_rhoD_grady = nullptr;
-    double *d_sum_hai_y, *d_sum_boundary_hai_y = nullptr;
-    double *d_phiUc, *d_phiUc_boundary = nullptr;
-    double *d_boundary_Y = nullptr;
-    double *d_grady, *d_boundary_grady = nullptr;
-
-    bool uploadData = true;
+	// constant fields - internal
+	// none
+
+	// constant fields - boundary
+	std::vector<int> patch_type;
+
+    // const fields - lewis number
+    double *d_lewis_number = nullptr;
+    std::vector<double> lewis_number;
+
+	// non-constant fields - internal
+	// thermophysical fields
+    double *d_hai = nullptr;
+    double *d_mut_sct = nullptr;
+    // intermediate fields
+    double *d_grad_y = nullptr;
+    double *d_sumY_diff_error = nullptr;
+    double *d_phiUc = nullptr;
+    double *d_DEff = nullptr;
+    double *d_permute = nullptr;
+    // combustion fields
+    double *d_RR = nullptr;
+    // computed on CPU, used on GPU, need memcpyh2d
+	double *h_hai = nullptr;
+	double *h_rhoD = nullptr;
+	double *h_mut_sct = nullptr;
+
+	// non-constant fields - boundary
+	// thermophysical fields
+    double *d_boundary_hai = nullptr;
+    double *d_boundary_mut_sct = nullptr;
+    // intermediate fields
+    double *d_boundary_grad_y = nullptr;
+    double *d_boundary_sumY_diff_error = nullptr;
+    double *d_boundary_phiUc = nullptr;
+    double *d_boundary_DEff = nullptr;
+    double *d_boundary_permute = nullptr;
+	// computed on CPU, used on GPU, need memcpyh2d - host
+	double *h_boundary_hai = nullptr;
+	double *h_boundary_rhoD = nullptr;
+	double *h_boundary_mut_sct = nullptr;
+    // boundary coeff fields
+	double *d_value_internal_coeffs = nullptr;
+	double *d_value_boundary_coeffs= nullptr;
+	double *d_gradient_internal_coeffs= nullptr;
+	double *d_gradient_boundary_coeffs= nullptr;
+
+	// non-constant fields - ldu
+    double *d_ldu = nullptr;
+	double *d_lower = nullptr;
+	double *d_upper = nullptr;
+	double *d_diag = nullptr;
+    double *d_extern = nullptr;
+	double *d_source = nullptr;
+	double *d_internal_coeffs = nullptr;
+	double *d_boundary_coeffs = nullptr;
+
+	// non-constant fields - csr
+	double *d_A = nullptr;
+	double *d_b = nullptr;
+
+    // field pointer map (field name string -> double* buffer)
+    std::unordered_map<std::string, double*> fieldPointerMap;
 
 public:
-    dfYEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile, const int inertIndex);
-
-    ~dfYEqn();
-
-    void initializeTimeStep();
-
-    void checkValue(bool print, char *filename);
-
-    void upwindWeight();
-
-    void fvm_laplacian_and_sumYDiffError_diffAlphaD_hDiffCorrFlux(
-            std::vector<double *> Y_old, std::vector<double *> boundary_Y,
-            std::vector<const double *> hai, std::vector<double *> boundary_hai,
-            std::vector<const double *> rhoD, std::vector<double *> boundary_rhoD,
-            const double *mut_Sct, const double *boundary_mut_Sct, const double *alpha);
-
-    void fvm_ddt();
-
-    void fvm_div_phi();
-
-    void fvm_div_phiUc();
-
-    void solve();
-
-    void correctBoundaryConditions();
-
+	// constructor: only binds the data base and chemistry solver references; everything else keeps its in-class default
+    dfYEqn(dfMatrixDataBase &dataBase, dfChemistrySolver &chemistrySolver)
+        : dataBase_(dataBase), chemistrySolver_(chemistrySolver) {}
+
+	// destructor: empty body. NOTE(review): buffers are presumably released via cleanCudaResources() - confirm
+	~dfYEqn(){}
+
+	// member functions
+
+    // getter
+    double* getFieldPointer(const char* fieldAlias, location loc, position pos);
+
+	// initialization / setup
+	void setConstantValues(const std::string &mode_string, const std::string &setting_path, const int inertIndex);
+	void setConstantFields(const std::vector<int> patch_type, const std::vector<double> lewis_number);
+	void createNonConstantFieldsInternal();
+	void createNonConstantFieldsBoundary();
+	void createNonConstantLduAndCsrFields();
+	void initNonConstantFieldsInternal(const double *y);
+	void initNonConstantFieldsBoundary(const double *boundary_y);
+
+    void cleanCudaResources();
+
+	// equation run stages
+    void preProcess(const double *h_rhoD, const double *h_boundary_rhoD,
+        const double *h_hai, const double *h_boundary_hai,
+        const double *h_mut_sct, const double *h_boundary_mut_sct);
+	void process();
+    void postProcess(double *h_y, double *h_boundary_y);
+    void solve(int speciesIndex);
     void sync();
 
-    void updatePsi(double *Psi, int speciesIndex);
+    // equation-specific discretization functions
+    void yeqn_compute_thermo_alpha(cudaStream_t stream,
+            int num_cells, const double *rhoD, double *thermo_alpha,
+            int num_boundary_surfaces, const double *boundary_rhoD, double *boundary_thermo_alpha);
+    void yeqn_compute_DEff_via_lewisNumber(cudaStream_t stream, int num_species, int num_cells, int num_boundary_surfaces, 
+            double *lewis_number, const double *alpha, const double *mut_sct, double *DEff,
+            const double *boundary_alpha, const double *boundary_mut_sct, double *boundary_DEff);
+    void yeqn_compute_RR(dfChemistrySolver& chemistrySolver, cudaStream_t stream, const double *h_T, const double *d_T, 
+            const double *p, const double *y, const double *rho, double *RR);
+    void yeqn_fvc_laplacian_scalar(cudaStream_t stream, ncclComm_t comm, const int *neighbor_peer,
+            int num_species, int num_cells, int num_surfaces, int num_boundary_surfaces,
+            const int *lowerAddr, const int *upperAddr,
+            const double *weight, const double *mag_sf, const double *delta_coeffs, const double *volume,
+            const double *thermo_alpha, const double *hai, const double *vf, double *output, // end for internal
+            int num_patches, const int *patch_size, const int *patch_type, const int *boundary_cell_face,
+            const double *boundary_weight, const double *boundary_mag_sf, const double *boundary_delta_coeffs,
+            const double *boundary_thermo_alpha, const double *boundary_hai, const double *boundary_vf,
+            const int *cyclicNeighbor, const int *patchSizeOffset, double *boundary_output);
+    void yeqn_compute_sumYDiffError_and_hDiffCorrFlux(cudaStream_t stream, int num_species, int num_cells, int num_boundary_surfaces,
+            const double *rhoD, const double *hai, const double *y, const double *grad_y,
+            double *sumY_diff_error, double *hDiff_corr_flux,
+            const double *boundary_hai, const double *boundary_y, const double *boundary_grad_y, const double *boundary_rhoD,
+            double *boundary_sumY_diff_error, double *boundary_hDiff_corr_flux);
+    void yeqn_compute_phiUc(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces,
+            const int *lowerAddr, const int *upperAddr,
+            const double *weight, const double *sf, const double *sumY_diff_error, double *phiUc,
+            const double *boundary_sf, const double *boundary_sumY_diff_error, double *boundary_phiUc);
+    void yeqn_compute_y_inertIndex(cudaStream_t stream, int num_species, int inertIndex, int num_cells, double *y);
+
+// #if defined DEBUG_  (guard is commented out, so the compare* helpers below are always declared)
+    void comparediffAlphaD(const double *diffAlphaD, const double *boundary_diffAlphaD, bool printFlag);
+    void comparegradyi(const double *grad_yi, const double *boundary_grad_yi, int specie_index, bool printFlag);
+    void comparesumYDiffError(const double *sumYDiffError, const double *boundary_sumYDiffError, bool printFlag);
+    void comparehDiffCorrFlux(const double *hDiffCorrFlux, const double *boundary_hDiffCorrFlux, bool printFlag);
+    void comparephiUc(const double *phiUc, const double *boundary_phiUc, bool printFlag);
+    void compareResult(const double *lower, const double *upper, const double *diag, const double *source,
+            const double *internal_coeffs, const double *boundary_coeffs, bool printFlag);
+    void compareYi(const double *yi, int specie_index, bool printFlag);
+// #endif
 };
diff --git a/src_gpu/dfYEqn.cu b/src_gpu/dfYEqn.cu
index 990a96d1d..b86cb5a69 100644
--- a/src_gpu/dfYEqn.cu
+++ b/src_gpu/dfYEqn.cu
@@ -1,174 +1,93 @@
 #include "dfYEqn.H"
 
-// kernel functions
-__global__ void getUpwindWeight(int num_faces, double *phi, double *weight)
+__global__ void yeqn_compute_thermo_alpha_internal(int num_cells,
+        const double *rhoD, double *thermo_alpha)
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_faces)
+    if (index >= num_cells)
         return;
-    if (phi[index] >= 0)
-        weight[index] = 1.;
-    else
-        weight[index] = 0.;
+    // Each in-range thread handles one cell: thermo_alpha[index] is a plain copy of rhoD[index].
+    // UnityLewis
+    // alpha = nu * rho / 0.7
+    // rhoD[i] = alpha, so only the first num_cells entries of rhoD are read here
+    thermo_alpha[index] = rhoD[index];
 }
 
-__global__ void fvc_grad_internal(int num_cells, int num_species,
-        const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index,
-        const double *face_vector, const double *weight, const double *species,
-        const double *volume, double *grady)
+__global__ void yeqn_compute_thermo_alpha_boundary(int num_boundary_surfaces,
+        const double *boundary_rhoD, double *boundary_thermo_alpha)
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_cells)
+    if (index >= num_boundary_surfaces)
         return;
 
-    // A_csr has one more element in each row: itself
-    int row_index = csr_row_index[index];
-    int next_row_index = csr_row_index[index + 1];
-    int diag_index = csr_diag_index[index];
-    int neighbor_offset = csr_row_index[index] - index;
+    // UnityLewis (boundary counterpart of the internal kernel: one thread per boundary surface)
+    // alpha = nu * rho / 0.7
+    // rhoD[i] = alpha, so boundary_thermo_alpha is a plain copy of the first num_boundary_surfaces entries of boundary_rhoD
+    boundary_thermo_alpha[index] = boundary_rhoD[index];
+}
 
-    double vol = volume[index];
+__global__ void yeqn_compute_DEff_kernel(int num_species, int num,
+        const double *lewis_number, const double *thermo_alpha, const double *mut_sct, double *DEff)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num) // num = number of entries per species; extra threads do nothing
+        return;
 
+    double mutsct = mut_sct[index]; // read once, the same value is added for every species
+    double rhoD; // per-species value, overwritten in each loop iteration
     for (int s = 0; s < num_species; s++) {
-        double own_cell_Y = species[num_cells * s + index];
-        double grad_bx = 0;
-        double grad_by = 0;
-        double grad_bz = 0;
-        for (int i = row_index; i < next_row_index; i++)
-        {
-            int inner_index = i - row_index;
-            // lower
-            if (inner_index < diag_index)
-            {
-                int neighbor_index = neighbor_offset + inner_index;
-                double w = weight[neighbor_index];
-                double sfx = face_vector[neighbor_index * 3 + 0];
-                double sfy = face_vector[neighbor_index * 3 + 1];
-                double sfz = face_vector[neighbor_index * 3 + 2];
-                int neighbor_cell_id = csr_col_index[row_index + inner_index];
-                double neighbor_cell_Y = species[num_cells * s + neighbor_cell_id];
-                double face_Y = w * (neighbor_cell_Y - own_cell_Y) + own_cell_Y;
-                grad_bx -= face_Y * sfx;
-                grad_by -= face_Y * sfy;
-                grad_bz -= face_Y * sfz;
-            }
-            // upper
-            if (inner_index > diag_index)
-            {
-                int neighbor_index = neighbor_offset + inner_index - 1;
-                double w = weight[neighbor_index];
-                double sfx = face_vector[neighbor_index * 3 + 0];
-                double sfy = face_vector[neighbor_index * 3 + 1];
-                double sfz = face_vector[neighbor_index * 3 + 2];
-                int neighbor_cell_id = csr_col_index[row_index + inner_index];
-                double neighbor_cell_Y = species[num_cells * s + neighbor_cell_id];
-                double face_Y = w * (own_cell_Y - neighbor_cell_Y) + neighbor_cell_Y;
-                grad_bx += face_Y * sfx;
-                grad_by += face_Y * sfy;
-                grad_bz += face_Y * sfz;
-            }
-    }
-    grady[num_cells * s * 3 + index * 3 + 0] = grad_bx / vol;
-    grady[num_cells * s * 3 + index * 3 + 1] = grad_by / vol;
-    grady[num_cells * s * 3 + index * 3 + 2] = grad_bz / vol;
+        rhoD = thermo_alpha[index] / lewis_number[s]; // le = alpha / D, hence the species-s term is alpha / le[s]
+        DEff[num * s + index] =  rhoD + mutsct; // species-major output: species s occupies DEff[num * s .. num * (s + 1))
     }
 }
-__global__ void fvc_grad_boundary(int num_cells, int num_boundary_cells, int num_boundary_faces, int num_species,
-        const int *boundary_cell_offset, const int *boundary_cell_id, const int *bouPermedIndex,
-        const double *boundary_face_vector, const double *boundary_species_init,
-        const double *volume, const double *grady_input, double *grady_output, bool uploadData)
+
+__global__ void yeqn_compute_phiUc_internal(int num_cells, int num_surfaces,
+        const int *lower_index, const int *upper_index,
+        const double *weight, const double *sf, const double *sumY_diff_error, double *phiUc)
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_boundary_cells)
+    if (index >= num_surfaces) // one thread per internal surface
         return;
 
-    int cell_offset = boundary_cell_offset[index];
-    int next_cell_offset = boundary_cell_offset[index + 1];
-    int cell_index = boundary_cell_id[cell_offset];
+    int owner = lower_index[index]; // cell index taken from the lower-address array for this surface
+    int neighbor = upper_index[index]; // cell index taken from the upper-address array for this surface
 
-    double vol = volume[index];
+    double sfx = sf[num_surfaces * 0 + index]; // sf is component-major: x block, then y block, then z block
+    double sfy = sf[num_surfaces * 1 + index];
+    double sfz = sf[num_surfaces * 2 + index];
 
-    // compute boundary gradient
-    for (int s = 0; s < num_species; s++) {
-    double grad_bx = 0;
-    double grad_by = 0;
-    double grad_bz = 0;
-    for (int i = cell_offset; i < next_cell_offset; i++)
-    {
-        double sfx = boundary_face_vector[i * 3 + 0];
-        double sfy = boundary_face_vector[i * 3 + 1];
-        double sfz = boundary_face_vector[i * 3 + 2];
-        double face_Y;
-        if (!uploadData)
-        {
-            face_Y = boundary_species_init[num_boundary_faces * s + i];
-        }
-        else
-        {
-            int permute_index = bouPermedIndex[i];
-            face_Y = boundary_species_init[num_boundary_faces * s + permute_index];
-        }
-        grad_bx += face_Y * sfx;
-        grad_by += face_Y * sfy;
-        grad_bz += face_Y * sfz;
-    }
+    double w = weight[index]; // interpolation weight: result below equals w * owner + (1 - w) * neighbor per component
+    double ssfx = (w * (sumY_diff_error[num_cells * 0 + owner] - sumY_diff_error[num_cells * 0 + neighbor]) + sumY_diff_error[num_cells * 0 + neighbor]);
+    double ssfy = (w * (sumY_diff_error[num_cells * 1 + owner] - sumY_diff_error[num_cells * 1 + neighbor]) + sumY_diff_error[num_cells * 1 + neighbor]);
+    double ssfz = (w * (sumY_diff_error[num_cells * 2 + owner] - sumY_diff_error[num_cells * 2 + neighbor]) + sumY_diff_error[num_cells * 2 + neighbor]);
 
-    grady_output[num_cells * s * 3 + cell_index * 3 + 0] =
-        grady_input[num_cells * s * 3 + cell_index * 3 + 0] + grad_bx / vol;
-    grady_output[num_cells * s * 3 + cell_index * 3 + 1] =
-        grady_input[num_cells * s * 3 + cell_index * 3 + 1] + grad_by / vol;
-    grady_output[num_cells * s * 3 + cell_index * 3 + 2] =
-        grady_input[num_cells * s * 3 + cell_index * 3 + 2] + grad_bz / vol;
-    }
+    phiUc[index] = sfx * ssfx + sfy * ssfy + sfz * ssfz; // dot product of sf with the interpolated sumY_diff_error
 }
-__global__ void correct_boundary_conditions(int num_cells, int num_boundary_cells, int num_boundary_faces, int num_species,
-                                                const int *boundary_cell_offset, const int *boundary_cell_id,
-                                                const double *boundary_sf, const double *mag_sf,
-                                                const double *grady, double* boundary_grady, const double *boundary_deltaCoeffs,
-                                                const double *Y, const double *boundary_Y, const int *Y_patch_type)
+ 
+__global__ void yeqn_compute_phiUc_boundary(int num_boundary_surfaces,
+        const double *boundary_sf, const double *boundary_sumY_diff_error, double *boundary_phiUc)
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_boundary_cells)
+    if (index >= num_boundary_surfaces) // one thread per boundary surface
         return;
 
-    int cell_offset = boundary_cell_offset[index];
-    int next_cell_offset = boundary_cell_offset[index + 1];
-    int cell_index = boundary_cell_id[cell_offset];
+    double boundary_sfx = boundary_sf[num_boundary_surfaces * 0 + index]; // component-major: x block, then y, then z
+    double boundary_sfy = boundary_sf[num_boundary_surfaces * 1 + index];
+    double boundary_sfz = boundary_sf[num_boundary_surfaces * 2 + index];
 
-    for (int s = 0; s < num_species; s++) {
-        // initialize boundary_sumYDiffError
-        double grady_x = grady[num_cells * s * 3 + cell_index * 3 + 0];
-        double grady_y = grady[num_cells * s * 3 + cell_index * 3 + 1];
-        double grady_z = grady[num_cells * s * 3 + cell_index * 3 + 2];
-        double internal_Y = Y[num_cells * s + cell_index];
-
-        for (int i = cell_offset; i < next_cell_offset; i++)
-        {
-            double n_x = boundary_sf[i * 3 + 0] / mag_sf[i];
-            double n_y = boundary_sf[i * 3 + 1] / mag_sf[i];
-            double n_z = boundary_sf[i * 3 + 2] / mag_sf[i];
-            int patchIndex = Y_patch_type[i];
-            double sn_grad;
-            if (patchIndex == 0) { // zeroGradient
-                sn_grad = 0;
-            } else if (patchIndex == 1) { // fixedValue
-                sn_grad = boundary_deltaCoeffs[i] * (boundary_Y[num_boundary_faces * s + i] - internal_Y);
-            }
-            // TODO: implement other BCs
-            double grad_correction = sn_grad - (n_x * grady_x + n_y * grady_y + n_z * grady_z);
-            boundary_grady[num_boundary_faces * s * 3 + i * 3 + 0] = grady_x + grad_correction * n_x;
-            boundary_grady[num_boundary_faces * s * 3 + i * 3 + 1] = grady_y + grad_correction * n_y;
-            boundary_grady[num_boundary_faces * s * 3 + i * 3 + 2] = grady_z + grad_correction * n_z;
-        }
-    }
-}
+    double boundary_ssfx = boundary_sumY_diff_error[num_boundary_surfaces * 0 + index]; // boundary value used directly, no interpolation
+    double boundary_ssfy = boundary_sumY_diff_error[num_boundary_surfaces * 1 + index];
+    double boundary_ssfz = boundary_sumY_diff_error[num_boundary_surfaces * 2 + index];
 
-__global__ void sumError_internal(int num_cells, int num_species,
-        const double *hai, const double *rhoD, const double *y, const double *grady,
-        double *sum_hai_rhoD_grady, double *sum_rhoD_grady, double *sum_hai_y)
+    boundary_phiUc[index] = boundary_sfx * boundary_ssfx + boundary_sfy * boundary_ssfy + boundary_sfz * boundary_ssfz; // dot product of boundary_sf with boundary_sumY_diff_error
+}
+ 
+__global__ void yeqn_sumError_and_compute_hDiffCorrFlux(int num_species, int num,
+        const double *rhoD, const double *hai, const double *y, const double *grady,
+        double *sum_rhoD_grady, double *hDiffCorrFlux)
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_cells)
+    if (index >= num)
         return;
 
     double sum_hai_rhoD_grady_x = 0;
@@ -177,1020 +96,914 @@ __global__ void sumError_internal(int num_cells, int num_species,
     double sum_rhoD_grady_x = 0;
     double sum_rhoD_grady_y = 0;
     double sum_rhoD_grady_z = 0;
-    double sum_hai_y_value = 0;
+    double sum_hai_y = 0;
     for (int s = 0; s < num_species; s++) {
-        double hai_value = hai[num_cells * s + index];
-        double rhoD_value = rhoD[num_cells * s + index];
-        double y_value = y[num_cells * s + index];
-        double grady_x = grady[num_cells * s * 3 + index * 3 + 0];
-        double grady_y = grady[num_cells * s * 3 + index * 3 + 1];
-        double grady_z = grady[num_cells * s * 3 + index * 3 + 2];
+        double hai_value = hai[num * s + index];
+        double rhoD_value = rhoD[num * s + index]; // le = alpha/D
+        double y_value = y[num * s + index];
+        double grady_x = grady[num * s * 3 + num * 0 + index];
+        double grady_y = grady[num * s * 3 + num * 1 + index];
+        double grady_z = grady[num * s * 3 + num * 2 + index];
         sum_hai_rhoD_grady_x += hai_value * rhoD_value * grady_x;
         sum_hai_rhoD_grady_y += hai_value * rhoD_value * grady_y;
         sum_hai_rhoD_grady_z += hai_value * rhoD_value * grady_z;
         sum_rhoD_grady_x += rhoD_value * grady_x;
         sum_rhoD_grady_y += rhoD_value * grady_y;
         sum_rhoD_grady_z += rhoD_value * grady_z;
-        sum_hai_y_value += hai_value * y_value;
+        sum_hai_y += hai_value * y_value;
     }
-    sum_hai_rhoD_grady[index * 3 + 0] = sum_hai_rhoD_grady_x;
-    sum_hai_rhoD_grady[index * 3 + 1] = sum_hai_rhoD_grady_y;
-    sum_hai_rhoD_grady[index * 3 + 2] = sum_hai_rhoD_grady_z;
-    sum_rhoD_grady[index * 3 + 0] = sum_rhoD_grady_x;
-    sum_rhoD_grady[index * 3 + 1] = sum_rhoD_grady_y;
-    sum_rhoD_grady[index * 3 + 2] = sum_rhoD_grady_z;
-    sum_hai_y[index] = sum_hai_y_value;
+    sum_rhoD_grady[num * 0 + index] = sum_rhoD_grady_x;
+    sum_rhoD_grady[num * 1 + index] = sum_rhoD_grady_y;
+    sum_rhoD_grady[num * 2 + index] = sum_rhoD_grady_z;
+    hDiffCorrFlux[num * 0 + index] = (sum_hai_rhoD_grady_x - sum_hai_y * sum_rhoD_grady_x);
+    hDiffCorrFlux[num * 1 + index] = (sum_hai_rhoD_grady_y - sum_hai_y * sum_rhoD_grady_y);
+    hDiffCorrFlux[num * 2 + index] = (sum_hai_rhoD_grady_z - sum_hai_y * sum_rhoD_grady_z);
 }
 
-__global__ void sumError_boundary(int num_boundary_faces, int num_species, const int *bouPermedIndex,
-        const double *boundary_hai, const double *boundary_rhoD, const double *boundary_y, const double *boundary_grady,
-        double *sum_boundary_hai_rhoD_grady, double *sum_boundary_rhoD_grady, double *sum_boundary_hai_y, bool uploadData)
+__global__ void yeqn_fvc_laplacian_scalar_internal(int num_species, int num_cells, int num_surfaces,
+        const int *lower_index, const int *upper_index,
+        const double *mag_sf, const double *delta_coeffs, const double *weight,
+        const double *thermo_alpha, const double *hai, const double *vf, double *output)
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_boundary_faces)
+    if (index >= num_surfaces)
         return;
 
-    int permute_index, permute_index_Y;
-    if (!uploadData)
-    {
-        permute_index_Y = index;
-    }
-    else
-    {
-        permute_index_Y = bouPermedIndex[index];
-    }
-    permute_index = bouPermedIndex[index];
-    
-    double sum_boundary_hai_rhoD_grady_x = 0;
-    double sum_boundary_hai_rhoD_grady_y = 0;
-    double sum_boundary_hai_rhoD_grady_z = 0;
-    double sum_boundary_rhoD_grady_x = 0;
-    double sum_boundary_rhoD_grady_y = 0;
-    double sum_boundary_rhoD_grady_z = 0;
-    double sum_boundary_hai_y_value = 0;
+    int owner = lower_index[index];
+    int neighbor = upper_index[index];
+
+    double w = weight[index];
+    double magsf = mag_sf[index];
+    double delta_coeff = delta_coeffs[index];
+    double thermo_alpha_owner = thermo_alpha[owner];
+    double thermo_alpha_neighbor = thermo_alpha[neighbor];
+
+    //if (owner == 21 || neighbor == 21)
+    //   printf("input index: %d, thermo: %.16lf, %.16lf\n", index, thermo_alpha_owner, thermo_alpha_neighbor);
+    double sum_ssf = 0;
     for (int s = 0; s < num_species; s++) {
-        double boundary_hai_value = boundary_hai[num_boundary_faces * s + permute_index];
-        double boundary_rhoD_value = boundary_rhoD[num_boundary_faces * s + permute_index];
-        double boundary_y_value = boundary_y[num_boundary_faces * s + permute_index_Y];
-        double boundary_grady_x = boundary_grady[num_boundary_faces * s * 3 + index * 3 + 0];
-        double boundary_grady_y = boundary_grady[num_boundary_faces * s * 3 + index * 3 + 1];
-        double boundary_grady_z = boundary_grady[num_boundary_faces * s * 3 + index * 3 + 2];
-        sum_boundary_hai_rhoD_grady_x += boundary_hai_value * boundary_rhoD_value * boundary_grady_x;
-        sum_boundary_hai_rhoD_grady_y += boundary_hai_value * boundary_rhoD_value * boundary_grady_y;
-        sum_boundary_hai_rhoD_grady_z += boundary_hai_value * boundary_rhoD_value * boundary_grady_z;
-        sum_boundary_rhoD_grady_x += boundary_rhoD_value * boundary_grady_x;
-        sum_boundary_rhoD_grady_y += boundary_rhoD_value * boundary_grady_y;
-        sum_boundary_rhoD_grady_z += boundary_rhoD_value * boundary_grady_z;
-        sum_boundary_hai_y_value += boundary_hai_value * boundary_y_value;
+        double haii_owner = hai[num_cells * s + owner];
+        double haii_neighbor = hai[num_cells * s + neighbor];
+        double gamma = w * (thermo_alpha_owner * haii_owner) + (1 - w) * (thermo_alpha_neighbor * haii_neighbor);
+        double sngrad = delta_coeff * (vf[num_cells * s + neighbor] - vf[num_cells * s + owner]);
+        double ssf = gamma * sngrad * magsf;
+        sum_ssf += ssf;
+        //if (owner == 21 || neighbor == 21)
+        //    printf("hai: %.16lf, %.16lf, gamma: %.16lf, sngrad: %.16lf, ssf: %.16lf\n", haii_owner, haii_neighbor, gamma, sngrad, ssf);
     }
-    sum_boundary_hai_rhoD_grady[index * 3 + 0] = sum_boundary_hai_rhoD_grady_x;
-    sum_boundary_hai_rhoD_grady[index * 3 + 1] = sum_boundary_hai_rhoD_grady_y;
-    sum_boundary_hai_rhoD_grady[index * 3 + 2] = sum_boundary_hai_rhoD_grady_z;
-    sum_boundary_rhoD_grady[index * 3 + 0] = sum_boundary_rhoD_grady_x;
-    sum_boundary_rhoD_grady[index * 3 + 1] = sum_boundary_rhoD_grady_y;
-    sum_boundary_rhoD_grady[index * 3 + 2] = sum_boundary_rhoD_grady_z;
-    sum_boundary_hai_y[index] = sum_boundary_hai_y_value;
+
+    // owner cell: accumulate the species-summed face flux
+    atomicAdd(&(output[owner]), sum_ssf);
+    // neighbor cell: accumulate the same flux with opposite sign
+    atomicAdd(&(output[neighbor]), -sum_ssf);
 }
 
-__global__ void calculate_hDiffCorrFlux(int num,
-        const double *sum_hai_rhoD_grady, const double *sum_rhoD_grady, const double *sum_hai_y, double *hDiffCorrFlux)
+__global__ void yeqn_fvc_laplacian_scalar_boundary_fixedValue(int num_species, int num_cells, int num_boundary_surfaces,
+        int num, int offset, const int *face2Cells,
+        const double *boundary_mag_sf, const double *boundary_delta_coeffs,
+        const double *boundary_thermo_alpha, const double *boundary_hai,
+        const double *vf, const double *boundary_vf, double *output)
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
     if (index >= num)
         return;
 
-    hDiffCorrFlux[index * 3 + 0] += (sum_hai_rhoD_grady[index * 3 + 0] - sum_hai_y[index] * sum_rhoD_grady[index * 3 + 0]);
-    hDiffCorrFlux[index * 3 + 1] += (sum_hai_rhoD_grady[index * 3 + 1] - sum_hai_y[index] * sum_rhoD_grady[index * 3 + 1]);
-    hDiffCorrFlux[index * 3 + 2] += (sum_hai_rhoD_grady[index * 3 + 2] - sum_hai_y[index] * sum_rhoD_grady[index * 3 + 2]);
-}
+    int start_index = offset + index;
+    int cellIndex = face2Cells[start_index];
 
-__global__ void calculate_phiUc_internal(int num_cells,
-        const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index,
-        const double *face_vector, const double *weight, const double *sumYDiffError, double *phiUc)
-{
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_cells)
-        return;
+    double boundary_delta_coeff = boundary_delta_coeffs[start_index];
+    double boundary_magsf = boundary_mag_sf[start_index];
+    double boundary_alpha = boundary_thermo_alpha[start_index];
 
-    // A_csr has one more element in each row: itself
-    int row_index = csr_row_index[index];
-    int row_elements = csr_row_index[index + 1] - row_index;
-    int diag_index = csr_diag_index[index];
-    int neighbor_offset = csr_row_index[index] - index;
-
-    double own_cell_sumYDiffError_x = sumYDiffError[index * 3 + 0];
-    double own_cell_sumYDiffError_y = sumYDiffError[index * 3 + 1];
-    double own_cell_sumYDiffError_z = sumYDiffError[index * 3 + 2];
-
-    // lower
-    for (int i = 0; i < diag_index; i++)
-    {
-        double phiUc_face = 0;
-
-        int neighbor_index = neighbor_offset + i;
-        int neighbor_cell_id = csr_col_index[row_index + i];
-        double w = weight[neighbor_index];
-        double sfx = face_vector[neighbor_index * 3 + 0];
-        double sfy = face_vector[neighbor_index * 3 + 1];
-        double sfz = face_vector[neighbor_index * 3 + 2];
-        double neighbor_cell_sumYDiffError_x = sumYDiffError[neighbor_cell_id * 3 + 0];
-        double neighbor_cell_sumYDiffError_y = sumYDiffError[neighbor_cell_id * 3 + 1];
-        double neighbor_cell_sumYDiffError_z = sumYDiffError[neighbor_cell_id * 3 + 2];
-        double face_x = w * (neighbor_cell_sumYDiffError_x - own_cell_sumYDiffError_x) + own_cell_sumYDiffError_x;
-        double face_y = w * (neighbor_cell_sumYDiffError_y - own_cell_sumYDiffError_y) + own_cell_sumYDiffError_y;
-        double face_z = w * (neighbor_cell_sumYDiffError_z - own_cell_sumYDiffError_z) + own_cell_sumYDiffError_z;
-
-        phiUc_face = face_x * sfx + face_y * sfy + face_z * sfz;
-        phiUc[neighbor_index] = phiUc_face;
-    }
-    // upper
-    for (int i = diag_index + 1; i < row_elements; i++)
-    {
-        double phiUc_face = 0;
-
-        int neighbor_index = neighbor_offset + i - 1;
-        int neighbor_cell_id = csr_col_index[row_index + i];
-        double w = weight[neighbor_index];
-        double sfx = face_vector[neighbor_index * 3 + 0];
-        double sfy = face_vector[neighbor_index * 3 + 1];
-        double sfz = face_vector[neighbor_index * 3 + 2];
-        double neighbor_cell_sumYDiffError_x = sumYDiffError[neighbor_cell_id * 3 + 0];
-        double neighbor_cell_sumYDiffError_y = sumYDiffError[neighbor_cell_id * 3 + 1];
-        double neighbor_cell_sumYDiffError_z = sumYDiffError[neighbor_cell_id * 3 + 2];
-        double face_x = w * (own_cell_sumYDiffError_x - neighbor_cell_sumYDiffError_x) + neighbor_cell_sumYDiffError_x;
-        double face_y = w * (own_cell_sumYDiffError_y - neighbor_cell_sumYDiffError_y) + neighbor_cell_sumYDiffError_y;
-        double face_z = w * (own_cell_sumYDiffError_z - neighbor_cell_sumYDiffError_z) + neighbor_cell_sumYDiffError_z;
-
-        phiUc_face = face_x * sfx + face_y * sfy + face_z * sfz;
-        phiUc[neighbor_index] = phiUc_face;
+    double sum_boundary_ssf = 0;
+    for (int s = 0; s < num_species; s++) {
+        // sn_grad for a fixedValue BC: deltaCoeff * (boundary face value - adjacent cell value)
+        double boundary_sngrad = boundary_delta_coeff * (boundary_vf[num_boundary_surfaces * s + start_index] - vf[num_cells * s + cellIndex]);
+        double boundary_gamma = boundary_alpha * boundary_hai[num_boundary_surfaces * s + start_index];
+        double boundary_ssf = boundary_gamma * boundary_sngrad * boundary_magsf;
+        sum_boundary_ssf += boundary_ssf;
     }
+
+    atomicAdd(&(output[cellIndex]), sum_boundary_ssf);
 }
 
-__global__ void calculate_phiUc_boundary(int num_boundary_faces,
-                                         const int *boundary_cell_offset, const int *boundary_cell_id,
-                                         const double *boundary_sf, const double *boundary_sumYDiffError,
-                                         double *boundary_phiUc)
+__global__ void yeqn_fvc_laplacian_scalar_boundary_cyclic(int num_species, int num_cells, int num_boundary_surfaces,
+        int num, int internal_offset, int neighbor_offset, const int *face2Cells,
+        const double *boundary_mag_sf, const double *boundary_delta_coeffs,
+        const double *boundary_thermo_alpha, const double *boundary_hai,
+        const double *vf, const double *boundary_vf, double *output)
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_boundary_faces)
+    if (index >= num)
         return;
 
-    double n_x = boundary_sf[index * 3 + 0];
-    double n_y = boundary_sf[index * 3 + 1];
-    double n_z = boundary_sf[index * 3 + 2];
-
-    double err_x = boundary_sumYDiffError[index * 3 + 0];
-    double err_y = boundary_sumYDiffError[index * 3 + 1];
-    double err_z = boundary_sumYDiffError[index * 3 + 2];
-
-    boundary_phiUc[index] = n_x * err_x + n_y * err_y + n_z * err_z;
-}
+    int internal_start_index = internal_offset + index;
+    int neighbor_start_index = neighbor_offset + index;
 
-__global__ void fvm_ddt_kernel_scalar(int num_cells, int num_faces, int num_species, int inertIndex, const double rdelta_t,
-                                      const int *csr_row_index, const int *csr_diag_index,
-                                      const double *rho_old, const double *rho_new, const double *volume, const double *species_old,
-                                      const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output)
-{
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_cells)
-        return;
+    int internal_cellIndex = face2Cells[internal_start_index];
+    int neighbor_cellIndex = face2Cells[neighbor_start_index];
 
-    // A_csr has one more element in each row: itself
-    int row_index = csr_row_index[index];
-    int diag_index = csr_diag_index[index];
-    int csr_index = row_index + diag_index;
+    double boundary_delta_coeff = boundary_delta_coeffs[internal_start_index];
+    double boundary_magsf = boundary_mag_sf[internal_start_index];
+    double boundary_alpha = boundary_thermo_alpha[internal_start_index];
 
-    double ddt_diag = rdelta_t * rho_new[index] * volume[index];
-    double ddt_part_term = rdelta_t * rho_old[index] * volume[index];
-    int mtxIndex = 0;
+    double sum_boundary_ssf = 0;
     for (int s = 0; s < num_species; s++) {
-        if (s == inertIndex)
-            continue;
-        A_csr_output[mtxIndex * (num_cells + num_faces) + csr_index] =
-            A_csr_input[mtxIndex * (num_cells + num_faces) + csr_index] + ddt_diag;
-        b_output[mtxIndex * num_cells + index] =
-            b_input[mtxIndex * num_cells + index] + ddt_part_term * species_old[num_cells * s + index];
-        ++mtxIndex;
+        // sn_grad for a coupled BC: deltaCoeff * (cell at neighbor_offset face - cell at internal_offset face)
+        double boundary_sngrad = boundary_delta_coeff *
+            (vf[num_cells * s + neighbor_cellIndex] - vf[num_cells * s + internal_cellIndex]);
+        double boundary_gamma = boundary_alpha * boundary_hai[num_boundary_surfaces * s + internal_start_index];
+        double boundary_ssf = boundary_gamma * boundary_sngrad * boundary_magsf;
+        sum_boundary_ssf += boundary_ssf;
     }
-}
-
-__global__ void compute_inertIndex_y(int num_cells, int num_species, int inertIndex, double *y)
-{
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_cells)
-        return;
 
-    double sum_yi = 0;
-    for (int i = 0; i < num_species; i++)
-    {
-        if (i == inertIndex) continue;
-
-        double yi = y[num_cells * i + index];
-        sum_yi += yi > 0 ? yi : 0;
-    }
-    sum_yi = 1 - sum_yi;
-    y[num_cells * inertIndex + index] = (sum_yi > 0 ? sum_yi : 0);
+    atomicAdd(&(output[internal_cellIndex]), sum_boundary_ssf);
 }
 
-__global__ void fvm_div_internal_scalar(int num_cells, int num_faces, int num_species, int inertIndex,
-                                        const int *csr_row_index, const int *csr_diag_index,
-                                        const double *div_weight, const double *phi,
-                                        const double *A_csr_input, double *A_csr_output)
-{
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_cells)
-        return;
-
-    // A_csr has one more element in each row: itself
-    int row_index = csr_row_index[index];
-    int next_row_index = csr_row_index[index + 1];
-    int diag_index = csr_diag_index[index];
-    int neighbor_offset = csr_row_index[index] - index;
-
-    int mtxIndex = 0;
-    for (int s = 0; s < num_species; s++) {
-        if (s == inertIndex)
-            continue;
-    double div_diag = 0;
-    for (int i = row_index; i < next_row_index; i++)
-    {
-        int inner_index = i - row_index;
-        // lower
-        if (inner_index < diag_index)
-        {
-            int neighbor_index = neighbor_offset + inner_index;
-            double w = div_weight[neighbor_index];
-            double f = phi[neighbor_index];
-            A_csr_output[mtxIndex * (num_cells + num_faces) + i] =
-                A_csr_input[mtxIndex * (num_cells + num_faces) + i] + (-w) * f;
-            // lower neighbors contribute to sum of -1
-            div_diag += (w - 1) * f;
-        }
-        // upper
-        if (inner_index > diag_index)
-        {
-            // upper, index - 1, consider of diag
-            int neighbor_index = neighbor_offset + inner_index - 1;
-            double w = div_weight[neighbor_index];
-            double f = phi[neighbor_index];
-            A_csr_output[mtxIndex * (num_cells + num_faces) + i] =
-                A_csr_input[mtxIndex * (num_cells + num_faces) + i] + (1 - w) * f;
-            // upper neighbors contribute to sum of 1
-            div_diag += w * f;
-        }
-    }
-    A_csr_output[mtxIndex * (num_cells + num_faces) + row_index + diag_index] =
-        A_csr_input[mtxIndex * (num_cells + num_faces) + row_index + diag_index] + div_diag; // diag
-        ++mtxIndex;
-    }
-}
-__global__ void fvm_div_boundary_scalar(int num_cells, int num_faces, int num_boundary_cells, int num_boundary_faces,
-                                        int num_species, int inertIndex,
-                                        const int *csr_row_index, const int *csr_diag_index, const double *boundary_phi,
-                                        const int *boundary_cell_offset, const int *boundary_cell_id,
-                                        double *internal_coeffs, const double *boundary_coeffs,
-                                        const double *A_csr_input, double *A_csr_output, const double *b_input, double *b_output)
+__global__ void yeqn_fvc_laplacian_scalar_boundary_processor(int num_species, int num_cells, int num_boundary_surfaces,
+        int num, int offset, const int *face2Cells,
+        const double *boundary_mag_sf, const double *boundary_delta_coeffs, const double *boundary_weight,
+        const double *boundary_thermo_alpha, const double *boundary_hai,
+        const double *vf, const double *boundary_vf, double *output)
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_boundary_cells)
+    if (index >= num)
         return;
 
-    int cell_offset = boundary_cell_offset[index];
-    int cell_index = boundary_cell_id[cell_offset];
-    int loop_size = boundary_cell_offset[index + 1] - cell_offset;
+    int neighbor_start_index = offset + index;
+    int internal_start_index = offset + num + index;
+    int cellIndex = face2Cells[neighbor_start_index];
 
-    int row_index = csr_row_index[cell_index];
-    int diag_index = csr_diag_index[cell_index];
-    int csr_dim = num_cells + num_faces;
-    int csr_index = row_index + diag_index;
+    double boundary_magsf = boundary_mag_sf[neighbor_start_index];
+    double boundary_delta_coeff = boundary_delta_coeffs[neighbor_start_index];
+    double boundary_w = boundary_weight[neighbor_start_index];
+    double boundary_thermo_alpha_owner = boundary_thermo_alpha[internal_start_index];
+    double boundary_thermo_alpha_neighbor = boundary_thermo_alpha[neighbor_start_index];
 
-    int mtxIndex = 0;
+    double sum_boundary_ssf = 0;
     for (int s = 0; s < num_species; s++) {
-        if (s == inertIndex)
-            continue;
-    // construct internalCoeffs & boundaryCoeffs
-    double internal_coeffs_own = 0;
-    double boundary_coeffs_own = 0;
-    for (int i = 0; i < loop_size; i++)
-    {
-        internal_coeffs_own += boundary_phi[cell_offset + i] * internal_coeffs[num_boundary_faces * s + cell_offset + i];
-        boundary_coeffs_own += -boundary_phi[cell_offset + i] * boundary_coeffs[num_boundary_faces * s + cell_offset + i];
-    }
-    A_csr_output[mtxIndex * (num_cells + num_faces) + csr_index] =
-        A_csr_input[mtxIndex * (num_cells + num_faces) + csr_index] + internal_coeffs_own;
-    b_output[mtxIndex * num_cells + cell_index] =
-        b_input[mtxIndex * num_cells + cell_index] + boundary_coeffs_own;
-        ++mtxIndex;
+        double boundary_haii_owner = boundary_hai[num_boundary_surfaces * s + internal_start_index];
+        double boundary_haii_neighbor = boundary_hai[num_boundary_surfaces * s + neighbor_start_index];
+        double boundary_sngrad = boundary_delta_coeff *
+            (boundary_vf[num_boundary_surfaces * s + neighbor_start_index] - vf[num_cells * s + cellIndex]);
+        double boundary_gamma = boundary_w * (boundary_thermo_alpha_owner * boundary_haii_owner)
+            + (1 - boundary_w) * (boundary_thermo_alpha_neighbor * boundary_haii_neighbor);
+        double boundary_ssf = boundary_gamma * boundary_sngrad * boundary_magsf;
+        sum_boundary_ssf += boundary_ssf;
     }
-}
-
-__global__ void fvm_laplacian_uncorrected_scalar_internal(int num_cells, int num_faces, int num_species, int inertIndex,
-                                                          const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index,
-                                                          const double *mut_sct, const double *rhoD, const double *weight,
-                                                          const double *magsf, const double *distance,
-                                                          const double sign, const double *A_csr_input, double *A_csr_output)
-{
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_cells)
-        return;
 
-    // A_csr has one more element in each row: itself
-    int row_index = csr_row_index[index];
-    int row_elements = csr_row_index[index + 1] - row_index;
-    int diag_index = csr_diag_index[index];
-    int neighbor_offset = csr_row_index[index] - index;
-
-    int mtxIndex = 0;
-    for (int s = 0; s < num_species; s++) {
-    if (s == inertIndex) continue;
-    double own_coeff = mut_sct[index] + rhoD[num_cells * s + index];
-    double sum_diag = 0;
-    // lower
-    for (int i = 0; i < diag_index; i++)
-    {
-        int neighbor_index = neighbor_offset + i;
-        int neighbor_cell_id = csr_col_index[i + row_index];
-        double w = weight[neighbor_index];
-        double nei_coeff = mut_sct[neighbor_cell_id] + rhoD[num_cells * s + neighbor_cell_id];
-        double gamma = w * (nei_coeff - own_coeff) + own_coeff;
-        double gamma_magsf = gamma * magsf[neighbor_index];
-        double coeff = gamma_magsf * distance[neighbor_index];
-        A_csr_output[mtxIndex * (num_cells + num_faces) + row_index + i] =
-            A_csr_input[mtxIndex * (num_cells + num_faces) + row_index + i] + coeff * sign;
-
-        sum_diag += (-coeff);
-    }
-    // upper
-    for (int i = diag_index + 1; i < row_elements; i++)
-    {
-        int neighbor_index = neighbor_offset + i - 1;
-        int neighbor_cell_id = csr_col_index[i + row_index];
-        double w = weight[neighbor_index];
-        double nei_coeff = mut_sct[neighbor_cell_id] + rhoD[num_cells * s + neighbor_cell_id];
-        double gamma = w * (own_coeff - nei_coeff) + nei_coeff;
-        double gamma_magsf = gamma * magsf[neighbor_index];
-        double coeff = gamma_magsf * distance[neighbor_index];
-        A_csr_output[mtxIndex * (num_cells + num_faces) + row_index + i] =
-            A_csr_input[mtxIndex * (num_cells + num_faces) + row_index + i] + coeff * sign;
-
-        sum_diag += (-coeff);
-    }
-    // diag
-    A_csr_output[mtxIndex * (num_cells + num_faces) + row_index + diag_index] =
-        A_csr_input[mtxIndex * (num_cells + num_faces) + row_index + diag_index] + sum_diag * sign;
-    ++mtxIndex;
-    }
+    atomicAdd(&(output[cellIndex]), sum_boundary_ssf);
 }
 
-__global__ void fvm_laplacian_uncorrected_scalar_boundary(int num_cells, int num_faces, int num_boundary_cells, int num_boundary_faces,
-        int num_species, int inertIndex,
-        const int *csr_row_index, const int *csr_diag_index, const int *boundary_cell_offset,
-        const int *boundary_cell_id, const double *boundary_mut_sct, const double *boundary_rhoD,
-        const double *boundary_magsf, const int *bouPermedIndex,
-        const double *gradient_internal_coeffs, const double *gradient_boundary_coeffs,
-        const double sign, const double *A_csr_input, const double *b_input, double *A_csr_output, double *b_output)
+__global__ void yeqn_buildBC_scalar(int num_boundary_surfaces,
+        const int *face2Cells, const double *output, double *boundary_output)
+
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_boundary_cells)
+    if (index >= num_boundary_surfaces)
         return;
 
-    int cell_offset = boundary_cell_offset[index];
-    int next_cell_offset = boundary_cell_offset[index + 1];
-    int cell_index = boundary_cell_id[cell_offset];
-
-    int row_index = csr_row_index[cell_index];
-    int diag_index = csr_diag_index[cell_index];
-    int csr_index = row_index + diag_index;
-
-    int mtxIndex = 0;
-    for (int s = 0; s < num_species; s++) {
-        if (s == inertIndex) continue;
-        double internal_coeffs = 0;
-        double boundary_coeffs = 0;
-        for (int i = cell_offset; i < next_cell_offset; i++)
-        {
-            int permute_index = bouPermedIndex[i];
-            double gamma = boundary_mut_sct[permute_index] + boundary_rhoD[num_boundary_faces * s + permute_index];
-            double gamma_magsf = gamma * boundary_magsf[i];
-            internal_coeffs += gamma_magsf * gradient_internal_coeffs[num_boundary_faces * s + i];
-            boundary_coeffs -= gamma_magsf * gradient_boundary_coeffs[num_boundary_faces * s + i];
-        }
-
-        A_csr_output[mtxIndex * (num_cells + num_faces) + csr_index] =
-            A_csr_input[mtxIndex * (num_cells + num_faces) + csr_index] + internal_coeffs * sign;
-        b_output[mtxIndex * num_cells + cell_index] =
-            b_input[mtxIndex * num_cells + cell_index] + boundary_coeffs * sign;
-        ++mtxIndex;
-    }
+    int cellIndex = face2Cells[index];
+    boundary_output[index] = output[cellIndex];
 }
 
-__global__ void fvc_laplacian_internal(int num_cells, int num_species,
-        const int *csr_row_index, const int *csr_col_index, const int *csr_diag_index,
-        const double *alpha, const double *hai, const double* y,
-        const double *weight, const double *magsf, const double *distance,
-        const double* volume, double *output)
+__global__ void yeqn_divide_cell_volume_scalar(int num_cells, const double* volume, double *output)
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
     if (index >= num_cells)
         return;
 
-    // A_csr has one more element in each row: itself
-    int row_index = csr_row_index[index];
-    int row_elements = csr_row_index[index + 1] - row_index;
-    int diag_index = csr_diag_index[index];
-    int neighbor_offset = csr_row_index[index] - index;
-
     double vol = volume[index];
-    double sum_all_species = 0;
-	for (int s = 0; s < num_species; s++) {
-		double own_vf = y[num_cells * s + index];
-		double own_coeff = alpha[index] * hai[num_cells * s + index];
-		double sum = 0;
-		// lower
-		for (int i = 0; i < diag_index; i++)
-		{
-			int neighbor_index = neighbor_offset + i;
-			int neighbor_cell_id = csr_col_index[i + row_index];
-			double w = weight[neighbor_index];
-			double nei_vf = y[num_cells * s + neighbor_cell_id];
-			double nei_coeff = alpha[neighbor_cell_id] * hai[num_cells * s + neighbor_cell_id];
-			double face_gamma = (1 - w) * own_coeff + w * nei_coeff;
-			double sngrad = distance[neighbor_index] * (own_vf - nei_vf);
-			double value = face_gamma * sngrad * magsf[neighbor_index];
-			sum -= value;
-		}
-		// upper
-		for (int i = diag_index + 1; i < row_elements; i++)
-		{
-			int neighbor_index = neighbor_offset + i - 1;
-			int neighbor_cell_id = csr_col_index[i + row_index];
-			double w = weight[neighbor_index];
-			double nei_vf = y[num_cells * s + neighbor_cell_id];
-			double nei_coeff = alpha[neighbor_cell_id] * hai[num_cells * s + neighbor_cell_id];
-			double face_gamma = w * own_coeff + (1 - w) * nei_coeff;
-			double sngrad = distance[neighbor_index] * (nei_vf - own_vf);
-			double value = face_gamma * sngrad * magsf[neighbor_index];
-			sum += value;
-		}
-		sum_all_species += sum;
-	}
-	output[index] = sum_all_species / vol;
+
+    output[index] = output[index] / vol;
 }
 
-__global__ void yeqn_update_BoundaryCoeffs_kernel(int num_boundary_faces, int num_species,
-                                                  const double *boundary_phi, double *internal_coeffs,
-                                                  double *boundary_coeffs, double *laplac_internal_coeffs,
-                                                  double *laplac_boundary_coeffs, const int *Y_patch_type,
-                                                  const double *boundary_Y, const double *boundary_deltaCoeffs,
-                                                  const int* bouPermedIndex, bool uploadData)
+__global__ void yeqn_compute_y_inertIndex_kernel(int num_species, int inertIndex, int num_cells, double *y) // per cell: clip every non-inert y to >= 0, then store max(0, 1 - sum) in the inertIndex slot
 {
     int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_boundary_faces)
+    if (index >= num_cells) // surplus threads beyond the last cell exit without touching y
         return;
 
-    int patchIndex = Y_patch_type[index];
-    double valueInternalCoeffs, valueBoundaryCoeffs, gradientInternalCoeffs, gradientBoundaryCoeffs;
+    double sum_y = 0; // accumulates the clipped values of every species except inertIndex
     for (int s = 0; s < num_species; s++) {
-        if (patchIndex == 0) { // zeroGradient
-            valueInternalCoeffs = 1.;
-            valueBoundaryCoeffs = 0.;
-            gradientInternalCoeffs = 0.;
-            gradientBoundaryCoeffs = 0.;
-        } else if (patchIndex == 1) { // fixedValue
-            if (!uploadData) {
-                    valueInternalCoeffs = 0.;
-                    valueBoundaryCoeffs = boundary_Y[index + s * num_boundary_faces];
-                    gradientInternalCoeffs = -1 * boundary_deltaCoeffs[index];
-                    gradientBoundaryCoeffs = boundary_Y[index + s * num_boundary_faces] * boundary_deltaCoeffs[index];   
-                } else {
-                    int permute_index = bouPermedIndex[index];
-                    valueInternalCoeffs = 0.;
-                    valueBoundaryCoeffs = boundary_Y[permute_index + s * num_boundary_faces];
-                    gradientInternalCoeffs = -1 * boundary_deltaCoeffs[index];
-                    gradientBoundaryCoeffs = boundary_Y[permute_index + s * num_boundary_faces] * boundary_deltaCoeffs[index];
-                }
-        } else if (patchIndex == 2) { // empty
-            valueInternalCoeffs = 0.;
-            valueBoundaryCoeffs = 0.;
-            gradientInternalCoeffs = 0.;
-            gradientBoundaryCoeffs = 0.;
-        }
-        internal_coeffs[num_boundary_faces * s + index] = valueInternalCoeffs;
-        boundary_coeffs[num_boundary_faces * s + index] = valueBoundaryCoeffs;
-        laplac_internal_coeffs[num_boundary_faces * s + index] = gradientInternalCoeffs;
-        laplac_boundary_coeffs[num_boundary_faces * s + index] = gradientBoundaryCoeffs;
-        
-    }
-}
+        if (s == inertIndex) continue; // the inert slot is derived after the loop, not clipped here
 
-__global__ void yeqn_correct_BoundaryConditions_kernel(int num_cells, int num_boundary_cells, int num_boundary_faces, int num_species,
-                                                       const int *boundary_cell_offset, const int *boundary_cell_id,
-                                                       const double *species, double *boundary_species, const int *Y_patch_type)
-{
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_boundary_cells)
-        return;
+        double yi = y[num_cells * s + index]; // species-major layout: the values of species s start at offset num_cells * s
 
-    int cell_offset = boundary_cell_offset[index];
-    int next_cell_offset = boundary_cell_offset[index + 1];
-    int cell_index = boundary_cell_id[cell_offset];
-
-    for (int i = cell_offset; i < next_cell_offset; i++)
-    {
-        int patchIndex = Y_patch_type[i];
-
-        switch (patchIndex)
-        {
-            case 0: // zeroGradient
-            {
-                for (int speciesID = 0; speciesID < num_species; speciesID++)
-                {
-                    boundary_species[speciesID * num_boundary_faces + i] = species[speciesID * num_cells + cell_index];
-                }
-                break;
-            }
-            // case 1:
-            //     break;
-            // TODO implement coupled conditions
-        }
+        y[num_cells * s + index] = yi > 0 ? yi : 0; // write the clipped value back in place
+        sum_y += yi > 0 ? yi : 0; // sum the clipped value, not the raw one
     }
+    sum_y = 1 - sum_y; // what is left over for the inert slot
+    y[num_cells * inertIndex + index] = (sum_y > 0 ? sum_y : 0); // stored value is never negative
 }
 
-__global__ void yeqn_calculate_rhoD_alpha_via_nuEff_internal(int num_cells, int num_species, double *rhoD, const double *nuEff,
-        const double *rho, double *alpha)
-{
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_cells)
-        return;
-    
-    // // rhoD = alpha (UnityLewis)
-    // // alpha = nu * rho / 0.7
-    // for (int i = 0; i < num_species; i++) {
-    //     rhoD[i * num_cells + index] = nuEff[index] * rho[index] / 0.7;
-    // }
-
-    alpha[index] = rhoD[index];
-}
+double* dfYEqn::getFieldPointer(const char* fieldAlias, location loc, position pos) { // returns the buffer registered in fieldPointerMap for this alias, or nullptr (after a warning) if there is none
+    char mergedName[256] = ""; // zero-initialised so the key is a valid empty string when pos matches neither branch
+    if (pos == position::internal) {
+        snprintf(mergedName, sizeof(mergedName), "%s_%s", (loc == location::cpu) ? "h" : "d", fieldAlias); // bounded write: an over-long alias is truncated instead of overflowing the buffer
+    } else if (pos == position::boundary) {
+        snprintf(mergedName, sizeof(mergedName), "%s_boundary_%s", (loc == location::cpu) ? "h" : "d", fieldAlias);
+    }
 
-__global__ void yeqn_calculate_rhoD_alpha_via_nuEff_boundary(int num_boundary_face, int num_species, int *permutIndex,
-        double *boundary_rhoD, const double *boundary_nuEff, const double *boundary_rho, double *boundary_alpha)    
-{
-    int index = blockDim.x * blockIdx.x + threadIdx.x;
-    if (index >= num_boundary_face)
-        return;
-    
-    // for (int i = 0; i < num_species; i++) {
-    //     boundary_rhoD[i * num_boundary_face + index] = boundary_nuEff[index] * boundary_rho[index] / 0.7;
-    // }
+    double *pointer = nullptr;
+    const auto entry = fieldPointerMap.find(std::string(mergedName)); // one lookup; find() never inserts a missing key
+    if (entry != fieldPointerMap.end()) {
+        pointer = entry->second;
+    }
+    if (pointer == nullptr) {
+        fprintf(stderr, "Warning! getFieldPointer of %s returns nullptr!\n", mergedName);
+    }
 
-    boundary_alpha[index] = boundary_rhoD[permutIndex[index]];
+    return pointer;
 }
 
-dfYEqn::dfYEqn(dfMatrixDataBase &dataBase, const std::string &modeStr, const std::string &cfgFile, const int inertIndex)
-    : dataBase_(dataBase), inertIndex(inertIndex)
-{
-    stream = dataBase_.stream;
-    num_species = dataBase_.num_species;
-    num_cells = dataBase_.num_cells;
-    num_faces = dataBase_.num_faces;
-    num_surfaces = dataBase_.num_surfaces;
-    num_boundary_cells = dataBase_.num_boundary_cells;
-    num_boundary_faces = dataBase_.num_boundary_faces;
-    cell_bytes = dataBase_.cell_bytes;
-    boundary_face_bytes = dataBase_.boundary_face_bytes;
-    d_rhoD = dataBase_.d_rhoD;
-    d_boundary_rhoD = dataBase_.d_boundary_rhoD;
-    d_alpha = dataBase_.d_alpha;
-    
-
-    YSolverSet.resize(num_species - 1); // consider inert species
+void dfYEqn::setConstantValues(const std::string &mode_string, const std::string &setting_path, const int inertIndex) { // record the run-time settings and fill YSolverSet with AmgXSolver instances
+    this->stream = dataBase_.stream; // this equation works on the database's stream
+    this->mode_string = mode_string;
+    this->setting_path = setting_path;
+    this->inertIndex = inertIndex;
+    YSolverSet.resize(dataBase_.num_species - 1); // one slot fewer than num_species: the inert species gets no solver
     for (auto &solver : YSolverSet)
-        solver = new AmgXSolver(modeStr, cfgFile);
-
-    d_A_csr_row_index = dataBase_.d_A_csr_row_index;
-    d_A_csr_diag_index = dataBase_.d_A_csr_diag_index;
-    d_A_csr_col_index = dataBase_.d_A_csr_col_index;
-
-    h_A_csr = new double[(num_cells + num_faces) * (num_species - 1)];
-    h_b = new double[num_cells * (num_species - 1)];
-    cudaMallocHost(&h_psi, num_cells * num_species * sizeof(double));
-
-    checkCudaErrors(cudaMalloc((void **)&d_A_csr, (num_cells + num_faces) * (num_species - 1) * sizeof(double)));
-    checkCudaErrors(cudaMalloc((void **)&d_b, cell_bytes * (num_species - 1)));
-    checkCudaErrors(cudaMalloc((void **)&d_psi, cell_bytes * (num_species - 1)));
-    checkCudaErrors(cudaMalloc((void **)&d_phiUc, num_faces * sizeof(double)));
-    checkCudaErrors(cudaMalloc((void **)&d_phiUc_boundary, num_boundary_faces * sizeof(double)));
-    checkCudaErrors(cudaMalloc((void **)&d_mut_Sct, cell_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_boundary_mut_sct, boundary_face_bytes));
-
-    checkCudaErrors(cudaMalloc((void **)&d_boundary_Y, boundary_face_bytes * num_species));
-
-    checkCudaErrors(cudaMalloc((void **)&d_hai, cell_bytes * num_species));
-    checkCudaErrors(cudaMalloc((void **)&d_boundary_hai, boundary_face_bytes * num_species));
-    // checkCudaErrors(cudaMalloc((void **)&d_rhoD, cell_bytes * num_species));
-    // checkCudaErrors(cudaMalloc((void **)&d_boundary_rhoD, boundary_face_bytes * num_species));
-
-    checkCudaErrors(cudaMalloc((void **)&d_sum_rhoD_grady, 3 * cell_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_sum_boundary_rhoD_grady, 3 * boundary_face_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_sum_hai_rhoD_grady, 3 * cell_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_sum_boundary_hai_rhoD_grady, 3 * boundary_face_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_sum_hai_y, cell_bytes));
-    checkCudaErrors(cudaMalloc((void **)&d_sum_boundary_hai_y, boundary_face_bytes));
-
-    checkCudaErrors(cudaMalloc((void **)&d_grady, 3 * cell_bytes * num_species));
-    checkCudaErrors(cudaMalloc((void **)&d_boundary_grady, 3 * boundary_face_bytes * num_species));
-
-    // checkCudaErrors(cudaMalloc((void **)&d_alpha, cell_bytes));
-
-    // zeroGradient
-    checkCudaErrors(cudaMemsetAsync(dataBase_.d_internal_coeffs_Y, 1, boundary_face_bytes * num_species, stream));
-    checkCudaErrors(cudaMemsetAsync(dataBase_.d_boundary_coeffs_Y, 0, boundary_face_bytes * num_species, stream));
-    checkCudaErrors(cudaMemsetAsync(dataBase_.d_laplac_internal_coeffs_Y, 0, boundary_face_bytes * num_species, stream));
-    checkCudaErrors(cudaMemsetAsync(dataBase_.d_laplac_boundary_coeffs_Y, 0, boundary_face_bytes * num_species, stream));
-
+        solver = new AmgXSolver(mode_string, setting_path, dataBase_.localRank); // NOTE(review): raw new — nothing in this part of the file deletes these; confirm who owns them
+}
+
+void dfYEqn::setConstantFields(const std::vector<int> patch_type, const std::vector<double> lewis_number) { // keep host copies of both vectors and upload the Lewis numbers to the device
+    this->patch_type = patch_type;
+    this->lewis_number = lewis_number;
+    // NOTE(review): num_species doubles are read from lewis_number below — assumes lewis_number.size() >= num_species; confirm at the caller
+    checkCudaErrors(cudaMalloc((void**)&d_lewis_number, dataBase_.num_species * sizeof(double))); // NOTE(review): a repeated call overwrites d_lewis_number without freeing the previous allocation
+    checkCudaErrors(cudaMemcpy(d_lewis_number, lewis_number.data(), dataBase_.num_species * sizeof(double), cudaMemcpyHostToDevice)); // plain cudaMemcpy, not the Async variant used for the field uploads
+}
+
+void dfYEqn::createNonConstantFieldsInternal() { // allocate the cell-sized device and host buffers and register the host ones in fieldPointerMap
+#ifndef STREAM_ALLOCATOR
+    // thermophysical fields (when STREAM_ALLOCATOR is defined, process() allocates this group with cudaMallocAsync instead)
+    checkCudaErrors(cudaMalloc((void**)&d_hai, dataBase_.cell_value_bytes * dataBase_.num_species)); // one cell-sized slice per species
+    checkCudaErrors(cudaMalloc((void**)&d_mut_sct, dataBase_.cell_value_bytes));
+    // intermediate fields used while assembling the equation
+    checkCudaErrors(cudaMalloc((void**)&d_grad_y, dataBase_.cell_value_vec_bytes * dataBase_.num_species)); // one vector-sized slice per species
+    checkCudaErrors(cudaMalloc((void**)&d_sumY_diff_error, dataBase_.cell_value_vec_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_phiUc, dataBase_.surface_value_bytes)); // sized by surfaces, not cells
+    checkCudaErrors(cudaMalloc((void**)&d_DEff, dataBase_.cell_value_bytes * dataBase_.num_species));
+    checkCudaErrors(cudaMalloc((void**)&d_permute, dataBase_.cell_value_vec_bytes));
+#endif
+    checkCudaErrors(cudaMalloc((void**)&d_RR, dataBase_.cell_value_bytes * dataBase_.num_species)); // allocated whether or not STREAM_ALLOCATOR is defined
+    // host buffers: filled on the CPU, then copied host-to-device for use on the GPU
+    checkCudaErrors(cudaMallocHost((void**)&h_rhoD, dataBase_.cell_value_bytes * dataBase_.num_species));
     // UnityLewis
-    checkCudaErrors(cudaMemsetAsync(d_hai, 0, cell_bytes * num_species, stream));
-    checkCudaErrors(cudaMemsetAsync(d_boundary_hai, 0,  boundary_face_bytes * num_species, stream));
-    checkCudaErrors(cudaMemsetAsync(d_mut_Sct, 0, cell_bytes, stream));
-    checkCudaErrors(cudaMemsetAsync(d_boundary_mut_sct, 0,  boundary_face_bytes, stream));
-}
-
-void dfYEqn::initializeTimeStep()
-{
-    // consider inert species
-    // initialize matrix value
-    checkCudaErrors(cudaMemsetAsync(d_A_csr, 0, (num_cells + num_faces) * (num_species - 1) * sizeof(double), stream));
-    checkCudaErrors(cudaMemsetAsync(d_b, 0, cell_bytes * (num_species - 1), stream));
-    // initialize variables in each time step
-    checkCudaErrors(cudaMemsetAsync(d_psi, 0, cell_bytes * (num_species - 1), stream));
-
-    // // initialize boundary coeffs
-    // size_t threads_per_block = 1024;
-    // size_t blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block;
-    // yeqn_update_BoundaryCoeffs_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(
-    //         num_boundary_faces, num_species,
-    //         dataBase_.d_boundary_phi,
-    //         dataBase_.d_internal_coeffs_Y,
-    //         dataBase_.d_boundary_coeffs_Y,
-    //         dataBase_.d_laplac_internal_coeffs_Y,
-    //         dataBase_.d_laplac_boundary_coeffs_Y,
-    //         dataBase_.d_boundary_YpatchType,
-    //         d_boundary_Y, 
-    //         dataBase_.d_boundary_deltaCoeffs);
-}
-
-void dfYEqn::upwindWeight()
-{
-    size_t threads_per_block = 1024;
-    size_t blocks_per_grid = (num_faces + threads_per_block - 1) / threads_per_block;
-    getUpwindWeight<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_faces, dataBase_.d_phi, dataBase_.d_weight_upwind);
-}
-
-void dfYEqn::fvm_laplacian_and_sumYDiffError_diffAlphaD_hDiffCorrFlux(std::vector<double *> Y_old, std::vector<double *> boundary_Y,
-        std::vector<const double *> hai, std::vector<double *> boundary_hai,
-        std::vector<const double *> rhoD, std::vector<double *> boundary_rhoD,
-        const double *mut_Sct, const double *boundary_mut_Sct, const double *alpha)
-{
-    clock_t start = std::clock();
-    // initialize variables in each time step
-    // checkCudaErrors(cudaMemcpyAsync(d_boundary_mut_sct, boundary_mut_Sct, boundary_face_bytes, cudaMemcpyHostToDevice, stream));
-    // checkCudaErrors(cudaMemcpyAsync(d_mut_Sct, mut_Sct, cell_bytes, cudaMemcpyHostToDevice, stream));
-    // checkCudaErrors(cudaMemcpyAsync(d_alpha, alpha, cell_bytes, cudaMemcpyHostToDevice, stream));
-
-    checkCudaErrors(cudaMemsetAsync(d_sum_rhoD_grady, 0, 3 * cell_bytes, stream));
-    checkCudaErrors(cudaMemsetAsync(d_sum_boundary_rhoD_grady, 0, 3 * boundary_face_bytes, stream));
-    checkCudaErrors(cudaMemsetAsync(d_sum_hai_rhoD_grady, 0, 3 * cell_bytes, stream));
-    checkCudaErrors(cudaMemsetAsync(d_sum_boundary_hai_rhoD_grady, 0, 3 * boundary_face_bytes, stream));
-    checkCudaErrors(cudaMemsetAsync(d_sum_hai_y, 0, cell_bytes, stream));
-    checkCudaErrors(cudaMemsetAsync(d_sum_boundary_hai_y, 0, boundary_face_bytes, stream));
-    checkCudaErrors(cudaMemsetAsync(dataBase_.d_hDiffCorrFlux, 0, 3 * cell_bytes, stream));
-    checkCudaErrors(cudaMemsetAsync(dataBase_.d_boundary_hDiffCorrFlux, 0, 3 * boundary_face_bytes, stream));
-    checkCudaErrors(cudaMemsetAsync(dataBase_.d_diffAlphaD, 0, cell_bytes, stream));
-
-    size_t threads_per_block, blocks_per_grid;
-    for (size_t i = 0; i < num_species; ++i)
-    {
-        if (uploadData)
-        {
-            checkCudaErrors(cudaMemcpyAsync(dataBase_.d_Y + i * num_cells, Y_old[i], cell_bytes, cudaMemcpyHostToDevice, stream));
-            checkCudaErrors(cudaMemcpyAsync(d_boundary_Y + i * num_boundary_faces, boundary_Y[i], boundary_face_bytes,
-                        cudaMemcpyHostToDevice, stream));
+    checkCudaErrors(cudaMallocHost((void**)&h_hai, dataBase_.cell_value_bytes * dataBase_.num_species));
+    checkCudaErrors(cudaMallocHost((void**)&h_mut_sct, dataBase_.cell_value_bytes));
+    // register the host buffers under the "h_<alias>" keys that getFieldPointer() builds for cpu/internal requests
+    // UnityLewis
+    fieldPointerMap["h_rhoD"] = h_rhoD;
+    fieldPointerMap["h_hai"] = h_hai;
+    fieldPointerMap["h_mut_sct"] = h_mut_sct;
+}
+
+void dfYEqn::createNonConstantFieldsBoundary() { // allocate the boundary-surface-sized device and host buffers and register the host ones in fieldPointerMap
+#ifndef STREAM_ALLOCATOR
+    // thermophysical fields (when STREAM_ALLOCATOR is defined, process() allocates this group with cudaMallocAsync instead)
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_hai, dataBase_.boundary_surface_value_bytes * dataBase_.num_species)); // one boundary-sized slice per species
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_mut_sct, dataBase_.boundary_surface_value_bytes));
+    // intermediate fields used while assembling the equation
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_grad_y, dataBase_.boundary_surface_value_vec_bytes * dataBase_.num_species)); // one vector-sized slice per species
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_sumY_diff_error, dataBase_.boundary_surface_value_vec_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_phiUc, dataBase_.boundary_surface_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_DEff, dataBase_.boundary_surface_value_bytes * dataBase_.num_species));
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_permute, dataBase_.boundary_surface_value_vec_bytes));
+    // boundary coefficient fields, one boundary-sized slice per species each
+    checkCudaErrors(cudaMalloc((void**)&d_value_internal_coeffs, dataBase_.boundary_surface_value_bytes * dataBase_.num_species));
+    checkCudaErrors(cudaMalloc((void**)&d_value_boundary_coeffs, dataBase_.boundary_surface_value_bytes * dataBase_.num_species));
+    checkCudaErrors(cudaMalloc((void**)&d_gradient_internal_coeffs, dataBase_.boundary_surface_value_bytes * dataBase_.num_species));
+    checkCudaErrors(cudaMalloc((void**)&d_gradient_boundary_coeffs, dataBase_.boundary_surface_value_bytes * dataBase_.num_species));
+#endif
+    // host buffers: filled on the CPU, then copied host-to-device for use on the GPU (allocated regardless of STREAM_ALLOCATOR)
+    checkCudaErrors(cudaMallocHost((void**)&h_boundary_rhoD, dataBase_.boundary_surface_value_bytes * dataBase_.num_species));
+    // UnityLewis
+    checkCudaErrors(cudaMallocHost((void**)&h_boundary_hai, dataBase_.boundary_surface_value_bytes * dataBase_.num_species));
+    checkCudaErrors(cudaMallocHost((void**)&h_boundary_mut_sct, dataBase_.boundary_surface_value_bytes));
+    // register the host buffers under the "h_boundary_<alias>" keys that getFieldPointer() builds for cpu/boundary requests
+    fieldPointerMap["h_boundary_rhoD"] = h_boundary_rhoD;
+    // UnityLewis
+    fieldPointerMap["h_boundary_hai"] = h_boundary_hai;
+    fieldPointerMap["h_boundary_mut_sct"] = h_boundary_mut_sct;
+}
+
+void dfYEqn::createNonConstantLduAndCsrFields() { // allocate the LDU buffer, derive its four sub-array pointers and, without STREAM_ALLOCATOR, the source/coeff/CSR buffers
+    checkCudaErrors(cudaMalloc((void**)&d_ldu, dataBase_.csr_value_bytes)); // a single allocation; the four pointers below are offsets into it
+    d_lower = d_ldu; // starts at offset 0
+    d_diag = d_ldu + dataBase_.num_surfaces; // placed after num_surfaces lower entries
+    d_upper = d_ldu + dataBase_.num_cells + dataBase_.num_surfaces; // placed after num_cells diagonal entries
+    d_extern = d_ldu + dataBase_.num_cells + 2 * dataBase_.num_surfaces; // placed after num_surfaces upper entries
+#ifndef STREAM_ALLOCATOR
+    checkCudaErrors(cudaMalloc((void**)&d_source, dataBase_.cell_value_bytes)); // sized for one field (no num_species factor)
+    checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, dataBase_.boundary_surface_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, dataBase_.boundary_surface_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_A, dataBase_.csr_value_bytes)); // same byte size as d_ldu
+    // d_source doubles as the right-hand side d_b, so no separate d_b is allocated
+    //checkCudaErrors(cudaMalloc((void**)&d_b, dataBase_.cell_value_bytes));
+#endif
+}
+
+void dfYEqn::initNonConstantFieldsInternal(const double *y) { // upload the host array y (num_species cell-sized slices) into dataBase_.d_y
+    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_y, y, dataBase_.cell_value_bytes * dataBase_.num_species, cudaMemcpyHostToDevice, dataBase_.stream)); // enqueued on dataBase_.stream with no sync here — presumably y must stay valid until the stream is synchronised; verify at the caller
+}
+
+void dfYEqn::initNonConstantFieldsBoundary(const double *boundary_y) { // upload the host array boundary_y (num_species boundary-sized slices) into dataBase_.d_boundary_y
+    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_y, boundary_y, dataBase_.boundary_surface_value_bytes* dataBase_.num_species, cudaMemcpyHostToDevice, dataBase_.stream)); // enqueued on dataBase_.stream with no sync here
+    // disabled below: process() calls update_boundary_coeffs_scalar once per species, so it is not done at init time
+    //for (int s = 0; s < dataBase_.num_species; s++) {
+    //    update_boundary_coeffs_scalar(dataBase_.stream,
+    //            dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(),
+    //            dataBase_.d_boundary_delta_coeffs, dataBase_.d_boundary_y + dataBase_.num_boundary_surfaces * s,
+    //            d_value_internal_coeffs + dataBase_.num_boundary_surfaces * s,
+    //            d_value_boundary_coeffs + dataBase_.num_boundary_surfaces * s,
+    //            d_gradient_internal_coeffs + dataBase_.num_boundary_surfaces * s,
+    //            d_gradient_boundary_coeffs + dataBase_.num_boundary_surfaces * s);
+    //}
+}
+
+void dfYEqn::cleanCudaResources() { // destroys the captured CUDA graph if one was created; this function frees no device or host buffers
+#ifdef USE_GRAPH
+    if (graph_created) { // NOTE(review): the flag is not cleared afterwards, so a second call would destroy the same handles again
+        checkCudaErrors(cudaGraphExecDestroy(graph_instance)); // the instantiated executable first
+        checkCudaErrors(cudaGraphDestroy(graph)); // then the graph it was instantiated from
+    }
+#endif
+}
+
+void dfYEqn::preProcess(const double *h_rhoD, const double *h_boundary_rhoD, // currently a no-op: none of the six host pointers is read
+        const double *h_hai, const double *h_boundary_hai,
+        const double *h_mut_sct, const double *h_boundary_mut_sct) {
+    // disabled debug trace and stream synchronisation, kept for reference:
+    //checkCudaErrors(cudaStreamSynchronize(dataBase_.stream));
+    //DEBUG_TRACE;
+}
+
+void dfYEqn::process() {
+     TICK_INIT_EVENT;
+
+    // calculate reaction rates
+    TICK_START_EVENT;
+    checkCudaErrors(cudaMemset(d_RR, 0, dataBase_.cell_value_bytes * dataBase_.num_species));
+    yeqn_compute_RR(chemistrySolver_, dataBase_.stream, dataBase_.h_T, dataBase_.d_T, dataBase_.d_p, dataBase_.d_y, dataBase_.d_rho_old, d_RR);
+    TICK_END_EVENT(YEqn compute RR);
+
+    TICK_START_EVENT;
+#ifdef USE_GRAPH
+    if(!graph_created) {
+        DEBUG_TRACE;
+        checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal));
+#endif
+
+#ifdef STREAM_ALLOCATOR
+        // thermophysical fields
+        checkCudaErrors(cudaMallocAsync((void**)&d_hai, dataBase_.cell_value_bytes * dataBase_.num_species, dataBase_.stream));
+        checkCudaErrors(cudaMallocAsync((void**)&d_mut_sct, dataBase_.cell_value_bytes, dataBase_.stream));
+        // intermediate fields
+        checkCudaErrors(cudaMallocAsync((void**)&d_grad_y, dataBase_.cell_value_vec_bytes * dataBase_.num_species, dataBase_.stream));
+        checkCudaErrors(cudaMallocAsync((void**)&d_sumY_diff_error, dataBase_.cell_value_vec_bytes, dataBase_.stream));
+        checkCudaErrors(cudaMallocAsync((void**)&d_phiUc, dataBase_.surface_value_bytes, dataBase_.stream));
+        checkCudaErrors(cudaMallocAsync((void**)&d_DEff, dataBase_.cell_value_bytes * dataBase_.num_species, dataBase_.stream));
+        checkCudaErrors(cudaMallocAsync((void**)&d_permute, dataBase_.cell_value_vec_bytes, dataBase_.stream));
+        // thermophysical fields
+        checkCudaErrors(cudaMallocAsync((void**)&d_boundary_hai, dataBase_.boundary_surface_value_bytes * dataBase_.num_species, dataBase_.stream));
+        checkCudaErrors(cudaMallocAsync((void**)&d_boundary_mut_sct, dataBase_.boundary_surface_value_bytes, dataBase_.stream));
+        // intermediate fields
+        checkCudaErrors(cudaMallocAsync((void**)&d_boundary_grad_y, dataBase_.boundary_surface_value_vec_bytes * dataBase_.num_species, dataBase_.stream));
+        checkCudaErrors(cudaMallocAsync((void**)&d_boundary_sumY_diff_error, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream));
+        checkCudaErrors(cudaMallocAsync((void**)&d_boundary_phiUc, dataBase_.boundary_surface_value_bytes, dataBase_.stream));
+        checkCudaErrors(cudaMallocAsync((void**)&d_boundary_DEff, dataBase_.boundary_surface_value_bytes * dataBase_.num_species, dataBase_.stream));
+        checkCudaErrors(cudaMallocAsync((void**)&d_boundary_permute, dataBase_.boundary_surface_value_vec_bytes, dataBase_.stream));
+        // boundary coeff fields
+        checkCudaErrors(cudaMallocAsync((void**)&d_value_internal_coeffs, dataBase_.boundary_surface_value_bytes * dataBase_.num_species, dataBase_.stream));
+        checkCudaErrors(cudaMallocAsync((void**)&d_value_boundary_coeffs, dataBase_.boundary_surface_value_bytes * dataBase_.num_species, dataBase_.stream));
+        checkCudaErrors(cudaMallocAsync((void**)&d_gradient_internal_coeffs, dataBase_.boundary_surface_value_bytes * dataBase_.num_species, dataBase_.stream));
+        checkCudaErrors(cudaMallocAsync((void**)&d_gradient_boundary_coeffs, dataBase_.boundary_surface_value_bytes * dataBase_.num_species, dataBase_.stream));
+        checkCudaErrors(cudaMallocAsync((void**)&d_source, dataBase_.cell_value_bytes, dataBase_.stream));
+        checkCudaErrors(cudaMallocAsync((void**)&d_internal_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream));
+        checkCudaErrors(cudaMallocAsync((void**)&d_boundary_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream));
+        checkCudaErrors(cudaMallocAsync((void**)&d_A, dataBase_.csr_value_bytes, dataBase_.stream));
+#endif
+
+        // UnityLewis
+        checkCudaErrors(cudaMemsetAsync(d_hai, 0, dataBase_.cell_value_bytes * dataBase_.num_species, dataBase_.stream));
+        checkCudaErrors(cudaMemsetAsync(d_boundary_hai, 0, dataBase_.boundary_surface_value_bytes * dataBase_.num_species, dataBase_.stream));
+        // laminar
+        checkCudaErrors(cudaMemsetAsync(d_mut_sct, 0, dataBase_.cell_value_bytes, dataBase_.stream));
+        checkCudaErrors(cudaMemsetAsync(d_boundary_mut_sct, 0, dataBase_.boundary_surface_value_bytes, dataBase_.stream));
+
+        checkCudaErrors(cudaMemsetAsync(dataBase_.d_diff_alphaD, 0, dataBase_.cell_value_bytes, dataBase_.stream));
+        checkCudaErrors(cudaMemsetAsync(dataBase_.d_boundary_diff_alphaD, 0, dataBase_.boundary_surface_value_bytes, dataBase_.stream));
+        checkCudaErrors(cudaMemsetAsync(d_grad_y, 0, dataBase_.cell_value_vec_bytes * dataBase_.num_species, dataBase_.stream));
+        checkCudaErrors(cudaMemsetAsync(d_boundary_grad_y, 0, dataBase_.boundary_surface_value_vec_bytes * dataBase_.num_species, dataBase_.stream));
+        // compute diffAlphaD
+        yeqn_fvc_laplacian_scalar(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(),
+                dataBase_.num_species, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces,
+                dataBase_.d_owner, dataBase_.d_neighbor,
+                dataBase_.d_weight, dataBase_.d_mag_sf, dataBase_.d_delta_coeffs, dataBase_.d_volume,
+                dataBase_.d_thermo_alpha, d_hai, dataBase_.d_y, dataBase_.d_diff_alphaD, // end for internal
+                dataBase_.num_patches, dataBase_.patch_size.data(), dataBase_.patch_type_calculated.data(), dataBase_.d_boundary_face_cell,
+                dataBase_.d_boundary_weight, dataBase_.d_boundary_mag_sf, dataBase_.d_boundary_delta_coeffs,
+                dataBase_.d_boundary_thermo_alpha, d_boundary_hai, dataBase_.d_boundary_y, dataBase_.cyclicNeighbor.data(),
+                dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_diff_alphaD);
+        // fvc::grad(Yi)
+        for (int s = 0; s < dataBase_.num_species; s++) {
+            fvc_grad_cell_scalar_withBC(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(),
+                    dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces,
+                    dataBase_.d_owner, dataBase_.d_neighbor,
+                    dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_y + dataBase_.num_cells * s, d_grad_y + dataBase_.num_cells * s * 3,
+                    dataBase_.num_patches, dataBase_.patch_size.data(), dataBase_.patch_type_calculated.data(), dataBase_.d_boundary_weight,
+                    dataBase_.d_boundary_face_cell, dataBase_.d_boundary_y + dataBase_.num_boundary_surfaces * s, dataBase_.d_boundary_sf,
+                    dataBase_.d_volume, dataBase_.d_boundary_mag_sf, d_boundary_grad_y + dataBase_.num_boundary_surfaces * s * 3,
+                    dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_delta_coeffs);
+            // update boundary coeffs
+            update_boundary_coeffs_scalar(dataBase_.stream,
+                    dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(),
+                    dataBase_.d_boundary_delta_coeffs, dataBase_.d_boundary_y + dataBase_.num_boundary_surfaces * s,
+                    dataBase_.d_boundary_weight, 
+                    d_value_internal_coeffs + dataBase_.num_boundary_surfaces * s,
+                    d_value_boundary_coeffs + dataBase_.num_boundary_surfaces * s,
+                    d_gradient_internal_coeffs + dataBase_.num_boundary_surfaces * s,
+                    d_gradient_boundary_coeffs + dataBase_.num_boundary_surfaces * s);
         }
-        // checkCudaErrors(cudaMemcpyAsync(d_hai + i * num_cells, hai[i], cell_bytes, cudaMemcpyHostToDevice, stream));
-        // checkCudaErrors(cudaMemcpyAsync(d_boundary_hai + i * num_boundary_faces, boundary_hai[i], boundary_face_bytes,
-        //             cudaMemcpyHostToDevice, stream));
-        // TODO: check why rhoD has to upload even in the UnityLewis case
-        checkCudaErrors(cudaMemcpyAsync(d_rhoD + i * num_cells, rhoD[i], cell_bytes, cudaMemcpyHostToDevice, stream));
-        checkCudaErrors(cudaMemcpyAsync(d_boundary_rhoD + i * num_boundary_faces, boundary_rhoD[i], boundary_face_bytes,
-                    cudaMemcpyHostToDevice, stream));
+        // compute sumYDiffError and hDiffCorrFlux
+        yeqn_compute_sumYDiffError_and_hDiffCorrFlux(dataBase_.stream,
+                dataBase_.num_species, dataBase_.num_cells, dataBase_.num_boundary_surfaces,
+                dataBase_.d_thermo_rhoD, d_hai, dataBase_.d_y, d_grad_y, 
+                d_sumY_diff_error, dataBase_.d_hDiff_corr_flux,
+                d_boundary_hai, dataBase_.d_boundary_y, d_boundary_grad_y, dataBase_.d_boundary_thermo_rhoD,
+                d_boundary_sumY_diff_error, dataBase_.d_boundary_hDiff_corr_flux);
+        // compute phiUc
+        yeqn_compute_phiUc(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces,
+                dataBase_.d_owner, dataBase_.d_neighbor,
+                dataBase_.d_weight, dataBase_.d_sf, d_sumY_diff_error, d_phiUc,
+                dataBase_.d_boundary_sf, d_boundary_sumY_diff_error, d_boundary_phiUc);
+        // compute upwind weight of phi and phiUc: only need internal upwind-weight
+        compute_upwind_weight(dataBase_.stream, dataBase_.num_surfaces, dataBase_.d_phi, dataBase_.d_phi_weight);
+        // compute DEff
+        // UnityLewis
+        // tmp<volScalarField> DEff = chemistry->rhoD(i) + turbulence->mut()/Sct;
+        // turbulence->mut()/Sct = 0 when laminar.
+        // double *d_DEff = d_rhoD;
+        // double *d_boundary_DEff = d_boundary_rhoD;
+        // TODO: calculate d_DEff in dfThermo
+        // yeqn_compute_DEff_via_lewisNumber(dataBase_.stream, dataBase_.num_species, dataBase_.num_cells, dataBase_.num_boundary_surfaces,
+        //         d_lewis_number, dataBase_.d_thermo_alpha, d_mut_sct, d_DEff, dataBase_.d_boundary_thermo_alpha, d_boundary_mut_sct, d_boundary_DEff);
+#ifdef USE_GRAPH
+        checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph));
+        checkCudaErrors(cudaGraphInstantiate(&graph_instance, graph, NULL, NULL, 0));
+        graph_created = true;
     }
-    // initialize boundary coeffs (must after the update of d_boundary_Y)
-    threads_per_block = 1024;
-    blocks_per_grid = (dataBase_.num_boundary_faces + threads_per_block - 1) / threads_per_block;
-    yeqn_update_BoundaryCoeffs_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(
-            num_boundary_faces, num_species,
-            dataBase_.d_boundary_phi,
-            dataBase_.d_internal_coeffs_Y,
-            dataBase_.d_boundary_coeffs_Y,
-            dataBase_.d_laplac_internal_coeffs_Y,
-            dataBase_.d_laplac_boundary_coeffs_Y,
-            dataBase_.d_boundary_YpatchType,
-            d_boundary_Y, 
-            dataBase_.d_boundary_deltaCoeffs,
-            dataBase_.d_bouPermedIndex,
-            uploadData);   
-
-    threads_per_block = 1024;
-    blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    yeqn_calculate_rhoD_alpha_via_nuEff_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(
-            num_cells, num_species,
-            d_rhoD, dataBase_.d_nuEff, dataBase_.d_rho_new, d_alpha);
-    // // check rhoD
-    // checkCudaErrors(cudaStreamSynchronize(stream));
-    // double *h_rhoD = new double[num_cells];
-    // cudaMemcpy(h_rhoD, d_rhoD + num_cells * 5, num_cells * sizeof(double), cudaMemcpyDeviceToHost);
-    // for (int i = 0; i < num_boundary_faces; i++)
-    // {
-    //     printf("Y_H_rhoD[%d] = %e\n", i, h_rhoD[i]);
-    // }
-
-    blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block;
-    yeqn_calculate_rhoD_alpha_via_nuEff_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(
-            num_boundary_faces, num_species, dataBase_.d_bouPermedIndex,
-            d_boundary_rhoD, dataBase_.d_boundary_nuEff, dataBase_.d_boundary_rho, dataBase_.d_boundary_alpha);
-    clock_t end = std::clock();
-    fprintf(stderr, "GPU memcpy time in YEqn = %lf s\n", double(end - start) / double(CLOCKS_PER_SEC));
-
-    // fvc::grad(Yi)
-    threads_per_block = 1024;
-    blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    fvc_grad_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(
-            num_cells, num_species,
-            d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
-            dataBase_.d_face_vector, dataBase_.d_weight, dataBase_.d_Y,
-            dataBase_.d_volume, d_grady);
-    blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
-    fvc_grad_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(
-            num_cells, num_boundary_cells, num_boundary_faces, num_species,
-            dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id, dataBase_.d_bouPermedIndex,
-            dataBase_.d_boundary_face_vector, d_boundary_Y,
-            dataBase_.d_volume, d_grady, d_grady, uploadData);
-    // check
-    // checkCudaErrors(cudaStreamSynchronize(stream));
-    // double *h_grady = new double[num_cells * 3];
-    // cudaMemcpy(h_grady, d_grady + num_cells * 3 * 5, (num_cells * 3) * sizeof(double), cudaMemcpyDeviceToHost);
-    // for (int i = 0; i < num_cells; i++)
-    // {
-    //     printf("d_grady[%d] = (%lf, %lf, %lf)\n", i, h_grady[i * 3], h_grady[i * 3 + 1], h_grady[i * 3 + 2]);
-    // }
-
-    blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
-    correct_boundary_conditions<<<blocks_per_grid, threads_per_block, 0, stream>>>(
-            num_cells, num_boundary_cells, num_boundary_faces, num_species,
-            dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
-            dataBase_.d_boundary_face_vector, dataBase_.d_boundary_face,
-            d_grady, d_boundary_grady, dataBase_.d_boundary_deltaCoeffs, dataBase_.d_Y, d_boundary_Y, dataBase_.d_boundary_YpatchType);
-
-    // sum(chemistry->hai(i)*chemistry->rhoD(i)*fvc::grad(Yi))
-    // sum(chemistry->rhoD(i)*fvc::grad(Yi)), also be called sumYDiffError
-    // sum(chemistry->hai(i)*Yi)
-    blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    sumError_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(
-            num_cells, num_species,
-            d_hai, d_rhoD, dataBase_.d_Y, d_grady,
-            d_sum_hai_rhoD_grady, d_sum_rhoD_grady, d_sum_hai_y);
-    blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block;
-    sumError_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(
-            num_boundary_faces, num_species,
-            dataBase_.d_bouPermedIndex,
-            d_boundary_hai, d_boundary_rhoD, d_boundary_Y, d_boundary_grady,
-            d_sum_boundary_hai_rhoD_grady, d_sum_boundary_rhoD_grady, d_sum_boundary_hai_y, uploadData);
-
-    // compute diffAlphaD
-    // TODO non-resonable, fvc_laplacian_internal will failed if threads_per_block = 1024
-    threads_per_block = 512;
-    blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    fvc_laplacian_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(
-            num_cells, num_species,
-            d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
-            d_alpha, d_hai, dataBase_.d_Y,
-            dataBase_.d_weight, dataBase_.d_face, dataBase_.d_deltaCoeffs,
-            dataBase_.d_volume, dataBase_.d_diffAlphaD);
-
-    // fvm::laplacian
-    // TODO non-resonable, fvm_laplacian_uncorrected_scalar_internal will failed if threads_per_block = 1024
-    threads_per_block = 512;
-    blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    fvm_laplacian_uncorrected_scalar_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(
-            num_cells, num_faces, num_species, inertIndex,
-            d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
-            d_mut_Sct, d_rhoD, dataBase_.d_weight, dataBase_.d_face, dataBase_.d_deltaCoeffs,
-            -1., d_A_csr, d_A_csr);
-    // TODO non-resonable, fvm_laplacian_uncorrected_scalar_boundary will failed if threads_per_block = 1024
-    threads_per_block = 512;
-    blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
-    fvm_laplacian_uncorrected_scalar_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(
-            num_cells, num_faces, num_boundary_cells, num_boundary_faces,
-            num_species, inertIndex,
-            d_A_csr_row_index, d_A_csr_diag_index,
-            dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
-            d_boundary_mut_sct, d_boundary_rhoD, dataBase_.d_boundary_face, dataBase_.d_bouPermedIndex,
-            dataBase_.d_laplac_internal_coeffs_Y, dataBase_.d_laplac_boundary_coeffs_Y,
-            -1., d_A_csr, d_b, d_A_csr, d_b);
-
-    uploadData = false;
-
-    threads_per_block = 1024;
-    blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    calculate_hDiffCorrFlux<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
-            d_sum_hai_rhoD_grady, d_sum_rhoD_grady, d_sum_hai_y, dataBase_.d_hDiffCorrFlux);
-    blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block;
-    calculate_hDiffCorrFlux<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_faces,
-            d_sum_boundary_hai_rhoD_grady, d_sum_boundary_rhoD_grady, d_sum_boundary_hai_y, dataBase_.d_boundary_hDiffCorrFlux);
-}
-
-void dfYEqn::fvm_ddt()
+    DEBUG_TRACE;
+    checkCudaErrors(cudaGraphLaunch(graph_instance, dataBase_.stream));
+#endif
+    // construct YiEqn and solve
+    // NOTE: ldu data and yi cannot be compared at the same time.
+    // To compare ldu data, define both DEBUG_ and DEBUG_CHECK_LDU in src_gpu.
+    // To compare yi, define only DEBUG_ in src_gpu.
+    // Also, when comparing ldu data, take care to keep specie_index identical in YEqn.H and dfYEqn.cu.
+// #define DEBUG_CHECK_LDU
+#if defined DEBUG_CHECK_LDU
+    int specie_index = 0;
+    for (int s = specie_index; s < specie_index + 1; s++) {
+#else
+    for (int s = 0; s < dataBase_.num_species; s++) {
+#endif
+        if (s != this->inertIndex) {
+            // reset ldu structures used cross YiEqn
+            checkCudaErrors(cudaMemsetAsync(d_ldu, 0, dataBase_.csr_value_bytes, dataBase_.stream)); // d_ldu contains d_lower, d_diag, and d_upper
+            checkCudaErrors(cudaMemsetAsync(d_source, 0, dataBase_.cell_value_bytes, dataBase_.stream));
+            checkCudaErrors(cudaMemsetAsync(d_internal_coeffs, 0, dataBase_.boundary_surface_value_bytes, dataBase_.stream));
+            checkCudaErrors(cudaMemsetAsync(d_boundary_coeffs, 0, dataBase_.boundary_surface_value_bytes, dataBase_.stream));
+            checkCudaErrors(cudaMemsetAsync(d_A, 0, dataBase_.csr_value_bytes, dataBase_.stream));
+            // use d_source as d_b
+            //checkCudaErrors(cudaMemsetAsync(d_b, 0, dataBase_.cell_value_bytes, dataBase_.stream));
+            // fvm::ddt(rho, Yi)
+            fvm_ddt_vol_scalar_vol_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.rdelta_t,
+                    dataBase_.d_rho, dataBase_.d_rho_old, dataBase_.d_y + dataBase_.num_cells * s, dataBase_.d_volume,
+                    d_diag, d_source, 1.);
+            // **calculate div weights with limitedLinear scheme**
+            // compute_limitedLinear_weight(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), dataBase_.num_surfaces, 
+            //         dataBase_.num_cells, dataBase_.num_boundary_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, dataBase_.d_mesh_dis,
+            //         dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_y + dataBase_.num_cells * s, dataBase_.d_phi, dataBase_.d_phi_weight,
+            //         dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(), dataBase_.d_boundary_weight, dataBase_.d_boundary_face_cell,
+            //         dataBase_.d_boundary_y + dataBase_.num_boundary_surfaces * s, dataBase_.d_boundary_sf, dataBase_.d_volume, 
+            //         dataBase_.d_boundary_mag_sf, dataBase_.d_boundary_phi, dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data(),
+            //         dataBase_.d_boundary_delta_coeffs);
+
+            // fvmDiv(phi, Yi)
+            fvm_div_scalar(dataBase_.stream, dataBase_.num_surfaces, dataBase_.d_owner, dataBase_.d_neighbor,
+                    dataBase_.d_phi, dataBase_.d_phi_weight,
+                    d_lower, d_upper, d_diag, // end for internal
+                    dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(),
+                    dataBase_.d_boundary_phi,
+                    d_value_internal_coeffs + dataBase_.num_boundary_surfaces * s,
+                    d_value_boundary_coeffs + dataBase_.num_boundary_surfaces * s,
+                    d_internal_coeffs, d_boundary_coeffs, 1.);
+            // fvmDiv(phiUc, Yi)
+            fvm_div_scalar(dataBase_.stream, dataBase_.num_surfaces, dataBase_.d_owner, dataBase_.d_neighbor,
+                    d_phiUc, dataBase_.d_phi_weight,
+                    d_lower, d_upper, d_diag, // end for internal
+                    dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(),
+                    d_boundary_phiUc,
+                    d_value_internal_coeffs + dataBase_.num_boundary_surfaces * s,
+                    d_value_boundary_coeffs + dataBase_.num_boundary_surfaces * s,
+                    d_internal_coeffs, d_boundary_coeffs, 1.);
+            // fvm::laplacian(DEff(), Yi)
+            fvm_laplacian_scalar(dataBase_.stream, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces,
+                    dataBase_.d_owner, dataBase_.d_neighbor,
+                    dataBase_.d_weight, dataBase_.d_mag_sf, dataBase_.d_delta_coeffs,
+                    dataBase_.d_thermo_rhoD + dataBase_.num_cells * s,
+                    d_lower, d_upper, d_diag, // end for internal
+                    dataBase_.num_patches, dataBase_.patch_size.data(), patch_type.data(),
+                    dataBase_.d_boundary_mag_sf, dataBase_.d_boundary_thermo_rhoD + dataBase_.num_boundary_surfaces * s,
+                    d_gradient_internal_coeffs + dataBase_.num_boundary_surfaces * s,
+                    d_gradient_boundary_coeffs + dataBase_.num_boundary_surfaces * s,
+                    d_internal_coeffs, d_boundary_coeffs, -1.);
+            fvc_to_source_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.d_volume, d_RR + dataBase_.num_cells * s, d_source);
+#ifndef DEBUG_CHECK_LDU
+            // ldu to csr
+            // use d_source as d_b
+            ldu_to_csr_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces,
+                    dataBase_.num_Nz, dataBase_.d_boundary_face_cell, dataBase_.d_ldu_to_csr_index, dataBase_.num_patches,
+                    dataBase_.patch_size.data(), patch_type.data(), d_ldu, d_source, d_internal_coeffs, d_boundary_coeffs, d_A);
+            // TODO: now that the solve goes through dataBase_, solverIndex is no longer needed.
+            //solverIndex ++;
+            solve(s);
+#endif
+        }
+        if (s == dataBase_.num_species - 1)
+            num_iteration++;
+    }
+    TICK_END_EVENT(YEqn assembly and solve for all species);
+
+    TICK_START_EVENT;
+    // compute y_inertIndex
+    yeqn_compute_y_inertIndex(dataBase_.stream, dataBase_.num_species, inertIndex, dataBase_.num_cells, dataBase_.d_y);
+    // correct boundary conditions
+    for (int s = 0; s < dataBase_.num_species; s++) {
+        correct_boundary_conditions_scalar(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(),
+                dataBase_.num_boundary_surfaces, dataBase_.num_patches, dataBase_.patch_size.data(),
+                patch_type.data(), dataBase_.d_boundary_delta_coeffs, dataBase_.d_boundary_face_cell,
+                dataBase_.d_y + dataBase_.num_cells * s, dataBase_.d_boundary_y + dataBase_.num_boundary_surfaces * s, 
+                dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_weight);
+    }
+    TICK_END_EVENT(YEqn post process for all species correctBC);
+
+    TICK_START_EVENT;
+    // copy y and boundary_y to host
+    TICK_END_EVENT(YEqn post process for all species copy back);
+
+    TICK_START_EVENT;
+#ifdef STREAM_ALLOCATOR
+    // thermophysical fields
+    //checkCudaErrors(cudaFreeAsync(d_rhoD, dataBase_.stream));
+    checkCudaErrors(cudaFreeAsync(d_hai, dataBase_.stream));
+    checkCudaErrors(cudaFreeAsync(d_mut_sct, dataBase_.stream));
+    // intermediate fields
+    checkCudaErrors(cudaFreeAsync(d_grad_y, dataBase_.stream));
+    checkCudaErrors(cudaFreeAsync(d_sumY_diff_error, dataBase_.stream));
+    checkCudaErrors(cudaFreeAsync(d_phiUc, dataBase_.stream));
+    checkCudaErrors(cudaFreeAsync(d_DEff, dataBase_.stream));
+    checkCudaErrors(cudaFreeAsync(d_permute, dataBase_.stream));
+
+    // thermophysical fields
+    //checkCudaErrors(cudaFreeAsync(d_boundary_rhoD, dataBase_.stream));
+    checkCudaErrors(cudaFreeAsync(d_boundary_hai, dataBase_.stream));
+    checkCudaErrors(cudaFreeAsync(d_boundary_mut_sct, dataBase_.stream));
+    // intermediate fields
+    checkCudaErrors(cudaFreeAsync(d_boundary_grad_y, dataBase_.stream));
+    checkCudaErrors(cudaFreeAsync(d_boundary_sumY_diff_error, dataBase_.stream));
+    checkCudaErrors(cudaFreeAsync(d_boundary_phiUc, dataBase_.stream));
+    checkCudaErrors(cudaFreeAsync(d_boundary_DEff, dataBase_.stream));
+    checkCudaErrors(cudaFreeAsync(d_boundary_permute, dataBase_.stream));
+
+    // boundary coeff fields
+    checkCudaErrors(cudaFreeAsync(d_value_internal_coeffs, dataBase_.stream));
+    checkCudaErrors(cudaFreeAsync(d_value_boundary_coeffs, dataBase_.stream));
+    checkCudaErrors(cudaFreeAsync(d_gradient_internal_coeffs, dataBase_.stream));
+    checkCudaErrors(cudaFreeAsync(d_gradient_boundary_coeffs, dataBase_.stream));
+
+    checkCudaErrors(cudaFreeAsync(d_source, dataBase_.stream));
+    checkCudaErrors(cudaFreeAsync(d_internal_coeffs, dataBase_.stream));
+    checkCudaErrors(cudaFreeAsync(d_boundary_coeffs, dataBase_.stream));
+    checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream));
+#endif
+    TICK_END_EVENT(YEqn post process for all species free);
+    sync();
+}
+
+void dfYEqn::solve(int speciesIndex) {    // Hands d_A, d_source and the d_y slice of species speciesIndex to dataBase_.solve, wrapped in TICK timing macros.
+    TICK_INIT_EVENT;
+    TICK_START_EVENT;
+    dataBase_.solve(num_iteration, AMGXSetting::u_setting, d_A, dataBase_.d_y + dataBase_.num_cells * speciesIndex, d_source); // d_y slice starts at num_cells * speciesIndex; presumably (matrix, unknowns, rhs) -- confirm against dataBase_.solve
+    TICK_END_EVENT(YEqn solve one specie);
+}
+
+void dfYEqn::postProcess(double *h_y, double *h_boundary_y) { // Copies y and boundary y of all species from device to host, then waits. NOTE(review): h_y and h_boundary_y are not used in this body; the copies land in dataBase_.h_y / dataBase_.h_boundary_y -- confirm that is intended.
+    checkCudaErrors(cudaMemcpyAsync(dataBase_.h_y, dataBase_.d_y, dataBase_.cell_value_bytes * dataBase_.num_species, cudaMemcpyDeviceToHost, dataBase_.stream)); // byte count is cell_value_bytes * num_species
+    checkCudaErrors(cudaMemcpyAsync(dataBase_.h_boundary_y, dataBase_.d_boundary_y, dataBase_.boundary_surface_value_bytes * dataBase_.num_species, cudaMemcpyDeviceToHost, dataBase_.stream)); // byte count is boundary_surface_value_bytes * num_species
+    sync(); // wait on dataBase_.stream so both asynchronous copies have finished before returning
+}
+
+void dfYEqn::sync() { // Blocks the calling host thread until the work queued on dataBase_.stream has completed.
+    checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); // checkCudaErrors wraps the returned CUDA status
+}
+
+void dfYEqn::yeqn_compute_thermo_alpha(cudaStream_t stream, // Queues the thermo_alpha kernels on `stream`: one launch for cells, one for boundary surfaces.
+        int num_cells, const double *rhoD, double *thermo_alpha,
+        int num_boundary_surfaces, const double *boundary_rhoD, double *boundary_thermo_alpha)
 {
-    // fvm::ddt(rho, Yi)
-    size_t threads_per_block, blocks_per_grid;
-    threads_per_block = 1024;
-    blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    fvm_ddt_kernel_scalar<<<blocks_per_grid, threads_per_block, 0, stream>>>(
-            num_cells, num_faces, num_species, inertIndex,
-            dataBase_.rdelta_t,
-            d_A_csr_row_index, d_A_csr_diag_index,
-            dataBase_.d_rho_old, dataBase_.d_rho_new, dataBase_.d_volume, dataBase_.d_Y,
-            d_A_csr, d_b, d_A_csr, d_b);
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; // ceiling division so the grid covers num_cells
+    yeqn_compute_thermo_alpha_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, rhoD, thermo_alpha); // cell launch: const input rhoD, output pointer thermo_alpha (kernel body not visible here)
+    blocks_per_grid = (num_boundary_surfaces + threads_per_block - 1) / threads_per_block; // re-size the grid for the boundary surfaces
+    yeqn_compute_thermo_alpha_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(
+            num_boundary_surfaces, boundary_rhoD, boundary_thermo_alpha);
 }
 
-void dfYEqn::fvm_div_phi()
+void dfYEqn::yeqn_compute_DEff_via_lewisNumber(cudaStream_t stream, int num_species, int num_cells, int num_boundary_surfaces, // Launches yeqn_compute_DEff_kernel twice on `stream`: once over cells, once over boundary surfaces.
+        double *lewis_number, const double *alpha, const double *mut_sct, double *DEff,
+        const double *boundary_alpha, const double *boundary_mut_sct, double *boundary_DEff)
 {
-    // mvConvection->fvmDiv(phi, Yi)
-    size_t threads_per_block, blocks_per_grid;
-    threads_per_block = 512;
-    blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    fvm_div_internal_scalar<<<blocks_per_grid, threads_per_block, 0, stream>>>(
-            num_cells, num_faces, num_species, inertIndex,
-            d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_weight_upwind, dataBase_.d_phi,
-            d_A_csr, d_A_csr);
-    blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
-    fvm_div_boundary_scalar<<<blocks_per_grid, threads_per_block, 0, stream>>>(
-            num_cells, num_faces, num_boundary_cells, num_boundary_faces, num_species, inertIndex,
-            d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_boundary_phi,
-            dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
-            dataBase_.d_internal_coeffs_Y, dataBase_.d_boundary_coeffs_Y,
-            d_A_csr, d_A_csr, d_b, d_b);
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; // ceiling division so the grid covers num_cells
+    yeqn_compute_DEff_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_species, num_cells, // cell launch: element count is num_cells
+            lewis_number, alpha, mut_sct, DEff);
+    blocks_per_grid = (num_boundary_surfaces + threads_per_block - 1) / threads_per_block; // re-size the grid for the boundary surfaces
+    yeqn_compute_DEff_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_species, num_boundary_surfaces, // same kernel and lewis_number, applied to the boundary_* arrays
+            lewis_number, boundary_alpha, boundary_mut_sct, boundary_DEff);
 }
 
-void dfYEqn::fvm_div_phiUc()
+void dfYEqn::yeqn_compute_RR(dfChemistrySolver& chemistrySolver, cudaStream_t stream, const double *h_T, const double *d_T, // Thin wrapper that forwards the field pointers to chemistrySolver.Inference.
+        const double *p, const double *y, const double *rho, double *RR)
 {
-    size_t threads_per_block, blocks_per_grid;
-
-    // compue phiUc
-    threads_per_block = 512;
-    blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    calculate_phiUc_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells,
-            d_A_csr_row_index, d_A_csr_col_index, d_A_csr_diag_index,
-            dataBase_.d_face_vector, dataBase_.d_weight, d_sum_rhoD_grady, d_phiUc);
-    blocks_per_grid = (num_boundary_faces + threads_per_block - 1) / threads_per_block;
-    calculate_phiUc_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_faces,
-            dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
-            dataBase_.d_boundary_face_vector, d_sum_boundary_rhoD_grady, d_phiUc_boundary);
-
-    // mvConvection->fvmDiv(phiUc, Yi)
-    blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    fvm_div_internal_scalar<<<blocks_per_grid, threads_per_block, 0, stream>>>(
-            num_cells, num_faces, num_species, inertIndex,
-            d_A_csr_row_index, d_A_csr_diag_index, dataBase_.d_weight_upwind, d_phiUc,
-            d_A_csr, d_A_csr);
-    blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
-    fvm_div_boundary_scalar<<<blocks_per_grid, threads_per_block, 0, stream>>>(
-            num_cells, num_faces, num_boundary_cells, num_boundary_faces, num_species, inertIndex,
-            d_A_csr_row_index, d_A_csr_diag_index, d_phiUc_boundary,
-            dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
-            dataBase_.d_internal_coeffs_Y, dataBase_.d_boundary_coeffs_Y,
-            d_A_csr, d_A_csr, d_b, d_b);
+    chemistrySolver.Inference(h_T, d_T, p, y, rho, RR); // NOTE(review): the `stream` parameter is not used in this body; which stream Inference runs on is not visible here -- confirm
 }
 
-void dfYEqn::checkValue(bool print, char *filename)
+void dfYEqn::yeqn_fvc_laplacian_scalar(cudaStream_t stream, ncclComm_t comm, const int *neighbor_peer, // Queues the internal-surface kernel, one boundary kernel per patch by patch_type, a divide-by-volume kernel, then builds boundary_output and corrects processor patches.
+        int num_species, int num_cells, int num_surfaces, int num_boundary_surfaces,
+        const int *lowerAddr, const int *upperAddr,
+        const double *weight, const double *mag_sf, const double *delta_coeffs, const double *volume,
+        const double *thermo_alpha, const double *hai, const double *vf, double *output, // end for internal
+        int num_patches, const int *patch_size, const int *patch_type, const int *boundary_cell_face,
+        const double *boundary_weight, const double *boundary_mag_sf, const double *boundary_delta_coeffs,
+        const double *boundary_thermo_alpha, const double *boundary_hai, const double *boundary_vf,
+        const int *cyclicNeighbor, const int *patchSizeOffset, double *boundary_output)
 {
-    checkCudaErrors(cudaMemcpyAsync(h_A_csr, d_A_csr + (num_cells + num_faces) * 5, (num_cells + num_faces) * sizeof(double), cudaMemcpyDeviceToHost, stream)); // H
-    checkCudaErrors(cudaMemcpyAsync(h_b, d_b + num_cells * 5, num_cells * sizeof(double), cudaMemcpyDeviceToHost, stream)); // H
-
-    // Synchronize stream
-    checkCudaErrors(cudaStreamSynchronize(stream));
-    if (print)
-    {
-        for (int i = 0; i < (num_faces + num_cells); i++)
-            fprintf(stderr, "h_A_csr[%d]: %.15lf\n", i, h_A_csr[i]);
-        for (int i = 0; i < num_cells; i++)
-            fprintf(stderr, "h_b[%d]: %.15lf\n", i, h_b[i]);
-    }
-
-    char *input_file = filename;
-    FILE *fp = fopen(input_file, "rb+");
-    if (fp == NULL)
-    {
-        fprintf(stderr, "Failed to open input file: %s!\n", input_file);
-    }
-
-    int readfile = 0;
-    double *of_b = new double[num_cells];
-    double *of_A = new double[num_faces + num_cells];
-    readfile = fread(of_b, num_cells * sizeof(double), 1, fp);
-    readfile = fread(of_A, (num_faces + num_cells) * sizeof(double), 1, fp);
-
-    std::vector<double> h_A_of_vec_1mtx(num_faces + num_cells, 0);
-    for (int i = 0; i < num_faces + num_cells; i++)
-    {
-        h_A_of_vec_1mtx[i] = of_A[dataBase_.tmpPermutatedList[i]];
-    }
-    if (print)
-    {
-        for (int i = 0; i < (num_faces + num_cells); i++)
-            printf("h_A_of_vec_1mtx[%d]: %.15lf\n", i, h_A_of_vec_1mtx[i]);
-        for (int i = 0; i < num_cells; i++)
-            printf("h_b_of_vec[%d]: %.15lf\n", i, of_b[i]);
-    }
-
-    fprintf(stderr, "check of h_A_csr\n");
-    checkVectorEqual(num_faces + num_cells, h_A_of_vec_1mtx.data(), h_A_csr, 1e-5);
-    fprintf(stderr, "check of h_b\n");
-    checkVectorEqual(num_cells, of_b, h_b, 1e-5);
-}
-
-void dfYEqn::solve()
-{
-    checkCudaErrors(cudaStreamSynchronize(stream));
-
-    int nNz = num_cells + num_faces; // matrix entries
-    if (num_iteration == 0)          // first interation
-    {
-        printf("Initializing AmgX Linear Solver\n");
-        int solverIndex = 0;
-        for (auto &solver : YSolverSet)
-        {
-            solver->setOperator(num_cells, nNz, d_A_csr_row_index, d_A_csr_col_index, d_A_csr + solverIndex * nNz);
-            ++solverIndex;
-        }
-    }
-    else
-    {
-        int solverIndex = 0;
-        for (auto &solver : YSolverSet)
-        {
-            solver->updateOperator(num_cells, nNz, d_A_csr + solverIndex * nNz);
-            ++solverIndex;
-        }
-    }
-    int mtxIndex = 0;
-    for (size_t i = 0; i < num_species; ++i)
-    {
-        if (i == inertIndex)
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; // ceiling division so the grid covers num_surfaces
+    yeqn_fvc_laplacian_scalar_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_species, num_cells, num_surfaces,
+            lowerAddr, upperAddr, mag_sf, delta_coeffs, weight, thermo_alpha, hai, vf, output);
+
+    int offset = 0; // running sum of the sizes of the patches already visited (processor patches count twice, see below)
+    for (int i = 0; i < num_patches; i++) {
+        if (patch_size[i] == 0) continue; // empty patch: nothing to launch and offset is unchanged
+        threads_per_block = 256;
+        blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block;
+        // TODO: only the basic patch types below are handled for now
+        if (patch_type[i] == boundaryConditions::zeroGradient) {
+            //fprintf(stderr, "patch_type is zeroGradient\n");
+            // snGrad of a zeroGradient patch is 0, so there is no boundary contribution to add.
+        } else if (patch_type[i] == boundaryConditions::fixedValue
+                || patch_type[i] == boundaryConditions::calculated) {
+            //fprintf(stderr, "patch_type is fixedValue\n");
+            yeqn_fvc_laplacian_scalar_boundary_fixedValue<<<blocks_per_grid, threads_per_block, 0, stream>>>(
+                    num_species, num_cells, num_boundary_surfaces, patch_size[i], offset, boundary_cell_face,
+                    boundary_mag_sf, boundary_delta_coeffs,
+                    boundary_thermo_alpha, boundary_hai, vf, boundary_vf, output);
+        } else if (patch_type[i] == boundaryConditions::cyclic) {
+            yeqn_fvc_laplacian_scalar_boundary_cyclic<<<blocks_per_grid, threads_per_block, 0, stream>>>(
+                    num_species, num_cells, num_boundary_surfaces, patch_size[i], offset, patchSizeOffset[cyclicNeighbor[i]], 
+                    boundary_cell_face, boundary_mag_sf, boundary_delta_coeffs,
+                    boundary_thermo_alpha, boundary_hai, vf, boundary_vf, output);
+        } else if (patch_type[i] == boundaryConditions::processor
+                    || patch_type[i] == boundaryConditions::processorCyclic) {
+            yeqn_fvc_laplacian_scalar_boundary_processor<<<blocks_per_grid, threads_per_block, 0, stream>>>(
+                    num_species, num_cells, num_boundary_surfaces, patch_size[i], offset, boundary_cell_face,
+                    boundary_mag_sf, boundary_delta_coeffs, boundary_weight,
+                    boundary_thermo_alpha, boundary_hai, vf, boundary_vf, output);
+            offset += 2 * patch_size[i]; // patchNeighbourFields and patchInternalFields
             continue;
-
-        YSolverSet[mtxIndex]->solve(num_cells, dataBase_.d_Y + i * num_cells, d_b + mtxIndex * num_cells);
-        ++mtxIndex;
+        } else {
+            fprintf(stderr, "%s %d, boundaryConditions other than zeroGradient are not support yet!\n", __FILE__, __LINE__); // NOTE(review): reached for any patch type not handled above, although the message names only zeroGradient; execution continues after the warning
+        }
+        offset += patch_size[i]; // non-processor patches advance offset by a single patch_size
     }
 
-    size_t threads_per_block, blocks_per_grid;
+    // divide cell volume
     threads_per_block = 1024;
     blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block;
-    compute_inertIndex_y<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_species, inertIndex, dataBase_.d_Y);
-    checkCudaErrors(cudaMemcpyAsync(h_psi, dataBase_.d_Y, num_species * cell_bytes, cudaMemcpyDeviceToHost, stream));
-
-    num_iteration++;
-    // checkCudaErrors(cudaStreamSynchronize(stream));
-    // for (size_t i = 0; i < num_cells; i++)
-    //     fprintf(stderr, "h_species_gpu[%d]: %.5e\n", i, h_psi[i + 0 * num_cells]);
-}
-
-void dfYEqn::sync()
-{
-    checkCudaErrors(cudaStreamSynchronize(stream));
+    yeqn_divide_cell_volume_scalar<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, volume, output);
+
+    // TODO: correct boundary condition
+    blocks_per_grid = (num_boundary_surfaces + threads_per_block - 1) / threads_per_block; // threads_per_block is still 1024 here
+    yeqn_buildBC_scalar<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_surfaces, // presumably fills boundary_output from output via boundary_cell_face -- confirm against the kernel
+            boundary_cell_face, output, boundary_output);
+    offset = 0; // restart the patch walk with the same offset convention as the first loop
+    for (int i = 0; i < num_patches; i++) {
+        if (patch_size[i] == 0) continue;
+        if (patch_type[i] == boundaryConditions::processor) {
+            correct_boundary_conditions_processor_scalar(stream, comm, neighbor_peer[i], patch_size[i], offset,
+                    output, boundary_cell_face, boundary_output);
+            offset += 2 * patch_size[i];
+        } else if (patch_type[i] == boundaryConditions::processorCyclic) { // same call and offset step as the processor branch above
+            correct_boundary_conditions_processor_scalar(stream, comm, neighbor_peer[i], patch_size[i], offset,
+                    output, boundary_cell_face, boundary_output);
+            offset += 2 * patch_size[i];
+        } else {
+            offset += patch_size[i]; // other patch types are only skipped over in this pass
+        }
+    }
 }
 
-void dfYEqn::updatePsi(double *Psi, int speciesIndex)
+void dfYEqn::yeqn_compute_sumYDiffError_and_hDiffCorrFlux(cudaStream_t stream, int num_species, int num_cells, int num_boundary_surfaces, // Launches yeqn_sumError_and_compute_hDiffCorrFlux twice on `stream`: over cells, then over boundary surfaces.
+        const double *rhoD, const double *hai, const double *y, const double *grad_y,
+        double *sumY_diff_error, double *hDiff_corr_flux,
+        const double *boundary_hai, const double *boundary_y, const double *boundary_grad_y, const double *boundary_rhoD,
+        double *boundary_sumY_diff_error, double *boundary_hDiff_corr_flux)
 {
-    checkCudaErrors(cudaStreamSynchronize(stream));
-    memcpy(Psi, h_psi + speciesIndex * num_cells, cell_bytes);
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; // ceiling division so the grid covers num_cells
+    yeqn_sumError_and_compute_hDiffCorrFlux<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_species, num_cells, // cell launch: outputs are sumY_diff_error and hDiff_corr_flux
+            rhoD, hai, y, grad_y, sumY_diff_error, hDiff_corr_flux);
+    blocks_per_grid = (num_boundary_surfaces + threads_per_block - 1) / threads_per_block; // re-size the grid for the boundary surfaces
+    yeqn_sumError_and_compute_hDiffCorrFlux<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_species, num_boundary_surfaces, // boundary launch: note boundary_rhoD is passed first here although it is declared last in the parameter list
+            boundary_rhoD, boundary_hai, boundary_y, boundary_grad_y, boundary_sumY_diff_error, boundary_hDiff_corr_flux);
 }
 
-void dfYEqn::correctBoundaryConditions()
+void dfYEqn::yeqn_compute_phiUc(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, // Queues the phiUc kernels on `stream`: one over internal surfaces, one over boundary surfaces.
+        const int *lowerAddr, const int *upperAddr,
+        const double *weight, const double *sf, const double *sumY_diff_error, double *phiUc,
+        const double *boundary_sf, const double *boundary_sumY_diff_error, double *boundary_phiUc)
 {
     size_t threads_per_block = 1024;
-    size_t blocks_per_grid = (num_boundary_cells + threads_per_block - 1) / threads_per_block;
-    yeqn_correct_BoundaryConditions_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_boundary_cells, num_boundary_faces, num_species,
-                                                                                              dataBase_.d_boundary_cell_offset, dataBase_.d_boundary_cell_id,
-                                                                                              dataBase_.d_Y, d_boundary_Y, dataBase_.d_boundary_YpatchType);
-    // double *h_boundary_Y = new double[num_boundary_faces];
-    // cudaMemcpy(h_boundary_Y, d_boundary_Y, num_boundary_faces * sizeof(double), cudaMemcpyDeviceToHost);
-    // for (int i = 0; i < num_boundary_faces; i++)
-    // {
-    //     printf("h_boundary_GPU[%d] = %e\n", i, h_boundary_Y[i]);
-    // }
+    size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block; // ceiling division so the grid covers num_surfaces
+    yeqn_compute_phiUc_internal<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_surfaces, // internal launch: output pointer is phiUc
+            lowerAddr, upperAddr, weight, sf, sumY_diff_error, phiUc);
+    blocks_per_grid = (num_boundary_surfaces + threads_per_block - 1) / threads_per_block; // re-size the grid for the boundary surfaces
+    yeqn_compute_phiUc_boundary<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_surfaces, // boundary launch: output pointer is boundary_phiUc
+            boundary_sf, boundary_sumY_diff_error, boundary_phiUc);
 }
 
-dfYEqn::~dfYEqn()
+void dfYEqn::yeqn_compute_y_inertIndex(cudaStream_t stream, int num_species, int inertIndex, int num_cells, double *y) // Queues yeqn_compute_y_inertIndex_kernel on `stream`; y is the only non-const buffer passed, so it is updated in place.
 {
-}
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_cells + threads_per_block - 1) / threads_per_block; // ceiling division so the grid covers num_cells
+    yeqn_compute_y_inertIndex_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_species, inertIndex, num_cells, y); // presumably derives the inert species from the other species -- confirm against the kernel
+}
+
+// #if defined DEBUG_
+void dfYEqn::comparediffAlphaD(const double *diffAlphaD, const double *boundary_diffAlphaD, bool printFlag) // Debug check: copies the device diff_alphaD fields to host and compares them with the caller's reference arrays.
+{
+    DEBUG_TRACE;
+    std::vector<double> h_diffAlphaD;
+    h_diffAlphaD.resize(dataBase_.num_cells);
+    checkCudaErrors(cudaMemcpy(h_diffAlphaD.data(), dataBase_.d_diff_alphaD, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost)); // assumes cell_value_bytes == num_cells * sizeof(double) -- TODO confirm
+    fprintf(stderr, "check h_diffAlphaD\n");
+    checkVectorEqual(dataBase_.num_cells, diffAlphaD, h_diffAlphaD.data(), 1e-10, printFlag); // 1e-10 is presumably the comparison tolerance -- confirm against checkVectorEqual
+    DEBUG_TRACE;
+    std::vector<double> h_boundary_diffAlphaD;
+    h_boundary_diffAlphaD.resize(dataBase_.num_boundary_surfaces);
+    checkCudaErrors(cudaMemcpy(h_boundary_diffAlphaD.data(), dataBase_.d_boundary_diff_alphaD, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost)); // assumes boundary_surface_value_bytes == num_boundary_surfaces * sizeof(double) -- TODO confirm
+    fprintf(stderr, "check h_boundary_diffAlphaD\n");
+    checkVectorEqual(dataBase_.num_boundary_surfaces, boundary_diffAlphaD, h_boundary_diffAlphaD.data(), 1e-10, printFlag);
+    DEBUG_TRACE;
+}
+
+void dfYEqn::comparegradyi(const double *grad_yi, const double *boundary_grad_yi, int specie_index, bool printFlag) // Debug check: compares the device gradient of species specie_index (cells and boundary) with the caller's reference arrays.
+{
+    DEBUG_TRACE;
+    permute_vector_d2h(dataBase_.stream, dataBase_.num_cells, d_grad_y + dataBase_.num_cells * specie_index * 3, d_permute); // species slice starts at num_cells * specie_index * 3; result goes to the d_permute scratch buffer (presumably a layout change for host comparison -- confirm)
+    permute_vector_d2h(dataBase_.stream, dataBase_.num_boundary_surfaces, d_boundary_grad_y + dataBase_.num_boundary_surfaces * specie_index * 3, d_boundary_permute);
+
+    std::vector<double> h_grad_yi;
+    h_grad_yi.resize(dataBase_.num_cells * 3); // three components per cell
+    checkCudaErrors(cudaMemcpy(h_grad_yi.data(), d_permute, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost)); // NOTE(review): the permute was queued on dataBase_.stream but this copy is not issued on that stream; if the stream is non-blocking an explicit sync may be needed -- confirm
+    fprintf(stderr, "check h_grad_yi\n");
+    checkVectorEqual(dataBase_.num_cells * 3, grad_yi, h_grad_yi.data(), 1e-10, printFlag); // 1e-10 is presumably the comparison tolerance -- confirm against checkVectorEqual
+    DEBUG_TRACE;
+    std::vector<double> h_boundary_grad_yi;
+    h_boundary_grad_yi.resize(dataBase_.num_boundary_surfaces * 3);
+    checkCudaErrors(cudaMemcpy(h_boundary_grad_yi.data(), d_boundary_permute, dataBase_.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost));
+    fprintf(stderr, "check h_boundary_grad_yi\n");
+    checkVectorEqual(dataBase_.num_boundary_surfaces * 3, boundary_grad_yi, h_boundary_grad_yi.data(), 1e-10, printFlag);
+    DEBUG_TRACE;
+
+}
+
+void dfYEqn::comparesumYDiffError(const double *sumYDiffError, const double *boundary_sumYDiffError, bool printFlag)
+{ // Debug check: permutes d_sumY_diff_error / d_boundary_sumY_diff_error on the device, copies them to the host and calls checkVectorEqual.
+    DEBUG_TRACE;
+    permute_vector_d2h(dataBase_.stream, dataBase_.num_cells, d_sumY_diff_error, d_permute); // result goes to the scratch buffer d_permute
+    permute_vector_d2h(dataBase_.stream, dataBase_.num_boundary_surfaces, d_boundary_sumY_diff_error, d_boundary_permute);
+    // NOTE(review): the permutes are issued on dataBase_.stream, the cudaMemcpy calls below name no stream -- confirm they are ordered.
+    std::vector<double> h_sumYDiffError; // host staging buffer, 3 components per cell
+    h_sumYDiffError.resize(dataBase_.num_cells * 3);
+    checkCudaErrors(cudaMemcpy(h_sumYDiffError.data(), d_permute, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost));
+    fprintf(stderr, "check h_sumYDiffError\n");
+    checkVectorEqual(dataBase_.num_cells * 3, sumYDiffError, h_sumYDiffError.data(), 1e-10, printFlag); // 1e-10 is presumably the tolerance -- confirm in checkVectorEqual
+    DEBUG_TRACE;
+    std::vector<double> h_boundary_sumYDiffError; // host staging buffer, 3 components per boundary face
+    h_boundary_sumYDiffError.resize(dataBase_.num_boundary_surfaces * 3);
+    checkCudaErrors(cudaMemcpy(h_boundary_sumYDiffError.data(), d_boundary_permute, dataBase_.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost));
+    fprintf(stderr, "check h_boundary_sumYDiffError\n");
+    checkVectorEqual(dataBase_.num_boundary_surfaces * 3, boundary_sumYDiffError, h_boundary_sumYDiffError.data(), 1e-10, printFlag);
+    DEBUG_TRACE;
+}
+
+void dfYEqn::comparehDiffCorrFlux(const double *hDiffCorrFlux, const double *boundary_hDiffCorrFlux, bool printFlag)
+{ // Debug check: permutes dataBase_.d_hDiff_corr_flux and its boundary counterpart on the device, copies them to the host and calls checkVectorEqual.
+    DEBUG_TRACE;
+    permute_vector_d2h(dataBase_.stream, dataBase_.num_cells, dataBase_.d_hDiff_corr_flux, d_permute); // result goes to the scratch buffer d_permute
+    permute_vector_d2h(dataBase_.stream, dataBase_.num_boundary_surfaces, dataBase_.d_boundary_hDiff_corr_flux, d_boundary_permute);
+    // NOTE(review): the permutes are issued on dataBase_.stream, the cudaMemcpy calls below name no stream -- confirm they are ordered.
+    std::vector<double> h_hDiffCorrFlux; // host staging buffer, 3 components per cell
+    h_hDiffCorrFlux.resize(dataBase_.num_cells * 3);
+    checkCudaErrors(cudaMemcpy(h_hDiffCorrFlux.data(), d_permute, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost));
+    fprintf(stderr, "check h_hDiffCorrFlux\n");
+    checkVectorEqual(dataBase_.num_cells * 3, hDiffCorrFlux, h_hDiffCorrFlux.data(), 1e-10, printFlag); // 1e-10 is presumably the tolerance -- confirm in checkVectorEqual
+    DEBUG_TRACE;
+    std::vector<double> h_boundary_hDiffCorrFlux; // host staging buffer, 3 components per boundary face
+    h_boundary_hDiffCorrFlux.resize(dataBase_.num_boundary_surfaces * 3);
+    checkCudaErrors(cudaMemcpy(h_boundary_hDiffCorrFlux.data(), d_boundary_permute, dataBase_.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost));
+    fprintf(stderr, "check h_boundary_hDiffCorrFlux\n");
+    checkVectorEqual(dataBase_.num_boundary_surfaces * 3, boundary_hDiffCorrFlux, h_boundary_hDiffCorrFlux.data(), 1e-10, printFlag);
+    DEBUG_TRACE;
+}
+
+void dfYEqn::comparephiUc(const double *phiUc, const double *boundary_phiUc,  bool printFlag)
+{ // Debug check: copies d_phiUc / d_boundary_phiUc back to the host and passes them, with the caller's arrays, to checkVectorEqual.
+    DEBUG_TRACE;
+    std::vector<double> h_phiUc; // host staging buffer for the internal faces (num_surfaces entries)
+    h_phiUc.resize(dataBase_.num_surfaces);
+    checkCudaErrors(cudaMemcpy(h_phiUc.data(), d_phiUc, dataBase_.surface_value_bytes, cudaMemcpyDeviceToHost));
+    fprintf(stderr, "check h_phiUc\n");
+    checkVectorEqual(dataBase_.num_surfaces, phiUc, h_phiUc.data(), 1e-10, printFlag); // 1e-10 is presumably the tolerance -- confirm in checkVectorEqual
+    DEBUG_TRACE;
+    std::vector<double> h_boundary_phiUc; // host staging buffer for the boundary faces (num_boundary_surfaces entries)
+    h_boundary_phiUc.resize(dataBase_.num_boundary_surfaces);
+    checkCudaErrors(cudaMemcpy(h_boundary_phiUc.data(), d_boundary_phiUc, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost));
+    fprintf(stderr, "check h_boundary_phiUc\n");
+    checkVectorEqual(dataBase_.num_boundary_surfaces, boundary_phiUc, h_boundary_phiUc.data(), 1e-10, printFlag);
+    DEBUG_TRACE;
+}
+
+void dfYEqn::compareResult(const double *lower, const double *upper, const double *diag, const double *source,
+        const double *internal_coeffs, const double *boundary_coeffs, bool printFlag)
+{ // Debug check of the assembled matrix: each device array is copied to the host and passed, with the caller's array, to checkVectorEqual.
+    DEBUG_TRACE;
+    std::vector<double> h_lower; // lower coefficients: num_surfaces entries
+    h_lower.resize(dataBase_.num_surfaces);
+    checkCudaErrors(cudaMemcpy(h_lower.data(), d_lower, dataBase_.surface_value_bytes, cudaMemcpyDeviceToHost));
+    fprintf(stderr, "check h_lower\n");
+    checkVectorEqual(dataBase_.num_surfaces, lower, h_lower.data(), 1e-14, printFlag); // 1e-14 is presumably the tolerance -- confirm in checkVectorEqual
+    DEBUG_TRACE;
+
+    std::vector<double> h_upper; // upper coefficients: num_surfaces entries
+    h_upper.resize(dataBase_.num_surfaces);
+    checkCudaErrors(cudaMemcpy(h_upper.data(), d_upper, dataBase_.surface_value_bytes, cudaMemcpyDeviceToHost));
+    fprintf(stderr, "check h_upper\n");
+    checkVectorEqual(dataBase_.num_surfaces, upper, h_upper.data(), 1e-14, printFlag);
+    DEBUG_TRACE;
+
+    std::vector<double> h_diag; // diagonal: num_cells entries
+    h_diag.resize(dataBase_.num_cells);
+    checkCudaErrors(cudaMemcpy(h_diag.data(), d_diag, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost));
+    fprintf(stderr, "check h_diag\n");
+    checkVectorEqual(dataBase_.num_cells, diag, h_diag.data(), 1e-14, printFlag);
+    DEBUG_TRACE;
+
+    std::vector<double> h_source; // source term: num_cells entries
+    h_source.resize(dataBase_.num_cells);
+    checkCudaErrors(cudaMemcpy(h_source.data(), d_source, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost));
+    fprintf(stderr, "check h_source\n");
+    checkVectorEqual(dataBase_.num_cells, source, h_source.data(), 1e-14, printFlag);
+    DEBUG_TRACE;
+
+    std::vector<double> h_internal_coeffs; // boundary internal coefficients: num_boundary_surfaces entries
+    h_internal_coeffs.resize(dataBase_.num_boundary_surfaces);
+    checkCudaErrors(cudaMemcpy(h_internal_coeffs.data(), d_internal_coeffs, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost));
+    fprintf(stderr, "check h_internal_coeffs\n");
+    checkVectorEqual(dataBase_.num_boundary_surfaces, internal_coeffs, h_internal_coeffs.data(), 1e-14, printFlag);
+    DEBUG_TRACE;
+
+    std::vector<double> h_boundary_coeffs; // boundary coefficients: num_boundary_surfaces entries
+    h_boundary_coeffs.resize(dataBase_.num_boundary_surfaces);
+    checkCudaErrors(cudaMemcpy(h_boundary_coeffs.data(), d_boundary_coeffs, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost));
+    fprintf(stderr, "check h_boundary_coeffs\n");
+    checkVectorEqual(dataBase_.num_boundary_surfaces, boundary_coeffs, h_boundary_coeffs.data(), 1e-14, printFlag);
+    DEBUG_TRACE;
+}
+
+void dfYEqn::compareYi(const double *yi, int specie_index, bool printFlag) { // Debug check of one species' slice of dataBase_.d_y against the caller's array.
+    DEBUG_TRACE;
+    std::vector<double> h_yi; // host staging buffer, num_cells entries
+    h_yi.resize(dataBase_.num_cells);
+    checkCudaErrors(cudaMemcpy(h_yi.data(), dataBase_.d_y + dataBase_.num_cells * specie_index, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost)); // species slices are num_cells apart in d_y
+    fprintf(stderr, "check h_y\n");
+    checkVectorEqual(dataBase_.num_cells, yi, h_yi.data(), 1e-10, printFlag); // 1e-10 is presumably the tolerance -- confirm in checkVectorEqual
+    DEBUG_TRACE;
+}
+// #endif
diff --git a/src_gpu/dfpEqn.H b/src_gpu/dfpEqn.H
new file mode 100644
index 000000000..79a414eac
--- /dev/null
+++ b/src_gpu/dfpEqn.H
@@ -0,0 +1,133 @@
+#pragma once
+
+#include "AmgXSolver.H"
+#include <amgx_c.h>
+#include "dfMatrixDataBase.H"
+#include "dfMatrixOpBase.H"
+
+class dfpEqn
+{ // Holds the device buffers, the AmgX solver handle and the host-side entry points used for the "p" equation on the GPU.
+private:
+	dfMatrixDataBase &dataBase_;
+
+    // CUDA stream used by this equation; set from dataBase_.stream in setConstantValues()
+    cudaStream_t stream;
+#ifdef USE_GRAPH
+    // one graph for one eqn before using self-developed solver (a "pre" and a "post" graph, each with its executable instance)
+    cudaGraph_t graph_pre, graph_post;
+    cudaGraphExec_t graph_instance_pre, graph_instance_post;
+    bool pre_graph_created=false;
+    bool post_graph_created=false;
+#endif
+
+	// constant values -- basic (copies of the arguments given to setConstantValues())
+	std::string mode_string;
+	std::string setting_path;
+
+	// constant values -- amgx solvers (pSolver is created with new in setConstantValues(); the inline destructor below does not delete it)
+	AmgXSolver *pSolver = nullptr;
+    int num_iteration = 0;
+
+	// constant fields - internal (none declared)
+
+	// constant fields - boundary (copied in setConstantFields())
+	std::vector<int> patch_type_U;
+    std::vector<int> patch_type_p;
+
+	// non-constant fields - internal
+	// intermediate fields (device pointers, surface_value_bytes each when allocated)
+    double *d_rhorAUf = nullptr;
+    double *d_phiHbyA = nullptr;
+    double *d_flux = nullptr;
+
+	// non-constant fields - boundary
+    // intermediate boundary fields (device pointers, boundary_surface_value_bytes each when allocated)
+    double *d_boundary_rhorAUf = nullptr;
+    double *d_boundary_phiHbyA = nullptr;
+    double *d_boundary_flux = nullptr;
+    // boundary coeff fields
+	double *d_value_internal_coeffs = nullptr;
+	double *d_value_boundary_coeffs= nullptr;
+	double *d_gradient_internal_coeffs= nullptr;
+	double *d_gradient_boundary_coeffs= nullptr;
+
+	// non-constant fields - ldu (d_ldu is one allocation; d_lower, d_diag, d_upper and d_extern are offsets into it, see createNonConstantLduAndCsrFields())
+    double *d_ldu = nullptr;
+	double *d_lower = nullptr;
+	double *d_upper = nullptr;
+	double *d_diag = nullptr;
+    double *d_extern = nullptr;
+	double *d_source = nullptr;
+	double *d_internal_coeffs = nullptr;
+	double *d_boundary_coeffs = nullptr;
+
+	// non-constant fields - csr
+	double *d_A = nullptr;
+
+    // field pointer map: looked up by getFieldPointer() with keys of the form "h_<alias>", "d_<alias>", "h_boundary_<alias>", "d_boundary_<alias>"
+    std::unordered_map<std::string, double*> fieldPointerMap;
+
+public:
+	// constructor: only binds the shared database; no device memory is allocated here
+    dfpEqn(dfMatrixDataBase &dataBase)
+        : dataBase_(dataBase) {}
+
+	// destructor: empty -- releases neither the device buffers nor pSolver
+	  ~dfpEqn(){}
+
+	// member function
+
+    // getter function: returns nullptr (and prints a warning to stderr) when the name is not in fieldPointerMap
+    double* getFieldPointer(const char* fieldAlias, location loc, position pos);
+
+	// initialization
+	void setConstantValues(const std::string &mode_string, const std::string &setting_path); 
+	void setConstantFields(const std::vector<int> patch_type_U, const std::vector<int> patch_type_p);
+    void initNonConstantFields(const double *p, const double *boundary_p);
+    void createNonConstantFieldsInternal();
+	void createNonConstantFieldsBoundary();
+	void createNonConstantLduAndCsrFields();
+	// NOTE(review): the original comment said "dfUEqn"; presumably it means that dfpEqn has no internal non-constant fields to be init
+	//void initNonConstantFieldsInternal(xxx);
+
+    void cleanCudaResources();
+
+	// run equation
+	void preProcess(double *h_phi, double *h_boundary_phi);
+    void correctPsi(const double *h_thermoPsi, double *h_boundary_thermoPsi); // tmp
+    void correctP(const double *h_p, double *h_boundary_p); // tmp
+    //void getFlux();
+	void process();
+	void postProcess();
+
+    // get intermediate variable (the "internal" arguments come first, the per-patch "boundary" arguments after them)
+    void getrhorAUf(cudaStream_t stream, int num_cells, int num_surfaces,
+        const int *lowerAddr, const int *upperAddr, 
+        const double *weight, const double *vf1, const double *vf2, double *output, // end for internal
+        int num_patches, const int *patch_size, const int *patch_type, const double *boundary_weight,
+        const double *boundary_vf1, const double *boundary_vf2, double *boundary_output, double sign = 1.);
+    void getphiHbyA(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, double rDeltaT, 
+        const int *lowerAddr, const int *upperAddr, 
+        const double *weight, const double *u_old, const double *rho_old, const double *phi_old, const double *rho, 
+        const double *rhorAUf, const double *HbyA, const double *Sf, double *output, // end for internal
+        int num_patches, const int *patch_size, const int *patch_type,
+        const double *boundary_Sf, const double *boundary_velocity_old, const double *boundary_rho, 
+        const double *boundary_rho_old, const double *boundary_phi_old, const double *boundary_rhorAUf, const double *boundary_HbyA,
+        const double *boundary_weight, double *boundary_output, double sign = 1.);
+    void correctionDiagMtxMultiTPsi(cudaStream_t stream, int num_cells, const double *psi, const double *thermo_psi, double *diag, double *source);
+
+    void solve();
+    void sync();
+
+// #if defined DEBUG_   (guard is commented out, so the compare* helpers below are always declared)
+    void compareResult(const double *lower, const double *upper, const double *diag, const double *source, const double *internal_coeffs, const double *boundary_coeffs,  
+        bool printFlag);
+// #endif 
+    void comparerhorAUf(const double *rhorAUf, const double *boundary_rhorAUf, bool printFlag);
+    void comparephiHbyA(const double *phiHbyA, const double *boundary_phiHbyA, bool printFlag);
+    void comparephi(const double *phi, const double *boundary_phi, bool printFlag);
+    void comparephiFlux(const double *flux, const double *boundary_flux, bool printFlag);
+    void comparep(const double *p, const double *boundary_p, bool printFlag);
+    void compareU(const double *U, const double *boundary_U, bool printFlag);
+    void comparedpdt(const double *dpdt, bool printFlag);
+};
diff --git a/src_gpu/dfpEqn.cu b/src_gpu/dfpEqn.cu
new file mode 100644
index 000000000..dc7038a6d
--- /dev/null
+++ b/src_gpu/dfpEqn.cu
@@ -0,0 +1,843 @@
+#include "dfpEqn.H"
+
+__global__ void fvc_interpolate_internal_multi_scalar_kernel(int num_surfaces, const int *lower_index, const int *upper_index,
+        const double *vf1, const double *vf2, const double *weight, double *output, double sign)
+{
+    // One thread per internal face; surplus threads of the last block exit immediately.
+    const int face = blockDim.x * blockIdx.x + threadIdx.x;
+    if (face >= num_surfaces) return;
+    // NOTE(review): `sign` is accepted but never read in this kernel -- confirm that is intended.
+    const int own = lower_index[face];
+    const int nei = upper_index[face];
+    const double lambda = weight[face];
+
+    // Product vf1*vf2 evaluated in the cell on each side of the face.
+    const double prod_own = vf1[own] * vf2[own];
+    const double prod_nei = vf1[nei] * vf2[nei];
+    // Weighted face value: lambda on the owner side, (1 - lambda) on the neighbour side.
+    output[face] = (lambda * (prod_own - prod_nei) + prod_nei);
+}
+
+__global__ void fvc_interpolate_boundary_multi_scalar_kernel_unCouple(int num, int offset,
+        const double *boundary_vf1, const double *boundary_vf2, double *output, double sign)
+{
+    // One thread per face of the patch; `offset` is where the patch starts in the boundary arrays.
+    const int local = blockDim.x * blockIdx.x + threadIdx.x;
+    if (local >= num) return;
+    // NOTE(review): `sign` is accepted but never read in this kernel -- confirm that is intended.
+    const int face = offset + local;
+    // The face value is simply the product of the two boundary fields.
+    output[face] = boundary_vf1[face] * boundary_vf2[face];
+}
+
+__global__ void fvc_interpolate_boundary_multi_scalar_kernel_processor(int num, int offset,
+        const double *boundary_weight, const double *boundary_vf1, const double *boundary_vf2, double *output, double sign)
+{
+    // One thread per face of the processor patch.
+    const int local = blockDim.x * blockIdx.x + threadIdx.x;
+    if (local >= num) return;
+    // Two slots are read per face: [offset, offset + num) is treated as the neighbour side,
+    // [offset + num, offset + 2 * num) as the internal side.
+    const int nbr = offset + local;
+    const int own = offset + num + local;
+
+    const double lambda = boundary_weight[nbr];
+    const double prod_nbr = boundary_vf1[nbr] * boundary_vf2[nbr];
+    const double prod_own = boundary_vf1[own] * boundary_vf2[own];
+
+    // NOTE(review): `sign` is accepted but never read in this kernel -- confirm that is intended.
+    // Blend the two products; the result is stored in the neighbour-side slot only.
+    output[nbr] = (1 - lambda) * prod_nbr + lambda * prod_own;
+}
+
+__global__ void get_phiCorr_internal_kernel(int num_cells, int num_surfaces, 
+        const int *lower_index, const int *upper_index, const double *phi_old, 
+        const double *field_vector, const double *field_scalar, const double *weight, const double *face_vector,
+        double *output)
+{
+    // One thread per internal face.
+    const int face = blockDim.x * blockIdx.x + threadIdx.x;
+    if (face >= num_surfaces) return;
+    const int own = lower_index[face];
+    const int nei = upper_index[face];
+    const double lambda = weight[face];
+
+    // Face vector: the three components are stored num_surfaces apart.
+    const double sf_x = face_vector[num_surfaces * 0 + face];
+    const double sf_y = face_vector[num_surfaces * 1 + face];
+    const double sf_z = face_vector[num_surfaces * 2 + face];
+
+    // (vector * scalar) in the owner and neighbour cells; vector components are num_cells apart.
+    const double own_x = field_vector[num_cells * 0 + own] * field_scalar[own];
+    const double own_y = field_vector[num_cells * 1 + own] * field_scalar[own];
+    const double own_z = field_vector[num_cells * 2 + own] * field_scalar[own];
+    const double nei_x = field_vector[num_cells * 0 + nei] * field_scalar[nei];
+    const double nei_y = field_vector[num_cells * 1 + nei] * field_scalar[nei];
+    const double nei_z = field_vector[num_cells * 2 + nei] * field_scalar[nei];
+
+    // Interpolate the product to the face, dot it with the face vector and subtract from phi_old.
+    const double f_x = (lambda * (own_x - nei_x) + nei_x);
+    const double f_y = (lambda * (own_y - nei_y) + nei_y);
+    const double f_z = (lambda * (own_z - nei_z) + nei_z);
+    output[face] = phi_old[face] - (sf_x * f_x + sf_y * f_y + sf_z * f_z);
+}
+
+__global__ void get_phiCorr_boundary_kernel_zeroGradient(int num_boundary_surfaces, int num, int offset,
+        const double *boundary_face_vector, const double *boundary_field_vector, 
+        const double *boundary_field_scalar, const double *boundary_phi_old, double *output)
+{
+    // One thread per face of the patch; `offset` is where the patch starts in the boundary arrays.
+    const int local = blockDim.x * blockIdx.x + threadIdx.x;
+    if (local >= num) return;
+    const int face = offset + local;
+
+    // Boundary face vector: components are stored num_boundary_surfaces apart.
+    const double sf_x = boundary_face_vector[num_boundary_surfaces * 0 + face];
+    const double sf_y = boundary_face_vector[num_boundary_surfaces * 1 + face];
+    const double sf_z = boundary_face_vector[num_boundary_surfaces * 2 + face];
+
+    // (vector * scalar) taken directly from the boundary values, then phi_old - Sf . (vector * scalar).
+    const double f_x = boundary_field_vector[num_boundary_surfaces * 0 + face] * boundary_field_scalar[face];
+    const double f_y = boundary_field_vector[num_boundary_surfaces * 1 + face] * boundary_field_scalar[face];
+    const double f_z = boundary_field_vector[num_boundary_surfaces * 2 + face] * boundary_field_scalar[face];
+    output[face] = boundary_phi_old[face] - (sf_x * f_x + sf_y * f_y + sf_z * f_z);
+}
+
+__global__ void get_phiCorr_boundary_kernel_processor(int num_boundary_surfaces, int num, int offset,
+        const double *boundary_face_vector, const double *boundary_field_vector, 
+        const double *boundary_field_scalar, const double *boundary_phi_old, 
+        const double *boundary_weight, double *output)
+{
+    // One thread per face of the processor patch.
+    const int local = blockDim.x * blockIdx.x + threadIdx.x;
+    if (local >= num) return;
+    // Two slots are read per face: neighbour side at offset + local, internal side num entries later.
+    const int nbr = offset + local;
+    const int own = offset + num + local;
+    const double lambda = boundary_weight[nbr];
+
+    // Boundary face vector: components are stored num_boundary_surfaces apart.
+    const double sf_x = boundary_face_vector[num_boundary_surfaces * 0 + nbr];
+    const double sf_y = boundary_face_vector[num_boundary_surfaces * 1 + nbr];
+    const double sf_z = boundary_face_vector[num_boundary_surfaces * 2 + nbr];
+
+    // (vector * scalar) on the neighbour-side slot.
+    const double s_nbr = boundary_field_scalar[nbr];
+    const double nbr_x = boundary_field_vector[num_boundary_surfaces * 0 + nbr] * s_nbr;
+    const double nbr_y = boundary_field_vector[num_boundary_surfaces * 1 + nbr] * s_nbr;
+    const double nbr_z = boundary_field_vector[num_boundary_surfaces * 2 + nbr] * s_nbr;
+
+    // (vector * scalar) on the internal-side slot.
+    const double s_own = boundary_field_scalar[own];
+    const double own_x = boundary_field_vector[num_boundary_surfaces * 0 + own] * s_own;
+    const double own_y = boundary_field_vector[num_boundary_surfaces * 1 + own] * s_own;
+    const double own_z = boundary_field_vector[num_boundary_surfaces * 2 + own] * s_own;
+
+    // Blend the two sides with the patch weight.
+    const double f_x = (1 - lambda) * nbr_x + lambda * own_x;
+    const double f_y = (1 - lambda) * nbr_y + lambda * own_y;
+    const double f_z = (1 - lambda) * nbr_z + lambda * own_z;
+
+    // phi_old - Sf . (blended value), written to the neighbour-side slot only.
+    output[nbr] = boundary_phi_old[nbr] - (sf_x * f_x + sf_y * f_y + sf_z * f_z);
+}
+
+__global__ void get_ddtCorr_internal_kernel(int num_cells, int num_surfaces, 
+        const double *phiCorr, const double *phi, const double rDeltaT,
+        double *output)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num_surfaces)
+        return;
+    // One thread per internal face: output = couplingCoeff * rDeltaT * phiCorr.
+    double phiCorrVal = phiCorr[index];
+    double phiVal = phi[index];
+    // Coupling coefficient 1 - min(|phiCorr| / (|phi| + SMALL), 1), the form used by OpenFOAM's ddtScheme::fvcDdtPhiCoeff.
+    // SMALL belongs in the denominator, where it guards the division when phi is zero.
+    double tddtCouplingCoeff = 1. - min(fabs(phiCorrVal) / (fabs(phiVal) + SMALL), 1.);
+    output[index] = tddtCouplingCoeff * rDeltaT * phiCorrVal;
+}
+
+__global__ void get_ddtCorr_boundary_nonZero_kernel(int num_boundary_surfaces, int num, int offset,
+        const double *boundary_phiCorr, const double *boundary_phi, const double rDeltaT,
+        double *output)
+{
+    int index = blockDim.x * blockIdx.x + threadIdx.x;
+    if (index >= num)
+        return;
+    // One thread per face of the patch; `offset` is where the patch starts in the boundary arrays.
+    int start_index = offset + index;
+
+    double bouPhiCorrVal = boundary_phiCorr[start_index];
+    double bouPhiVal = boundary_phi[start_index];
+    // Coupling coefficient 1 - min(|phiCorr| / (|phi| + SMALL), 1): SMALL guards the denominator against a zero flux.
+    double bou_tddtCouplingCoeff = 1. - min(fabs(bouPhiCorrVal) / (fabs(bouPhiVal) + SMALL), 1.);
+    output[start_index] = bou_tddtCouplingCoeff * rDeltaT * bouPhiCorrVal;
+}
+
+__global__ void multi_fvc_flux_fvc_intepolate_internal_kernel(int num_cells, int num_surfaces, 
+        const int *lower_index, const int *upper_index,
+        const double *field_vector, const double *vf, const double *weight, const double *face_vector,
+        double *output, double sign)
+{
+    // One thread per internal face.
+    const int face = blockDim.x * blockIdx.x + threadIdx.x;
+    if (face >= num_surfaces) return;
+    // NOTE(review): `sign` is accepted but never read in this kernel -- confirm that is intended.
+    const int own = lower_index[face];
+    const int nei = upper_index[face];
+    const double lambda = weight[face];
+
+    // Face vector: the three components are stored num_surfaces apart.
+    const double sf_x = face_vector[num_surfaces * 0 + face];
+    const double sf_y = face_vector[num_surfaces * 1 + face];
+    const double sf_z = face_vector[num_surfaces * 2 + face];
+
+    // Vector field interpolated to the face (components are num_cells apart).
+    const double v_x = (lambda * (field_vector[num_cells * 0 + own] - field_vector[num_cells * 0 + nei]) + field_vector[num_cells * 0 + nei]);
+    const double v_y = (lambda * (field_vector[num_cells * 1 + own] - field_vector[num_cells * 1 + nei]) + field_vector[num_cells * 1 + nei]);
+    const double v_z = (lambda * (field_vector[num_cells * 2 + own] - field_vector[num_cells * 2 + nei]) + field_vector[num_cells * 2 + nei]);
+    // Scalar field interpolated to the same face.
+    const double s_face = (lambda * (vf[own] - vf[nei]) + vf[nei]);
+    // Accumulate (Sf . v) * s into the output; the previous content of output is kept.
+    output[face] += (sf_x * v_x + sf_y * v_y + sf_z * v_z) * s_face;
+}
+
+__global__ void multi_fvc_flux_fvc_intepolate_boundary_kernel_zeroGradient(int num_boundary_surfaces, int num, int offset, 
+        const double *boundary_face_vector, const double *boundary_field_vector, 
+        const double *boundary_vf, double *output, double sign)
+{
+    // One thread per face of the patch; `offset` is where the patch starts in the boundary arrays.
+    const int local = blockDim.x * blockIdx.x + threadIdx.x;
+    if (local >= num) return;
+    const int face = offset + local;
+    // NOTE(review): `sign` is accepted but never read in this kernel -- confirm that is intended.
+
+    // Boundary face vector: components are stored num_boundary_surfaces apart.
+    const double sf_x = boundary_face_vector[num_boundary_surfaces * 0 + face];
+    const double sf_y = boundary_face_vector[num_boundary_surfaces * 1 + face];
+    const double sf_z = boundary_face_vector[num_boundary_surfaces * 2 + face];
+    // Boundary vector value is used as is; accumulate (Sf . v) * boundary_vf into the output.
+    const double v_x = boundary_field_vector[num_boundary_surfaces * 0 + face];
+    const double v_y = boundary_field_vector[num_boundary_surfaces * 1 + face];
+    const double v_z = boundary_field_vector[num_boundary_surfaces * 2 + face];
+    output[face] += (sf_x * v_x + sf_y * v_y + sf_z * v_z) * boundary_vf[face];
+}
+
+__global__ void multi_fvc_flux_fvc_intepolate_boundary_kernel_processor(int num_boundary_surfaces, int num, int offset, 
+        const double *boundary_face_vector, const double *boundary_field_vector, const double *boundary_weight,
+        const double *boundary_vf, double *output, double sign)
+{
+    // One thread per face of the processor patch.
+    const int local = blockDim.x * blockIdx.x + threadIdx.x;
+    if (local >= num) return;
+    // NOTE(review): `sign` is accepted but never read in this kernel -- confirm that is intended.
+    const int nbr = offset + local;        // neighbour-side slot
+    const int own = offset + num + local;  // internal-side slot, num entries later
+    const double lambda = boundary_weight[nbr];
+
+    // Boundary face vector: components are stored num_boundary_surfaces apart.
+    const double sf_x = boundary_face_vector[num_boundary_surfaces * 0 + nbr];
+    const double sf_y = boundary_face_vector[num_boundary_surfaces * 1 + nbr];
+    const double sf_z = boundary_face_vector[num_boundary_surfaces * 2 + nbr];
+
+    // Vector field blended between the two sides with the patch weight.
+    const double v_x = (1 - lambda) * boundary_field_vector[num_boundary_surfaces * 0 + nbr] + 
+            lambda * boundary_field_vector[num_boundary_surfaces * 0 + own];
+    const double v_y = (1 - lambda) * boundary_field_vector[num_boundary_surfaces * 1 + nbr] + 
+            lambda * boundary_field_vector[num_boundary_surfaces * 1 + own];
+    const double v_z = (1 - lambda) * boundary_field_vector[num_boundary_surfaces * 2 + nbr] + 
+            lambda * boundary_field_vector[num_boundary_surfaces * 2 + own];
+
+    // Scalar field blended the same way.
+    const double s_face = (1 - lambda) * boundary_vf[nbr] + lambda * boundary_vf[own];
+    // Accumulate (Sf . v) * s into the neighbour-side slot; previous content is kept.
+    output[nbr] += (sf_x * v_x + sf_y * v_y + sf_z * v_z) * s_face;
+}
+
+__global__ void correct_diag_mtx_multi_tpsi_kernel(int num_cells, const double *psi, const double *thermo_psi, 
+        double *source, double *diag)
+{
+    // One thread per cell.
+    const int cell = blockDim.x * blockIdx.x + threadIdx.x;
+    if (cell >= num_cells) return;
+    // NOTE(review): parameters here are (source, diag); dfpEqn::correctionDiagMtxMultiTPsi is declared with (diag, source) -- confirm the call site.
+    // correction: source -= (-diag * psi + source)
+    const double src_old = source[cell];
+    const double residual = - diag[cell] * psi[cell] + src_old;
+    source[cell] -= residual;
+
+    // multiply both the corrected source and the diagonal by thermo_psi
+    const double scale = thermo_psi[cell];
+    source[cell] *= scale;
+    diag[cell] *= scale;
+}
+
+double* dfpEqn::getFieldPointer(const char* fieldAlias, location loc, position pos) {
+    // Looks up "h_<alias>" / "d_<alias>" (internal) or "h_boundary_<alias>" / "d_boundary_<alias>" (boundary) in fieldPointerMap.
+    char mergedName[256] = ""; // stays an empty key if pos is neither internal nor boundary (no uninitialised read)
+    const char *prefix = (loc == location::cpu) ? "h" : "d";
+    if (pos == position::internal) {
+        snprintf(mergedName, sizeof(mergedName), "%s_%s", prefix, fieldAlias); // bounded: an over-long alias is truncated instead of overflowing
+    } else if (pos == position::boundary) {
+        snprintf(mergedName, sizeof(mergedName), "%s_boundary_%s", prefix, fieldAlias);
+    }
+
+    // Single lookup; find() never inserts, so a miss leaves the map untouched and yields nullptr.
+    const auto it = fieldPointerMap.find(mergedName);
+    double *pointer = (it != fieldPointerMap.end()) ? it->second : nullptr;
+    if (pointer == nullptr) {
+        fprintf(stderr, "Warning! getFieldPointer of %s returns nullptr!\n", mergedName);
+    }
+
+    return pointer;
+}
+
+void dfpEqn::setConstantValues(const std::string &mode_string, const std::string &setting_path) {
+    stream = dataBase_.stream; // use the database's CUDA stream for this equation
+    this->setting_path = setting_path; // keep copies of the solver configuration
+    this->mode_string = mode_string;
+    pSolver = new AmgXSolver(mode_string, setting_path, dataBase_.localRank); // NOTE(review): raw owning pointer and the inline destructor is empty -- confirm who deletes it
+}
+
+void dfpEqn::setConstantFields(const std::vector<int> patch_type_U, const std::vector<int> patch_type_p) {
+    this->patch_type_p = patch_type_p; // member copy of the p patch types (presumably one entry per patch -- confirm with caller)
+    this->patch_type_U = patch_type_U; // member copy of the U patch types
+}
+
+void dfpEqn::createNonConstantFieldsInternal() { // Allocates the internal intermediate device buffers; does nothing when STREAM_ALLOCATOR is defined.
+#ifndef STREAM_ALLOCATOR
+    // intermediate fields, surface_value_bytes each
+    checkCudaErrors(cudaMalloc((void**)&d_rhorAUf, dataBase_.surface_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_phiHbyA, dataBase_.surface_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_flux, dataBase_.surface_value_bytes));
+#endif
+}
+
+void dfpEqn::createNonConstantFieldsBoundary() { // Allocates the boundary device buffers; does nothing when STREAM_ALLOCATOR is defined.
+#ifndef STREAM_ALLOCATOR
+    // boundary coeffs, boundary_surface_value_bytes each
+    checkCudaErrors(cudaMalloc((void**)&d_value_internal_coeffs, dataBase_.boundary_surface_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_value_boundary_coeffs, dataBase_.boundary_surface_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_gradient_internal_coeffs, dataBase_.boundary_surface_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_gradient_boundary_coeffs, dataBase_.boundary_surface_value_bytes));
+    // intermediate boundary fields, boundary_surface_value_bytes each
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_rhorAUf, dataBase_.boundary_surface_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_phiHbyA, dataBase_.boundary_surface_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_flux, dataBase_.boundary_surface_value_bytes));
+#endif
+}
+
+void dfpEqn::createNonConstantLduAndCsrFields() {
+    // ldu and csr: d_ldu is one csr_value_bytes allocation (made regardless of STREAM_ALLOCATOR) that the pointers below partition
+    checkCudaErrors(cudaMalloc((void**)&d_ldu, dataBase_.csr_value_bytes));
+    d_lower = d_ldu; // first num_surfaces entries
+    d_diag = d_ldu + dataBase_.num_surfaces; // next num_cells entries
+    d_upper = d_ldu + dataBase_.num_cells + dataBase_.num_surfaces; // next num_surfaces entries
+    d_extern = d_ldu + dataBase_.num_cells + 2 * dataBase_.num_surfaces; // whatever remains of the allocation
+#ifndef STREAM_ALLOCATOR
+    checkCudaErrors(cudaMalloc((void**)&d_source, dataBase_.cell_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_internal_coeffs, dataBase_.boundary_surface_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_boundary_coeffs, dataBase_.boundary_surface_value_bytes));
+    checkCudaErrors(cudaMalloc((void**)&d_A, dataBase_.csr_value_bytes));
+#endif
+}
+
+void dfpEqn::initNonConstantFields(const double *p, const double *boundary_p){ // NOTE(review): `p` and `boundary_p` are never read; the copies below take dataBase_.h_p / h_boundary_p -- confirm intended
+    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_p, dataBase_.h_p, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream));
+    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_p, dataBase_.h_boundary_p, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream));
+}
+
+void dfpEqn::cleanCudaResources() { // Destroys the CUDA graphs that were created; frees no device buffers and does not touch pSolver.
+#ifdef USE_GRAPH
+    if (pre_graph_created) { // executable instance first, then the graph it was built from
+        checkCudaErrors(cudaGraphExecDestroy(graph_instance_pre));
+        checkCudaErrors(cudaGraphDestroy(graph_pre));
+    }
+    if (post_graph_created) {
+        checkCudaErrors(cudaGraphExecDestroy(graph_instance_post));
+        checkCudaErrors(cudaGraphDestroy(graph_post));
+    }
+#endif
+}
+
+// tmp
+void dfpEqn::preProcess(double *h_phi, double *h_boundary_phi) { // Queues host-to-device copies of phi and boundary phi on dataBase_.stream; no synchronisation is done here.
+    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_phi, h_phi, dataBase_.surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream));
+    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_phi, h_boundary_phi, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream));
+};
+
+void dfpEqn::correctPsi(const double *h_thermoPsi, double *h_boundary_thermoPsi) { // Queues host-to-device copies of thermo psi and its boundary values on dataBase_.stream; no synchronisation is done here.
+    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_thermo_psi, h_thermoPsi, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream));
+    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_thermo_psi, h_boundary_thermoPsi, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream));
+};
+void dfpEqn::correctP(const double *h_p, double *h_boundary_p) { // Queues host-to-device copies of p and boundary p on dataBase_.stream; no synchronisation is done here.
+    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_p, h_p, dataBase_.cell_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream));
+    checkCudaErrors(cudaMemcpyAsync(dataBase_.d_boundary_p, h_boundary_p, dataBase_.boundary_surface_value_bytes, cudaMemcpyHostToDevice, dataBase_.stream));
+};
+
+void dfpEqn::process() { // One pressure-equation step on dataBase_.stream: assemble the system (LDU -> CSR), solve(), then update boundary p, phi, U, K and dpdt; returns after sync().
+    TICK_INIT_EVENT; // TICK_* are timing macros defined elsewhere (presumably event-based timers — confirm)
+    TICK_START_EVENT;
+#ifdef USE_GRAPH
+    if(!pre_graph_created) { // first call only: the assembly work below is captured into graph_pre instead of being re-enqueued on every call
+        DEBUG_TRACE;
+        checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal));
+#endif
+
+#ifdef STREAM_ALLOCATOR // per-call scratch buffers; released with cudaFreeAsync near the end of this function
+    // intermediate fields
+    checkCudaErrors(cudaMallocAsync((void**)&d_rhorAUf, dataBase_.surface_value_bytes, dataBase_.stream));
+    checkCudaErrors(cudaMallocAsync((void**)&d_phiHbyA, dataBase_.surface_value_bytes, dataBase_.stream));
+    checkCudaErrors(cudaMallocAsync((void**)&d_flux, dataBase_.surface_value_bytes, dataBase_.stream));
+
+    // boundary coeffs
+    checkCudaErrors(cudaMallocAsync((void**)&d_value_internal_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream));
+    checkCudaErrors(cudaMallocAsync((void**)&d_value_boundary_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream));
+    checkCudaErrors(cudaMallocAsync((void**)&d_gradient_internal_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream));
+    checkCudaErrors(cudaMallocAsync((void**)&d_gradient_boundary_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream));
+    // intermediate boundary fields
+    checkCudaErrors(cudaMallocAsync((void**)&d_boundary_rhorAUf, dataBase_.boundary_surface_value_bytes, dataBase_.stream));
+    checkCudaErrors(cudaMallocAsync((void**)&d_boundary_phiHbyA, dataBase_.boundary_surface_value_bytes, dataBase_.stream));
+    checkCudaErrors(cudaMallocAsync((void**)&d_boundary_flux, dataBase_.boundary_surface_value_bytes, dataBase_.stream));
+
+    // ldu and csr
+    checkCudaErrors(cudaMallocAsync((void**)&d_source, dataBase_.cell_value_bytes, dataBase_.stream));
+    checkCudaErrors(cudaMallocAsync((void**)&d_internal_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream));
+    checkCudaErrors(cudaMallocAsync((void**)&d_boundary_coeffs, dataBase_.boundary_surface_value_bytes, dataBase_.stream));
+    checkCudaErrors(cudaMallocAsync((void**)&d_A, dataBase_.csr_value_bytes, dataBase_.stream));
+#endif
+    // Zero the accumulators. NOTE(review): d_ldu is not allocated in the STREAM_ALLOCATOR block above — presumably it is allocated elsewhere; confirm.
+    checkCudaErrors(cudaMemsetAsync(d_ldu, 0, dataBase_.csr_value_bytes, dataBase_.stream)); // d_ldu contains d_lower, d_diag, and d_upper
+    checkCudaErrors(cudaMemsetAsync(d_source, 0, dataBase_.cell_value_bytes, dataBase_.stream));
+    checkCudaErrors(cudaMemsetAsync(d_internal_coeffs, 0, dataBase_.boundary_surface_value_bytes, dataBase_.stream));
+    checkCudaErrors(cudaMemsetAsync(d_boundary_coeffs, 0, dataBase_.boundary_surface_value_bytes, dataBase_.stream));
+    checkCudaErrors(cudaMemsetAsync(d_A, 0, dataBase_.csr_value_bytes, dataBase_.stream));
+
+    // intermediate parameters
+    checkCudaErrors(cudaMemsetAsync(d_rhorAUf, 0, dataBase_.surface_value_bytes, dataBase_.stream));
+    checkCudaErrors(cudaMemsetAsync(d_boundary_rhorAUf, 0, dataBase_.boundary_surface_value_bytes, dataBase_.stream));
+    checkCudaErrors(cudaMemsetAsync(d_phiHbyA, 0, dataBase_.surface_value_bytes, dataBase_.stream));
+    checkCudaErrors(cudaMemsetAsync(d_boundary_phiHbyA, 0, dataBase_.boundary_surface_value_bytes, dataBase_.stream));
+    checkCudaErrors(cudaMemsetAsync(d_flux, 0, dataBase_.surface_value_bytes, dataBase_.stream)); // TODO: introduce of flux is not necessary
+    // Assembly. What each helper computes is inferred from its name and arguments — confirm against the helper definitions.
+    update_boundary_coeffs_scalar(dataBase_.stream, // writes the value/gradient boundary-coefficient arrays for p (patch types from patch_type_p)
+            dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_p.data(),
+            dataBase_.d_boundary_delta_coeffs, dataBase_.d_boundary_p, dataBase_.d_boundary_weight,
+            d_value_internal_coeffs, d_value_boundary_coeffs,
+            d_gradient_internal_coeffs, d_gradient_boundary_coeffs);
+    getrhorAUf(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, // writes d_rhorAUf / d_boundary_rhorAUf from rho and rAU
+            dataBase_.d_owner, dataBase_.d_neighbor, dataBase_.d_weight, 
+            dataBase_.d_rho, dataBase_.d_rAU, d_rhorAUf, // end for internal
+            dataBase_.num_patches, dataBase_.patch_size.data(), dataBase_.patch_type_calculated.data(), dataBase_.d_boundary_weight,
+            dataBase_.d_boundary_rho, dataBase_.d_boundary_rAU, d_boundary_rhorAUf);
+    getphiHbyA(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, // writes d_phiHbyA / d_boundary_phiHbyA
+            dataBase_.rdelta_t, dataBase_.d_owner, dataBase_.d_neighbor, 
+            dataBase_.d_weight, dataBase_.d_u_old, dataBase_.d_rho_old,
+            dataBase_.d_phi_old, dataBase_.d_rho, d_rhorAUf, dataBase_.d_HbyA, dataBase_.d_sf, d_phiHbyA, // end for internal
+            dataBase_.num_patches, dataBase_.patch_size.data(), dataBase_.patch_type_extropolated.data(),
+            dataBase_.d_boundary_sf, dataBase_.d_boundary_u_old, dataBase_.d_boundary_rho, 
+            dataBase_.d_boundary_rho_old, dataBase_.d_boundary_phi_old, d_boundary_rhorAUf, dataBase_.d_boundary_HbyA, 
+            dataBase_.d_boundary_weight, d_boundary_phiHbyA, 1.0);
+    fvm_ddt_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.rdelta_t, dataBase_.d_p_old, dataBase_.d_volume, d_diag, d_source); // time-derivative term into d_diag / d_source
+    correctionDiagMtxMultiTPsi(dataBase_.stream, dataBase_.num_cells, dataBase_.d_p, dataBase_.d_thermo_psi, d_diag, d_source); // adjusts d_diag / d_source using p and thermo psi
+    fvc_ddt_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.rdelta_t, dataBase_.d_rho, dataBase_.d_rho_old, dataBase_.d_volume, 
+            d_source, -1.); // explicit rho time-derivative into d_source, called with sign -1
+    fvc_div_surface_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, // explicit divergence of phiHbyA into d_source, sign -1
+            dataBase_.d_owner, dataBase_.d_neighbor, d_phiHbyA, dataBase_.d_boundary_face_cell, 
+            dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_p.data(),
+            d_boundary_phiHbyA, dataBase_.d_volume, d_source, -1.);
+    fvm_laplacian_surface_scalar_vol_scalar(dataBase_.stream, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, // implicit laplacian with coefficient rhorAUf, sign -1
+            dataBase_.d_owner, dataBase_.d_neighbor, dataBase_.d_mag_sf, dataBase_.d_delta_coeffs, d_rhorAUf, 
+            d_lower, d_upper, d_diag, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_p.data(),
+            dataBase_.d_boundary_mag_sf, d_boundary_rhorAUf, d_gradient_internal_coeffs, d_gradient_boundary_coeffs, 
+            d_internal_coeffs, d_boundary_coeffs, -1.);
+    
+    // convert the assembled LDU system (d_ldu, d_source, boundary coeffs) into the CSR values d_A used by solve()
+    ldu_to_csr_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces,
+            dataBase_.num_Nz, dataBase_.d_boundary_face_cell, dataBase_.d_ldu_to_csr_index,
+            dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_p.data(),
+            d_ldu, d_source, d_internal_coeffs, d_boundary_coeffs, d_A);
+
+#ifdef USE_GRAPH
+        checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_pre));
+        checkCudaErrors(cudaGraphInstantiate(&graph_instance_pre, graph_pre, NULL, NULL, 0));
+        pre_graph_created = true;
+    }
+    DEBUG_TRACE;
+    checkCudaErrors(cudaGraphLaunch(graph_instance_pre, dataBase_.stream)); // every call, including the first, launches the instantiated assembly graph
+#endif
+    TICK_END_EVENT(pEqn assembly);
+
+    TICK_START_EVENT;
+    solve(); // issued outside both captured regions
+    TICK_END_EVENT(pEqn solve);
+
+#ifdef USE_GRAPH
+    if(!post_graph_created) { // first call only: capture the post-solve updates into graph_post
+        checkCudaErrors(cudaStreamBeginCapture(dataBase_.stream, cudaStreamCaptureModeGlobal));
+#endif
+    
+        TICK_START_EVENT;
+        correct_boundary_conditions_scalar(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), // refresh d_boundary_p from the solved d_p
+                dataBase_.num_boundary_surfaces, dataBase_.num_patches, dataBase_.patch_size.data(), 
+                patch_type_p.data(), dataBase_.d_boundary_delta_coeffs,
+                dataBase_.d_boundary_face_cell, dataBase_.d_p, dataBase_.d_boundary_p,
+                dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_weight);
+        // update phi
+        fvMtx_flux(dataBase_.stream, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, 
+                d_lower, d_upper, dataBase_.d_p, d_flux,
+                dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_p.data(), 
+                dataBase_.d_boundary_face_cell, d_internal_coeffs, d_boundary_coeffs, dataBase_.cyclicNeighbor.data(), 
+                dataBase_.patchSizeOffset.data(), dataBase_.d_boundary_p, d_boundary_flux);
+        field_add_scalar(dataBase_.stream, dataBase_.num_surfaces, d_phiHbyA, d_flux, dataBase_.d_phi, // d_phi from phiHbyA and the matrix flux (internal and boundary)
+                dataBase_.num_boundary_surfaces, d_boundary_phiHbyA, d_boundary_flux, dataBase_.d_boundary_phi);
+        // correct U
+        checkCudaErrors(cudaMemsetAsync(dataBase_.d_u, 0., dataBase_.cell_value_vec_bytes, dataBase_.stream)); // NOTE(review): value argument is written as the double literal 0.
+        // TODO: may do not need to calculate boundary fields
+        fvc_grad_cell_scalar(dataBase_.stream, dataBase_.num_cells, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, // gradient of p into d_u
+                dataBase_.d_owner, dataBase_.d_neighbor, 
+                dataBase_.d_weight, dataBase_.d_sf, dataBase_.d_p, dataBase_.d_u, 
+                dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_p.data(), dataBase_.d_boundary_weight,
+                dataBase_.d_boundary_face_cell, dataBase_.d_boundary_p, dataBase_.d_boundary_sf, dataBase_.d_volume, true);
+        scalar_field_multiply_vector_field(dataBase_.stream, dataBase_.num_cells, dataBase_.d_rAU, dataBase_.d_u, dataBase_.d_u); // in place: d_u scaled by d_rAU
+        field_add_vector(dataBase_.stream, dataBase_.num_cells, dataBase_.d_HbyA, dataBase_.d_u, dataBase_.d_u, -1.); // presumably d_u = HbyA - rAU * grad(p); confirm the sign convention
+        correct_boundary_conditions_vector(dataBase_.stream, dataBase_.nccl_comm, dataBase_.neighbProcNo.data(), dataBase_.num_boundary_surfaces, 
+                dataBase_.num_cells, dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_U.data(), dataBase_.d_boundary_weight,
+                dataBase_.d_boundary_face_cell, dataBase_.d_u, dataBase_.d_boundary_u, 
+                dataBase_.cyclicNeighbor.data(), dataBase_.patchSizeOffset.data());
+        vector_half_mag_square(dataBase_.stream, dataBase_.num_cells, dataBase_.d_u, dataBase_.d_k, dataBase_.num_boundary_surfaces, // d_k / d_boundary_k from the corrected velocity
+                dataBase_.d_boundary_u, dataBase_.d_boundary_k);
+        // calculate dpdt
+        fvc_ddt_scalar_field(dataBase_.stream, dataBase_.num_cells, dataBase_.rdelta_t, dataBase_.d_p, dataBase_.d_p_old, dataBase_.d_volume, dataBase_.d_dpdt, 1.);
+
+#ifdef STREAM_ALLOCATOR // release the scratch buffers allocated at the top of this function
+        // intermediate fields
+        checkCudaErrors(cudaFreeAsync(d_rhorAUf, dataBase_.stream));
+        checkCudaErrors(cudaFreeAsync(d_phiHbyA, dataBase_.stream));
+        checkCudaErrors(cudaFreeAsync(d_flux, dataBase_.stream));
+
+        // boundary coeffs
+        checkCudaErrors(cudaFreeAsync(d_value_internal_coeffs, dataBase_.stream));
+        checkCudaErrors(cudaFreeAsync(d_value_boundary_coeffs, dataBase_.stream));
+        checkCudaErrors(cudaFreeAsync(d_gradient_internal_coeffs, dataBase_.stream));
+        checkCudaErrors(cudaFreeAsync(d_gradient_boundary_coeffs, dataBase_.stream));
+        // intermediate boundary fields
+        checkCudaErrors(cudaFreeAsync(d_boundary_rhorAUf, dataBase_.stream));
+        checkCudaErrors(cudaFreeAsync(d_boundary_phiHbyA, dataBase_.stream));
+        checkCudaErrors(cudaFreeAsync(d_boundary_flux, dataBase_.stream));
+
+        // ldu and csr
+        checkCudaErrors(cudaFreeAsync(d_source, dataBase_.stream));
+        checkCudaErrors(cudaFreeAsync(d_internal_coeffs, dataBase_.stream));
+        checkCudaErrors(cudaFreeAsync(d_boundary_coeffs, dataBase_.stream));
+        checkCudaErrors(cudaFreeAsync(d_A, dataBase_.stream));
+#endif
+        TICK_END_EVENT(pEqn post process all);
+
+#ifdef USE_GRAPH
+        checkCudaErrors(cudaStreamEndCapture(dataBase_.stream, &graph_post));
+        checkCudaErrors(cudaGraphInstantiate(&graph_instance_post, graph_post, NULL, NULL, 0));
+        post_graph_created = true;
+    }
+    checkCudaErrors(cudaGraphLaunch(graph_instance_post, dataBase_.stream)); // every call launches the instantiated post-solve graph
+#endif
+    sync(); // host waits for dataBase_.stream before returning
+}
+void dfpEqn::postProcess() {} // No-op: the body is empty; the post-solve updates are issued inside process().
+
+//void dfpEqn::getFlux()
+//{
+//    fvMtx_flux(dataBase_.stream, dataBase_.num_surfaces, dataBase_.num_boundary_surfaces, dataBase_.d_owner, dataBase_.d_neighbor, 
+//            d_lower, d_upper, dataBase_.d_p, d_flux,
+//            dataBase_.num_patches, dataBase_.patch_size.data(), patch_type_p.data(), 
+//            dataBase_.d_boundary_face_cell, d_internal_coeffs, d_boundary_coeffs, dataBase_.d_boundary_p, d_boundary_flux);
+//    sync();
+//}
+
+void dfpEqn::getrhorAUf(cudaStream_t stream, int num_cells, int num_surfaces,
+        const int *lowerAddr, const int *upperAddr, 
+        const double *weight, const double *vf1, const double *vf2, double *output, // end for internal
+        int num_patches, const int *patch_size, const int *patch_type, const double *boundary_weight,
+        const double *boundary_vf1, const double *boundary_vf2, double *boundary_output, double sign) 
+{
+    // Launches the two-field interpolation kernels: one over the internal faces, then one per non-empty patch (num_cells is unused).
+    const size_t cellFaceThreads = 1024, patchThreads = 256;
+    const size_t cellFaceBlocks = (num_surfaces + cellFaceThreads - 1) / cellFaceThreads;
+    fvc_interpolate_internal_multi_scalar_kernel<<<cellFaceBlocks, cellFaceThreads, 0, stream>>>(num_surfaces,
+            lowerAddr, upperAddr, vf1, vf2, weight, output, sign);
+    int faceStart = 0; // start index of the current patch inside the boundary arrays
+    for (int patchi = 0; patchi < num_patches; ++patchi) {
+        const int nFaces = patch_size[patchi];
+        if (nFaces == 0) continue;
+        const int type = patch_type[patchi]; // TODO: maybe do not need loop boundaries
+        const size_t patchBlocks = (nFaces + patchThreads - 1) / patchThreads;
+        const bool uncoupled = type == boundaryConditions::zeroGradient || type == boundaryConditions::fixedValue
+                || type == boundaryConditions::calculated || type == boundaryConditions::cyclic;
+        const bool processorLike = type == boundaryConditions::processor || type == boundaryConditions::processorCyclic;
+        int advance = nFaces; // default stride to the next patch
+        if (uncoupled) {
+            fvc_interpolate_boundary_multi_scalar_kernel_unCouple<<<patchBlocks, patchThreads, 0, stream>>>(nFaces, faceStart,
+                    boundary_vf1, boundary_vf2, boundary_output, sign);
+        } else if (processorLike) {
+            fvc_interpolate_boundary_multi_scalar_kernel_processor<<<patchBlocks, patchThreads, 0, stream>>>(nFaces, faceStart,
+                    boundary_weight, boundary_vf1, boundary_vf2, boundary_output, sign);
+            advance = 2 * nFaces; // processor-type patches take 2 * patch_size entries in the boundary arrays
+        } else {
+            // Unsupported patch type: only reported; no kernel is launched and the offset still advances by patch_size.
+            fprintf(stderr, "%s %d, boundaryConditions other than zeroGradient are not support yet!\n", __FILE__, __LINE__);
+        }
+        faceStart += advance;
+    }
+};
+
+void dfpEqn::getphiHbyA(cudaStream_t stream, int num_cells, int num_surfaces, int num_boundary_surfaces, double rDeltaT, 
+        const int *lowerAddr, const int *upperAddr, 
+        const double *weight, const double *u_old, const double *rho_old, const double *phi_old, const double *rho, 
+        const double *rhorAUf, const double *HbyA, const double *Sf, double *output, // end for internal
+        int num_patches, const int *patch_size, const int *patch_type,
+        const double *boundary_Sf, const double *boundary_velocity_old, const double *boundary_rho, 
+        const double *boundary_rho_old, const double *boundary_phi_old, const double *boundary_rhorAUf, const double *boundary_HbyA,
+        const double *boundary_weight, double *boundary_output, double sign)
+{ // Builds output / boundary_output in four in-place stages on `stream`: phiCorr kernels, ddtCorr kernels, multiply by rhorAUf, then the HbyA flux kernels.
+    size_t threads_per_block = 1024;
+    size_t blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block;
+    get_phiCorr_internal_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_surfaces, lowerAddr, upperAddr, // stage 1, internal faces: writes output from the old-time fields
+            phi_old, u_old, rho_old, weight, Sf, output);
+    // Stage 1, boundary faces: offset advances by 2 * patch_size on processor-type patches and by patch_size on every other patch.
+    int offset = 0;
+    for (int i = 0; i < num_patches; i++) {
+        if (patch_size[i] == 0) continue;
+        threads_per_block = 256;
+        blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block;
+        if (patch_type[i] == boundaryConditions::processor
+            || patch_type[i] == boundaryConditions::processorCyclic) {
+            get_phiCorr_boundary_kernel_processor<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_surfaces, patch_size[i], offset,
+                    boundary_Sf, boundary_velocity_old, boundary_rho_old, boundary_phi_old, boundary_weight, boundary_output);
+            offset += 2 * patch_size[i];
+        } else { // every non-processor patch type goes through the zeroGradient kernel here
+            get_phiCorr_boundary_kernel_zeroGradient<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_surfaces, patch_size[i], offset,
+                    boundary_Sf, boundary_velocity_old, boundary_rho_old, boundary_phi_old, boundary_output);
+            offset += patch_size[i];
+        }
+    }
+    // NOTE(review): threads_per_block is 256 from here on only if some patch was non-empty (otherwise still 1024); blocks_per_grid is recomputed from it either way.
+    blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block;
+    get_ddtCorr_internal_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_surfaces, output, phi_old, rDeltaT, output); // stage 2: output is both input and result
+
+    offset = 0;
+    for (int i = 0; i < num_patches; i++) {
+        if (patch_size[i] == 0) continue;
+        threads_per_block = 256;
+        blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block;
+        if (patch_type[i] == boundaryConditions::processor
+            || patch_type[i] == boundaryConditions::processorCyclic) { // stage 2 launches a boundary kernel for processor-type patches only
+            get_ddtCorr_boundary_nonZero_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_surfaces, patch_size[i], offset, 
+                    boundary_output, boundary_phi_old, rDeltaT, boundary_output);
+            offset += 2 * patch_size[i];
+            continue;
+        }
+        offset += patch_size[i]; // other patch types: no kernel launched in this stage, only the offset advances
+    }
+
+    field_multiply_scalar(stream, num_surfaces, output, rhorAUf, output, num_boundary_surfaces, boundary_output, boundary_rhorAUf, boundary_output); // stage 3: in-place multiply by rhorAUf (internal and boundary)
+
+    blocks_per_grid = (num_surfaces + threads_per_block - 1) / threads_per_block;
+    multi_fvc_flux_fvc_intepolate_internal_kernel<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_cells, num_surfaces, lowerAddr, upperAddr, // stage 4, internal faces; presumably adds the sign-scaled HbyA flux into output — confirm in the kernel
+            HbyA, rho, weight, Sf, output, sign);
+    
+    offset = 0;
+    for (int i = 0; i < num_patches; i++) {
+        if (patch_size[i] == 0) continue;
+        threads_per_block = 256;
+        blocks_per_grid = (patch_size[i] + threads_per_block - 1) / threads_per_block;
+        if (patch_type[i] == boundaryConditions::extrapolated
+            || patch_type[i] == boundaryConditions::cyclic) {
+            multi_fvc_flux_fvc_intepolate_boundary_kernel_zeroGradient<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_surfaces, patch_size[i], offset, 
+                    boundary_Sf, boundary_HbyA, boundary_rho, boundary_output, sign);
+        } else if (patch_type[i] == boundaryConditions::processor
+                    || patch_type[i] == boundaryConditions::processorCyclic) {
+            multi_fvc_flux_fvc_intepolate_boundary_kernel_processor<<<blocks_per_grid, threads_per_block, 0, stream>>>(num_boundary_surfaces, patch_size[i], offset, 
+                    boundary_Sf, boundary_HbyA, boundary_weight, boundary_rho, boundary_output, sign);
+            offset += 2 * patch_size[i];
+            continue;
+        } else { // unsupported type: reported on stderr only; no kernel is launched and the offset still advances below
+            fprintf(stderr, "%s %d, boundaryConditions other than zeroGradient are not support yet!\n", __FILE__, __LINE__);
+        }
+        offset += patch_size[i];
+    }
+}
+
+void dfpEqn::correctionDiagMtxMultiTPsi(cudaStream_t stream, int num_cells, const double *psi, const double *thermo_psi, double *diag, double *source)
+{   // Launches correct_diag_mtx_multi_tpsi_kernel on `stream` with enough 1024-thread blocks to cover num_cells.
+    const size_t block_dim = 1024;
+    const size_t grid_dim = (num_cells + block_dim - 1) / block_dim; // ceil(num_cells / block_dim)
+    correct_diag_mtx_multi_tpsi_kernel<<<grid_dim, block_dim, 0, stream>>>(num_cells, psi, thermo_psi, source, diag); // the kernel is given source before diag
+}
+
+void dfpEqn::sync()
+{ // Waits for all work queued on dataBase_.stream.
+    checkCudaErrors(cudaStreamSynchronize(dataBase_.stream)); // a failed synchronize is handled by checkCudaErrors
+}
+
+// Passes d_A, dataBase_.d_p and d_source to dataBase_.solve with the p_setting configuration, then counts the call.
+void dfpEqn::solve() {
+    dataBase_.solve(num_iteration, AMGXSetting::p_setting, d_A, dataBase_.d_p, d_source);
+    ++num_iteration; // dataBase_.solve receives the value from before this increment
+}
+
+// debug
+// Debug helper: copies d_rhorAUf / d_boundary_rhorAUf to the host and checks them against the caller's rhorAUf / boundary_rhorAUf arrays.
+void dfpEqn::comparerhorAUf(const double *rhorAUf, const double *boundary_rhorAUf, bool printFlag) {
+    std::vector<double> h_rhorAUf(dataBase_.num_surfaces); // owning buffers, released on return (the former new[] arrays were never freed)
+    std::vector<double> h_boundary_rhorAUf(dataBase_.num_boundary_surfaces);
+    // NOTE(review): under STREAM_ALLOCATOR, process() frees these device buffers before it returns — confirm when this may be called.
+    checkCudaErrors(cudaMemcpy(h_rhorAUf.data(), d_rhorAUf, dataBase_.surface_value_bytes, cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(h_boundary_rhorAUf.data(), d_boundary_rhorAUf, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost));
+    // 1e-10 and printFlag are forwarded to checkVectorEqual; internal faces are checked first, then boundary faces.
+    fprintf(stderr, "check h_rhorAUf\n");
+    checkVectorEqual(dataBase_.num_surfaces, rhorAUf, h_rhorAUf.data(), 1e-10, printFlag);
+    fprintf(stderr, "check h_boundary_rhorAUf\n");
+    checkVectorEqual(dataBase_.num_boundary_surfaces, boundary_rhorAUf, h_boundary_rhorAUf.data(), 1e-10, printFlag);
+}
+
+// Debug helper: copies d_phiHbyA / d_boundary_phiHbyA to the host and checks them against the caller's phiHbyA / boundary_phiHbyA arrays.
+void dfpEqn::comparephiHbyA(const double *phiHbyA, const double *boundary_phiHbyA, bool printFlag) {
+    std::vector<double> h_phiHbyA(dataBase_.num_surfaces); // owning buffers, released on return (the former new[] arrays were never freed)
+    std::vector<double> h_boundary_phiHbyA(dataBase_.num_boundary_surfaces);
+    // NOTE(review): under STREAM_ALLOCATOR, process() frees these device buffers before it returns — confirm when this may be called.
+    checkCudaErrors(cudaMemcpy(h_phiHbyA.data(), d_phiHbyA, dataBase_.surface_value_bytes, cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(h_boundary_phiHbyA.data(), d_boundary_phiHbyA, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost));
+    // 1e-10 and printFlag are forwarded to checkVectorEqual; internal faces are checked first, then boundary faces.
+    fprintf(stderr, "check h_phiHbyA\n");
+    checkVectorEqual(dataBase_.num_surfaces, phiHbyA, h_phiHbyA.data(), 1e-10, printFlag);
+    fprintf(stderr, "check h_boundary_phiHbyA\n");
+    checkVectorEqual(dataBase_.num_boundary_surfaces, boundary_phiHbyA, h_boundary_phiHbyA.data(), 1e-10, printFlag);
+}
+
+// Debug helper: copies dataBase_.d_phi / d_boundary_phi to the host and checks them against the caller's phi / boundary_phi arrays.
+void dfpEqn::comparephi(const double *phi, const double *boundary_phi, bool printFlag) {
+    std::vector<double> h_phi(dataBase_.num_surfaces); // owning buffers, released on return (the former new[] arrays were never freed)
+    std::vector<double> h_boundary_phi(dataBase_.num_boundary_surfaces);
+    // Device-to-host copies of the internal-face and boundary-face flux values.
+    checkCudaErrors(cudaMemcpy(h_phi.data(), dataBase_.d_phi, dataBase_.surface_value_bytes, cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(h_boundary_phi.data(), dataBase_.d_boundary_phi, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost));
+    // 1e-10 and printFlag are forwarded to checkVectorEqual; internal faces are checked first, then boundary faces.
+    fprintf(stderr, "check h_phi\n");
+    checkVectorEqual(dataBase_.num_surfaces, phi, h_phi.data(), 1e-10, printFlag);
+    fprintf(stderr, "check h_boundary_phi\n");
+    checkVectorEqual(dataBase_.num_boundary_surfaces, boundary_phi, h_boundary_phi.data(), 1e-10, printFlag);
+}
+
+// Debug helper: copies d_flux / d_boundary_flux to the host and checks them against the caller's flux / boundary_flux arrays.
+void dfpEqn::comparephiFlux(const double *flux, const double *boundary_flux, bool printFlag) {
+    std::vector<double> h_flux(dataBase_.num_surfaces); // owning buffers, released on return (the former new[] arrays were never freed)
+    std::vector<double> h_boundary_flux(dataBase_.num_boundary_surfaces);
+    // NOTE(review): under STREAM_ALLOCATOR, process() frees these device buffers before it returns — confirm when this may be called.
+    checkCudaErrors(cudaMemcpy(h_flux.data(), d_flux, dataBase_.surface_value_bytes, cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(h_boundary_flux.data(), d_boundary_flux, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost));
+    // 1e-10 and printFlag are forwarded to checkVectorEqual; internal faces are checked first, then boundary faces.
+    fprintf(stderr, "check h_flux\n");
+    checkVectorEqual(dataBase_.num_surfaces, flux, h_flux.data(), 1e-10, printFlag);
+    fprintf(stderr, "check h_boundary_flux\n");
+    checkVectorEqual(dataBase_.num_boundary_surfaces, boundary_flux, h_boundary_flux.data(), 1e-10, printFlag);
+}
+
+// Debug helper: copies dataBase_.d_p / d_boundary_p to the host and checks them against the caller's p / boundary_p arrays.
+void dfpEqn::comparep(const double *p, const double *boundary_p, bool printFlag) {
+    std::vector<double> h_p(dataBase_.num_cells); // owning buffers, released on return (the former new[] arrays were never freed)
+    std::vector<double> h_boundary_p(dataBase_.num_boundary_surfaces);
+    // Device-to-host copies of the cell and boundary-face pressure values.
+    checkCudaErrors(cudaMemcpy(h_p.data(), dataBase_.d_p, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(h_boundary_p.data(), dataBase_.d_boundary_p, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost));
+    // 1e-10 and printFlag are forwarded to checkVectorEqual; cells are checked first, then boundary faces.
+    fprintf(stderr, "check h_p\n");
+    checkVectorEqual(dataBase_.num_cells, p, h_p.data(), 1e-10, printFlag);
+    fprintf(stderr, "check h_boundary_p\n");
+    checkVectorEqual(dataBase_.num_boundary_surfaces, boundary_p, h_boundary_p.data(), 1e-10, printFlag);
+}
+
+// Debug helper: checks dataBase_.d_u / d_boundary_u against the caller's U / boundary_U, which are read as interleaved (x, y, z) triples.
+void dfpEqn::compareU(const double *U, const double *boundary_U, bool printFlag) {
+    std::vector<double> h_u(dataBase_.num_cells * 3); // owning buffers, released on return (the former new[] arrays were never freed)
+    std::vector<double> h_u_ref(dataBase_.num_cells * 3);
+    std::vector<double> h_boundary_u(dataBase_.num_boundary_surfaces * 3);
+    std::vector<double> h_boundary_u_ref(dataBase_.num_boundary_surfaces * 3);
+
+    // permute the references from interleaved triples to component-major order (all x, then all y, then all z)
+    for (int i = 0; i < dataBase_.num_cells; i++)
+    {
+        h_u_ref[dataBase_.num_cells * 0 + i] = U[i * 3 + 0];
+        h_u_ref[dataBase_.num_cells * 1 + i] = U[i * 3 + 1];
+        h_u_ref[dataBase_.num_cells * 2 + i] = U[i * 3 + 2];
+    }
+    for (int i = 0; i < dataBase_.num_boundary_surfaces; i++)
+    {
+        h_boundary_u_ref[dataBase_.num_boundary_surfaces * 0 + i] = boundary_U[i * 3 + 0];
+        h_boundary_u_ref[dataBase_.num_boundary_surfaces * 1 + i] = boundary_U[i * 3 + 1];
+        h_boundary_u_ref[dataBase_.num_boundary_surfaces * 2 + i] = boundary_U[i * 3 + 2];
+    }
+    checkCudaErrors(cudaMemcpy(h_u.data(), dataBase_.d_u, dataBase_.cell_value_vec_bytes, cudaMemcpyDeviceToHost));
+    checkCudaErrors(cudaMemcpy(h_boundary_u.data(), dataBase_.d_boundary_u, dataBase_.boundary_surface_value_vec_bytes, cudaMemcpyDeviceToHost));
+
+    // check result: the device copies are compared directly with the permuted references (1e-10 and printFlag go to checkVectorEqual)
+    fprintf(stderr, "check h_u\n");
+    checkVectorEqual(dataBase_.num_cells * 3, h_u_ref.data(), h_u.data(), 1e-10, printFlag);
+    fprintf(stderr, "check h_boundary_u\n");
+    checkVectorEqual(dataBase_.num_boundary_surfaces * 3, h_boundary_u_ref.data(), h_boundary_u.data(), 1e-10, printFlag);
+}
+
+// Debug helper: copies dataBase_.d_dpdt to the host and checks it against the caller's dpdt array (1e-10 and printFlag go to checkVectorEqual).
+void dfpEqn::comparedpdt(const double *dpdt, bool printFlag) {
+    std::vector<double> h_dpdt(dataBase_.num_cells); // owning buffer, released on return (the former new[] array was never freed)
+    checkCudaErrors(cudaMemcpy(h_dpdt.data(), dataBase_.d_dpdt, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost));
+    fprintf(stderr, "check h_dpdt\n");
+    checkVectorEqual(dataBase_.num_cells, dpdt, h_dpdt.data(), 1e-10, printFlag);
+}
+
+void dfpEqn::compareResult(const double *lower, const double *upper, const double *diag, const double *source, const double *internal_coeffs, const double *boundary_coeffs,  
+        bool printFlag)
+{
+    DEBUG_TRACE;
+    // Debug helper: downloads each assembled LDU array in turn and checks it against the matching host array (1e-14 and printFlag go to checkVectorEqual).
+    std::vector<double> gpu_lower(dataBase_.num_surfaces);
+    checkCudaErrors(cudaMemcpy(gpu_lower.data(), d_lower, dataBase_.surface_value_bytes, cudaMemcpyDeviceToHost));
+    fprintf(stderr, "check h_lower\n");
+    checkVectorEqual(dataBase_.num_surfaces, lower, gpu_lower.data(), 1e-14, printFlag);
+    DEBUG_TRACE;
+
+    // upper coefficients, one value per internal face
+    std::vector<double> gpu_upper(dataBase_.num_surfaces);
+    checkCudaErrors(cudaMemcpy(gpu_upper.data(), d_upper, dataBase_.surface_value_bytes, cudaMemcpyDeviceToHost));
+    fprintf(stderr, "check h_upper\n");
+    checkVectorEqual(dataBase_.num_surfaces, upper, gpu_upper.data(), 1e-14, printFlag);
+    DEBUG_TRACE;
+
+    // diagonal coefficients, one value per cell
+    std::vector<double> gpu_diag(dataBase_.num_cells);
+    checkCudaErrors(cudaMemcpy(gpu_diag.data(), d_diag, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost));
+    fprintf(stderr, "check h_diag\n");
+    checkVectorEqual(dataBase_.num_cells, diag, gpu_diag.data(), 1e-14, printFlag);
+    DEBUG_TRACE;
+
+    // source term, one value per cell
+    std::vector<double> gpu_source(dataBase_.num_cells);
+    checkCudaErrors(cudaMemcpy(gpu_source.data(), d_source, dataBase_.cell_value_bytes, cudaMemcpyDeviceToHost));
+    fprintf(stderr, "check h_source\n");
+    checkVectorEqual(dataBase_.num_cells, source, gpu_source.data(), 1e-14, printFlag);
+    DEBUG_TRACE;
+
+    // internal coefficients, one value per boundary face
+    std::vector<double> gpu_internal_coeffs(dataBase_.num_boundary_surfaces);
+    checkCudaErrors(cudaMemcpy(gpu_internal_coeffs.data(), d_internal_coeffs, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost));
+    fprintf(stderr, "check h_internal_coeffs\n");
+    checkVectorEqual(dataBase_.num_boundary_surfaces, internal_coeffs, gpu_internal_coeffs.data(), 1e-14, printFlag);
+    DEBUG_TRACE;
+
+    // boundary coefficients, one value per boundary face
+    std::vector<double> gpu_boundary_coeffs(dataBase_.num_boundary_surfaces);
+    checkCudaErrors(cudaMemcpy(gpu_boundary_coeffs.data(), d_boundary_coeffs, dataBase_.boundary_surface_value_bytes, cudaMemcpyDeviceToHost));
+    fprintf(stderr, "check h_boundary_coeffs\n");
+    checkVectorEqual(dataBase_.num_boundary_surfaces, boundary_coeffs, gpu_boundary_coeffs.data(), 1e-14, printFlag);
+    DEBUG_TRACE;
+}