merge branch GPU-dev #403

Merged 143 commits on Dec 24, 2023
Commits
3f5e136
dfMatrixDataBase
maorz1998 Aug 1, 2023
a59a190
add unittest of fvm::div(phi, U)
STwangyingrui Aug 2, 2023
1b25756
simplify unittest
STwangyingrui Aug 3, 2023
cbd7b49
add initial version of GPU new UEqn
STwangyingrui Aug 3, 2023
be72eb0
small fix of fvm_div_boundary
STwangyingrui Aug 4, 2023
7be1190
modify fvm_div_scalar to fvm_div_vector
STwangyingrui Aug 4, 2023
196f22a
implement fvm::ddt(rho, U) and add unittest for it
STwangyingrui Aug 4, 2023
4606238
implement fvm::laplacian(gamma, U) and add unittest for it; fix sever…
STwangyingrui Aug 7, 2023
cc7223d
implement fvc::ddt(rho, K) and add unittest for it; fix several old b…
STwangyingrui Aug 7, 2023
15e30b4
workaround to fix a bug of floating-point numerical error for fvc_ddt
STwangyingrui Aug 7, 2023
a8c68c6
fix occasional errors of fvm fvm::ddt and fvc::ddt: caused by re-usin…
STwangyingrui Aug 7, 2023
2453037
workaround way two (use volatile) to avoid floating-point numerical e…
STwangyingrui Aug 7, 2023
ae31072
workaround way three (use nvcc option -fmad=false) to avoid floating-…
STwangyingrui Aug 7, 2023
b16100f
use template to simplify unittest
STwangyingrui Aug 7, 2023
8b3805f
modify getTypeInfo to support tensor type
STwangyingrui Aug 7, 2023
776aa69
first commit for debugging
maorz1998 Aug 9, 2023
4ee65cc
fvc op & CPU op
maorz1998 Aug 11, 2023
7e29a10
add the comparison with the original method
maorz1998 Aug 12, 2023
db5a689
run pass basic ueqn_gpu
STwangyingrui Aug 15, 2023
04d5512
fvc/fvm ops support sign
STwangyingrui Aug 15, 2023
cbb74cd
use cuda graph in ueqn
STwangyingrui Aug 15, 2023
50eee68
fix bugs in turbulence term
maorz1998 Aug 16, 2023
f691478
primiry opt & add time monitor
maorz1998 Aug 17, 2023
7a7accf
add solve part, fix some bugs
maorz1998 Aug 21, 2023
d684488
modify app
maorz1998 Aug 21, 2023
3f5c40a
Merge pull request #325 from STwangyingrui/mrz/0817
maorz1998 Aug 21, 2023
03744a4
fix bugs in UEqn
maorz1998 Aug 22, 2023
5c7daa5
opt ldu2csr
maorz1998 Aug 22, 2023
920fc84
fvMatrix.A(), fvMatrix.H(), opt ldu2csr
maorz1998 Aug 27, 2023
9a873b6
Merge pull request #327 from maorz1998/GPU-dev
maorz1998 Aug 27, 2023
a2a93fb
init YEqn on GPU(so solve yet)
STwangyingrui Aug 28, 2023
3c6eb86
merge GPU-Dev
STwangyingrui Aug 28, 2023
5710d72
YEqn use ldu instead of lower/upper/diag, and fix bug of fvm_laplacia…
STwangyingrui Aug 28, 2023
529b847
UEqn use DEBUG_ mode
STwangyingrui Aug 28, 2023
2aec825
fix comment of fvm::laplacian
STwangyingrui Aug 28, 2023
71a0cb2
yeqn: reopen graph and recover DEff
STwangyingrui Aug 28, 2023
14aa610
add dfRhoEqn, opt UEqn.A()
maorz1998 Aug 29, 2023
e35084c
Merge pull request #330 from maorz1998/GPU-dev
maorz1998 Aug 29, 2023
ab2a123
merge GPU-dev and resolve conflicts; refactor common fvc ops; adjust …
STwangyingrui Aug 29, 2023
2d33fe3
move thermo fields to database
STwangyingrui Aug 29, 2023
23330b4
move some fields (used by eeqn) from yeqn to database; and fix severa…
STwangyingrui Aug 29, 2023
1f2b398
Merge pull request #328 from STwangyingrui/yr/230815
maorz1998 Aug 30, 2023
9a7182a
fix bugs in dfUEqn, initial version of dfEEqn
maorz1998 Aug 31, 2023
80a42bc
Merge pull request #335 from maorz1998/GPU-dev
maorz1998 Aug 31, 2023
73a37fb
modify data structure of face vector, modify time monitor and macro d…
maorz1998 Sep 1, 2023
d89a865
Merge branch 'GPU-dev' of github.com:maorz1998/deepflame-dev into GPU…
maorz1998 Sep 1, 2023
aa096ea
Merge pull request #338 from maorz1998/GPU-dev
maorz1998 Sep 1, 2023
5543815
add support to open eqn one by one
STwangyingrui Sep 1, 2023
a04c639
1. Implemented fixedValue boundary condition, 2. Resolved some bugs, …
maorz1998 Sep 2, 2023
cdb5358
Merge pull request #339 from maorz1998/GPU-dev
maorz1998 Sep 2, 2023
d5a7c4c
Merge branch 'GPU-dev' into yr/230901_v2
STwangyingrui Sep 2, 2023
0562727
run pass stream ordered allocator
STwangyingrui Sep 2, 2023
3d577f6
run pass stream ordered allocator within graph
STwangyingrui Sep 2, 2023
57f6cd3
use macro to open stream ordered allocator; run pass tests of on-off …
STwangyingrui Sep 4, 2023
869df41
Merge pull request #340 from STwangyingrui/yr/230901_v2
maorz1998 Sep 4, 2023
51bb21f
calculate HbyA, rAU in dfUEqn
maorz1998 Sep 5, 2023
fed389a
Merge branch 'GPU-dev' into GPU-dev
maorz1998 Sep 5, 2023
b7c636b
Merge pull request #341 from maorz1998/GPU-dev
maorz1998 Sep 5, 2023
d429406
fix bugs in git merge
maorz1998 Sep 5, 2023
6ae0780
init nccl and set to dfMatrixDataBase
STwangyingrui Sep 14, 2023
57f79f7
first commit of dfpEqn
maorz1998 Sep 14, 2023
266cbb5
Merge pull request #343 from STwangyingrui/yr/init_nccl
maorz1998 Sep 15, 2023
aee2425
Merge pull request #344 from maorz1998/GPU-dev
maorz1998 Sep 15, 2023
55db157
add neighbour ranks
maorz1998 Sep 16, 2023
9b0bd4e
fixed some bugs, debugging the parallel construction of rhoEqn
maorz1998 Sep 16, 2023
5f91bf3
a parallel example
maorz1998 Sep 16, 2023
e154233
Merge pull request #345 from maorz1998/GPU-dev
maorz1998 Sep 16, 2023
d0af72e
Fixed some bugs and successfully ran the Sydney flame case
maorz1998 Sep 17, 2023
fe15ecd
Merge pull request #346 from maorz1998/GPU-dev
maorz1998 Sep 17, 2023
a55f4c0
fix cudaMemsetAsync: invalid cudaStream created before cudaSetDevice
STwangyingrui Sep 18, 2023
d03c244
fix cudaErrorCudartUnloading when cudaStreamDestroy: clean cuda resou…
STwangyingrui Sep 18, 2023
eb9fbe0
Merge pull request #347 from STwangyingrui/yr/GPU_par_230918
maorz1998 Sep 20, 2023
b4f5790
correct scalar processor boundary
maorz1998 Sep 22, 2023
80b922a
Merge pull request #351 from maorz1998/GPU-dev
maorz1998 Sep 22, 2023
643f5ac
construct rho & U matrix in parallel
maorz1998 Sep 26, 2023
af6b647
Merge pull request #355 from maorz1998/GPU-dev
maorz1998 Sep 26, 2023
a583f6c
construct matrix p in parallel
maorz1998 Oct 2, 2023
bbc0527
Merge pull request #357 from maorz1998/GPU-dev
maorz1998 Oct 2, 2023
6c8b875
construct matrix y in parallel
STwangyingrui Oct 8, 2023
cdba4df
Merge pull request #358 from STwangyingrui/yr/parallel_y
maorz1998 Oct 9, 2023
45ce77f
parallel linear solver
maorz1998 Oct 9, 2023
25ae2e3
Merge branch 'GPU-dev' of github.com:maorz1998/deepflame-dev into GPU…
maorz1998 Oct 9, 2023
860ba31
Merge branch 'GPU-dev' into GPU-dev
maorz1998 Oct 9, 2023
d21542e
Merge pull request #359 from maorz1998/GPU-dev
maorz1998 Oct 9, 2023
226fd5e
initial commit of GPUThermo, EEqn boundary
maorz1998 Oct 10, 2023
c5f5cfe
Merge pull request #360 from maorz1998/GPU-dev
maorz1998 Oct 10, 2023
c9ed65b
pass ldu checking of parallel e; printinfo supports rank;
STwangyingrui Oct 11, 2023
c1c2f66
Adaptation of GPUThermo to deepflame, completion of UEqn and YEqn, wo…
maorz1998 Oct 12, 2023
6036caf
Merge pull request #361 from STwangyingrui/yr/parallel_e
maorz1998 Oct 12, 2023
084f33d
Merge branch 'GPU-dev' into GPU-dev
maorz1998 Oct 12, 2023
3c1746c
Merge pull request #362 from maorz1998/GPU-dev
maorz1998 Oct 12, 2023
b85246b
thermal file for GPUThermo class
maorz1998 Oct 13, 2023
b155e04
Merge pull request #365 from maorz1998/GPU-dev
maorz1998 Oct 13, 2023
17bf0b5
fix bugs, Enable GPU computation across time steps for the entire pro…
maorz1998 Oct 15, 2023
46f8d02
Merge pull request #366 from maorz1998/GPU-dev
maorz1998 Oct 15, 2023
14d3e84
fix a bug in dpdt
maorz1998 Oct 15, 2023
6683bab
Merge pull request #367 from maorz1998/GPU-dev
maorz1998 Oct 15, 2023
8a39382
fix the crash of graph destroy
STwangyingrui Oct 16, 2023
c07a79a
preliminary code organization
maorz1998 Oct 16, 2023
95ef107
Merge pull request #369 from STwangyingrui/yr/fix_graph_destroy
maorz1998 Oct 16, 2023
3e4e8ff
Merge branch 'GPU-dev' into GPU-dev
maorz1998 Oct 16, 2023
659f21f
Merge pull request #371 from maorz1998/GPU-dev
maorz1998 Oct 16, 2023
dada674
single card test pass
maorz1998 Oct 17, 2023
472b704
Merge pull request #372 from maorz1998/GPU-dev
maorz1998 Oct 17, 2023
1e6ed1b
run parallel TGV case
maorz1998 Oct 18, 2023
77e8747
run pass BC fixedEnergy
maorz1998 Oct 18, 2023
d33f091
Merge pull request #374 from maorz1998/GPU-dev
maorz1998 Oct 18, 2023
a175fc8
fix a bug in solving YEqn
maorz1998 Oct 19, 2023
29c6059
refactor for timing
STwangyingrui Oct 19, 2023
7a89595
run pass cyclic on one card
maorz1998 Oct 21, 2023
5b80713
refine time statistic
STwangyingrui Oct 23, 2023
8cd3cc4
run pass cyclic BC in parallel
maorz1998 Oct 23, 2023
5d7e0ef
Merge pull request #377 from maorz1998/GPU-dev
maorz1998 Oct 24, 2023
74dcf0c
refactor macros; fix invalid config for empty patches;
STwangyingrui Oct 24, 2023
4496dd5
use stream allocator in pEqn and thermo
STwangyingrui Oct 24, 2023
7ef4ce7
reuse amgx solvers
STwangyingrui Oct 24, 2023
02c5070
merge GPU-dev and fix conflicts; known issue: illegal handle of stream
STwangyingrui Oct 25, 2023
01ac9fe
fix sereral bugs and align multi-card multi-step result
STwangyingrui Oct 25, 2023
80d8f0e
Merge pull request #378 from STwangyingrui/yr/to_merge_1024
maorz1998 Oct 25, 2023
96f462f
smagrinsky model
maorz1998 Oct 25, 2023
0fd37b8
fix conflicts
maorz1998 Oct 25, 2023
0c20745
add dfChemistrySolver, use libtorch do inference
maorz1998 Oct 25, 2023
15c2576
Merge pull request #380 from maorz1998/GPU-dev
maorz1998 Oct 26, 2023
084bccb
pass libtorch version
maorz1998 Oct 27, 2023
00d3974
add batch size
maorz1998 Oct 28, 2023
e923fd5
Merge pull request #381 from maorz1998/GPU-dev
maorz1998 Oct 28, 2023
f78f459
run pass turbulence k & epsilon
maorz1998 Oct 30, 2023
4355f4d
reset amgx solver to avoid memleak
STwangyingrui Oct 31, 2023
ed15f81
checking memory usage info after each timestep
STwangyingrui Oct 31, 2023
03c7338
support dynimic NN input size
maorz1998 Nov 7, 2023
24d15bb
mixture averaged
maorz1998 Nov 7, 2023
9375fb1
fix bugs in pEqn & UEqn
maorz1998 Nov 7, 2023
7a2c924
limitedLinear debuging
maorz1998 Nov 7, 2023
d916ce7
Merge pull request #382 from STwangyingrui/yr/see_amgx
maorz1998 Nov 7, 2023
94c9f4b
Merge pull request #385 from maorz1998/GPU-dev
maorz1998 Nov 7, 2023
d9cab14
optimize calculate_viscosity_kernel
STwangyingrui Nov 14, 2023
342ddb0
Merge pull request #387 from STwangyingrui/yr/opt_thermo
maorz1998 Nov 16, 2023
c831fb2
fix bugs in 1223
maorz1998 Dec 23, 2023
f43d51f
Merge pull request #401 from maorz1998/GPU-dev
maorz1998 Dec 23, 2023
fb8c229
remove files
maorz1998 Dec 23, 2023
362d74d
Merge pull request #402 from maorz1998/GPU-dev
maorz1998 Dec 23, 2023
4ec0c1e
Merge remote-tracking branch 'origin/GPU-dev'
maorz1998 Dec 23, 2023
f320780
fix bugs in createfields
maorz1998 Dec 23, 2023
137 changes: 119 additions & 18 deletions applications/solvers/dfLowMachFoam/EEqn.H
@@ -1,36 +1,137 @@
{
volScalarField& he = thermo.he();

#if defined GPUSolverNew_
double *h_he = dfDataBase.getFieldPointer("he", location::cpu, position::internal);
double *h_boundary_he = dfDataBase.getFieldPointer("he", location::cpu, position::boundary);
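// Assemble and solve the energy equation on the GPU; copying the solved field back to
// the CPU-side he (postProcess / memcpy below) is currently disabled.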

EEqn_GPU.process();
EEqn_GPU.sync();
// EEqn_GPU.postProcess(h_he, h_boundary_he);

// copy h_he to he(cpu)
// memcpy(&he[0], h_he, dfDataBase.cell_value_bytes);

//DEBUG_TRACE;
//he.correctBoundaryConditions();
//DEBUG_TRACE;

#if defined DEBUG_
fvScalarMatrix EEqn
(

fvm::ddt(rho, he) + mvConvection->fvmDiv(phi, he)
+ fvc::ddt(rho, K) + fvc::div(phi, K)
- dpdt
==
(
turbName == "laminar"
?
(
fvm::laplacian(turbulence->alpha(), he)
- diffAlphaD
+ fvc::div(hDiffCorrFlux)
)
:
(
fvm::laplacian(turbulence->alphaEff(), he)
)
)
);
// EEqn.relax();
EEqn.solve("ha");
// checkResult
// TODO: for temp, now we compare ldu, finally we compare csr
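// The coupling coefficients are flattened patch by patch: processor and processorCyclic
// patches occupy 2*patchSize slots (coefficients followed by zero padding), all other
// patch types occupy patchSize slots.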
std::vector<double> h_internal_coeffs(dfDataBase.num_boundary_surfaces);
std::vector<double> h_boundary_coeffs(dfDataBase.num_boundary_surfaces);

offset = 0;
forAll(he.boundaryField(), patchi)
{
const fvPatchScalarField& patchHe = he.boundaryField()[patchi];
int patchSize = patchHe.size();
const double* internal_coeff_ptr = &EEqn.internalCoeffs()[patchi][0];
const double* boundary_coeff_ptr = &EEqn.boundaryCoeffs()[patchi][0];
if (patchHe.type() == "processor"
|| patchHe.type() == "processorCyclic") {
memcpy(h_internal_coeffs.data() + offset, internal_coeff_ptr, patchSize * sizeof(double));
memset(h_internal_coeffs.data() + offset + patchSize, 0, patchSize * sizeof(double));
memcpy(h_boundary_coeffs.data() + offset, boundary_coeff_ptr, patchSize * sizeof(double));
memset(h_boundary_coeffs.data() + offset + patchSize, 0, patchSize * sizeof(double));
offset += patchSize * 2;
} else {
memcpy(h_internal_coeffs.data() + offset, internal_coeff_ptr, patchSize * sizeof(double));
memcpy(h_boundary_coeffs.data() + offset, boundary_coeff_ptr, patchSize * sizeof(double));
offset += patchSize;
}
}

double *h_boundary_he_tmp = new double[dfDataBase.num_boundary_surfaces];
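// Gather the boundary values of he: processor patches store both the patch values and
// the patch-internal field (2*patchSize entries), other patches only the patch values.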
offset = 0;
forAll(he.boundaryField(), patchi)
{
const fvPatchScalarField& patchHe = he.boundaryField()[patchi];
int patchSize = patchHe.size();
if (patchHe.type() == "processor"
|| patchHe.type() == "processorCyclic") {
const scalarField& patchHeInternal = dynamic_cast<const processorFvPatchField<scalar>&>(patchHe).patchInternalField()();
memcpy(h_boundary_he_tmp + offset, &patchHe[0], patchSize * sizeof(double));
memcpy(h_boundary_he_tmp + offset + patchSize, &patchHeInternal[0], patchSize * sizeof(double));
offset += patchSize * 2;
} else {
memcpy(h_boundary_he_tmp + offset, &patchHe[0], patchSize * sizeof(double));
offset += patchSize;
}
}

bool printFlag = false;
int rank = -1;
if (mpi_init_flag) {
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
}
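// Only rank 0 (or a serial run) would compare the CPU and GPU results; the compare
// calls are currently commented out.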
if (!mpi_init_flag || rank == 0) {
// DEBUG_TRACE;
// EEqn_GPU.compareResult(&EEqn.lower()[0], &EEqn.upper()[0], &EEqn.diag()[0], &EEqn.source()[0],
// h_internal_coeffs.data(), h_boundary_coeffs.data(), printFlag);
// DEBUG_TRACE;
// EEqn_GPU.compareHe(&he[0], h_boundary_he_tmp, printFlag);
}

delete[] h_boundary_he_tmp;

#endif

#else
start1 = std::clock();
fvScalarMatrix EEqn
(
fvm::ddt(rho, he) + mvConvection->fvmDiv(phi, he)
+ fvc::ddt(rho, K) + fvc::div(phi, K)
- dpdt
==
(
turbName == "laminar"
?
(
fvm::laplacian(turbulence->alpha(), he)
- diffAlphaD
+ fvc::div(hDiffCorrFlux)
)
:
(
fvm::laplacian(turbulence->alphaEff(), he)
)
)
);
end1 = std::clock();
time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
time_monitor_EEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);

// EEqn.relax();
start1 = std::clock();
EEqn.solve("ha");
end1 = std::clock();
time_monitor_EEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
time_monitor_EEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
#endif
}
8 changes: 4 additions & 4 deletions applications/solvers/dfLowMachFoam/Make/options
@@ -9,7 +9,6 @@ EXE_INC = -std=c++14 \
$(PFLAGS) $(PINC) \
$(if $(LIBTORCH_ROOT),-DUSE_LIBTORCH,) \
$(if $(PYTHON_INC_DIR),-DUSE_PYTORCH,) \
-I$(LIB_SRC)/transportModels/compressible/lnInclude \
-I$(LIB_SRC)/thermophysicalModels/basic/lnInclude \
-I$(LIB_SRC)/TurbulenceModels/turbulenceModels/lnInclude \
@@ -28,7 +27,7 @@ EXE_INC = -std=c++14 \
$(if $(LIBTORCH_ROOT),-I$(LIBTORCH_ROOT)/include/torch/csrc/api/include,) \
$(PYTHON_INC_DIR) \
$(if $(AMGX_DIR), -I$(DF_ROOT)/src_gpu,) \
$(if $(AMGX_DIR), -I/usr/local/cuda/include,) \
$(if $(AMGX_DIR), -I$(AMGX_DIR)/include,)

EXE_LIBS = \
@@ -50,6 +49,7 @@ EXE_LIBS = \
$(if $(LIBTORCH_ROOT),-lpthread,) \
$(if $(LIBTORCH_ROOT),$(DF_SRC)/dfChemistryModel/DNNInferencer/build/libDNNInferencer.so,) \
$(if $(PYTHON_LIB_DIR),$(PYTHON_LIB_DIR),) \
$(if $(AMGX_DIR), /usr/local/cuda/lib64/libcudart.so,) \
$(if $(AMGX_DIR), /usr/local/cuda/lib64/libnccl.so,) \
$(if $(AMGX_DIR), $(DF_ROOT)/src_gpu/build/libdfMatrix.so,) \
$(if $(AMGX_DIR), $(AMGX_DIR)/build/libamgxsh.so,)
166 changes: 145 additions & 21 deletions applications/solvers/dfLowMachFoam/UEqn.H
@@ -1,24 +1,148 @@
// Solve the Momentum equation
#ifdef GPUSolverNew_

#if defined DEBUG_
// run CPU, for temp
TICK_START;
tmp<fvVectorMatrix> tUEqn
(
fvm::ddt(rho, U)
+
fvm::div(phi, U)
+
turbulence->divDevRhoReff(U)
);
fvVectorMatrix& UEqn = tUEqn.ref();
TICK_STOP(CPU assembly time);

volTensorField gradU = fvc::grad(U);

double *h_boundary_gradU = new double[dfDataBase.num_boundary_surfaces * 9];
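// Gather the boundary values of gradU (9 tensor components per face): processor patches
// also store the patch-internal field, so they occupy 2*patchsize slots.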
offset = 0;
forAll(U.boundaryField(), patchi)
{
const fvPatchTensorField& patchGradU = gradU.boundaryField()[patchi];
int patchsize = patchGradU.size();
if (patchGradU.type() == "processor"
|| patchGradU.type() == "processorCyclic") {
// print info
if (dynamic_cast<const processorFvPatchField<tensor>&>(patchGradU).doTransform()) {
Info << "gradU transform = true" << endl;
} else {
Info << "gradU transform = false" << endl;
}
Info << "rank = " << dynamic_cast<const processorFvPatchField<tensor>&>(patchGradU).rank() << endl;

memcpy(h_boundary_gradU + 9*offset, &patchGradU[0][0], patchsize * 9 * sizeof(double));
tensorField patchGradUInternal =
dynamic_cast<const processorFvPatchField<tensor>&>(patchGradU).patchInternalField()();
memcpy(h_boundary_gradU + 9*offset + patchsize * 9, &patchGradUInternal[0][0], patchsize * 9 * sizeof(double));
offset += patchsize * 2;
} else {
memcpy(h_boundary_gradU + 9*offset, &patchGradU[0][0], patchsize * 9 * sizeof(double));
offset += patchsize;
}
}
#endif

// process
TICK_START;
UEqn_GPU.process();
UEqn_GPU.sync();
TICK_STOP(GPU process time);

// postProcess
// TICK_START;
// UEqn_GPU.postProcess(h_u);
// memcpy(&U[0][0], h_u, dfDataBase.cell_value_vec_bytes);
// U.correctBoundaryConditions();
// K = 0.5*magSqr(U);
// DEBUG_TRACE;
// TICK_STOP(post process time);

#if defined DEBUG_
// UEqn.relax();
TICK_START;
solve(UEqn == -fvc::grad(p));
K.oldTime();
K = 0.5*magSqr(U);
TICK_STOP(CPU solve time);
// checkResult
// TODO: for temp, now we compare ldu, finally we compare csr
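// The vector coupling coefficients (3 components per face) are flattened patch by patch;
// processor patches advance the offset by 2*patchsize, leaving the second half of their
// slots untouched here.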
std::vector<double> h_internal_coeffs(dfDataBase.num_boundary_surfaces * 3);
std::vector<double> h_boundary_coeffs(dfDataBase.num_boundary_surfaces * 3);

offset = 0;
for (int patchi = 0; patchi < dfDataBase.num_patches; patchi++)
{
const fvPatchVectorField& patchU = U.boundaryField()[patchi];
int patchsize = dfDataBase.patch_size[patchi];
const double* internal_coeff_ptr = &UEqn.internalCoeffs()[patchi][0][0];
const double* boundary_coeff_ptr = &UEqn.boundaryCoeffs()[patchi][0][0];
memcpy(h_internal_coeffs.data() + offset * 3, internal_coeff_ptr, patchsize * 3 * sizeof(double));
memcpy(h_boundary_coeffs.data() + offset * 3, boundary_coeff_ptr, patchsize * 3 * sizeof(double));
if (patchU.type() == "processor" || patchU.type() == "processorCyclic") offset += 2 * patchsize;
else offset += patchsize;
}

double *h_boundary_u_tmp = new double[dfDataBase.num_boundary_surfaces * 3];
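// Gather the boundary values of U (3 components per face): processor patches also store
// the patch-internal field, doubling their slot count.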
offset = 0;
forAll(U.boundaryField(), patchi)
{
const fvPatchVectorField& patchU = U.boundaryField()[patchi];
int patchsize = dfDataBase.patch_size[patchi];

if (patchU.type() == "processor"
|| patchU.type() == "processorCyclic") {
memcpy(h_boundary_u_tmp + 3*offset, &patchU[0][0], 3*patchsize * sizeof(double));
vectorField patchUInternal =
dynamic_cast<const processorFvPatchField<vector>&>(patchU).patchInternalField()();
memcpy(h_boundary_u_tmp + 3*offset + 3*patchsize, &patchUInternal[0][0], 3*patchsize * sizeof(double));
offset += 2 * patchsize;
} else {
memcpy(h_boundary_u_tmp + 3*offset, &patchU[0][0], 3*patchsize * sizeof(double));
offset += patchsize;
}
}

bool printFlag = false;

int rank = -1;
if (mpi_init_flag) {
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
}
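// Only rank 0 (or a serial run) would run the CPU/GPU comparison; the compare calls
// are currently commented out.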

if (!mpi_init_flag || rank == 0) {
// UEqn_GPU.compareResult(&UEqn.lower()[0], &UEqn.upper()[0], &UEqn.diag()[0], &UEqn.source()[0][0],
// h_internal_coeffs.data(), h_boundary_coeffs.data(),
// // &gradU[0][0], h_boundary_gradU,
// printFlag);
// UEqn_GPU.compareU(&U[0][0], h_boundary_u_tmp, printFlag);
}
DEBUG_TRACE;
#endif

#else
start1 = std::clock();
tmp<fvVectorMatrix> tUEqn
(
fvm::ddt(rho, U) + fvm::div(phi, U)
+ turbulence->divDevRhoReff(U)
);
fvVectorMatrix& UEqn = tUEqn.ref();
end1 = std::clock();
time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
time_monitor_UEqn_mtxAssembly += double(end1 - start1) / double(CLOCKS_PER_SEC);

// UEqn.relax();
start1 = std::clock();
if (pimple.momentumPredictor())
{
solve(UEqn == -fvc::grad(p));
K.oldTime();
K = 0.5*magSqr(U);
}
end1 = std::clock();
time_monitor_UEqn += double(end1 - start1) / double(CLOCKS_PER_SEC);
time_monitor_UEqn_solve += double(end1 - start1) / double(CLOCKS_PER_SEC);
#endif