started separating the gpu backend operations
hamsteri15 committed May 28, 2024
1 parent 95d0f05 commit f426525
Showing 12 changed files with 128 additions and 139 deletions.
3 changes: 2 additions & 1 deletion gpu_chemistry/src/gpuKernelEvaluator/Make/options
@@ -7,4 +7,5 @@ EXE_INC = \
#LIB_LIBS = -lcudart_static -lcudart
LIB_LIBS += -L$(CUDA_LIBS) -lcudart

include ../../hipcc
#include ../../hipcc
include ../../nvcc
@@ -4,7 +4,7 @@

#include "cuda_host_dev.H"

#include "error_handling.H"
#include "for_each_index.H"
#include "host_device_vectors.H"
#include <thrust/execution_policy.h>
#include <thrust/extrema.h> //min_element
@@ -30,36 +30,10 @@ GpuKernelEvaluator::GpuKernelEvaluator(
, solver_(make_gpuODESolver(system_, odeInputs))
, inputs_(odeInputs)
, memory_(nCells, nSpecie) {
/*
int num;
CHECK_CUDA_ERROR(cudaGetDeviceCount(&num)); // number of CUDA
devices
int dev = (nCells % num);
//cudaDeviceProp::canMapHostMemory prop;
//CHECK_CUDA_ERROR(cudaChooseDevice(&dev, &prop));
CHECK_CUDA_ERROR(cudaSetDevice(dev));
std::cout << "Using device: " << dev << std::endl;
*/

/*
for (int i = 0; i < num; i++) {
// Query the device properties.
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, i);
std::cout << "Device id: " << i << std::endl;
std::cout << "Device name: " << prop.name << std::endl;
}
*/

}

__global__ void cuda_kernel(gLabel nCells, singleCellSolver op) {

int celli = blockIdx.x * blockDim.x + threadIdx.x;
if (celli < nCells) { op(celli); }
}
/*
static inline auto parseTimes(const char* label,
const std::vector<gpuBuffer>& b) {
@@ -115,33 +89,18 @@ GpuKernelEvaluator::computeYNew(
singleCellSolver op(
deltaT, nSpecie_, ddeltaTChem, dYvf, buffer_span, solver_);

for_each_index(op, nCells);


/*
gLabel NTHREADS = 32;
gLabel NBLOCKS = (nCells + NTHREADS - 1) / NTHREADS;
cuda_kernel<<<NBLOCKS, NTHREADS>>>(nCells, op);
CHECK_LAST_CUDA_ERROR();
CHECK_CUDA_ERROR(cudaDeviceSynchronize());

////
/*
auto bhost = toStdVector(buffers);
parseTimes("adaptive", bhost);
parseTimes("Jacobian", bhost);
parseTimes("step1", bhost);
parseTimes("step2", bhost);
parseTimes("step3", bhost);
*/

////

/*
thrust::for_each(thrust::device,
thrust::make_counting_iterator(0),
thrust::make_counting_iterator(nCells),
op);
gpuErrorCheck(cudaDeviceSynchronize());
*/

return std::make_pair(toStdVector(dYvf_arr),
toStdVector(ddeltaTChem_arr));
}
@@ -1,8 +1,6 @@
#include "gpuMemoryResource.H"
#include <thrust/device_malloc_allocator.h>

using labelAllocator = thrust::device_malloc_allocator<gLabel>;
using scalarAllocator = thrust::device_malloc_allocator<gScalar>;
#include "device_allocate.H"
#include "device_free.H"

namespace FoamGpu {

@@ -15,39 +13,27 @@ gpuMemoryResource::~gpuMemoryResource() { this->deallocate(); }

void gpuMemoryResource::allocate() {

labelAllocator lAllocator;
scalarAllocator sAllocator;

for (gLabel i = 0; i < N_LABEL_ARRAYS; ++i) {
labelData_[i] =
make_raw_pointer(lAllocator.allocate(labelArrayLength()));
labelData_[i] = device_allocate<gLabel>(labelArrayLength());
}
for (gLabel i = 0; i < N_SCALAR_ARRAYS; ++i) {
scalarData_[i] =
make_raw_pointer(sAllocator.allocate(scalarArrayLength()));
scalarData_[i] = device_allocate<gScalar>(scalarArrayLength());
}
for (gLabel i = 0; i < N_TWOD_SCALAR_ARRAYS; ++i) {
twodScalarData_[i] =
make_raw_pointer(sAllocator.allocate(twodScalarArrayLength()));
twodScalarData_[i] = device_allocate<gScalar>(twodScalarArrayLength());
}
}

void gpuMemoryResource::deallocate() {

labelAllocator lAllocator;
scalarAllocator sAllocator;

for (gLabel i = 0; i < N_LABEL_ARRAYS; ++i) {
auto ptr = make_device_pointer(labelData_[i]);
lAllocator.deallocate(ptr, labelArrayLength());
device_free(labelData_[i]);
}
for (gLabel i = 0; i < N_SCALAR_ARRAYS; ++i) {
auto ptr = make_device_pointer(scalarData_[i]);
sAllocator.deallocate(ptr, scalarArrayLength());
device_free(scalarData_[i]);
}
for (gLabel i = 0; i < N_TWOD_SCALAR_ARRAYS; ++i) {
auto ptr = make_device_pointer(twodScalarData_[i]);
sAllocator.deallocate(ptr, twodScalarArrayLength());
device_free(twodScalarData_[i]);
}
}

@@ -32,7 +32,6 @@ private:
gpuReaction* reactions_;


void allocate();
void deallocate();


@@ -1,20 +1,27 @@
#include "thermosAndReactions.H"

#include "error_handling.H"
#include "thermosAndReactions.H"
#include "device_allocate.H"
#include "device_free.H"
#include "host_device_transfers.H"

namespace FoamGpu {

template <class T>
static inline T* allocateAndTransfer(const std::vector<T>& t) {
T* ptr;
const auto size = t.size();
const auto bytesize = size * sizeof(T);

T* ptr = device_allocate<T>(t.size());
const auto bytesize = t.size() * sizeof(T);

CHECK_CUDA_ERROR(cudaMalloc((void**)&ptr, bytesize));
CHECK_CUDA_ERROR(
gpuErrorCheck(
cudaMemcpy(ptr, t.data(), bytesize, cudaMemcpyHostToDevice));

return ptr;
/*
T* ptr = device_allocate<T>(t.size());
host_to_device(t.begin(), t.end(), ptr);
return ptr;
*/
}

thermosAndReactions::thermosAndReactions
@@ -33,23 +40,11 @@ thermosAndReactions::~thermosAndReactions()
this->deallocate();
}

void thermosAndReactions::allocate()
{
CHECK_CUDA_ERROR
(
cudaMalloc((void**)&thermos_,nThermos_*sizeof(gpuThermo))
);

CHECK_CUDA_ERROR
(
cudaMalloc((void**)&reactions_,nReactions_*sizeof(gpuReaction))
);

}
void thermosAndReactions::deallocate()
{
CHECK_CUDA_ERROR(cudaFree(thermos_));
CHECK_CUDA_ERROR(cudaFree(reactions_));
device_free(thermos_);
device_free(reactions_);
}


14 changes: 7 additions & 7 deletions gpu_chemistry/unittest/testHelpers/test_utilities.H
@@ -53,15 +53,15 @@ static inline gScalar eval(T t)
{

gScalar *d_result;
CHECK_CUDA_ERROR(cudaMalloc(&d_result, sizeof(gScalar)));
gpuErrorCheck(cudaMalloc(&d_result, sizeof(gScalar)));
on_device<<<1,1>>>(t, d_result);
CHECK_LAST_CUDA_ERROR();
cudaDeviceSynchronize();
gpuErrorCheck(cudaGetLastError());
gpuErrorCheck(cudaDeviceSynchronize());
gScalar h_result;
CHECK_CUDA_ERROR(cudaMemcpy(&h_result, d_result, sizeof(gScalar), cudaMemcpyDeviceToHost));
cudaDeviceSynchronize();
CHECK_CUDA_ERROR(cudaFree(d_result));
cudaDeviceSynchronize();
gpuErrorCheck(cudaMemcpy(&h_result, d_result, sizeof(gScalar), cudaMemcpyDeviceToHost));
gpuErrorCheck(cudaDeviceSynchronize());
gpuErrorCheck(cudaFree(d_result));
gpuErrorCheck(cudaDeviceSynchronize());
return h_result;

}
15 changes: 0 additions & 15 deletions gpu_utils/common/check_ptr.H
@@ -1,21 +1,6 @@
#pragma once
//#include <string_view>
//#include <string.h>
#include <stdio.h>
//#include "cuda_host_dev.H"


#define check_ptr(val, name) if (!val) {printf("null ptr %s", name); assert(0);}

/*
template<class T>
static inline CUDA_HOSTDEV void check_ptr(T ptr, std::string_view name)
{
//Note string view may not be null terminated and this is dangerous
if (!ptr)
{
printf("Bad alloc for: %s \n", name.data());
}
}
*/
12 changes: 12 additions & 0 deletions gpu_utils/common/device_allocate.H
@@ -0,0 +1,12 @@
#pragma once

#include "error_handling.H"

template<class T>
static inline T* device_allocate(size_t length){

T* ptr;
const auto bytesize = length * sizeof(T);
gpuErrorCheck(cudaMalloc((void**)&ptr, bytesize));
return ptr;
}
8 changes: 8 additions & 0 deletions gpu_utils/common/device_free.H
@@ -0,0 +1,8 @@
#pragma once

#include "error_handling.H"

template<class T>
static inline void device_free(T* ptr){
gpuErrorCheck(cudaFree(ptr));
}
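
Taken together, the two headers above replace the raw cudaMalloc/cudaFree pattern used elsewhere in this diff. A minimal usage sketch (illustrative only, not part of the commit; the buffer length is an assumption):

// Allocate a device buffer of n gScalars, use it in kernels, then release it.
const size_t n = 100;                              // illustrative length
gScalar* d_buffer = device_allocate<gScalar>(n);   // wraps cudaMalloc + error check
// ... launch kernels that read/write d_buffer ...
device_free(d_buffer);                             // wraps cudaFree + error check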
35 changes: 11 additions & 24 deletions gpu_utils/common/error_handling.H
@@ -9,35 +9,22 @@
#include <assert.h>
#include "cuda_runtime.h"

#define gpuErrorCheck(call) \
do{ \
cudaError_t gpuErr = call; \
if(cudaSuccess != gpuErr){ \
printf("GPU Error - %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(gpuErr)); \
exit(1); \
} \
}while(0)



#define CHECK_CUDA_ERROR(val) check((val), #val, __FILE__, __LINE__)
template <typename T>
static CUDA_HOSTDEV void check(T err, const char* const func, const char* const file,
const int line, bool abort=true)
{
if (err != cudaSuccess)
{
printf("CUDA Runtime error at: %s %s %s %d\n", cudaGetErrorString(err), file, func, line);
if (abort) assert(0);
}
}

#define CHECK_LAST_CUDA_ERROR() checkLast(__FILE__, __LINE__)
static CUDA_HOSTDEV void checkLast(const char* const file, const int line, bool abort=true)
{
cudaError_t err{cudaGetLastError()};
if (err != cudaSuccess)
{
printf("CUDA Runtime error at: %s %s %d\n", cudaGetErrorString(err), file, line);
if (abort) assert(0);
}
}


#else

#define CHECK_CUDA_ERROR(val)
#define CHECK_LAST_CUDA_ERROR()
#define gpuErrorCheck(val)



35 changes: 35 additions & 0 deletions gpu_utils/common/for_each_index.H
@@ -0,0 +1,35 @@
#pragma once

#include "gpu_constants.H"
#include "error_handling.H"

namespace detail{



template<class UnaryOperation>
__global__ void cuda_backend(gLabel n, UnaryOperation op) {

int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n) { op(i); }
}

}

///
///@brief Evaluates op(i) for all i in range [0, n[ in parallel.
///
///@param op A unary operation taking a gLabel index as a parameter.
///@param n The maximum i index (non-inclusive).
///
template<class UnaryOperation>
static inline void for_each_index(UnaryOperation op, gLabel n){

gLabel NTHREADS = 32;
gLabel NBLOCKS = (n + NTHREADS - 1) / NTHREADS;
detail::cuda_backend<<<NBLOCKS, NTHREADS>>>(n, op);

gpuErrorCheck(cudaGetLastError());
gpuErrorCheck(cudaDeviceSynchronize());

}
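
A short sketch of how the wrapper might be called (illustrative only, not part of the commit; the Scale functor and d_values array are assumptions):

// Functor evaluated for every index i in [0, n[.
struct Scale {
    gScalar* data;
    gScalar  factor;
    __device__ void operator()(gLabel i) const { data[i] *= factor; }
};

// Multiply every element of a device array by 2 in parallel.
gScalar* d_values = device_allocate<gScalar>(n);
for_each_index(Scale{d_values, gScalar(2)}, n);
device_free(d_values);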
22 changes: 22 additions & 0 deletions gpu_utils/common/host_device_transfers.H
@@ -0,0 +1,22 @@
#pragma once

#include "error_handling.H"
#include "thrust/copy.h"

template<class InputIter, class OutputIter>
static inline void host_to_device(InputIter h_begin, InputIter h_end, OutputIter d_begin){

auto length = std::distance(h_begin, h_end);
using T = typename std::iterator_traits<InputIter>::value_type;
using T2 = typename std::iterator_traits<OutputIter>::value_type;

static_assert(std::is_same_v<T, T2>, "Mismatching types in host_to_device");

auto bytesize = length * sizeof(T);
gpuErrorCheck(
cudaMemcpy(d_begin, &(*h_begin), bytesize, cudaMemcpyHostToDevice));


}
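
A minimal sketch pairing host_to_device with the allocation helpers (illustrative only, not part of the commit; it mirrors the commented-out path in allocateAndTransfer above):

std::vector<gScalar> h_values(100, gScalar(1));                 // host-side data
gScalar* d_values = device_allocate<gScalar>(h_values.size());  // device buffer
host_to_device(h_values.begin(), h_values.end(), d_values);     // copy host -> device
// ... use d_values on the device ...
device_free(d_values);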

