diff --git a/gpu_chemistry/src/gpuKernelEvaluator/Make/options b/gpu_chemistry/src/gpuKernelEvaluator/Make/options
index c39258d..76fa81e 100644
--- a/gpu_chemistry/src/gpuKernelEvaluator/Make/options
+++ b/gpu_chemistry/src/gpuKernelEvaluator/Make/options
@@ -7,4 +7,5 @@ EXE_INC = \
 
 #LIB_LIBS = -lcudart_static -lcudart
 LIB_LIBS += -L$(CUDA_LIBS) -lcudart
-include ../../hipcc
\ No newline at end of file
+#include ../../hipcc
+include ../../nvcc
\ No newline at end of file
diff --git a/gpu_chemistry/src/gpuKernelEvaluator/gpuKernelEvaluator/gpuKernelEvaluator.cu b/gpu_chemistry/src/gpuKernelEvaluator/gpuKernelEvaluator/gpuKernelEvaluator.cu
index 51f69c2..2eb9267 100644
--- a/gpu_chemistry/src/gpuKernelEvaluator/gpuKernelEvaluator/gpuKernelEvaluator.cu
+++ b/gpu_chemistry/src/gpuKernelEvaluator/gpuKernelEvaluator/gpuKernelEvaluator.cu
@@ -4,7 +4,7 @@
 
 #include "cuda_host_dev.H"
-#include "error_handling.H"
+#include "for_each_index.H"
 #include "host_device_vectors.H"
 #include
 #include <algorithm> //min_element
@@ -30,36 +30,10 @@ GpuKernelEvaluator::GpuKernelEvaluator(
     , solver_(make_gpuODESolver(system_, odeInputs))
     , inputs_(odeInputs)
     , memory_(nCells, nSpecie) {
-    /*
-    int num;
-    CHECK_CUDA_ERROR(cudaGetDeviceCount(&num)); // number of CUDA devices
-
-    int dev = (nCells % num);
-    //cudaDeviceProp::canMapHostMemory prop;
-    //CHECK_CUDA_ERROR(cudaChooseDevice(&dev, &prop));
-
-
-    CHECK_CUDA_ERROR(cudaSetDevice(dev));
-    std::cout << "Using device: " << dev << std::endl;
-    */
-
-    /*
-    for (int i = 0; i < num; i++) {
-        // Query the device properties.
-        cudaDeviceProp prop;
-        cudaGetDeviceProperties(&prop, i);
-        std::cout << "Device id: " << i << std::endl;
-        std::cout << "Device name: " << prop.name << std::endl;
-    }
-    */
+
 }
 
-__global__ void cuda_kernel(gLabel nCells, singleCellSolver op) {
-    int celli = blockIdx.x * blockDim.x + threadIdx.x;
-    if (celli < nCells) { op(celli); }
-}
 
 /*
 static inline auto parseTimes(const char* label, const std::vector& b) {
@@ -115,33 +89,18 @@ GpuKernelEvaluator::computeYNew(
     singleCellSolver op(
         deltaT, nSpecie_, ddeltaTChem, dYvf, buffer_span, solver_);
 
+    for_each_index(op, nCells);
+
+
+    /*
     gLabel NTHREADS = 32;
     gLabel NBLOCKS  = (nCells + NTHREADS - 1) / NTHREADS;
     cuda_kernel<<<NBLOCKS, NTHREADS>>>(nCells, op);
     CHECK_LAST_CUDA_ERROR();
-    CHECK_CUDA_ERROR(cudaDeviceSynchronize());
-
-    ////
-    /*
-    auto bhost = toStdVector(buffers);
-
-    parseTimes("adaptive", bhost);
-    parseTimes("Jacobian", bhost);
-    parseTimes("step1", bhost);
-    parseTimes("step2", bhost);
-    parseTimes("step3", bhost);
-
-    */
-
-    ////
-
-    /*
-    thrust::for_each(thrust::device,
-                     thrust::make_counting_iterator(0),
-                     thrust::make_counting_iterator(nCells),
-                     op);
+    gpuErrorCheck(cudaDeviceSynchronize());
     */
+
     return std::make_pair(toStdVector(dYvf_arr),
                           toStdVector(ddeltaTChem_arr));
 }
diff --git a/gpu_chemistry/src/gpuKernelEvaluator/gpuKernelEvaluator/gpuMemoryResource.cu b/gpu_chemistry/src/gpuKernelEvaluator/gpuKernelEvaluator/gpuMemoryResource.cu
index 8868a01..f1f0b94 100644
--- a/gpu_chemistry/src/gpuKernelEvaluator/gpuKernelEvaluator/gpuMemoryResource.cu
+++ b/gpu_chemistry/src/gpuKernelEvaluator/gpuKernelEvaluator/gpuMemoryResource.cu
@@ -1,8 +1,6 @@
 #include "gpuMemoryResource.H"
-#include
-
-using labelAllocator  = thrust::device_malloc_allocator<gLabel>;
-using scalarAllocator = thrust::device_malloc_allocator<gScalar>;
+#include "device_allocate.H"
+#include "device_free.H"
 
 namespace FoamGpu {
 
@@ -15,39 +13,27 @@ gpuMemoryResource::~gpuMemoryResource() { this->deallocate(); }
 
 void gpuMemoryResource::allocate() {
-    labelAllocator  lAllocator;
-    scalarAllocator sAllocator;
-
     for (gLabel i = 0; i < N_LABEL_ARRAYS; ++i) {
-        labelData_[i] =
-            make_raw_pointer(lAllocator.allocate(labelArrayLength()));
+        labelData_[i] = device_allocate<gLabel>(labelArrayLength());
     }
     for (gLabel i = 0; i < N_SCALAR_ARRAYS; ++i) {
-        scalarData_[i] =
-            make_raw_pointer(sAllocator.allocate(scalarArrayLength()));
+        scalarData_[i] = device_allocate<gScalar>(scalarArrayLength());
     }
     for (gLabel i = 0; i < N_TWOD_SCALAR_ARRAYS; ++i) {
-        twodScalarData_[i] =
-            make_raw_pointer(sAllocator.allocate(twodScalarArrayLength()));
+        twodScalarData_[i] = device_allocate<gScalar>(twodScalarArrayLength());
     }
 }
 
 void gpuMemoryResource::deallocate() {
-    labelAllocator  lAllocator;
-    scalarAllocator sAllocator;
-
     for (gLabel i = 0; i < N_LABEL_ARRAYS; ++i) {
-        auto ptr = make_device_pointer(labelData_[i]);
-        lAllocator.deallocate(ptr, labelArrayLength());
+        device_free(labelData_[i]);
     }
     for (gLabel i = 0; i < N_SCALAR_ARRAYS; ++i) {
-        auto ptr = make_device_pointer(scalarData_[i]);
-        sAllocator.deallocate(ptr, scalarArrayLength());
+        device_free(scalarData_[i]);
     }
     for (gLabel i = 0; i < N_TWOD_SCALAR_ARRAYS; ++i) {
-        auto ptr = make_device_pointer(twodScalarData_[i]);
-        sAllocator.deallocate(ptr, twodScalarArrayLength());
+        device_free(twodScalarData_[i]);
     }
 }
diff --git a/gpu_chemistry/src/gpuKernelEvaluator/gpuKernelEvaluator/thermosAndReactions.H b/gpu_chemistry/src/gpuKernelEvaluator/gpuKernelEvaluator/thermosAndReactions.H
index 6638238..8c361bc 100644
--- a/gpu_chemistry/src/gpuKernelEvaluator/gpuKernelEvaluator/thermosAndReactions.H
+++ b/gpu_chemistry/src/gpuKernelEvaluator/gpuKernelEvaluator/thermosAndReactions.H
@@ -32,7 +32,6 @@ private:
 
     gpuReaction* reactions_;
 
-    void allocate();
     void deallocate();
 
diff --git a/gpu_chemistry/src/gpuKernelEvaluator/gpuKernelEvaluator/thermosAndReactions.cu b/gpu_chemistry/src/gpuKernelEvaluator/gpuKernelEvaluator/thermosAndReactions.cu
index a07a5d0..1a47a79 100644
--- a/gpu_chemistry/src/gpuKernelEvaluator/gpuKernelEvaluator/thermosAndReactions.cu
+++ b/gpu_chemistry/src/gpuKernelEvaluator/gpuKernelEvaluator/thermosAndReactions.cu
@@ -1,20 +1,27 @@
+#include "thermosAndReactions.H"
 #include "error_handling.H"
-#include "thermosAndReactions.H"
+#include "device_allocate.H"
+#include "device_free.H"
+#include "host_device_transfers.H"
 
 namespace FoamGpu {
 
 template <class T>
 static inline T* allocateAndTransfer(const std::vector<T>& t) {
-    T* ptr;
-    const auto size     = t.size();
-    const auto bytesize = size * sizeof(T);
+
+    T* ptr = device_allocate<T>(t.size());
+    const auto bytesize = t.size() * sizeof(T);
 
-    CHECK_CUDA_ERROR(cudaMalloc((void**)&ptr, bytesize));
-    CHECK_CUDA_ERROR(
+    gpuErrorCheck(
         cudaMemcpy(ptr, t.data(), bytesize, cudaMemcpyHostToDevice));
     return ptr;
+    /*
+    T* ptr = device_allocate<T>(t.size());
+    host_to_device(t.begin(), t.end(), ptr);
+    return ptr;
+    */
 }
 
 thermosAndReactions::thermosAndReactions
@@ -33,23 +40,11 @@ thermosAndReactions::~thermosAndReactions()
 {
     this->deallocate();
 }
 
-void thermosAndReactions::allocate()
-{
-    CHECK_CUDA_ERROR
-    (
-        cudaMalloc((void**)&thermos_,nThermos_*sizeof(gpuThermo))
-    );
-
-    CHECK_CUDA_ERROR
-    (
-        cudaMalloc((void**)&reactions_,nReactions_*sizeof(gpuReaction))
-    );
-}
 
 void thermosAndReactions::deallocate()
 {
-    CHECK_CUDA_ERROR(cudaFree(thermos_));
-    CHECK_CUDA_ERROR(cudaFree(reactions_));
+    device_free(thermos_);
+    device_free(reactions_);
 }
 
diff --git a/gpu_chemistry/unittest/testHelpers/test_utilities.H b/gpu_chemistry/unittest/testHelpers/test_utilities.H
index e73a260..d90d35d 100644
--- a/gpu_chemistry/unittest/testHelpers/test_utilities.H
+++ b/gpu_chemistry/unittest/testHelpers/test_utilities.H
@@ -53,15 +53,15 @@ static inline gScalar eval(T t) {
 
     gScalar *d_result;
-    CHECK_CUDA_ERROR(cudaMalloc(&d_result, sizeof(gScalar)));
+    gpuErrorCheck(cudaMalloc(&d_result, sizeof(gScalar)));
     on_device<<<1,1>>>(t, d_result);
-    CHECK_LAST_CUDA_ERROR();
-    cudaDeviceSynchronize();
+    gpuErrorCheck(cudaGetLastError());
+    gpuErrorCheck(cudaDeviceSynchronize());
     gScalar h_result;
-    CHECK_CUDA_ERROR(cudaMemcpy(&h_result, d_result, sizeof(gScalar), cudaMemcpyDeviceToHost));
-    cudaDeviceSynchronize();
-    CHECK_CUDA_ERROR(cudaFree(d_result));
-    cudaDeviceSynchronize();
+    gpuErrorCheck(cudaMemcpy(&h_result, d_result, sizeof(gScalar), cudaMemcpyDeviceToHost));
+    gpuErrorCheck(cudaDeviceSynchronize());
+    gpuErrorCheck(cudaFree(d_result));
+    gpuErrorCheck(cudaDeviceSynchronize());
 
     return h_result;
 }
diff --git a/gpu_utils/common/check_ptr.H b/gpu_utils/common/check_ptr.H
index 4ec6589..dd53426 100644
--- a/gpu_utils/common/check_ptr.H
+++ b/gpu_utils/common/check_ptr.H
@@ -1,21 +1,6 @@
 #pragma once
 
-//#include
-//#include
 #include
-//#include "cuda_host_dev.H"
 
 #define check_ptr(val, name) if (!val) {printf("null ptr %s", name); assert(0);}
 
-/*
-template <class T>
-static inline CUDA_HOSTDEV void check_ptr(T ptr, std::string_view name)
-{
-    //Note string view may not be null terminated and this is dangerous
-    if (!ptr)
-    {
-        printf("Bad alloc for: %s \n", name.data());
-    }
-
-}
-*/
\ No newline at end of file
diff --git a/gpu_utils/common/device_allocate.H b/gpu_utils/common/device_allocate.H
new file mode 100644
index 0000000..532ff98
--- /dev/null
+++ b/gpu_utils/common/device_allocate.H
@@ -0,0 +1,12 @@
+#pragma once
+
+#include "error_handling.H"
+
+template <class T>
+static inline T* device_allocate(size_t length){
+
+    T* ptr;
+    const auto bytesize = length * sizeof(T);
+    gpuErrorCheck(cudaMalloc((void**)&ptr, bytesize));
+    return ptr;
+}
\ No newline at end of file
diff --git a/gpu_utils/common/device_free.H b/gpu_utils/common/device_free.H
new file mode 100644
index 0000000..2df74b8
--- /dev/null
+++ b/gpu_utils/common/device_free.H
@@ -0,0 +1,8 @@
+#pragma once
+
+#include "error_handling.H"
+
+template <class T>
+static inline void device_free(T* ptr){
+    gpuErrorCheck(cudaFree(ptr));
+}
\ No newline at end of file
diff --git a/gpu_utils/common/error_handling.H b/gpu_utils/common/error_handling.H
index 9123b80..c2bac0e 100644
--- a/gpu_utils/common/error_handling.H
+++ b/gpu_utils/common/error_handling.H
@@ -9,35 +9,22 @@
 #include
 #include "cuda_runtime.h"
 
+#define gpuErrorCheck(call)                                                                  \
+do{                                                                                          \
+    cudaError_t gpuErr = call;                                                               \
+    if(cudaSuccess != gpuErr){                                                               \
+        printf("GPU Error - %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(gpuErr)); \
+        exit(1);                                                                             \
+    }                                                                                        \
+}while(0)
+
+
 
-#define CHECK_CUDA_ERROR(val) check((val), #val, __FILE__, __LINE__)
-template <class T>
-static CUDA_HOSTDEV void check(T err, const char* const func, const char* const file,
-                               const int line, bool abort=true)
-{
-    if (err != cudaSuccess)
-    {
-        printf("CUDA Runtime error at: %s %s %s %d\n", cudaGetErrorString(err), file, func, line);
-        if (abort) assert(0);
-    }
-}
-
-#define CHECK_LAST_CUDA_ERROR() checkLast(__FILE__, __LINE__)
-static CUDA_HOSTDEV void checkLast(const char* const file, const int line, bool abort=true)
-{
-    cudaError_t err{cudaGetLastError()};
-    if (err != cudaSuccess)
-    {
-        printf("CUDA Runtime error at: %s %s %d\n", cudaGetErrorString(err), file, line);
-        if (abort) assert(0);
-    }
-}
 
 #else
 
-#define CHECK_CUDA_ERROR(val)
-#define CHECK_LAST_CUDA_ERROR()
+#define gpuErrorCheck(val)
diff --git a/gpu_utils/common/for_each_index.H b/gpu_utils/common/for_each_index.H
new file mode 100644
index 0000000..186b29f
--- /dev/null
+++ b/gpu_utils/common/for_each_index.H
@@ -0,0 +1,35 @@
+#pragma once
+
+#include "gpu_constants.H"
+#include "error_handling.H"
+
+namespace detail{
+
+
+
+template <class UnaryOperation>
+__global__ void cuda_backend(gLabel n, UnaryOperation op) {
+
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < n) { op(i); }
+}
+
+}
+
+///
+///@brief Evaluates op(i) for all i in the range [0, n) in parallel.
+///
+///@param op A unary operation taking a gLabel index as a parameter.
+///@param n  The maximum index i (non-inclusive).
+///
+template <class UnaryOperation>
+static inline void for_each_index(UnaryOperation op, gLabel n){
+
+    gLabel NTHREADS = 32;
+    gLabel NBLOCKS  = (n + NTHREADS - 1) / NTHREADS;
+    detail::cuda_backend<<<NBLOCKS, NTHREADS>>>(n, op);
+
+    gpuErrorCheck(cudaGetLastError());
+    gpuErrorCheck(cudaDeviceSynchronize());
+
+}
\ No newline at end of file
diff --git a/gpu_utils/common/host_device_transfers.H b/gpu_utils/common/host_device_transfers.H
new file mode 100644
index 0000000..4a88c50
--- /dev/null
+++ b/gpu_utils/common/host_device_transfers.H
@@ -0,0 +1,22 @@
+#pragma once
+
+#include "error_handling.H"
+#include "thrust/copy.h"
+
+template <class InputIter, class OutputIter>
+static inline void host_to_device(InputIter h_begin, InputIter h_end, OutputIter d_begin){
+
+    auto length = std::distance(h_begin, h_end);
+    using T  = typename std::iterator_traits<InputIter>::value_type;
+    using T2 = typename std::iterator_traits<OutputIter>::value_type;
+
+    static_assert(std::is_same_v<T, T2>, "Mismatching types in host_to_device");
+
+    auto bytesize = length * sizeof(T);
+    gpuErrorCheck(
+        cudaMemcpy(d_begin, &(*h_begin), bytesize, cudaMemcpyHostToDevice));
+
+
+}
+
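Reviewer note (not part of the patch): a minimal sketch of how the new helpers introduced above (device_allocate, host_to_device, for_each_index, device_free, gpuErrorCheck) compose. The scaleOnDevice function and its lambda body are hypothetical, and the __device__ lambda assumes nvcc is invoked with --extended-lambda.

    #include "device_allocate.H"
    #include "device_free.H"
    #include "host_device_transfers.H"
    #include "for_each_index.H"
    #include <vector>

    // Hypothetical caller, for illustration only.
    std::vector<gScalar> scaleOnDevice(const std::vector<gScalar>& h, gScalar factor) {
        // Allocate a device buffer and copy the host data into it.
        gScalar* d = device_allocate<gScalar>(h.size());
        host_to_device(h.begin(), h.end(), d);

        // Evaluate op(i) for every index i in [0, h.size()).
        auto op = [=] __device__ (gLabel i) { d[i] *= factor; };
        for_each_index(op, static_cast<gLabel>(h.size()));

        // Copy the result back and release the device buffer.
        std::vector<gScalar> result(h.size());
        gpuErrorCheck(cudaMemcpy(result.data(), d, h.size() * sizeof(gScalar), cudaMemcpyDeviceToHost));
        device_free(d);
        return result;
    }

This is the same pattern the patch applies in gpuMemoryResource::allocate()/deallocate() and GpuKernelEvaluator::computeYNew(): raw cudaMalloc/cudaFree calls and the hand-written launch kernel are hidden behind these small headers, so the .cu sources only deal with index-wise operations and checked transfers.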