started separating the gpu backend operations
hamsteri15 committed May 28, 2024
1 parent 95d0f05 commit f426525
Showing 12 changed files with 128 additions and 139 deletions.
3 changes: 2 additions & 1 deletion gpu_chemistry/src/gpuKernelEvaluator/Make/options
@@ -7,4 +7,5 @@ EXE_INC = \
#LIB_LIBS = -lcudart_static -lcudart
LIB_LIBS += -L$(CUDA_LIBS) -lcudart

include ../../hipcc
#include ../../hipcc
include ../../nvcc
@@ -4,7 +4,7 @@

#include "cuda_host_dev.H"

#include "error_handling.H"
#include "for_each_index.H"
#include "host_device_vectors.H"
#include <thrust/execution_policy.h>
#include <thrust/extrema.h> //min_element
@@ -30,36 +30,10 @@ GpuKernelEvaluator::GpuKernelEvaluator(
, solver_(make_gpuODESolver(system_, odeInputs))
, inputs_(odeInputs)
, memory_(nCells, nSpecie) {
/*
int num;
CHECK_CUDA_ERROR(cudaGetDeviceCount(&num)); // number of CUDA
devices
int dev = (nCells % num);
//cudaDeviceProp::canMapHostMemory prop;
//CHECK_CUDA_ERROR(cudaChooseDevice(&dev, &prop));
CHECK_CUDA_ERROR(cudaSetDevice(dev));
std::cout << "Using device: " << dev << std::endl;
*/

/*
for (int i = 0; i < num; i++) {
// Query the device properties.
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, i);
std::cout << "Device id: " << i << std::endl;
std::cout << "Device name: " << prop.name << std::endl;
}
*/

}

__global__ void cuda_kernel(gLabel nCells, singleCellSolver op) {

int celli = blockIdx.x * blockDim.x + threadIdx.x;
if (celli < nCells) { op(celli); }
}
/*
static inline auto parseTimes(const char* label,
const std::vector<gpuBuffer>& b) {
@@ -115,33 +89,18 @@ GpuKernelEvaluator::computeYNew(
singleCellSolver op(
deltaT, nSpecie_, ddeltaTChem, dYvf, buffer_span, solver_);

for_each_index(op, nCells);


/*
gLabel NTHREADS = 32;
gLabel NBLOCKS = (nCells + NTHREADS - 1) / NTHREADS;
cuda_kernel<<<NBLOCKS, NTHREADS>>>(nCells, op);
CHECK_LAST_CUDA_ERROR();
CHECK_CUDA_ERROR(cudaDeviceSynchronize());

////
/*
auto bhost = toStdVector(buffers);
parseTimes("adaptive", bhost);
parseTimes("Jacobian", bhost);
parseTimes("step1", bhost);
parseTimes("step2", bhost);
parseTimes("step3", bhost);
*/

////

/*
thrust::for_each(thrust::device,
thrust::make_counting_iterator(0),
thrust::make_counting_iterator(nCells),
op);
gpuErrorCheck(cudaDeviceSynchronize());
*/

return std::make_pair(toStdVector(dYvf_arr),
toStdVector(ddeltaTChem_arr));
}
@@ -1,8 +1,6 @@
#include "gpuMemoryResource.H"
#include <thrust/device_malloc_allocator.h>

using labelAllocator = thrust::device_malloc_allocator<gLabel>;
using scalarAllocator = thrust::device_malloc_allocator<gScalar>;
#include "device_allocate.H"
#include "device_free.H"

namespace FoamGpu {

@@ -15,39 +13,27 @@ gpuMemoryResource::~gpuMemoryResource() { this->deallocate(); }

void gpuMemoryResource::allocate() {

labelAllocator lAllocator;
scalarAllocator sAllocator;

for (gLabel i = 0; i < N_LABEL_ARRAYS; ++i) {
labelData_[i] =
make_raw_pointer(lAllocator.allocate(labelArrayLength()));
labelData_[i] = device_allocate<gLabel>(labelArrayLength());
}
for (gLabel i = 0; i < N_SCALAR_ARRAYS; ++i) {
scalarData_[i] =
make_raw_pointer(sAllocator.allocate(scalarArrayLength()));
scalarData_[i] = device_allocate<gScalar>(scalarArrayLength());
}
for (gLabel i = 0; i < N_TWOD_SCALAR_ARRAYS; ++i) {
twodScalarData_[i] =
make_raw_pointer(sAllocator.allocate(twodScalarArrayLength()));
twodScalarData_[i] = device_allocate<gScalar>(twodScalarArrayLength());
}
}

void gpuMemoryResource::deallocate() {

labelAllocator lAllocator;
scalarAllocator sAllocator;

for (gLabel i = 0; i < N_LABEL_ARRAYS; ++i) {
auto ptr = make_device_pointer(labelData_[i]);
lAllocator.deallocate(ptr, labelArrayLength());
device_free(labelData_[i]);
}
for (gLabel i = 0; i < N_SCALAR_ARRAYS; ++i) {
auto ptr = make_device_pointer(scalarData_[i]);
sAllocator.deallocate(ptr, scalarArrayLength());
device_free(scalarData_[i]);
}
for (gLabel i = 0; i < N_TWOD_SCALAR_ARRAYS; ++i) {
auto ptr = make_device_pointer(twodScalarData_[i]);
sAllocator.deallocate(ptr, twodScalarArrayLength());
device_free(twodScalarData_[i]);
}
}

@@ -32,7 +32,6 @@ private:
gpuReaction* reactions_;


void allocate();
void deallocate();


@@ -1,20 +1,27 @@
#include "thermosAndReactions.H"

#include "error_handling.H"
#include "thermosAndReactions.H"
#include "device_allocate.H"
#include "device_free.H"
#include "host_device_transfers.H"

namespace FoamGpu {

template <class T>
static inline T* allocateAndTransfer(const std::vector<T>& t) {
T* ptr;
const auto size = t.size();
const auto bytesize = size * sizeof(T);

T* ptr = device_allocate<T>(t.size());
const auto bytesize = t.size() * sizeof(T);

CHECK_CUDA_ERROR(cudaMalloc((void**)&ptr, bytesize));
CHECK_CUDA_ERROR(
gpuErrorCheck(
cudaMemcpy(ptr, t.data(), bytesize, cudaMemcpyHostToDevice));

return ptr;
/*
T* ptr = device_allocate<T>(t.size());
host_to_device(t.begin(), t.end(), ptr);
return ptr;
*/
}

thermosAndReactions::thermosAndReactions
@@ -33,23 +40,11 @@ thermosAndReactions::~thermosAndReactions()
this->deallocate();
}

void thermosAndReactions::allocate()
{
CHECK_CUDA_ERROR
(
cudaMalloc((void**)&thermos_,nThermos_*sizeof(gpuThermo))
);

CHECK_CUDA_ERROR
(
cudaMalloc((void**)&reactions_,nReactions_*sizeof(gpuReaction))
);

}
void thermosAndReactions::deallocate()
{
CHECK_CUDA_ERROR(cudaFree(thermos_));
CHECK_CUDA_ERROR(cudaFree(reactions_));
device_free(thermos_);
device_free(reactions_);
}


14 changes: 7 additions & 7 deletions gpu_chemistry/unittest/testHelpers/test_utilities.H
@@ -53,15 +53,15 @@ static inline gScalar eval(T t)
{

gScalar *d_result;
CHECK_CUDA_ERROR(cudaMalloc(&d_result, sizeof(gScalar)));
gpuErrorCheck(cudaMalloc(&d_result, sizeof(gScalar)));
on_device<<<1,1>>>(t, d_result);
CHECK_LAST_CUDA_ERROR();
cudaDeviceSynchronize();
gpuErrorCheck(cudaGetLastError());
gpuErrorCheck(cudaDeviceSynchronize());
gScalar h_result;
CHECK_CUDA_ERROR(cudaMemcpy(&h_result, d_result, sizeof(gScalar), cudaMemcpyDeviceToHost));
cudaDeviceSynchronize();
CHECK_CUDA_ERROR(cudaFree(d_result));
cudaDeviceSynchronize();
gpuErrorCheck(cudaMemcpy(&h_result, d_result, sizeof(gScalar), cudaMemcpyDeviceToHost));
gpuErrorCheck(cudaDeviceSynchronize());
gpuErrorCheck(cudaFree(d_result));
gpuErrorCheck(cudaDeviceSynchronize());
return h_result;

}
15 changes: 0 additions & 15 deletions gpu_utils/common/check_ptr.H
@@ -1,21 +1,6 @@
#pragma once
//#include <string_view>
//#include <string.h>
#include <stdio.h>
//#include "cuda_host_dev.H"


#define check_ptr(val, name) if (!val) {printf("null ptr %s", name); assert(0);}

/*
template<class T>
static inline CUDA_HOSTDEV void check_ptr(T ptr, std::string_view name)
{
//Note string view may not be null terminated and this is dangerous
if (!ptr)
{
printf("Bad alloc for: %s \n", name.data());
}
}
*/
12 changes: 12 additions & 0 deletions gpu_utils/common/device_allocate.H
@@ -0,0 +1,12 @@
#pragma once

#include "error_handling.H"

template<class T>
static inline T* device_allocate(size_t length){

T* ptr;
const auto bytesize = length * sizeof(T);
gpuErrorCheck(cudaMalloc((void**)&ptr, bytesize));
return ptr;
}
8 changes: 8 additions & 0 deletions gpu_utils/common/device_free.H
@@ -0,0 +1,8 @@
#pragma once

#include "error_handling.H"

template<class T>
static inline void device_free(T* ptr){
gpuErrorCheck(cudaFree(ptr));
}
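
Taken together, the two headers above replace the raw cudaMalloc/cudaFree pattern used elsewhere in this diff. A minimal usage sketch (illustrative only, not part of the commit; the buffer length is an assumption):

// Allocate a device buffer of n gScalars, use it in kernels, then release it.
const size_t n = 100;                              // illustrative length
gScalar* d_buffer = device_allocate<gScalar>(n);   // wraps cudaMalloc + error check
// ... launch kernels that read/write d_buffer ...
device_free(d_buffer);                             // wraps cudaFree + error check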
35 changes: 11 additions & 24 deletions gpu_utils/common/error_handling.H
@@ -9,35 +9,22 @@
#include <assert.h>
#include "cuda_runtime.h"

#define gpuErrorCheck(call) \
do{ \
cudaError_t gpuErr = call; \
if(cudaSuccess != gpuErr){ \
printf("GPU Error - %s:%d: '%s'\n", __FILE__, __LINE__, cudaGetErrorString(gpuErr)); \
exit(1); \
} \
}while(0)



#define CHECK_CUDA_ERROR(val) check((val), #val, __FILE__, __LINE__)
template <typename T>
static CUDA_HOSTDEV void check(T err, const char* const func, const char* const file,
const int line, bool abort=true)
{
if (err != cudaSuccess)
{
printf("CUDA Runtime error at: %s %s %s %d\n", cudaGetErrorString(err), file, func, line);
if (abort) assert(0);
}
}

#define CHECK_LAST_CUDA_ERROR() checkLast(__FILE__, __LINE__)
static CUDA_HOSTDEV void checkLast(const char* const file, const int line, bool abort=true)
{
cudaError_t err{cudaGetLastError()};
if (err != cudaSuccess)
{
printf("CUDA Runtime error at: %s %s %d\n", cudaGetErrorString(err), file, line);
if (abort) assert(0);
}
}


#else

#define CHECK_CUDA_ERROR(val)
#define CHECK_LAST_CUDA_ERROR()
#define gpuErrorCheck(val)



35 changes: 35 additions & 0 deletions gpu_utils/common/for_each_index.H
@@ -0,0 +1,35 @@
#pragma once

#include "gpu_constants.H"
#include "error_handling.H"

namespace detail{



template<class UnaryOperation>
__global__ void cuda_backend(gLabel n, UnaryOperation op) {

int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n) { op(i); }
}

}

///
///@brief Evaluates op(i) for all i in range [0, n[ in parallel.
///
///@param op A unary operation taking a gLabel index as a parameter.
///@param n The maximum i index (non-inclusive).
///
template<class UnaryOperation>
static inline void for_each_index(UnaryOperation op, gLabel n){

gLabel NTHREADS = 32;
gLabel NBLOCKS = (n + NTHREADS - 1) / NTHREADS;
detail::cuda_backend<<<NBLOCKS, NTHREADS>>>(n, op);

gpuErrorCheck(cudaGetLastError());
gpuErrorCheck(cudaDeviceSynchronize());

}
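
A short sketch of how the wrapper might be called (illustrative only, not part of the commit; the Scale functor and d_values array are assumptions):

// Functor evaluated for every index i in [0, n[.
struct Scale {
    gScalar* data;
    gScalar  factor;
    __device__ void operator()(gLabel i) const { data[i] *= factor; }
};

// Multiply every element of a device array by 2 in parallel.
gScalar* d_values = device_allocate<gScalar>(n);
for_each_index(Scale{d_values, gScalar(2)}, n);
device_free(d_values);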
22 changes: 22 additions & 0 deletions gpu_utils/common/host_device_transfers.H
@@ -0,0 +1,22 @@
#pragma once

#include "error_handling.H"
#include "thrust/copy.h"

template<class InputIter, class OutputIter>
static inline void host_to_device(InputIter h_begin, InputIter h_end, OutputIter d_begin){

auto length = std::distance(h_begin, h_end);
using T = typename std::iterator_traits<InputIter>::value_type;
using T2 = typename std::iterator_traits<OutputIter>::value_type;

static_assert(std::is_same_v<T, T2>, "Mismatching types in host_to_device");

auto bytesize = length * sizeof(T);
gpuErrorCheck(
cudaMemcpy(d_begin, &(*h_begin), bytesize, cudaMemcpyHostToDevice));


}
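
A minimal sketch pairing host_to_device with the allocation helpers (illustrative only, not part of the commit; it mirrors the commented-out path in allocateAndTransfer above):

std::vector<gScalar> h_values(100, gScalar(1));                 // host-side data
gScalar* d_values = device_allocate<gScalar>(h_values.size());  // device buffer
host_to_device(h_values.begin(), h_values.end(), d_values);     // copy host -> device
// ... use d_values on the device ...
device_free(d_values);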

