diff --git a/CMakeLists.txt b/CMakeLists.txt index 36ab2f37..5715e6e2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,7 +22,9 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}") endif() option(RESOLVE_TEST_WITH_BSUB "Use `jsrun` instead of `mpirun` commands when running tests" OFF) -option(RESOLVE_USE_KLU "Use KLU, AMD and COLAMD libraries from SuiteSparse" ON) +option(RESOLVE_USE_KLU "Use KLU, AMD and COLAMD libraries from SuiteSparse" ON) +option(RESOLVE_USE_GPU "Use GPU device for computations" ON) +option(RESOLVE_USE_CUDA "Use CUDA language and SDK" ON) set(RESOLVE_CTEST_OUTPUT_DIR ${PROJECT_BINARY_DIR} CACHE PATH "Directory where CTest outputs are saved") set(CMAKE_MACOSX_RPATH 1) @@ -47,12 +49,6 @@ list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) # TODO: Set up clang-format #include(./cmake/clang-format) -# Configure CUDA -include(CheckLanguage) -enable_language(CUDA) -check_language(CUDA) - - if (RESOLVE_USE_KLU) include(FindKLU) if(NOT KLU_LIBRARY) @@ -63,22 +59,42 @@ else() message(STATUS "Not using SuiteSparse KLU") endif() -if(NOT DEFINED CMAKE_CUDA_STANDARD) - set(CMAKE_CUDA_STANDARD 11) - set(CMAKE_CUDA_STANDARD_REQUIRED ON) -endif() +include(CheckLanguage) + +# Configure CUDA +if(RESOLVE_USE_CUDA) + enable_language(CUDA) + check_language(CUDA) + + if(NOT DEFINED CMAKE_CUDA_STANDARD) + set(CMAKE_CUDA_STANDARD 11) + set(CMAKE_CUDA_STANDARD_REQUIRED ON) + endif() -if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) - set(CMAKE_CUDA_ARCHITECTURES 60 CACHE STRING "Selects CUDA architectures") + if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + set(CMAKE_CUDA_ARCHITECTURES 60 CACHE STRING "Selects CUDA architectures") + endif() + + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda") + + include(ReSolveFindCudaLibraries) +else() + message(STATUS "Not using CUDA") endif() -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda") +# The binary dir is already a global include directory +configure_file( + ${CMAKE_SOURCE_DIR}/resolve/resolve_defs.hpp.in + ${CMAKE_BINARY_DIR}/resolve/resolve_defs.hpp) +# include build directory for Fortran name mangling header +include_directories(${CMAKE_BINARY_DIR}) -# Link in required cuda dependencies -#find_package(CUDAToolkit REQUIRED) +install( + FILES ${CMAKE_BINARY_DIR}/resolve/resolve_defs.hpp + DESTINATION include/resolve + ) -include(ReSolveFindCudaLibraries) include_directories(${CMAKE_SOURCE_DIR}) @@ -114,8 +130,10 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/ReSolveConfig.cmake" "${CMAKE_CURRENT_BINARY_DIR}/ReSolveConfigVersion.cmake" DESTINATION share/resolve/cmake) -# Add examples -add_subdirectory(examples) +# Add examples (for now only CUDA examples are available) +if(RESOLVE_USE_CUDA) + add_subdirectory(examples) +endif(RESOLVE_USE_CUDA) # Add tests add_subdirectory(tests) diff --git a/resolve/CMakeLists.txt b/resolve/CMakeLists.txt index 49b50997..675fc11f 100644 --- a/resolve/CMakeLists.txt +++ b/resolve/CMakeLists.txt @@ -19,14 +19,8 @@ set(ReSolve_SRC ) -set(ReSolve_SRC_CUDA - cudaKernels.cu - memoryUtils.cu -) - set(ReSolve_HEADER_INSTALL Common.hpp - cudaKernels.h cusolver_defs.hpp LinAlgWorkspace.hpp LinSolver.hpp @@ -37,18 +31,18 @@ set(ReSolve_HEADER_INSTALL RefactorizationSolver.hpp SystemSolver.hpp GramSchmidt.hpp - memoryUtils.hpp + MemoryUtils.hpp ) -set_source_files_properties(${ReSolve_SRC_CUDA} PROPERTIES LANGUAGE CUDA) +# If GPU support is not enabled, add dummy device backend +if(NOT RESOLVE_USE_GPU) + add_subdirectory(cpu) +endif() # First create CUDA backend (this should really be CUDA _API_ backend, separate backend will be needed for CUDA SDK) -add_library(resolve_backend_cuda SHARED ${ReSolve_SRC_CUDA}) -target_link_libraries(resolve_backend_cuda PUBLIC resolve_cuda) -target_include_directories(resolve_backend_cuda INTERFACE - $ - $ -) +if(RESOLVE_USE_CUDA) + add_subdirectory(cuda) +endif() # Next build vector and matrix objects that may use this backend. add_subdirectory(vector) @@ -57,9 +51,33 @@ add_subdirectory(matrix) # Build shared library ReSolve add_library(resolve_tpl INTERFACE) -target_link_libraries(resolve_tpl INTERFACE resolve_cuda KLU) -install(TARGETS resolve_matrix resolve_vector resolve_backend_cuda resolve_logger resolve_tpl EXPORT ReSolveTargets) +if(RESOLVE_USE_KLU) + target_link_libraries(resolve_tpl INTERFACE KLU) +endif(RESOLVE_USE_KLU) + +if(RESOLVE_USE_CUDA) + target_link_libraries(resolve_tpl INTERFACE resolve_cuda) +endif(RESOLVE_USE_CUDA) + + +set(ReSolve_Targets_List + resolve_matrix + resolve_vector + resolve_logger + resolve_tpl +) + +if(RESOLVE_USE_GPU) + if(RESOLVE_USE_CUDA) + set(ReSolve_Targets_List ${ReSolve_Targets_List} resolve_backend_cuda) + endif() +else(RESOLVE_USE_GPU) + set(ReSolve_Targets_List ${ReSolve_Targets_List} resolve_backend_cpu) +endif(RESOLVE_USE_GPU) + + +install(TARGETS ${ReSolve_Targets_List} EXPORT ReSolveTargets) add_library(ReSolve SHARED ${ReSolve_SRC}) @@ -69,7 +87,7 @@ target_include_directories(ReSolve INTERFACE ) # TODO: Make this PRIVATE dependency (requires refactoring ReSolve code) -target_link_libraries(ReSolve PUBLIC resolve_matrix resolve_vector resolve_backend_cuda resolve_logger resolve_tpl) +target_link_libraries(ReSolve PUBLIC ${ReSolve_Targets_List}) install(TARGETS ReSolve EXPORT ReSolveTargets diff --git a/resolve/LinAlgWorkspace.cpp b/resolve/LinAlgWorkspace.cpp index fc38a2ec..9dee78c5 100644 --- a/resolve/LinAlgWorkspace.cpp +++ b/resolve/LinAlgWorkspace.cpp @@ -1,4 +1,3 @@ -#include #include "LinAlgWorkspace.hpp" namespace ReSolve @@ -24,8 +23,8 @@ namespace ReSolve LinAlgWorkspaceCUDA::~LinAlgWorkspaceCUDA() { - if (buffer_spmv_ != nullptr) deleteOnDevice(buffer_spmv_); - if (buffer_1norm_ != nullptr) deleteOnDevice(buffer_1norm_); + if (buffer_spmv_ != nullptr) mem_.deleteOnDevice(buffer_spmv_); + if (buffer_1norm_ != nullptr) mem_.deleteOnDevice(buffer_1norm_); cusparseDestroy(handle_cusparse_); cusolverSpDestroy(handle_cusolversp_); cublasDestroy(handle_cublas_); diff --git a/resolve/LinAlgWorkspace.hpp b/resolve/LinAlgWorkspace.hpp index 7fe83aa9..e5c79580 100644 --- a/resolve/LinAlgWorkspace.hpp +++ b/resolve/LinAlgWorkspace.hpp @@ -4,6 +4,8 @@ #include "cusparse.h" #include "cusolverSp.h" +#include + namespace ReSolve { class LinAlgWorkspace @@ -11,7 +13,8 @@ namespace ReSolve public: LinAlgWorkspace(); ~LinAlgWorkspace(); - private: + protected: + MemoryHandler mem_; }; diff --git a/resolve/LinSolverDirectCuSolverGLU.cpp b/resolve/LinSolverDirectCuSolverGLU.cpp index e8130610..63413c5b 100644 --- a/resolve/LinSolverDirectCuSolverGLU.cpp +++ b/resolve/LinSolverDirectCuSolverGLU.cpp @@ -1,6 +1,6 @@ #include // includes memcpy #include -#include + #include #include #include "LinSolverDirectCuSolverGLU.hpp" @@ -14,7 +14,7 @@ namespace ReSolve LinSolverDirectCuSolverGLU::~LinSolverDirectCuSolverGLU() { - deleteOnDevice(glu_buffer_); + mem_.deleteOnDevice(glu_buffer_); cusparseDestroyMatDescr(descr_M_); cusparseDestroyMatDescr(descr_A_); cusolverSpDestroyGluInfo(info_M_); @@ -64,7 +64,7 @@ namespace ReSolve status_cusolver_ = cusolverSpDgluBufferSize(handle_cusolversp_, info_M_, &buffer_size); error_sum += status_cusolver_; - allocateBufferOnDevice(&glu_buffer_, buffer_size); + mem_.allocateBufferOnDevice(&glu_buffer_, buffer_size); status_cusolver_ = cusolverSpDgluAnalysis(handle_cusolversp_, info_M_, glu_buffer_); error_sum += status_cusolver_; diff --git a/resolve/LinSolverDirectCuSolverGLU.hpp b/resolve/LinSolverDirectCuSolverGLU.hpp index 657118fd..d76e1921 100644 --- a/resolve/LinSolverDirectCuSolverGLU.hpp +++ b/resolve/LinSolverDirectCuSolverGLU.hpp @@ -3,6 +3,7 @@ #include #include "LinSolver.hpp" #include "cusolver_defs.hpp" +#include namespace ReSolve { @@ -45,5 +46,7 @@ namespace ReSolve void* glu_buffer_; double r_nrminf_; int ite_refine_succ_; + + MemoryHandler mem_; ///< Device memory manager object }; } diff --git a/resolve/LinSolverDirectCuSolverRf.cpp b/resolve/LinSolverDirectCuSolverRf.cpp index 602d6ec1..d51218cc 100644 --- a/resolve/LinSolverDirectCuSolverRf.cpp +++ b/resolve/LinSolverDirectCuSolverRf.cpp @@ -1,4 +1,3 @@ -#include #include #include #include "LinSolverDirectCuSolverRf.hpp" @@ -13,9 +12,9 @@ namespace ReSolve LinSolverDirectCuSolverRf::~LinSolverDirectCuSolverRf() { cusolverRfDestroy(handle_cusolverrf_); - deleteOnDevice(d_P_); - deleteOnDevice(d_Q_); - deleteOnDevice(d_T_); + mem_.deleteOnDevice(d_P_); + mem_.deleteOnDevice(d_Q_); + mem_.deleteOnDevice(d_T_); } int LinSolverDirectCuSolverRf::setup(matrix::Sparse* A, matrix::Sparse* L, matrix::Sparse* U, index_type* P, index_type* Q) @@ -24,12 +23,12 @@ namespace ReSolve int error_sum = 0; this->A_ = (matrix::Csr*) A; index_type n = A_->getNumRows(); - allocateArrayOnDevice(&d_P_, n); - allocateArrayOnDevice(&d_Q_, n); - allocateArrayOnDevice(&d_T_, n); + mem_.allocateArrayOnDevice(&d_P_, n); + mem_.allocateArrayOnDevice(&d_Q_, n); + mem_.allocateArrayOnDevice(&d_T_, n); - copyArrayHostToDevice(d_P_, P, n); - copyArrayHostToDevice(d_Q_, Q, n); + mem_.copyArrayHostToDevice(d_P_, P, n); + mem_.copyArrayHostToDevice(d_Q_, Q, n); status_cusolverrf_ = cusolverRfSetResetValuesFastMode(handle_cusolverrf_, CUSOLVERRF_RESET_VALUES_FAST_MODE_ON); @@ -52,7 +51,7 @@ namespace ReSolve handle_cusolverrf_); error_sum += status_cusolverrf_; - deviceSynchronize(); + mem_.deviceSynchronize(); status_cusolverrf_ = cusolverRfAnalyze(handle_cusolverrf_); error_sum += status_cusolverrf_; @@ -85,7 +84,7 @@ namespace ReSolve handle_cusolverrf_); error_sum += status_cusolverrf_; - deviceSynchronize(); + mem_.deviceSynchronize(); status_cusolverrf_ = cusolverRfRefactor(handle_cusolverrf_); error_sum += status_cusolverrf_; diff --git a/resolve/LinSolverDirectCuSolverRf.hpp b/resolve/LinSolverDirectCuSolverRf.hpp index 478dfaa6..f0ee755e 100644 --- a/resolve/LinSolverDirectCuSolverRf.hpp +++ b/resolve/LinSolverDirectCuSolverRf.hpp @@ -2,6 +2,7 @@ #include "Common.hpp" #include "LinSolver.hpp" #include "cusolverRf.h" +#include namespace ReSolve { @@ -40,5 +41,7 @@ namespace ReSolve index_type* d_P_; index_type* d_Q_; real_type* d_T_; + + MemoryHandler mem_; ///< Device memory manager object }; } diff --git a/resolve/LinSolverIterativeFGMRES.cpp b/resolve/LinSolverIterativeFGMRES.cpp index 28b89334..2a6da732 100644 --- a/resolve/LinSolverIterativeFGMRES.cpp +++ b/resolve/LinSolverIterativeFGMRES.cpp @@ -3,7 +3,6 @@ #include #include -#include #include #include "LinSolverIterativeFGMRES.hpp" @@ -170,7 +169,7 @@ namespace ReSolve vec_v->setData( d_V_->getVectorData(i, "cuda"), "cuda"); vec_z->setData( d_Z_->getVectorData(i, "cuda"), "cuda"); this->precV(vec_v, vec_z); - deviceSynchronize(); + mem_.deviceSynchronize(); // V_{i+1}=A*Z_i diff --git a/resolve/LinSolverIterativeFGMRES.hpp b/resolve/LinSolverIterativeFGMRES.hpp index fb0baec9..8b2c722d 100644 --- a/resolve/LinSolverIterativeFGMRES.hpp +++ b/resolve/LinSolverIterativeFGMRES.hpp @@ -70,5 +70,7 @@ namespace ReSolve real_type final_residual_norm_; real_type initial_residual_norm_; index_type fgmres_iters_; + + MemoryHandler mem_; ///< Device memory manager object }; } diff --git a/resolve/MemoryUtils.hpp b/resolve/MemoryUtils.hpp new file mode 100644 index 00000000..77c2a7af --- /dev/null +++ b/resolve/MemoryUtils.hpp @@ -0,0 +1,68 @@ +#pragma once + +#include + +namespace ReSolve +{ + /** + * @class MemoryUtils + * + * @brief Provides basic memory allocation, free and copy functions. + * + * This class provedes abstractions for memory management functiosn for + * different GPU programming models. + * + * @tparam Policy - Memory management policy (vendor specific) + * + * @author Slaven Peles + */ + template + class MemoryUtils + { + public: + MemoryUtils() = default; + ~MemoryUtils() = default; + + void deviceSynchronize(); + int getLastDeviceError(); + int deleteOnDevice(void* v); + + template + int allocateArrayOnDevice(T** v, I n); + + template + int allocateBufferOnDevice(T** v, I n); + + template + int setZeroArrayOnDevice(T* v, I n); + + template + int copyArrayDeviceToHost(T* dst, const T* src, I n); + + template + int copyArrayDeviceToDevice(T* dst, const T* src, I n); + + template + int copyArrayHostToDevice(T* dst, const T* src, I n); + }; + +} // namespace ReSolve + +// Check if GPU support is enabled in Re::Solve and set appropriate device memory manager. +#ifdef RESOLVE_USE_GPU + +#if defined RESOLVE_USE_CUDA +#include +using MemoryHandler = ReSolve::MemoryUtils; +#elif defined RESOLVE_USE_HIP +#error HIP support requested, but not available! Probably a bug in CMake configuration. +#endif + +#else + +// If no GPU support is present, set device memory manager to a dummy object. +#include +using MemoryHandler = ReSolve::MemoryUtils; + +#endif + diff --git a/resolve/MemoryUtils.tpp b/resolve/MemoryUtils.tpp new file mode 100644 index 00000000..936a3336 --- /dev/null +++ b/resolve/MemoryUtils.tpp @@ -0,0 +1,75 @@ +/** + * @file MemoryUtils.tpp + * + * Contains implementation of memory utility functions wrappers. + * All it does it calls vendor specific functions frm an abstract interface. + * + * @author Slaven Peles + */ + +#pragma once + + +namespace ReSolve +{ + template + void MemoryUtils::deviceSynchronize() + { + Policy::deviceSynchronize(); + } + + template + int MemoryUtils::getLastDeviceError() + { + return Policy::getLastDeviceError(); + } + + template + int MemoryUtils::deleteOnDevice(void* v) + { + return Policy::deleteOnDevice(v); + } + + template + template + int MemoryUtils::allocateArrayOnDevice(T** v, I n) + { + return Policy::template allocateArrayOnDevice(v, n); + } + + template + template + int MemoryUtils::allocateBufferOnDevice(T** v, I n) + { + return Policy::template allocateBufferOnDevice(v, n); + } + + template + template + int MemoryUtils::setZeroArrayOnDevice(T* v, I n) + { + return Policy::template setZeroArrayOnDevice(v, n); + } + + template + template + int MemoryUtils::copyArrayDeviceToHost(T* dst, const T* src, I n) + { + return Policy::template copyArrayDeviceToHost(dst, src, n); + } + + template + template + int MemoryUtils::copyArrayDeviceToDevice(T* dst, const T* src, I n) + { + return Policy::template copyArrayDeviceToDevice(dst, src, n); + } + + template + template + int MemoryUtils::copyArrayHostToDevice(T* dst, const T* src, I n) + { + return Policy::template copyArrayHostToDevice(dst, src, n); + } + +} // namespace ReSolve \ No newline at end of file diff --git a/resolve/cpu/CMakeLists.txt b/resolve/cpu/CMakeLists.txt new file mode 100644 index 00000000..cb84dc55 --- /dev/null +++ b/resolve/cpu/CMakeLists.txt @@ -0,0 +1,25 @@ +#[[ + +@brief Build ReSolve backend when there is no GPU support + +@author Slaven Peles + +]] + +set(ReSolve_CPU_SRC + MemoryUtils.cpp +) + +set(ReSolve_CPU_HEADER_INSTALL + CpuMemory.hpp +) + +# First create dummy backend +add_library(resolve_backend_cpu SHARED ${ReSolve_CPU_SRC}) +target_include_directories(resolve_backend_cpu INTERFACE + $ + $ +) + +# install include headers +install(FILES ${ReSolve_CPU_HEADER_INSTALL} DESTINATION include/resolve/cpu) diff --git a/resolve/cpu/CpuMemory.hpp b/resolve/cpu/CpuMemory.hpp new file mode 100644 index 00000000..89a9982d --- /dev/null +++ b/resolve/cpu/CpuMemory.hpp @@ -0,0 +1,134 @@ +#pragma once + +#include + +namespace ReSolve +{ + namespace memory + { + /** + * @brief Class containing dummy functions when there is no GPU support. + * + * @author Slaven Peles + */ + struct Cpu + { + /** + * @brief Dummy function to stand in when GPU support is not enabled. + */ + static void deviceSynchronize() + { + // Nothing to synchronize + } + + /** + * @brief Dummy function to stand in when GPU support is not enabled. + * + * @return Allways return success! + */ + static int getLastDeviceError() + { + // not on device, nothing to get + return 0; + } + + /** + * @brief Dummy function to notify us something is wrong. + * + * This will be called only if GPU device support is not built, so + * trying to access a device should indicate a bug in the code. + * + * @return Allways return failure! + */ + static int deleteOnDevice(void* /* v */) + { + ReSolve::io::Logger::error() << "Trying to delete on a GPU device, but GPU support not available.\n"; + return -1; + } + + /** + * @brief Dummy function to notify us something is wrong. + * + * This will be called only if GPU device support is not built, so + * trying to access a device should indicate a bug in the code. + * + * @return Allways return failure! + */ + template + static int allocateArrayOnDevice(T** /* v */, I /* n */) + { + ReSolve::io::Logger::error() << "Trying to allocate on a GPU device, but GPU support not available.\n"; + return -1; + } + + /** + * @brief Dummy function to notify us something is wrong. + * + * This will be called only if GPU device support is not built, so + * trying to access a device should indicate a bug in the code. + * + * @return Allways return failure! + */ + template + static int allocateBufferOnDevice(T** /* v */, I /* n */) + { + ReSolve::io::Logger::error() << "Trying to allocate on a GPU device, but GPU support not available.\n"; + return -1; + } + + /** + * @brief Dummy function to notify us something is wrong. + * + * This will be called only if GPU device support is not built, so + * trying to access a device should indicate a bug in the code. + * + * @return Allways return failure! + */ + template + static int setZeroArrayOnDevice(T* /* v */, I /* n */) + { + ReSolve::io::Logger::error() << "Trying to initialize array on a GPU device, but GPU support not available.\n"; + return -1; + } + + /** + * @brief Dummy function to notify us something is wrong. + * + * This will be called only if GPU device support is not built, so + * trying to access a device should indicate a bug in the code. + * + * @return Allways return failure! + */ + template + static int copyArrayDeviceToHost(T* /* dst */, const T* /* src */, I /* n */) + { + ReSolve::io::Logger::error() << "Trying to copy from a GPU device, but GPU support not available.\n"; + return -1; + } + + /** + * @brief Dummy function to notify us something is wrong. + * + * This will be called only if GPU device support is not built, so + * trying to access a device should indicate a bug in the code. + * + * @return Allways return failure! + */ + template + static int copyArrayDeviceToDevice(T* /* dst */, const T* /* src */, I /* n */) + { + ReSolve::io::Logger::error() << "Trying to copy to a GPU device, but GPU support not available.\n"; + return -1; + } + + template + static int copyArrayHostToDevice(T* /* dst */, const T* /* src */, I /* n */) + { + ReSolve::io::Logger::error() << "Trying to copy to a GPU device, but GPU support not available.\n"; + return -1; + } + + }; // struct Cuda + } // namespace memory + +} //namespace ReSolve diff --git a/resolve/cpu/MemoryUtils.cpp b/resolve/cpu/MemoryUtils.cpp new file mode 100644 index 00000000..03e913bc --- /dev/null +++ b/resolve/cpu/MemoryUtils.cpp @@ -0,0 +1,40 @@ +/** + * @file MemoryUtils.cpp + * + * This file includes MemoryUtils.tpp and specifies what functions to + * instantiate from function templates. + * + * @author Slaven Peles + */ + + +#include + +#include +#include +#include + +#include + +namespace ReSolve +{ + template void MemoryUtils::deviceSynchronize(); + template int MemoryUtils::getLastDeviceError(); + template int MemoryUtils::deleteOnDevice(void*); + + template int MemoryUtils::allocateArrayOnDevice( real_type**, index_type); + template int MemoryUtils::allocateArrayOnDevice(index_type**, index_type); + + template int MemoryUtils::allocateBufferOnDevice(void** v, size_t n); + + template int MemoryUtils::setZeroArrayOnDevice( real_type*, index_type); + + template int MemoryUtils::copyArrayDeviceToHost( real_type*, const real_type*, index_type); + template int MemoryUtils::copyArrayDeviceToHost(index_type*, const index_type*, index_type); + + template int MemoryUtils::copyArrayDeviceToDevice( real_type*, const real_type*, index_type); + template int MemoryUtils::copyArrayDeviceToDevice(index_type*, const index_type*, index_type); + + template int MemoryUtils::copyArrayHostToDevice( real_type*, const real_type*, index_type); + template int MemoryUtils::copyArrayHostToDevice(index_type*, const index_type*, index_type); +} diff --git a/resolve/cuda/CMakeLists.txt b/resolve/cuda/CMakeLists.txt new file mode 100644 index 00000000..d3ead313 --- /dev/null +++ b/resolve/cuda/CMakeLists.txt @@ -0,0 +1,31 @@ +#[[ + +@brief Build ReSolve CUDA backend + +@author Slaven Peles + +]] + +set(ReSolve_CUDA_SRC + cudaKernels.cu + MemoryUtils.cu +) + +set(ReSolve_CUDA_HEADER_INSTALL + cudaKernels.h + CudaMemory.hpp + cuda_check_errors.hpp +) + +set_source_files_properties(${ReSolve_CUDA_SRC} PROPERTIES LANGUAGE CUDA) + +# First create CUDA backend (this should really be CUDA _API_ backend, separate backend will be needed for CUDA SDK) +add_library(resolve_backend_cuda SHARED ${ReSolve_CUDA_SRC}) +target_link_libraries(resolve_backend_cuda PUBLIC resolve_cuda) +target_include_directories(resolve_backend_cuda INTERFACE + $ + $ +) + +# install include headers +install(FILES ${ReSolve_CUDA_HEADER_INSTALL} DESTINATION include/resolve/cuda) diff --git a/resolve/cuda/CudaMemory.hpp b/resolve/cuda/CudaMemory.hpp new file mode 100644 index 00000000..a56ef37d --- /dev/null +++ b/resolve/cuda/CudaMemory.hpp @@ -0,0 +1,152 @@ +#pragma once + +#include +#include + +#include "cuda_check_errors.hpp" + +namespace ReSolve +{ + namespace memory + { + /** + * @brief Class containing wrappers for CUDA API functions. + * + * All wrappers are implemented as static functions returning integer + * error code from CUDA API functions. + * + * @author Slaven Peles + */ + struct Cuda + { + static void deviceSynchronize() + { + cudaDeviceSynchronize(); + } + + static int getLastDeviceError() + { + return static_cast(cudaGetLastError()); + } + + /** + * @brief deletes variable from device + * + * @param v - a variable on the device + * + * @post v is freed from the device + */ + static int deleteOnDevice(void* v) + { + return checkCudaErrors(cudaFree(v)); + } + + /** + * @brief allocates array v onto device + * + * @param v - pointer to the array to be allocated on the device + * @param n - number of array elements (int, size_t) + * + * @tparam T - Array element type + * @tparam I - Array index type + * + * @post v is now a array with size n on the device + */ + template + static int allocateArrayOnDevice(T** v, I n) + { + return checkCudaErrors(cudaMalloc((void**) v, sizeof(T) * n)); + } + + /** + * @brief allocates buffer v onto device. + * + * The difference from the array is that buffer size is required in bytes, + * not number of elements. + * + * @param v - pointer to the buffer to be allocated on the device + * @param n - size of the buffer in bytes + * + * @tparam T - Buffer element data type type (typically void) + * @tparam I - Buffer size type (typically size_t) + * + * @post v is now a buffer of n bytes + */ + template + static int allocateBufferOnDevice(T** v, I n) + { + return checkCudaErrors(cudaMalloc((void**) v, n)); + } + + /** + * @brief Sets elements of device array v to zero + * + * @param v - pointer to the array to be allocated on the device + * @param n - number of the array elements to be set to zero + * + * @tparam T - Array element type + * @tparam I - Array index type + * + * @post First n elements of array v are set to zero + */ + template + static int setZeroArrayOnDevice(T* v, I n) + { + return checkCudaErrors(cudaMemset(v, 0, sizeof(T) * n)); + } + + /** + * @brief Copies array `src` from device to the array `dst` on the host. + * + * @param[in] n - size of src array + * @param[in] src - array on device + * @param[out] dst - array on host + * + * @pre `src` is a pointer to an allocated array on the device + * @pre `dst` is allocated to size >= n on the host + * @post Content of `dst` is overwritten by the content of `src` + */ + template + static int copyArrayDeviceToHost(T* dst, const T* src, I n) + { + return checkCudaErrors(cudaMemcpy(dst, src, sizeof(T) * n, cudaMemcpyDeviceToHost)); + } + + /** + * @brief Copies array `src` to the array `dst` on the device. + * + * @param n - size of src array + * @param src - array on device to be copied + * @param dst - array on device to be copied onto + * + * @pre `src` is a pointer to an allocated array on the device + * @pre `dst` is allocated to size >= n on the device + * @post Content of `dst` is overwritten by the content of `src` + */ + template + static int copyArrayDeviceToDevice(T* dst, const T* src, I n) + { + return checkCudaErrors(cudaMemcpy(dst, src, sizeof(T) * n, cudaMemcpyDeviceToDevice)); + } + + /** + * @brief Copies array `src` from the host to the array `dst` on the device. + * + * @param n - size of src array + * @param src - array on the host to be copied + * @param dst - array on the device to be copied onto + * + * @pre `src` is a pointer to an allocated array on the host + * @pre `dst` is allocated to size >= n on the device + * @post Content of `dst` is overwritten by the content of `src` + */ + template + static int copyArrayHostToDevice(T* dst, const T* src, I n) + { + return checkCudaErrors(cudaMemcpy(dst, src, sizeof(T) * n, cudaMemcpyHostToDevice)); + } + + }; + } + +} //namespace ReSolve diff --git a/resolve/cuda/MemoryUtils.cu b/resolve/cuda/MemoryUtils.cu new file mode 100644 index 00000000..880bff1c --- /dev/null +++ b/resolve/cuda/MemoryUtils.cu @@ -0,0 +1,40 @@ +/** + * @file MemoryUtils.cu + * + * This file includes MemoryUtils.tpp and specifies what functions to + * instantiate from function templates. + * + * @author Slaven Peles + */ + + +#include + +#include +#include + +#include + +namespace ReSolve +{ + template void MemoryUtils::deviceSynchronize(); + template int MemoryUtils::getLastDeviceError(); + template int MemoryUtils::deleteOnDevice(void*); + + template int MemoryUtils::allocateArrayOnDevice( real_type**, index_type); + template int MemoryUtils::allocateArrayOnDevice(index_type**, index_type); + + template int MemoryUtils::allocateBufferOnDevice(void** v, size_t n); + + template int MemoryUtils::setZeroArrayOnDevice( real_type*, index_type); + + template int MemoryUtils::copyArrayDeviceToHost( real_type*, const real_type*, index_type); + template int MemoryUtils::copyArrayDeviceToHost(index_type*, const index_type*, index_type); + + template int MemoryUtils::copyArrayDeviceToDevice( real_type*, const real_type*, index_type); + template int MemoryUtils::copyArrayDeviceToDevice(index_type*, const index_type*, index_type); + + template int MemoryUtils::copyArrayHostToDevice( real_type*, const real_type*, index_type); + template int MemoryUtils::copyArrayHostToDevice(index_type*, const index_type*, index_type); + +} //namespace ReSolve diff --git a/resolve/cudaKernels.cu b/resolve/cuda/cudaKernels.cu similarity index 100% rename from resolve/cudaKernels.cu rename to resolve/cuda/cudaKernels.cu diff --git a/resolve/cudaKernels.h b/resolve/cuda/cudaKernels.h similarity index 100% rename from resolve/cudaKernels.h rename to resolve/cuda/cudaKernels.h diff --git a/resolve/cuda_check_errors.hpp b/resolve/cuda/cuda_check_errors.hpp similarity index 66% rename from resolve/cuda_check_errors.hpp rename to resolve/cuda/cuda_check_errors.hpp index 47ad0a1e..00a2029e 100644 --- a/resolve/cuda_check_errors.hpp +++ b/resolve/cuda/cuda_check_errors.hpp @@ -1,3 +1,12 @@ +/** + * @file cuda_check_errors.hpp + * + * Contains macro to get error code from CUDA functions and to stream + * appropriate error output to Re::Solve's logger. + * + * @author Kasia Swirydowicz + * @author Slaven Peles + */ #pragma once #include diff --git a/resolve/hip/.gitkeep b/resolve/hip/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/resolve/matrix/CMakeLists.txt b/resolve/matrix/CMakeLists.txt index 142be6e9..89ac925d 100644 --- a/resolve/matrix/CMakeLists.txt +++ b/resolve/matrix/CMakeLists.txt @@ -20,9 +20,13 @@ set(Matrix_HEADER_INSTALL # Build shared library ReSolve::matrix add_library(resolve_matrix SHARED ${Matrix_SRC}) -target_link_libraries(resolve_matrix PUBLIC resolve_backend_cuda) -#install(TARGETS resolve_matrix EXPORT ReSolveTargets) +if (RESOLVE_USE_CUDA) + target_link_libraries(resolve_matrix PUBLIC resolve_backend_cuda) +else() + target_link_libraries(resolve_matrix PUBLIC resolve_backend_cpu) +endif() + target_include_directories(resolve_matrix INTERFACE $ diff --git a/resolve/matrix/Coo.cpp b/resolve/matrix/Coo.cpp index 42454c34..c8caebf6 100644 --- a/resolve/matrix/Coo.cpp +++ b/resolve/matrix/Coo.cpp @@ -2,7 +2,6 @@ #include #include -#include #include "Coo.hpp" @@ -102,13 +101,13 @@ namespace ReSolve if (memspaceOut == "cuda") { //check if cuda data allocated if (d_row_data_ == nullptr) { - allocateArrayOnDevice(&d_row_data_, nnz_current); + mem_.allocateArrayOnDevice(&d_row_data_, nnz_current); } if (d_col_data_ == nullptr) { - allocateArrayOnDevice(&d_col_data_, nnz_current); + mem_.allocateArrayOnDevice(&d_col_data_, nnz_current); } if (d_val_data_ == nullptr) { - allocateArrayOnDevice(&d_val_data_, nnz_current); + mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); } } @@ -122,25 +121,25 @@ namespace ReSolve owns_cpu_vals_ = true; break; case 2://cuda->cpu - copyArrayDeviceToHost(h_row_data_, row_data, nnz_current); - copyArrayDeviceToHost(h_col_data_, col_data, nnz_current); - copyArrayDeviceToHost(h_val_data_, val_data, nnz_current); + mem_.copyArrayDeviceToHost(h_row_data_, row_data, nnz_current); + mem_.copyArrayDeviceToHost(h_col_data_, col_data, nnz_current); + mem_.copyArrayDeviceToHost(h_val_data_, val_data, nnz_current); h_data_updated_ = true; owns_cpu_data_ = true; owns_cpu_vals_ = true; break; case 1://cpu->cuda - copyArrayHostToDevice(d_row_data_, row_data, nnz_current); - copyArrayHostToDevice(d_col_data_, col_data, nnz_current); - copyArrayHostToDevice(d_val_data_, val_data, nnz_current); + mem_.copyArrayHostToDevice(d_row_data_, row_data, nnz_current); + mem_.copyArrayHostToDevice(d_col_data_, col_data, nnz_current); + mem_.copyArrayHostToDevice(d_val_data_, val_data, nnz_current); d_data_updated_ = true; owns_gpu_data_ = true; owns_gpu_vals_ = true; break; case 3://cuda->cuda - copyArrayDeviceToDevice(d_row_data_, row_data, nnz_current); - copyArrayDeviceToDevice(d_col_data_, col_data, nnz_current); - copyArrayDeviceToDevice(d_val_data_, val_data, nnz_current); + mem_.copyArrayDeviceToDevice(d_row_data_, row_data, nnz_current); + mem_.copyArrayDeviceToDevice(d_col_data_, col_data, nnz_current); + mem_.copyArrayDeviceToDevice(d_val_data_, val_data, nnz_current); d_data_updated_ = true; owns_gpu_data_ = true; owns_gpu_vals_ = true; @@ -178,9 +177,9 @@ namespace ReSolve } if (memspace == "cuda") { - allocateArrayOnDevice(&d_row_data_, nnz_current); - allocateArrayOnDevice(&d_col_data_, nnz_current); - allocateArrayOnDevice(&d_val_data_, nnz_current); + mem_.allocateArrayOnDevice(&d_row_data_, nnz_current); + mem_.allocateArrayOnDevice(&d_col_data_, nnz_current); + mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); owns_gpu_data_ = true; owns_gpu_vals_ = true; return 0; @@ -206,9 +205,9 @@ namespace ReSolve if (h_val_data_ == nullptr) { h_val_data_ = new real_type[nnz_current]; } - copyArrayDeviceToHost(h_row_data_, d_row_data_, nnz_current); - copyArrayDeviceToHost(h_col_data_, d_col_data_, nnz_current); - copyArrayDeviceToHost(h_val_data_, d_val_data_, nnz_current); + mem_.copyArrayDeviceToHost(h_row_data_, d_row_data_, nnz_current); + mem_.copyArrayDeviceToHost(h_col_data_, d_col_data_, nnz_current); + mem_.copyArrayDeviceToHost(h_val_data_, d_val_data_, nnz_current); h_data_updated_ = true; owns_cpu_data_ = true; owns_cpu_vals_ = true; @@ -219,17 +218,17 @@ namespace ReSolve if (memspaceOut == "cuda") { if ((d_data_updated_ == false) && (h_data_updated_ == true)) { if (d_row_data_ == nullptr) { - allocateArrayOnDevice(&d_row_data_, nnz_current); + mem_.allocateArrayOnDevice(&d_row_data_, nnz_current); } if (d_col_data_ == nullptr) { - allocateArrayOnDevice(&d_col_data_, nnz_current); + mem_.allocateArrayOnDevice(&d_col_data_, nnz_current); } if (d_val_data_ == nullptr) { - allocateArrayOnDevice(&d_val_data_, nnz_current); + mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); } - copyArrayHostToDevice(d_row_data_, h_row_data_, nnz_current); - copyArrayHostToDevice(d_col_data_, h_col_data_, nnz_current); - copyArrayHostToDevice(d_val_data_, h_val_data_, nnz_current); + mem_.copyArrayHostToDevice(d_row_data_, h_row_data_, nnz_current); + mem_.copyArrayHostToDevice(d_col_data_, h_col_data_, nnz_current); + mem_.copyArrayHostToDevice(d_val_data_, h_val_data_, nnz_current); d_data_updated_ = true; owns_gpu_data_ = true; owns_gpu_vals_ = true; diff --git a/resolve/matrix/Csc.cpp b/resolve/matrix/Csc.cpp index 9a40408b..1a305e03 100644 --- a/resolve/matrix/Csc.cpp +++ b/resolve/matrix/Csc.cpp @@ -1,6 +1,5 @@ #include // <-- includes memcpy -#include #include "Csc.hpp" namespace ReSolve @@ -98,13 +97,13 @@ namespace ReSolve if (memspaceOut == "cuda") { //check if cuda data allocated if (d_col_data_ == nullptr) { - allocateArrayOnDevice(&d_col_data_, n_ + 1); + mem_.allocateArrayOnDevice(&d_col_data_, n_ + 1); } if (d_row_data_ == nullptr) { - allocateArrayOnDevice(&d_row_data_, nnz_current); + mem_.allocateArrayOnDevice(&d_row_data_, nnz_current); } if (d_val_data_ == nullptr) { - allocateArrayOnDevice(&d_val_data_, nnz_current); + mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); } } @@ -118,25 +117,25 @@ namespace ReSolve owns_cpu_vals_ = true; break; case 2://cuda->cpu - copyArrayDeviceToHost(h_col_data_, col_data, n_ + 1); - copyArrayDeviceToHost(h_row_data_, row_data, nnz_current); - copyArrayDeviceToHost(h_val_data_, val_data, nnz_current); + mem_.copyArrayDeviceToHost(h_col_data_, col_data, n_ + 1); + mem_.copyArrayDeviceToHost(h_row_data_, row_data, nnz_current); + mem_.copyArrayDeviceToHost(h_val_data_, val_data, nnz_current); h_data_updated_ = true; owns_cpu_data_ = true; owns_cpu_vals_ = true; break; case 1://cpu->cuda - copyArrayHostToDevice(d_col_data_, col_data, n_ + 1); - copyArrayHostToDevice(d_row_data_, row_data, nnz_current); - copyArrayHostToDevice(d_val_data_, val_data, nnz_current); + mem_.copyArrayHostToDevice(d_col_data_, col_data, n_ + 1); + mem_.copyArrayHostToDevice(d_row_data_, row_data, nnz_current); + mem_.copyArrayHostToDevice(d_val_data_, val_data, nnz_current); d_data_updated_ = true; owns_gpu_data_ = true; owns_gpu_vals_ = true; break; case 3://cuda->cuda - copyArrayDeviceToDevice(d_col_data_, col_data, n_ + 1); - copyArrayDeviceToDevice(d_row_data_, row_data, nnz_current); - copyArrayDeviceToDevice(d_val_data_, val_data, nnz_current); + mem_.copyArrayDeviceToDevice(d_col_data_, col_data, n_ + 1); + mem_.copyArrayDeviceToDevice(d_row_data_, row_data, nnz_current); + mem_.copyArrayDeviceToDevice(d_val_data_, val_data, nnz_current); d_data_updated_ = true; owns_gpu_data_ = true; owns_gpu_vals_ = true; @@ -175,9 +174,9 @@ namespace ReSolve } if (memspace == "cuda") { - allocateArrayOnDevice(&d_col_data_, n_ + 1); - allocateArrayOnDevice(&d_row_data_, nnz_current); - allocateArrayOnDevice(&d_val_data_, nnz_current); + mem_.allocateArrayOnDevice(&d_col_data_, n_ + 1); + mem_.allocateArrayOnDevice(&d_row_data_, nnz_current); + mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); owns_gpu_data_ = true; owns_gpu_vals_ = true; return 0; @@ -203,9 +202,9 @@ namespace ReSolve if (h_val_data_ == nullptr) { h_val_data_ = new real_type[nnz_current]; } - copyArrayDeviceToHost(h_col_data_, d_col_data_, n_ + 1); - copyArrayDeviceToHost(h_row_data_, d_row_data_, nnz_current); - copyArrayDeviceToHost(h_val_data_, d_val_data_, nnz_current); + mem_.copyArrayDeviceToHost(h_col_data_, d_col_data_, n_ + 1); + mem_.copyArrayDeviceToHost(h_row_data_, d_row_data_, nnz_current); + mem_.copyArrayDeviceToHost(h_val_data_, d_val_data_, nnz_current); h_data_updated_ = true; owns_cpu_data_ = true; owns_cpu_vals_ = true; @@ -216,17 +215,17 @@ namespace ReSolve if (memspaceOut == "cuda") { if ((d_data_updated_ == false) && (h_data_updated_ == true)) { if (d_col_data_ == nullptr) { - allocateArrayOnDevice(&d_col_data_, n_ + 1); + mem_.allocateArrayOnDevice(&d_col_data_, n_ + 1); } if (d_row_data_ == nullptr) { - allocateArrayOnDevice(&d_row_data_, nnz_current); + mem_.allocateArrayOnDevice(&d_row_data_, nnz_current); } if (d_val_data_ == nullptr) { - allocateArrayOnDevice(&d_val_data_, nnz_current); + mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); } - copyArrayHostToDevice(d_col_data_, h_col_data_, n_ + 1); - copyArrayHostToDevice(d_row_data_, h_row_data_, nnz_current); - copyArrayHostToDevice(d_val_data_, h_val_data_, nnz_current); + mem_.copyArrayHostToDevice(d_col_data_, h_col_data_, n_ + 1); + mem_.copyArrayHostToDevice(d_row_data_, h_row_data_, nnz_current); + mem_.copyArrayHostToDevice(d_val_data_, h_val_data_, nnz_current); d_data_updated_ = true; owns_gpu_data_ = true; owns_gpu_vals_ = true; diff --git a/resolve/matrix/Csr.cpp b/resolve/matrix/Csr.cpp index 9decfd6f..f1ddd31f 100644 --- a/resolve/matrix/Csr.cpp +++ b/resolve/matrix/Csr.cpp @@ -1,5 +1,5 @@ #include // <-- includes memcpy -#include + #include "Csr.hpp" namespace ReSolve @@ -97,13 +97,13 @@ namespace ReSolve if (memspaceOut == "cuda") { //check if cuda data allocated if (d_row_data_ == nullptr) { - allocateArrayOnDevice(&d_row_data_, n_ + 1); + mem_.allocateArrayOnDevice(&d_row_data_, n_ + 1); } if (d_col_data_ == nullptr) { - allocateArrayOnDevice(&d_col_data_, nnz_current); + mem_.allocateArrayOnDevice(&d_col_data_, nnz_current); } if (d_val_data_ == nullptr) { - allocateArrayOnDevice(&d_val_data_, nnz_current); + mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); } } @@ -119,25 +119,25 @@ namespace ReSolve owns_cpu_vals_ = true; break; case 2://cuda->cpu - copyArrayDeviceToHost(h_row_data_, row_data, n_ + 1); - copyArrayDeviceToHost(h_col_data_, col_data, nnz_current); - copyArrayDeviceToHost(h_val_data_, val_data, nnz_current); + mem_.copyArrayDeviceToHost(h_row_data_, row_data, n_ + 1); + mem_.copyArrayDeviceToHost(h_col_data_, col_data, nnz_current); + mem_.copyArrayDeviceToHost(h_val_data_, val_data, nnz_current); h_data_updated_ = true; owns_cpu_data_ = true; owns_cpu_vals_ = true; break; case 1://cpu->cuda - copyArrayHostToDevice(d_row_data_, row_data, n_ + 1); - copyArrayHostToDevice(d_col_data_, col_data, nnz_current); - copyArrayHostToDevice(d_val_data_, val_data, nnz_current); + mem_.copyArrayHostToDevice(d_row_data_, row_data, n_ + 1); + mem_.copyArrayHostToDevice(d_col_data_, col_data, nnz_current); + mem_.copyArrayHostToDevice(d_val_data_, val_data, nnz_current); d_data_updated_ = true; owns_gpu_data_ = true; owns_gpu_vals_ = true; break; case 3://cuda->cuda - copyArrayDeviceToDevice(d_row_data_, row_data, n_ + 1); - copyArrayDeviceToDevice(d_col_data_, col_data, nnz_current); - copyArrayDeviceToDevice(d_val_data_, val_data, nnz_current); + mem_.copyArrayDeviceToDevice(d_row_data_, row_data, n_ + 1); + mem_.copyArrayDeviceToDevice(d_col_data_, col_data, nnz_current); + mem_.copyArrayDeviceToDevice(d_val_data_, val_data, nnz_current); d_data_updated_ = true; owns_gpu_data_ = true; owns_gpu_vals_ = true; @@ -175,9 +175,9 @@ namespace ReSolve } if (memspace == "cuda") { - allocateArrayOnDevice(&d_row_data_, n_ + 1); - allocateArrayOnDevice(&d_col_data_, nnz_current); - allocateArrayOnDevice(&d_val_data_, nnz_current); + mem_.allocateArrayOnDevice(&d_row_data_, n_ + 1); + mem_.allocateArrayOnDevice(&d_col_data_, nnz_current); + mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); owns_gpu_data_ = true; owns_gpu_vals_ = true; return 0; @@ -202,9 +202,9 @@ namespace ReSolve if (h_val_data_ == nullptr) { h_val_data_ = new real_type[nnz_current]; } - copyArrayDeviceToHost(h_row_data_, d_row_data_, n_ + 1); - copyArrayDeviceToHost(h_col_data_, d_col_data_, nnz_current); - copyArrayDeviceToHost(h_val_data_, d_val_data_, nnz_current); + mem_.copyArrayDeviceToHost(h_row_data_, d_row_data_, n_ + 1); + mem_.copyArrayDeviceToHost(h_col_data_, d_col_data_, nnz_current); + mem_.copyArrayDeviceToHost(h_val_data_, d_val_data_, nnz_current); h_data_updated_ = true; owns_cpu_data_ = true; owns_cpu_vals_ = true; @@ -215,17 +215,17 @@ namespace ReSolve if (memspaceOut == "cuda") { if ((d_data_updated_ == false) && (h_data_updated_ == true)) { if (d_row_data_ == nullptr) { - allocateArrayOnDevice(&d_row_data_, n_ + 1); + mem_.allocateArrayOnDevice(&d_row_data_, n_ + 1); } if (d_col_data_ == nullptr) { - allocateArrayOnDevice(&d_col_data_, nnz_current); + mem_.allocateArrayOnDevice(&d_col_data_, nnz_current); } if (d_val_data_ == nullptr) { - allocateArrayOnDevice(&d_val_data_, nnz_current); + mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); } - copyArrayHostToDevice(d_row_data_, h_row_data_, n_ + 1); - copyArrayHostToDevice(d_col_data_, h_col_data_, nnz_current); - copyArrayHostToDevice(d_val_data_, h_val_data_, nnz_current); + mem_.copyArrayHostToDevice(d_row_data_, h_row_data_, n_ + 1); + mem_.copyArrayHostToDevice(d_col_data_, h_col_data_, nnz_current); + mem_.copyArrayHostToDevice(d_val_data_, h_val_data_, nnz_current); d_data_updated_ = true; owns_gpu_data_ = true; owns_gpu_vals_ = true; diff --git a/resolve/matrix/MatrixHandler.cpp b/resolve/matrix/MatrixHandler.cpp index d240c77f..19119731 100644 --- a/resolve/matrix/MatrixHandler.cpp +++ b/resolve/matrix/MatrixHandler.cpp @@ -1,7 +1,6 @@ #include #include -#include #include #include #include @@ -290,8 +289,8 @@ namespace ReSolve { CUSPARSE_SPMV_CSR_ALG2, &bufferSize); error_sum += status; - deviceSynchronize(); - allocateBufferOnDevice(&buffer_spmv, bufferSize); + mem_.deviceSynchronize(); + mem_.allocateBufferOnDevice(&buffer_spmv, bufferSize); workspaceCUDA->setSpmvMatrixDescriptor(matA); workspaceCUDA->setSpmvBuffer(buffer_spmv); @@ -309,10 +308,10 @@ namespace ReSolve { CUSPARSE_SPMV_CSR_ALG2, buffer_spmv); error_sum += status; - deviceSynchronize(); + mem_.deviceSynchronize(); if (status) out::error() << "Matvec status: " << status - << "Last error code: " << getLastDeviceError() << std::endl; + << "Last error code: " << mem_.getLastDeviceError() << std::endl; vec_result->setDataUpdated("cuda"); cusparseDestroyDnVec(vecx); @@ -388,7 +387,7 @@ namespace ReSolve { CUSPARSE_CSR2CSC_ALG1, &bufferSize); error_sum += status; - allocateBufferOnDevice(&d_work, bufferSize); + mem_.allocateBufferOnDevice(&d_work, bufferSize); status = cusparseCsr2cscEx2(workspaceCUDA->getCusparseHandle(), n, m, @@ -406,7 +405,7 @@ namespace ReSolve { d_work); error_sum += status; return error_sum; - deleteOnDevice(d_work); + mem_.deleteOnDevice(d_work); } else { out::error() << "Not implemented (yet)" << std::endl; return -1; diff --git a/resolve/matrix/MatrixHandler.hpp b/resolve/matrix/MatrixHandler.hpp index fe061f64..366b5422 100644 --- a/resolve/matrix/MatrixHandler.hpp +++ b/resolve/matrix/MatrixHandler.hpp @@ -5,6 +5,7 @@ // (3) Matrix 1-norm #pragma once #include +#include namespace ReSolve { @@ -75,6 +76,8 @@ namespace ReSolve { LinAlgWorkspace* workspace_{nullptr}; bool new_matrix_{true}; ///< if the structure changed, you need a new handler. bool values_changed_{true}; ///< needed for matvec + + MemoryHandler mem_; ///< Device memory manager object }; } // namespace ReSolve diff --git a/resolve/matrix/Sparse.cpp b/resolve/matrix/Sparse.cpp index d27cd7b1..5c866386 100644 --- a/resolve/matrix/Sparse.cpp +++ b/resolve/matrix/Sparse.cpp @@ -1,6 +1,6 @@ #include // <-- includes memcpy + #include "Sparse.hpp" -#include namespace ReSolve { namespace matrix { @@ -186,11 +186,11 @@ namespace ReSolve { namespace matrix { } else { if (memspace == "cuda"){ if (owns_gpu_data_) { - deleteOnDevice(d_row_data_); - deleteOnDevice(d_col_data_); + mem_.deleteOnDevice(d_row_data_); + mem_.deleteOnDevice(d_col_data_); } if (owns_gpu_vals_) { - deleteOnDevice(d_val_data_); + mem_.deleteOnDevice(d_val_data_); } } else { return -1; @@ -222,7 +222,7 @@ namespace ReSolve { namespace matrix { if (memspaceOut == "cuda") { //check if cuda data allocated if (d_val_data_ == nullptr) { - allocateArrayOnDevice(&d_val_data_, nnz_current); + mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); } } @@ -233,17 +233,17 @@ namespace ReSolve { namespace matrix { owns_cpu_vals_ = true; break; case 2://cuda->cpu - copyArrayDeviceToHost(h_val_data_, new_vals, nnz_current); + mem_.copyArrayDeviceToHost(h_val_data_, new_vals, nnz_current); h_data_updated_ = true; owns_cpu_vals_ = true; break; case 1://cpu->cuda - copyArrayHostToDevice(d_val_data_, new_vals, nnz_current); + mem_.copyArrayHostToDevice(d_val_data_, new_vals, nnz_current); d_data_updated_ = true; owns_gpu_vals_ = true; break; case 3://cuda->cuda - copyArrayDeviceToDevice(d_val_data_, new_vals, nnz_current); + mem_.copyArrayDeviceToDevice(d_val_data_, new_vals, nnz_current); d_data_updated_ = true; owns_gpu_vals_ = true; break; diff --git a/resolve/matrix/Sparse.hpp b/resolve/matrix/Sparse.hpp index c44eb03c..1196c38e 100644 --- a/resolve/matrix/Sparse.hpp +++ b/resolve/matrix/Sparse.hpp @@ -3,6 +3,7 @@ #pragma once #include #include +#include namespace ReSolve { namespace matrix { class Sparse @@ -86,5 +87,8 @@ namespace ReSolve { namespace matrix { bool owns_gpu_data_{false}; ///< for row/col data bool owns_gpu_vals_{false}; ///< for values + + MemoryHandler mem_; ///< Device memory manager object + }; }} // namespace ReSolve::matrix diff --git a/resolve/memoryUtils.cu b/resolve/memoryUtils.cu deleted file mode 100644 index f36668c3..00000000 --- a/resolve/memoryUtils.cu +++ /dev/null @@ -1,146 +0,0 @@ -#include - -#include -#include -#include "cuda_check_errors.hpp" - -namespace ReSolve -{ - void deviceSynchronize() - { - cudaDeviceSynchronize(); - } - - int getLastDeviceError() - { - return static_cast(cudaGetLastError()); - } - - /** - * @brief deletes variable from device - * - * @param v - a variable on the device - * - * @post v is freed from the device - */ - int deleteOnDevice(void* v) - { - return checkCudaErrors(cudaFree(v)); - } - - /** - * @brief allocates array v onto device - * - * @param v - pointer to the array to be allocated on the device - * @param n - number of array elements (int, size_t) - * - * @tparam T - Array element type - * @tparam I - Array index type - * - * @post v is now a array with size n on the device - */ - template - int allocateArrayOnDevice(T** v, I n) - { - return checkCudaErrors(cudaMalloc((void**) v, sizeof(T) * n)); - } - template int allocateArrayOnDevice( real_type**, index_type); - template int allocateArrayOnDevice(index_type**, index_type); - - /** - * @brief allocates buffer v onto device. - * - * The difference from the array is that buffer size is required in bytes, - * not number of elements. - * - * @param v - pointer to the buffer to be allocated on the device - * @param n - size of the buffer in bytes - * - * @tparam T - Buffer element data type type (typically void) - * @tparam I - Buffer size type (typically size_t) - * - * @post v is now a buffer of n bytes - */ - template - int allocateBufferOnDevice(T** v, I n) - { - return checkCudaErrors(cudaMalloc((void**) v, n)); - } - template int allocateBufferOnDevice(void** v, size_t n); - - /** - * @brief Sets elements of device array v to zero - * - * @param v - pointer to the array to be allocated on the device - * @param n - number of the array elements to be set to zero - * - * @tparam T - Array element type - * @tparam I - Array index type - * - * @post First n elements of array v are set to zero - */ - template - int setZeroArrayOnDevice(T* v, I n) - { - return checkCudaErrors(cudaMemset(v, 0, sizeof(T) * n)); - } - template int setZeroArrayOnDevice( real_type*, index_type); - - /** - * @brief Copies array `src` from device to the array `dst` on the host. - * - * @param[in] n - size of src array - * @param[in] src - array on device - * @param[out] dst - array on host - * - * @pre `src` is a pointer to an allocated array on the device - * @pre `dst` is allocated to size >= n on the host - * @post Content of `dst` is overwritten by the content of `src` - */ - template - int copyArrayDeviceToHost(T* dst, const T* src, I n) - { - return checkCudaErrors(cudaMemcpy(dst, src, sizeof(T) * n, cudaMemcpyDeviceToHost)); - } - template int copyArrayDeviceToHost( real_type*, const real_type*, index_type); - template int copyArrayDeviceToHost(index_type*, const index_type*, index_type); - - /** - * @brief Copies array `src` to the array `dst` on the device. - * - * @param n - size of src array - * @param src - array on device to be copied - * @param dst - array on device to be copied onto - * - * @pre `src` is a pointer to an allocated array on the device - * @pre `dst` is allocated to size >= n on the device - * @post Content of `dst` is overwritten by the content of `src` - */ - template - int copyArrayDeviceToDevice(T* dst, const T* src, I n) - { - return checkCudaErrors(cudaMemcpy(dst, src, sizeof(T) * n, cudaMemcpyDeviceToDevice)); - } - template int copyArrayDeviceToDevice( real_type*, const real_type*, index_type); - template int copyArrayDeviceToDevice(index_type*, const index_type*, index_type); - - /** - * @brief Copies array `src` from the host to the array `dst` on the device. - * - * @param n - size of src array - * @param src - array on the host to be copied - * @param dst - array on the device to be copied onto - * - * @pre `src` is a pointer to an allocated array on the host - * @pre `dst` is allocated to size >= n on the device - * @post Content of `dst` is overwritten by the content of `src` - */ - template - int copyArrayHostToDevice(T* dst, const T* src, I n) - { - return checkCudaErrors(cudaMemcpy(dst, src, sizeof(T) * n, cudaMemcpyHostToDevice)); - } - template int copyArrayHostToDevice( real_type*, const real_type*, index_type); - template int copyArrayHostToDevice(index_type*, const index_type*, index_type); - -} //namespace ReSolve diff --git a/resolve/memoryUtils.hpp b/resolve/memoryUtils.hpp deleted file mode 100644 index 0dcab1ba..00000000 --- a/resolve/memoryUtils.hpp +++ /dev/null @@ -1,117 +0,0 @@ -#pragma once - -#include -#include - -namespace ReSolve -{ - void deviceSynchronize(); - - int getLastDeviceError(); - - - int deleteOnDevice(void* v); - - template - int allocateArrayOnDevice(T** v, I n); - - template - int allocateBufferOnDevice(T** v, I n); - - template - int setZeroArrayOnDevice(T* v, I n); - - template - int copyArrayDeviceToHost(T* dst, const T* src, I n); - - template - int copyArrayDeviceToDevice(T* dst, const T* src, I n); - - template - int copyArrayHostToDevice(T* dst, const T* src, I n); - - /** - * @brief copies a host array onto a newly allocated array on the device - * - * @param n - size of src array - * @param src - array on host to be cloned - * @param dst - array on device on which src is cloned - * - * @post dst is a clone of src on the device - */ - template - int cloneArrayHostToDevice(I n, T** src, T** dst) - { - allocateArrayOnDevice(n, dst); - copyArrayToDevice(n, *src, *dst); - return 0; - } - - /** - * @brief prints array from host - * - * @param v - array on host - * @param display_n - number of elements to print - * @param label - name of array - * - * @pre display_n <= number of elements in v - * @post display_n elements of v printed - */ - template - void displayHostarray(T* v, - I start_i, - I display_n, - std::string label = "array") - { - std::cout << "\n\n" << label << ": {"; - for(int i = start_i; i < start_i + display_n - 1; i++){ - std::cout << v[i] << ", "; - } - std::cout << v[display_n - 1] << "}\n" << std::endl; - } - - /** - * @brief prints array from device - * - * @param v - array on host - * @param display_n - number of elements to print - * @param n - number of elements in v - * @param label - name of array - * - * @pre display_n <= n - * @post display_n elements of v printed - */ - template - int displayDevicearray(T* v, - I n, - I start_i, - I display_n, - std::string label = "array") - { - T* h_v = new T[n]; - copyArrayDeviceToHost(n, v, h_v); - displayHostArray(h_v, start_i, display_n, label); - return 0; - } - - /** - * @brief clones array of size n from src to dst - * - * @param n - size of array - * @param src - array to be cloned - * @param dst - clone target - * - * @pre n contain an int length - * @pre src is a valid array - * - * @post dst is a clone of src on device - */ - template - int cloneArrayDeviceToDevice(int n, T** src, T** dst) - { - allocateArrayOnDevice(n, dst); - copyArrayDeviceToDevice(n, *src, *dst); - return 0; - } - -} // namespace ReSolve diff --git a/resolve/resolve_defs.hpp.in b/resolve/resolve_defs.hpp.in new file mode 100644 index 00000000..9756376c --- /dev/null +++ b/resolve/resolve_defs.hpp.in @@ -0,0 +1,16 @@ +#pragma once + +#cmakedefine RESOLVE_USE_GPU +#cmakedefine RESOLVE_USE_CUDA +#cmakedefine RESOLVE_USE_HIP +#cmakedefine RESOLVE_USE_MPI +#cmakedefine RESOLVE_USE_RAJA +#cmakedefine RESOLVE_USE_EIGEN +#cmakedefine RESOLVE_USE_KLU +#define RESOLVE_VERSION "@PROJECT_VERSION@" +// #define RESOLVE_VERSION_MAJOR @PROJECT_VERSION_MAJOR@ +// #define RESOLVE_VERSION_MINOR @PROJECT_VERSION_MINOR@ +// #define RESOLVE_VERSION_PATCH @PROJECT_VERSION_PATCH@ + +// /// Date of build with the format "%Y-%m-%d" +// #define RESOLVE_RELEASE_DATE "@RESOLVE_RELEASE_DATE@" diff --git a/resolve/utilities/logger/CMakeLists.txt b/resolve/utilities/logger/CMakeLists.txt index cb1a9d89..91b29dfc 100644 --- a/resolve/utilities/logger/CMakeLists.txt +++ b/resolve/utilities/logger/CMakeLists.txt @@ -22,4 +22,4 @@ target_include_directories(resolve_logger INTERFACE $ ) -install(FILES ${Matrix_HEADER_INSTALL} DESTINATION include/resolve/utilities/logger) +install(FILES ${Logger_HEADER_INSTALL} DESTINATION include/resolve/utilities/logger) diff --git a/resolve/vector/CMakeLists.txt b/resolve/vector/CMakeLists.txt index 37e5564b..57feb5df 100644 --- a/resolve/vector/CMakeLists.txt +++ b/resolve/vector/CMakeLists.txt @@ -15,13 +15,17 @@ set(Vector_HEADER_INSTALL VectorHandler.hpp ) -set_source_files_properties(${Vector_SRC_CUDA} PROPERTIES LANGUAGE CUDA) - -# Build shared library ReSolve::vector -add_library(resolve_vector SHARED ${Vector_SRC} ${Vector_SRC_CUDA}) -target_link_libraries(resolve_vector PUBLIC resolve_backend_cuda) - -#install(TARGETS resolve_vector EXPORT ReSolveTargets) +if (RESOLVE_USE_CUDA) + set_source_files_properties(${Vector_SRC_CUDA} PROPERTIES LANGUAGE CUDA) + + # Build shared library ReSolve::vector + add_library(resolve_vector SHARED ${Vector_SRC} ${Vector_SRC_CUDA}) + target_link_libraries(resolve_vector PUBLIC resolve_backend_cuda) +else() + # Build shared library ReSolve::vector + add_library(resolve_vector SHARED ${Vector_SRC}) + target_link_libraries(resolve_vector PUBLIC resolve_backend_cpu) +endif() target_include_directories(resolve_vector INTERFACE $ diff --git a/resolve/vector/Vector.cpp b/resolve/vector/Vector.cpp index e612794c..7934e8b0 100644 --- a/resolve/vector/Vector.cpp +++ b/resolve/vector/Vector.cpp @@ -1,5 +1,4 @@ #include -#include #include #include @@ -34,7 +33,7 @@ namespace ReSolve { namespace vector { Vector::~Vector() { if (owns_cpu_data_) delete [] h_data_; - if (owns_gpu_data_) deleteOnDevice(d_data_); + if (owns_gpu_data_) mem_.deleteOnDevice(d_data_); } @@ -100,7 +99,7 @@ namespace ReSolve { namespace vector { } if ((memspaceOut == "cuda") && (d_data_ == nullptr)){ //allocate first - allocateArrayOnDevice(&d_data_, n_ * k_); + mem_.allocateArrayOnDevice(&d_data_, n_ * k_); } switch(control) { @@ -111,19 +110,19 @@ namespace ReSolve { namespace vector { gpu_updated_ = false; break; case 2: //cuda->cpu - copyArrayDeviceToHost(h_data_, data, n_current_ * k_); + mem_.copyArrayDeviceToHost(h_data_, data, n_current_ * k_); owns_gpu_data_ = true; cpu_updated_ = true; gpu_updated_ = false; break; case 1: //cpu->cuda - copyArrayHostToDevice(d_data_, data, n_current_ * k_); + mem_.copyArrayHostToDevice(d_data_, data, n_current_ * k_); owns_gpu_data_ = true; gpu_updated_ = true; cpu_updated_ = false; break; case 3: //cuda->cuda - copyArrayDeviceToDevice(d_data_, data, n_current_ * k_); + mem_.copyArrayDeviceToDevice(d_data_, data, n_current_ * k_); owns_gpu_data_ = true; gpu_updated_ = true; cpu_updated_ = false; @@ -174,16 +173,16 @@ namespace ReSolve { namespace vector { } if ((memspaceOut == "cuda") && (d_data_ == nullptr)){ //allocate first - allocateArrayOnDevice(&d_data_, n_ * k_); + mem_.allocateArrayOnDevice(&d_data_, n_ * k_); } switch(control) { case 0: //cpu->cuda - copyArrayHostToDevice(d_data_, h_data_, n_current_ * k_); + mem_.copyArrayHostToDevice(d_data_, h_data_, n_current_ * k_); owns_gpu_data_ = true; break; case 1: //cuda->cpu - copyArrayDeviceToHost(h_data_, d_data_, n_current_ * k_); + mem_.copyArrayDeviceToHost(h_data_, d_data_, n_current_ * k_); owns_cpu_data_ = true; break; default: @@ -202,8 +201,8 @@ namespace ReSolve { namespace vector { owns_cpu_data_ = true; } else { if (memspace == "cuda") { - deleteOnDevice(d_data_); - allocateArrayOnDevice(&d_data_, n_ * k_); + mem_.deleteOnDevice(d_data_); + mem_.allocateArrayOnDevice(&d_data_, n_ * k_); owns_gpu_data_ = true; } } @@ -223,10 +222,10 @@ namespace ReSolve { namespace vector { } else { if (memspace == "cuda") { if (d_data_ == nullptr) { - allocateArrayOnDevice(&d_data_, n_ * k_); + mem_.allocateArrayOnDevice(&d_data_, n_ * k_); owns_gpu_data_ = true; } - setZeroArrayOnDevice(d_data_, n_ * k_); + mem_.setZeroArrayOnDevice(d_data_, n_ * k_); } } } @@ -244,11 +243,11 @@ namespace ReSolve { namespace vector { } else { if (memspace == "cuda") { if (d_data_ == nullptr) { - allocateArrayOnDevice(&d_data_, n_ * k_); + mem_.allocateArrayOnDevice(&d_data_, n_ * k_); owns_gpu_data_ = true; } // TODO: We should not need to access raw data in this class - setZeroArrayOnDevice(&d_data_[j * n_current_], n_current_); + mem_.setZeroArrayOnDevice(&d_data_[j * n_current_], n_current_); } } } @@ -266,7 +265,7 @@ namespace ReSolve { namespace vector { } else { if (memspace == "cuda") { if (d_data_ == nullptr) { - allocateArrayOnDevice(&d_data_, n_ * k_); + mem_.allocateArrayOnDevice(&d_data_, n_ * k_); owns_gpu_data_ = true; } set_array_const(n_ * k_, C, d_data_); @@ -287,7 +286,7 @@ namespace ReSolve { namespace vector { } else { if (memspace == "cuda") { if (d_data_ == nullptr) { - allocateArrayOnDevice(&d_data_, n_ * k_); + mem_.allocateArrayOnDevice(&d_data_, n_ * k_); owns_gpu_data_ = true; } set_array_const(n_current_ * 1, C, &d_data_[n_current_ * j]); @@ -324,7 +323,7 @@ namespace ReSolve { namespace vector { std::memcpy(dest, data, n_current_ * sizeof(real_type)); } else { if (memspaceOut == "cuda") { - copyArrayDeviceToDevice(dest, data, n_current_); + mem_.copyArrayDeviceToDevice(dest, data, n_current_); } else { //error } @@ -340,7 +339,7 @@ namespace ReSolve { namespace vector { std::memcpy(dest, data, n_current_ * k_ * sizeof(real_type)); } else { if (memspaceOut == "cuda") { - copyArrayDeviceToDevice(dest, data, n_current_ * k_); + mem_.copyArrayDeviceToDevice(dest, data, n_current_ * k_); } else { //error } diff --git a/resolve/vector/Vector.hpp b/resolve/vector/Vector.hpp index c4d9ad80..9d1bd452 100644 --- a/resolve/vector/Vector.hpp +++ b/resolve/vector/Vector.hpp @@ -1,6 +1,7 @@ #pragma once #include #include +#include namespace ReSolve { namespace vector { class Vector @@ -42,5 +43,7 @@ namespace ReSolve { namespace vector { bool owns_gpu_data_{false}; bool owns_cpu_data_{false}; + + MemoryHandler mem_; ///< Device memory manager object }; }} // namespace ReSolve::vector diff --git a/resolve/vector/VectorHandler.cpp b/resolve/vector/VectorHandler.cpp index f695f533..032cc9b1 100644 --- a/resolve/vector/VectorHandler.cpp +++ b/resolve/vector/VectorHandler.cpp @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include "VectorHandler.hpp" diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 62841b10..ae24f9db 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -5,6 +5,10 @@ @author Slaven Peles ]] -add_subdirectory(functionality) + +if(RESOLVE_USE_CUDA) + add_subdirectory(functionality) +endif(RESOLVE_USE_CUDA) + add_subdirectory(unit) diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index f91c2ff7..d61250fe 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -6,6 +6,8 @@ ]] -add_subdirectory(matrix) -add_subdirectory(vector) -add_subdirectory(utilities) +if(RESOLVE_USE_CUDA) + add_subdirectory(matrix) + add_subdirectory(vector) + add_subdirectory(utilities) +endif(RESOLVE_USE_CUDA) diff --git a/tests/unit/vector/GramSchmidtTests.hpp b/tests/unit/vector/GramSchmidtTests.hpp index 2ed889db..6f0d305b 100644 --- a/tests/unit/vector/GramSchmidtTests.hpp +++ b/tests/unit/vector/GramSchmidtTests.hpp @@ -1,6 +1,7 @@ #pragma once #include #include +#include #include #include #include @@ -145,8 +146,9 @@ namespace ReSolve { << " Inner product computed: " << ip << ", expected: " << 0.0 << "\n"; break; } - if ( (i == j) && (abs(sqrt(ip)) != 1.0)) { + if ( (i == j) && !isEqual(abs(sqrt(ip)), 1.0)) { status = false; + std::cout << std::setprecision(16); std::cout << "Vector " << i << " has norm: " << sqrt(ip) << " expected: "<< 1.0 <<"\n"; break; }