From ea279a898c40fbad9d5a9adfb89b4bfd52a6854f Mon Sep 17 00:00:00 2001 From: Slaven Peles Date: Wed, 25 Oct 2023 22:18:26 -0400 Subject: [PATCH 01/12] First attempt at HIP implemetation --- CMakeLists.txt | 27 +++- cmake/ReSolveFindHipLibraries.cmake | 21 +++ examples/r_KLU_GLU.cpp | 7 +- examples/r_KLU_GLU_matrix_values_update.cpp | 7 +- examples/r_KLU_KLU.cpp | 7 +- examples/r_KLU_KLU_standalone.cpp | 7 +- examples/r_KLU_rf.cpp | 7 +- examples/r_KLU_rf_FGMRES.cpp | 3 + .../r_KLU_rf_FGMRES_reuse_factorization.cpp | 3 + resolve/CMakeLists.txt | 13 ++ resolve/MemoryUtils.hpp | 3 +- resolve/hip/CMakeLists.txt | 37 +++++ resolve/hip/HipMemory.hpp | 152 ++++++++++++++++++ resolve/hip/MemoryUtils.hip | 40 +++++ resolve/hip/hipVectorKernels.h | 57 +++++++ resolve/hip/hipVectorKernels.hip | 29 ++++ resolve/hip/hip_check_errors.hpp | 28 ++++ resolve/resolve_defs.hpp.in | 14 +- tests/functionality/testKLU.cpp | 5 +- tests/functionality/testKLU_GLU.cpp | 5 +- tests/functionality/testKLU_Rf.cpp | 5 +- tests/functionality/testKLU_Rf_FGMRES.cpp | 5 +- tests/unit/CMakeLists.txt | 1 + tests/unit/memory/CMakeLists.txt | 21 +++ tests/unit/memory/MemoryUtilsTests.hpp | 110 +++++++++++++ tests/unit/memory/runMemoryUtilsTests.cpp | 36 +++++ 26 files changed, 620 insertions(+), 30 deletions(-) create mode 100644 cmake/ReSolveFindHipLibraries.cmake create mode 100644 resolve/hip/CMakeLists.txt create mode 100644 resolve/hip/HipMemory.hpp create mode 100644 resolve/hip/MemoryUtils.hip create mode 100644 resolve/hip/hipVectorKernels.h create mode 100644 resolve/hip/hipVectorKernels.hip create mode 100644 resolve/hip/hip_check_errors.hpp create mode 100644 tests/unit/memory/CMakeLists.txt create mode 100644 tests/unit/memory/MemoryUtilsTests.hpp create mode 100644 tests/unit/memory/runMemoryUtilsTests.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 13d65cfa..9f802231 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,14 +23,17 @@ endif() option(RESOLVE_TEST_WITH_BSUB "Use 
`jsrun` instead of `mpirun` commands when running tests" OFF) option(RESOLVE_USE_KLU "Use KLU, AMD and COLAMD libraries from SuiteSparse" ON) -option(RESOLVE_USE_GPU "Use GPU device for computations" ON) -option(RESOLVE_USE_CUDA "Use CUDA language and SDK" ON) +option(RESOLVE_USE_GPU "Use GPU device for computations" OFF) +option(RESOLVE_USE_CUDA "Use CUDA language and SDK" OFF) +option(RESOLVE_USE_HIP "Use HIP language and ROCm library" OFF) set(RESOLVE_CTEST_OUTPUT_DIR ${PROJECT_BINARY_DIR} CACHE PATH "Directory where CTest outputs are saved") if(RESOLVE_USE_CUDA) - set(RESOLVE_USE_GPU On CACHE BOOL "Using GPU!" FORCE) -else() - set(RESOLVE_USE_GPU Off CACHE BOOL "Using GPU!" FORCE) + set(RESOLVE_USE_GPU On CACHE BOOL "Using CUDA GPU!" FORCE) +endif() + +if(RESOLVE_USE_HIP) + set(RESOLVE_USE_GPU On CACHE BOOL "Using HIP GPU!" FORCE) endif() @@ -89,6 +92,20 @@ else() message(STATUS "Not using CUDA") endif() +if(RESOLVE_USE_HIP) + enable_language(HIP) + check_language(HIP) + include(ReSolveFindHipLibraries) + + # This is just an ugly hack to make HIP build work + get_target_property(hip_includes hip::device INTERFACE_INCLUDE_DIRECTORIES) + message(STATUS "HIP include directories: ${hip_includes}") + include_directories(${hip_includes}) +else() + message(STATUS "Not using HIP") +endif(RESOLVE_USE_HIP) + + # The binary dir is already a global include directory configure_file( ${CMAKE_SOURCE_DIR}/resolve/resolve_defs.hpp.in diff --git a/cmake/ReSolveFindHipLibraries.cmake b/cmake/ReSolveFindHipLibraries.cmake new file mode 100644 index 00000000..83b7c220 --- /dev/null +++ b/cmake/ReSolveFindHipLibraries.cmake @@ -0,0 +1,21 @@ +# Exports target `resolve_hip` which finds all hip libraries needed by resolve.
+ + +add_library(resolve_hip INTERFACE) + +find_package(hip REQUIRED) +find_package(hipblas REQUIRED) + +target_link_libraries(resolve_hip INTERFACE + #hip::host + hip::device + #roc::hipblas +) + +# get_target_property(hip_includes hip::device INTERFACE_INCLUDE_DIRECTORIES) +# message(STATUS "HIP include directories: ${hip_includes}") + +# get_target_property(resolve_hip_includes resolve_hip INTERFACE_INCLUDE_DIRECTORIES) +# message(STATUS "ReSolve HIP include directories: ${resolve_hip_includes}") + +install(TARGETS resolve_hip EXPORT ReSolveTargets) diff --git a/examples/r_KLU_GLU.cpp b/examples/r_KLU_GLU.cpp index e2cbfde4..e7b19f4e 100644 --- a/examples/r_KLU_GLU.cpp +++ b/examples/r_KLU_GLU.cpp @@ -41,8 +41,8 @@ int main(int argc, char *argv[]) workspace_CUDA->initializeHandles(); ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace_CUDA); ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace_CUDA); - real_type* rhs; - real_type* x; + real_type* rhs = nullptr; + real_type* x = nullptr; vector_type* vec_rhs; vector_type* vec_x; @@ -159,7 +159,8 @@ int main(int argc, char *argv[]) delete A; delete KLU; delete GLU; - delete x; + delete [] x; + delete [] rhs; delete vec_r; delete vec_x; delete workspace_CUDA; diff --git a/examples/r_KLU_GLU_matrix_values_update.cpp b/examples/r_KLU_GLU_matrix_values_update.cpp index 7d1bb141..ee99f0a0 100644 --- a/examples/r_KLU_GLU_matrix_values_update.cpp +++ b/examples/r_KLU_GLU_matrix_values_update.cpp @@ -44,8 +44,8 @@ int main(int argc, char *argv[]) workspace_CUDA->initializeHandles(); ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace_CUDA); ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace_CUDA); - real_type* rhs; - real_type* x; + real_type* rhs = nullptr; + real_type* x = nullptr; vector_type* vec_rhs; vector_type* vec_x; @@ -170,7 +170,8 @@ int main(int argc, char *argv[]) delete A; delete KLU; delete GLU; - delete 
x; + delete [] x; + delete [] rhs; delete vec_r; delete vec_x; delete workspace_CUDA; diff --git a/examples/r_KLU_KLU.cpp b/examples/r_KLU_KLU.cpp index 8b0ea59a..b9328e8a 100644 --- a/examples/r_KLU_KLU.cpp +++ b/examples/r_KLU_KLU.cpp @@ -40,8 +40,8 @@ int main(int argc, char *argv[]) ReSolve::LinAlgWorkspaceCpu* workspace = new ReSolve::LinAlgWorkspaceCpu(); ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace); ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace); - real_type* rhs; - real_type* x; + real_type* rhs = nullptr; + real_type* x = nullptr; vector_type* vec_rhs; vector_type* vec_x; @@ -148,7 +148,8 @@ int main(int argc, char *argv[]) //now DELETE delete A; delete KLU; - delete x; + delete [] x; + delete [] rhs; delete vec_r; delete vec_x; delete matrix_handler; diff --git a/examples/r_KLU_KLU_standalone.cpp b/examples/r_KLU_KLU_standalone.cpp index 77e5b97a..0b8f6114 100644 --- a/examples/r_KLU_KLU_standalone.cpp +++ b/examples/r_KLU_KLU_standalone.cpp @@ -36,8 +36,8 @@ int main(int argc, char *argv[]) ReSolve::LinAlgWorkspaceCpu* workspace = new ReSolve::LinAlgWorkspaceCpu(); ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace); ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace); - real_type* rhs; - real_type* x; + real_type* rhs = nullptr; + real_type* x = nullptr; vector_type* vec_rhs; vector_type* vec_x; @@ -111,7 +111,8 @@ int main(int argc, char *argv[]) //now DELETE delete A; delete KLU; - delete x; + delete [] x; + delete [] rhs; delete vec_r; delete vec_x; delete matrix_handler; diff --git a/examples/r_KLU_rf.cpp b/examples/r_KLU_rf.cpp index 01fa0f3c..7369af18 100644 --- a/examples/r_KLU_rf.cpp +++ b/examples/r_KLU_rf.cpp @@ -42,8 +42,8 @@ int main(int argc, char *argv[] ) workspace_CUDA->initializeHandles(); ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace_CUDA); ReSolve::VectorHandler* vector_handler = new 
ReSolve::VectorHandler(workspace_CUDA); - real_type* rhs; - real_type* x; + real_type* rhs = nullptr; + real_type* x = nullptr; vector_type* vec_rhs; vector_type* vec_x; @@ -173,7 +173,8 @@ int main(int argc, char *argv[] ) delete A; delete KLU; delete Rf; - delete x; + delete [] x; + delete [] rhs; delete vec_r; delete vec_x; delete workspace_CUDA; diff --git a/examples/r_KLU_rf_FGMRES.cpp b/examples/r_KLU_rf_FGMRES.cpp index ee674869..07839cbb 100644 --- a/examples/r_KLU_rf_FGMRES.cpp +++ b/examples/r_KLU_rf_FGMRES.cpp @@ -189,5 +189,8 @@ int main(int argc, char *argv[]) } // for (int i = 0; i < numSystems; ++i) + delete [] x; + delete [] rhs; + return 0; } diff --git a/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp b/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp index 6a520a7a..56ab43fe 100644 --- a/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp +++ b/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp @@ -217,5 +217,8 @@ int main(int argc, char *argv[]) } + delete [] x; + delete [] rhs; + return 0; } diff --git a/resolve/CMakeLists.txt b/resolve/CMakeLists.txt index 8dbcc467..fa6c9cd5 100644 --- a/resolve/CMakeLists.txt +++ b/resolve/CMakeLists.txt @@ -48,6 +48,10 @@ if(RESOLVE_USE_CUDA) add_subdirectory(cuda) endif() +if(RESOLVE_USE_HIP) + add_subdirectory(hip) +endif() + # Now, build workspaces add_subdirectory(workspace) @@ -67,6 +71,10 @@ if(RESOLVE_USE_CUDA) target_link_libraries(resolve_tpl INTERFACE resolve_cuda) endif(RESOLVE_USE_CUDA) +if(RESOLVE_USE_HIP) + target_link_libraries(resolve_tpl INTERFACE resolve_hip) +endif(RESOLVE_USE_HIP) + set(ReSolve_Targets_List resolve_matrix @@ -82,6 +90,11 @@ if(RESOLVE_USE_CUDA) set(ReSolve_Targets_List ${ReSolve_Targets_List} resolve_backend_cuda) endif() +# If HIP support is enabled add HIP SDK specific code and dependencies +if(RESOLVE_USE_HIP) + set(ReSolve_Targets_List ${ReSolve_Targets_List} resolve_backend_hip) +endif() + # If no GPU support is enabled, link to dummy device backend if(NOT 
RESOLVE_USE_GPU) set(ReSolve_Targets_List ${ReSolve_Targets_List} resolve_backend_cpu) diff --git a/resolve/MemoryUtils.hpp b/resolve/MemoryUtils.hpp index 00f3d653..976279d9 100644 --- a/resolve/MemoryUtils.hpp +++ b/resolve/MemoryUtils.hpp @@ -55,7 +55,8 @@ namespace ReSolve #include using MemoryHandler = ReSolve::MemoryUtils; #elif defined RESOLVE_USE_HIP -#error HIP support requested, but not available! Probably a bug in CMake configuration. +#include +using MemoryHandler = ReSolve::MemoryUtils; #else #error Unrecognized device, probably bug in CMake configuration #endif diff --git a/resolve/hip/CMakeLists.txt b/resolve/hip/CMakeLists.txt new file mode 100644 index 00000000..f0a93b04 --- /dev/null +++ b/resolve/hip/CMakeLists.txt @@ -0,0 +1,37 @@ +#[[ + +@brief Build ReSolve HIP backend + +@author Slaven Peles + +]] + +set(ReSolve_HIP_SRC + # hipKernels.cu + hipVectorKernels.hip + MemoryUtils.hip +) + +set(ReSolve_HIP_HEADER_INSTALL + # hipKernels.h + # hipVectorKernels.h + HipMemory.hpp + # hip_check_errors.hpp +) + +set_source_files_properties(${ReSolve_HIP_SRC} PROPERTIES LANGUAGE HIP) + +# First create HIP backend +# (this should really be HIP _API_ backend, +# separate backend will be needed for HIP SDK) +add_library(resolve_backend_hip SHARED ${ReSolve_HIP_SRC}) +target_link_libraries(resolve_backend_hip PRIVATE resolve_logger) +target_link_libraries(resolve_backend_hip PUBLIC resolve_hip) +#target_include_directories(resolve_backend_hip PUBLIC ${hip_includes}) +target_include_directories(resolve_backend_hip INTERFACE + $ + $ +) + +# install include headers +install(FILES ${ReSolve_HIP_HEADER_INSTALL} DESTINATION include/resolve/hip) diff --git a/resolve/hip/HipMemory.hpp b/resolve/hip/HipMemory.hpp new file mode 100644 index 00000000..a6a482a5 --- /dev/null +++ b/resolve/hip/HipMemory.hpp @@ -0,0 +1,152 @@ +#pragma once + +#include +#include + +#include "hip_check_errors.hpp" + +namespace ReSolve +{ + namespace memory + { + /** + * @brief Class 
containing wrappers for HIP API functions. + * + * All wrappers are implemented as static functions returning integer + * error code from HIP API functions. + * + * @author Slaven Peles + */ + struct Hip + { + static void deviceSynchronize() + { + hipDeviceSynchronize(); + } + + static int getLastDeviceError() + { + return static_cast(hipGetLastError()); + } + + /** + * @brief deletes variable from device + * + * @param v - a variable on the device + * + * @post v is freed from the device + */ + static int deleteOnDevice(void* v) + { + return checkHipErrors(hipFree(v)); + } + + /** + * @brief allocates array v onto device + * + * @param v - pointer to the array to be allocated on the device + * @param n - number of array elements (int, size_t) + * + * @tparam T - Array element type + * @tparam I - Array index type + * + * @post v is now an array with size n on the device + */ + template + static int allocateArrayOnDevice(T** v, I n) + { + return checkHipErrors(hipMalloc((void**) v, sizeof(T) * n)); + } + + /** + * @brief allocates buffer v onto device. + * + * The difference from the array is that buffer size is required in bytes, + * not number of elements.
+ * + * @param v - pointer to the buffer to be allocated on the device + * @param n - size of the buffer in bytes + * + * @tparam T - Buffer element data type type (typically void) + * @tparam I - Buffer size type (typically size_t) + * + * @post v is now a buffer of n bytes + */ + template + static int allocateBufferOnDevice(T** v, I n) + { + return checkHipErrors(hipMalloc((void**) v, n)); + } + + /** + * @brief Sets elements of device array v to zero + * + * @param v - pointer to the array to be allocated on the device + * @param n - number of the array elements to be set to zero + * + * @tparam T - Array element type + * @tparam I - Array index type + * + * @post First n elements of array v are set to zero + */ + template + static int setZeroArrayOnDevice(T* v, I n) + { + return checkHipErrors(hipMemset(v, 0, sizeof(T) * n)); + } + + /** + * @brief Copies array `src` from device to the array `dst` on the host. + * + * @param[in] n - size of src array + * @param[in] src - array on device + * @param[out] dst - array on host + * + * @pre `src` is a pointer to an allocated array on the device + * @pre `dst` is allocated to size >= n on the host + * @post Content of `dst` is overwritten by the content of `src` + */ + template + static int copyArrayDeviceToHost(T* dst, const T* src, I n) + { + return checkHipErrors(hipMemcpy(dst, src, sizeof(T) * n, hipMemcpyDeviceToHost)); + } + + /** + * @brief Copies array `src` to the array `dst` on the device. 
+ * + * @param n - size of src array + * @param src - array on device to be copied + * @param dst - array on device to be copied onto + * + * @pre `src` is a pointer to an allocated array on the device + * @pre `dst` is allocated to size >= n on the device + * @post Content of `dst` is overwritten by the content of `src` + */ + template + static int copyArrayDeviceToDevice(T* dst, const T* src, I n) + { + return checkHipErrors(hipMemcpy(dst, src, sizeof(T) * n, hipMemcpyDeviceToDevice)); + } + + /** + * @brief Copies array `src` from the host to the array `dst` on the device. + * + * @param n - size of src array + * @param src - array on the host to be copied + * @param dst - array on the device to be copied onto + * + * @pre `src` is a pointer to an allocated array on the host + * @pre `dst` is allocated to size >= n on the device + * @post Content of `dst` is overwritten by the content of `src` + */ + template + static int copyArrayHostToDevice(T* dst, const T* src, I n) + { + return checkHipErrors(hipMemcpy(dst, src, sizeof(T) * n, hipMemcpyHostToDevice)); + } + + }; + } + +} //namespace ReSolve diff --git a/resolve/hip/MemoryUtils.hip b/resolve/hip/MemoryUtils.hip new file mode 100644 index 00000000..bd3c666d --- /dev/null +++ b/resolve/hip/MemoryUtils.hip @@ -0,0 +1,40 @@ +/** + * @file MemoryUtils.hip + * + * This file includes MemoryUtils.tpp and specifies what functions to + * instantiate from function templates.
+ * + * @author Slaven Peles + */ + + +#include + +#include +#include + +#include + +namespace ReSolve +{ + template void MemoryUtils::deviceSynchronize(); + template int MemoryUtils::getLastDeviceError(); + template int MemoryUtils::deleteOnDevice(void*); + + template int MemoryUtils::allocateArrayOnDevice( real_type**, index_type); + template int MemoryUtils::allocateArrayOnDevice(index_type**, index_type); + + template int MemoryUtils::allocateBufferOnDevice(void** v, size_t n); + + template int MemoryUtils::setZeroArrayOnDevice( real_type*, index_type); + + template int MemoryUtils::copyArrayDeviceToHost( real_type*, const real_type*, index_type); + template int MemoryUtils::copyArrayDeviceToHost(index_type*, const index_type*, index_type); + + template int MemoryUtils::copyArrayDeviceToDevice( real_type*, const real_type*, index_type); + template int MemoryUtils::copyArrayDeviceToDevice(index_type*, const index_type*, index_type); + + template int MemoryUtils::copyArrayHostToDevice( real_type*, const real_type*, index_type); + template int MemoryUtils::copyArrayHostToDevice(index_type*, const index_type*, index_type); + +} //namespace ReSolve diff --git a/resolve/hip/hipVectorKernels.h b/resolve/hip/hipVectorKernels.h new file mode 100644 index 00000000..cd23f822 --- /dev/null +++ b/resolve/hip/hipVectorKernels.h @@ -0,0 +1,57 @@ +#pragma once + +#include + +#include + +//***************************************************************************// +//**** See VectorKernels.hpp for kernel wrapper functions documentation ****// +//***************************************************************************// + +namespace ReSolve { namespace vector { + +namespace kernels { + // __global__ void adapt_diag_scale(index_type, index_type, real_type*, index_type*, index_type*, real_type*, index_type*, + // index_type*, real_type*, index_type*, index_type*, real_type*, real_type*, real_type*, real_type*); + + // __global__ void adapt_row_max(index_type, index_type, 
real_type*, index_type*, index_type*, real_type*, index_type*, + // index_type*, real_type*, index_type*, index_type*, real_type*); + + // __global__ void add_const(index_type, index_type, index_type*); + + /** + * @brief HIP kernel that sets values of an array to a constant. + * + * @param[in] n - length of the array + * @param[in] val - the value the array is set to + * @param[out] arr - a pointer to the array + * + * @pre `arr` is allocated to size `n` + * @post `arr` elements are set to `val` + */ + __global__ void set_const(index_type n, real_type val, real_type* arr); + + // __global__ void add_vecs(index_type, real_type*, real_type, real_type*); + + // __global__ void mult_const(index_type, real_type, real_type*); + + // __global__ void add_diag(index_type, real_type, index_type*, index_type*, real_type*); + + // __global__ void inv_vec_scale(index_type, real_type*, real_type*); + + // __global__ void vec_scale(index_type, real_type*, real_type*); + + // __global__ void concatenate(index_type, index_type, index_type, index_type, real_type*, index_type*, index_type*, + // real_type*, index_type*, index_type*, real_type*, index_type*, index_type*); + + // __global__ void row_scale(index_type, real_type*, index_type*, index_type*, real_type*, real_type*, + // real_type*, real_type*); + + // __global__ void diag_scale(index_type, index_type, real_type*, index_type*, index_type*, real_type*, index_type*, + // index_type*, real_type*, real_type*, real_type*, index_type); + + // __global__ void row_max(index_type, index_type, real_type*, index_type*, index_type*, real_type*, index_type*, index_type*, + // real_type* scale); +} // namespace kernels + +}} // namespace ReSolve::vector \ No newline at end of file diff --git a/resolve/hip/hipVectorKernels.hip b/resolve/hip/hipVectorKernels.hip new file mode 100644 index 00000000..3df2b84b --- /dev/null +++ b/resolve/hip/hipVectorKernels.hip @@ -0,0 +1,29 @@ +#include +#include + +#include "hipVectorKernels.h" +
+namespace ReSolve { namespace vector { + +namespace kernels { + +__global__ void set_const(index_type n, real_type val, real_type* arr) +{ + index_type i = blockIdx.x * blockDim.x + threadIdx.x; + if(i < n) + { + arr[i] = val; + } +} + +} // namespace kernels + +void set_array_const(index_type n, real_type val, real_type* arr) +{ + index_type num_blocks; + index_type block_size = 512; + num_blocks = (n + block_size - 1) / block_size; + kernels::set_const<<>>(n, val, arr); +} + +}} // namespace ReSolve::vector \ No newline at end of file diff --git a/resolve/hip/hip_check_errors.hpp b/resolve/hip/hip_check_errors.hpp new file mode 100644 index 00000000..1f483d35 --- /dev/null +++ b/resolve/hip/hip_check_errors.hpp @@ -0,0 +1,28 @@ +/** + * @file hip_check_errors.hpp + * + * Contains macro to get error code from HIP functions and to stream + * appropriate error output to Re::Solve's logger. + * + * @author Kasia Swirydowicz + * @author Slaven Peles + */ +#pragma once + +#include + +template +int check(T result, + char const *const func, + const char *const file, + int const line) +{ + if (result) { + ReSolve::io::Logger::error() << "HIP error in function " + << func << " at " << file << ":" << line + << ", error# " << result << "\n"; + return -1; + } + return 0; +} +#define checkHipErrors(val) check((val), #val, __FILE__, __LINE__) diff --git a/resolve/resolve_defs.hpp.in b/resolve/resolve_defs.hpp.in index 9756376c..15cd5791 100644 --- a/resolve/resolve_defs.hpp.in +++ b/resolve/resolve_defs.hpp.in @@ -1,4 +1,7 @@ -#pragma once +// #pragma once + +#ifndef __RESOLVE_DEFINITIONS_HPP__ +#define __RESOLVE_DEFINITIONS_HPP__ #cmakedefine RESOLVE_USE_GPU #cmakedefine RESOLVE_USE_CUDA @@ -14,3 +17,12 @@ // /// Date of build with the format "%Y-%m-%d" // #define RESOLVE_RELEASE_DATE "@RESOLVE_RELEASE_DATE@" + +#ifdef RESOLVE_USE_HIP +#ifndef __HIP_PLATFORM_AMD__ +#define __HIP_PLATFORM_AMD__ +#endif +#endif + + +#endif // __RESOLVE_DEFINITIONS_HPP__ \ No newline at end of
file diff --git a/tests/functionality/testKLU.cpp b/tests/functionality/testKLU.cpp index f3c1da57..b067f417 100644 --- a/tests/functionality/testKLU.cpp +++ b/tests/functionality/testKLU.cpp @@ -66,7 +66,7 @@ int main(int argc, char *argv[]) return -1; } real_type* rhs = ReSolve::io::readRhsFromFile(rhs1_file); - real_type* x = new real_type[A->getNumRows()]; + real_type* x = new real_type[A->getNumRows()]; vector_type* vec_rhs = new vector_type(A->getNumRows()); vector_type* vec_x = new vector_type(A->getNumRows()); vector_type* vec_r = new vector_type(A->getNumRows()); @@ -215,7 +215,8 @@ int main(int argc, char *argv[]) //now DELETE delete A; delete KLU; - delete x; + delete [] x; + delete [] rhs; delete vec_r; delete vec_x; delete matrix_handler; diff --git a/tests/functionality/testKLU_GLU.cpp b/tests/functionality/testKLU_GLU.cpp index 0e9bb4bd..ddaf3b31 100644 --- a/tests/functionality/testKLU_GLU.cpp +++ b/tests/functionality/testKLU_GLU.cpp @@ -72,7 +72,7 @@ int main(int argc, char *argv[]) return -1; } real_type* rhs = ReSolve::io::readRhsFromFile(rhs1_file); - real_type* x = new real_type[A->getNumRows()]; + real_type* x = new real_type[A->getNumRows()]; vector_type* vec_rhs = new vector_type(A->getNumRows()); vector_type* vec_x = new vector_type(A->getNumRows()); vec_x->allocate("cpu");//for KLU @@ -239,7 +239,8 @@ int main(int argc, char *argv[]) delete A; delete KLU; delete GLU; - delete x; + delete [] x; + delete [] rhs; delete vec_r; delete vec_x; delete workspace_CUDA; diff --git a/tests/functionality/testKLU_Rf.cpp b/tests/functionality/testKLU_Rf.cpp index 729968f5..124f07de 100644 --- a/tests/functionality/testKLU_Rf.cpp +++ b/tests/functionality/testKLU_Rf.cpp @@ -72,7 +72,7 @@ int main(int argc, char *argv[]) return -1; } real_type* rhs = ReSolve::io::readRhsFromFile(rhs1_file); - real_type* x = new real_type[A->getNumRows()]; + real_type* x = new real_type[A->getNumRows()]; vector_type* vec_rhs = new vector_type(A->getNumRows()); 
vector_type* vec_x = new vector_type(A->getNumRows()); vector_type* vec_r = new vector_type(A->getNumRows()); @@ -243,7 +243,8 @@ int main(int argc, char *argv[]) delete A; delete KLU; delete Rf; - delete x; + delete [] x; + delete [] rhs; delete vec_r; delete vec_x; delete workspace_CUDA; diff --git a/tests/functionality/testKLU_Rf_FGMRES.cpp b/tests/functionality/testKLU_Rf_FGMRES.cpp index a474e406..6601a3ee 100644 --- a/tests/functionality/testKLU_Rf_FGMRES.cpp +++ b/tests/functionality/testKLU_Rf_FGMRES.cpp @@ -77,7 +77,7 @@ int main(int argc, char *argv[]) return -1; } real_type* rhs = ReSolve::io::readRhsFromFile(rhs1_file); - real_type* x = new real_type[A->getNumRows()]; + real_type* x = new real_type[A->getNumRows()]; vector_type* vec_rhs = new vector_type(A->getNumRows()); vector_type* vec_x = new vector_type(A->getNumRows()); vector_type* vec_r = new vector_type(A->getNumRows()); @@ -264,7 +264,8 @@ int main(int argc, char *argv[]) delete GS; delete FGMRES; delete Rf; - delete x; + delete [] x; + delete [] rhs; delete vec_r; delete vec_x; delete workspace_CUDA; diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index f91c2ff7..a8586342 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -9,3 +9,4 @@ add_subdirectory(matrix) add_subdirectory(vector) add_subdirectory(utilities) +add_subdirectory(memory) diff --git a/tests/unit/memory/CMakeLists.txt b/tests/unit/memory/CMakeLists.txt new file mode 100644 index 00000000..01313e33 --- /dev/null +++ b/tests/unit/memory/CMakeLists.txt @@ -0,0 +1,21 @@ +#[[ + +@brief Build ReSolve memory utilities unit tests + +@author Slaven Peles + +]] + +# Build memory utilities tests +add_executable(runMemoryUtilsTests.exe runMemoryUtilsTests.cpp) +target_link_libraries(runMemoryUtilsTests.exe PRIVATE ReSolve) +message(STATUS "Resolve libraries: ${resolve_backend_hip}") + + +# Install tests +set(installable_tests runMemoryUtilsTests.exe) +install(TARGETS ${installable_tests} + RUNTIME 
DESTINATION bin/resolve/tests/unit) + +# Add tests to run +add_test(NAME memory_test COMMAND $) diff --git a/tests/unit/memory/MemoryUtilsTests.hpp b/tests/unit/memory/MemoryUtilsTests.hpp new file mode 100644 index 00000000..4cc1ace8 --- /dev/null +++ b/tests/unit/memory/MemoryUtilsTests.hpp @@ -0,0 +1,110 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include + +namespace ReSolve { namespace tests { + +/** + * @class Unit tests for memory handler class + */ +class MemoryUtilsTests : TestBase +{ +public: + MemoryUtilsTests(std::string memspace) : memspace_(memspace) + {} + virtual ~MemoryUtilsTests() + {} + + TestOutcome allocateAndDelete() + { + TestStatus status; + status = true; + + MemoryHandler mh; + + index_type n = 1000; + size_t m = 8000; + index_type* i = nullptr; + real_type* r = nullptr; + + mh.allocateArrayOnDevice(&i, n); + mh.allocateBufferOnDevice((void**) &r, m); + + status *= (i != nullptr); + status *= (r != nullptr); + + mh.deleteOnDevice(i); + mh.deleteOnDevice(r); + + return status.report(__func__); + } + + TestOutcome memsetAndMemcpy() + { + TestStatus status; + status = true; + + MemoryHandler mh; + + index_type n = 10; + + real_type zero = 0.0; + real_type minusone = -1.0; + + // Create raw arrays on the host and set their elements to -1 + real_type* array1 = new real_type[n]{0}; + real_type* array2 = new real_type[n]{0}; + std::fill_n(array1, n, minusone); + std::fill_n(array2, n, minusone); + + // Allocate arrays of size n on the device + real_type* devarray1 = nullptr; + real_type* devarray2 = nullptr; + mh.allocateArrayOnDevice(&devarray1, n); + mh.allocateArrayOnDevice(&devarray2, n); + + // Set devarray1 elements to 0 and copy it to array1 + mh.setZeroArrayOnDevice(devarray1, n); + mh.copyArrayDeviceToHost(array1, devarray1, n); + status *= verifyAnswer(array1, zero, n); + + // Copy array2 (values -1) to devarray2 and then devarray2 to array1 + mh.copyArrayHostToDevice(devarray2, array2, n); + 
mh.copyArrayDeviceToHost(array1, devarray2, n); + status *= verifyAnswer(array1, minusone, n); + + // Copy devarray1 (values 0) to devarray2 and then to array2 + mh.copyArrayDeviceToDevice(devarray2, devarray1, n); + mh.copyArrayDeviceToHost(array2, devarray2, n); + status *= verifyAnswer(array2, zero, n); + + return status.report(__func__); + } + + +private: + std::string memspace_{"cpu"}; + + bool verifyAnswer(real_type* x, real_type answer, index_type n) + { + bool status = true; + + for (index_type i = 0; i < n; ++i) { + if (!isEqual(x[i], answer)) { + status = false; + std::cout << "Solution vector element x[" << i << "] = " << x[i] + << ", expected: " << answer << "\n"; + break; + } + } + return status; + } + +}; // class MemoryUtilsTests + +}} // namespace ReSolve::tests diff --git a/tests/unit/memory/runMemoryUtilsTests.cpp b/tests/unit/memory/runMemoryUtilsTests.cpp new file mode 100644 index 00000000..00349c7c --- /dev/null +++ b/tests/unit/memory/runMemoryUtilsTests.cpp @@ -0,0 +1,36 @@ +#include +#include +#include + +#include "MemoryUtilsTests.hpp" + +int main(int, char**) +{ + ReSolve::tests::TestingResults result; + +#ifdef RESOLVE_USE_HIP + { + std::cout << "Running memory tests with HIP backend:\n"; + ReSolve::tests::MemoryUtilsTests test("hip"); + + result += test.allocateAndDelete(); + result += test.memsetAndMemcpy(); + + std::cout << "\n"; + } +#endif + +#ifdef RESOLVE_USE_CUDA + { + std::cout << "Running memory tests with CUDA backend:\n"; + ReSolve::tests::MemoryUtilsTests test("cuda"); + + result += test.allocateAndDelete(); + result += test.memsetAndMemcpy(); + + std::cout << "\n"; + } +#endif + + return result.summary(); +} From 3b1b78216bb9bf4579d9166bcd19bdeb45c9842b Mon Sep 17 00:00:00 2001 From: pelesh Date: Fri, 27 Oct 2023 21:56:34 -0400 Subject: [PATCH 02/12] First stab at hip linear algebra.
(#39) Co-authored-by: kswirydo --- cmake/ReSolveFindHipLibraries.cmake | 2 + resolve/hip/CMakeLists.txt | 4 +- resolve/hip/hipKernels.h | 14 ++ resolve/hip/hipKernels.hip | 167 ++++++++++++++ resolve/hip/hipVectorKernels.hip | 29 ++- resolve/matrix/CMakeLists.txt | 13 ++ resolve/matrix/Coo.cpp | 30 +-- resolve/matrix/Csc.cpp | 30 +-- resolve/matrix/Csr.cpp | 30 +-- resolve/matrix/MatrixHandler.cpp | 30 +++ resolve/matrix/MatrixHandler.hpp | 4 + resolve/matrix/MatrixHandlerHip.cpp | 154 +++++++++++++ resolve/matrix/MatrixHandlerHip.hpp | 60 +++++ resolve/vector/CMakeLists.txt | 16 ++ resolve/vector/Vector.cpp | 48 ++-- resolve/vector/VectorHandler.cpp | 52 ++++- resolve/vector/VectorHandler.hpp | 4 + resolve/vector/VectorHandlerHip.cpp | 236 ++++++++++++++++++++ resolve/vector/VectorHandlerHip.hpp | 57 +++++ resolve/workspace/CMakeLists.txt | 13 ++ resolve/workspace/LinAlgWorkspace.hpp | 4 + resolve/workspace/LinAlgWorkspaceHIP.cpp | 75 +++++++ resolve/workspace/LinAlgWorkspaceHIP.hpp | 52 +++++ tests/unit/matrix/CMakeLists.txt | 2 +- tests/unit/matrix/MatrixHandlerTests.hpp | 9 +- tests/unit/matrix/runMatrixHandlerTests.cpp | 12 + tests/unit/vector/VectorHandlerTests.hpp | 12 +- tests/unit/vector/runVectorHandlerTests.cpp | 17 ++ 28 files changed, 1077 insertions(+), 99 deletions(-) create mode 100644 resolve/hip/hipKernels.h create mode 100644 resolve/hip/hipKernels.hip create mode 100644 resolve/matrix/MatrixHandlerHip.cpp create mode 100644 resolve/matrix/MatrixHandlerHip.hpp create mode 100644 resolve/vector/VectorHandlerHip.cpp create mode 100644 resolve/vector/VectorHandlerHip.hpp create mode 100644 resolve/workspace/LinAlgWorkspaceHIP.cpp create mode 100644 resolve/workspace/LinAlgWorkspaceHIP.hpp diff --git a/cmake/ReSolveFindHipLibraries.cmake b/cmake/ReSolveFindHipLibraries.cmake index 83b7c220..e754da0d 100644 --- a/cmake/ReSolveFindHipLibraries.cmake +++ b/cmake/ReSolveFindHipLibraries.cmake @@ -9,6 +9,8 @@ find_package(hipblas REQUIRED) 
target_link_libraries(resolve_hip INTERFACE #hip::host hip::device + rocblas + rocsparse #roc::hipblas ) diff --git a/resolve/hip/CMakeLists.txt b/resolve/hip/CMakeLists.txt index f0a93b04..f8d7a457 100644 --- a/resolve/hip/CMakeLists.txt +++ b/resolve/hip/CMakeLists.txt @@ -7,14 +7,14 @@ ]] set(ReSolve_HIP_SRC - # hipKernels.cu + hipKernels.hip hipVectorKernels.hip MemoryUtils.hip ) set(ReSolve_HIP_HEADER_INSTALL # hipKernels.h - # hipVectorKernels.h + hipVectorKernels.h HipMemory.hpp # hip_check_errors.hpp ) diff --git a/resolve/hip/hipKernels.h b/resolve/hip/hipKernels.h new file mode 100644 index 00000000..9c48783a --- /dev/null +++ b/resolve/hip/hipKernels.h @@ -0,0 +1,14 @@ +void mass_inner_product_two_vectors(int n, + int i, + double* vec1, + double* vec2, + double* mvec, + double* result); +void mass_axpy(int n, int i, double* x, double* y, double* alpha); + +//needed for matrix inf nrm +void matrix_row_sums(int n, + int nnz, + int* a_ia, + double* a_val, + double* result); diff --git a/resolve/hip/hipKernels.hip b/resolve/hip/hipKernels.hip new file mode 100644 index 00000000..13f53d85 --- /dev/null +++ b/resolve/hip/hipKernels.hip @@ -0,0 +1,167 @@ +#include "hipKernels.h" +#define maxk 1024 +#define Tv5 1024 + +#include + +//computes V^T[u1 u2] where v is n x k and u1 and u2 are nx1 +__global__ void MassIPTwoVec_kernel(const double* __restrict__ u1, + const double* __restrict__ u2, + const double* __restrict__ v, + double* result, + const int k, + const int N) +{ + int t = threadIdx.x; + int bsize = blockDim.x; + + // assume T threads per thread block (and k reductions to be performed) + volatile __shared__ double s_tmp1[Tv5]; + + volatile __shared__ double s_tmp2[Tv5]; + // map between thread index space and the problem index space + int j = blockIdx.x; + s_tmp1[t] = 0.0f; + s_tmp2[t] = 0.0f; + int nn = t; + double can1, can2, cbn; + + while(nn < N) { + can1 = u1[nn]; + can2 = u2[nn]; + + cbn = v[N * j + nn]; + s_tmp1[t] += can1 * cbn; + s_tmp2[t] += 
can2 * cbn; + + nn += bsize; + } + + __syncthreads(); + + if(Tv5 >= 1024) { + if(t < 512) { + s_tmp1[t] += s_tmp1[t + 512]; + s_tmp2[t] += s_tmp2[t + 512]; + } + __syncthreads(); + } + if(Tv5 >= 512) { + if(t < 256) { + s_tmp1[t] += s_tmp1[t + 256]; + s_tmp2[t] += s_tmp2[t + 256]; + } + __syncthreads(); + } + { + if(t < 128) { + s_tmp1[t] += s_tmp1[t + 128]; + s_tmp2[t] += s_tmp2[t + 128]; + } + __syncthreads(); + } + { + if(t < 64) { + s_tmp1[t] += s_tmp1[t + 64]; + s_tmp2[t] += s_tmp2[t + 64]; + } + __syncthreads(); + } + + if(t < 32) { + s_tmp1[t] += s_tmp1[t + 32]; + s_tmp2[t] += s_tmp2[t + 32]; + + s_tmp1[t] += s_tmp1[t + 16]; + s_tmp2[t] += s_tmp2[t + 16]; + + s_tmp1[t] += s_tmp1[t + 8]; + s_tmp2[t] += s_tmp2[t + 8]; + + s_tmp1[t] += s_tmp1[t + 4]; + s_tmp2[t] += s_tmp2[t + 4]; + + s_tmp1[t] += s_tmp1[t + 2]; + s_tmp2[t] += s_tmp2[t + 2]; + + s_tmp1[t] += s_tmp1[t + 1]; + s_tmp2[t] += s_tmp2[t + 1]; + } + if(t == 0) { + result[blockIdx.x] = s_tmp1[0]; + result[blockIdx.x + k] = s_tmp2[0]; + } +} + + +//mass AXPY i.e y = y - x*alpha where alpha is [k x 1], needed in 1 and 2 synch GMRES + +__global__ void massAxpy3_kernel(int N, + int k, + const double* x_data, + double* y_data, + const double* alpha) { + + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + + unsigned int t = threadIdx.x; + + __shared__ double s_alpha[maxk]; + for(unsigned int jj = t; jj < k; jj += blockDim.x) { // strided load so all k entries are staged even when k > blockDim.x (requires k <= maxk) + s_alpha[jj] = alpha[jj]; + } + __syncthreads(); + while (i < N){ + double temp = 0.0; + for(int j = 0; j < k; ++j) { + temp += x_data[j * N + i] * s_alpha[j]; + } + y_data[i] -= temp; + i += (blockDim.x*gridDim.x); + } +} +__global__ void matrixInfNormPart1(const int n, + const int nnz, + const int* a_ia, + const double* a_val, + double* result) { + + // one thread per row, pass through rows + // and sum + // can be done through atomics + //\sum_{j=1}^m abs(a_{ij}) + + int idx = blockIdx.x*blockDim.x + threadIdx.x; + while (idx < n){ + double sum = 0.0f; + for (int i = a_ia[idx]; i < a_ia[idx+1]; ++i) { + sum = sum +
fabs(a_val[i]); + } + result[idx] = sum; + idx += (blockDim.x*gridDim.x); + } +} + + +void mass_inner_product_two_vectors(int n, + int i, + double* vec1, + double* vec2, + double* mvec, + double* result) +{ + hipLaunchKernelGGL(MassIPTwoVec_kernel, dim3(i + 1), dim3(1024), 0, 0, vec1, vec2, mvec, result, i + 1, n); +} +void mass_axpy(int n, int i, double* x, double* y, double* alpha) +{ + hipLaunchKernelGGL(massAxpy3_kernel, dim3((n + 384 - 1) / 384), dim3(384), 0, 0, n, i, x, y, alpha); +} + +void matrix_row_sums(int n, + int nnz, + int* a_ia, + double* a_val, + double* result) +{ + hipLaunchKernelGGL(matrixInfNormPart1,dim3(1000),dim3(1024), 0, 0, n, nnz, a_ia, a_val, result); +} diff --git a/resolve/hip/hipVectorKernels.hip b/resolve/hip/hipVectorKernels.hip index 3df2b84b..f68cd0b9 100644 --- a/resolve/hip/hipVectorKernels.hip +++ b/resolve/hip/hipVectorKernels.hip @@ -1,29 +1,28 @@ #include #include - -#include "hipVectorKernels.h" +#include namespace ReSolve { namespace vector { namespace kernels { -__global__ void set_const(index_type n, real_type val, real_type* arr) -{ - index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if(i < n) + __global__ void set_const(index_type n, real_type val, real_type* arr) { - arr[i] = val; + index_type i = blockIdx.x * blockDim.x + threadIdx.x; + while (i < n) + { + arr[i] = val; + i += blockDim.x * gridDim.x; + } } -} - } // namespace kernels -void set_array_const(index_type n, real_type val, real_type* arr) +void set_array_const(index_type n, real_type val, real_type* arr) { - index_type num_blocks; - index_type block_size = 512; - num_blocks = (n + block_size - 1) / block_size; - kernels::set_const<<>>(n, val, arr); + index_type num_blocks; + index_type block_size = 512; + num_blocks = (n + block_size - 1) / block_size; + hipLaunchKernelGGL( kernels::set_const, dim3(num_blocks), dim3(block_size), 0, 0, n, val, arr); } -}} // namespace ReSolve::vector \ No newline at end of file +}} // namespace ReSolve::vector diff 
--git a/resolve/matrix/CMakeLists.txt b/resolve/matrix/CMakeLists.txt index 554c0ba7..565fa7c9 100644 --- a/resolve/matrix/CMakeLists.txt +++ b/resolve/matrix/CMakeLists.txt @@ -22,6 +22,11 @@ set(Matrix_CUDASDK_SRC MatrixHandlerCuda.cpp ) +# and on HIP +set(Matrix_ROCM_SRC + MatrixHandlerHip.cpp +) + # Header files to be installed set(Matrix_HEADER_INSTALL io.hpp @@ -37,6 +42,10 @@ if(RESOLVE_USE_CUDA) set(Matrix_SRC ${Matrix_SRC} ${Matrix_CUDASDK_SRC}) endif() +if(RESOLVE_USE_HIP) + set(Matrix_SRC ${Matrix_SRC} ${Matrix_ROCM_SRC}) +endif() + # Build shared library ReSolve::matrix add_library(resolve_matrix SHARED ${Matrix_SRC}) @@ -47,6 +56,10 @@ if (RESOLVE_USE_CUDA) target_link_libraries(resolve_matrix PUBLIC resolve_backend_cuda) endif() +if (RESOLVE_USE_HIP) + target_link_libraries(resolve_matrix PUBLIC resolve_backend_hip) +endif() + # Link to dummy device backend if GPU support is not enabled if (NOT RESOLVE_USE_GPU) target_link_libraries(resolve_matrix PUBLIC resolve_backend_cpu) diff --git a/resolve/matrix/Coo.cpp b/resolve/matrix/Coo.cpp index c8caebf6..a91f94a9 100644 --- a/resolve/matrix/Coo.cpp +++ b/resolve/matrix/Coo.cpp @@ -33,8 +33,8 @@ namespace ReSolve copyData("cpu"); return this->h_row_data_; } else { - if (memspace == "cuda") { - copyData("cuda"); + if ((memspace == "cuda") || (memspace == "hip")) { + copyData(memspace); return this->d_row_data_; } else { return nullptr; @@ -48,8 +48,8 @@ namespace ReSolve copyData("cpu"); return this->h_col_data_; } else { - if (memspace == "cuda") { - copyData("cuda"); + if ((memspace == "cuda") || (memspace == "hip")) { + copyData(memspace); return this->d_col_data_; } else { return nullptr; @@ -63,8 +63,8 @@ namespace ReSolve copyData("cpu"); return this->h_val_data_; } else { - if (memspace == "cuda") { - copyData("cuda"); + if ((memspace == "cuda") || (memspace == "hip")) { + copyData(memspace); return this->d_val_data_; } else { return nullptr; @@ -81,9 +81,9 @@ namespace ReSolve setNotUpdated(); int 
control=-1; if ((memspaceIn == "cpu") && (memspaceOut == "cpu")){ control = 0;} - if ((memspaceIn == "cpu") && (memspaceOut == "cuda")){ control = 1;} - if ((memspaceIn == "cuda") && (memspaceOut == "cpu")){ control = 2;} - if ((memspaceIn == "cuda") && (memspaceOut == "cuda")){ control = 3;} + if ((memspaceIn == "cpu") && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 1;} + if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && (memspaceOut == "cpu")){ control = 2;} + if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 3;} if (memspaceOut == "cpu") { //check if cpu data allocated @@ -98,7 +98,7 @@ namespace ReSolve } } - if (memspaceOut == "cuda") { + if ((memspaceOut == "cuda") || (memspaceOut == "hip")) { //check if cuda data allocated if (d_row_data_ == nullptr) { mem_.allocateArrayOnDevice(&d_row_data_, nnz_current); @@ -120,7 +120,7 @@ namespace ReSolve owns_cpu_data_ = true; owns_cpu_vals_ = true; break; - case 2://cuda->cpu + case 2://gpu->cpu mem_.copyArrayDeviceToHost(h_row_data_, row_data, nnz_current); mem_.copyArrayDeviceToHost(h_col_data_, col_data, nnz_current); mem_.copyArrayDeviceToHost(h_val_data_, val_data, nnz_current); @@ -128,7 +128,7 @@ namespace ReSolve owns_cpu_data_ = true; owns_cpu_vals_ = true; break; - case 1://cpu->cuda + case 1://cpu->gpu mem_.copyArrayHostToDevice(d_row_data_, row_data, nnz_current); mem_.copyArrayHostToDevice(d_col_data_, col_data, nnz_current); mem_.copyArrayHostToDevice(d_val_data_, val_data, nnz_current); @@ -136,7 +136,7 @@ namespace ReSolve owns_gpu_data_ = true; owns_gpu_vals_ = true; break; - case 3://cuda->cuda + case 3://gpu->gpu mem_.copyArrayDeviceToDevice(d_row_data_, row_data, nnz_current); mem_.copyArrayDeviceToDevice(d_col_data_, col_data, nnz_current); mem_.copyArrayDeviceToDevice(d_val_data_, val_data, nnz_current); @@ -176,7 +176,7 @@ namespace ReSolve return 0; } - if (memspace ==
"cuda") || (memspace == "hip")) { mem_.allocateArrayOnDevice(&d_row_data_, nnz_current); mem_.allocateArrayOnDevice(&d_col_data_, nnz_current); mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); @@ -215,7 +215,7 @@ namespace ReSolve return 0; } - if (memspaceOut == "cuda") { + if ((memspaceOut == "cuda") || (memspaceOut == "hip")) { if ((d_data_updated_ == false) && (h_data_updated_ == true)) { if (d_row_data_ == nullptr) { mem_.allocateArrayOnDevice(&d_row_data_, nnz_current); diff --git a/resolve/matrix/Csc.cpp b/resolve/matrix/Csc.cpp index 1a305e03..e2ea765f 100644 --- a/resolve/matrix/Csc.cpp +++ b/resolve/matrix/Csc.cpp @@ -30,8 +30,8 @@ namespace ReSolve copyData("cpu"); return this->h_row_data_; } else { - if (memspace == "cuda") { - copyData("cuda"); + if ((memspace == "cuda") || (memspace == "hip")) { + copyData(memspace); return this->d_row_data_; } else { return nullptr; @@ -45,8 +45,8 @@ namespace ReSolve copyData("cpu"); return this->h_col_data_; } else { - if (memspace == "cuda") { - copyData("cuda"); + if ((memspace == "cuda") || (memspace == "hip")) { + copyData(memspace); return this->d_col_data_; } else { return nullptr; @@ -60,8 +60,8 @@ namespace ReSolve copyData("cpu"); return this->h_val_data_; } else { - if (memspace == "cuda") { - copyData("cuda"); + if ((memspace == "cuda") || (memspace == "hip")) { + copyData(memspace); return this->d_val_data_; } else { return nullptr; @@ -77,9 +77,9 @@ namespace ReSolve int control=-1; setNotUpdated(); if ((memspaceIn == "cpu") && (memspaceOut == "cpu")){ control = 0;} - if ((memspaceIn == "cpu") && (memspaceOut == "cuda")){ control = 1;} - if ((memspaceIn == "cuda") && (memspaceOut == "cpu")){ control = 2;} - if ((memspaceIn == "cuda") && (memspaceOut == "cuda")){ control = 3;} + if ((memspaceIn == "cpu") && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 1;} + if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && (memspaceOut == "cpu")){ control = 2;} + if (((memspaceIn == 
"cuda") || (memspaceIn == "hip")) && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 3;} if (memspaceOut == "cpu") { //check if cpu data allocated @@ -94,7 +94,7 @@ namespace ReSolve } } - if (memspaceOut == "cuda") { + if ((memspaceOut == "cuda") || (memspaceOut == "hip")) { //check if cuda data allocated if (d_col_data_ == nullptr) { mem_.allocateArrayOnDevice(&d_col_data_, n_ + 1); @@ -116,7 +116,7 @@ namespace ReSolve owns_cpu_data_ = true; owns_cpu_vals_ = true; break; - case 2://cuda->cpu + case 2://gpu->cpu mem_.copyArrayDeviceToHost(h_col_data_, col_data, n_ + 1); mem_.copyArrayDeviceToHost(h_row_data_, row_data, nnz_current); mem_.copyArrayDeviceToHost(h_val_data_, val_data, nnz_current); @@ -124,7 +124,7 @@ namespace ReSolve owns_cpu_data_ = true; owns_cpu_vals_ = true; break; - case 1://cpu->cuda + case 1://cpu->gpu mem_.copyArrayHostToDevice(d_col_data_, col_data, n_ + 1); mem_.copyArrayHostToDevice(d_row_data_, row_data, nnz_current); mem_.copyArrayHostToDevice(d_val_data_, val_data, nnz_current); @@ -132,7 +132,7 @@ namespace ReSolve owns_gpu_data_ = true; owns_gpu_vals_ = true; break; - case 3://cuda->cuda + case 3://gpu->gpu mem_.copyArrayDeviceToDevice(d_col_data_, col_data, n_ + 1); mem_.copyArrayDeviceToDevice(d_row_data_, row_data, nnz_current); mem_.copyArrayDeviceToDevice(d_val_data_, val_data, nnz_current); @@ -173,7 +173,7 @@ namespace ReSolve return 0; } - if (memspace == "cuda") { + if ((memspace == "cuda") || (memspace == "hip")) { mem_.allocateArrayOnDevice(&d_col_data_, n_ + 1); mem_.allocateArrayOnDevice(&d_row_data_, nnz_current); mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); @@ -212,7 +212,7 @@ namespace ReSolve return 0; } - if (memspaceOut == "cuda") { + if ((memspaceOut == "cuda") || (memspaceOut == "hip")) { if ((d_data_updated_ == false) && (h_data_updated_ == true)) { if (d_col_data_ == nullptr) { mem_.allocateArrayOnDevice(&d_col_data_, n_ + 1); diff --git a/resolve/matrix/Csr.cpp 
b/resolve/matrix/Csr.cpp index f1ddd31f..dff33b48 100644 --- a/resolve/matrix/Csr.cpp +++ b/resolve/matrix/Csr.cpp @@ -30,8 +30,8 @@ namespace ReSolve copyData("cpu"); return this->h_row_data_; } else { - if (memspace == "cuda") { - copyData("cuda"); + if ((memspace == "cuda") || (memspace == "hip")) { + copyData(memspace); return this->d_row_data_; } else { return nullptr; @@ -45,8 +45,8 @@ namespace ReSolve copyData("cpu"); return this->h_col_data_; } else { - if (memspace == "cuda") { - copyData("cuda"); + if ((memspace == "cuda") || (memspace == "hip")) { + copyData(memspace); return this->d_col_data_; } else { return nullptr; @@ -60,8 +60,8 @@ namespace ReSolve copyData("cpu"); return this->h_val_data_; } else { - if (memspace == "cuda") { - copyData("cuda"); + if ((memspace == "cuda") || (memspace == "hip")) { + copyData(memspace); return this->d_val_data_; } else { return nullptr; @@ -77,9 +77,9 @@ namespace ReSolve setNotUpdated(); int control = -1; if ((memspaceIn == "cpu") && (memspaceOut == "cpu")){ control = 0;} - if ((memspaceIn == "cpu") && (memspaceOut == "cuda")){ control = 1;} - if ((memspaceIn == "cuda") && (memspaceOut == "cpu")){ control = 2;} - if ((memspaceIn == "cuda") && (memspaceOut == "cuda")){ control = 3;} + if ((memspaceIn == "cpu") && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 1;} + if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && (memspaceOut == "cpu")){ control = 2;} + if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 3;} if (memspaceOut == "cpu") { //check if cpu data allocated @@ -94,7 +94,7 @@ namespace ReSolve } } - if (memspaceOut == "cuda") { + if ((memspaceOut == "cuda") || (memspaceOut == "hip")) { //check if cuda data allocated if (d_row_data_ == nullptr) { mem_.allocateArrayOnDevice(&d_row_data_, n_ + 1); @@ -118,7 +118,7 @@ namespace ReSolve owns_cpu_data_ = true; owns_cpu_vals_ = true; break; - case 2://cuda->cpu + case 
2://gpu->cpu mem_.copyArrayDeviceToHost(h_row_data_, row_data, n_ + 1); mem_.copyArrayDeviceToHost(h_col_data_, col_data, nnz_current); mem_.copyArrayDeviceToHost(h_val_data_, val_data, nnz_current); @@ -126,7 +126,7 @@ namespace ReSolve owns_cpu_data_ = true; owns_cpu_vals_ = true; break; - case 1://cpu->cuda + case 1://cpu->gpu mem_.copyArrayHostToDevice(d_row_data_, row_data, n_ + 1); mem_.copyArrayHostToDevice(d_col_data_, col_data, nnz_current); mem_.copyArrayHostToDevice(d_val_data_, val_data, nnz_current); @@ -134,7 +134,7 @@ namespace ReSolve owns_gpu_data_ = true; owns_gpu_vals_ = true; break; - case 3://cuda->cuda + case 3://gpu->gpu mem_.copyArrayDeviceToDevice(d_row_data_, row_data, n_ + 1); mem_.copyArrayDeviceToDevice(d_col_data_, col_data, nnz_current); mem_.copyArrayDeviceToDevice(d_val_data_, val_data, nnz_current); @@ -174,7 +174,7 @@ namespace ReSolve return 0; } - if (memspace == "cuda") { + if ((memspace == "cuda") || (memspace == "hip")) { mem_.allocateArrayOnDevice(&d_row_data_, n_ + 1); mem_.allocateArrayOnDevice(&d_col_data_, nnz_current); mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); @@ -212,7 +212,7 @@ namespace ReSolve return 0; } - if (memspaceOut == "cuda") { + if ((memspaceOut == "cuda") || (memspaceOut == "hip")) { if ((d_data_updated_ == false) && (h_data_updated_ == true)) { if (d_row_data_ == nullptr) { mem_.allocateArrayOnDevice(&d_row_data_, n_ + 1); diff --git a/resolve/matrix/MatrixHandler.cpp b/resolve/matrix/MatrixHandler.cpp index 8bf4302c..133a09f9 100644 --- a/resolve/matrix/MatrixHandler.cpp +++ b/resolve/matrix/MatrixHandler.cpp @@ -13,6 +13,9 @@ #ifdef RESOLVE_USE_CUDA #include "MatrixHandlerCuda.hpp" #endif +#ifdef RESOLVE_USE_HIP +#include "MatrixHandlerHip.hpp" +#endif namespace ReSolve { // Create a shortcut name for Logger static class @@ -41,6 +44,7 @@ namespace ReSolve { { if (isCpuEnabled_) delete cpuImpl_; if (isCudaEnabled_) delete cudaImpl_; + if (isHipEnabled_) delete hipImpl_; } /** @@ -74,12 
+78,31 @@ namespace ReSolve { } #endif +#ifdef RESOLVE_USE_HIP + /** + * @brief Constructor taking pointer to the HIP workspace as its parameter. + * + * @post A CPU implementation instance is created because it is cheap and + * it does not require a workspace. + * + * @post A HIP implementation instance is created with supplied workspace. + */ + MatrixHandler::MatrixHandler(LinAlgWorkspaceHIP* new_workspace) + { + cpuImpl_ = new MatrixHandlerCpu(); + hipImpl_ = new MatrixHandlerHip(new_workspace); + isCpuEnabled_ = true; + isHipEnabled_ = true; + } +#endif void MatrixHandler::setValuesChanged(bool isValuesChanged, std::string memspace) { if (memspace == "cpu") { cpuImpl_->setValuesChanged(isValuesChanged); } else if (memspace == "cuda") { cudaImpl_->setValuesChanged(isValuesChanged); + } else if (memspace == "hip") { + hipImpl_->setValuesChanged(isValuesChanged); } else { out::error() << "Unsupported device " << memspace << "\n"; } @@ -230,6 +253,8 @@ namespace ReSolve { } else { if (memspace == "cuda"){ A_csr->updateData(csr_ia, csr_ja, csr_a, "cpu", "cuda"); + } else if (memspace == "hip"){ + A_csr->updateData(csr_ia, csr_ja, csr_a, "cpu", "hip"); } else { //display error } @@ -269,6 +294,9 @@ namespace ReSolve { return cudaImpl_->matvec(A, vec_x, vec_result, alpha, beta, matrixFormat); } else if (memspace == "cpu") { return cpuImpl_->matvec(A, vec_x, vec_result, alpha, beta, matrixFormat); + } else if (memspace == "hip") { + // Dispatch to the rocSPARSE-backed HIP implementation. + return hipImpl_->matvec(A, vec_x, vec_result, alpha, beta, matrixFormat); } else { out::error() << "Support for device " << memspace << " not implemented (yet)" << std::endl; return 1; @@ -280,6 +308,8 @@ namespace ReSolve { { if (memspace == "cuda") { return cudaImpl_->csc2csr(A_csc, A_csr); + } else if (memspace == "hip") { + return hipImpl_->csc2csr(A_csc, A_csr); } else if (memspace == "cpu") { out::warning() << "Using untested csc2csr on CPU ..."
<< std::endl; return cpuImpl_->csc2csr(A_csc, A_csr); diff --git a/resolve/matrix/MatrixHandler.hpp b/resolve/matrix/MatrixHandler.hpp index 398a8039..cec61085 100644 --- a/resolve/matrix/MatrixHandler.hpp +++ b/resolve/matrix/MatrixHandler.hpp @@ -18,6 +18,7 @@ namespace ReSolve } class LinAlgWorkspaceCpu; class LinAlgWorkspaceCUDA; + class LinAlgWorkspaceHIP; class MatrixHandlerImpl; } @@ -48,6 +49,7 @@ namespace ReSolve { MatrixHandler(); MatrixHandler(LinAlgWorkspaceCpu* workspace); MatrixHandler(LinAlgWorkspaceCUDA* workspace); + MatrixHandler(LinAlgWorkspaceHIP* workspace); ~MatrixHandler(); int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr, std::string memspace); @@ -70,9 +72,11 @@ namespace ReSolve { MemoryHandler mem_; ///< Device memory manager object MatrixHandlerImpl* cpuImpl_{nullptr}; ///< Pointer to CPU implementation MatrixHandlerImpl* cudaImpl_{nullptr}; ///< Pointer to CUDA implementation + MatrixHandlerImpl* hipImpl_{nullptr}; ///< Pointer to HIP implementation bool isCpuEnabled_{false}; ///< true if CPU implementation is instantiated bool isCudaEnabled_{false}; ///< true if CUDA implementation is instantiated + bool isHipEnabled_{false}; ///< true if HIP implementation is instantiated }; } // namespace ReSolve diff --git a/resolve/matrix/MatrixHandlerHip.cpp b/resolve/matrix/MatrixHandlerHip.cpp new file mode 100644 index 00000000..370849fa --- /dev/null +++ b/resolve/matrix/MatrixHandlerHip.cpp @@ -0,0 +1,154 @@ +#include + +#include +#include +#include +#include +#include +#include +#include "MatrixHandlerHip.hpp" + +namespace ReSolve { + // Create a shortcut name for Logger static class + using out = io::Logger; + + MatrixHandlerHip::~MatrixHandlerHip() + { + } + + MatrixHandlerHip::MatrixHandlerHip(LinAlgWorkspaceHIP* new_workspace) + { + workspace_ = new_workspace; + } + + void MatrixHandlerHip::setValuesChanged(bool values_changed) + { + values_changed_ = values_changed; + } + + + int MatrixHandlerHip::matvec(matrix::Sparse* Ageneric, + 
vector_type* vec_x, + vector_type* vec_result, + const real_type* alpha, + const real_type* beta, + std::string matrixFormat) + { + using namespace constants; + int error_sum = 0; + if (matrixFormat == "csr") { + matrix::Csr* A = dynamic_cast<matrix::Csr*>(Ageneric); + //result = alpha *A*x + beta * result + rocsparse_status status; + LinAlgWorkspaceHIP* workspaceHIP = workspace_; + + rocsparse_handle handle_rocsparse = workspaceHIP->getRocsparseHandle(); + + rocsparse_mat_info infoA = workspaceHIP->getSpmvMatrixInfo(); + rocsparse_mat_descr descrA = workspaceHIP->getSpmvMatrixDescriptor(); + + if (!workspaceHIP->matvecSetup()) { + //setup first, allocate, etc. NOTE(review): descrA/infoA are created into local copies here - confirm the workspace ends up holding them for later calls + + rocsparse_create_mat_descr(&(descrA)); + rocsparse_set_mat_index_base(descrA, rocsparse_index_base_zero); + rocsparse_set_mat_type(descrA, rocsparse_matrix_type_general); + + rocsparse_create_mat_info(&infoA); + + status = rocsparse_dcsrmv_analysis(handle_rocsparse, + rocsparse_operation_none, + A->getNumRows(), + A->getNumColumns(), + A->getNnzExpanded(), + descrA, + A->getValues("cuda"), + A->getRowData("cuda"), + A->getColData("cuda"), // cuda is used as "device" + infoA); + error_sum += status; + mem_.deviceSynchronize(); + + workspaceHIP->matvecSetupDone(); + } + + status = rocsparse_dcsrmv(handle_rocsparse, + rocsparse_operation_none, + A->getNumRows(), + A->getNumColumns(), + A->getNnzExpanded(), + alpha, + descrA, + A->getValues("cuda"), + A->getRowData("cuda"), + A->getColData("cuda"), + infoA, + vec_x->getData("cuda"), + beta, + vec_result->getData("cuda")); + + error_sum += status; + mem_.deviceSynchronize(); + if (status) + out::error() << "Matvec status: " << status + << ", last error code: " << mem_.getLastDeviceError() << std::endl; + vec_result->setDataUpdated("cuda"); + + return error_sum; + } else { + out::error() << "MatVec not implemented (yet) for " + << matrixFormat << " matrix format."
<< std::endl; + return 1; + } + } + + int MatrixHandlerHip::Matrix1Norm(matrix::Sparse* /* A */, real_type* /* norm */) + { + return -1; + } + + int MatrixHandlerHip::csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr) + { + index_type error_sum = 0; + LinAlgWorkspaceHIP* workspaceHIP = workspace_; + + rocsparse_status status; + + A_csr->allocateMatrixData("cuda"); + index_type n = A_csc->getNumRows(); + index_type m = A_csc->getNumRows(); + index_type nnz = A_csc->getNnz(); + size_t bufferSize; + void* d_work; + + status = rocsparse_csr2csc_buffer_size(workspaceHIP->getRocsparseHandle(), + n, + m, + nnz, + A_csc->getColData("cuda"), + A_csc->getRowData("cuda"), + rocsparse_action_numeric, + &bufferSize); + + error_sum += status; + mem_.allocateBufferOnDevice(&d_work, bufferSize); + + status = rocsparse_dcsr2csc(workspaceHIP->getRocsparseHandle(), + n, + m, + nnz, + A_csc->getValues("cuda"), + A_csc->getColData("cuda"), + A_csc->getRowData("cuda"), + A_csr->getValues("cuda"), + A_csr->getColData("cuda"), // rocSPARSE csc_row_ind slot (nnz entries) <- CSR column indices + A_csr->getRowData("cuda"), // rocSPARSE csc_col_ptr slot <- CSR row pointers + rocsparse_action_numeric, + rocsparse_index_base_zero, + d_work); + error_sum += status; + mem_.deleteOnDevice(d_work); // release the scratch buffer before returning + return error_sum; + } + +} // namespace ReSolve diff --git a/resolve/matrix/MatrixHandlerHip.hpp b/resolve/matrix/MatrixHandlerHip.hpp new file mode 100644 index 00000000..7f06f3bd --- /dev/null +++ b/resolve/matrix/MatrixHandlerHip.hpp @@ -0,0 +1,60 @@ +#pragma once +#include +#include +#include + +namespace ReSolve +{ + namespace vector + { + class Vector; + } + namespace matrix + { + class Sparse; + class Coo; + class Csc; + class Csr; + } + class LinAlgWorkspaceHIP; +} + + +namespace ReSolve { + /** + * @class MatrixHandlerHip + * + * @brief HIP implementation of the matrix handler.
+ */ + class MatrixHandlerHip : public MatrixHandlerImpl + { + using vector_type = vector::Vector; + + public: + + MatrixHandlerHip(LinAlgWorkspaceHIP* workspace); + virtual ~MatrixHandlerHip(); + + int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr); + + virtual int matvec(matrix::Sparse* A, + vector_type* vec_x, + vector_type* vec_result, + const real_type* alpha, + const real_type* beta, + std::string matrix_type); + + virtual int Matrix1Norm(matrix::Sparse *A, real_type* norm); + + void setValuesChanged(bool isValuesChanged); + + private: + + LinAlgWorkspaceHIP* workspace_{nullptr}; + bool values_changed_{true}; ///< needed for matvec + + MemoryHandler mem_; ///< Device memory manager object + }; + +} // namespace ReSolve + diff --git a/resolve/vector/CMakeLists.txt b/resolve/vector/CMakeLists.txt index 16d53010..89b1abc8 100644 --- a/resolve/vector/CMakeLists.txt +++ b/resolve/vector/CMakeLists.txt @@ -18,6 +18,13 @@ set(Vector_CUDASDK_SRC VectorHandlerCuda.cpp ) +#and hip + +set(Vector_ROCM_SRC + VectorHandlerHip.cpp +) + + # Header files to be installed set(Vector_HEADER_INSTALL Vector.hpp @@ -30,6 +37,11 @@ if(RESOLVE_USE_CUDA) set(Vector_SRC ${Vector_SRC} ${Vector_CUDASDK_SRC}) endif() +# and hip +if(RESOLVE_USE_HIP) + set(Vector_SRC ${Vector_SRC} ${Vector_ROCM_SRC}) +endif() + add_library(resolve_vector SHARED ${Vector_SRC}) target_link_libraries(resolve_vector PRIVATE resolve_logger) @@ -38,6 +50,10 @@ if (RESOLVE_USE_CUDA) target_link_libraries(resolve_vector PUBLIC resolve_backend_cuda) endif() +if (RESOLVE_USE_HIP) + target_link_libraries(resolve_vector PUBLIC resolve_backend_hip) +endif() + # If no GPU is enabled link to dummy device backend if(NOT RESOLVE_USE_GPU) target_link_libraries(resolve_vector PUBLIC resolve_backend_cpu) diff --git a/resolve/vector/Vector.cpp b/resolve/vector/Vector.cpp index 7934e8b0..37779ea5 100644 --- a/resolve/vector/Vector.cpp +++ b/resolve/vector/Vector.cpp @@ -60,7 +60,7 @@ namespace ReSolve { namespace vector { 
cpu_updated_ = true; gpu_updated_ = false; } else { - if (memspace == "cuda") { + if ((memspace == "cuda") || (memspace == "hip")) { d_data_ = data; gpu_updated_ = true; cpu_updated_ = false; @@ -76,7 +76,7 @@ namespace ReSolve { namespace vector { cpu_updated_ = true; gpu_updated_ = false; } else { - if (memspace == "cuda") { + if ((memspace == "cuda") || (memspace == "hip")) { gpu_updated_ = true; cpu_updated_ = false; } else { @@ -89,15 +89,15 @@ namespace ReSolve { namespace vector { { int control=-1; if ((memspaceIn == "cpu") && (memspaceOut == "cpu")){ control = 0;} - if ((memspaceIn == "cpu") && (memspaceOut == "cuda")){ control = 1;} - if ((memspaceIn == "cuda") && (memspaceOut == "cpu")){ control = 2;} - if ((memspaceIn == "cuda") && (memspaceOut == "cuda")){ control = 3;} + if ((memspaceIn == "cpu") && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 1;} + if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && (memspaceOut == "cpu")){ control = 2;} + if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 3;} if ((memspaceOut == "cpu") && (h_data_ == nullptr)){ //allocate first h_data_ = new real_type[n_ * k_]; } - if ((memspaceOut == "cuda") && (d_data_ == nullptr)){ + if (((memspaceOut == "cuda") || (memspaceOut == "hip")) && (d_data_ == nullptr)){ //allocate first mem_.allocateArrayOnDevice(&d_data_, n_ * k_); } @@ -109,19 +109,19 @@ namespace ReSolve { namespace vector { cpu_updated_ = true; gpu_updated_ = false; break; - case 2: //cuda->cpu + case 2: //gpu->cpu mem_.copyArrayDeviceToHost(h_data_, data, n_current_ * k_); owns_gpu_data_ = true; cpu_updated_ = true; gpu_updated_ = false; break; - case 1: //cpu->cuda + case 1: //cpu->gpu mem_.copyArrayHostToDevice(d_data_, data, n_current_ * k_); owns_gpu_data_ = true; gpu_updated_ = true; cpu_updated_ = false; break; - case 3: //cuda->cuda + case 3: //gpu->gpu mem_.copyArrayDeviceToDevice(d_data_, data, n_current_ * k_); 
owns_gpu_data_ = true; gpu_updated_ = true; @@ -141,18 +141,18 @@ namespace ReSolve { namespace vector { real_type* Vector::getData(index_type i, std::string memspace) { if ((memspace == "cpu") && (cpu_updated_ == false) && (gpu_updated_ == true )) { - copyData("cuda", "cpu"); + copyData("cuda", "cpu"); // source must be a device tag: ("cpu", "cpu") matches no copy direction in copyData owns_cpu_data_ = true; } - if ((memspace == "cuda") && (gpu_updated_ == false) && (cpu_updated_ == true )) { - copyData("cpu", "cuda"); + if (((memspace == "cuda") || (memspace == "hip")) && (gpu_updated_ == false) && (cpu_updated_ == true )) { + copyData("cpu", memspace); owns_gpu_data_ = true; } if (memspace == "cpu") { return &h_data_[i * n_current_]; } else { - if (memspace == "cuda"){ + if ((memspace == "cuda") || (memspace == "hip")){ return &d_data_[i * n_current_]; } else { return nullptr; @@ -164,14 +164,14 @@ namespace ReSolve { namespace vector { int Vector::copyData(std::string memspaceIn, std::string memspaceOut) { int control=-1; - if ((memspaceIn == "cpu") && (memspaceOut == "cuda")){ control = 0;} - if ((memspaceIn == "cuda") && (memspaceOut == "cpu")){ control = 1;} + if ((memspaceIn == "cpu") && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 0;} + if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && (memspaceOut == "cpu")){ control = 1;} if ((memspaceOut == "cpu") && (h_data_ == nullptr)){ //allocate first h_data_ = new real_type[n_ * k_]; } - if ((memspaceOut == "cuda") && (d_data_ == nullptr)){ + if (((memspaceOut == "cuda") || (memspaceOut == "hip")) && (d_data_ == nullptr)){ //allocate first mem_.allocateArrayOnDevice(&d_data_, n_ * k_); } @@ -200,10 +200,12 @@ namespace ReSolve { namespace vector { h_data_ = new real_type[n_ * k_]; owns_cpu_data_ = true; } else { - if (memspace == "cuda") { + if ((memspace == "cuda") || (memspace == "hip")) { mem_.deleteOnDevice(d_data_); mem_.allocateArrayOnDevice(&d_data_, n_ * k_); owns_gpu_data_ = true; + } else { + std::cout<<"wrong memspace " < #endif +#ifdef RESOLVE_USE_HIP
+#include +#endif namespace ReSolve { using out = io::Logger; @@ -50,6 +53,21 @@ namespace ReSolve { isCpuEnabled_ = true; } #endif +#ifdef RESOLVE_USE_HIP + /** + * @brief constructor + * + * @param new_workspace - workspace to be set + */ + VectorHandler::VectorHandler(LinAlgWorkspaceHIP* new_workspace) + { + hipImpl_ = new VectorHandlerHip(new_workspace); + cpuImpl_ = new VectorHandlerCpu(); + + isHipEnabled_ = true; + isCpuEnabled_ = true; + } +#endif /** * @brief destructor @@ -64,7 +82,7 @@ namespace ReSolve { * * @param[in] x The first vector * @param[in] y The second vector - * @param[in] memspace String containg memspace (cpu or cuda) + * @param[in] memspace String containg memspace (cpu or cuda or hip) * * @return dot product (real number) of _x_ and _y_ */ @@ -74,7 +92,9 @@ namespace ReSolve { if (memspace == "cuda" ) { return cudaImpl_->dot(x, y); } else { - if (memspace == "cpu") { + if (memspace == "hip") { + return hipImpl_->dot(x, y); + } else if (memspace == "cpu") { return cpuImpl_->dot(x, y); } else { out::error() << "Not implemented (yet)" << std::endl; @@ -88,13 +108,15 @@ namespace ReSolve { * * @param[in] alpha The constant * @param[in,out] x The vector - * @param memspace string containg memspace (cpu or cuda) + * @param memspace string containg memspace (cpu or cuda or hip) * */ void VectorHandler::scal(const real_type* alpha, vector::Vector* x, std::string memspace) { if (memspace == "cuda" ) { cudaImpl_->scal(alpha, x); + } else if (memspace == "hip") { + hipImpl_->scal(alpha, x); } else { if (memspace == "cpu") { cpuImpl_->scal(alpha, x); @@ -110,7 +132,7 @@ namespace ReSolve { * @param[in] alpha The constant * @param[in] x The first vector * @param[in,out] y The second vector (result is return in y) - * @param[in] memspace String containg memspace (cpu or cuda) + * @param[in] memspace String containg memspace (cpu or cuda or hip) * */ void VectorHandler::axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y, std::string 
memspace) @@ -119,10 +141,14 @@ namespace ReSolve { if (memspace == "cuda" ) { cudaImpl_->axpy(alpha, x, y); } else { - if (memspace == "cpu") { - cpuImpl_->axpy(alpha, x, y); + if (memspace == "hip" ) { + hipImpl_->axpy(alpha, x, y); } else { - out::error() <<"Not implemented (yet)" << std::endl; + if (memspace == "cpu") { + cpuImpl_->axpy(alpha, x, y); + } else { + out::error() <<"Not implemented (yet)" << std::endl; + } } } } @@ -139,7 +165,7 @@ namespace ReSolve { * @param[in] V Multivector containing the matrix, organized columnwise * @param[in] y Vector, k x 1 if N and n x 1 if T * @param[in,out] x Vector, n x 1 if N and k x 1 if T - * @param[in] memspace cpu or cuda (for now) + * @param[in] memspace cpu or cuda or hip (for now) * * @pre V is stored colum-wise, _n_ > 0, _k_ > 0 * @@ -148,6 +174,8 @@ namespace ReSolve { { if (memspace == "cuda") { cudaImpl_->gemv(transpose, n, k, alpha, beta, V, y, x); + } else if (memspace == "hip") { + hipImpl_->gemv(transpose, n, k, alpha, beta, V, y, x); } else if (memspace == "cpu") { cpuImpl_->gemv(transpose, n, k, alpha, beta, V, y, x); } else { @@ -162,7 +190,7 @@ namespace ReSolve { * @param[in] alpha vector size k x 1 * @param[in] x (multi)vector size size x k * @param[in,out] y vector size size x 1 (this is where the result is stored) - * @param[in] memspace string containg memspace (cpu or cuda) + * @param[in] memspace string containg memspace (cpu or cuda or hip) * * @pre _k_ > 0, _size_ > 0, _size_ = x->getSize() * @@ -172,6 +200,8 @@ namespace ReSolve { using namespace constants; if (memspace == "cuda") { cudaImpl_->massAxpy(size, alpha, k, x, y); + } else if (memspace == "hip") { + hipImpl_->massAxpy(size, alpha, k, x, y); } else if (memspace == "cpu") { cpuImpl_->massAxpy(size, alpha, k, x, y); } else { @@ -188,7 +218,7 @@ namespace ReSolve { * @param[in] k Number of vectors in V * @param[in] x Multivector; 2 vectors size n x 1 each * @param[out] res Multivector; 2 vectors size k x 1 each (result is returned 
in res) - * @param[in] memspace String containg memspace (cpu or cuda) + * @param[in] memspace String containg memspace (cpu or cuda or hip) * * @pre _size_ > 0, _k_ > 0, size = x->getSize(), _res_ needs to be allocated * @@ -197,6 +227,8 @@ namespace ReSolve { { if (memspace == "cuda") { cudaImpl_->massDot2Vec(size, V, k, x, res); + } else if (memspace == "hip") { + hipImpl_->massDot2Vec(size, V, k, x, res); } else if (memspace == "cpu") { cpuImpl_->massDot2Vec(size, V, k, x, res); } else { diff --git a/resolve/vector/VectorHandler.hpp b/resolve/vector/VectorHandler.hpp index c17d4688..02d426b5 100644 --- a/resolve/vector/VectorHandler.hpp +++ b/resolve/vector/VectorHandler.hpp @@ -10,6 +10,7 @@ namespace ReSolve class VectorHandlerImpl; class LinAlgWorkspaceCpu; class LinAlgWorkspaceCUDA; + class LinAlgWorkspaceHIP; } @@ -19,6 +20,7 @@ namespace ReSolve { //namespace vector { VectorHandler(); VectorHandler(LinAlgWorkspaceCpu* new_workspace); VectorHandler(LinAlgWorkspaceCUDA* new_workspace); + VectorHandler(LinAlgWorkspaceHIP* new_workspace); ~VectorHandler(); //y = alpha x + y @@ -55,9 +57,11 @@ namespace ReSolve { //namespace vector { private: VectorHandlerImpl* cpuImpl_{nullptr}; VectorHandlerImpl* cudaImpl_{nullptr}; + VectorHandlerImpl* hipImpl_{nullptr}; bool isCpuEnabled_{false}; bool isCudaEnabled_{false}; + bool isHipEnabled_{false}; }; } //} // namespace ReSolve::vector diff --git a/resolve/vector/VectorHandlerHip.cpp b/resolve/vector/VectorHandlerHip.cpp new file mode 100644 index 00000000..9f2927c7 --- /dev/null +++ b/resolve/vector/VectorHandlerHip.cpp @@ -0,0 +1,236 @@ +#include + +#include +#include +#include +#include +#include +#include "VectorHandlerHip.hpp" + +namespace ReSolve { + using out = io::Logger; + + /** + * @brief empty constructor that does absolutely nothing + */ + VectorHandlerHip::VectorHandlerHip() + { + } + + /** + * @brief constructor + * + * @param new_workspace - workspace to be set + */ + VectorHandlerHip:: 
VectorHandlerHip(LinAlgWorkspaceHIP* new_workspace) + { + workspace_ = new_workspace; + } + + /** + * @brief destructor + */ + VectorHandlerHip::~VectorHandlerHip() + { + //delete the workspace TODO + } + + /** + * @brief dot product of two vectors i.e, a = x^Ty + * + * @param[in] x The first vector + * @param[in] y The second vector + * @param[in] memspace String containg memspace (cpu or hip) + * + * @return dot product (real number) of _x_ and _y_ + */ + + real_type VectorHandlerHip::dot(vector::Vector* x, vector::Vector* y) + { + LinAlgWorkspaceHIP* workspaceHIP = workspace_; + rocblas_handle handle_rocblas = workspaceHIP->getRocblasHandle(); + double nrm = 0.0; + rocblas_status st= rocblas_ddot (handle_rocblas, x->getSize(), x->getData("hip"), 1, y->getData("hip"), 1, &nrm); + if (st!=0) {printf("dot product crashed with code %d \n", st);} + return nrm; + } + + /** + * @brief scale a vector by a constant i.e, x = alpha*x where alpha is a constant + * + * @param[in] alpha The constant + * @param[in,out] x The vector + * @param memspace string containg memspace (cpu or hip) + * + */ + void VectorHandlerHip::scal(const real_type* alpha, vector::Vector* x) + { + LinAlgWorkspaceHIP* workspaceHIP = workspace_; + rocblas_handle handle_rocblas = workspaceHIP->getRocblasHandle(); + rocblas_status st = rocblas_dscal(handle_rocblas, x->getSize(), alpha, x->getData("hip"), 1); + if (st!=0) { + ReSolve::io::Logger::error() << "scal crashed with code " << st << "\n"; + } + } + + /** + * @brief axpy i.e, y = alpha*x+y where alpha is a constant + * + * @param[in] alpha The constant + * @param[in] x The first vector + * @param[in,out] y The second vector (result is return in y) + * @param[in] memspace String containg memspace (cpu or hip) + * + */ + void VectorHandlerHip::axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y) + { + //AXPY: y = alpha * x + y + LinAlgWorkspaceHIP* workspaceHIP = workspace_; + rocblas_handle handle_rocblas = 
workspaceHIP->getRocblasHandle(); + rocblas_daxpy(handle_rocblas, + x->getSize(), + alpha, + x->getData("hip"), + 1, + y->getData("hip"), + 1); + } + + /** + * @brief gemv computes matrix-vector product where both matrix and vectors are dense. + * i.e., x = beta*x + alpha*V*y + * + * @param[in] Transpose - yes (T) or no (N) + * @param[in] n Number of rows in (non-transposed) matrix + * @param[in] k Number of columns in (non-transposed) + * @param[in] alpha Constant real number + * @param[in] beta Constant real number + * @param[in] V Multivector containing the matrix, organized columnwise + * @param[in] y Vector, k x 1 if N and n x 1 if T + * @param[in,out] x Vector, n x 1 if N and k x 1 if T + * @param[in] memspace cpu or hip (for now) + * + * @pre V is stored colum-wise, _n_ > 0, _k_ > 0 + * + */ + void VectorHandlerHip::gemv(std::string transpose, + index_type n, + index_type k, + const real_type* alpha, + const real_type* beta, + vector::Vector* V, + vector::Vector* y, + vector::Vector* x) + { + LinAlgWorkspaceHIP* workspaceHIP = workspace_; + rocblas_handle handle_rocblas = workspaceHIP->getRocblasHandle(); + if (transpose == "T") { + + rocblas_dgemv(handle_rocblas, + rocblas_operation_transpose, + n, + k, + alpha, + V->getData("hip"), + n, + y->getData("hip"), + 1, + beta, + x->getData("hip"), + 1); + + } else { + rocblas_dgemv(handle_rocblas, + rocblas_operation_none, + n, + k, + alpha, + V->getData("hip"), + n, + y->getData("hip"), + 1, + beta, + x->getData("hip"), + 1); + } + } + + /** + * @brief mass (bulk) axpy i.e, y = y - x*alpha where alpha is a vector + * + * @param[in] size number of elements in y + * @param[in] alpha vector size k x 1 + * @param[in] x (multi)vector size size x k + * @param[in,out] y vector size size x 1 (this is where the result is stored) + * @param[in] memspace string containg memspace (cpu or hip) + * + * @pre _k_ > 0, _size_ > 0, _size_ = x->getSize() + * + */ + void VectorHandlerHip::massAxpy(index_type size, vector::Vector* 
alpha, index_type k, vector::Vector* x, vector::Vector* y) + { + using namespace constants; + if (k < 200) { + mass_axpy(size, k, x->getData("hip"), y->getData("hip"),alpha->getData("hip")); + } else { + LinAlgWorkspaceHIP* workspaceHIP = workspace_; + rocblas_handle handle_rocblas = workspaceHIP->getRocblasHandle(); + rocblas_dgemm(handle_rocblas, + rocblas_operation_none, + rocblas_operation_none, + size, // m + 1, // n + k, // k + &MINUSONE, // alpha + x->getData("hip"), // A + size, // lda + alpha->getData("hip"), // B + k, // ldb + &ONE, + y->getData("hip"), // c + size); // ldc + } + } + + /** + * @brief mass (bulk) dot product i.e, V^T x, where V is n x k dense multivector + * (a dense multivector consisting of k vectors size n) and x is k x 2 dense + * multivector (a multivector consisiting of two vectors size n each) + * + * @param[in] size Number of elements in a single vector in V + * @param[in] V Multivector; k vectors size n x 1 each + * @param[in] k Number of vectors in V + * @param[in] x Multivector; 2 vectors size n x 1 each + * @param[out] res Multivector; 2 vectors size k x 1 each (result is returned in res) + * @param[in] memspace String containg memspace (cpu or hip) + * + * @pre _size_ > 0, _k_ > 0, size = x->getSize(), _res_ needs to be allocated + * + */ + void VectorHandlerHip::massDot2Vec(index_type size, vector::Vector* V, index_type k, vector::Vector* x, vector::Vector* res) + { + using namespace constants; + + if (k < 200) { + mass_inner_product_two_vectors(size, k, x->getData("hip") , x->getData(1, "hip"), V->getData("hip"), res->getData("hip")); + } else { + LinAlgWorkspaceHIP* workspaceHIP = workspace_; + rocblas_handle handle_rocblas = workspaceHIP->getRocblasHandle(); + rocblas_dgemm(handle_rocblas, + rocblas_operation_transpose, + rocblas_operation_none, + k + 1, //m + 2, //n + size, //k + &ONE, //alpha + V->getData("hip"), //A + size, //lda + x->getData("hip"), //B + size, //ldb + &ZERO, + res->getData("hip"), //c + k + 1); //ldc 
+ } + } + +} // namespace ReSolve diff --git a/resolve/vector/VectorHandlerHip.hpp b/resolve/vector/VectorHandlerHip.hpp new file mode 100644 index 00000000..7e5085e3 --- /dev/null +++ b/resolve/vector/VectorHandlerHip.hpp @@ -0,0 +1,57 @@ +#pragma once +#include + +namespace ReSolve +{ + namespace vector + { + class Vector; + } + class LinAlgWorkspaceHIP; + class VectorHandlerImpl; +} + + +namespace ReSolve { //namespace vector { + class VectorHandlerHip : public VectorHandlerImpl + { + public: + VectorHandlerHip(); + VectorHandlerHip(LinAlgWorkspaceHIP* workspace); + virtual ~VectorHandlerHip(); + + //y = alpha x + y + virtual void axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y); + + //dot: x \cdot y + virtual real_type dot(vector::Vector* x, vector::Vector* y); + + //scal = alpha * x + virtual void scal(const real_type* alpha, vector::Vector* x); + + //mass axpy: x*alpha + y where x is [n x k] and alpha is [k x 1]; x is stored columnwise + virtual void massAxpy(index_type size, vector::Vector* alpha, index_type k, vector::Vector* x, vector::Vector* y); + + //mass dot: V^T x, where V is [n x k] and x is [k x 2], everything is stored and returned columnwise + //Size = n + virtual void massDot2Vec(index_type size, vector::Vector* V, index_type k, vector::Vector* x, vector::Vector* res); + + /** gemv: + * if `transpose = N` (no), `x = beta*x + alpha*V*y`, + * where `x` is `[n x 1]`, `V` is `[n x k]` and `y` is `[k x 1]`. + * if `transpose = T` (yes), `x = beta*x + alpha*V^T*y`, + * where `x` is `[k x 1]`, `V` is `[n x k]` and `y` is `[n x 1]`. 
+ */ + virtual void gemv(std::string transpose, + index_type n, + index_type k, + const real_type* alpha, + const real_type* beta, + vector::Vector* V, + vector::Vector* y, + vector::Vector* x); + private: + LinAlgWorkspaceHIP* workspace_; + }; + +} //} // namespace ReSolve::vector diff --git a/resolve/workspace/CMakeLists.txt b/resolve/workspace/CMakeLists.txt index 673fac4b..a34c2191 100644 --- a/resolve/workspace/CMakeLists.txt +++ b/resolve/workspace/CMakeLists.txt @@ -16,10 +16,15 @@ set(ReSolve_Workspace_CUDASDK_SRC LinAlgWorkspaceCUDA.cpp ) +set(ReSolve_Workspace_ROCM_SRC + LinAlgWorkspaceHIP.cpp +) + set(ReSolve_Workspace_HEADER_INSTALL LinAlgWorkspace.hpp LinAlgWorkspaceCpu.hpp LinAlgWorkspaceCUDA.hpp + LinAlgWorkspaceHIP.hpp ) # If cuda is enabled, add CUDA SDK workspace files @@ -27,6 +32,10 @@ if(RESOLVE_USE_CUDA) set(ReSolve_Workspace_SRC ${ReSolve_Workspace_SRC} ${ReSolve_Workspace_CUDASDK_SRC}) endif() +if(RESOLVE_USE_HIP) + set(ReSolve_Workspace_SRC ${ReSolve_Workspace_SRC} ${ReSolve_Workspace_ROCM_SRC}) +endif() + add_library(resolve_workspace SHARED ${ReSolve_Workspace_SRC}) # If CUDA is enabled, link to ReSolve CUDA backend @@ -34,6 +43,10 @@ if(RESOLVE_USE_CUDA) target_link_libraries(resolve_workspace PUBLIC resolve_backend_cuda) endif(RESOLVE_USE_CUDA) +if(RESOLVE_USE_HIP) + target_link_libraries(resolve_workspace PUBLIC resolve_backend_hip) +endif(RESOLVE_USE_HIP) + target_include_directories(resolve_workspace INTERFACE $ $ diff --git a/resolve/workspace/LinAlgWorkspace.hpp b/resolve/workspace/LinAlgWorkspace.hpp index 6da58fda..4efe834e 100644 --- a/resolve/workspace/LinAlgWorkspace.hpp +++ b/resolve/workspace/LinAlgWorkspace.hpp @@ -6,3 +6,7 @@ #include #endif +#ifdef RESOLVE_USE_HIP +#include +#endif + diff --git a/resolve/workspace/LinAlgWorkspaceHIP.cpp b/resolve/workspace/LinAlgWorkspaceHIP.cpp new file mode 100644 index 00000000..e64dff17 --- /dev/null +++ b/resolve/workspace/LinAlgWorkspaceHIP.cpp @@ -0,0 +1,75 @@ +#include + 
+namespace ReSolve +{ + LinAlgWorkspaceHIP::LinAlgWorkspaceHIP() + { + handle_rocsparse_ = nullptr; + handle_rocblas_ = nullptr; + + matvec_setup_done_ = false; + } + + LinAlgWorkspaceHIP::~LinAlgWorkspaceHIP() + { + rocsparse_destroy_handle(handle_rocsparse_); + rocblas_destroy_handle(handle_rocblas_); + rocsparse_destroy_mat_descr(mat_A_); + } + + rocsparse_handle LinAlgWorkspaceHIP::getRocsparseHandle() + { + return handle_rocsparse_; + } + + void LinAlgWorkspaceHIP::setRocsparseHandle(rocsparse_handle handle) + { + handle_rocsparse_ = handle; + } + + rocblas_handle LinAlgWorkspaceHIP::getRocblasHandle() + { + return handle_rocblas_; + } + + void LinAlgWorkspaceHIP::setRocblasHandle(rocblas_handle handle) + { + handle_rocblas_ = handle; + } + + rocsparse_mat_descr LinAlgWorkspaceHIP::getSpmvMatrixDescriptor() + { + return mat_A_; + } + + void LinAlgWorkspaceHIP::setSpmvMatrixDescriptor(rocsparse_mat_descr mat) + { + mat_A_ = mat; + } + + rocsparse_mat_info LinAlgWorkspaceHIP::getSpmvMatrixInfo() + { + return info_A_; + } + + void LinAlgWorkspaceHIP::setSpmvMatrixInfo(rocsparse_mat_info info) + { + info_A_ = info; + } + + bool LinAlgWorkspaceHIP::matvecSetup() + { + return matvec_setup_done_; + } + + void LinAlgWorkspaceHIP::matvecSetupDone() + { + matvec_setup_done_ = true; + } + + void LinAlgWorkspaceHIP::initializeHandles() + { + rocsparse_create_handle(&handle_rocsparse_); + rocblas_create_handle(&handle_rocblas_); + } + } // namespace ReSolve diff --git a/resolve/workspace/LinAlgWorkspaceHIP.hpp b/resolve/workspace/LinAlgWorkspaceHIP.hpp new file mode 100644 index 00000000..fbb55349 --- /dev/null +++ b/resolve/workspace/LinAlgWorkspaceHIP.hpp @@ -0,0 +1,52 @@ +#pragma once + +#include +#include +#include + +#include + +namespace ReSolve +{ + class LinAlgWorkspaceHIP + { + public: + LinAlgWorkspaceHIP(); + ~LinAlgWorkspaceHIP(); + + rocblas_handle getRocblasHandle(); + rocsparse_handle getRocsparseHandle(); + rocsparse_mat_descr getSpmvMatrixDescriptor(); + 
rocsparse_mat_info getSpmvMatrixInfo(); + + void setRocblasHandle(rocblas_handle handle); + void setRocsparseHandle(rocsparse_handle handle); + void setSpmvMatrixDescriptor(rocsparse_mat_descr mat); + void setSpmvMatrixInfo(rocsparse_mat_info info); + + void initializeHandles(); + + bool matvecSetup(); + void matvecSetupDone(); + + private: + //handles + rocblas_handle handle_rocblas_; + rocsparse_handle handle_rocsparse_; + + //matrix descriptors + rocsparse_mat_descr mat_A_; + + //vector descriptors not needed, rocsparse uses RAW pointers. + + //buffers + // there is no buffer needed in matvec + bool matvec_setup_done_; //check if setup is done for matvec (note: no buffer but there is analysis) + + //info - but we need info + rocsparse_mat_info info_A_; + + MemoryHandler mem_; + }; + +} // namespace ReSolve diff --git a/tests/unit/matrix/CMakeLists.txt b/tests/unit/matrix/CMakeLists.txt index 8906c2c6..8476f181 100644 --- a/tests/unit/matrix/CMakeLists.txt +++ b/tests/unit/matrix/CMakeLists.txt @@ -20,4 +20,4 @@ install(TARGETS ${installable_tests} RUNTIME DESTINATION bin/resolve/tests/unit) add_test(NAME matrix_test COMMAND $) -add_test(NAME matrix_handler_test COMMAND $) \ No newline at end of file +add_test(NAME matrix_handler_test COMMAND $) diff --git a/tests/unit/matrix/MatrixHandlerTests.hpp b/tests/unit/matrix/MatrixHandlerTests.hpp index e203017a..d7fe8449 100644 --- a/tests/unit/matrix/MatrixHandlerTests.hpp +++ b/tests/unit/matrix/MatrixHandlerTests.hpp @@ -49,6 +49,7 @@ class MatrixHandlerTests : TestBase vector::Vector x(N); vector::Vector y(N); x.allocate(memspace_); + if (x.getData(memspace_) == NULL) printf("oups we have an issue \n"); y.allocate(memspace_); x.setToConst(1.0, memspace_); @@ -80,6 +81,12 @@ class MatrixHandlerTests : TestBase LinAlgWorkspaceCUDA* workspace = new LinAlgWorkspaceCUDA(); workspace->initializeHandles(); return new MatrixHandler(workspace); +#endif +#ifdef RESOLVE_USE_HIP + } else if (memspace_ == "hip") { + 
LinAlgWorkspaceHIP* workspace = new LinAlgWorkspaceHIP(); + workspace->initializeHandles(); + return new MatrixHandler(workspace); #endif } else { std::cout << "ReSolve not built with support for memory space " << memspace_ << "\n"; @@ -152,7 +159,7 @@ class MatrixHandlerTests : TestBase A->setUpdated("cpu"); // std::cout << rowptr[i] << "\n"; - if (memspace == "cuda") { + if ((memspace == "cuda") || (memspace == "hip")) { A->copyData(memspace); } diff --git a/tests/unit/matrix/runMatrixHandlerTests.cpp b/tests/unit/matrix/runMatrixHandlerTests.cpp index 6eee90d5..26ad70b0 100644 --- a/tests/unit/matrix/runMatrixHandlerTests.cpp +++ b/tests/unit/matrix/runMatrixHandlerTests.cpp @@ -33,5 +33,17 @@ int main(int, char**) } #endif +#ifdef RESOLVE_USE_HIP + { + std::cout << "Running tests with HIP backend:\n"; + ReSolve::tests::MatrixHandlerTests test("hip"); + + result += test.matrixHandlerConstructor(); + result += test.matrixOneNorm(); + result += test.matVec(50); + + std::cout << "\n"; + } +#endif return result.summary(); } diff --git a/tests/unit/vector/VectorHandlerTests.hpp b/tests/unit/vector/VectorHandlerTests.hpp index d2f8c73c..60020ec5 100644 --- a/tests/unit/vector/VectorHandlerTests.hpp +++ b/tests/unit/vector/VectorHandlerTests.hpp @@ -1,6 +1,7 @@ #pragma once #include #include +#include #include #include #include @@ -141,13 +142,13 @@ namespace ReSolve { } x->setToConst(ii, c, memspace_); } + index_type r = K % 2; real_type res = (real_type) ((floor((real_type) K / 2.0) + r) * 1.0 + floor((real_type) K / 2.0) * (-0.5)); handler->massAxpy(N, alpha, K, x, y, memspace_); status *= verifyAnswer(y, 2.0 - res, memspace_); - - + delete handler; delete x; delete y; @@ -229,6 +230,12 @@ namespace ReSolve { LinAlgWorkspaceCUDA* workspace = new LinAlgWorkspaceCUDA(); workspace->initializeHandles(); return new VectorHandler(workspace); +#endif +#ifdef RESOLVE_USE_HIP + } else if (memspace_ == "hip") { + LinAlgWorkspaceHIP* workspace = new LinAlgWorkspaceHIP(); + 
workspace->initializeHandles(); + return new VectorHandler(workspace); #endif } else { std::cout << "ReSolve not built with support for memory space " << memspace_ << "\n"; @@ -247,6 +254,7 @@ namespace ReSolve { for (index_type i = 0; i < x->getSize(); ++i) { // std::cout << x->getData("cpu")[i] << "\n"; if (!isEqual(x->getData("cpu")[i], answer)) { + std::cout << std::setprecision(16); status = false; std::cout << "Solution vector element x[" << i << "] = " << x->getData("cpu")[i] << ", expected: " << answer << "\n"; diff --git a/tests/unit/vector/runVectorHandlerTests.cpp b/tests/unit/vector/runVectorHandlerTests.cpp index 77e99471..9bb543a5 100644 --- a/tests/unit/vector/runVectorHandlerTests.cpp +++ b/tests/unit/vector/runVectorHandlerTests.cpp @@ -37,5 +37,22 @@ int main(int, char**) } #endif +#ifdef RESOLVE_USE_HIP + { + std::cout << "Running tests with HIP backend:\n"; + ReSolve::tests::VectorHandlerTests test("hip"); + + result += test.dot(5000); + result += test.axpy(5000); + result += test.scal(5000); + result += test.gemv(5000, 10); + result += test.massAxpy(100, 10); + result += test.massAxpy(1000, 300); + result += test.massDot(100, 10); + result += test.massDot(1000, 30); + + std::cout << "\n"; + } +#endif return result.summary(); } From 7ea6515adbd6b282e502598165a5ff043e02257f Mon Sep 17 00:00:00 2001 From: pelesh Date: Mon, 30 Oct 2023 20:26:10 -0400 Subject: [PATCH 03/12] Fix warnings in HIP branch. (#41) * Fix warnings in HIP branch. 
--- resolve/LinSolverDirectKLU.cpp | 6 ++++-- resolve/MemoryUtils.hpp | 9 +++++++++ resolve/matrix/Coo.cpp | 6 +++--- resolve/matrix/Csc.cpp | 6 +++--- resolve/matrix/Csr.cpp | 6 +++--- resolve/matrix/MatrixHandlerCpu.hpp | 2 +- resolve/matrix/Sparse.cpp | 2 +- resolve/utilities/logger/Logger.cpp | 2 +- resolve/vector/Vector.cpp | 6 +++--- resolve/workspace/LinAlgWorkspaceCpu.cpp | 1 + resolve/workspace/LinAlgWorkspaceCpu.hpp | 2 +- resolve/workspace/LinAlgWorkspaceHIP.hpp | 2 +- tests/unit/matrix/MatrixHandlerTests.hpp | 18 +++++++++--------- 13 files changed, 40 insertions(+), 28 deletions(-) diff --git a/resolve/LinSolverDirectKLU.cpp b/resolve/LinSolverDirectKLU.cpp index b3f670c4..43b612b3 100644 --- a/resolve/LinSolverDirectKLU.cpp +++ b/resolve/LinSolverDirectKLU.cpp @@ -157,7 +157,8 @@ namespace ReSolve { if (Numeric_ != nullptr){ P_ = new index_type[A_->getNumRows()]; - std::memcpy(P_, Numeric_->Pnum, A_->getNumRows() * sizeof(index_type)); + size_t nrows = static_cast(A_->getNumRows()); + std::memcpy(P_, Numeric_->Pnum, nrows * sizeof(index_type)); return P_; } else { return nullptr; @@ -169,7 +170,8 @@ namespace ReSolve { if (Numeric_ != nullptr){ Q_ = new index_type[A_->getNumRows()]; - std::memcpy(Q_, Symbolic_->Q, A_->getNumRows() * sizeof(index_type)); + size_t nrows = static_cast(A_->getNumRows()); + std::memcpy(Q_, Symbolic_->Q, nrows * sizeof(index_type)); return Q_; } else { return nullptr; diff --git a/resolve/MemoryUtils.hpp b/resolve/MemoryUtils.hpp index 976279d9..5e2da403 100644 --- a/resolve/MemoryUtils.hpp +++ b/resolve/MemoryUtils.hpp @@ -44,6 +44,15 @@ namespace ReSolve template int copyArrayHostToDevice(T* dst, const T* src, I n); + + /// Implemented here as it is always needed + template + int copyArrayHostToHost(T* dst, const T* src, I n) + { + size_t nelements = static_cast(n); + memcpy(dst, src, nelements * sizeof(T)); + return 0; + } }; } // namespace ReSolve diff --git a/resolve/matrix/Coo.cpp b/resolve/matrix/Coo.cpp index 
a91f94a9..eeff7b86 100644 --- a/resolve/matrix/Coo.cpp +++ b/resolve/matrix/Coo.cpp @@ -113,9 +113,9 @@ namespace ReSolve switch(control) { case 0: //cpu->cpu - std::memcpy(h_row_data_, row_data, (nnz_current) * sizeof(index_type)); - std::memcpy(h_col_data_, col_data, (nnz_current) * sizeof(index_type)); - std::memcpy(h_val_data_, val_data, (nnz_current) * sizeof(real_type)); + mem_.copyArrayHostToHost(h_row_data_, row_data, nnz_current); + mem_.copyArrayHostToHost(h_col_data_, col_data, nnz_current); + mem_.copyArrayHostToHost(h_val_data_, val_data, nnz_current); h_data_updated_ = true; owns_cpu_data_ = true; owns_cpu_vals_ = true; diff --git a/resolve/matrix/Csc.cpp b/resolve/matrix/Csc.cpp index e2ea765f..f6358df3 100644 --- a/resolve/matrix/Csc.cpp +++ b/resolve/matrix/Csc.cpp @@ -109,9 +109,9 @@ namespace ReSolve switch(control) { case 0: //cpu->cpu - std::memcpy(h_col_data_, col_data, (n_ + 1) * sizeof(index_type)); - std::memcpy(h_row_data_, row_data, (nnz_current) * sizeof(index_type)); - std::memcpy(h_val_data_, val_data, (nnz_current) * sizeof(real_type)); + mem_.copyArrayHostToHost(h_col_data_, col_data, n_ + 1); + mem_.copyArrayHostToHost(h_row_data_, row_data, nnz_current); + mem_.copyArrayHostToHost(h_val_data_, val_data, nnz_current); h_data_updated_ = true; owns_cpu_data_ = true; owns_cpu_vals_ = true; diff --git a/resolve/matrix/Csr.cpp b/resolve/matrix/Csr.cpp index dff33b48..04e8dff1 100644 --- a/resolve/matrix/Csr.cpp +++ b/resolve/matrix/Csr.cpp @@ -111,9 +111,9 @@ namespace ReSolve //copy switch(control) { case 0: //cpu->cpu - std::memcpy(h_row_data_, row_data, (n_ + 1) * sizeof(index_type)); - std::memcpy(h_col_data_, col_data, (nnz_current) * sizeof(index_type)); - std::memcpy(h_val_data_, val_data, (nnz_current) * sizeof(real_type)); + mem_.copyArrayHostToHost(h_row_data_, row_data, n_ + 1); + mem_.copyArrayHostToHost(h_col_data_, col_data, nnz_current); + mem_.copyArrayHostToHost(h_val_data_, val_data, nnz_current); h_data_updated_ = 
true; owns_cpu_data_ = true; owns_cpu_vals_ = true; diff --git a/resolve/matrix/MatrixHandlerCpu.hpp b/resolve/matrix/MatrixHandlerCpu.hpp index 0b0afbd3..b6e66066 100644 --- a/resolve/matrix/MatrixHandlerCpu.hpp +++ b/resolve/matrix/MatrixHandlerCpu.hpp @@ -50,7 +50,7 @@ namespace ReSolve { LinAlgWorkspaceCpu* workspace_{nullptr}; bool values_changed_{true}; ///< needed for matvec - MemoryHandler mem_; ///< Device memory manager object + // MemoryHandler mem_; ///< Device memory manager object not used for now }; } // namespace ReSolve diff --git a/resolve/matrix/Sparse.cpp b/resolve/matrix/Sparse.cpp index 5c866386..4a16ec98 100644 --- a/resolve/matrix/Sparse.cpp +++ b/resolve/matrix/Sparse.cpp @@ -228,7 +228,7 @@ namespace ReSolve { namespace matrix { switch(control) { case 0: //cpu->cpu - std::memcpy(h_val_data_, new_vals, (nnz_current) * sizeof(real_type)); + mem_.copyArrayHostToHost(h_val_data_, new_vals, nnz_current); h_data_updated_ = true; owns_cpu_vals_ = true; break; diff --git a/resolve/utilities/logger/Logger.cpp b/resolve/utilities/logger/Logger.cpp index f2448179..7369978f 100644 --- a/resolve/utilities/logger/Logger.cpp +++ b/resolve/utilities/logger/Logger.cpp @@ -59,7 +59,7 @@ namespace ReSolve */ void Logger::updateVerbosity(std::vector& output_streams) { - for (int i = NONE; i <= EVERYTHING; ++i) + for (std::size_t i = NONE; i <= EVERYTHING; ++i) { output_streams[i] = i > verbosity_ ? 
&nullstream_ : logger_; } diff --git a/resolve/vector/Vector.cpp b/resolve/vector/Vector.cpp index 37779ea5..df3c475d 100644 --- a/resolve/vector/Vector.cpp +++ b/resolve/vector/Vector.cpp @@ -104,7 +104,7 @@ namespace ReSolve { namespace vector { switch(control) { case 0: //cpu->cpu - std::memcpy(h_data_, data, (n_current_ * k_) * sizeof(real_type)); + mem_.copyArrayHostToHost(h_data_, data, n_current_ * k_); owns_cpu_data_ = true; cpu_updated_ = true; gpu_updated_ = false; @@ -322,7 +322,7 @@ namespace ReSolve { namespace vector { } else { real_type* data = this->getData(i, memspaceOut); if (memspaceOut == "cpu") { - std::memcpy(dest, data, n_current_ * sizeof(real_type)); + mem_.copyArrayHostToHost(dest, data, n_current_); } else { if ((memspaceOut == "cuda") || (memspaceOut == "hip")) { mem_.copyArrayDeviceToDevice(dest, data, n_current_); @@ -338,7 +338,7 @@ namespace ReSolve { namespace vector { { real_type* data = this->getData(memspaceOut); if (memspaceOut == "cpu") { - std::memcpy(dest, data, n_current_ * k_ * sizeof(real_type)); + mem_.copyArrayHostToHost(dest, data, n_current_ * k_); } else { if ((memspaceOut == "cuda") || (memspaceOut == "hip")) { mem_.copyArrayDeviceToDevice(dest, data, n_current_ * k_); diff --git a/resolve/workspace/LinAlgWorkspaceCpu.cpp b/resolve/workspace/LinAlgWorkspaceCpu.cpp index 3ed9aa43..c0f25248 100644 --- a/resolve/workspace/LinAlgWorkspaceCpu.cpp +++ b/resolve/workspace/LinAlgWorkspaceCpu.cpp @@ -1,3 +1,4 @@ +#include #include "LinAlgWorkspaceCpu.hpp" namespace ReSolve diff --git a/resolve/workspace/LinAlgWorkspaceCpu.hpp b/resolve/workspace/LinAlgWorkspaceCpu.hpp index 00e5f38e..3c056b73 100644 --- a/resolve/workspace/LinAlgWorkspaceCpu.hpp +++ b/resolve/workspace/LinAlgWorkspaceCpu.hpp @@ -12,7 +12,7 @@ namespace ReSolve ~LinAlgWorkspaceCpu(); void initializeHandles(); private: - MemoryHandler mem_; + // MemoryHandler mem_; ///< Memory handler not needed for now }; } diff --git a/resolve/workspace/LinAlgWorkspaceHIP.hpp 
b/resolve/workspace/LinAlgWorkspaceHIP.hpp index fbb55349..abdc3e41 100644 --- a/resolve/workspace/LinAlgWorkspaceHIP.hpp +++ b/resolve/workspace/LinAlgWorkspaceHIP.hpp @@ -46,7 +46,7 @@ namespace ReSolve //info - but we need info rocsparse_mat_info info_A_; - MemoryHandler mem_; + // MemoryHandler mem_; ///< Memory handler not needed for now }; } // namespace ReSolve diff --git a/tests/unit/matrix/MatrixHandlerTests.hpp b/tests/unit/matrix/MatrixHandlerTests.hpp index d7fe8449..0bcfe544 100644 --- a/tests/unit/matrix/MatrixHandlerTests.hpp +++ b/tests/unit/matrix/MatrixHandlerTests.hpp @@ -125,13 +125,15 @@ class MatrixHandlerTests : TestBase // std::cout << N << "\n"; + // First compute number of nonzeros index_type NNZ = 0; for (index_type i = 0; i < N; ++i) { - NNZ += static_cast(data[i%5].size()); + size_t reminder = static_cast(i%5); + NNZ += static_cast(data[reminder].size()); } - // std::cout << NNZ << "\n"; + // Allocate NxN CSR matrix with NNZ nonzeros matrix::Csr* A = new matrix::Csr(N, N, NNZ); A->allocateMatrixData("cpu"); @@ -139,25 +141,23 @@ class MatrixHandlerTests : TestBase index_type* colidx = A->getColData("cpu"); real_type* val = A->getValues("cpu"); + // Populate CSR matrix using same row pattern as for NNZ calculation rowptr[0] = 0; - index_type i = 0; - for (i=0; i < N; ++i) + for (index_type i=0; i < N; ++i) { - const std::vector& row_sample = data[i%5]; + size_t reminder = static_cast(i%5); + const std::vector& row_sample = data[reminder]; index_type nnz_per_row = static_cast(row_sample.size()); - // std::cout << nnz_per_row << "\n"; rowptr[i+1] = rowptr[i] + nnz_per_row; for (index_type j = rowptr[i]; j < rowptr[i+1]; ++j) { colidx[j] = (j - rowptr[i]) * N/nnz_per_row + (N%(N/nnz_per_row)); // evenly distribute nonzeros ^^^^ ^^^^^^^^ perturb offset - val[j] = row_sample[j - rowptr[i]]; - // std::cout << i << " " << colidx[j] << " " << val[j] << "\n"; + val[j] = row_sample[static_cast(j - rowptr[i])]; } } A->setUpdated("cpu"); - // 
std::cout << rowptr[i] << "\n"; if ((memspace == "cuda") || (memspace == "hip")) { A->copyData(memspace); From 949680f3b320f8cb4541d30148f7b3c7f670f7c2 Mon Sep 17 00:00:00 2001 From: pelesh Date: Tue, 31 Oct 2023 20:18:27 -0400 Subject: [PATCH 04/12] Use enums instead of strings to denote memory space (#42) * Use enums for memory space ID in matrix classes. * Use enums for vector class memory space IDs. --- examples/r_KLU_GLU.cpp | 12 +- examples/r_KLU_GLU_matrix_values_update.cpp | 14 +- examples/r_KLU_KLU.cpp | 8 +- examples/r_KLU_KLU_standalone.cpp | 6 +- examples/r_KLU_rf.cpp | 8 +- examples/r_KLU_rf_FGMRES.cpp | 18 +- .../r_KLU_rf_FGMRES_reuse_factorization.cpp | 24 +- resolve/GramSchmidt.cpp | 72 +++--- resolve/LinSolverDirectCuSolverGLU.cpp | 44 ++-- resolve/LinSolverDirectCuSolverRf.cpp | 32 +-- resolve/LinSolverDirectKLU.cpp | 52 ++-- resolve/LinSolverIterativeFGMRES.cpp | 18 +- resolve/MemoryUtils.hpp | 10 + resolve/matrix/Coo.cpp | 159 ++++++------ resolve/matrix/Coo.hpp | 14 +- resolve/matrix/Csc.cpp | 159 ++++++------ resolve/matrix/Csc.hpp | 14 +- resolve/matrix/Csr.cpp | 161 ++++++------ resolve/matrix/Csr.hpp | 14 +- resolve/matrix/MatrixHandler.cpp | 12 +- resolve/matrix/MatrixHandlerCpu.cpp | 24 +- resolve/matrix/MatrixHandlerCuda.cpp | 38 +-- resolve/matrix/MatrixHandlerHip.cpp | 36 +-- resolve/matrix/Sparse.cpp | 109 ++++---- resolve/matrix/Sparse.hpp | 24 +- resolve/matrix/io.cpp | 10 +- resolve/vector/Vector.cpp | 239 +++++++++--------- resolve/vector/Vector.hpp | 28 +- resolve/vector/VectorHandlerCpu.cpp | 10 +- resolve/vector/VectorHandlerCuda.cpp | 36 +-- resolve/vector/VectorHandlerHip.cpp | 36 +-- tests/functionality/testKLU.cpp | 24 +- tests/functionality/testKLU_GLU.cpp | 28 +- tests/functionality/testKLU_Rf.cpp | 22 +- tests/functionality/testKLU_Rf_FGMRES.cpp | 28 +- tests/unit/matrix/MatrixHandlerTests.hpp | 35 +-- tests/unit/matrix/MatrixIoTests.hpp | 8 +- tests/unit/vector/GramSchmidtTests.hpp | 32 ++- 
tests/unit/vector/VectorHandlerTests.hpp | 107 +++++--- 39 files changed, 898 insertions(+), 827 deletions(-) diff --git a/examples/r_KLU_GLU.cpp b/examples/r_KLU_GLU.cpp index e7b19f4e..9f271254 100644 --- a/examples/r_KLU_GLU.cpp +++ b/examples/r_KLU_GLU.cpp @@ -93,8 +93,8 @@ int main(int argc, char *argv[]) x = new real_type[A->getNumRows()]; vec_rhs = new vector_type(A->getNumRows()); vec_x = new vector_type(A->getNumRows()); - vec_x->allocate("cpu");//for KLU - vec_x->allocate("cuda"); + vec_x->allocate(ReSolve::memory::HOST);//for KLU + vec_x->allocate(ReSolve::memory::DEVICE); vec_r = new vector_type(A->getNumRows()); } else { ReSolve::io::readAndUpdateMatrix(mat_file, A_coo); @@ -107,11 +107,11 @@ int main(int argc, char *argv[]) //Now convert to CSR. if (i < 1) { matrix_handler->coo2csr(A_coo, A, "cpu"); - vec_rhs->update(rhs, "cpu", "cpu"); - vec_rhs->setDataUpdated("cpu"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); + vec_rhs->setDataUpdated(ReSolve::memory::HOST); } else { matrix_handler->coo2csr(A_coo, A, "cuda"); - vec_rhs->update(rhs, "cpu", "cuda"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); } std::cout<<"COO to CSR completed. 
Expanded NNZ: "<< A->getNnzExpanded()<solve(vec_rhs, vec_x); std::cout<<"CUSOLVER GLU solve status: "<update(rhs, "cpu", "cuda"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); matrix_handler->setValuesChanged(true, "cuda"); diff --git a/examples/r_KLU_GLU_matrix_values_update.cpp b/examples/r_KLU_GLU_matrix_values_update.cpp index ee99f0a0..ded685ac 100644 --- a/examples/r_KLU_GLU_matrix_values_update.cpp +++ b/examples/r_KLU_GLU_matrix_values_update.cpp @@ -96,8 +96,8 @@ int main(int argc, char *argv[]) x = new real_type[A->getNumRows()]; vec_rhs = new vector_type(A->getNumRows()); vec_x = new vector_type(A->getNumRows()); - vec_x->allocate("cpu");//for KLU - vec_x->allocate("cuda"); + vec_x->allocate(ReSolve::memory::HOST);//for KLU + vec_x->allocate(ReSolve::memory::DEVICE); vec_r = new vector_type(A->getNumRows()); } else { if (i==1) { @@ -106,7 +106,7 @@ int main(int argc, char *argv[]) ReSolve::io::readAndUpdateMatrix(mat_file, A_exp_coo); } std::cout<<"Updating values of A_coo!"<updateValues(A_exp_coo->getValues("cpu"), "cpu", "cpu"); + A_coo->updateValues(A_exp_coo->getValues(ReSolve::memory::HOST), ReSolve::memory::HOST, ReSolve::memory::HOST); //ReSolve::io::readAndUpdateMatrix(mat_file, A_coo); ReSolve::io::readAndUpdateRhs(rhs_file, &rhs); } @@ -117,11 +117,11 @@ int main(int argc, char *argv[]) //Now convert to CSR. if (i < 1) { matrix_handler->coo2csr(A_coo, A, "cpu"); - vec_rhs->update(rhs, "cpu", "cpu"); - vec_rhs->setDataUpdated("cpu"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); + vec_rhs->setDataUpdated(ReSolve::memory::HOST); } else { matrix_handler->coo2csr(A_coo, A, "cuda"); - vec_rhs->update(rhs, "cpu", "cuda"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); } std::cout<<"COO to CSR completed. 
Expanded NNZ: "<< A->getNnzExpanded()<solve(vec_rhs, vec_x); std::cout<<"CUSOLVER GLU solve status: "<update(rhs, "cpu", "cuda"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); matrix_handler->setValuesChanged(true, "cuda"); diff --git a/examples/r_KLU_KLU.cpp b/examples/r_KLU_KLU.cpp index b9328e8a..901e36a5 100644 --- a/examples/r_KLU_KLU.cpp +++ b/examples/r_KLU_KLU.cpp @@ -108,11 +108,11 @@ int main(int argc, char *argv[]) //Now convert to CSR. if (i < 2) { matrix_handler->coo2csr(A_coo, A, "cpu"); - vec_rhs->update(rhs, "cpu", "cpu"); - vec_rhs->setDataUpdated("cpu"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); + vec_rhs->setDataUpdated(ReSolve::memory::HOST); } else { matrix_handler->coo2csr(A_coo, A, "cpu"); - vec_rhs->update(rhs, "cpu", "cpu"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); } std::cout<<"COO to CSR completed. Expanded NNZ: "<< A->getNnzExpanded()<solve(vec_rhs, vec_x); std::cout<<"KLU solve status: "<update(rhs, "cpu", "cpu"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); matrix_handler->setValuesChanged(true, "cpu"); diff --git a/examples/r_KLU_KLU_standalone.cpp b/examples/r_KLU_KLU_standalone.cpp index 0b8f6114..3dfaf716 100644 --- a/examples/r_KLU_KLU_standalone.cpp +++ b/examples/r_KLU_KLU_standalone.cpp @@ -83,8 +83,8 @@ int main(int argc, char *argv[]) //Now convert to CSR. matrix_handler->coo2csr(A_coo, A, "cpu"); - vec_rhs->update(rhs, "cpu", "cpu"); - vec_rhs->setDataUpdated("cpu"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); + vec_rhs->setDataUpdated(ReSolve::memory::HOST); std::cout << "COO to CSR completed. 
Expanded NNZ: " << A->getNnzExpanded() << std::endl; //Now call direct solver KLU->setupParameters(1, 0.1, false); @@ -96,7 +96,7 @@ int main(int argc, char *argv[]) std::cout << "KLU factorization status: " << status << std::endl; status = KLU->solve(vec_rhs, vec_x); std::cout << "KLU solve status: " << status << std::endl; - vec_r->update(rhs, "cpu", "cpu"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); matrix_handler->setValuesChanged(true, "cpu"); diff --git a/examples/r_KLU_rf.cpp b/examples/r_KLU_rf.cpp index 7369af18..d9310773 100644 --- a/examples/r_KLU_rf.cpp +++ b/examples/r_KLU_rf.cpp @@ -107,11 +107,11 @@ int main(int argc, char *argv[] ) //Now convert to CSR. if (i < 2) { matrix_handler->coo2csr(A_coo, A, "cpu"); - vec_rhs->update(rhs, "cpu", "cpu"); - vec_rhs->setDataUpdated("cpu"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); + vec_rhs->setDataUpdated(ReSolve::memory::HOST); } else { matrix_handler->coo2csr(A_coo, A, "cuda"); - vec_rhs->update(rhs, "cpu", "cuda"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); } std::cout<<"COO to CSR completed. 
Expanded NNZ: "<< A->getNnzExpanded()<solve(vec_rhs, vec_x); //std::cout<<"KLU solve status: "<update(rhs, "cpu", "cuda"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); matrix_handler->setValuesChanged(true, "cuda"); diff --git a/examples/r_KLU_rf_FGMRES.cpp b/examples/r_KLU_rf_FGMRES.cpp index 07839cbb..6df5419a 100644 --- a/examples/r_KLU_rf_FGMRES.cpp +++ b/examples/r_KLU_rf_FGMRES.cpp @@ -96,8 +96,8 @@ int main(int argc, char *argv[]) x = new real_type[A->getNumRows()]; vec_rhs = new vector_type(A->getNumRows()); vec_x = new vector_type(A->getNumRows()); - vec_x->allocate("cpu");//for KLU - vec_x->allocate("cuda"); + vec_x->allocate(ReSolve::memory::HOST);//for KLU + vec_x->allocate(ReSolve::memory::DEVICE); vec_r = new vector_type(A->getNumRows()); } else { @@ -111,11 +111,11 @@ int main(int argc, char *argv[]) //Now convert to CSR. if (i < 2) { matrix_handler->coo2csr(A_coo, A, "cpu"); - vec_rhs->update(rhs, "cpu", "cpu"); - vec_rhs->setDataUpdated("cpu"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); + vec_rhs->setDataUpdated(ReSolve::memory::HOST); } else { matrix_handler->coo2csr(A_coo,A, "cuda"); - vec_rhs->update(rhs, "cpu", "cuda"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); } std::cout<<"COO to CSR completed. 
Expanded NNZ: "<< A->getNnzExpanded()<solve(vec_rhs, vec_x); std::cout<<"KLU solve status: "<update(rhs, "cpu", "cuda"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); norm_b = vector_handler->dot(vec_r, vec_r, "cuda"); norm_b = sqrt(norm_b); matrix_handler->setValuesChanged(true, "cuda"); @@ -162,8 +162,8 @@ int main(int argc, char *argv[]) status = Rf->solve(vec_rhs, vec_x); std::cout<<"CUSOLVER RF solve status: "<update(rhs, "cpu", "cuda"); - norm_b = vector_handler->dot(vec_r, vec_r, "cuda"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + norm_b = vector_handler->dot(vec_r, vec_r, "cuda"); norm_b = sqrt(norm_b); //matrix_handler->setValuesChanged(true, "cuda"); @@ -176,7 +176,7 @@ int main(int argc, char *argv[]) << std::scientific << std::setprecision(16) << sqrt(vector_handler->dot(vec_r, vec_r, "cuda"))/norm_b << "\n"; - vec_rhs->update(rhs, "cpu", "cuda"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); FGMRES->solve(vec_rhs, vec_x); std::cout << "FGMRES: init nrm: " diff --git a/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp b/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp index 56ab43fe..5ead8186 100644 --- a/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp +++ b/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp @@ -98,8 +98,8 @@ int main(int argc, char *argv[]) x = new real_type[A->getNumRows()]; vec_rhs = new vector_type(A->getNumRows()); vec_x = new vector_type(A->getNumRows()); - vec_x->allocate("cpu");//for KLU - vec_x->allocate("cuda"); + vec_x->allocate(ReSolve::memory::HOST);//for KLU + vec_x->allocate(ReSolve::memory::DEVICE); vec_r = new vector_type(A->getNumRows()); } else { @@ -113,11 +113,11 @@ int main(int argc, char *argv[]) //Now convert to CSR. 
if (i < 2) { matrix_handler->coo2csr(A_coo,A, "cpu"); - vec_rhs->update(rhs, "cpu", "cpu"); - vec_rhs->setDataUpdated("cpu"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); + vec_rhs->setDataUpdated(ReSolve::memory::HOST); } else { matrix_handler->coo2csr(A_coo, A, "cuda"); - vec_rhs->update(rhs, "cpu", "cuda"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); } std::cout<<"COO to CSR completed. Expanded NNZ: "<< A->getNnzExpanded()<solve(vec_rhs, vec_x); std::cout<<"KLU solve status: "<update(rhs, "cpu", "cuda"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); norm_b = vector_handler->dot(vec_r, vec_r, "cuda"); norm_b = sqrt(norm_b); matrix_handler->setValuesChanged(true, "cuda"); @@ -171,20 +171,20 @@ int main(int argc, char *argv[]) status = Rf->refactorize(); std::cout << "CUSOLVER RF, using REAL refactorization, refactorization status: " << status << std::endl; - vec_rhs->update(rhs, "cpu", "cuda"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); status = Rf->solve(vec_rhs, vec_x); FGMRES->setupPreconditioner("CuSolverRf", Rf); } - //if (i%2!=0) vec_x->setToZero("cuda"); + //if (i%2!=0) vec_x->setToZero(ReSolve::memory::DEVICE); real_type norm_x = vector_handler->dot(vec_x, vec_x, "cuda"); std::cout << "Norm of x (before solve): " << std::scientific << std::setprecision(16) << sqrt(norm_x) << "\n"; std::cout<<"CUSOLVER RF solve status: "<update(rhs, "cpu", "cuda"); - vec_r->update(rhs, "cpu", "cuda"); - norm_b = vector_handler->dot(vec_r, vec_r, "cuda"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + norm_b = vector_handler->dot(vec_r, vec_r, "cuda"); norm_b = sqrt(norm_b); matrix_handler->setValuesChanged(true, "cuda"); @@ -199,7 +199,7 @@ int main(int argc, char *argv[]) << std::scientific << std::setprecision(16) << norm_b << "\n"; - vec_rhs->update(rhs, "cpu", "cuda"); + 
vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); FGMRES->solve(vec_rhs, vec_x); std::cout << "FGMRES: init nrm: " diff --git a/resolve/GramSchmidt.cpp b/resolve/GramSchmidt.cpp index b6a27b04..fb86fc8d 100644 --- a/resolve/GramSchmidt.cpp +++ b/resolve/GramSchmidt.cpp @@ -36,10 +36,10 @@ namespace ReSolve delete h_L_; delete h_rv_; - vec_rv_->setData(nullptr, "cuda"); - vec_rv_->setData(nullptr, "cpu"); - vec_Hcolumn_->setData(nullptr, "cuda"); - vec_Hcolumn_->setData(nullptr, "cpu"); + vec_rv_->setData(nullptr, memory::DEVICE); + vec_rv_->setData(nullptr, memory::HOST); + vec_Hcolumn_->setData(nullptr, memory::DEVICE); + vec_Hcolumn_->setData(nullptr, memory::HOST); delete [] vec_rv_; delete [] vec_Hcolumn_;; @@ -47,18 +47,18 @@ namespace ReSolve if(variant_ == cgs2) { delete h_aux_; - vec_Hcolumn_->setData(nullptr, "cuda"); - // vec_Hcolumn_->setData(nullptr, "cpu"); + vec_Hcolumn_->setData(nullptr, memory::DEVICE); + // vec_Hcolumn_->setData(nullptr, memory::HOST); delete [] vec_Hcolumn_; } if(variant_ == mgs_pm) { delete h_aux_; } - vec_v_->setData(nullptr, "cuda"); - vec_v_->setData(nullptr, "cpu"); - vec_w_->setData(nullptr, "cuda"); - vec_w_->setData(nullptr, "cpu"); + vec_v_->setData(nullptr, memory::DEVICE); + vec_v_->setData(nullptr, memory::HOST); + vec_w_->setData(nullptr, memory::DEVICE); + vec_w_->setData(nullptr, memory::HOST); delete [] vec_w_; delete [] vec_v_; @@ -103,15 +103,15 @@ namespace ReSolve h_rv_ = new real_type[num_vecs_ + 1]; vec_rv_ = new vector_type(num_vecs_ + 1, 2); - vec_rv_->allocate("cuda"); + vec_rv_->allocate(memory::DEVICE); vec_Hcolumn_ = new vector_type(num_vecs_ + 1); - vec_Hcolumn_->allocate("cuda"); + vec_Hcolumn_->allocate(memory::DEVICE); } if(variant_ == cgs2) { h_aux_ = new real_type[num_vecs_ + 1]; vec_Hcolumn_ = new vector_type(num_vecs_ + 1); - vec_Hcolumn_->allocate("cuda"); + vec_Hcolumn_->allocate(memory::DEVICE); } if(variant_ == mgs_pm) { @@ -135,10 +135,10 @@ namespace ReSolve switch 
(variant_){ case mgs: - vec_w_->setData(V->getVectorData(i + 1, "cuda"), "cuda"); + vec_w_->setData(V->getVectorData(i + 1, memory::DEVICE), memory::DEVICE); for(int j = 0; j <= i; ++j) { t = 0.0; - vec_v_->setData( V->getVectorData(j, "cuda"), "cuda"); + vec_v_->setData( V->getVectorData(j, memory::DEVICE), memory::DEVICE); t = vector_handler_->dot(vec_v_, vec_w_, "cuda"); H[ idxmap(i, j, num_vecs_ + 1) ] = t; t *= -1.0; @@ -159,26 +159,26 @@ namespace ReSolve break; case cgs2: - vec_v_->setData(V->getVectorData(i + 1, "cuda"), "cuda"); - vector_handler_->gemv("T", n, i + 1, &ONE, &ZERO, V, vec_v_, vec_Hcolumn_,"cuda"); + vec_v_->setData(V->getVectorData(i + 1, memory::DEVICE), memory::DEVICE); + vector_handler_->gemv("T", n, i + 1, &ONE, &ZERO, V, vec_v_, vec_Hcolumn_, "cuda"); // V(:,i+1) = V(:, i+1) - V(:,1:i)*Hcol vector_handler_->gemv("N", n, i + 1, &ONE, &MINUSONE, V, vec_Hcolumn_, vec_v_, "cuda" ); // copy H_col to aux, we will need it later - vec_Hcolumn_->setDataUpdated("cuda"); + vec_Hcolumn_->setDataUpdated(memory::DEVICE); vec_Hcolumn_->setCurrentSize(i + 1); - vec_Hcolumn_->deepCopyVectorData(h_aux_, 0, "cpu"); + vec_Hcolumn_->deepCopyVectorData(h_aux_, 0, memory::HOST); //Hcol = V(:,1:i)^T*V(:,i+1); - vector_handler_->gemv("T", n, i + 1, &ONE, &ZERO, V, vec_v_, vec_Hcolumn_,"cuda"); + vector_handler_->gemv("T", n, i + 1, &ONE, &ZERO, V, vec_v_, vec_Hcolumn_, "cuda"); // V(:,i+1) = V(:, i+1) - V(:,1:i)*Hcol vector_handler_->gemv("N", n, i + 1, &ONE, &MINUSONE, V, vec_Hcolumn_, vec_v_, "cuda" ); // copy H_col to H - vec_Hcolumn_->setDataUpdated("cuda"); - vec_Hcolumn_->deepCopyVectorData(&H[ idxmap(i, 0, num_vecs_ + 1)], 0, "cpu"); + vec_Hcolumn_->setDataUpdated(memory::DEVICE); + vec_Hcolumn_->deepCopyVectorData(&H[ idxmap(i, 0, num_vecs_ + 1)], 0, memory::HOST); // add both pieces together (unstable otherwise, careful here!!) 
t = 0.0; @@ -201,16 +201,16 @@ namespace ReSolve break; case mgs_two_synch: // V[1:i]^T[V[i] w] - vec_v_->setData(V->getVectorData(i, "cuda"), "cuda"); - vec_w_->setData(V->getVectorData(i + 1, "cuda"), "cuda"); + vec_v_->setData(V->getVectorData(i, memory::DEVICE), memory::DEVICE); + vec_w_->setData(V->getVectorData(i + 1, memory::DEVICE), memory::DEVICE); vec_rv_->setCurrentSize(i + 1); vector_handler_->massDot2Vec(n, V, i, vec_v_, vec_rv_, "cuda"); - vec_rv_->setDataUpdated("cuda"); - vec_rv_->copyData("cuda", "cpu"); + vec_rv_->setDataUpdated(memory::DEVICE); + vec_rv_->copyData(memory::DEVICE, memory::HOST); - vec_rv_->deepCopyVectorData(&h_L_[idxmap(i, 0, num_vecs_ + 1)], 0, "cpu"); - h_rv_ = vec_rv_->getVectorData(1, "cpu"); + vec_rv_->deepCopyVectorData(&h_L_[idxmap(i, 0, num_vecs_ + 1)], 0, memory::HOST); + h_rv_ = vec_rv_->getVectorData(1, memory::HOST); for(int j=0; j<=i; ++j) { H[ idxmap(i, j, num_vecs_ + 1) ] = 0.0; @@ -225,7 +225,7 @@ namespace ReSolve H[ idxmap(i, j, num_vecs_ + 1) ] -= s; } // for j vec_Hcolumn_->setCurrentSize(i + 1); - vec_Hcolumn_->update(&H[ idxmap(i, 0, num_vecs_ + 1)], "cpu", "cuda"); + vec_Hcolumn_->update(&H[ idxmap(i, 0, num_vecs_ + 1)], memory::HOST, memory::DEVICE); vector_handler_->massAxpy(n, vec_Hcolumn_, i, V, vec_w_, "cuda"); // normalize (second synch) @@ -243,16 +243,16 @@ namespace ReSolve return 0; break; case mgs_pm: - vec_v_->setData(V->getVectorData(i, "cuda"), "cuda"); - vec_w_->setData(V->getVectorData(i + 1, "cuda"), "cuda"); + vec_v_->setData(V->getVectorData(i, memory::DEVICE), memory::DEVICE); + vec_w_->setData(V->getVectorData(i + 1, memory::DEVICE), memory::DEVICE); vec_rv_->setCurrentSize(i + 1); vector_handler_->massDot2Vec(n, V, i, vec_v_, vec_rv_, "cuda"); - vec_rv_->setDataUpdated("cuda"); - vec_rv_->copyData("cuda", "cpu"); + vec_rv_->setDataUpdated(memory::DEVICE); + vec_rv_->copyData(memory::DEVICE, memory::HOST); - vec_rv_->deepCopyVectorData(&h_L_[idxmap(i, 0, num_vecs_ + 1)], 0, "cpu"); - 
h_rv_ = vec_rv_->getVectorData(1, "cpu"); + vec_rv_->deepCopyVectorData(&h_L_[idxmap(i, 0, num_vecs_ + 1)], 0, memory::HOST); + h_rv_ = vec_rv_->getVectorData(1, memory::HOST); for(int j = 0; j <= i; ++j) { H[ idxmap(i, j, num_vecs_ + 1) ] = 0.0; @@ -295,7 +295,7 @@ namespace ReSolve } vec_Hcolumn_->setCurrentSize(i + 1); - vec_Hcolumn_->update(&H[ idxmap(i, 0, num_vecs_ + 1)], "cpu", "cuda"); + vec_Hcolumn_->update(&H[ idxmap(i, 0, num_vecs_ + 1)], memory::HOST, memory::DEVICE); vector_handler_->massAxpy(n, vec_Hcolumn_, i, V, vec_w_, "cuda"); // normalize (second synch) diff --git a/resolve/LinSolverDirectCuSolverGLU.cpp b/resolve/LinSolverDirectCuSolverGLU.cpp index 75039ff4..0350efea 100644 --- a/resolve/LinSolverDirectCuSolverGLU.cpp +++ b/resolve/LinSolverDirectCuSolverGLU.cpp @@ -50,14 +50,14 @@ namespace ReSolve n, nnz, descr_A_, - A_->getRowData("cpu"), //kRowPtr_, - A_->getColData("cpu"), //jCol_, + A_->getRowData(memory::HOST), //kRowPtr_, + A_->getColData(memory::HOST), //jCol_, P, /* base-0 */ Q, /* base-0 */ M_->getNnz(), /* nnzM */ descr_M_, - M_->getRowData("cpu"), - M_->getColData("cpu"), + M_->getRowData(memory::HOST), + M_->getColData(memory::HOST), info_M_); error_sum += status_cusolver_; //NOW the buffer @@ -77,9 +77,9 @@ namespace ReSolve /* A is original matrix */ nnz, descr_A_, - A_->getValues("cuda"), //da_, - A_->getRowData("cuda"), //kRowPtr_, - A_->getColData("cuda"), //jCol_, + A_->getValues( memory::DEVICE), //da_, + A_->getRowData(memory::DEVICE), //kRowPtr_, + A_->getColData(memory::DEVICE), //jCol_, info_M_); error_sum += status_cusolver_; @@ -93,15 +93,15 @@ namespace ReSolve { // L and U need to be in CSC format index_type n = L->getNumRows(); - index_type* Lp = L->getColData("cpu"); - index_type* Li = L->getRowData("cpu"); - index_type* Up = U->getColData("cpu"); - index_type* Ui = U->getRowData("cpu"); + index_type* Lp = L->getColData(memory::HOST); + index_type* Li = L->getRowData(memory::HOST); + index_type* Up = 
U->getColData(memory::HOST); + index_type* Ui = U->getRowData(memory::HOST); index_type nnzM = ( L->getNnz() + U->getNnz() - n ); M_ = new matrix::Csr(n, n, nnzM); - M_->allocateMatrixData("cpu"); - index_type* mia = M_->getRowData("cpu"); - index_type* mja = M_->getColData("cpu"); + M_->allocateMatrixData(memory::HOST); + index_type* mia = M_->getRowData(memory::HOST); + index_type* mja = M_->getColData(memory::HOST); index_type row; for(index_type i = 0; i < n; ++i) { // go through EACH COLUMN OF L first @@ -153,9 +153,9 @@ namespace ReSolve /* A is original matrix */ A_->getNnzExpanded(), descr_A_, - A_->getValues("cuda"), //da_, - A_->getRowData("cuda"), //kRowPtr_, - A_->getColData("cuda"), //jCol_, + A_->getValues( memory::DEVICE), //da_, + A_->getRowData(memory::DEVICE), //kRowPtr_, + A_->getColData(memory::DEVICE), //jCol_, info_M_); error_sum += status_cusolver_; @@ -173,11 +173,11 @@ namespace ReSolve /* A is original matrix */ A_->getNnz(), descr_A_, - A_->getValues("cuda"), //da_, - A_->getRowData("cuda"), //kRowPtr_, - A_->getColData("cuda"), //jCol_, - rhs->getData("cuda"),/* right hand side */ - x->getData("cuda"),/* left hand side */ + A_->getValues( memory::DEVICE), //da_, + A_->getRowData(memory::DEVICE), //kRowPtr_, + A_->getColData(memory::DEVICE), //jCol_, + rhs->getData(memory::DEVICE),/* right hand side */ + x->getData(memory::DEVICE),/* left hand side */ &ite_refine_succ_, &r_nrminf_, info_M_, diff --git a/resolve/LinSolverDirectCuSolverRf.cpp b/resolve/LinSolverDirectCuSolverRf.cpp index d51218cc..37a3ffda 100644 --- a/resolve/LinSolverDirectCuSolverRf.cpp +++ b/resolve/LinSolverDirectCuSolverRf.cpp @@ -35,17 +35,17 @@ namespace ReSolve error_sum += status_cusolverrf_; status_cusolverrf_ = cusolverRfSetupDevice(n, A_->getNnzExpanded(), - A_->getRowData("cuda"), //dia_, - A_->getColData("cuda"), //dja_, - A_->getValues("cuda"), //da_, + A_->getRowData(memory::DEVICE), //dia_, + A_->getColData(memory::DEVICE), //dja_, + A_->getValues( 
memory::DEVICE), //da_, L->getNnz(), - L->getRowData("cuda"), - L->getColData("cuda"), - L->getValues("cuda"), + L->getRowData(memory::DEVICE), + L->getColData(memory::DEVICE), + L->getValues( memory::DEVICE), U->getNnz(), - U->getRowData("cuda"), - U->getColData("cuda"), - U->getValues("cuda"), + U->getRowData(memory::DEVICE), + U->getColData(memory::DEVICE), + U->getValues( memory::DEVICE), d_P_, d_Q_, handle_cusolverrf_); @@ -76,9 +76,9 @@ namespace ReSolve int error_sum = 0; status_cusolverrf_ = cusolverRfResetValues(A_->getNumRows(), A_->getNnzExpanded(), - A_->getRowData("cuda"), //dia_, - A_->getColData("cuda"), //dja_, - A_->getValues("cuda"), //da_, + A_->getRowData(memory::DEVICE), //dia_, + A_->getColData(memory::DEVICE), //dja_, + A_->getValues( memory::DEVICE), //da_, d_P_, d_Q_, handle_cusolverrf_); @@ -100,22 +100,22 @@ namespace ReSolve 1, d_T_, A_->getNumRows(), - rhs->getData("cuda"), + rhs->getData(memory::DEVICE), A_->getNumRows()); return status_cusolverrf_; } int LinSolverDirectCuSolverRf::solve(vector_type* rhs, vector_type* x) { - x->update(rhs->getData("cuda"), "cuda", "cuda"); - x->setDataUpdated("cuda"); + x->update(rhs->getData(memory::DEVICE), memory::DEVICE, memory::DEVICE); + x->setDataUpdated(memory::DEVICE); status_cusolverrf_ = cusolverRfSolve(handle_cusolverrf_, d_P_, d_Q_, 1, d_T_, A_->getNumRows(), - x->getData("cuda"), + x->getData(memory::DEVICE), A_->getNumRows()); return status_cusolverrf_; } diff --git a/resolve/LinSolverDirectKLU.cpp b/resolve/LinSolverDirectKLU.cpp index 43b612b3..6af27d10 100644 --- a/resolve/LinSolverDirectKLU.cpp +++ b/resolve/LinSolverDirectKLU.cpp @@ -35,7 +35,7 @@ namespace ReSolve int LinSolverDirectKLU::analyze() { - Symbolic_ = klu_analyze(A_->getNumRows(), A_->getRowData("cpu"), A_->getColData("cpu"), &Common_) ; + Symbolic_ = klu_analyze(A_->getNumRows(), A_->getRowData(memory::HOST), A_->getColData(memory::HOST), &Common_) ; if (Symbolic_ == nullptr){ printf("Symbolic_ factorization crashed 
withCommon_.status = %d \n", Common_.status); @@ -46,7 +46,7 @@ namespace ReSolve int LinSolverDirectKLU::factorize() { - Numeric_ = klu_factor(A_->getRowData("cpu"), A_->getColData("cpu"),A_->getValues("cpu"), Symbolic_, &Common_); + Numeric_ = klu_factor(A_->getRowData(memory::HOST), A_->getColData(memory::HOST), A_->getValues(memory::HOST), Symbolic_, &Common_); if (Numeric_ == nullptr){ return 1; @@ -56,7 +56,7 @@ namespace ReSolve int LinSolverDirectKLU::refactorize() { - int kluStatus = klu_refactor (A_->getRowData("cpu"), A_->getColData("cpu"), A_->getValues("cpu"), Symbolic_, Numeric_, &Common_); + int kluStatus = klu_refactor (A_->getRowData(memory::HOST), A_->getColData(memory::HOST), A_->getValues(memory::HOST), Symbolic_, Numeric_, &Common_); if (!kluStatus){ //display error @@ -71,10 +71,10 @@ namespace ReSolve // std::memcpy(x, rhs, A->getNumRows() * sizeof(real_type)); - x->update(rhs->getData("cpu"), "cpu", "cpu"); - x->setDataUpdated("cpu"); + x->update(rhs->getData(memory::HOST), memory::HOST, memory::HOST); + x->setDataUpdated(memory::HOST); - int kluStatus = klu_solve(Symbolic_, Numeric_, A_->getNumRows(), 1, x->getData("cpu"), &Common_); + int kluStatus = klu_solve(Symbolic_, Numeric_, A_->getNumRows(), 1, x->getData(memory::HOST), &Common_); if (!kluStatus){ return 1; @@ -90,16 +90,16 @@ namespace ReSolve L_ = new matrix::Csc(A_->getNumRows(), A_->getNumColumns(), nnzL); U_ = new matrix::Csc(A_->getNumRows(), A_->getNumColumns(), nnzU); - L_->allocateMatrixData("cpu"); - U_->allocateMatrixData("cpu"); + L_->allocateMatrixData(memory::HOST); + U_->allocateMatrixData(memory::HOST); int ok = klu_extract(Numeric_, Symbolic_, - L_->getColData("cpu"), - L_->getRowData("cpu"), - L_->getValues("cpu"), - U_->getColData("cpu"), - U_->getRowData("cpu"), - U_->getValues("cpu"), + L_->getColData(memory::HOST), + L_->getRowData(memory::HOST), + L_->getValues( memory::HOST), + U_->getColData(memory::HOST), + U_->getRowData(memory::HOST), + U_->getValues( 
memory::HOST), nullptr, nullptr, nullptr, @@ -109,8 +109,8 @@ namespace ReSolve nullptr, &Common_); - L_->setUpdated("cpu"); - U_->setUpdated("cpu"); + L_->setUpdated(memory::HOST); + U_->setUpdated(memory::HOST); (void) ok; // TODO: Check status in ok before setting `factors_extracted_` factors_extracted_ = true; } @@ -125,16 +125,16 @@ namespace ReSolve L_ = new matrix::Csc(A_->getNumRows(), A_->getNumColumns(), nnzL); U_ = new matrix::Csc(A_->getNumRows(), A_->getNumColumns(), nnzU); - L_->allocateMatrixData("cpu"); - U_->allocateMatrixData("cpu"); + L_->allocateMatrixData(memory::HOST); + U_->allocateMatrixData(memory::HOST); int ok = klu_extract(Numeric_, Symbolic_, - L_->getColData("cpu"), - L_->getRowData("cpu"), - L_->getValues("cpu"), - U_->getColData("cpu"), - U_->getRowData("cpu"), - U_->getValues("cpu"), + L_->getColData(memory::HOST), + L_->getRowData(memory::HOST), + L_->getValues( memory::HOST), + U_->getColData(memory::HOST), + U_->getRowData(memory::HOST), + U_->getValues( memory::HOST), nullptr, nullptr, nullptr, @@ -144,8 +144,8 @@ namespace ReSolve nullptr, &Common_); - L_->setUpdated("cpu"); - U_->setUpdated("cpu"); + L_->setUpdated(memory::HOST); + U_->setUpdated(memory::HOST); (void) ok; // TODO: Check status in ok before setting `factors_extracted_` factors_extracted_ = true; diff --git a/resolve/LinSolverIterativeFGMRES.cpp b/resolve/LinSolverIterativeFGMRES.cpp index fa63f2d5..0bf1720f 100644 --- a/resolve/LinSolverIterativeFGMRES.cpp +++ b/resolve/LinSolverIterativeFGMRES.cpp @@ -82,9 +82,9 @@ namespace ReSolve n_ = A_->getNumRows(); d_V_ = new vector_type(n_, restart_ + 1); - d_V_->allocate("cuda"); + d_V_->allocate(memory::DEVICE); d_Z_ = new vector_type(n_, restart_ + 1); - d_Z_->allocate("cuda"); + d_Z_->allocate(memory::DEVICE); h_H_ = new real_type[restart_ * (restart_ + 1)]; h_c_ = new real_type[restart_]; // needed for givens h_s_ = new real_type[restart_]; // same @@ -114,7 +114,7 @@ namespace ReSolve vector_type* vec_z = new 
vector_type(n_); //V[0] = b-A*x_0 - rhs->deepCopyVectorData(d_V_->getData("cuda"), 0, "cuda"); + rhs->deepCopyVectorData(d_V_->getData(memory::DEVICE), 0, memory::DEVICE); matrix_handler_->matvec(A_, x, d_V_, &MINUSONE, &ONE, "csr", "cuda"); rnorm = 0.0; bnorm = vector_handler_->dot(rhs, rhs, "cuda"); @@ -166,14 +166,14 @@ namespace ReSolve // Z_i = (LU)^{-1}*V_i - vec_v->setData( d_V_->getVectorData(i, "cuda"), "cuda"); - vec_z->setData( d_Z_->getVectorData(i, "cuda"), "cuda"); + vec_v->setData( d_V_->getVectorData(i, memory::DEVICE), memory::DEVICE); + vec_z->setData( d_Z_->getVectorData(i, memory::DEVICE), memory::DEVICE); this->precV(vec_v, vec_z); mem_.deviceSynchronize(); // V_{i+1}=A*Z_i - vec_v->setData( d_V_->getVectorData(i + 1, "cuda"), "cuda"); + vec_v->setData( d_V_->getVectorData(i + 1, memory::DEVICE), memory::DEVICE); matrix_handler_->matvec(A_, vec_z, vec_v, &ONE, &ZERO,"csr", "cuda"); @@ -228,7 +228,7 @@ namespace ReSolve // get solution for(j = 0; j <= i; j++) { - vec_z->setData( d_Z_->getVectorData(j, "cuda"), "cuda"); + vec_z->setData( d_Z_->getVectorData(j, memory::DEVICE), memory::DEVICE); vector_handler_->axpy(&h_rs_[j], vec_z, x, "cuda"); } @@ -239,7 +239,7 @@ namespace ReSolve outer_flag = 0; } - rhs->deepCopyVectorData(d_V_->getData("cuda"), 0, "cuda"); + rhs->deepCopyVectorData(d_V_->getData(memory::DEVICE), 0, memory::DEVICE); matrix_handler_->matvec(A_, x, d_V_, &MINUSONE, &ONE,"csr", "cuda"); rnorm = vector_handler_->dot(d_V_, d_V_, "cuda"); // rnorm = ||V_1|| @@ -317,7 +317,7 @@ namespace ReSolve void LinSolverIterativeFGMRES::precV(vector_type* rhs, vector_type* x) { LU_solver_->solve(rhs, x); - // x->update(rhs->getData("cuda"), "cuda", "cuda"); + // x->update(rhs->getData(memory::DEVICE), memory::DEVICE, memory::DEVICE); } real_type LinSolverIterativeFGMRES::getFinalResidualNorm() diff --git a/resolve/MemoryUtils.hpp b/resolve/MemoryUtils.hpp index 5e2da403..d87c621f 100644 --- a/resolve/MemoryUtils.hpp +++ 
b/resolve/MemoryUtils.hpp @@ -2,6 +2,16 @@ #include + +namespace ReSolve +{ + namespace memory + { + enum MemorySpace{HOST = 0, DEVICE}; + enum MemoryDirection{HOST_TO_HOST = 0, HOST_TO_DEVICE, DEVICE_TO_HOST, DEVICE_TO_DEVICE}; + } +} + namespace ReSolve { /** diff --git a/resolve/matrix/Coo.cpp b/resolve/matrix/Coo.cpp index eeff7b86..326eba59 100644 --- a/resolve/matrix/Coo.cpp +++ b/resolve/matrix/Coo.cpp @@ -27,52 +27,49 @@ namespace ReSolve { } - index_type* matrix::Coo::getRowData(std::string memspace) + index_type* matrix::Coo::getRowData(memory::MemorySpace memspace) { - if (memspace == "cpu") { - copyData("cpu"); - return this->h_row_data_; - } else { - if ((memspace == "cuda") || (memspace == "hip")) { - copyData(memspace); + using namespace ReSolve::memory; + copyData(memspace); + switch (memspace) { + case HOST: + return this->h_row_data_; + case DEVICE: return this->d_row_data_; - } else { + default: return nullptr; - } } } - index_type* matrix::Coo::getColData(std::string memspace) + index_type* matrix::Coo::getColData(memory::MemorySpace memspace) { - if (memspace == "cpu") { - copyData("cpu"); - return this->h_col_data_; - } else { - if ((memspace == "cuda") || (memspace == "hip")) { - copyData(memspace); + using namespace ReSolve::memory; + copyData(memspace); + switch (memspace) { + case HOST: + return this->h_col_data_; + case DEVICE: return this->d_col_data_; - } else { + default: return nullptr; - } } } - real_type* matrix::Coo::getValues(std::string memspace) + real_type* matrix::Coo::getValues(memory::MemorySpace memspace) { - if (memspace == "cpu") { - copyData("cpu"); - return this->h_val_data_; - } else { - if ((memspace == "cuda") || (memspace == "hip")) { - copyData(memspace); + using namespace ReSolve::memory; + copyData(memspace); + switch (memspace) { + case HOST: + return this->h_val_data_; + case DEVICE: return this->d_val_data_; - } else { + default: return nullptr; - } } } - index_type matrix::Coo::updateData(index_type* 
row_data, index_type* col_data, real_type* val_data, std::string memspaceIn, std::string memspaceOut) + index_type matrix::Coo::updateData(index_type* row_data, index_type* col_data, real_type* val_data, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut) { //four cases (for now) @@ -80,12 +77,12 @@ namespace ReSolve if (is_expanded_) {nnz_current = nnz_expanded_;} setNotUpdated(); int control=-1; - if ((memspaceIn == "cpu") && (memspaceOut == "cpu")){ control = 0;} - if ((memspaceIn == "cpu") && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 1;} - if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && (memspaceOut == "cpu")){ control = 2;} - if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 3;} + if ((memspaceIn == memory::HOST) && (memspaceOut == memory::HOST)){ control = 0;} + if ((memspaceIn == memory::HOST) && ((memspaceOut == memory::DEVICE))){ control = 1;} + if (((memspaceIn == memory::DEVICE)) && (memspaceOut == memory::HOST)){ control = 2;} + if (((memspaceIn == memory::DEVICE)) && ((memspaceOut == memory::DEVICE))){ control = 3;} - if (memspaceOut == "cpu") { + if (memspaceOut == memory::HOST) { //check if cpu data allocated if (h_row_data_ == nullptr) { this->h_row_data_ = new index_type[nnz_current]; @@ -98,7 +95,7 @@ namespace ReSolve } } - if ((memspaceOut == "cuda") || (memspaceOut == "hip")) { + if (memspaceOut == memory::DEVICE) { //check if cuda data allocated if (d_row_data_ == nullptr) { mem_.allocateArrayOnDevice(&d_row_data_, nnz_current); @@ -150,7 +147,7 @@ namespace ReSolve return 0; } - index_type matrix::Coo::updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, std::string memspaceIn, std::string memspaceOut) + index_type matrix::Coo::updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut) { 
this->destroyMatrixData(memspaceOut); this->nnz_ = new_nnz; @@ -158,13 +155,13 @@ namespace ReSolve return i; } - index_type matrix::Coo::allocateMatrixData(std::string memspace) + index_type matrix::Coo::allocateMatrixData(memory::MemorySpace memspace) { index_type nnz_current = nnz_; if (is_expanded_) {nnz_current = nnz_expanded_;} destroyMatrixData(memspace);//just in case - if (memspace == "cpu") { + if (memspace == memory::HOST) { this->h_row_data_ = new index_type[nnz_current]; std::fill(h_row_data_, h_row_data_ + nnz_current, 0); this->h_col_data_ = new index_type[nnz_current]; @@ -176,7 +173,7 @@ namespace ReSolve return 0; } - if ((memspace == "cuda") || (memspace == "hip")) { + if (memspace == memory::DEVICE) { mem_.allocateArrayOnDevice(&d_row_data_, nnz_current); mem_.allocateArrayOnDevice(&d_col_data_, nnz_current); mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); @@ -187,55 +184,57 @@ namespace ReSolve return -1; } - int matrix::Coo::copyData(std::string memspaceOut) + int matrix::Coo::copyData(memory::MemorySpace memspaceOut) { + using namespace ReSolve::memory; index_type nnz_current = nnz_; - if (is_expanded_) {nnz_current = nnz_expanded_;} - - if (memspaceOut == "cpu") { - //check if we need to copy or not - if ((d_data_updated_ == true) && (h_data_updated_ == false)) { - if (h_row_data_ == nullptr) { - h_row_data_ = new index_type[nnz_current]; - } - if (h_col_data_ == nullptr) { - h_col_data_ = new index_type[nnz_current]; - } - if (h_val_data_ == nullptr) { - h_val_data_ = new real_type[nnz_current]; - } - mem_.copyArrayDeviceToHost(h_row_data_, d_row_data_, nnz_current); - mem_.copyArrayDeviceToHost(h_col_data_, d_col_data_, nnz_current); - mem_.copyArrayDeviceToHost(h_val_data_, d_val_data_, nnz_current); - h_data_updated_ = true; - owns_cpu_data_ = true; - owns_cpu_vals_ = true; - } - return 0; + if (is_expanded_) { + nnz_current = nnz_expanded_; } - if ((memspaceOut == "cuda") || (memspaceOut == "hip")) { - if ((d_data_updated_ == 
false) && (h_data_updated_ == true)) { - if (d_row_data_ == nullptr) { - mem_.allocateArrayOnDevice(&d_row_data_, nnz_current); - } - if (d_col_data_ == nullptr) { - mem_.allocateArrayOnDevice(&d_col_data_, nnz_current); + switch (memspaceOut) { + case HOST: + if ((d_data_updated_ == true) && (h_data_updated_ == false)) { + if (h_row_data_ == nullptr) { + h_row_data_ = new index_type[nnz_current]; + } + if (h_col_data_ == nullptr) { + h_col_data_ = new index_type[nnz_current]; + } + if (h_val_data_ == nullptr) { + h_val_data_ = new real_type[nnz_current]; + } + mem_.copyArrayDeviceToHost(h_row_data_, d_row_data_, nnz_current); + mem_.copyArrayDeviceToHost(h_col_data_, d_col_data_, nnz_current); + mem_.copyArrayDeviceToHost(h_val_data_, d_val_data_, nnz_current); + h_data_updated_ = true; + owns_cpu_data_ = true; + owns_cpu_vals_ = true; } - if (d_val_data_ == nullptr) { - mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); + return 0; + case DEVICE: + if ((d_data_updated_ == false) && (h_data_updated_ == true)) { + if (d_row_data_ == nullptr) { + mem_.allocateArrayOnDevice(&d_row_data_, nnz_current); + } + if (d_col_data_ == nullptr) { + mem_.allocateArrayOnDevice(&d_col_data_, nnz_current); + } + if (d_val_data_ == nullptr) { + mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); + } + mem_.copyArrayHostToDevice(d_row_data_, h_row_data_, nnz_current); + mem_.copyArrayHostToDevice(d_col_data_, h_col_data_, nnz_current); + mem_.copyArrayHostToDevice(d_val_data_, h_val_data_, nnz_current); + d_data_updated_ = true; + owns_gpu_data_ = true; + owns_gpu_vals_ = true; } - mem_.copyArrayHostToDevice(d_row_data_, h_row_data_, nnz_current); - mem_.copyArrayHostToDevice(d_col_data_, h_col_data_, nnz_current); - mem_.copyArrayHostToDevice(d_val_data_, h_val_data_, nnz_current); - d_data_updated_ = true; - owns_gpu_data_ = true; - owns_gpu_vals_ = true; - } - return 0; - } - return -1; + return 0; + default: + return -1; + } // switch } void matrix::Coo::print() diff 
--git a/resolve/matrix/Coo.hpp b/resolve/matrix/Coo.hpp index 3ec045c3..bc67ceef 100644 --- a/resolve/matrix/Coo.hpp +++ b/resolve/matrix/Coo.hpp @@ -15,18 +15,18 @@ namespace ReSolve { namespace matrix { bool expanded); ~Coo(); - virtual index_type* getRowData(std::string memspace); - virtual index_type* getColData(std::string memspace); - virtual real_type* getValues(std::string memspace); + virtual index_type* getRowData(memory::MemorySpace memspace); + virtual index_type* getColData(memory::MemorySpace memspace); + virtual real_type* getValues( memory::MemorySpace memspace); - virtual index_type updateData(index_type* row_data, index_type* col_data, real_type* val_data, std::string memspaceIn, std::string memspaceOut); - virtual index_type updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, std::string memspaceIn, std::string memspaceOut); + virtual index_type updateData(index_type* row_data, index_type* col_data, real_type* val_data, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut); + virtual index_type updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut); - virtual index_type allocateMatrixData(std::string memspace); + virtual index_type allocateMatrixData(memory::MemorySpace memspace); virtual void print(); - virtual int copyData(std::string memspaceOut); + virtual int copyData(memory::MemorySpace memspaceOut); }; }} // namespace ReSolve::matrix diff --git a/resolve/matrix/Csc.cpp b/resolve/matrix/Csc.cpp index f6358df3..e6fed07c 100644 --- a/resolve/matrix/Csc.cpp +++ b/resolve/matrix/Csc.cpp @@ -24,64 +24,61 @@ namespace ReSolve { } - index_type* matrix::Csc::getRowData(std::string memspace) + index_type* matrix::Csc::getRowData(memory::MemorySpace memspace) { - if (memspace == "cpu") { - copyData("cpu"); - return this->h_row_data_; - } else { - if ((memspace == "cuda") || (memspace == 
"hip")) { - copyData(memspace); + using namespace ReSolve::memory; + copyData(memspace); + switch (memspace) { + case HOST: + return this->h_row_data_; + case DEVICE: return this->d_row_data_; - } else { + default: return nullptr; - } } } - index_type* matrix::Csc::getColData(std::string memspace) + index_type* matrix::Csc::getColData(memory::MemorySpace memspace) { - if (memspace == "cpu") { - copyData("cpu"); - return this->h_col_data_; - } else { - if ((memspace == "cuda") || (memspace == "hip")) { - copyData(memspace); + using namespace ReSolve::memory; + copyData(memspace); + switch (memspace) { + case HOST: + return this->h_col_data_; + case DEVICE: return this->d_col_data_; - } else { + default: return nullptr; - } } } - real_type* matrix::Csc::getValues(std::string memspace) + real_type* matrix::Csc::getValues(memory::MemorySpace memspace) { - if (memspace == "cpu") { - copyData("cpu"); - return this->h_val_data_; - } else { - if ((memspace == "cuda") || (memspace == "hip")) { - copyData(memspace); + using namespace ReSolve::memory; + copyData(memspace); + switch (memspace) { + case HOST: + return this->h_val_data_; + case DEVICE: return this->d_val_data_; - } else { + default: return nullptr; - } } } - int matrix::Csc::updateData(index_type* row_data, index_type* col_data, real_type* val_data, std::string memspaceIn, std::string memspaceOut) + int matrix::Csc::updateData(index_type* row_data, index_type* col_data, real_type* val_data, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut) { index_type nnz_current = nnz_; if (is_expanded_) {nnz_current = nnz_expanded_;} //four cases (for now) int control=-1; setNotUpdated(); - if ((memspaceIn == "cpu") && (memspaceOut == "cpu")){ control = 0;} - if ((memspaceIn == "cpu") && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 1;} - if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && (memspaceOut == "cpu")){ control = 2;} - if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && 
((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 3;} + if ((memspaceIn == memory::HOST) && (memspaceOut == memory::HOST)) { control = 0;} + if ((memspaceIn == memory::HOST) && ((memspaceOut == memory::DEVICE))){ control = 1;} + if (((memspaceIn == memory::DEVICE)) && (memspaceOut == memory::HOST)) { control = 2;} + if (((memspaceIn == memory::DEVICE)) && ((memspaceOut == memory::DEVICE))){ control = 3;} - if (memspaceOut == "cpu") { + if (memspaceOut == memory::HOST) { //check if cpu data allocated if (h_col_data_ == nullptr) { this->h_col_data_ = new index_type[n_ + 1]; @@ -94,7 +91,7 @@ namespace ReSolve } } - if ((memspaceOut == "cuda") || (memspaceOut == "hip")) { + if (memspaceOut == memory::DEVICE) { //check if cuda data allocated if (d_col_data_ == nullptr) { mem_.allocateArrayOnDevice(&d_col_data_, n_ + 1); @@ -147,7 +144,7 @@ namespace ReSolve } - int matrix::Csc::updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, std::string memspaceIn, std::string memspaceOut) + int matrix::Csc::updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut) { this->destroyMatrixData(memspaceOut); this->nnz_ = new_nnz; @@ -155,13 +152,13 @@ namespace ReSolve return i; } - int matrix::Csc::allocateMatrixData(std::string memspace) + int matrix::Csc::allocateMatrixData(memory::MemorySpace memspace) { index_type nnz_current = nnz_; if (is_expanded_) {nnz_current = nnz_expanded_;} destroyMatrixData(memspace);//just in case - if (memspace == "cpu") { + if (memspace == memory::HOST) { this->h_col_data_ = new index_type[n_ + 1]; std::fill(h_col_data_, h_col_data_ + n_ + 1, 0); this->h_row_data_ = new index_type[nnz_current]; @@ -173,7 +170,7 @@ namespace ReSolve return 0; } - if ((memspace == "cuda") || (memspace == "hip")) { + if (memspace == memory::DEVICE) { mem_.allocateArrayOnDevice(&d_col_data_, n_ + 1); 
mem_.allocateArrayOnDevice(&d_row_data_, nnz_current); mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); @@ -184,54 +181,56 @@ namespace ReSolve return -1; } - int matrix::Csc::copyData(std::string memspaceOut) + int matrix::Csc::copyData(memory::MemorySpace memspaceOut) { + using namespace ReSolve::memory; index_type nnz_current = nnz_; - if (is_expanded_) {nnz_current = nnz_expanded_;} - - if (memspaceOut == "cpu") { - //check if we need to copy or not - if ((d_data_updated_ == true) && (h_data_updated_ == false)) { - if (h_col_data_ == nullptr) { - h_col_data_ = new index_type[n_ + 1]; - } - if (h_row_data_ == nullptr) { - h_row_data_ = new index_type[nnz_current]; - } - if (h_val_data_ == nullptr) { - h_val_data_ = new real_type[nnz_current]; - } - mem_.copyArrayDeviceToHost(h_col_data_, d_col_data_, n_ + 1); - mem_.copyArrayDeviceToHost(h_row_data_, d_row_data_, nnz_current); - mem_.copyArrayDeviceToHost(h_val_data_, d_val_data_, nnz_current); - h_data_updated_ = true; - owns_cpu_data_ = true; - owns_cpu_vals_ = true; - } - return 0; + if (is_expanded_) { + nnz_current = nnz_expanded_; } - if ((memspaceOut == "cuda") || (memspaceOut == "hip")) { - if ((d_data_updated_ == false) && (h_data_updated_ == true)) { - if (d_col_data_ == nullptr) { - mem_.allocateArrayOnDevice(&d_col_data_, n_ + 1); + switch(memspaceOut) { + case HOST: + if ((d_data_updated_ == true) && (h_data_updated_ == false)) { + if (h_col_data_ == nullptr) { + h_col_data_ = new index_type[n_ + 1]; + } + if (h_row_data_ == nullptr) { + h_row_data_ = new index_type[nnz_current]; + } + if (h_val_data_ == nullptr) { + h_val_data_ = new real_type[nnz_current]; + } + mem_.copyArrayDeviceToHost(h_col_data_, d_col_data_, n_ + 1); + mem_.copyArrayDeviceToHost(h_row_data_, d_row_data_, nnz_current); + mem_.copyArrayDeviceToHost(h_val_data_, d_val_data_, nnz_current); + h_data_updated_ = true; + owns_cpu_data_ = true; + owns_cpu_vals_ = true; } - if (d_row_data_ == nullptr) { - 
mem_.allocateArrayOnDevice(&d_row_data_, nnz_current); + return 0; + case DEVICE: + if ((d_data_updated_ == false) && (h_data_updated_ == true)) { + if (d_col_data_ == nullptr) { + mem_.allocateArrayOnDevice(&d_col_data_, n_ + 1); + } + if (d_row_data_ == nullptr) { + mem_.allocateArrayOnDevice(&d_row_data_, nnz_current); + } + if (d_val_data_ == nullptr) { + mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); + } + mem_.copyArrayHostToDevice(d_col_data_, h_col_data_, n_ + 1); + mem_.copyArrayHostToDevice(d_row_data_, h_row_data_, nnz_current); + mem_.copyArrayHostToDevice(d_val_data_, h_val_data_, nnz_current); + d_data_updated_ = true; + owns_gpu_data_ = true; + owns_gpu_vals_ = true; } - if (d_val_data_ == nullptr) { - mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); - } - mem_.copyArrayHostToDevice(d_col_data_, h_col_data_, n_ + 1); - mem_.copyArrayHostToDevice(d_row_data_, h_row_data_, nnz_current); - mem_.copyArrayHostToDevice(d_val_data_, h_val_data_, nnz_current); - d_data_updated_ = true; - owns_gpu_data_ = true; - owns_gpu_vals_ = true; - } - return 0; - } - return -1; + return 0; + default: + return -1; + } // switch } } diff --git a/resolve/matrix/Csc.hpp b/resolve/matrix/Csc.hpp index f0598314..8a5dc551 100644 --- a/resolve/matrix/Csc.hpp +++ b/resolve/matrix/Csc.hpp @@ -15,18 +15,18 @@ namespace ReSolve { namespace matrix { bool expanded); ~Csc(); - virtual index_type* getRowData(std::string memspace); - virtual index_type* getColData(std::string memspace); - virtual real_type* getValues(std::string memspace); + virtual index_type* getRowData(memory::MemorySpace memspace); + virtual index_type* getColData(memory::MemorySpace memspace); + virtual real_type* getValues( memory::MemorySpace memspace); - virtual int updateData(index_type* row_data, index_type* col_data, real_type* val_data, std::string memspaceIn, std::string memspaceOut); - virtual int updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, 
std::string memspaceIn, std::string memspaceOut); + virtual int updateData(index_type* row_data, index_type* col_data, real_type* val_data, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut); + virtual int updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut); - virtual int allocateMatrixData(std::string memspace); + virtual int allocateMatrixData(memory::MemorySpace memspace); virtual void print() {return;} - virtual int copyData(std::string memspaceOut); + virtual int copyData(memory::MemorySpace memspaceOut); }; diff --git a/resolve/matrix/Csr.cpp b/resolve/matrix/Csr.cpp index 04e8dff1..0c08b641 100644 --- a/resolve/matrix/Csr.cpp +++ b/resolve/matrix/Csr.cpp @@ -24,64 +24,61 @@ namespace ReSolve { } - index_type* matrix::Csr::getRowData(std::string memspace) + index_type* matrix::Csr::getRowData(memory::MemorySpace memspace) { - if (memspace == "cpu") { - copyData("cpu"); - return this->h_row_data_; - } else { - if ((memspace == "cuda") || (memspace == "hip")) { - copyData(memspace); + using namespace ReSolve::memory; + copyData(memspace); + switch (memspace) { + case HOST: + return this->h_row_data_; + case DEVICE: return this->d_row_data_; - } else { + default: return nullptr; - } } } - index_type* matrix::Csr::getColData(std::string memspace) + index_type* matrix::Csr::getColData(memory::MemorySpace memspace) { - if (memspace == "cpu") { - copyData("cpu"); - return this->h_col_data_; - } else { - if ((memspace == "cuda") || (memspace == "hip")) { - copyData(memspace); + using namespace ReSolve::memory; + copyData(memspace); + switch (memspace) { + case HOST: + return this->h_col_data_; + case DEVICE: return this->d_col_data_; - } else { + default: return nullptr; - } } } - real_type* matrix::Csr::getValues(std::string memspace) + real_type* matrix::Csr::getValues(memory::MemorySpace memspace) { - if (memspace == "cpu") { - 
copyData("cpu"); - return this->h_val_data_; - } else { - if ((memspace == "cuda") || (memspace == "hip")) { - copyData(memspace); + using namespace ReSolve::memory; + copyData(memspace); + switch (memspace) { + case HOST: + return this->h_val_data_; + case DEVICE: return this->d_val_data_; - } else { + default: return nullptr; - } } } - int matrix::Csr::updateData(index_type* row_data, index_type* col_data, real_type* val_data, std::string memspaceIn, std::string memspaceOut) + int matrix::Csr::updateData(index_type* row_data, index_type* col_data, real_type* val_data, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut) { //four cases (for now) index_type nnz_current = nnz_; if (is_expanded_) {nnz_current = nnz_expanded_;} setNotUpdated(); int control = -1; - if ((memspaceIn == "cpu") && (memspaceOut == "cpu")){ control = 0;} - if ((memspaceIn == "cpu") && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 1;} - if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && (memspaceOut == "cpu")){ control = 2;} - if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 3;} + if ((memspaceIn == memory::HOST) && (memspaceOut == memory::HOST)) { control = 0;} + if ((memspaceIn == memory::HOST) && ((memspaceOut == memory::DEVICE))){ control = 1;} + if (((memspaceIn == memory::DEVICE)) && (memspaceOut == memory::HOST)) { control = 2;} + if (((memspaceIn == memory::DEVICE)) && ((memspaceOut == memory::DEVICE))){ control = 3;} - if (memspaceOut == "cpu") { + if (memspaceOut == memory::HOST) { //check if cpu data allocated if (h_row_data_ == nullptr) { this->h_row_data_ = new index_type[n_ + 1]; @@ -94,7 +91,7 @@ namespace ReSolve } } - if ((memspaceOut == "cuda") || (memspaceOut == "hip")) { + if (memspaceOut == memory::DEVICE) { //check if cuda data allocated if (d_row_data_ == nullptr) { mem_.allocateArrayOnDevice(&d_row_data_, n_ + 1); @@ -148,7 +145,7 @@ namespace ReSolve return 0; } 
- int matrix::Csr::updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, std::string memspaceIn, std::string memspaceOut) + int matrix::Csr::updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut) { this->destroyMatrixData(memspaceOut); this->nnz_ = new_nnz; @@ -156,13 +153,13 @@ namespace ReSolve return i; } - int matrix::Csr::allocateMatrixData(std::string memspace) + int matrix::Csr::allocateMatrixData(memory::MemorySpace memspace) { index_type nnz_current = nnz_; if (is_expanded_) {nnz_current = nnz_expanded_;} destroyMatrixData(memspace);//just in case - if (memspace == "cpu") { + if (memspace == memory::HOST) { this->h_row_data_ = new index_type[n_ + 1]; std::fill(h_row_data_, h_row_data_ + n_ + 1, 0); this->h_col_data_ = new index_type[nnz_current]; @@ -174,7 +171,7 @@ namespace ReSolve return 0; } - if ((memspace == "cuda") || (memspace == "hip")) { + if (memspace == memory::DEVICE) { mem_.allocateArrayOnDevice(&d_row_data_, n_ + 1); mem_.allocateArrayOnDevice(&d_col_data_, nnz_current); mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); @@ -185,54 +182,58 @@ namespace ReSolve return -1; } - int matrix::Csr::copyData(std::string memspaceOut) + int matrix::Csr::copyData(memory::MemorySpace memspaceOut) { - index_type nnz_current = nnz_; - if (is_expanded_) {nnz_current = nnz_expanded_;} + using namespace ReSolve::memory; - if (memspaceOut == "cpu") { - //check if we need to copy or not - if ((d_data_updated_ == true) && (h_data_updated_ == false)) { - if (h_row_data_ == nullptr) { - h_row_data_ = new index_type[n_ + 1]; - } - if (h_col_data_ == nullptr) { - h_col_data_ = new index_type[nnz_current]; - } - if (h_val_data_ == nullptr) { - h_val_data_ = new real_type[nnz_current]; - } - mem_.copyArrayDeviceToHost(h_row_data_, d_row_data_, n_ + 1); - mem_.copyArrayDeviceToHost(h_col_data_, d_col_data_, 
nnz_current); - mem_.copyArrayDeviceToHost(h_val_data_, d_val_data_, nnz_current); - h_data_updated_ = true; - owns_cpu_data_ = true; - owns_cpu_vals_ = true; - } - return 0; + index_type nnz_current = nnz_; + if (is_expanded_) { + nnz_current = nnz_expanded_; } - if ((memspaceOut == "cuda") || (memspaceOut == "hip")) { - if ((d_data_updated_ == false) && (h_data_updated_ == true)) { - if (d_row_data_ == nullptr) { - mem_.allocateArrayOnDevice(&d_row_data_, n_ + 1); - } - if (d_col_data_ == nullptr) { - mem_.allocateArrayOnDevice(&d_col_data_, nnz_current); + switch (memspaceOut) { + case HOST: + //check if we need to copy or not + if ((d_data_updated_ == true) && (h_data_updated_ == false)) { + if (h_row_data_ == nullptr) { + h_row_data_ = new index_type[n_ + 1]; + } + if (h_col_data_ == nullptr) { + h_col_data_ = new index_type[nnz_current]; + } + if (h_val_data_ == nullptr) { + h_val_data_ = new real_type[nnz_current]; + } + mem_.copyArrayDeviceToHost(h_row_data_, d_row_data_, n_ + 1); + mem_.copyArrayDeviceToHost(h_col_data_, d_col_data_, nnz_current); + mem_.copyArrayDeviceToHost(h_val_data_, d_val_data_, nnz_current); + h_data_updated_ = true; + owns_cpu_data_ = true; + owns_cpu_vals_ = true; } - if (d_val_data_ == nullptr) { - mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); + return 0; + case DEVICE: + if ((d_data_updated_ == false) && (h_data_updated_ == true)) { + if (d_row_data_ == nullptr) { + mem_.allocateArrayOnDevice(&d_row_data_, n_ + 1); + } + if (d_col_data_ == nullptr) { + mem_.allocateArrayOnDevice(&d_col_data_, nnz_current); + } + if (d_val_data_ == nullptr) { + mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); + } + mem_.copyArrayHostToDevice(d_row_data_, h_row_data_, n_ + 1); + mem_.copyArrayHostToDevice(d_col_data_, h_col_data_, nnz_current); + mem_.copyArrayHostToDevice(d_val_data_, h_val_data_, nnz_current); + d_data_updated_ = true; + owns_gpu_data_ = true; + owns_gpu_vals_ = true; } - mem_.copyArrayHostToDevice(d_row_data_, 
h_row_data_, n_ + 1); - mem_.copyArrayHostToDevice(d_col_data_, h_col_data_, nnz_current); - mem_.copyArrayHostToDevice(d_val_data_, h_val_data_, nnz_current); - d_data_updated_ = true; - owns_gpu_data_ = true; - owns_gpu_vals_ = true; - } - return 0; - } - return -1; + return 0; + default: + return -1; + } // switch } } // namespace ReSolve diff --git a/resolve/matrix/Csr.hpp b/resolve/matrix/Csr.hpp index 43c317de..a5d8f682 100644 --- a/resolve/matrix/Csr.hpp +++ b/resolve/matrix/Csr.hpp @@ -18,18 +18,18 @@ namespace ReSolve { namespace matrix { ~Csr(); - virtual index_type* getRowData(std::string memspace); - virtual index_type* getColData(std::string memspace); - virtual real_type* getValues(std::string memspace); + virtual index_type* getRowData(memory::MemorySpace memspace); + virtual index_type* getColData(memory::MemorySpace memspace); + virtual real_type* getValues( memory::MemorySpace memspace); - virtual int updateData(index_type* row_data, index_type* col_data, real_type* val_data, std::string memspaceIn, std::string memspaceOut); - virtual int updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, std::string memspaceIn, std::string memspaceOut); + virtual int updateData(index_type* row_data, index_type* col_data, real_type* val_data, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut); + virtual int updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut); - virtual int allocateMatrixData(std::string memspace); + virtual int allocateMatrixData(memory::MemorySpace memspace); virtual void print() {return;} - virtual int copyData(std::string memspaceOut); + virtual int copyData(memory::MemorySpace memspaceOut); }; }} // namespace ReSolve::matrix diff --git a/resolve/matrix/MatrixHandler.cpp b/resolve/matrix/MatrixHandler.cpp index 133a09f9..0a7124da 100644 --- a/resolve/matrix/MatrixHandler.cpp +++ 
b/resolve/matrix/MatrixHandler.cpp @@ -124,9 +124,9 @@ namespace ReSolve { index_type* nnz_counts = new index_type[n]; std::fill_n(nnz_counts, n, 0); - index_type* coo_rows = A_coo->getRowData("cpu"); - index_type* coo_cols = A_coo->getColData("cpu"); - real_type* coo_vals = A_coo->getValues("cpu"); + index_type* coo_rows = A_coo->getRowData(memory::HOST); + index_type* coo_cols = A_coo->getColData(memory::HOST); + real_type* coo_vals = A_coo->getValues( memory::HOST); index_type* diag_control = new index_type[n]; //for DEDUPLICATION of the diagonal std::fill_n(diag_control, n, 0); @@ -249,12 +249,12 @@ namespace ReSolve { #endif A_csr->setNnz(nnz_no_duplicates); if (memspace == "cpu"){ - A_csr->updateData(csr_ia, csr_ja, csr_a, "cpu", "cpu"); + A_csr->updateData(csr_ia, csr_ja, csr_a, memory::HOST, memory::HOST); } else { if (memspace == "cuda"){ - A_csr->updateData(csr_ia, csr_ja, csr_a, "cpu", "cuda"); + A_csr->updateData(csr_ia, csr_ja, csr_a, memory::HOST, memory::DEVICE); } else if (memspace == "hip"){ - A_csr->updateData(csr_ia, csr_ja, csr_a, "cpu", "cuda"); + A_csr->updateData(csr_ia, csr_ja, csr_a, memory::HOST, memory::DEVICE); } else { //display error } diff --git a/resolve/matrix/MatrixHandlerCpu.cpp b/resolve/matrix/MatrixHandlerCpu.cpp index 2c434dcb..d4799ffd 100644 --- a/resolve/matrix/MatrixHandlerCpu.cpp +++ b/resolve/matrix/MatrixHandlerCpu.cpp @@ -45,12 +45,12 @@ namespace ReSolve { // int error_sum = 0; if (matrixFormat == "csr") { matrix::Csr* A = (matrix::Csr*) Ageneric; - index_type* ia = A->getRowData("cpu"); - index_type* ja = A->getColData("cpu"); - real_type* a = A->getValues("cpu"); + index_type* ia = A->getRowData(memory::HOST); + index_type* ja = A->getColData(memory::HOST); + real_type* a = A->getValues( memory::HOST); - real_type* x_data = vec_x->getData("cpu"); - real_type* result_data = vec_result->getData("cpu"); + real_type* x_data = vec_x->getData(memory::HOST); + real_type* result_data = vec_result->getData(memory::HOST); 
real_type sum; real_type y; real_type t; @@ -70,7 +70,7 @@ namespace ReSolve { sum *= (*alpha); result_data[i] = result_data[i]*(*beta) + sum; } - vec_result->setDataUpdated("cpu"); + vec_result->setDataUpdated(memory::HOST); return 0; } else { out::error() << "MatVec not implemented (yet) for " @@ -100,13 +100,13 @@ namespace ReSolve { index_type nnz = A_csc->getNnz(); index_type n = A_csc->getNumColumns(); - index_type* rowIdxCsc = A_csc->getRowData("cpu"); - index_type* colPtrCsc = A_csc->getColData("cpu"); - real_type* valuesCsc = A_csc->getValues("cpu"); + index_type* rowIdxCsc = A_csc->getRowData(memory::HOST); + index_type* colPtrCsc = A_csc->getColData(memory::HOST); + real_type* valuesCsc = A_csc->getValues( memory::HOST); - index_type* rowPtrCsr = A_csr->getRowData("cpu"); - index_type* colIdxCsr = A_csr->getColData("cpu"); - real_type* valuesCsr = A_csr->getValues("cpu"); + index_type* rowPtrCsr = A_csr->getRowData(memory::HOST); + index_type* colIdxCsr = A_csr->getColData(memory::HOST); + real_type* valuesCsr = A_csr->getValues( memory::HOST); // Set all CSR row pointers to zero for (index_type i = 0; i <= n; ++i) { diff --git a/resolve/matrix/MatrixHandlerCuda.cpp b/resolve/matrix/MatrixHandlerCuda.cpp index 3405ba8d..e0ac7bb4 100644 --- a/resolve/matrix/MatrixHandlerCuda.cpp +++ b/resolve/matrix/MatrixHandlerCuda.cpp @@ -42,11 +42,11 @@ namespace ReSolve { cusparseStatus_t status; LinAlgWorkspaceCUDA* workspaceCUDA = workspace_; cusparseDnVecDescr_t vecx = workspaceCUDA->getVecX(); - cusparseCreateDnVec(&vecx, A->getNumRows(), vec_x->getData("cuda"), CUDA_R_64F); + cusparseCreateDnVec(&vecx, A->getNumRows(), vec_x->getData(memory::DEVICE), CUDA_R_64F); cusparseDnVecDescr_t vecAx = workspaceCUDA->getVecY(); - cusparseCreateDnVec(&vecAx, A->getNumRows(), vec_result->getData("cuda"), CUDA_R_64F); + cusparseCreateDnVec(&vecAx, A->getNumRows(), vec_result->getData(memory::DEVICE), CUDA_R_64F); cusparseSpMatDescr_t matA = 
workspaceCUDA->getSpmvMatrixDescriptor(); @@ -57,9 +57,9 @@ namespace ReSolve { A->getNumRows(), A->getNumColumns(), A->getNnzExpanded(), - A->getRowData("cuda"), - A->getColData("cuda"), - A->getValues("cuda"), + A->getRowData(memory::DEVICE), + A->getColData(memory::DEVICE), + A->getValues( memory::DEVICE), CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, @@ -105,7 +105,7 @@ namespace ReSolve { if (status) out::error() << "Matvec status: " << status << "Last error code: " << mem_.getLastDeviceError() << std::endl; - vec_result->setDataUpdated("cuda"); + vec_result->setDataUpdated(memory::DEVICE); cusparseDestroyDnVec(vecx); cusparseDestroyDnVec(vecAx); @@ -127,7 +127,7 @@ namespace ReSolve { index_type error_sum = 0; LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_; - A_csr->allocateMatrixData("cuda"); + A_csr->allocateMatrixData(memory::DEVICE); index_type n = A_csc->getNumRows(); index_type m = A_csc->getNumRows(); index_type nnz = A_csc->getNnz(); @@ -137,12 +137,12 @@ namespace ReSolve { n, m, nnz, - A_csc->getValues("cuda"), - A_csc->getColData("cuda"), - A_csc->getRowData("cuda"), - A_csr->getValues("cuda"), - A_csr->getRowData("cuda"), - A_csr->getColData("cuda"), + A_csc->getValues( memory::DEVICE), + A_csc->getColData(memory::DEVICE), + A_csc->getRowData(memory::DEVICE), + A_csr->getValues( memory::DEVICE), + A_csr->getRowData(memory::DEVICE), + A_csr->getColData(memory::DEVICE), CUDA_R_64F, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO, @@ -154,12 +154,12 @@ namespace ReSolve { n, m, nnz, - A_csc->getValues("cuda"), - A_csc->getColData("cuda"), - A_csc->getRowData("cuda"), - A_csr->getValues("cuda"), - A_csr->getRowData("cuda"), - A_csr->getColData("cuda"), + A_csc->getValues( memory::DEVICE), + A_csc->getColData(memory::DEVICE), + A_csc->getRowData(memory::DEVICE), + A_csr->getValues( memory::DEVICE), + A_csr->getRowData(memory::DEVICE), + A_csr->getColData(memory::DEVICE), CUDA_R_64F, CUSPARSE_ACTION_NUMERIC, 
CUSPARSE_INDEX_BASE_ZERO, diff --git a/resolve/matrix/MatrixHandlerHip.cpp b/resolve/matrix/MatrixHandlerHip.cpp index 370849fa..b4f8e483 100644 --- a/resolve/matrix/MatrixHandlerHip.cpp +++ b/resolve/matrix/MatrixHandlerHip.cpp @@ -62,9 +62,9 @@ namespace ReSolve { A->getNumColumns(), A->getNnzExpanded(), descrA, - A->getValues("cuda"), - A->getRowData("cuda"), - A->getColData("cuda"), // cuda is used as "device" + A->getValues( memory::DEVICE), + A->getRowData(memory::DEVICE), + A->getColData(memory::DEVICE), // cuda is used as "device" infoA); error_sum += status; mem_.deviceSynchronize(); @@ -79,20 +79,20 @@ namespace ReSolve { A->getNnzExpanded(), alpha, descrA, - A->getValues("cuda"), - A->getRowData("cuda"), - A->getColData("cuda"), + A->getValues( memory::DEVICE), + A->getRowData(memory::DEVICE), + A->getColData(memory::DEVICE), infoA, - vec_x->getData("cuda"), + vec_x->getData(memory::DEVICE), beta, - vec_result->getData("cuda")); + vec_result->getData(memory::DEVICE)); error_sum += status; mem_.deviceSynchronize(); if (status) out::error() << "Matvec status: " << status << "Last error code: " << mem_.getLastDeviceError() << std::endl; - vec_result->setDataUpdated("cuda"); + vec_result->setDataUpdated(memory::DEVICE); return error_sum; } else { @@ -114,7 +114,7 @@ namespace ReSolve { rocsparse_status status; - A_csr->allocateMatrixData("cuda"); + A_csr->allocateMatrixData(memory::DEVICE); index_type n = A_csc->getNumRows(); index_type m = A_csc->getNumRows(); index_type nnz = A_csc->getNnz(); @@ -125,8 +125,8 @@ namespace ReSolve { n, m, nnz, - A_csc->getColData("cuda"), - A_csc->getRowData("cuda"), + A_csc->getColData(memory::DEVICE), + A_csc->getRowData(memory::DEVICE), rocsparse_action_numeric, &bufferSize); @@ -137,12 +137,12 @@ namespace ReSolve { n, m, nnz, - A_csc->getValues("cuda"), - A_csc->getColData("cuda"), - A_csc->getRowData("cuda"), - A_csr->getValues("cuda"), - A_csr->getRowData("cuda"), - A_csr->getColData("cuda"), + A_csc->getValues( 
memory::DEVICE), + A_csc->getColData(memory::DEVICE), + A_csc->getRowData(memory::DEVICE), + A_csr->getValues( memory::DEVICE), + A_csr->getRowData(memory::DEVICE), + A_csr->getColData(memory::DEVICE), rocsparse_action_numeric, rocsparse_index_base_zero, d_work); diff --git a/resolve/matrix/Sparse.cpp b/resolve/matrix/Sparse.cpp index 4a16ec98..faa86e11 100644 --- a/resolve/matrix/Sparse.cpp +++ b/resolve/matrix/Sparse.cpp @@ -73,8 +73,8 @@ namespace ReSolve { namespace matrix { Sparse::~Sparse() { - this->destroyMatrixData("cpu"); - this->destroyMatrixData("cuda"); + this->destroyMatrixData(memory::HOST); + this->destroyMatrixData(memory::DEVICE); } void Sparse::setNotUpdated() @@ -133,58 +133,59 @@ namespace ReSolve { namespace matrix { this->nnz_ = nnz_new; } - int Sparse::setUpdated(std::string what) + int Sparse::setUpdated(memory::MemorySpace memspace) { - if (what == "cpu") - { - h_data_updated_ = true; - d_data_updated_ = false; - } else { - if (what == "cuda"){ + using namespace ReSolve::memory; + switch (memspace) { + case HOST: + h_data_updated_ = true; + d_data_updated_ = false; + break; + case DEVICE: d_data_updated_ = true; h_data_updated_ = false; - } else { - return -1; - } + break; } return 0; } - int Sparse::setMatrixData(index_type* row_data, index_type* col_data, real_type* val_data, std::string memspace) + int Sparse::setMatrixData(index_type* row_data, index_type* col_data, real_type* val_data, memory::MemorySpace memspace) { + using namespace ReSolve::memory; setNotUpdated(); - if (memspace == "cpu"){ - this->h_row_data_ = row_data; - this->h_col_data_ = col_data; - this->h_val_data_ = val_data; - h_data_updated_ = true; - } else { - if (memspace == "cuda"){ + switch (memspace) { + case HOST: + this->h_row_data_ = row_data; + this->h_col_data_ = col_data; + this->h_val_data_ = val_data; + h_data_updated_ = true; + break; + case DEVICE: this->d_row_data_ = row_data; this->d_col_data_ = col_data; this->d_val_data_ = val_data; d_data_updated_ = 
true; - } else { - return -1; - } + break; } return 0; } - int Sparse::destroyMatrixData(std::string memspace) - { - if (memspace == "cpu"){ - if (owns_cpu_data_) { - delete [] h_row_data_; - delete [] h_col_data_; - } - if (owns_cpu_vals_) { - delete [] h_val_data_; - } - } else { - if (memspace == "cuda"){ + int Sparse::destroyMatrixData(memory::MemorySpace memspace) + { + using namespace ReSolve::memory; + switch (memspace) { + case HOST: + if (owns_cpu_data_) { + delete [] h_row_data_; + delete [] h_col_data_; + } + if (owns_cpu_vals_) { + delete [] h_val_data_; + } + return 0; + case DEVICE: if (owns_gpu_data_) { mem_.deleteOnDevice(d_row_data_); mem_.deleteOnDevice(d_col_data_); @@ -192,14 +193,13 @@ namespace ReSolve { namespace matrix { if (owns_gpu_vals_) { mem_.deleteOnDevice(d_val_data_); } - } else { + return 0; + default: return -1; - } } - return 0; } - int Sparse::updateValues(real_type* new_vals, std::string memspaceIn, std::string memspaceOut) + int Sparse::updateValues(real_type* new_vals, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut) { index_type nnz_current = nnz_; @@ -207,19 +207,19 @@ namespace ReSolve { namespace matrix { //four cases (for now) setNotUpdated(); int control=-1; - if ((memspaceIn == "cpu") && (memspaceOut == "cpu")){ control = 0;} - if ((memspaceIn == "cpu") && (memspaceOut == "cuda")){ control = 1;} - if ((memspaceIn == "cuda") && (memspaceOut == "cpu")){ control = 2;} - if ((memspaceIn == "cuda") && (memspaceOut == "cuda")){ control = 3;} + if ((memspaceIn == memory::HOST) && (memspaceOut == memory::HOST)) { control = 0;} + if ((memspaceIn == memory::HOST) && (memspaceOut == memory::DEVICE)){ control = 1;} + if ((memspaceIn == memory::DEVICE) && (memspaceOut == memory::HOST)) { control = 2;} + if ((memspaceIn == memory::DEVICE) && (memspaceOut == memory::DEVICE)){ control = 3;} - if (memspaceOut == "cpu") { + if (memspaceOut == memory::HOST) { //check if cpu data allocated if (h_val_data_ == nullptr) { 
this->h_val_data_ = new real_type[nnz_current]; } } - if (memspaceOut == "cuda") { + if (memspaceOut == memory::DEVICE) { //check if cuda data allocated if (d_val_data_ == nullptr) { mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); @@ -253,21 +253,22 @@ namespace ReSolve { namespace matrix { return 0; } - int Sparse::setNewValues(real_type* new_vals, std::string memspace) + int Sparse::setNewValues(real_type* new_vals, memory::MemorySpace memspace) { - + using namespace ReSolve::memory; setNotUpdated(); - if (memspace == "cpu"){ - this->h_val_data_ = new_vals; - h_data_updated_ = true; - } else { - if (memspace == "cuda"){ + switch (memspace) { + case HOST: + this->h_val_data_ = new_vals; + h_data_updated_ = true; + break; + case DEVICE: this->d_val_data_ = new_vals; d_data_updated_ = true; - } else { + break; + default: return -1; - } } return 0; } diff --git a/resolve/matrix/Sparse.hpp b/resolve/matrix/Sparse.hpp index 1196c38e..96121acb 100644 --- a/resolve/matrix/Sparse.hpp +++ b/resolve/matrix/Sparse.hpp @@ -31,31 +31,31 @@ namespace ReSolve { namespace matrix { void setExpanded(bool expanded); void setNnzExpanded(index_type nnz_expanded_new); void setNnz(index_type nnz_new); // for resetting when removing duplicates - index_type setUpdated(std::string what); + index_type setUpdated(memory::MemorySpace what); - virtual index_type* getRowData(std::string memspace) = 0; - virtual index_type* getColData(std::string memspace) = 0; - virtual real_type* getValues(std::string memspace) = 0; + virtual index_type* getRowData(memory::MemorySpace memspace) = 0; + virtual index_type* getColData(memory::MemorySpace memspace) = 0; + virtual real_type* getValues( memory::MemorySpace memspace) = 0; - virtual int updateData(index_type* row_data, index_type* col_data, real_type* val_data, std::string memspaceIn, std::string memspaceOut) = 0; - virtual int updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, std::string 
memspaceIn, std::string memspaceOut) = 0; + virtual int updateData(index_type* row_data, index_type* col_data, real_type* val_data, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut) = 0; + virtual int updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut) = 0; - virtual int allocateMatrixData(std::string memspace) = 0; - int setMatrixData(index_type* row_data, index_type* col_data, real_type* val_data, std::string memspace); + virtual int allocateMatrixData(memory::MemorySpace memspace) = 0; + int setMatrixData(index_type* row_data, index_type* col_data, real_type* val_data, memory::MemorySpace memspace); - int destroyMatrixData(std::string memspace); + int destroyMatrixData(memory::MemorySpace memspace); virtual void print() = 0; - virtual int copyData(std::string memspaceOut) = 0; + virtual int copyData(memory::MemorySpace memspaceOut) = 0; //update Values just updates values; it allocates if necessary. //values have the same dimensions between different formats - virtual int updateValues(real_type* new_vals, std::string memspaceIn, std::string memspaceOut); + virtual int updateValues(real_type* new_vals, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut); //set new values just sets the pointer, use caution. 
- virtual int setNewValues(real_type* new_vals, std::string memspace); + virtual int setNewValues(real_type* new_vals, memory::MemorySpace memspace); protected: //size diff --git a/resolve/matrix/io.cpp b/resolve/matrix/io.cpp index 36fb5f1b..0d96a5e1 100644 --- a/resolve/matrix/io.cpp +++ b/resolve/matrix/io.cpp @@ -53,7 +53,7 @@ namespace ReSolve { namespace io { coo_vals[i] = c; i++; } - A->setMatrixData(coo_rows, coo_cols, coo_vals, "cpu"); + A->setMatrixData(coo_rows, coo_cols, coo_vals, memory::HOST); return A; } @@ -116,9 +116,9 @@ namespace ReSolve { namespace io { } A->setNnz(nnz); //create coo arrays - index_type* coo_rows = A->getRowData("cpu"); - index_type* coo_cols = A->getColData("cpu"); - real_type* coo_vals = A->getValues("cpu"); + index_type* coo_rows = A->getRowData(memory::HOST); + index_type* coo_cols = A->getColData(memory::HOST); + real_type* coo_vals = A->getValues( memory::HOST); i = 0; index_type a, b; real_type c; @@ -171,7 +171,7 @@ namespace ReSolve { namespace io { int writeVectorToFile(vector_type* vec_x, std::ostream& file_out) { - real_type* x_data = vec_x->getData("cpu"); + real_type* x_data = vec_x->getData(memory::HOST); // std::ofstream file_out (filename, std::ofstream::out); file_out << "%%MatrixMarket matrix array real general \n"; file_out << "% ID: XXX \n"; diff --git a/resolve/vector/Vector.cpp b/resolve/vector/Vector.cpp index df3c475d..0a62bd02 100644 --- a/resolve/vector/Vector.cpp +++ b/resolve/vector/Vector.cpp @@ -52,52 +52,51 @@ namespace ReSolve { namespace vector { return k_; } - void Vector::setData(real_type* data, std::string memspace) + void Vector::setData(real_type* data, memory::MemorySpace memspace) { - - if (memspace == "cpu") { - h_data_ = data; - cpu_updated_ = true; - gpu_updated_ = false; - } else { - if ((memspace == "cuda") || (memspace == "hip")) { + using namespace ReSolve::memory; + switch (memspace) { + case HOST: + h_data_ = data; + cpu_updated_ = true; + gpu_updated_ = false; + break; + case 
DEVICE: d_data_ = data; gpu_updated_ = true; cpu_updated_ = false; - } else { - //error - } + break; } } - void Vector::setDataUpdated(std::string memspace) + void Vector::setDataUpdated(memory::MemorySpace memspace) { - if (memspace == "cpu") { - cpu_updated_ = true; - gpu_updated_ = false; - } else { - if ((memspace == "cuda") || (memspace == "hip")) { + using namespace ReSolve::memory; + switch (memspace) { + case HOST: + cpu_updated_ = true; + gpu_updated_ = false; + break; + case DEVICE: gpu_updated_ = true; cpu_updated_ = false; - } else { - //error - } + break; } } - int Vector::update(real_type* data, std::string memspaceIn, std::string memspaceOut) + int Vector::update(real_type* data, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut) { int control=-1; - if ((memspaceIn == "cpu") && (memspaceOut == "cpu")){ control = 0;} - if ((memspaceIn == "cpu") && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 1;} - if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && (memspaceOut == "cpu")){ control = 2;} - if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 3;} + if ((memspaceIn == memory::HOST) && (memspaceOut == memory::HOST)) { control = 0;} + if ((memspaceIn == memory::HOST) && (memspaceOut == memory::DEVICE)){ control = 1;} + if ((memspaceIn == memory::DEVICE) && (memspaceOut == memory::HOST)) { control = 2;} + if ((memspaceIn == memory::DEVICE) && (memspaceOut == memory::DEVICE)){ control = 3;} - if ((memspaceOut == "cpu") && (h_data_ == nullptr)){ + if ((memspaceOut == memory::HOST) && (h_data_ == nullptr)) { //allocate first h_data_ = new real_type[n_ * k_]; } - if (((memspaceOut == "cuda") || (memspaceOut == "hip")) && (d_data_ == nullptr)){ + if ((memspaceOut == memory::DEVICE) && (d_data_ == nullptr)) { //allocate first mem_.allocateArrayOnDevice(&d_data_, n_ * k_); } @@ -133,26 +132,26 @@ namespace ReSolve { namespace vector { return 0; } - real_type* 
Vector::getData(std::string memspace) + real_type* Vector::getData(memory::MemorySpace memspace) { return this->getData(0, memspace); } - real_type* Vector::getData(index_type i, std::string memspace) + real_type* Vector::getData(index_type i, memory::MemorySpace memspace) { - if ((memspace == "cpu") && (cpu_updated_ == false) && (gpu_updated_ == true )) { - copyData(memspace, "cpu"); + if ((memspace == memory::HOST) && (cpu_updated_ == false) && (gpu_updated_ == true )) { + copyData(memspace, memory::HOST); owns_cpu_data_ = true; } - if (((memspace == "cuda") || (memspace == "hip")) && (gpu_updated_ == false) && (cpu_updated_ == true )) { - copyData("cpu", memspace); + if ((memspace == memory::DEVICE) && (gpu_updated_ == false) && (cpu_updated_ == true )) { + copyData(memory::HOST, memspace); owns_gpu_data_ = true; } - if (memspace == "cpu") { + if (memspace == memory::HOST) { return &h_data_[i * n_current_]; } else { - if ((memspace == "cuda") || (memspace == "hip")){ + if (memspace == memory::DEVICE){ return &d_data_[i * n_current_]; } else { return nullptr; @@ -161,17 +160,17 @@ namespace ReSolve { namespace vector { } - int Vector::copyData(std::string memspaceIn, std::string memspaceOut) + int Vector::copyData(memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut) { int control=-1; - if ((memspaceIn == "cpu") && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 0;} - if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && (memspaceOut == "cpu")){ control = 1;} + if ((memspaceIn == memory::HOST) && (memspaceOut == memory::DEVICE)){ control = 0;} + if ((memspaceIn == memory::DEVICE) && (memspaceOut == memory::HOST)) { control = 1;} - if ((memspaceOut == "cpu") && (h_data_ == nullptr)){ + if ((memspaceOut == memory::HOST) && (h_data_ == nullptr)) { //allocate first h_data_ = new real_type[n_ * k_]; } - if (((memspaceOut == "cuda") || (memspaceOut == "hip")) && (d_data_ == nullptr)){ + if ((memspaceOut == memory::DEVICE) && (d_data_ == 
nullptr)) { //allocate first mem_.allocateArrayOnDevice(&d_data_, n_ * k_); } @@ -193,110 +192,118 @@ namespace ReSolve { namespace vector { return 0; } - void Vector::allocate(std::string memspace) + void Vector::allocate(memory::MemorySpace memspace) { - if (memspace == "cpu") { - delete [] h_data_; - h_data_ = new real_type[n_ * k_]; - owns_cpu_data_ = true; - } else { - if ((memspace == "cuda") || (memspace == "hip")) { + using namespace ReSolve::memory; + switch (memspace) { + case HOST: + delete [] h_data_; + h_data_ = new real_type[n_ * k_]; + owns_cpu_data_ = true; + break; + case DEVICE: mem_.deleteOnDevice(d_data_); mem_.allocateArrayOnDevice(&d_data_, n_ * k_); owns_gpu_data_ = true; - } else { - std::cout<<"wrong memspace " <k_ < i){ return nullptr; @@ -315,38 +322,38 @@ namespace ReSolve { namespace vector { } } - int Vector::deepCopyVectorData(real_type* dest, index_type i, std::string memspaceOut) + int Vector::deepCopyVectorData(real_type* dest, index_type i, memory::MemorySpace memspaceOut) { + using namespace ReSolve::memory; if (i > this->k_) { return -1; } else { real_type* data = this->getData(i, memspaceOut); - if (memspaceOut == "cpu") { - mem_.copyArrayHostToHost(dest, data, n_current_); - } else { - if ((memspaceOut == "cuda") || (memspaceOut == "hip")) { + switch (memspaceOut) { + case HOST: + mem_.copyArrayHostToHost(dest, data, n_current_); + break; + case DEVICE: mem_.copyArrayDeviceToDevice(dest, data, n_current_); - } else { - //error - } + break; } return 0; } } - int Vector::deepCopyVectorData(real_type* dest, std::string memspaceOut) + int Vector::deepCopyVectorData(real_type* dest, memory::MemorySpace memspaceOut) { + using namespace ReSolve::memory; real_type* data = this->getData(memspaceOut); - if (memspaceOut == "cpu") { - mem_.copyArrayHostToHost(dest, data, n_current_ * k_); - } else { - if ((memspaceOut == "cuda") || (memspaceOut == "hip")) { + switch (memspaceOut) { + case HOST: + mem_.copyArrayHostToHost(dest, data, 
n_current_ * k_); + break; + case DEVICE: mem_.copyArrayDeviceToDevice(dest, data, n_current_ * k_); - } else { - //error - } + break; } return 0; - } + }} // namespace ReSolve::vector diff --git a/resolve/vector/Vector.hpp b/resolve/vector/Vector.hpp index 9d1bd452..5f86ef7f 100644 --- a/resolve/vector/Vector.hpp +++ b/resolve/vector/Vector.hpp @@ -11,26 +11,26 @@ namespace ReSolve { namespace vector { Vector(index_type n, index_type k); ~Vector(); - int update(real_type* data, std::string memspaceIn, std::string memspaceOut); - real_type* getData(std::string memspace); - real_type* getData(index_type i, std::string memspace); // get pointer to i-th vector in multivector + int update(real_type* data, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut); + real_type* getData(memory::MemorySpace memspace); + real_type* getData(index_type i, memory::MemorySpace memspace); // get pointer to i-th vector in multivector index_type getSize(); index_type getCurrentSize(); index_type getNumVectors(); - void setDataUpdated(std::string memspace); - void setData(real_type* data, std::string memspace); - void allocate(std::string memspace); - void setToZero(std::string memspace); - void setToZero(index_type i, std::string memspace); // set i-th ivector to 0 - void setToConst(real_type C, std::string memspace); - void setToConst(index_type i, real_type C, std::string memspace); // set i-th vector to C - needed for unit tests, Gram Schmidt tests - int copyData(std::string memspaceIn, std::string memspaceOut); + void setDataUpdated(memory::MemorySpace memspace); + void setData(real_type* data, memory::MemorySpace memspace); + void allocate(memory::MemorySpace memspace); + void setToZero(memory::MemorySpace memspace); + void setToZero(index_type i, memory::MemorySpace memspace); // set i-th ivector to 0 + void setToConst(real_type C, memory::MemorySpace memspace); + void setToConst(index_type i, real_type C, memory::MemorySpace memspace); // set i-th vector to C - 
needed for unit tests, Gram Schmidt tests + int copyData(memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut); int setCurrentSize(index_type new_n_current); - real_type* getVectorData(index_type i, std::string memspace); // get ith vector data out of multivector - int deepCopyVectorData(real_type* dest, index_type i, std::string memspace); - int deepCopyVectorData(real_type* dest, std::string memspace); //copy FULL multivector + real_type* getVectorData(index_type i, memory::MemorySpace memspace); // get ith vector data out of multivector + int deepCopyVectorData(real_type* dest, index_type i, memory::MemorySpace memspace); + int deepCopyVectorData(real_type* dest, memory::MemorySpace memspace); //copy FULL multivector private: index_type n_; ///< size diff --git a/resolve/vector/VectorHandlerCpu.cpp b/resolve/vector/VectorHandlerCpu.cpp index f5cc463d..a8317a89 100644 --- a/resolve/vector/VectorHandlerCpu.cpp +++ b/resolve/vector/VectorHandlerCpu.cpp @@ -47,8 +47,8 @@ namespace ReSolve { real_type VectorHandlerCpu::dot(vector::Vector* x, vector::Vector* y) { - real_type* x_data = x->getData("cpu"); - real_type* y_data = y->getData("cpu"); + real_type* x_data = x->getData(memory::HOST); + real_type* y_data = y->getData(memory::HOST); real_type sum = 0.0; real_type c = 0.0; // real_type t, y; @@ -72,7 +72,7 @@ namespace ReSolve { */ void VectorHandlerCpu::scal(const real_type* alpha, vector::Vector* x) { - real_type* x_data = x->getData("cpu"); + real_type* x_data = x->getData(memory::HOST); for (int i = 0; i < x->getSize(); ++i){ x_data[i] *= (*alpha); @@ -91,8 +91,8 @@ namespace ReSolve { void VectorHandlerCpu::axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y) { //AXPY: y = alpha * x + y - real_type* x_data = x->getData("cpu"); - real_type* y_data = y->getData("cpu"); + real_type* x_data = x->getData(memory::HOST); + real_type* y_data = y->getData(memory::HOST); for (int i = 0; i < x->getSize(); ++i) { y_data[i] = (*alpha) * x_data[i] 
+ y_data[i]; } diff --git a/resolve/vector/VectorHandlerCuda.cpp b/resolve/vector/VectorHandlerCuda.cpp index 3c887e85..5871fd5a 100644 --- a/resolve/vector/VectorHandlerCuda.cpp +++ b/resolve/vector/VectorHandlerCuda.cpp @@ -50,7 +50,7 @@ namespace ReSolve { LinAlgWorkspaceCUDA* workspaceCUDA = workspace_; cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); double nrm = 0.0; - cublasStatus_t st= cublasDdot (handle_cublas, x->getSize(), x->getData("cuda"), 1, y->getData("cuda"), 1, &nrm); + cublasStatus_t st= cublasDdot (handle_cublas, x->getSize(), x->getData(memory::DEVICE), 1, y->getData(memory::DEVICE), 1, &nrm); if (st!=0) {printf("dot product crashed with code %d \n", st);} return nrm; } @@ -67,7 +67,7 @@ namespace ReSolve { { LinAlgWorkspaceCUDA* workspaceCUDA = workspace_; cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); - cublasStatus_t st = cublasDscal(handle_cublas, x->getSize(), alpha, x->getData("cuda"), 1); + cublasStatus_t st = cublasDscal(handle_cublas, x->getSize(), alpha, x->getData(memory::DEVICE), 1); if (st!=0) { ReSolve::io::Logger::error() << "scal crashed with code " << st << "\n"; } @@ -90,9 +90,9 @@ namespace ReSolve { cublasDaxpy(handle_cublas, x->getSize(), alpha, - x->getData("cuda"), + x->getData(memory::DEVICE), 1, - y->getData("cuda"), + y->getData(memory::DEVICE), 1); } @@ -131,12 +131,12 @@ namespace ReSolve { n, k, alpha, - V->getData("cuda"), + V->getData(memory::DEVICE), n, - y->getData("cuda"), + y->getData(memory::DEVICE), 1, beta, - x->getData("cuda"), + x->getData(memory::DEVICE), 1); } else { @@ -145,12 +145,12 @@ namespace ReSolve { n, k, alpha, - V->getData("cuda"), + V->getData(memory::DEVICE), n, - y->getData("cuda"), + y->getData(memory::DEVICE), 1, beta, - x->getData("cuda"), + x->getData(memory::DEVICE), 1); } } @@ -171,7 +171,7 @@ namespace ReSolve { { using namespace constants; if (k < 200) { - mass_axpy(size, k, x->getData("cuda"), y->getData("cuda"),alpha->getData("cuda")); + 
mass_axpy(size, k, x->getData(memory::DEVICE), y->getData(memory::DEVICE),alpha->getData(memory::DEVICE)); } else { LinAlgWorkspaceCUDA* workspaceCUDA = workspace_; cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); @@ -182,12 +182,12 @@ namespace ReSolve { 1, // n k + 1, // k &MINUSONE, // alpha - x->getData("cuda"), // A + x->getData(memory::DEVICE), // A size, // lda - alpha->getData("cuda"), // B + alpha->getData(memory::DEVICE), // B k + 1, // ldb &ONE, - y->getData("cuda"), // c + y->getData(memory::DEVICE), // c size); // ldc } } @@ -212,7 +212,7 @@ namespace ReSolve { using namespace constants; if (k < 200) { - mass_inner_product_two_vectors(size, k, x->getData("cuda") , x->getData(1, "cuda"), V->getData("cuda"), res->getData("cuda")); + mass_inner_product_two_vectors(size, k, x->getData(memory::DEVICE) , x->getData(1, memory::DEVICE), V->getData(memory::DEVICE), res->getData(memory::DEVICE)); } else { LinAlgWorkspaceCUDA* workspaceCUDA = workspace_; cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); @@ -223,12 +223,12 @@ namespace ReSolve { 2, //n size, //k &ONE, //alpha - V->getData("cuda"), //A + V->getData(memory::DEVICE), //A size, //lda - x->getData("cuda"), //B + x->getData(memory::DEVICE), //B size, //ldb &ZERO, - res->getData("cuda"), //c + res->getData(memory::DEVICE), //c k + 1); //ldc } } diff --git a/resolve/vector/VectorHandlerHip.cpp b/resolve/vector/VectorHandlerHip.cpp index 9f2927c7..1e1195fc 100644 --- a/resolve/vector/VectorHandlerHip.cpp +++ b/resolve/vector/VectorHandlerHip.cpp @@ -50,7 +50,7 @@ namespace ReSolve { LinAlgWorkspaceHIP* workspaceHIP = workspace_; rocblas_handle handle_rocblas = workspaceHIP->getRocblasHandle(); double nrm = 0.0; - rocblas_status st= rocblas_ddot (handle_rocblas, x->getSize(), x->getData("hip"), 1, y->getData("hip"), 1, &nrm); + rocblas_status st= rocblas_ddot (handle_rocblas, x->getSize(), x->getData(memory::DEVICE), 1, y->getData(memory::DEVICE), 1, &nrm); if (st!=0) 
{printf("dot product crashed with code %d \n", st);} return nrm; } @@ -67,7 +67,7 @@ namespace ReSolve { { LinAlgWorkspaceHIP* workspaceHIP = workspace_; rocblas_handle handle_rocblas = workspaceHIP->getRocblasHandle(); - rocblas_status st = rocblas_dscal(handle_rocblas, x->getSize(), alpha, x->getData("hip"), 1); + rocblas_status st = rocblas_dscal(handle_rocblas, x->getSize(), alpha, x->getData(memory::DEVICE), 1); if (st!=0) { ReSolve::io::Logger::error() << "scal crashed with code " << st << "\n"; } @@ -90,9 +90,9 @@ namespace ReSolve { rocblas_daxpy(handle_rocblas, x->getSize(), alpha, - x->getData("hip"), + x->getData(memory::DEVICE), 1, - y->getData("hip"), + y->getData(memory::DEVICE), 1); } @@ -131,12 +131,12 @@ namespace ReSolve { n, k, alpha, - V->getData("hip"), + V->getData(memory::DEVICE), n, - y->getData("hip"), + y->getData(memory::DEVICE), 1, beta, - x->getData("hip"), + x->getData(memory::DEVICE), 1); } else { @@ -145,12 +145,12 @@ namespace ReSolve { n, k, alpha, - V->getData("hip"), + V->getData(memory::DEVICE), n, - y->getData("hip"), + y->getData(memory::DEVICE), 1, beta, - x->getData("hip"), + x->getData(memory::DEVICE), 1); } } @@ -171,7 +171,7 @@ namespace ReSolve { { using namespace constants; if (k < 200) { - mass_axpy(size, k, x->getData("hip"), y->getData("hip"),alpha->getData("hip")); + mass_axpy(size, k, x->getData(memory::DEVICE), y->getData(memory::DEVICE),alpha->getData(memory::DEVICE)); } else { LinAlgWorkspaceHIP* workspaceHIP = workspace_; rocblas_handle handle_rocblas = workspaceHIP->getRocblasHandle(); @@ -182,12 +182,12 @@ namespace ReSolve { 1, // n k, // k &MINUSONE, // alpha - x->getData("hip"), // A + x->getData(memory::DEVICE), // A size, // lda - alpha->getData("hip"), // B + alpha->getData(memory::DEVICE), // B k, // ldb &ONE, - y->getData("hip"), // c + y->getData(memory::DEVICE), // c size); // ldc } } @@ -212,7 +212,7 @@ namespace ReSolve { using namespace constants; if (k < 200) { - 
mass_inner_product_two_vectors(size, k, x->getData("hip") , x->getData(1, "hip"), V->getData("hip"), res->getData("hip")); + mass_inner_product_two_vectors(size, k, x->getData(memory::DEVICE) , x->getData(1, memory::DEVICE), V->getData(memory::DEVICE), res->getData(memory::DEVICE)); } else { LinAlgWorkspaceHIP* workspaceHIP = workspace_; rocblas_handle handle_rocblas = workspaceHIP->getRocblasHandle(); @@ -223,12 +223,12 @@ namespace ReSolve { 2, //n size, //k &ONE, //alpha - V->getData("hip"), //A + V->getData(memory::DEVICE), //A size, //lda - x->getData("hip"), //B + x->getData(memory::DEVICE), //B size, //ldb &ZERO, - res->getData("hip"), //c + res->getData(memory::DEVICE), //c k + 1); //ldc } } diff --git a/tests/functionality/testKLU.cpp b/tests/functionality/testKLU.cpp index b067f417..083c11d1 100644 --- a/tests/functionality/testKLU.cpp +++ b/tests/functionality/testKLU.cpp @@ -74,8 +74,8 @@ int main(int argc, char *argv[]) // Convert first matrix to CSR format matrix_handler->coo2csr(A_coo, A, "cpu"); - vec_rhs->update(rhs, "cpu", "cpu"); - vec_rhs->setDataUpdated("cpu"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); + vec_rhs->setDataUpdated(ReSolve::memory::HOST); // Solve the first system using KLU status = KLU->setup(A); @@ -100,11 +100,11 @@ int main(int argc, char *argv[]) x_data[i] = 1.0; } - vec_test->setData(x_data, "cpu"); - vec_r->update(rhs, "cpu", "cpu"); - vec_diff->update(x_data, "cpu", "cpu"); + vec_test->setData(x_data, ReSolve::memory::HOST); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); + vec_diff->update(x_data, ReSolve::memory::HOST, ReSolve::memory::HOST); - // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, "cpu")); + // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, ReSolve::memory::HOST)); matrix_handler->setValuesChanged(true, "cpu"); status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr","cpu"); error_sum += status; @@ 
-123,13 +123,13 @@ int main(int argc, char *argv[]) real_type normDiffMatrix1 = sqrt(vector_handler->dot(vec_diff, vec_diff, "cpu")); //compute the residual using exact solution - vec_r->update(rhs, "cpu", "cpu"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE,"csr", "cpu"); error_sum += status; real_type exactSol_normRmatrix1 = sqrt(vector_handler->dot(vec_r, vec_r, "cpu")); //evaluate the residual ON THE CPU using COMPUTED solution - vec_r->update(rhs, "cpu", "cpu"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cpu"); error_sum += status; @@ -165,7 +165,7 @@ int main(int argc, char *argv[]) rhs2_file.close(); matrix_handler->coo2csr(A_coo, A, "cpu"); - vec_rhs->update(rhs, "cpu", "cpu"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); // and solve it too status = KLU->refactorize(); @@ -174,7 +174,7 @@ int main(int argc, char *argv[]) status = KLU->solve(vec_rhs, vec_x); error_sum += status; - vec_r->update(rhs, "cpu", "cpu"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); matrix_handler->setValuesChanged(true, "cpu"); status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cpu"); @@ -185,13 +185,13 @@ int main(int argc, char *argv[]) //for testing only - control real_type normB2 = sqrt(vector_handler->dot(vec_rhs, vec_rhs, "cpu")); //compute x-x_true - vec_diff->update(x_data, "cpu", "cpu"); + vec_diff->update(x_data, ReSolve::memory::HOST, ReSolve::memory::HOST); vector_handler->axpy(&MINUSONE, vec_x, vec_diff, "cpu"); //evaluate its norm real_type normDiffMatrix2 = sqrt(vector_handler->dot(vec_diff, vec_diff, "cpu")); //compute the residual using exact solution - vec_r->update(rhs, "cpu", "cpu"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); status = matrix_handler->matvec(A, vec_test, vec_r, 
&ONE, &MINUSONE, "csr", "cpu"); error_sum += status; real_type exactSol_normRmatrix2 = sqrt(vector_handler->dot(vec_r, vec_r, "cpu")); diff --git a/tests/functionality/testKLU_GLU.cpp b/tests/functionality/testKLU_GLU.cpp index ddaf3b31..702141ec 100644 --- a/tests/functionality/testKLU_GLU.cpp +++ b/tests/functionality/testKLU_GLU.cpp @@ -75,15 +75,15 @@ int main(int argc, char *argv[]) real_type* x = new real_type[A->getNumRows()]; vector_type* vec_rhs = new vector_type(A->getNumRows()); vector_type* vec_x = new vector_type(A->getNumRows()); - vec_x->allocate("cpu");//for KLU - vec_x->allocate("cuda"); + vec_x->allocate(ReSolve::memory::HOST);//for KLU + vec_x->allocate(ReSolve::memory::DEVICE); vector_type* vec_r = new vector_type(A->getNumRows()); rhs1_file.close(); // Convert first matrix to CSR format matrix_handler->coo2csr(A_coo, A, "cpu"); - vec_rhs->update(rhs, "cpu", "cpu"); - vec_rhs->setDataUpdated("cpu"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); + vec_rhs->setDataUpdated(ReSolve::memory::HOST); // Solve the first system using KLU status = KLU->setup(A); @@ -106,7 +106,7 @@ int main(int argc, char *argv[]) status = GLU->setup(A, L, U, P, Q); error_sum += status; std::cout<<"GLU setup status: "<update(rhs, "cpu", "cuda"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); status = GLU->solve(vec_rhs, vec_x); error_sum += status; std::cout<<"GLU solve status: "<setData(x_data, "cpu"); - vec_r->update(rhs, "cpu", "cuda"); - vec_diff->update(x_data, "cpu", "cuda"); + vec_test->setData(x_data, ReSolve::memory::HOST); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + vec_diff->update(x_data, ReSolve::memory::HOST, ReSolve::memory::DEVICE); // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, "cuda")); matrix_handler->setValuesChanged(true, "cuda"); @@ -145,13 +145,13 @@ int main(int argc, char *argv[]) real_type normDiffMatrix1 = sqrt(vector_handler->dot(vec_diff, 
vec_diff, "cuda")); //compute the residual using exact solution - vec_x->update(vec_x->getData("cuda"), "cuda", "cpu"); + vec_x->update(vec_x->getData(ReSolve::memory::DEVICE), ReSolve::memory::DEVICE, ReSolve::memory::HOST); status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE,"csr", "cuda"); error_sum += status; real_type exactSol_normRmatrix1 = sqrt(vector_handler->dot(vec_r, vec_r, "cuda")); //evaluate the residual ON THE CPU using COMPUTED solution - vec_r->update(rhs, "cpu", "cpu"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cpu"); error_sum += status; @@ -188,7 +188,7 @@ int main(int argc, char *argv[]) rhs2_file.close(); matrix_handler->coo2csr(A_coo, A, "cuda"); - vec_rhs->update(rhs, "cpu", "cuda"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); status = GLU->refactorize(); error_sum += status; @@ -197,7 +197,7 @@ int main(int argc, char *argv[]) status = GLU->solve(vec_rhs, vec_x); error_sum += status; - vec_r->update(rhs, "cpu", "cuda"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); matrix_handler->setValuesChanged(true, "cuda"); status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cuda"); @@ -208,13 +208,13 @@ int main(int argc, char *argv[]) //for testing only - control real_type normB2 = sqrt(vector_handler->dot(vec_rhs, vec_rhs, "cuda")); //compute x-x_true - vec_diff->update(x_data, "cpu", "cuda"); + vec_diff->update(x_data, ReSolve::memory::HOST, ReSolve::memory::DEVICE); vector_handler->axpy(&MINUSONE, vec_x, vec_diff, "cuda"); //evaluate its norm real_type normDiffMatrix2 = sqrt(vector_handler->dot(vec_diff, vec_diff, "cuda")); //compute the residual using exact solution - vec_r->update(rhs, "cpu", "cuda"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE, "csr", "cuda"); 
error_sum += status; real_type exactSol_normRmatrix2 = sqrt(vector_handler->dot(vec_r, vec_r, "cuda")); diff --git a/tests/functionality/testKLU_Rf.cpp b/tests/functionality/testKLU_Rf.cpp index 124f07de..a136017e 100644 --- a/tests/functionality/testKLU_Rf.cpp +++ b/tests/functionality/testKLU_Rf.cpp @@ -80,8 +80,8 @@ int main(int argc, char *argv[]) // Convert first matrix to CSR format matrix_handler->coo2csr(A_coo, A, "cpu"); - vec_rhs->update(rhs, "cpu", "cpu"); - vec_rhs->setDataUpdated("cpu"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); + vec_rhs->setDataUpdated(ReSolve::memory::HOST); // Solve the first system using KLU status = KLU->setup(A); @@ -106,9 +106,9 @@ int main(int argc, char *argv[]) x_data[i] = 1.0; } - vec_test->setData(x_data, "cpu"); - vec_r->update(rhs, "cpu", "cuda"); - vec_diff->update(x_data, "cpu", "cuda"); + vec_test->setData(x_data, ReSolve::memory::HOST); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + vec_diff->update(x_data, ReSolve::memory::HOST, ReSolve::memory::DEVICE); // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, "cuda")); matrix_handler->setValuesChanged(true, "cuda"); @@ -129,13 +129,13 @@ int main(int argc, char *argv[]) real_type normDiffMatrix1 = sqrt(vector_handler->dot(vec_diff, vec_diff, "cuda")); //compute the residual using exact solution - vec_r->update(rhs, "cpu", "cuda"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE,"csr", "cuda"); error_sum += status; real_type exactSol_normRmatrix1 = sqrt(vector_handler->dot(vec_r, vec_r, "cuda")); //evaluate the residual ON THE CPU using COMPUTED solution - vec_r->update(rhs, "cpu", "cpu"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cpu"); error_sum += status; @@ -186,7 +186,7 @@ int main(int argc, char *argv[]) 
rhs2_file.close(); matrix_handler->coo2csr(A_coo, A, "cuda"); - vec_rhs->update(rhs, "cpu", "cuda"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); status = Rf->refactorize(); error_sum += status; @@ -194,7 +194,7 @@ int main(int argc, char *argv[]) status = Rf->solve(vec_rhs, vec_x); error_sum += status; - vec_r->update(rhs, "cpu", "cuda"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); matrix_handler->setValuesChanged(true, "cuda"); status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cuda"); @@ -205,13 +205,13 @@ int main(int argc, char *argv[]) //for testing only - control real_type normB2 = sqrt(vector_handler->dot(vec_rhs, vec_rhs, "cuda")); //compute x-x_true - vec_diff->update(x_data, "cpu", "cuda"); + vec_diff->update(x_data, ReSolve::memory::HOST, ReSolve::memory::DEVICE); vector_handler->axpy(&MINUSONE, vec_x, vec_diff, "cuda"); //evaluate its norm real_type normDiffMatrix2 = sqrt(vector_handler->dot(vec_diff, vec_diff, "cuda")); //compute the residual using exact solution - vec_r->update(rhs, "cpu", "cuda"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE, "csr", "cuda"); error_sum += status; real_type exactSol_normRmatrix2 = sqrt(vector_handler->dot(vec_r, vec_r, "cuda")); diff --git a/tests/functionality/testKLU_Rf_FGMRES.cpp b/tests/functionality/testKLU_Rf_FGMRES.cpp index 6601a3ee..6a81dac1 100644 --- a/tests/functionality/testKLU_Rf_FGMRES.cpp +++ b/tests/functionality/testKLU_Rf_FGMRES.cpp @@ -85,8 +85,8 @@ int main(int argc, char *argv[]) // Convert first matrix to CSR format matrix_handler->coo2csr(A_coo, A, "cpu"); - vec_rhs->update(rhs, "cpu", "cpu"); - vec_rhs->setDataUpdated("cpu"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); + vec_rhs->setDataUpdated(ReSolve::memory::HOST); // Solve the first system using KLU status = KLU->setup(A); @@ -112,11 +112,11 @@ int 
main(int argc, char *argv[]) x_data[i] = 1.0; } - vec_test->setData(x_data, "cpu"); - vec_r->update(rhs, "cpu", "cuda"); - vec_diff->update(x_data, "cpu", "cuda"); + vec_test->setData(x_data, ReSolve::memory::HOST); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + vec_diff->update(x_data, ReSolve::memory::HOST, ReSolve::memory::DEVICE); - // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, "cuda")); + // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, ReSolve::memory::DEVICE)); matrix_handler->setValuesChanged(true, "cuda"); //evaluate the residual ||b-Ax|| status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr","cuda"); @@ -136,13 +136,13 @@ int main(int argc, char *argv[]) real_type normDiffMatrix1 = sqrt(vector_handler->dot(vec_diff, vec_diff, "cuda")); //compute the residual using exact solution - vec_r->update(rhs, "cpu", "cuda"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE,"csr", "cuda"); error_sum += status; real_type exactSol_normRmatrix1 = sqrt(vector_handler->dot(vec_r, vec_r, "cuda")); //evaluate the residual ON THE CPU using COMPUTED solution - vec_r->update(rhs, "cpu", "cpu"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cpu"); error_sum += status; @@ -202,13 +202,13 @@ int main(int argc, char *argv[]) rhs2_file.close(); matrix_handler->coo2csr(A_coo, A, "cuda"); - vec_rhs->update(rhs, "cpu", "cuda"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); Rf->setNumericalProperties(1e-12, 1e-1); status = Rf->refactorize(); error_sum += status; - vec_x->update(rhs, "cpu", "cuda"); + vec_x->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); status = Rf->solve(vec_x); error_sum += status; @@ -216,11 +216,11 @@ int main(int argc, char *argv[]) status = 
FGMRES->setupPreconditioner("CuSolverRf", Rf); error_sum += status; - vec_rhs->update(rhs, "cpu", "cuda"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); status = FGMRES->solve(vec_rhs, vec_x); error_sum += status; - vec_r->update(rhs, "cpu", "cuda"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); matrix_handler->setValuesChanged(true, "cuda"); //evaluate final residual @@ -233,13 +233,13 @@ int main(int argc, char *argv[]) //for testing only - control real_type normB2 = sqrt(vector_handler->dot(vec_rhs, vec_rhs, "cuda")); //compute x-x_true - vec_diff->update(x_data, "cpu", "cuda"); + vec_diff->update(x_data, ReSolve::memory::HOST, ReSolve::memory::DEVICE); vector_handler->axpy(&MINUSONE, vec_x, vec_diff, "cuda"); //evaluate its norm real_type normDiffMatrix2 = sqrt(vector_handler->dot(vec_diff, vec_diff, "cuda")); //compute the residual using exact solution - vec_r->update(rhs, "cpu", "cuda"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE, "csr", "cuda"); error_sum += status; real_type exactSol_normRmatrix2 = sqrt(vector_handler->dot(vec_r, vec_r, "cuda")); diff --git a/tests/unit/matrix/MatrixHandlerTests.hpp b/tests/unit/matrix/MatrixHandlerTests.hpp index 0bcfe544..63d2f49b 100644 --- a/tests/unit/matrix/MatrixHandlerTests.hpp +++ b/tests/unit/matrix/MatrixHandlerTests.hpp @@ -42,18 +42,23 @@ class MatrixHandlerTests : TestBase TestOutcome matVec(index_type N) { TestStatus status; + ReSolve::memory::MemorySpace ms; + if (memspace_ == "cpu") + ms = memory::HOST; + else + ms = memory::DEVICE; ReSolve::MatrixHandler* handler = createMatrixHandler(); matrix::Csr* A = createCsrMatrix(N, memspace_); vector::Vector x(N); vector::Vector y(N); - x.allocate(memspace_); - if (x.getData(memspace_) == NULL) printf("oups we have an issue \n"); - y.allocate(memspace_); + x.allocate(ms); + if (x.getData(ms) == NULL) printf("oups we have 
an issue \n"); + y.allocate(ms); - x.setToConst(1.0, memspace_); - y.setToConst(1.0, memspace_); + x.setToConst(1.0, ms); + y.setToConst(1.0, ms); real_type alpha = 2.0/30.0; real_type beta = 2.0; @@ -98,14 +103,14 @@ class MatrixHandlerTests : TestBase { bool status = true; if (memspace != "cpu") { - x.copyData(memspace, "cpu"); + x.copyData(memory::DEVICE, memory::HOST); } for (index_type i = 0; i < x.getSize(); ++i) { - // std::cout << x.getData("cpu")[i] << "\n"; - if (!isEqual(x.getData("cpu")[i], answer)) { + // std::cout << x.getData(memory::HOST)[i] << "\n"; + if (!isEqual(x.getData(memory::HOST)[i], answer)) { status = false; - std::cout << "Solution vector element x[" << i << "] = " << x.getData("cpu")[i] + std::cout << "Solution vector element x[" << i << "] = " << x.getData(memory::HOST)[i] << ", expected: " << answer << "\n"; break; } @@ -135,11 +140,11 @@ class MatrixHandlerTests : TestBase // Allocate NxN CSR matrix with NNZ nonzeros matrix::Csr* A = new matrix::Csr(N, N, NNZ); - A->allocateMatrixData("cpu"); + A->allocateMatrixData(memory::HOST); - index_type* rowptr = A->getRowData("cpu"); - index_type* colidx = A->getColData("cpu"); - real_type* val = A->getValues("cpu"); + index_type* rowptr = A->getRowData(memory::HOST); + index_type* colidx = A->getColData(memory::HOST); + real_type* val = A->getValues( memory::HOST); // Populate CSR matrix using same row pattern as for NNZ calculation rowptr[0] = 0; @@ -157,10 +162,10 @@ class MatrixHandlerTests : TestBase val[j] = row_sample[static_cast(j - rowptr[i])]; } } - A->setUpdated("cpu"); + A->setUpdated(memory::HOST); if ((memspace == "cuda") || (memspace == "hip")) { - A->copyData(memspace); + A->copyData(memory::DEVICE); } return A; diff --git a/tests/unit/matrix/MatrixIoTests.hpp b/tests/unit/matrix/MatrixIoTests.hpp index ad14f0a7..1ce23ae2 100644 --- a/tests/unit/matrix/MatrixIoTests.hpp +++ b/tests/unit/matrix/MatrixIoTests.hpp @@ -78,7 +78,7 @@ class MatrixIoTests : TestBase // Create a 5x5 
COO matrix with 10 nonzeros ReSolve::matrix::Coo A(5, 5, 10); - A.allocateMatrixData("cpu"); + A.allocateMatrixData(memory::HOST); // Read string into istream and status it to `readMatrixFromFile` function. std::istringstream file2(symmetric_coo_matrix_file_); @@ -176,9 +176,9 @@ class MatrixIoTests : TestBase const std::vector& val_data) { for (size_t i = 0; i < val_data.size(); ++i) { - if ((answer.getRowData("cpu")[i] != row_data[i]) || - (answer.getColData("cpu")[i] != col_data[i]) || - (!isEqual(answer.getValues("cpu")[i], val_data[i]))) + if ((answer.getRowData(memory::HOST)[i] != row_data[i]) || + (answer.getColData(memory::HOST)[i] != col_data[i]) || + (!isEqual(answer.getValues(memory::HOST)[i], val_data[i]))) { std::cout << "Incorrect matrix value at storage element " << i << ".\n"; return false; diff --git a/tests/unit/vector/GramSchmidtTests.hpp b/tests/unit/vector/GramSchmidtTests.hpp index 9981ea48..4837b57b 100644 --- a/tests/unit/vector/GramSchmidtTests.hpp +++ b/tests/unit/vector/GramSchmidtTests.hpp @@ -66,15 +66,21 @@ namespace ReSolve { break; } + ReSolve::memory::MemorySpace ms; + if (memspace_ == "cpu") + ms = memory::HOST; + else + ms = memory::DEVICE; + ReSolve::VectorHandler* handler = createVectorHandler(); vector::Vector* V = new vector::Vector(N, 3); // we will be using a space of 3 vectors real_type* H = new real_type[6]; //in this case, Hessenberg matrix is 3 x 2 real_type* aux_data; // needed for setup - V->allocate(memspace_); - if (memspace_ != "cpu") { - V->allocate("cpu"); + V->allocate(ms); + if (ms != memory::HOST) { + V->allocate(memory::HOST); } @@ -82,7 +88,7 @@ namespace ReSolve { GS->setup(N, 3); //fill 2nd and 3rd vector with values - aux_data = V->getVectorData(1, "cpu"); + aux_data = V->getVectorData(1, memory::HOST); for (int i = 0; i < N; ++i) { if ( i % 2 == 0) { aux_data[i] = constants::ONE; @@ -90,7 +96,7 @@ namespace ReSolve { aux_data[i] = var1; } } - aux_data = V->getVectorData(2, "cpu"); + aux_data = 
V->getVectorData(2, memory::HOST); for (int i = 0; i < N; ++i) { if ( i % 3 > 0) { aux_data[i] = constants::ZERO; @@ -98,11 +104,11 @@ namespace ReSolve { aux_data[i] = var2; } } - V->setDataUpdated("cpu"); - V->copyData("cpu", memspace_); + V->setDataUpdated(memory::HOST); + V->copyData(memory::HOST, ms); //set the first vector to all 1s, normalize - V->setToConst(0, 1.0, memspace_); + V->setToConst(0, 1.0, ms); real_type nrm = handler->dot(V, V, memspace_); nrm = sqrt(nrm); nrm = 1.0 / nrm; @@ -144,6 +150,12 @@ namespace ReSolve { // x is a multivector containing K vectors bool verifyAnswer(vector::Vector* x, index_type K, ReSolve::VectorHandler* handler, std::string memspace) { + ReSolve::memory::MemorySpace ms; + if (memspace == "cpu") + ms = memory::HOST; + else + ms = memory::DEVICE; + vector::Vector* a = new vector::Vector(x->getSize()); vector::Vector* b = new vector::Vector(x->getSize()); @@ -152,8 +164,8 @@ namespace ReSolve { for (index_type i = 0; i < K; ++i) { for (index_type j = 0; j < K; ++j) { - a->update(x->getVectorData(i, memspace), memspace, "cpu"); - b->update(x->getVectorData(j, memspace), memspace, "cpu"); + a->update(x->getVectorData(i, ms), ms, memory::HOST); + b->update(x->getVectorData(j, ms), ms, memory::HOST); ip = handler->dot(a, b, "cpu"); if ( (i != j) && (abs(ip) > 1e-14)) { diff --git a/tests/unit/vector/VectorHandlerTests.hpp b/tests/unit/vector/VectorHandlerTests.hpp index 60020ec5..856bb84d 100644 --- a/tests/unit/vector/VectorHandlerTests.hpp +++ b/tests/unit/vector/VectorHandlerTests.hpp @@ -39,16 +39,22 @@ namespace ReSolve { { TestStatus status; + ReSolve::memory::MemorySpace ms; + if (memspace_ == "cpu") + ms = memory::HOST; + else + ms = memory::DEVICE; + ReSolve::VectorHandler* handler = createVectorHandler(); vector::Vector* x = new vector::Vector(N); vector::Vector* y = new vector::Vector(N); - x->allocate(memspace_); - y->allocate(memspace_); + x->allocate(ms); + y->allocate(ms); - x->setToConst(3.0, memspace_); - 
y->setToConst(1.0, memspace_); + x->setToConst(3.0, ms); + y->setToConst(1.0, ms); real_type alpha = 0.5; //the result is a vector with y[i] = 2.5; @@ -66,16 +72,22 @@ namespace ReSolve { { TestStatus status; + ReSolve::memory::MemorySpace ms; + if (memspace_ == "cpu") + ms = memory::HOST; + else + ms = memory::DEVICE; + ReSolve::VectorHandler* handler = createVectorHandler(); vector::Vector* x = new vector::Vector(N); vector::Vector* y = new vector::Vector(N); - x->allocate(memspace_); - y->allocate(memspace_); + x->allocate(ms); + y->allocate(ms); - x->setToConst(0.25, memspace_); - y->setToConst(4.0, memspace_); + x->setToConst(0.25, ms); + y->setToConst(4.0, ms); real_type ans; //the result is N ans = handler->dot(x, y, memspace_); @@ -98,13 +110,19 @@ namespace ReSolve { { TestStatus status; + ReSolve::memory::MemorySpace ms; + if (memspace_ == "cpu") + ms = memory::HOST; + else + ms = memory::DEVICE; + ReSolve::VectorHandler* handler = createVectorHandler(); vector::Vector* x = new vector::Vector(N); - x->allocate(memspace_); + x->allocate(ms); - x->setToConst(1.25, memspace_); + x->setToConst(1.25, ms); real_type alpha = 3.5; @@ -122,17 +140,23 @@ namespace ReSolve { { TestStatus status; + ReSolve::memory::MemorySpace ms; + if (memspace_ == "cpu") + ms = memory::HOST; + else + ms = memory::DEVICE; + ReSolve::VectorHandler* handler = createVectorHandler(); vector::Vector* x = new vector::Vector(N, K); vector::Vector* y = new vector::Vector(N); vector::Vector* alpha = new vector::Vector(K);; - x->allocate(memspace_); - y->allocate(memspace_); - alpha->allocate(memspace_); + x->allocate(ms); + y->allocate(ms); + alpha->allocate(ms); - y->setToConst(2.0, memspace_); - alpha->setToConst(-1.0, memspace_); + y->setToConst(2.0, ms); + alpha->setToConst(-1.0, ms); for (int ii = 0; ii < K; ++ii) { real_type c; if (ii % 2 == 0) { @@ -140,7 +164,7 @@ namespace ReSolve { } else { c = 0.5; } - x->setToConst(ii, c, memspace_); + x->setToConst(ii, c, ms); } index_type r = K 
% 2; @@ -161,17 +185,23 @@ namespace ReSolve { { TestStatus status; + ReSolve::memory::MemorySpace ms; + if (memspace_ == "cpu") + ms = memory::HOST; + else + ms = memory::DEVICE; + ReSolve::VectorHandler* handler = createVectorHandler(); vector::Vector* x = new vector::Vector(N, K); vector::Vector* y = new vector::Vector(N, 2); vector::Vector* res = new vector::Vector(K, 2); - x->allocate(memspace_); - y->allocate(memspace_); - res->allocate(memspace_); + x->allocate(ms); + y->allocate(ms); + res->allocate(ms); - x->setToConst(1.0, memspace_); - y->setToConst(-1.0, memspace_); + x->setToConst(1.0, ms); + y->setToConst(-1.0, ms); handler->massDot2Vec(N, x, K, y, res, memspace_); status *= verifyAnswer(res, (-1.0) * (real_type) N, memspace_); @@ -186,6 +216,13 @@ namespace ReSolve { TestOutcome gemv(index_type N, index_type K) { TestStatus status; + + ReSolve::memory::MemorySpace ms; + if (memspace_ == "cpu") + ms = memory::HOST; + else + ms = memory::DEVICE; + ReSolve::VectorHandler* handler = createVectorHandler(); vector::Vector* V = new vector::Vector(N, K); // for the test with NO TRANSPOSE @@ -195,17 +232,17 @@ namespace ReSolve { vector::Vector* yT = new vector::Vector(N); vector::Vector* xT = new vector::Vector(K); - V->allocate(memspace_); - yN->allocate(memspace_); - xN->allocate(memspace_); - yT->allocate(memspace_); - xT->allocate(memspace_); - - V->setToConst(1.0, memspace_); - yN->setToConst(-1.0, memspace_); - xN->setToConst(.5, memspace_); - yT->setToConst(-1.0, memspace_); - xT->setToConst(.5, memspace_); + V->allocate(ms); + yN->allocate(ms); + xN->allocate(ms); + yT->allocate(ms); + xT->allocate(ms); + + V->setToConst(1.0, ms); + yN->setToConst(-1.0, ms); + xN->setToConst(.5, ms); + yT->setToConst(-1.0, ms); + xT->setToConst(.5, ms); real_type alpha = -1.0; real_type beta = 1.0; @@ -248,15 +285,15 @@ namespace ReSolve { { bool status = true; if (memspace != "cpu") { - x->copyData(memspace, "cpu"); + x->copyData(memory::DEVICE, memory::HOST); } for 
(index_type i = 0; i < x->getSize(); ++i) { // std::cout << x->getData("cpu")[i] << "\n"; - if (!isEqual(x->getData("cpu")[i], answer)) { + if (!isEqual(x->getData(memory::HOST)[i], answer)) { std::cout << std::setprecision(16); status = false; - std::cout << "Solution vector element x[" << i << "] = " << x->getData("cpu")[i] + std::cout << "Solution vector element x[" << i << "] = " << x->getData(memory::HOST)[i] << ", expected: " << answer << "\n"; break; } From 9a5fd7adba72170f40a0e91c87171dc9c1ae3659 Mon Sep 17 00:00:00 2001 From: pelesh Date: Wed, 1 Nov 2023 13:51:40 -0400 Subject: [PATCH 05/12] Review CUDA and HIP configuration in CMake (#48) * Add first pass CMake cleanup - need to fix include_directories and add custom library macro. * Fix find HIP cmake function. * Update CMakePresets.json Co-authored-by: Nicholson Koukpaizan <72402802+nkoukpaizan@users.noreply.github.com> --------- Co-authored-by: rcrutherford Co-authored-by: Nicholson Koukpaizan <72402802+nkoukpaizan@users.noreply.github.com> --- CMakeLists.txt | 30 ++++++++++--------------- CMakePresets.json | 11 +++++----- cmake/ReSolveConfig.cmake.in | 12 ++++++++++ cmake/ReSolveFindHipLibraries.cmake | 16 +++++--------- resolve/CMakeLists.txt | 34 ++++++----------------------- 5 files changed, 40 insertions(+), 63 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9f802231..db4e8e74 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,42 +23,31 @@ endif() option(RESOLVE_TEST_WITH_BSUB "Use `jsrun` instead of `mpirun` commands when running tests" OFF) option(RESOLVE_USE_KLU "Use KLU, AMD and COLAMD libraries from SuiteSparse" ON) -option(RESOLVE_USE_GPU "Use GPU device for computations" OFF) option(RESOLVE_USE_CUDA "Use CUDA language and SDK" OFF) option(RESOLVE_USE_HIP "Use HIP language and ROCm library" OFF) -set(RESOLVE_CTEST_OUTPUT_DIR ${PROJECT_BINARY_DIR} CACHE PATH "Directory where CTest outputs are saved") + +option(RESOLVE_USE_GPU "Use GPU device for computations" OFF) 
+mark_as_advanced(FORCE RESOLVE_USE_GPU) if(RESOLVE_USE_CUDA) - set(RESOLVE_USE_GPU On CACHE BOOL "Using CUDA GPU!" FORCE) + set(RESOLVE_USE_GPU ON CACHE BOOL "Using CUDA GPU!" FORCE) endif() if(RESOLVE_USE_HIP) - set(RESOLVE_USE_GPU On CACHE BOOL "Using HIP GPU!" FORCE) + set(RESOLVE_USE_GPU ON CACHE BOOL "Using HIP GPU!" FORCE) endif() - +# MacOS specific things set(CMAKE_MACOSX_RPATH 1) -# set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib") -# https://gitlab.kitware.com/cmake/community/-/wikis/doc/cmake/RPATH-handling#always-full-rpath -# use, i.e. don't skip the full RPATH for the build tree -#set(CMAKE_SKIP_BUILD_RPATH FALSE) -# when building, don't use the install RPATH already -# (but later on when installing) +# Install with RPATH but do not build with it set(CMAKE_BUILD_WITH_INSTALL_RPATH FALSE) - set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) - set(CMAKE_INSTALL_RPATH ${CMAKE_INSTALL_PREFIX}/lib) -#list(APPEND CMAKE_INSTALL_RPATH ${CMAKE_INSTALL_PREFIX}/lib) # Add CMake sources from `cmake` dir list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) -# Including clang-format cmake files to do automatic checking of formating -# TODO: Set up clang-format -#include(./cmake/clang-format) - if (RESOLVE_USE_KLU) include(FindKLU) if(NOT KLU_LIBRARY) @@ -100,6 +89,7 @@ if(RESOLVE_USE_HIP) # This is just an agly hack to make HIP build work get_target_property(hip_includes hip::device INTERFACE_INCLUDE_DIRECTORIES) message(STATUS "HIP include directories: ${hip_includes}") + # TODO - use targets properly include_directories(${hip_includes}) else() message(STATUS "Not using HIP") @@ -112,6 +102,7 @@ configure_file( ${CMAKE_BINARY_DIR}/resolve/resolve_defs.hpp) # include build directory for Fortran name mangling header +# TODO - target based includes include_directories(${CMAKE_BINARY_DIR}) install( @@ -119,7 +110,7 @@ install( DESTINATION include/resolve ) - +# TODO - fix this include_directories(${CMAKE_SOURCE_DIR}) # Enable testing @@ -158,4 +149,5 @@ 
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/ReSolveConfig.cmake" add_subdirectory(examples) # Add tests +set(RESOLVE_CTEST_OUTPUT_DIR ${PROJECT_BINARY_DIR} CACHE PATH "Directory where CTest outputs are saved") add_subdirectory(tests) diff --git a/CMakePresets.json b/CMakePresets.json index e4784095..c00f9919 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -12,7 +12,10 @@ "description": "Base config to build with CUDA", "binaryDir": "${sourceDir}/build", "installDir": "${sourceDir}/install", - "generator": "Unix Makefiles" + "generator": "Unix Makefiles", + "cacheVariables": { + "RESOLVE_USE_CUDA": "ON" + } }, { "name": "cpu", @@ -20,11 +23,7 @@ "description": "Base config to build without GPUs", "binaryDir": "${sourceDir}/build", "installDir": "${sourceDir}/install", - "generator": "Unix Makefiles", - "cacheVariables": { - "RESOLVE_USE_CUDA": "OFF", - "RESOLVE_USE_GPU": "OFF" - } + "generator": "Unix Makefiles" }, { "name": "ascent", diff --git a/cmake/ReSolveConfig.cmake.in b/cmake/ReSolveConfig.cmake.in index 7a162d90..47f9fe35 100644 --- a/cmake/ReSolveConfig.cmake.in +++ b/cmake/ReSolveConfig.cmake.in @@ -12,6 +12,18 @@ if(@RESOLVE_USE_CUDA@) check_language(CUDA) set(CMAKE_CUDA_FLAGS "@CMAKE_CUDA_FLAGS@") find_package(CUDAToolkit REQUIRED) + add_library(ReSolve::CUDA ALIAS ReSolve::resolve_backend_cuda) +endif() +if(@RESOLVE_USE_HIP@) + enable_language(HIP) + check_language(HIP) + find_package(hip REQUIRED) + find_package(hipblas REQUIRED) + # This is just an agly hack to make HIP build work + get_target_property(hip_includes hip::device INTERFACE_INCLUDE_DIRECTORIES) + message(STATUS "HIP include directories: ${hip_includes}") + include_directories(${hip_includes}) + add_library(ReSolve::HIP ALIAS ReSolve::resolve_backend_hip) endif() # Compute installation prefix relative to this file. 
diff --git a/cmake/ReSolveFindHipLibraries.cmake b/cmake/ReSolveFindHipLibraries.cmake index e754da0d..4cb0c443 100644 --- a/cmake/ReSolveFindHipLibraries.cmake +++ b/cmake/ReSolveFindHipLibraries.cmake @@ -4,20 +4,14 @@ add_library(resolve_hip INTERFACE) find_package(hip REQUIRED) -find_package(hipblas REQUIRED) +find_package(rocblas REQUIRED) +find_package(rocsparse REQUIRED) target_link_libraries(resolve_hip INTERFACE - #hip::host + hip::host hip::device - rocblas - rocsparse - #roc::hipblas + roc::rocblas + roc::rocsparse ) -# get_target_property(hip_includes hip::device INTERFACE_INCLUDE_DIRECTORIES) -# message(STATUS "HIP include directories: ${hip_includes}") - -# get_target_property(resolve_hip_includes resolve_hip INTERFACE_INCLUDE_DIRECTORIES) -# message(STATUS "ReSolve HIP include directories: ${resolve_hip_includes}") - install(TARGETS resolve_hip EXPORT ReSolveTargets) diff --git a/resolve/CMakeLists.txt b/resolve/CMakeLists.txt index fa6c9cd5..68b557b8 100644 --- a/resolve/CMakeLists.txt +++ b/resolve/CMakeLists.txt @@ -37,21 +37,6 @@ set(ReSolve_HEADER_INSTALL MemoryUtils.hpp ) -# If GPU support is not enabled, add dummy device backend -if(NOT RESOLVE_USE_GPU) - add_subdirectory(cpu) -endif() - -# If CUDA support is enabled, create CUDA backend -# (this should really be CUDA _API_ backend, separate backend will be needed for CUDA SDK) -if(RESOLVE_USE_CUDA) - add_subdirectory(cuda) -endif() - -if(RESOLVE_USE_HIP) - add_subdirectory(hip) -endif() - # Now, build workspaces add_subdirectory(workspace) @@ -59,23 +44,13 @@ add_subdirectory(workspace) add_subdirectory(vector) add_subdirectory(matrix) - # Build shared library ReSolve add_library(resolve_tpl INTERFACE) if(RESOLVE_USE_KLU) - target_link_libraries(resolve_tpl INTERFACE KLU) + target_link_libraries(resolve_tpl INTERFACE KLU) endif(RESOLVE_USE_KLU) -if(RESOLVE_USE_CUDA) - target_link_libraries(resolve_tpl INTERFACE resolve_cuda) -endif(RESOLVE_USE_CUDA) - -if(RESOLVE_USE_HIP) - 
target_link_libraries(resolve_tpl INTERFACE resolve_hip) -endif(RESOLVE_USE_HIP) - - set(ReSolve_Targets_List resolve_matrix resolve_vector @@ -86,18 +61,23 @@ set(ReSolve_Targets_List # If CUDA support is enabled add CUDA SDK specific code and dependencies if(RESOLVE_USE_CUDA) + add_subdirectory(cuda) + target_link_libraries(resolve_tpl INTERFACE resolve_cuda) set(ReSolve_SRC ${ReSolve_SRC} ${ReSolve_CUDASDK_SRC}) set(ReSolve_Targets_List ${ReSolve_Targets_List} resolve_backend_cuda) endif() # If HIP support is enabled add HIP SDK specific code and dependencies if(RESOLVE_USE_HIP) + add_subdirectory(hip) + target_link_libraries(resolve_tpl INTERFACE resolve_hip) set(ReSolve_Targets_List ${ReSolve_Targets_List} resolve_backend_hip) endif() # If no GPU support is enabled, link to dummy device backend if(NOT RESOLVE_USE_GPU) - set(ReSolve_Targets_List ${ReSolve_Targets_List} resolve_backend_cpu) + add_subdirectory(cpu) + set(ReSolve_Targets_List ${ReSolve_Targets_List} resolve_backend_cpu) endif() # Set installable targets From e01ba3c90ccc3d7d4ad5bf22ed36ab7eb4c1b93b Mon Sep 17 00:00:00 2001 From: pelesh Date: Wed, 1 Nov 2023 20:59:37 -0400 Subject: [PATCH 06/12] Fully functional ROCm-based LU solver (#52) * rocsolver class * rocsolver-rf functionality test runs * rocsolver EXAMPLE * Cleanup before exiting examples and avoid double delete. (#49) * Enable HIP support in Gramm-Schmidt and FGMRES (#50) * Enable HIP in GS and FGMRES * Do not build FGMRES without GPU. 
--------- Co-authored-by: kswirydo --------- Co-authored-by: kswirydo --- examples/CMakeLists.txt | 13 + examples/r_KLU_rf.cpp | 4 - examples/r_KLU_rf_FGMRES.cpp | 10 +- .../r_KLU_rf_FGMRES_reuse_factorization.cpp | 10 +- examples/r_KLU_rocSolverRf_FGMRES.cpp | 199 ++++++++++++++ examples/r_KLU_rocsolverrf.cpp | 177 ++++++++++++ resolve/CMakeLists.txt | 19 +- resolve/GramSchmidt.cpp | 39 ++- resolve/LinSolverDirectRocSolverRf.cpp | 205 ++++++++++++++ resolve/LinSolverDirectRocSolverRf.hpp | 59 ++++ resolve/LinSolverIterativeFGMRES.cpp | 39 +-- resolve/LinSolverIterativeFGMRES.hpp | 10 +- resolve/matrix/MatrixHandler.cpp | 1 - resolve/matrix/MatrixHandlerHip.cpp | 3 +- resolve/matrix/MatrixHandlerHip.hpp | 10 +- resolve/vector/Vector.cpp | 4 +- tests/functionality/CMakeLists.txt | 18 ++ tests/functionality/testKLU_Rf_FGMRES.cpp | 2 +- tests/functionality/testKLU_RocSolver.cpp | 251 ++++++++++++++++++ 19 files changed, 1015 insertions(+), 58 deletions(-) create mode 100644 examples/r_KLU_rocSolverRf_FGMRES.cpp create mode 100644 examples/r_KLU_rocsolverrf.cpp create mode 100644 resolve/LinSolverDirectRocSolverRf.cpp create mode 100644 resolve/LinSolverDirectRocSolverRf.hpp create mode 100644 tests/functionality/testKLU_RocSolver.cpp diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 8e8a2498..9113ce17 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -39,7 +39,16 @@ if(RESOLVE_USE_CUDA) endif(RESOLVE_USE_CUDA) +# Create HIP examples +if(RESOLVE_USE_HIP) + # Build example with KLU factorization and rocsolver Rf refactorization + add_executable(klu_rocsolverrf.exe r_KLU_rocsolverrf.cpp) + target_link_libraries(klu_rocsolverrf.exe PRIVATE ReSolve) + # Build example with KLU factorization, rocsolver Rf refactorization, and FGMRES iterative refinement + add_executable(klu_rocsolverrf_fgmres.exe r_KLU_rocSolverRf_FGMRES.cpp) + target_link_libraries(klu_rocsolverrf_fgmres.exe PRIVATE ReSolve) +endif(RESOLVE_USE_HIP) # Install all 
examples in bin directory set(installable_executables klu_klu.exe klu_klu_standalone.exe) @@ -48,6 +57,10 @@ if(RESOLVE_USE_CUDA) set(installable_executables ${installable_executables} klu_glu.exe klu_rf.exe klu_rf_fgmres.exe klu_glu_values_update.exe) endif(RESOLVE_USE_CUDA) +if(RESOLVE_USE_HIP) + set(installable_executables ${installable_executables} klu_rocsolverrf.exe) +endif(RESOLVE_USE_HIP) + install(TARGETS ${installable_executables} RUNTIME DESTINATION bin) diff --git a/examples/r_KLU_rf.cpp b/examples/r_KLU_rf.cpp index d9310773..b61029c5 100644 --- a/examples/r_KLU_rf.cpp +++ b/examples/r_KLU_rf.cpp @@ -139,12 +139,8 @@ int main(int argc, char *argv[] ) index_type* Q = KLU->getQOrdering(); Rf->setup(A, L, U, P, Q); - delete [] P; - delete [] Q; delete L; - delete L_csc; delete U; - delete U_csc; } } else { //status = KLU->refactorize(); diff --git a/examples/r_KLU_rf_FGMRES.cpp b/examples/r_KLU_rf_FGMRES.cpp index 6df5419a..584fcd10 100644 --- a/examples/r_KLU_rf_FGMRES.cpp +++ b/examples/r_KLU_rf_FGMRES.cpp @@ -168,7 +168,7 @@ int main(int argc, char *argv[]) //matrix_handler->setValuesChanged(true, "cuda"); FGMRES->resetMatrix(A); - FGMRES->setupPreconditioner("CuSolverRf", Rf); + FGMRES->setupPreconditioner("LU", Rf); matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cuda"); @@ -189,8 +189,16 @@ int main(int argc, char *argv[]) } // for (int i = 0; i < numSystems; ++i) + delete A; + delete KLU; + delete Rf; delete [] x; delete [] rhs; + delete vec_r; + delete vec_x; + delete workspace_CUDA; + delete matrix_handler; + delete vector_handler; return 0; } diff --git a/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp b/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp index 5ead8186..c4ab285b 100644 --- a/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp +++ b/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp @@ -173,7 +173,7 @@ int main(int argc, char *argv[]) << status << std::endl; vec_rhs->update(rhs, ReSolve::memory::HOST, 
ReSolve::memory::DEVICE); status = Rf->solve(vec_rhs, vec_x); - FGMRES->setupPreconditioner("CuSolverRf", Rf); + FGMRES->setupPreconditioner("LU", Rf); } //if (i%2!=0) vec_x->setToZero(ReSolve::memory::DEVICE); real_type norm_x = vector_handler->dot(vec_x, vec_x, "cuda"); @@ -217,8 +217,16 @@ int main(int argc, char *argv[]) } + delete A; + delete KLU; + delete Rf; delete [] x; delete [] rhs; + delete vec_r; + delete vec_x; + delete workspace_CUDA; + delete matrix_handler; + delete vector_handler; return 0; } diff --git a/examples/r_KLU_rocSolverRf_FGMRES.cpp b/examples/r_KLU_rocSolverRf_FGMRES.cpp new file mode 100644 index 00000000..d2e5f7a6 --- /dev/null +++ b/examples/r_KLU_rocSolverRf_FGMRES.cpp @@ -0,0 +1,199 @@ +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace ReSolve::constants; + +int main(int argc, char *argv[]) +{ + // Use the same data types as those you specified in ReSolve build. + using index_type = ReSolve::index_type; + using real_type = ReSolve::real_type; + using vector_type = ReSolve::vector::Vector; + + (void) argc; // TODO: Check if the number of input parameters is correct. 
+ std::string matrixFileName = argv[1]; + std::string rhsFileName = argv[2]; + + index_type numSystems = atoi(argv[3]); + std::cout<<"Family mtx file name: "<< matrixFileName << ", total number of matrices: "<initializeHandles(); + ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace_HIP); + ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace_HIP); + real_type* rhs = nullptr; + real_type* x = nullptr; + + vector_type* vec_rhs; + vector_type* vec_x; + vector_type* vec_r; + + ReSolve::GramSchmidt* GS = new ReSolve::GramSchmidt(vector_handler, ReSolve::GramSchmidt::cgs2); + ReSolve::LinSolverDirectKLU* KLU = new ReSolve::LinSolverDirectKLU; + ReSolve::LinSolverDirectRocSolverRf* Rf = new ReSolve::LinSolverDirectRocSolverRf(workspace_HIP); + ReSolve::LinSolverIterativeFGMRES* FGMRES = new ReSolve::LinSolverIterativeFGMRES(matrix_handler, vector_handler, GS, "hip"); + + for (int i = 0; i < numSystems; ++i) + { + index_type j = 4 + i * 2; + fileId = argv[j]; + rhsId = argv[j + 1]; + + matrixFileNameFull = ""; + rhsFileNameFull = ""; + + // Read matrix first + matrixFileNameFull = matrixFileName + fileId + ".mtx"; + rhsFileNameFull = rhsFileName + rhsId + ".mtx"; + std::cout << std::endl << std::endl << std::endl; + std::cout << "========================================================================================================================"<getNumRows(), + A_coo->getNumColumns(), + A_coo->getNnz(), + A_coo->symmetric(), + A_coo->expanded()); + + rhs = ReSolve::io::readRhsFromFile(rhs_file); + x = new real_type[A->getNumRows()]; + vec_rhs = new vector_type(A->getNumRows()); + vec_x = new vector_type(A->getNumRows()); + vec_x->allocate(ReSolve::memory::HOST);//for KLU + vec_x->allocate(ReSolve::memory::DEVICE); + vec_r = new vector_type(A->getNumRows()); + } + else { + ReSolve::io::readAndUpdateMatrix(mat_file, A_coo); + ReSolve::io::readAndUpdateRhs(rhs_file, &rhs); + } + std::cout<<"Finished reading the matrix 
and rhs, size: "<getNumRows()<<" x "<getNumColumns()<< ", nnz: "<< A->getNnz()<< ", symmetric? "<symmetric()<< ", Expanded? "<expanded()<coo2csr(A_coo, A, "cpu"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); + vec_rhs->setDataUpdated(ReSolve::memory::HOST); + } else { + matrix_handler->coo2csr(A_coo,A, "hip"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + } + std::cout<<"COO to CSR completed. Expanded NNZ: "<< A->getNnzExpanded()<setupParameters(1, 0.1, false); + } + int status; + real_type norm_b; + if (i < 2){ + KLU->setup(A); + matrix_handler->setValuesChanged(true, "hip"); + status = KLU->analyze(); + std::cout<<"KLU analysis status: "<factorize(); + std::cout<<"KLU factorization status: "<solve(vec_rhs, vec_x); + std::cout<<"KLU solve status: "<update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + norm_b = vector_handler->dot(vec_r, vec_r, "hip"); + norm_b = sqrt(norm_b); + matrix_handler->setValuesChanged(true, "hip"); + matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "hip"); + printf("\t 2-Norm of the residual : %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "hip"))/norm_b); + if (i == 1) { + ReSolve::matrix::Csc* L /* _csc */ = (ReSolve::matrix::Csc*) KLU->getLFactor(); + ReSolve::matrix::Csc* U /* _csc */ = (ReSolve::matrix::Csc*) KLU->getUFactor(); + // ReSolve::matrix::Csr* L = new ReSolve::matrix::Csr(L_csc->getNumRows(), L_csc->getNumColumns(), L_csc->getNnz()); + // ReSolve::matrix::Csr* U = new ReSolve::matrix::Csr(U_csc->getNumRows(), U_csc->getNumColumns(), U_csc->getNnz()); + // matrix_handler->csc2csr(L_csc,L, "hip"); + // matrix_handler->csc2csr(U_csc,U, "hip"); + if (L == nullptr) {printf("ERROR");} + index_type* P = KLU->getPOrdering(); + index_type* Q = KLU->getQOrdering(); + Rf->setup(A, L, U, P, Q, vec_rhs); + Rf->refactorize(); + std::cout<<"about to set FGMRES" <setup(A->getNumRows(), FGMRES->getRestart()); + FGMRES->setup(A); + } + } else { + //status = 
KLU->refactorize(); + std::cout<<"Using ROCSOLVER RF"<refactorize(); + std::cout<<"ROCSOLVER RF refactorization status: "<solve(vec_rhs, vec_x); + std::cout<<"ROCSOLVER RF solve status: "<update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + norm_b = vector_handler->dot(vec_r, vec_r, "hip"); + norm_b = sqrt(norm_b); + + //matrix_handler->setValuesChanged(true, "hip"); + FGMRES->resetMatrix(A); + FGMRES->setupPreconditioner("LU", Rf); + + matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "hip"); + real_type rnrm = sqrt(vector_handler->dot(vec_r, vec_r, "hip")); + std::cout << "\t 2-Norm of the residual (before IR): " + << std::scientific << std::setprecision(16) + << rnrm/norm_b << "\n"; + + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + if(!std::isnan(rnrm) && !std::isinf(rnrm)) { + FGMRES->solve(vec_rhs, vec_x); + + std::cout << "FGMRES: init nrm: " + << std::scientific << std::setprecision(16) + << FGMRES->getInitResidualNorm()/norm_b + << " final nrm: " + << FGMRES->getFinalResidualNorm()/norm_b + << " iter: " << FGMRES->getNumIter() << "\n"; + } + } + + } // for (int i = 0; i < numSystems; ++i) + + delete [] x; + delete [] rhs; + + return 0; +} diff --git a/examples/r_KLU_rocsolverrf.cpp b/examples/r_KLU_rocsolverrf.cpp new file mode 100644 index 00000000..b3ebbecf --- /dev/null +++ b/examples/r_KLU_rocsolverrf.cpp @@ -0,0 +1,177 @@ +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace ReSolve::constants; + +int main(int argc, char *argv[] ) +{ + // Use the same data types as those you specified in ReSolve build. + using index_type = ReSolve::index_type; + using real_type = ReSolve::real_type; + using vector_type = ReSolve::vector::Vector; + + (void) argc; // TODO: Check if the number of input parameters is correct. 
+ std::string matrixFileName = argv[1]; + std::string rhsFileName = argv[2]; + + index_type numSystems = atoi(argv[3]); + std::cout<<"Family mtx file name: "<< matrixFileName << ", total number of matrices: "<initializeHandles(); + ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace_HIP); + ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace_HIP); + real_type* rhs = nullptr; + real_type* x = nullptr; + + vector_type* vec_rhs; + vector_type* vec_x; + vector_type* vec_r; + + ReSolve::LinSolverDirectKLU* KLU = new ReSolve::LinSolverDirectKLU; + ReSolve::LinSolverDirectRocSolverRf* Rf = new ReSolve::LinSolverDirectRocSolverRf(workspace_HIP); + + for (int i = 0; i < numSystems; ++i) + { + index_type j = 4 + i * 2; + fileId = argv[j]; + rhsId = argv[j + 1]; + + matrixFileNameFull = ""; + rhsFileNameFull = ""; + + // Read matrix first + matrixFileNameFull = matrixFileName + fileId + ".mtx"; + rhsFileNameFull = rhsFileName + rhsId + ".mtx"; + std::cout << std::endl << std::endl << std::endl; + std::cout << "========================================================================================================================"<getNumRows(), + A_coo->getNumColumns(), + A_coo->getNnz(), + A_coo->symmetric(), + A_coo->expanded()); + + rhs = ReSolve::io::readRhsFromFile(rhs_file); + x = new real_type[A->getNumRows()]; + vec_rhs = new vector_type(A->getNumRows()); + vec_x = new vector_type(A->getNumRows()); + vec_r = new vector_type(A->getNumRows()); + } + else { + ReSolve::io::readAndUpdateMatrix(mat_file, A_coo); + ReSolve::io::readAndUpdateRhs(rhs_file, &rhs); + } + std::cout<<"Finished reading the matrix and rhs, size: "<getNumRows()<<" x "<getNumColumns()<< ", nnz: "<< A->getNnz()<< ", symmetric? "<symmetric()<< ", Expanded? 
"<expanded()<coo2csr(A_coo, A, "cpu"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); + vec_rhs->setDataUpdated(ReSolve::memory::HOST); + } else { + matrix_handler->coo2csr(A_coo, A, "hip"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + } + std::cout<<"COO to CSR completed. Expanded NNZ: "<< A->getNnzExpanded()<setupParameters(1, 0.1, false); + } + int status; + if (i < 2){ + KLU->setup(A); + status = KLU->analyze(); + std::cout<<"KLU analysis status: "<factorize(); + std::cout<<"KLU factorization status: "<solve(vec_rhs, vec_x); + std::cout<<"KLU solve status: "<getLFactor(); + ReSolve::matrix::Csc* U = (ReSolve::matrix::Csc*) KLU->getUFactor(); + index_type* P = KLU->getPOrdering(); + index_type* Q = KLU->getQOrdering(); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + Rf->setup(A, L, U, P, Q, vec_rhs); + Rf->refactorize(); + //dont do it here + // delete [] P; + // delete [] Q; + } + } else { + //status = KLU->refactorize(); + std::cout<<"Using rocsolver rf"<refactorize(); + std::cout<<"rocsolver rf refactorization status: "<solve(vec_rhs, vec_x); + std::cout<<"rocsolver rf solve status: "<solve(vec_rhs, vec_x); + //std::cout<<"KLU solve status: "<update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + + matrix_handler->setValuesChanged(true, "hip"); + + matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "hip"); + + std::cout << "\t 2-Norm of the residual: " + << std::scientific << std::setprecision(16) + << sqrt(vector_handler->dot(vec_r, vec_r, "hip")) << "\n"; + + } // for (int i = 0; i < numSystems; ++i) + + //now DELETE + delete A; + delete KLU; + delete Rf; + delete [] x; + delete [] rhs; + delete vec_r; + delete vec_x; + delete workspace_HIP; + delete matrix_handler; + delete vector_handler; + return 0; +} diff --git a/resolve/CMakeLists.txt b/resolve/CMakeLists.txt index 68b557b8..47ce70de 100644 --- a/resolve/CMakeLists.txt +++ b/resolve/CMakeLists.txt @@ 
-14,14 +14,21 @@ set(ReSolve_SRC LinSolverDirectKLU.cpp ) +# Temporary until there is CPU-only option for FGMRES +set(ReSolve_GPU_SRC + GramSchmidt.cpp + LinSolverIterativeFGMRES.cpp +) + # C++ code that links to CUDA SDK libraries set(ReSolve_CUDASDK_SRC - LinSolverIterativeFGMRES.cpp - GramSchmidt.cpp LinSolverDirectCuSolverGLU.cpp LinSolverDirectCuSolverRf.cpp ) - +# HIP files +set(ReSolve_ROCM_SRC + LinSolverDirectRocSolverRf.cpp +) # Header files to be installed set(ReSolve_HEADER_INSTALL Common.hpp @@ -59,6 +66,11 @@ set(ReSolve_Targets_List resolve_workspace ) +# Temporary until there is CPU-only option for FGMRES +if(RESOLVE_USE_GPU) + set(ReSolve_SRC ${ReSolve_SRC} ${ReSolve_GPU_SRC}) +endif() + # If CUDA support is enabled add CUDA SDK specific code and dependencies if(RESOLVE_USE_CUDA) add_subdirectory(cuda) @@ -71,6 +83,7 @@ endif() if(RESOLVE_USE_HIP) add_subdirectory(hip) target_link_libraries(resolve_tpl INTERFACE resolve_hip) + set(ReSolve_SRC ${ReSolve_SRC} ${ReSolve_ROCM_SRC}) set(ReSolve_Targets_List ${ReSolve_Targets_List} resolve_backend_hip) endif() diff --git a/resolve/GramSchmidt.cpp b/resolve/GramSchmidt.cpp index fb86fc8d..8f6f0850 100644 --- a/resolve/GramSchmidt.cpp +++ b/resolve/GramSchmidt.cpp @@ -127,7 +127,7 @@ namespace ReSolve { using namespace constants; - if (memspace == "cuda") { // or hip + if ((memspace == "cuda") || (memspace == "hip")) { // or hip double t; double s; @@ -139,19 +139,19 @@ namespace ReSolve for(int j = 0; j <= i; ++j) { t = 0.0; vec_v_->setData( V->getVectorData(j, memory::DEVICE), memory::DEVICE); - t = vector_handler_->dot(vec_v_, vec_w_, "cuda"); + t = vector_handler_->dot(vec_v_, vec_w_, memspace); H[ idxmap(i, j, num_vecs_ + 1) ] = t; t *= -1.0; - vector_handler_->axpy(&t, vec_v_, vec_w_, "cuda"); + vector_handler_->axpy(&t, vec_v_, vec_w_, memspace); } t = 0.0; - t = vector_handler_->dot(vec_w_, vec_w_, "cuda"); + t = vector_handler_->dot(vec_w_, vec_w_, memspace); //set the last entry in Hessenberg 
matrix t = sqrt(t); H[ idxmap(i, i + 1, num_vecs_ + 1) ] = t; if(fabs(t) > EPSILON) { t = 1.0/t; - vector_handler_->scal(&t, vec_w_, "cuda"); + vector_handler_->scal(&t, vec_w_, memspace); } else { assert(0 && "Gram-Schmidt failed, vector with ZERO norm\n"); return -1; @@ -160,10 +160,9 @@ namespace ReSolve case cgs2: vec_v_->setData(V->getVectorData(i + 1, memory::DEVICE), memory::DEVICE); - vector_handler_->gemv("T", n, i + 1, &ONE, &ZERO, V, vec_v_, vec_Hcolumn_, "cuda"); - + vector_handler_->gemv("T", n, i + 1, &ONE, &ZERO, V, vec_v_, vec_Hcolumn_, memspace); // V(:,i+1) = V(:, i+1) - V(:,1:i)*Hcol - vector_handler_->gemv("N", n, i + 1, &ONE, &MINUSONE, V, vec_Hcolumn_, vec_v_, "cuda" ); + vector_handler_->gemv("N", n, i + 1, &ONE, &MINUSONE, V, vec_Hcolumn_, vec_v_, memspace ); // copy H_col to aux, we will need it later vec_Hcolumn_->setDataUpdated(memory::DEVICE); @@ -171,10 +170,10 @@ namespace ReSolve vec_Hcolumn_->deepCopyVectorData(h_aux_, 0, memory::HOST); //Hcol = V(:,1:i)^T*V(:,i+1); - vector_handler_->gemv("T", n, i + 1, &ONE, &ZERO, V, vec_v_, vec_Hcolumn_, "cuda"); + vector_handler_->gemv("T", n, i + 1, &ONE, &ZERO, V, vec_v_, vec_Hcolumn_, memspace); // V(:,i+1) = V(:, i+1) - V(:,1:i)*Hcol - vector_handler_->gemv("N", n, i + 1, &ONE, &MINUSONE, V, vec_Hcolumn_, vec_v_, "cuda" ); + vector_handler_->gemv("N", n, i + 1, &ONE, &MINUSONE, V, vec_Hcolumn_, vec_v_, memspace ); // copy H_col to H vec_Hcolumn_->setDataUpdated(memory::DEVICE); @@ -186,13 +185,13 @@ namespace ReSolve H[ idxmap(i, j, num_vecs_ + 1)] += h_aux_[j]; } - t = vector_handler_->dot(vec_v_, vec_v_, "cuda"); + t = vector_handler_->dot(vec_v_, vec_v_, memspace); //set the last entry in Hessenberg matrix t = sqrt(t); H[ idxmap(i, i + 1, num_vecs_ + 1) ] = t; if(fabs(t) > EPSILON) { t = 1.0/t; - vector_handler_->scal(&t, vec_v_, "cuda"); + vector_handler_->scal(&t, vec_v_, memspace); } else { assert(0 && "Gram-Schmidt failed, vector with ZERO norm\n"); return -1; @@ -205,7 +204,7 @@ 
namespace ReSolve vec_w_->setData(V->getVectorData(i + 1, memory::DEVICE), memory::DEVICE); vec_rv_->setCurrentSize(i + 1); - vector_handler_->massDot2Vec(n, V, i, vec_v_, vec_rv_, "cuda"); + vector_handler_->massDot2Vec(n, V, i, vec_v_, vec_rv_, memspace); vec_rv_->setDataUpdated(memory::DEVICE); vec_rv_->copyData(memory::DEVICE, memory::HOST); @@ -226,16 +225,16 @@ namespace ReSolve } // for j vec_Hcolumn_->setCurrentSize(i + 1); vec_Hcolumn_->update(&H[ idxmap(i, 0, num_vecs_ + 1)], memory::HOST, memory::DEVICE); - vector_handler_->massAxpy(n, vec_Hcolumn_, i, V, vec_w_, "cuda"); + vector_handler_->massAxpy(n, vec_Hcolumn_, i, V, vec_w_, memspace); // normalize (second synch) - t = vector_handler_->dot(vec_w_, vec_w_, "cuda"); + t = vector_handler_->dot(vec_w_, vec_w_, memspace); //set the last entry in Hessenberg matrix t = sqrt(t); H[ idxmap(i, i + 1, num_vecs_ + 1)] = t; if(fabs(t) > EPSILON) { t = 1.0 / t; - vector_handler_->scal(&t, vec_w_, "cuda"); + vector_handler_->scal(&t, vec_w_, memspace); } else { assert(0 && "Iterative refinement failed, Krylov vector with ZERO norm\n"); return -1; @@ -247,7 +246,7 @@ namespace ReSolve vec_w_->setData(V->getVectorData(i + 1, memory::DEVICE), memory::DEVICE); vec_rv_->setCurrentSize(i + 1); - vector_handler_->massDot2Vec(n, V, i, vec_v_, vec_rv_, "cuda"); + vector_handler_->massDot2Vec(n, V, i, vec_v_, vec_rv_, memspace); vec_rv_->setDataUpdated(memory::DEVICE); vec_rv_->copyData(memory::DEVICE, memory::HOST); @@ -297,15 +296,15 @@ namespace ReSolve vec_Hcolumn_->setCurrentSize(i + 1); vec_Hcolumn_->update(&H[ idxmap(i, 0, num_vecs_ + 1)], memory::HOST, memory::DEVICE); - vector_handler_->massAxpy(n, vec_Hcolumn_, i, V, vec_w_, "cuda"); + vector_handler_->massAxpy(n, vec_Hcolumn_, i, V, vec_w_, memspace); // normalize (second synch) - t = vector_handler_->dot(vec_w_, vec_w_, "cuda"); + t = vector_handler_->dot(vec_w_, vec_w_, memspace); //set the last entry in Hessenberg matrix t = sqrt(t); H[ idxmap(i, i + 1, 
num_vecs_ + 1) ] = t; if(fabs(t) > EPSILON) { t = 1.0 / t; - vector_handler_->scal(&t, vec_w_, "cuda"); + vector_handler_->scal(&t, vec_w_, memspace); } else { assert(0 && "Iterative refinement failed, Krylov vector with ZERO norm\n"); return -1; diff --git a/resolve/LinSolverDirectRocSolverRf.cpp b/resolve/LinSolverDirectRocSolverRf.cpp new file mode 100644 index 00000000..5869756d --- /dev/null +++ b/resolve/LinSolverDirectRocSolverRf.cpp @@ -0,0 +1,205 @@ +#include +#include +#include "LinSolverDirectRocSolverRf.hpp" + +namespace ReSolve +{ + LinSolverDirectRocSolverRf::LinSolverDirectRocSolverRf(LinAlgWorkspaceHIP* workspace) + { + workspace_ = workspace; + infoM_ = nullptr; + solve_mode_ = 0; //solve mode - slow mode is default + } + + LinSolverDirectRocSolverRf::~LinSolverDirectRocSolverRf() + { + mem_.deleteOnDevice(d_P_); + mem_.deleteOnDevice(d_Q_); + } + + int LinSolverDirectRocSolverRf::setup(matrix::Sparse* A, matrix::Sparse* L, matrix::Sparse* U, index_type* P, index_type* Q, vector_type* rhs) + { + //remember - P and Q are generally CPU variables + int error_sum = 0; + this->A_ = (matrix::Csr*) A; + index_type n = A_->getNumRows(); + //set matrix info + rocsolver_create_rfinfo(&infoM_, workspace_->getRocblasHandle()); + //create combined factor + addFactors(L,U); + M_->setUpdated(ReSolve::memory::HOST); + M_->copyData(ReSolve::memory::DEVICE); + mem_.allocateArrayOnDevice(&d_P_, n); + mem_.allocateArrayOnDevice(&d_Q_, n); + + mem_.copyArrayHostToDevice(d_P_, P, n); + mem_.copyArrayHostToDevice(d_Q_, Q, n); + + mem_.deviceSynchronize(); + status_rocblas_ = rocsolver_dcsrrf_analysis(workspace_->getRocblasHandle(), + n, + 1, + A_->getNnzExpanded(), + A_->getRowData(ReSolve::memory::DEVICE), //kRowPtr_, + A_->getColData(ReSolve::memory::DEVICE), //jCol_, + A_->getValues(ReSolve::memory::DEVICE), //vals_, + M_->getNnzExpanded(), + M_->getRowData(ReSolve::memory::DEVICE), + M_->getColData(ReSolve::memory::DEVICE), + M_->getValues(ReSolve::memory::DEVICE), 
//vals_, + d_P_, + d_Q_, + rhs->getData(ReSolve::memory::DEVICE), + n, + infoM_); + + mem_.deviceSynchronize(); + error_sum += status_rocblas_; + + + return error_sum; + } + + int LinSolverDirectRocSolverRf::refactorize() + { + int error_sum = 0; + mem_.deviceSynchronize(); + status_rocblas_ = rocsolver_dcsrrf_refactlu(workspace_->getRocblasHandle(), + A_->getNumRows(), + A_->getNnzExpanded(), + A_->getRowData(ReSolve::memory::DEVICE), //kRowPtr_, + A_->getColData(ReSolve::memory::DEVICE), //jCol_, + A_->getValues(ReSolve::memory::DEVICE), //vals_, + M_->getNnzExpanded(), + M_->getRowData(ReSolve::memory::DEVICE), + M_->getColData(ReSolve::memory::DEVICE), + M_->getValues(ReSolve::memory::DEVICE), //OUTPUT, + d_P_, + d_Q_, + infoM_); + + mem_.deviceSynchronize(); + error_sum += status_rocblas_; + + return error_sum; + } + + // solution is returned in RHS + int LinSolverDirectRocSolverRf::solve(vector_type* rhs) + { + if (solve_mode_ == 0) { + mem_.deviceSynchronize(); + status_rocblas_ = rocsolver_dcsrrf_solve(workspace_->getRocblasHandle(), + A_->getNumRows(), + 1, + M_->getNnz(), + M_->getRowData(ReSolve::memory::DEVICE), + M_->getColData(ReSolve::memory::DEVICE), + M_->getValues(ReSolve::memory::DEVICE), + d_P_, + d_Q_, + rhs->getData(ReSolve::memory::DEVICE), + A_->getNumRows(), + infoM_); + mem_.deviceSynchronize(); + } else { + // not implemented yet + } + return status_rocblas_; + } + + int LinSolverDirectRocSolverRf::solve(vector_type* rhs, vector_type* x) + { + x->update(rhs->getData(ReSolve::memory::DEVICE), ReSolve::memory::DEVICE, ReSolve::memory::DEVICE); + x->setDataUpdated(ReSolve::memory::DEVICE); + + if (solve_mode_ == 0) { + mem_.deviceSynchronize(); + status_rocblas_ = rocsolver_dcsrrf_solve(workspace_->getRocblasHandle(), + A_->getNumRows(), + 1, + M_->getNnz(), + M_->getRowData(ReSolve::memory::DEVICE), + M_->getColData(ReSolve::memory::DEVICE), + M_->getValues(ReSolve::memory::DEVICE), + d_P_, + d_Q_, + x->getData(ReSolve::memory::DEVICE), + 
A_->getNumRows(), + infoM_); + mem_.deviceSynchronize(); + } else { + // not implemented yet + } + return status_rocblas_; + } + + int LinSolverDirectRocSolverRf::setSolveMode(int mode) + { + solve_mode_ = mode; + return 0; + } + + int LinSolverDirectRocSolverRf::getSolveMode() + { + return solve_mode_; + } + + void LinSolverDirectRocSolverRf::addFactors(matrix::Sparse* L, matrix::Sparse* U) + { + // L and U need to be in CSC format + index_type n = L->getNumRows(); + index_type* Lp = L->getColData(ReSolve::memory::HOST); + index_type* Li = L->getRowData(ReSolve::memory::HOST); + index_type* Up = U->getColData(ReSolve::memory::HOST); + index_type* Ui = U->getRowData(ReSolve::memory::HOST); + + index_type nnzM = ( L->getNnz() + U->getNnz() - n ); + M_ = new matrix::Csr(n, n, nnzM); + M_->allocateMatrixData(ReSolve::memory::DEVICE); + M_->allocateMatrixData(ReSolve::memory::HOST); + index_type* mia = M_->getRowData(ReSolve::memory::HOST); + index_type* mja = M_->getColData(ReSolve::memory::HOST); + index_type row; + for(index_type i = 0; i < n; ++i) { + // go through EACH COLUMN OF L first + for(index_type j = Lp[i]; j < Lp[i + 1]; ++j) { + row = Li[j]; + // BUT dont count diagonal twice, important + if(row != i) { + mia[row + 1]++; + } + } + // then each column of U + for(index_type j = Up[i]; j < Up[i + 1]; ++j) { + row = Ui[j]; + mia[row + 1]++; + } + } + // then organize mia_; + mia[0] = 0; + for(index_type i = 1; i < n + 1; i++) { + mia[i] += mia[i - 1]; + } + + std::vector Mshifts(n, 0); + for(index_type i = 0; i < n; ++i) { + // go through EACH COLUMN OF L first + for(int j = Lp[i]; j < Lp[i + 1]; ++j) { + row = Li[j]; + if(row != i) { + // place (row, i) where it belongs! 
+ mja[mia[row] + Mshifts[row]] = i; + Mshifts[row]++; + } + } + // each column of U next + for(index_type j = Up[i]; j < Up[i + 1]; ++j) { + row = Ui[j]; + mja[mia[row] + Mshifts[row]] = i; + Mshifts[row]++; + } + } + //Mshifts.~vector(); + } +}// namespace resolve diff --git a/resolve/LinSolverDirectRocSolverRf.hpp b/resolve/LinSolverDirectRocSolverRf.hpp new file mode 100644 index 00000000..5804393f --- /dev/null +++ b/resolve/LinSolverDirectRocSolverRf.hpp @@ -0,0 +1,59 @@ +#pragma once +#include "Common.hpp" +#include "LinSolver.hpp" +#include +#include + +#include +#include +#include +#include +#include + +namespace ReSolve +{ + // Forward declaration of vector::Vector class + namespace vector + { + class Vector; + } + + // Forward declaration of matrix::Sparse class + namespace matrix + { + class Sparse; + } + + class LinSolverDirectRocSolverRf : public LinSolverDirect + { + using vector_type = vector::Vector; + + public: + LinSolverDirectRocSolverRf(LinAlgWorkspaceHIP* workspace); + ~LinSolverDirectRocSolverRf(); + + int setup(matrix::Sparse* A, matrix::Sparse* L, matrix::Sparse* U, index_type* P, index_type* Q, vector_type* rhs); + + int refactorize(); + int solve(vector_type* rhs, vector_type* x); + int solve(vector_type* rhs);// the solutuon is returned IN RHS (rhs is overwritten) + + int setSolveMode(int mode); // should probably be enum + int getSolveMode(); //should be enum too + + private: + rocblas_status status_rocblas_; + + index_type* d_P_; + index_type* d_Q_; + + MemoryHandler mem_; ///< Device memory manager object + LinAlgWorkspaceHIP* workspace_; + + // to be exported to matrix handler in a later time + void addFactors(matrix::Sparse* L, matrix::Sparse* U); //create L+U from sepeate L, U factors + rocsolver_rfinfo infoM_; + matrix::Sparse* M_;//the matrix that contains added factors + int solve_mode_; + }; +} diff --git a/resolve/LinSolverIterativeFGMRES.cpp b/resolve/LinSolverIterativeFGMRES.cpp index 0bf1720f..40fdb22c 100644 --- 
a/resolve/LinSolverIterativeFGMRES.cpp +++ b/resolve/LinSolverIterativeFGMRES.cpp @@ -10,8 +10,9 @@ namespace ReSolve { using out = io::Logger; - LinSolverIterativeFGMRES::LinSolverIterativeFGMRES() + LinSolverIterativeFGMRES::LinSolverIterativeFGMRES(std::string memspace) { + memspace_ = memspace; this->matrix_handler_ = nullptr; this->vector_handler_ = nullptr; tol_ = 1e-14; //default @@ -25,8 +26,10 @@ namespace ReSolve LinSolverIterativeFGMRES::LinSolverIterativeFGMRES(MatrixHandler* matrix_handler, VectorHandler* vector_handler, - GramSchmidt* gs) + GramSchmidt* gs, + std::string memspace) { + memspace_ = memspace; this->matrix_handler_ = matrix_handler; this->vector_handler_ = vector_handler; this->GS_ = gs; @@ -46,8 +49,10 @@ namespace ReSolve index_type conv_cond, MatrixHandler* matrix_handler, VectorHandler* vector_handler, - GramSchmidt* gs) + GramSchmidt* gs, + std::string memspace) { + memspace_ = memspace; this->matrix_handler_ = matrix_handler; this->vector_handler_ = vector_handler; this->GS_ = gs; @@ -113,12 +118,15 @@ namespace ReSolve vector_type* vec_v = new vector_type(n_); vector_type* vec_z = new vector_type(n_); //V[0] = b-A*x_0 + //debug + d_Z_->setToZero(memory::DEVICE); + d_V_->setToZero(memory::DEVICE); rhs->deepCopyVectorData(d_V_->getData(memory::DEVICE), 0, memory::DEVICE); - matrix_handler_->matvec(A_, x, d_V_, &MINUSONE, &ONE, "csr", "cuda"); + matrix_handler_->matvec(A_, x, d_V_, &MINUSONE, &ONE, "csr", memspace_); rnorm = 0.0; - bnorm = vector_handler_->dot(rhs, rhs, "cuda"); - rnorm = vector_handler_->dot(d_V_, d_V_, "cuda"); + bnorm = vector_handler_->dot(rhs, rhs, memspace_); + rnorm = vector_handler_->dot(d_V_, d_V_, memspace_); //rnorm = ||V_1|| rnorm = sqrt(rnorm); @@ -154,7 +162,7 @@ namespace ReSolve // normalize first vector t = 1.0 / rnorm; - vector_handler_->scal(&t, d_V_, "cuda"); + vector_handler_->scal(&t, d_V_, memspace_); // initialize norm history h_rs_[0] = rnorm; i = -1; @@ -175,11 +183,11 @@ namespace ReSolve 
vec_v->setData( d_V_->getVectorData(i + 1, memory::DEVICE), memory::DEVICE); - matrix_handler_->matvec(A_, vec_z, vec_v, &ONE, &ZERO,"csr", "cuda"); + matrix_handler_->matvec(A_, vec_z, vec_v, &ONE, &ZERO,"csr", memspace_); // orthogonalize V[i+1], form a column of h_H_ - GS_->orthogonalize(n_, d_V_, h_H_, i, "cuda"); ; + GS_->orthogonalize(n_, d_V_, h_H_, i, memspace_); ; if(i != 0) { for(int k = 1; k <= i; k++) { k1 = k - 1; @@ -188,7 +196,6 @@ namespace ReSolve h_H_[i * (restart_ + 1) + k] = -h_s_[k1] * t + h_c_[k1] * h_H_[i * (restart_ + 1) + k]; } } // if i!=0 - double Hii = h_H_[i * (restart_ + 1) + i]; double Hii1 = h_H_[(i) * (restart_ + 1) + i + 1]; double gam = sqrt(Hii * Hii + Hii1 * Hii1); @@ -229,7 +236,7 @@ namespace ReSolve // get solution for(j = 0; j <= i; j++) { vec_z->setData( d_Z_->getVectorData(j, memory::DEVICE), memory::DEVICE); - vector_handler_->axpy(&h_rs_[j], vec_z, x, "cuda"); + vector_handler_->axpy(&h_rs_[j], vec_z, x, memspace_); } /* test solution */ @@ -240,8 +247,8 @@ namespace ReSolve } rhs->deepCopyVectorData(d_V_->getData(memory::DEVICE), 0, memory::DEVICE); - matrix_handler_->matvec(A_, x, d_V_, &MINUSONE, &ONE,"csr", "cuda"); - rnorm = vector_handler_->dot(d_V_, d_V_, "cuda"); + matrix_handler_->matvec(A_, x, d_V_, &MINUSONE, &ONE,"csr", memspace_); + rnorm = vector_handler_->dot(d_V_, d_V_, memspace_); // rnorm = ||V_1|| rnorm = sqrt(rnorm); @@ -253,9 +260,9 @@ namespace ReSolve return 0; } - int LinSolverIterativeFGMRES::setupPreconditioner(std::string name, LinSolverDirect* LU_solver) + int LinSolverIterativeFGMRES::setupPreconditioner(std::string type, LinSolverDirect* LU_solver) { - if (name != "CuSolverRf") { + if (type != "LU") { out::warning() << "Only cusolverRf tri solve can be used as a preconditioner at this time." 
<< std::endl; return 1; } else { @@ -308,7 +315,7 @@ namespace ReSolve int LinSolverIterativeFGMRES::resetMatrix(matrix::Sparse* new_matrix) { A_ = new_matrix; - matrix_handler_->setValuesChanged(true, "cuda"); + matrix_handler_->setValuesChanged(true, memspace_); return 0; } diff --git a/resolve/LinSolverIterativeFGMRES.hpp b/resolve/LinSolverIterativeFGMRES.hpp index 8b2c722d..a9fc5058 100644 --- a/resolve/LinSolverIterativeFGMRES.hpp +++ b/resolve/LinSolverIterativeFGMRES.hpp @@ -13,17 +13,19 @@ namespace ReSolve using vector_type = vector::Vector; public: - LinSolverIterativeFGMRES(); + LinSolverIterativeFGMRES(std::string memspace = "cuda"); LinSolverIterativeFGMRES( MatrixHandler* matrix_handler, VectorHandler* vector_handler, - GramSchmidt* gs); + GramSchmidt* gs, + std::string memspace = "cuda"); LinSolverIterativeFGMRES(index_type restart, real_type tol, index_type maxit, index_type conv_cond, MatrixHandler* matrix_handler, VectorHandler* vector_handler, - GramSchmidt* gs); + GramSchmidt* gs, + std::string memspace = "cuda"); ~LinSolverIterativeFGMRES(); int solve(vector_type* rhs, vector_type* x); @@ -48,6 +50,8 @@ namespace ReSolve private: //remember matrix handler and vector handler are inherited. 
+ std::string memspace_; + real_type tol_; index_type maxit_; index_type restart_; diff --git a/resolve/matrix/MatrixHandler.cpp b/resolve/matrix/MatrixHandler.cpp index 0a7124da..b2d4339f 100644 --- a/resolve/matrix/MatrixHandler.cpp +++ b/resolve/matrix/MatrixHandler.cpp @@ -295,7 +295,6 @@ namespace ReSolve { } else if (memspace == "cpu") { return cpuImpl_->matvec(A, vec_x, vec_result, alpha, beta, matrixFormat); } else if (memspace == "hip") { - printf("about to run mv"); return hipImpl_->matvec(A, vec_x, vec_result, alpha, beta, matrixFormat); } else { out::error() << "Support for device " << memspace << " not implemented (yet)" << std::endl; diff --git a/resolve/matrix/MatrixHandlerHip.cpp b/resolve/matrix/MatrixHandlerHip.cpp index b4f8e483..ff10e973 100644 --- a/resolve/matrix/MatrixHandlerHip.cpp +++ b/resolve/matrix/MatrixHandlerHip.cpp @@ -49,7 +49,6 @@ namespace ReSolve { if (!workspaceHIP->matvecSetup()) { //setup first, allocate, etc. - rocsparse_create_mat_descr(&(descrA)); rocsparse_set_mat_index_base(descrA, rocsparse_index_base_zero); rocsparse_set_mat_type(descrA, rocsparse_matrix_type_general); @@ -69,6 +68,8 @@ namespace ReSolve { error_sum += status; mem_.deviceSynchronize(); + workspaceHIP->setSpmvMatrixDescriptor(descrA); + workspaceHIP->setSpmvMatrixInfo(infoA); workspaceHIP->matvecSetupDone(); } diff --git a/resolve/matrix/MatrixHandlerHip.hpp b/resolve/matrix/MatrixHandlerHip.hpp index 7f06f3bd..37f11a7b 100644 --- a/resolve/matrix/MatrixHandlerHip.hpp +++ b/resolve/matrix/MatrixHandlerHip.hpp @@ -38,11 +38,11 @@ namespace ReSolve { int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr); virtual int matvec(matrix::Sparse* A, - vector_type* vec_x, - vector_type* vec_result, - const real_type* alpha, - const real_type* beta, - std::string matrix_type); + vector_type* vec_x, + vector_type* vec_result, + const real_type* alpha, + const real_type* beta, + std::string matrix_type); virtual int Matrix1Norm(matrix::Sparse *A, real_type* norm); diff 
--git a/resolve/vector/Vector.cpp b/resolve/vector/Vector.cpp index 0a62bd02..3b4f9e72 100644 --- a/resolve/vector/Vector.cpp +++ b/resolve/vector/Vector.cpp @@ -140,7 +140,8 @@ namespace ReSolve { namespace vector { real_type* Vector::getData(index_type i, memory::MemorySpace memspace) { if ((memspace == memory::HOST) && (cpu_updated_ == false) && (gpu_updated_ == true )) { - copyData(memspace, memory::HOST); + // remember IN FIRST OUT SECOND!!! + copyData(memory::DEVICE, memspace); owns_cpu_data_ = true; } @@ -174,7 +175,6 @@ namespace ReSolve { namespace vector { //allocate first mem_.allocateArrayOnDevice(&d_data_, n_ * k_); } - switch(control) { case 0: //cpu->cuda mem_.copyArrayHostToDevice(d_data_, h_data_, n_current_ * k_); diff --git a/tests/functionality/CMakeLists.txt b/tests/functionality/CMakeLists.txt index a6652c26..acc5ce60 100644 --- a/tests/functionality/CMakeLists.txt +++ b/tests/functionality/CMakeLists.txt @@ -26,6 +26,15 @@ if(RESOLVE_USE_CUDA) endif(RESOLVE_USE_CUDA) + +if(RESOLVE_USE_HIP) + + # Build KLU+rossolver test + add_executable(rocsolver_rf_test.exe testKLU_RocSolver.cpp) + target_link_libraries(rocsolver_rf_test.exe PRIVATE ReSolve) + +endif(RESOLVE_USE_HIP) + # Install tests set(installable_tests klu_klu_test.exe) @@ -36,6 +45,11 @@ if(RESOLVE_USE_CUDA) klu_glu_test.exe) endif(RESOLVE_USE_CUDA) +if(RESOLVE_USE_HIP) + set(installable_tests ${installable_tests} + rocsolver_rf_test.exe) +endif(RESOLVE_USE_HIP) + install(TARGETS ${installable_tests} RUNTIME DESTINATION bin/resolve/tests/functionality) @@ -50,3 +64,7 @@ if(RESOLVE_USE_CUDA) add_test(NAME klu_rf_fgmres_test COMMAND $ "${test_data_dir}") add_test(NAME klu_glu_test COMMAND $ "${test_data_dir}") endif(RESOLVE_USE_CUDA) + +if(RESOLVE_USE_HIP) + add_test(NAME rocsolver_rf_test COMMAND $ "${test_data_dir}") +endif(RESOLVE_USE_HIP) diff --git a/tests/functionality/testKLU_Rf_FGMRES.cpp b/tests/functionality/testKLU_Rf_FGMRES.cpp index 6a81dac1..2e582e02 100644 --- 
a/tests/functionality/testKLU_Rf_FGMRES.cpp +++ b/tests/functionality/testKLU_Rf_FGMRES.cpp @@ -213,7 +213,7 @@ int main(int argc, char *argv[]) error_sum += status; FGMRES->resetMatrix(A); - status = FGMRES->setupPreconditioner("CuSolverRf", Rf); + status = FGMRES->setupPreconditioner("LU", Rf); error_sum += status; vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); diff --git a/tests/functionality/testKLU_RocSolver.cpp b/tests/functionality/testKLU_RocSolver.cpp new file mode 100644 index 00000000..9fd43ac1 --- /dev/null +++ b/tests/functionality/testKLU_RocSolver.cpp @@ -0,0 +1,251 @@ +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +//author: KS +//functionality test to check whether rocsolver_rf works correctly. + +using namespace ReSolve::constants; + +int main(int argc, char *argv[]) +{ + // Use ReSolve data types. + using index_type = ReSolve::index_type; + using real_type = ReSolve::real_type; + using vector_type = ReSolve::vector::Vector; + using matrix_type = ReSolve::matrix::Sparse; + + //we want error sum to be 0 at the end + //that means PASS. + //otheriwse it is a FAIL. + int error_sum = 0; + int status = 0; + + ReSolve::LinAlgWorkspaceHIP* workspace_HIP = new ReSolve::LinAlgWorkspaceHIP(); + workspace_HIP->initializeHandles(); + ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace_HIP); + ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace_HIP); + + ReSolve::LinSolverDirectKLU* KLU = new ReSolve::LinSolverDirectKLU; + KLU->setupParameters(1, 0.1, false); + + ReSolve::LinSolverDirectRocSolverRf* Rf = new ReSolve::LinSolverDirectRocSolverRf(workspace_HIP); + // Input to this code is location of `data` directory where matrix files are stored + const std::string data_path = (argc == 2) ? 
argv[1] : "./"; + + + std::string matrixFileName1 = data_path + "data/matrix_ACTIVSg200_AC_10.mtx"; + std::string matrixFileName2 = data_path + "data/matrix_ACTIVSg200_AC_11.mtx"; + + std::string rhsFileName1 = data_path + "data/rhs_ACTIVSg200_AC_10.mtx.ones"; + std::string rhsFileName2 = data_path + "data/rhs_ACTIVSg200_AC_11.mtx.ones"; + + // Read first matrix + std::ifstream mat1(matrixFileName1); + if(!mat1.is_open()) + { + std::cout << "Failed to open file " << matrixFileName1 << "\n"; + return -1; + } + ReSolve::matrix::Coo* A_coo = ReSolve::io::readMatrixFromFile(mat1); + ReSolve::matrix::Csr* A = new ReSolve::matrix::Csr(A_coo->getNumRows(), + A_coo->getNumColumns(), + A_coo->getNnz(), + A_coo->symmetric(), + A_coo->expanded()); + mat1.close(); + + // Read first rhs vector + std::ifstream rhs1_file(rhsFileName1); + if(!rhs1_file.is_open()) + { + std::cout << "Failed to open file " << rhsFileName1 << "\n"; + return -1; + } + real_type* rhs = ReSolve::io::readRhsFromFile(rhs1_file); + real_type* x = new real_type[A->getNumRows()]; + vector_type* vec_rhs = new vector_type(A->getNumRows()); + vector_type* vec_x = new vector_type(A->getNumRows()); + vec_x->allocate(ReSolve::memory::HOST);//for KLU + vec_x->allocate(ReSolve::memory::DEVICE); + vector_type* vec_r = new vector_type(A->getNumRows()); + rhs1_file.close(); + + // Convert first matrix to CSR format + matrix_handler->coo2csr(A_coo, A, "cpu"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); + vec_rhs->setDataUpdated(ReSolve::memory::HOST); + + // Solve the first system using KLU + status = KLU->setup(A); + error_sum += status; + + status = KLU->analyze(); + error_sum += status; + + status = KLU->factorize(); + error_sum += status; + + status = KLU->solve(vec_rhs, vec_x); + error_sum += status; + + std::cout<<"KLU solve status: "<getLFactor(); + matrix_type* U = KLU->getUFactor(); + if (L == nullptr) {printf("ERROR");} + index_type* P = KLU->getPOrdering(); + index_type* Q = 
KLU->getQOrdering(); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + vec_rhs->setDataUpdated(ReSolve::memory::DEVICE); + + status = Rf->setup(A, L, U, P, Q, vec_rhs); + error_sum += status; + std::cout<<"Rf setup status: "<refactorize(); + error_sum += status; + vector_type* vec_test; + vector_type* vec_diff; + vec_test = new vector_type(A->getNumRows()); + vec_diff = new vector_type(A->getNumRows()); + real_type* x_data = new real_type[A->getNumRows()]; + for (int i=0; igetNumRows(); ++i){ + x_data[i] = 1.0; + } + + vec_test->setData(x_data, ReSolve::memory::HOST); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + vec_diff->update(x_data, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + + // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, "hip")); + matrix_handler->setValuesChanged(true, "hip"); + status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr","hip"); + error_sum += status; + + real_type normRmatrix1 = sqrt(vector_handler->dot(vec_r, vec_r, "hip")); + + //for testing only - control + + real_type normXtrue = sqrt(vector_handler->dot(vec_x, vec_x, "hip")); + real_type normB1 = sqrt(vector_handler->dot(vec_rhs, vec_rhs, "hip")); + + //compute x-x_true + vector_handler->axpy(&MINUSONE, vec_x, vec_diff, "hip"); + //evaluate its norm + real_type normDiffMatrix1 = sqrt(vector_handler->dot(vec_diff, vec_diff, "hip")); + + //compute the residual using exact solution + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE,"csr", "hip"); + error_sum += status; + real_type exactSol_normRmatrix1 = sqrt(vector_handler->dot(vec_r, vec_r, "hip")); + //evaluate the residual ON THE CPU using COMPUTED solution + + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); + + status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cpu"); + error_sum += status; + + real_type 
normRmatrix1CPU = sqrt(vector_handler->dot(vec_r, vec_r, "hip")); + + std::cout<<"Results (first matrix): "<coo2csr(A_coo, A, "hip"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + + // this hangs up + status = Rf->refactorize(); + error_sum += status; + + std::cout<<"rocSolverRf refactorization status: "<solve(vec_rhs, vec_x); + error_sum += status; + + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + matrix_handler->setValuesChanged(true, "hip"); + + status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "hip"); + error_sum += status; + + real_type normRmatrix2 = sqrt(vector_handler->dot(vec_r, vec_r, "hip")); + + //for testing only - control + real_type normB2 = sqrt(vector_handler->dot(vec_rhs, vec_rhs, "hip")); + //compute x-x_true + vec_diff->update(x_data, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + vector_handler->axpy(&MINUSONE, vec_x, vec_diff, "hip"); + //evaluate its norm + real_type normDiffMatrix2 = sqrt(vector_handler->dot(vec_diff, vec_diff, "hip")); + + //compute the residual using exact solution + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE, "csr", "hip"); + error_sum += status; + real_type exactSol_normRmatrix2 = sqrt(vector_handler->dot(vec_r, vec_r, "hip")); + + std::cout<<"Results (second matrix): "< Date: Wed, 1 Nov 2023 21:37:40 -0400 Subject: [PATCH 07/12] Apparently we need to add rocsolver library to the list of dependencies. 
--- cmake/ReSolveFindHipLibraries.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/ReSolveFindHipLibraries.cmake b/cmake/ReSolveFindHipLibraries.cmake index 4cb0c443..d0d22395 100644 --- a/cmake/ReSolveFindHipLibraries.cmake +++ b/cmake/ReSolveFindHipLibraries.cmake @@ -12,6 +12,7 @@ target_link_libraries(resolve_hip INTERFACE hip::device roc::rocblas roc::rocsparse + rocsolver ) install(TARGETS resolve_hip EXPORT ReSolveTargets) From 05a5b2e7eb06a07981a8344c9aebf80d011d0cbb Mon Sep 17 00:00:00 2001 From: pelesh Date: Thu, 2 Nov 2023 11:36:28 -0400 Subject: [PATCH 08/12] A working alternative triangular solver (faster) for rocsolverrf (#56) * a WORKING alternative triangular solver (faster) for rocsolverrf --------- Co-authored-by: kswirydo --- examples/r_KLU_rocSolverRf_FGMRES.cpp | 3 +- resolve/LinSolverDirectRocSolverRf.cpp | 213 ++++++++++++++++++++++++- resolve/LinSolverDirectRocSolverRf.hpp | 22 ++- resolve/hip/hipKernels.h | 11 ++ resolve/hip/hipKernels.hip | 44 +++++ resolve/hip/hipVectorKernels.hip | 1 + 6 files changed, 287 insertions(+), 7 deletions(-) diff --git a/examples/r_KLU_rocSolverRf_FGMRES.cpp b/examples/r_KLU_rocSolverRf_FGMRES.cpp index d2e5f7a6..45fe4681 100644 --- a/examples/r_KLU_rocSolverRf_FGMRES.cpp +++ b/examples/r_KLU_rocSolverRf_FGMRES.cpp @@ -131,6 +131,7 @@ int main(int argc, char *argv[]) std::cout<<"KLU analysis status: "<factorize(); std::cout<<"KLU factorization status: "<solve(vec_rhs, vec_x); std::cout<<"KLU solve status: "<update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); @@ -149,6 +150,7 @@ int main(int argc, char *argv[]) if (L == nullptr) {printf("ERROR");} index_type* P = KLU->getPOrdering(); index_type* Q = KLU->getQOrdering(); + Rf->setSolveMode(1); Rf->setup(A, L, U, P, Q, vec_rhs); Rf->refactorize(); std::cout<<"about to set FGMRES" <solve(vec_rhs, vec_x); std::cout<<"ROCSOLVER RF solve status: "<update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); norm_b = vector_handler->dot(vec_r, 
vec_r, "hip"); norm_b = sqrt(norm_b); diff --git a/resolve/LinSolverDirectRocSolverRf.cpp b/resolve/LinSolverDirectRocSolverRf.cpp index 5869756d..f9f73b4a 100644 --- a/resolve/LinSolverDirectRocSolverRf.cpp +++ b/resolve/LinSolverDirectRocSolverRf.cpp @@ -1,6 +1,7 @@ #include #include #include "LinSolverDirectRocSolverRf.hpp" +#include namespace ReSolve { @@ -15,6 +16,12 @@ namespace ReSolve { mem_.deleteOnDevice(d_P_); mem_.deleteOnDevice(d_Q_); + + mem_.deleteOnDevice(d_aux1_); + mem_.deleteOnDevice(d_aux2_); + + delete L_csr_; + delete U_csr_; } int LinSolverDirectRocSolverRf::setup(matrix::Sparse* A, matrix::Sparse* L, matrix::Sparse* U, index_type* P, index_type* Q, vector_type* rhs) @@ -56,7 +63,109 @@ namespace ReSolve mem_.deviceSynchronize(); error_sum += status_rocblas_; + // tri solve setup + if (solve_mode_ == 1) { // fast mode + L_csr_ = new ReSolve::matrix::Csr(L->getNumRows(), L->getNumColumns(), L->getNnz()); + U_csr_ = new ReSolve::matrix::Csr(U->getNumRows(), U->getNumColumns(), U->getNnz()); + + L_csr_->allocateMatrixData(ReSolve::memory::DEVICE); + U_csr_->allocateMatrixData(ReSolve::memory::DEVICE); + + rocsparse_create_mat_descr(&(descr_L_)); + rocsparse_set_mat_fill_mode(descr_L_, rocsparse_fill_mode_lower); + rocsparse_set_mat_index_base(descr_L_, rocsparse_index_base_zero); + + rocsparse_create_mat_descr(&(descr_U_)); + rocsparse_set_mat_index_base(descr_U_, rocsparse_index_base_zero); + rocsparse_set_mat_fill_mode(descr_U_, rocsparse_fill_mode_upper); + + rocsparse_create_mat_info(&info_L_); + rocsparse_create_mat_info(&info_U_); + + // local variables + size_t L_buffer_size; + size_t U_buffer_size; + + status_rocblas_ = rocsolver_dcsrrf_splitlu(workspace_->getRocblasHandle(), + n, + M_->getNnzExpanded(), + M_->getRowData(ReSolve::memory::DEVICE), + M_->getColData(ReSolve::memory::DEVICE), + M_->getValues(ReSolve::memory::DEVICE), //vals_, + L_csr_->getRowData(ReSolve::memory::DEVICE), + L_csr_->getColData(ReSolve::memory::DEVICE), + 
L_csr_->getValues(ReSolve::memory::DEVICE), //vals_, + U_csr_->getRowData(ReSolve::memory::DEVICE), + U_csr_->getColData(ReSolve::memory::DEVICE), + U_csr_->getValues(ReSolve::memory::DEVICE)); + + error_sum += status_rocblas_; + + status_rocsparse_ = rocsparse_dcsrsv_buffer_size(workspace_->getRocsparseHandle(), + rocsparse_operation_none, + n, + L_csr_->getNnz(), + descr_L_, + L_csr_->getValues(ReSolve::memory::DEVICE), //vals_, + L_csr_->getRowData(ReSolve::memory::DEVICE), + L_csr_->getColData(ReSolve::memory::DEVICE), + info_L_, + &L_buffer_size); + error_sum += status_rocsparse_; + + printf("buffer size for L %d status %d \n", L_buffer_size, status_rocsparse_); + // hipMalloc((void**)&(L_buffer), L_buffer_size); + + mem_.allocateBufferOnDevice(&L_buffer_, L_buffer_size); + status_rocsparse_ = rocsparse_dcsrsv_buffer_size(workspace_->getRocsparseHandle(), + rocsparse_operation_none, + n, + U_csr_->getNnz(), + descr_U_, + U_csr_->getValues(ReSolve::memory::DEVICE), //vals_, + U_csr_->getRowData(ReSolve::memory::DEVICE), + U_csr_->getColData(ReSolve::memory::DEVICE), + info_U_, + &U_buffer_size); + error_sum += status_rocsparse_; + // hipMalloc((void**)&(U_buffer), U_buffer_size); + mem_.allocateBufferOnDevice(&U_buffer_, U_buffer_size); + printf("buffer size for U %d status %d \n", U_buffer_size, status_rocsparse_); + + status_rocsparse_ = rocsparse_dcsrsv_analysis(workspace_->getRocsparseHandle(), + rocsparse_operation_none, + n, + L_csr_->getNnz(), + descr_L_, + L_csr_->getValues(ReSolve::memory::DEVICE), //vals_, + L_csr_->getRowData(ReSolve::memory::DEVICE), + L_csr_->getColData(ReSolve::memory::DEVICE), + info_L_, + rocsparse_analysis_policy_force, + rocsparse_solve_policy_auto, + L_buffer_); + error_sum += status_rocsparse_; + if (status_rocsparse_!=0)printf("status after analysis 1 %d \n", status_rocsparse_); + status_rocsparse_ = rocsparse_dcsrsv_analysis(workspace_->getRocsparseHandle(), + rocsparse_operation_none, + n, + U_csr_->getNnz(), + descr_U_, 
+ U_csr_->getValues(ReSolve::memory::DEVICE), //vals_, + U_csr_->getRowData(ReSolve::memory::DEVICE), + U_csr_->getColData(ReSolve::memory::DEVICE), + info_U_, + rocsparse_analysis_policy_force, + rocsparse_solve_policy_auto, + U_buffer_); + error_sum += status_rocsparse_; + if (status_rocsparse_!=0)printf("status after analysis 2 %d \n", status_rocsparse_); + //allocate aux data + + mem_.allocateArrayOnDevice(&d_aux1_,n); + mem_.allocateArrayOnDevice(&d_aux2_,n); + } return error_sum; } @@ -78,15 +187,38 @@ namespace ReSolve d_Q_, infoM_); + mem_.deviceSynchronize(); error_sum += status_rocblas_; + if (solve_mode_ == 1) { + //split M, fill L and U with correct values +printf("solve mode 1, splitting the factors again \n"); + status_rocblas_ = rocsolver_dcsrrf_splitlu(workspace_->getRocblasHandle(), + A_->getNumRows(), + M_->getNnzExpanded(), + M_->getRowData(ReSolve::memory::DEVICE), + M_->getColData(ReSolve::memory::DEVICE), + M_->getValues(ReSolve::memory::DEVICE), //vals_, + L_csr_->getRowData(ReSolve::memory::DEVICE), + L_csr_->getColData(ReSolve::memory::DEVICE), + L_csr_->getValues(ReSolve::memory::DEVICE), //vals_, + U_csr_->getRowData(ReSolve::memory::DEVICE), + U_csr_->getColData(ReSolve::memory::DEVICE), + U_csr_->getValues(ReSolve::memory::DEVICE)); + + mem_.deviceSynchronize(); + error_sum += status_rocblas_; + + } + return error_sum; } // solution is returned in RHS int LinSolverDirectRocSolverRf::solve(vector_type* rhs) { + int error_sum = 0; if (solve_mode_ == 0) { mem_.deviceSynchronize(); status_rocblas_ = rocsolver_dcsrrf_solve(workspace_->getRocblasHandle(), @@ -104,15 +236,51 @@ namespace ReSolve mem_.deviceSynchronize(); } else { // not implemented yet + permuteVectorP(A_->getNumRows(), d_P_, rhs->getData(ReSolve::memory::DEVICE), d_aux1_); + mem_.deviceSynchronize(); + rocsparse_dcsrsv_solve(workspace_->getRocsparseHandle(), + rocsparse_operation_none, + A_->getNumRows(), + L_csr_->getNnz(), + &(constants::ONE), + descr_L_, + 
L_csr_->getValues(ReSolve::memory::DEVICE), //vals_, + L_csr_->getRowData(ReSolve::memory::DEVICE), + L_csr_->getColData(ReSolve::memory::DEVICE), + info_L_, + d_aux1_, + d_aux2_, //result + rocsparse_solve_policy_auto, + L_buffer_); + error_sum += status_rocsparse_; + + rocsparse_dcsrsv_solve(workspace_->getRocsparseHandle(), + rocsparse_operation_none, + A_->getNumRows(), + U_csr_->getNnz(), + &(constants::ONE), + descr_L_, + U_csr_->getValues(ReSolve::memory::DEVICE), //vals_, + U_csr_->getRowData(ReSolve::memory::DEVICE), + U_csr_->getColData(ReSolve::memory::DEVICE), + info_U_, + d_aux2_, //input + d_aux1_,//result + rocsparse_solve_policy_auto, + U_buffer_); + error_sum += status_rocsparse_; + + permuteVectorQ(A_->getNumRows(), d_Q_,d_aux1_,rhs->getData(ReSolve::memory::DEVICE)); + mem_.deviceSynchronize(); } - return status_rocblas_; + return error_sum; } int LinSolverDirectRocSolverRf::solve(vector_type* rhs, vector_type* x) { x->update(rhs->getData(ReSolve::memory::DEVICE), ReSolve::memory::DEVICE, ReSolve::memory::DEVICE); x->setDataUpdated(ReSolve::memory::DEVICE); - + int error_sum = 0; if (solve_mode_ == 0) { mem_.deviceSynchronize(); status_rocblas_ = rocsolver_dcsrrf_solve(workspace_->getRocblasHandle(), @@ -127,11 +295,50 @@ namespace ReSolve x->getData(ReSolve::memory::DEVICE), A_->getNumRows(), infoM_); + error_sum += status_rocblas_; mem_.deviceSynchronize(); } else { // not implemented yet + + permuteVectorP(A_->getNumRows(), d_P_, rhs->getData(ReSolve::memory::DEVICE), d_aux1_); + mem_.deviceSynchronize(); + + rocsparse_dcsrsv_solve(workspace_->getRocsparseHandle(), + rocsparse_operation_none, + A_->getNumRows(), + L_csr_->getNnz(), + &(constants::ONE), + descr_L_, + L_csr_->getValues(ReSolve::memory::DEVICE), //vals_, + L_csr_->getRowData(ReSolve::memory::DEVICE), + L_csr_->getColData(ReSolve::memory::DEVICE), + info_L_, + d_aux1_, + d_aux2_, //result + rocsparse_solve_policy_auto, + L_buffer_); + error_sum += status_rocsparse_; + + 
rocsparse_dcsrsv_solve(workspace_->getRocsparseHandle(), + rocsparse_operation_none, + A_->getNumRows(), + U_csr_->getNnz(), + &(constants::ONE), + descr_U_, + U_csr_->getValues(ReSolve::memory::DEVICE), //vals_, + U_csr_->getRowData(ReSolve::memory::DEVICE), + U_csr_->getColData(ReSolve::memory::DEVICE), + info_U_, + d_aux2_, //input + d_aux1_,//result + rocsparse_solve_policy_auto, + U_buffer_); + error_sum += status_rocsparse_; + + permuteVectorQ(A_->getNumRows(), d_Q_,d_aux1_,x->getData(ReSolve::memory::DEVICE)); + mem_.deviceSynchronize(); } - return status_rocblas_; + return error_sum; } int LinSolverDirectRocSolverRf::setSolveMode(int mode) diff --git a/resolve/LinSolverDirectRocSolverRf.hpp b/resolve/LinSolverDirectRocSolverRf.hpp index 5804393f..eb3a11a6 100644 --- a/resolve/LinSolverDirectRocSolverRf.hpp +++ b/resolve/LinSolverDirectRocSolverRf.hpp @@ -42,8 +42,8 @@ namespace ReSolve int getSolveMode(); //should be enum too private: - rocblas_status status_rocblas_; - + rocblas_status status_rocblas_; + rocsparse_status status_rocsparse_; index_type* d_P_; index_type* d_Q_; @@ -54,6 +54,22 @@ namespace ReSolve void addFactors(matrix::Sparse* L, matrix::Sparse* U); //create L+U from sepeate L, U factors rocsolver_rfinfo infoM_; matrix::Sparse* M_;//the matrix that contains added factors - int solve_mode_; + int solve_mode_; // 0 is default and 1 is fast + + // not used by default - for fast solve + rocsparse_mat_descr descr_L_{nullptr}; + rocsparse_mat_descr descr_U_{nullptr}; + + rocsparse_mat_info info_L_{nullptr}; + rocsparse_mat_info info_U_{nullptr}; + + void* L_buffer_{nullptr}; + void* U_buffer_{nullptr}; + + ReSolve::matrix::Csr* L_csr_; + ReSolve::matrix::Csr* U_csr_; + + real_type* d_aux1_{nullptr}; + real_type* d_aux2_{nullptr}; }; } diff --git a/resolve/hip/hipKernels.h b/resolve/hip/hipKernels.h index 9c48783a..986efc84 100644 --- a/resolve/hip/hipKernels.h +++ b/resolve/hip/hipKernels.h @@ -12,3 +12,14 @@ void matrix_row_sums(int n, int* 
a_ia, double* a_val, double* result); + +// needed for triangular solve + +void permuteVectorP(int n, + int* perm_vector, + double* vec_in, + double* vec_out); +void permuteVectorQ(int n, + int* perm_vector, + double* vec_in, + double* vec_out); diff --git a/resolve/hip/hipKernels.hip b/resolve/hip/hipKernels.hip index 13f53d85..abad5b39 100644 --- a/resolve/hip/hipKernels.hip +++ b/resolve/hip/hipKernels.hip @@ -143,6 +143,34 @@ __global__ void matrixInfNormPart1(const int n, } +__global__ void permuteVectorP_kernel(const int n, + const int* perm_vector, + const double* vec_in, + double* vec_out){ + + //one thread per vector entry, pass through rows + + int idx = blockIdx.x*blockDim.x + threadIdx.x; + while (idx Date: Thu, 2 Nov 2023 14:36:24 -0400 Subject: [PATCH 09/12] working rocsolver FGMRES test (#58) Co-authored-by: kswirydo --- tests/functionality/CMakeLists.txt | 8 +- .../testKLU_RocSolver_FGMRES.cpp | 271 ++++++++++++++++++ 2 files changed, 278 insertions(+), 1 deletion(-) create mode 100644 tests/functionality/testKLU_RocSolver_FGMRES.cpp diff --git a/tests/functionality/CMakeLists.txt b/tests/functionality/CMakeLists.txt index acc5ce60..85b47fd7 100644 --- a/tests/functionality/CMakeLists.txt +++ b/tests/functionality/CMakeLists.txt @@ -33,6 +33,10 @@ if(RESOLVE_USE_HIP) add_executable(rocsolver_rf_test.exe testKLU_RocSolver.cpp) target_link_libraries(rocsolver_rf_test.exe PRIVATE ReSolve) + # And another one to test FGMRES version + add_executable(rocsolver_rf_fgmres_test.exe testKLU_RocSolver_FGMRES.cpp) + target_link_libraries(rocsolver_rf_fgmres_test.exe PRIVATE ReSolve) + endif(RESOLVE_USE_HIP) # Install tests @@ -47,7 +51,8 @@ endif(RESOLVE_USE_CUDA) if(RESOLVE_USE_HIP) set(installable_tests ${installable_tests} - rocsolver_rf_test.exe) + rocsolver_rf_test.exe + rocsolver_rf_fgmres_test.exe) endif(RESOLVE_USE_HIP) install(TARGETS ${installable_tests} @@ -67,4 +72,5 @@ endif(RESOLVE_USE_CUDA) if(RESOLVE_USE_HIP) add_test(NAME rocsolver_rf_test 
COMMAND $ "${test_data_dir}") + add_test(NAME rocsolver_rf_fgmres_test COMMAND $ "${test_data_dir}") endif(RESOLVE_USE_HIP) diff --git a/tests/functionality/testKLU_RocSolver_FGMRES.cpp b/tests/functionality/testKLU_RocSolver_FGMRES.cpp new file mode 100644 index 00000000..a544eb54 --- /dev/null +++ b/tests/functionality/testKLU_RocSolver_FGMRES.cpp @@ -0,0 +1,271 @@ +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +//author: KS +//functionality test to check whether cuSolverRf/FGMRES works correctly. + +using namespace ReSolve::constants; + +int main(int argc, char *argv[]) +{ + // Use ReSolve data types. + using index_type = ReSolve::index_type; + using real_type = ReSolve::real_type; + using vector_type = ReSolve::vector::Vector; + + //we want error sum to be 0 at the end + //that means PASS. + //otheriwse it is a FAIL. + int error_sum = 0; + int status = 0; + + ReSolve::LinAlgWorkspaceHIP* workspace_HIP = new ReSolve::LinAlgWorkspaceHIP(); + workspace_HIP->initializeHandles(); + ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace_HIP); + ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace_HIP); + + ReSolve::LinSolverDirectKLU* KLU = new ReSolve::LinSolverDirectKLU; + KLU->setupParameters(1, 0.1, false); + + ReSolve::LinSolverDirectRocSolverRf* Rf = new ReSolve::LinSolverDirectRocSolverRf(workspace_HIP); + ReSolve::GramSchmidt* GS = new ReSolve::GramSchmidt(vector_handler, ReSolve::GramSchmidt::cgs2); + ReSolve::LinSolverIterativeFGMRES* FGMRES = new ReSolve::LinSolverIterativeFGMRES(matrix_handler, vector_handler, GS, "hip"); + // Input to this code is location of `data` directory where matrix files are stored + const std::string data_path = (argc == 2) ? 
argv[1] : "./"; + + + std::string matrixFileName1 = data_path + "data/matrix_ACTIVSg2000_AC_00.mtx"; + std::string matrixFileName2 = data_path + "data/matrix_ACTIVSg2000_AC_02.mtx"; + + std::string rhsFileName1 = data_path + "data/rhs_ACTIVSg2000_AC_00.mtx.ones"; + std::string rhsFileName2 = data_path + "data/rhs_ACTIVSg2000_AC_02.mtx.ones"; + + + + // Read first matrix + std::ifstream mat1(matrixFileName1); + if(!mat1.is_open()) + { + std::cout << "Failed to open file " << matrixFileName1 << "\n"; + return -1; + } + ReSolve::matrix::Coo* A_coo = ReSolve::io::readMatrixFromFile(mat1); + ReSolve::matrix::Csr* A = new ReSolve::matrix::Csr(A_coo->getNumRows(), + A_coo->getNumColumns(), + A_coo->getNnz(), + A_coo->symmetric(), + A_coo->expanded()); + mat1.close(); + + // Read first rhs vector + std::ifstream rhs1_file(rhsFileName1); + if(!rhs1_file.is_open()) + { + std::cout << "Failed to open file " << rhsFileName1 << "\n"; + return -1; + } + real_type* rhs = ReSolve::io::readRhsFromFile(rhs1_file); + real_type* x = new real_type[A->getNumRows()]; + vector_type* vec_rhs = new vector_type(A->getNumRows()); + vector_type* vec_x = new vector_type(A->getNumRows()); + vector_type* vec_r = new vector_type(A->getNumRows()); + rhs1_file.close(); + + // Convert first matrix to CSR format + matrix_handler->coo2csr(A_coo, A, "cpu"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); + vec_rhs->setDataUpdated(ReSolve::memory::HOST); + + // Solve the first system using KLU + status = KLU->setup(A); + error_sum += status; + + status = KLU->analyze(); + error_sum += status; + + status = KLU->factorize(); + error_sum += status; + + status = KLU->solve(vec_rhs, vec_x); + error_sum += status; + + vector_type* vec_test; + vector_type* vec_diff; + + vec_test = new vector_type(A->getNumRows()); + vec_diff = new vector_type(A->getNumRows()); + real_type* x_data = new real_type[A->getNumRows()]; + + for (int i=0; igetNumRows(); ++i){ + x_data[i] = 1.0; + } + + 
vec_test->setData(x_data, ReSolve::memory::HOST); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + vec_diff->update(x_data, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + + // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, ReSolve::memory::DEVICE)); + matrix_handler->setValuesChanged(true, "hip"); + //evaluate the residual ||b-Ax|| + status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr","hip"); + error_sum += status; + + real_type normRmatrix1 = sqrt(vector_handler->dot(vec_r, vec_r, "hip")); + + + //for testing only - control + + real_type normXtrue = sqrt(vector_handler->dot(vec_x, vec_x, "hip")); + real_type normB1 = sqrt(vector_handler->dot(vec_rhs, vec_rhs, "hip")); + + //compute x-x_true + vector_handler->axpy(&MINUSONE, vec_x, vec_diff, "hip"); + //evaluate its norm + real_type normDiffMatrix1 = sqrt(vector_handler->dot(vec_diff, vec_diff, "hip")); + + //compute the residual using exact solution + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE,"csr", "hip"); + error_sum += status; + real_type exactSol_normRmatrix1 = sqrt(vector_handler->dot(vec_r, vec_r, "hip")); + //evaluate the residual ON THE CPU using COMPUTED solution + + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); + + status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cpu"); + error_sum += status; + + real_type normRmatrix1CPU = sqrt(vector_handler->dot(vec_r, vec_r, "hip")); + + std::cout<<"Results (first matrix): "<getLFactor(); + ReSolve::matrix::Csc* U = (ReSolve::matrix::Csc*) KLU->getUFactor(); + + if (L == nullptr) { + printf("ERROR"); + } + index_type* P = KLU->getPOrdering(); + index_type* Q = KLU->getQOrdering(); + Rf->setSolveMode(1); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + error_sum += Rf->setup(A, L, U, P, Q, vec_rhs); + FGMRES->setMaxit(200); + 
FGMRES->setRestart(100); + + GS->setup(A->getNumRows(), FGMRES->getRestart()); + status = FGMRES->setup(A); + error_sum += status; + + // Load the second matrix + std::ifstream mat2(matrixFileName2); + if(!mat2.is_open()) + { + std::cout << "Failed to open file " << matrixFileName2 << "\n"; + return -1; + } + ReSolve::io::readAndUpdateMatrix(mat2, A_coo); + mat2.close(); + + // Load the second rhs vector + std::ifstream rhs2_file(rhsFileName2); + if(!rhs2_file.is_open()) + { + std::cout << "Failed to open file " << rhsFileName2 << "\n"; + return -1; + } + ReSolve::io::readAndUpdateRhs(rhs2_file, &rhs); + rhs2_file.close(); + + matrix_handler->coo2csr(A_coo, A, "hip"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + + status = Rf->refactorize(); + error_sum += status; + + vec_x->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + status = Rf->solve(vec_x); + error_sum += status; + + FGMRES->resetMatrix(A); + status = FGMRES->setupPreconditioner("LU", Rf); + error_sum += status; + + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + status = FGMRES->solve(vec_rhs, vec_x); + error_sum += status; + + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + matrix_handler->setValuesChanged(true, "hip"); + + //evaluate final residual + status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "hip"); + error_sum += status; + + real_type normRmatrix2 = sqrt(vector_handler->dot(vec_r, vec_r, "hip")); + + + //for testing only - control + real_type normB2 = sqrt(vector_handler->dot(vec_rhs, vec_rhs, "hip")); + //compute x-x_true + vec_diff->update(x_data, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + vector_handler->axpy(&MINUSONE, vec_x, vec_diff, "hip"); + //evaluate its norm + real_type normDiffMatrix2 = sqrt(vector_handler->dot(vec_diff, vec_diff, "hip")); + + //compute the residual using exact solution + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + status 
= matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE, "csr", "hip"); + error_sum += status; + real_type exactSol_normRmatrix2 = sqrt(vector_handler->dot(vec_r, vec_r, "hip")); + std::cout<<"Results (second matrix): "<getNumIter()<<" (max 200, restart 100)"<getInitResidualNorm()<<" "<getFinalResidualNorm()<<" (tol 1e-14)"< Date: Thu, 2 Nov 2023 21:15:19 -0400 Subject: [PATCH 10/12] Fix warnings in solver classes. (#62) --- examples/r_KLU_rocSolverRf_FGMRES.cpp | 17 +++++++++++------ examples/r_KLU_rocsolverrf.cpp | 8 +------- resolve/LinSolver.cpp | 22 ++++++++++++++++------ resolve/LinSolver.hpp | 9 ++++++++- resolve/LinSolverDirectCuSolverGLU.cpp | 9 ++++++++- resolve/LinSolverDirectCuSolverGLU.hpp | 7 ++++++- resolve/LinSolverDirectCuSolverRf.cpp | 7 ++++++- resolve/LinSolverDirectCuSolverRf.hpp | 7 ++++++- resolve/LinSolverDirectKLU.cpp | 7 ++++++- resolve/LinSolverDirectKLU.hpp | 8 +++++++- resolve/LinSolverDirectRocSolverRf.cpp | 22 +++++++++++----------- resolve/LinSolverDirectRocSolverRf.hpp | 7 ++++++- 12 files changed, 92 insertions(+), 38 deletions(-) diff --git a/examples/r_KLU_rocSolverRf_FGMRES.cpp b/examples/r_KLU_rocSolverRf_FGMRES.cpp index 45fe4681..32d1865f 100644 --- a/examples/r_KLU_rocSolverRf_FGMRES.cpp +++ b/examples/r_KLU_rocSolverRf_FGMRES.cpp @@ -141,12 +141,8 @@ int main(int argc, char *argv[]) matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "hip"); printf("\t 2-Norm of the residual : %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "hip"))/norm_b); if (i == 1) { - ReSolve::matrix::Csc* L /* _csc */ = (ReSolve::matrix::Csc*) KLU->getLFactor(); - ReSolve::matrix::Csc* U /* _csc */ = (ReSolve::matrix::Csc*) KLU->getUFactor(); - // ReSolve::matrix::Csr* L = new ReSolve::matrix::Csr(L_csc->getNumRows(), L_csc->getNumColumns(), L_csc->getNnz()); - // ReSolve::matrix::Csr* U = new ReSolve::matrix::Csr(U_csc->getNumRows(), U_csc->getNumColumns(), U_csc->getNnz()); - // matrix_handler->csc2csr(L_csc,L, "hip"); - // 
matrix_handler->csc2csr(U_csc,U, "hip"); + ReSolve::matrix::Csc* L = (ReSolve::matrix::Csc*) KLU->getLFactor(); + ReSolve::matrix::Csc* U = (ReSolve::matrix::Csc*) KLU->getUFactor(); if (L == nullptr) {printf("ERROR");} index_type* P = KLU->getPOrdering(); index_type* Q = KLU->getQOrdering(); @@ -193,8 +189,17 @@ int main(int argc, char *argv[]) } // for (int i = 0; i < numSystems; ++i) + delete A; + delete A_coo; + delete KLU; + delete Rf; delete [] x; delete [] rhs; + delete vec_r; + delete vec_x; + delete workspace_HIP; + delete matrix_handler; + delete vector_handler; return 0; } diff --git a/examples/r_KLU_rocsolverrf.cpp b/examples/r_KLU_rocsolverrf.cpp index b3ebbecf..5651ed56 100644 --- a/examples/r_KLU_rocsolverrf.cpp +++ b/examples/r_KLU_rocsolverrf.cpp @@ -135,20 +135,13 @@ int main(int argc, char *argv[] ) vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); Rf->setup(A, L, U, P, Q, vec_rhs); Rf->refactorize(); - //dont do it here - // delete [] P; - // delete [] Q; } } else { - //status = KLU->refactorize(); std::cout<<"Using rocsolver rf"<refactorize(); std::cout<<"rocsolver rf refactorization status: "<solve(vec_rhs, vec_x); std::cout<<"rocsolver rf solve status: "<solve(vec_rhs, vec_x); - //std::cout<<"KLU solve status: "<update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); @@ -164,6 +157,7 @@ int main(int argc, char *argv[] ) //now DELETE delete A; + delete A_coo; delete KLU; delete Rf; delete [] x; diff --git a/resolve/LinSolver.cpp b/resolve/LinSolver.cpp index 558a6500..5682ec40 100644 --- a/resolve/LinSolver.cpp +++ b/resolve/LinSolver.cpp @@ -13,12 +13,6 @@ namespace ReSolve //destroy the matrix and hadlers } - int LinSolver::setup(matrix::Sparse* A) - { - this->A_ = A; - return 0; - } - real_type LinSolver::evaluateResidual() { //to be implemented @@ -42,6 +36,17 @@ namespace ReSolve delete [] Q_; } + int LinSolverDirect::setup(matrix::Sparse* A, + matrix::Sparse* /* L */, + matrix::Sparse* /* U */, + index_type* /* P 
*/, + index_type* /* Q */, + vector_type* /* rhs */) + { + this->A_ = A; + return 0; + } + int LinSolverDirect::analyze() { return 0; @@ -92,6 +97,11 @@ namespace ReSolve { } + int LinSolverIterative::setup(matrix::Sparse* A) + { + this->A_ = A; + return 0; + } int LinSolverIterative::solve(vector_type* /* rhs */, vector_type* /* init_guess */) { diff --git a/resolve/LinSolver.hpp b/resolve/LinSolver.hpp index 8c9ca5c9..a34aeba0 100644 --- a/resolve/LinSolver.hpp +++ b/resolve/LinSolver.hpp @@ -31,7 +31,6 @@ namespace ReSolve LinSolver(); virtual ~LinSolver(); - virtual int setup(matrix::Sparse* A); real_type evaluateResidual(); protected: @@ -49,6 +48,13 @@ namespace ReSolve LinSolverDirect(); virtual ~LinSolverDirect(); //return 0 if successful! + virtual int setup(matrix::Sparse* A, + matrix::Sparse* L, + matrix::Sparse* U, + index_type* P, + index_type* Q, + vector_type* rhs); + virtual int analyze(); //the same as symbolic factorization virtual int factorize(); virtual int refactorize(); @@ -72,6 +78,7 @@ namespace ReSolve public: LinSolverIterative(); ~LinSolverIterative(); + virtual int setup(matrix::Sparse* A); virtual int solve(vector_type* rhs, vector_type* init_guess); }; diff --git a/resolve/LinSolverDirectCuSolverGLU.cpp b/resolve/LinSolverDirectCuSolverGLU.cpp index 0350efea..65af5812 100644 --- a/resolve/LinSolverDirectCuSolverGLU.cpp +++ b/resolve/LinSolverDirectCuSolverGLU.cpp @@ -8,6 +8,8 @@ namespace ReSolve { + using vector_type = vector::Vector; + LinSolverDirectCuSolverGLU::LinSolverDirectCuSolverGLU(LinAlgWorkspaceCUDA* workspace) { this->workspace_ = workspace; @@ -22,7 +24,12 @@ namespace ReSolve delete M_; } - int LinSolverDirectCuSolverGLU::setup(matrix::Sparse* A, matrix::Sparse* L, matrix::Sparse* U, index_type* P, index_type* Q) + int LinSolverDirectCuSolverGLU::setup(matrix::Sparse* A, + matrix::Sparse* L, + matrix::Sparse* U, + index_type* P, + index_type* Q, + vector_type* /* rhs */) { int error_sum = 0; diff --git 
a/resolve/LinSolverDirectCuSolverGLU.hpp b/resolve/LinSolverDirectCuSolverGLU.hpp index a48c8cba..899f52e3 100644 --- a/resolve/LinSolverDirectCuSolverGLU.hpp +++ b/resolve/LinSolverDirectCuSolverGLU.hpp @@ -32,7 +32,12 @@ namespace ReSolve int refactorize(); int solve(vector_type* rhs, vector_type* x); - int setup(matrix::Sparse* A, matrix::Sparse* L, matrix::Sparse* U, index_type* P, index_type* Q); + int setup(matrix::Sparse* A, + matrix::Sparse* L, + matrix::Sparse* U, + index_type* P, + index_type* Q, + vector_type* rhs = nullptr); private: void addFactors(matrix::Sparse* L, matrix::Sparse* U); //create L+U from sepeate L, U factors diff --git a/resolve/LinSolverDirectCuSolverRf.cpp b/resolve/LinSolverDirectCuSolverRf.cpp index 37a3ffda..905a0e6e 100644 --- a/resolve/LinSolverDirectCuSolverRf.cpp +++ b/resolve/LinSolverDirectCuSolverRf.cpp @@ -17,7 +17,12 @@ namespace ReSolve mem_.deleteOnDevice(d_T_); } - int LinSolverDirectCuSolverRf::setup(matrix::Sparse* A, matrix::Sparse* L, matrix::Sparse* U, index_type* P, index_type* Q) + int LinSolverDirectCuSolverRf::setup(matrix::Sparse* A, + matrix::Sparse* L, + matrix::Sparse* U, + index_type* P, + index_type* Q, + vector_type* /* rhs */) { //remember - P and Q are generally CPU variables int error_sum = 0; diff --git a/resolve/LinSolverDirectCuSolverRf.hpp b/resolve/LinSolverDirectCuSolverRf.hpp index f0ee755e..77e8b94f 100644 --- a/resolve/LinSolverDirectCuSolverRf.hpp +++ b/resolve/LinSolverDirectCuSolverRf.hpp @@ -26,7 +26,12 @@ namespace ReSolve LinSolverDirectCuSolverRf(); ~LinSolverDirectCuSolverRf(); - int setup(matrix::Sparse* A, matrix::Sparse* L, matrix::Sparse* U, index_type* P, index_type* Q); + int setup(matrix::Sparse* A, + matrix::Sparse* L, + matrix::Sparse* U, + index_type* P, + index_type* Q, + vector_type* rhs = nullptr); void setAlgorithms(cusolverRfFactorization_t fact_alg, cusolverRfTriangularSolve_t solve_alg); diff --git a/resolve/LinSolverDirectKLU.cpp b/resolve/LinSolverDirectKLU.cpp 
index 6af27d10..6336e9e9 100644 --- a/resolve/LinSolverDirectKLU.cpp +++ b/resolve/LinSolverDirectKLU.cpp @@ -18,7 +18,12 @@ namespace ReSolve klu_free_numeric(&Numeric_, &Common_); } - int LinSolverDirectKLU::setup(matrix::Sparse* A) + int LinSolverDirectKLU::setup(matrix::Sparse* A, + matrix::Sparse* /* L */, + matrix::Sparse* /* U */, + index_type* /* P */, + index_type* /* Q */, + vector_type* /* rhs */) { this->A_ = A; return 0; diff --git a/resolve/LinSolverDirectKLU.hpp b/resolve/LinSolverDirectKLU.hpp index 13e27b47..b4edadb1 100644 --- a/resolve/LinSolverDirectKLU.hpp +++ b/resolve/LinSolverDirectKLU.hpp @@ -24,7 +24,13 @@ namespace ReSolve public: LinSolverDirectKLU(); ~LinSolverDirectKLU(); - int setup(matrix::Sparse* A); + + int setup(matrix::Sparse* A, + matrix::Sparse* L = nullptr, + matrix::Sparse* U = nullptr, + index_type* P = nullptr, + index_type* Q = nullptr, + vector_type* rhs = nullptr); void setupParameters(int ordering, double KLU_threshold, bool halt_if_singular); diff --git a/resolve/LinSolverDirectRocSolverRf.cpp b/resolve/LinSolverDirectRocSolverRf.cpp index f9f73b4a..96d1da79 100644 --- a/resolve/LinSolverDirectRocSolverRf.cpp +++ b/resolve/LinSolverDirectRocSolverRf.cpp @@ -24,7 +24,12 @@ namespace ReSolve delete U_csr_; } - int LinSolverDirectRocSolverRf::setup(matrix::Sparse* A, matrix::Sparse* L, matrix::Sparse* U, index_type* P, index_type* Q, vector_type* rhs) + int LinSolverDirectRocSolverRf::setup(matrix::Sparse* A, + matrix::Sparse* L, + matrix::Sparse* U, + index_type* P, + index_type* Q, + vector_type* rhs) { //remember - P and Q are generally CPU variables int error_sum = 0; @@ -113,9 +118,6 @@ namespace ReSolve &L_buffer_size); error_sum += status_rocsparse_; - printf("buffer size for L %d status %d \n", L_buffer_size, status_rocsparse_); - // hipMalloc((void**)&(L_buffer), L_buffer_size); - mem_.allocateBufferOnDevice(&L_buffer_, L_buffer_size); status_rocsparse_ = 
rocsparse_dcsrsv_buffer_size(workspace_->getRocsparseHandle(), rocsparse_operation_none, @@ -128,9 +130,7 @@ namespace ReSolve info_U_, &U_buffer_size); error_sum += status_rocsparse_; - // hipMalloc((void**)&(U_buffer), U_buffer_size); mem_.allocateBufferOnDevice(&U_buffer_, U_buffer_size); - printf("buffer size for U %d status %d \n", U_buffer_size, status_rocsparse_); status_rocsparse_ = rocsparse_dcsrsv_analysis(workspace_->getRocsparseHandle(), rocsparse_operation_none, @@ -389,22 +389,22 @@ printf("solve mode 1, splitting the factors again \n"); mia[i] += mia[i - 1]; } - std::vector Mshifts(n, 0); + std::vector Mshifts(static_cast(n), 0); for(index_type i = 0; i < n; ++i) { // go through EACH COLUMN OF L first for(int j = Lp[i]; j < Lp[i + 1]; ++j) { row = Li[j]; if(row != i) { // place (row, i) where it belongs! - mja[mia[row] + Mshifts[row]] = i; - Mshifts[row]++; + mja[mia[row] + Mshifts[static_cast(row)]] = i; + Mshifts[static_cast(row)]++; } } // each column of U next for(index_type j = Up[i]; j < Up[i + 1]; ++j) { row = Ui[j]; - mja[mia[row] + Mshifts[row]] = i; - Mshifts[row]++; + mja[mia[row] + Mshifts[static_cast(row)]] = i; + Mshifts[static_cast(row)]++; } } //Mshifts.~vector(); diff --git a/resolve/LinSolverDirectRocSolverRf.hpp b/resolve/LinSolverDirectRocSolverRf.hpp index eb3a11a6..bb623fb2 100644 --- a/resolve/LinSolverDirectRocSolverRf.hpp +++ b/resolve/LinSolverDirectRocSolverRf.hpp @@ -32,7 +32,12 @@ namespace ReSolve LinSolverDirectRocSolverRf(LinAlgWorkspaceHIP* workspace); ~LinSolverDirectRocSolverRf(); - int setup(matrix::Sparse* A, matrix::Sparse* L, matrix::Sparse* U, index_type* P, index_type* Q, vector_type* rhs); + int setup(matrix::Sparse* A, + matrix::Sparse* L, + matrix::Sparse* U, + index_type* P, + index_type* Q, + vector_type* rhs); int refactorize(); int solve(vector_type* rhs, vector_type* x); From 4b3bcf5629346a7da457aec75cbbd4687f15e801 Mon Sep 17 00:00:00 2001 From: Cameron Rutherford Date: Thu, 2 Nov 2023 21:52:26 -0400 
Subject: [PATCH 11/12] Incline rocm 5.6.0 dev (#53) * Add working incline build * Update spack submodule with rocsolver. * Update spack and add rocsolver/roctracer to CMake. * Remove several blanket include_directories. * Fix exported CMake bug in CXX_STANDARD * Update examples/resolve_consumer/CMakeLists.txt --------- Co-authored-by: Ryan Danehy Co-authored-by: pelesh --- .gitlab/pnnl/base.gitlab-ci.yml | 2 +- .gitlab/pnnl/incline.gitlab-ci.yml | 6 +- CMakeLists.txt | 13 -- CMakePresets.json | 28 ++- buildsystem/incline-env.sh | 12 +- buildsystem/spack/incline/env.sh | 7 +- buildsystem/spack/incline/install.sh | 5 + .../spack/incline/modules/dependencies.sh | 204 ++++++++++++++++-- buildsystem/spack/incline/spack.yaml | 70 ++++-- buildsystem/spack/spack | 2 +- cmake/ReSolveConfig.cmake.in | 15 +- cmake/ReSolveFindHipLibraries.cmake | 11 +- examples/CMakeLists.txt | 3 + resolve/CMakeLists.txt | 1 + resolve/LinSolverDirectRocSolverRf.hpp | 1 - resolve/cpu/CMakeLists.txt | 5 - resolve/cuda/CMakeLists.txt | 5 +- resolve/hip/CMakeLists.txt | 10 +- resolve/utilities/logger/CMakeLists.txt | 5 +- resolve/workspace/CMakeLists.txt | 7 +- 20 files changed, 328 insertions(+), 84 deletions(-) diff --git a/.gitlab/pnnl/base.gitlab-ci.yml b/.gitlab/pnnl/base.gitlab-ci.yml index 092c8f19..4b5954cf 100644 --- a/.gitlab/pnnl/base.gitlab-ci.yml +++ b/.gitlab/pnnl/base.gitlab-ci.yml @@ -269,4 +269,4 @@ stages: variables: WORKDIR_SUFFIX: "x86_64-clang-hip-build" MY_CLUSTER: "incline" - SLURM_ARGS: " --exclusive --ntasks=3 " + SLURM_ARGS: " -N 1 --ntasks=3 " diff --git a/.gitlab/pnnl/incline.gitlab-ci.yml b/.gitlab/pnnl/incline.gitlab-ci.yml index afc4fd05..f62b3ad0 100644 --- a/.gitlab/pnnl/incline.gitlab-ci.yml +++ b/.gitlab/pnnl/incline.gitlab-ci.yml @@ -3,15 +3,15 @@ Incline Build: - .cluster_build - .incline variables: - SCRIPT_ARGS: " --build-only " #--job=clang-hip " + SCRIPT_ARGS: " --build-only " Incline Test: extends: - .cluster_test - .incline variables: - SCRIPT_ARGS: " 
--test-only " #--job=clang-hip " - CTESTARGS: " --timeout 240 --output-on-failure -LE incline-skip " + SCRIPT_ARGS: " --test-only " + CTESTARGS: " --timeout 240 --output-on-failure " needs: ['Incline Build'] pending: diff --git a/CMakeLists.txt b/CMakeLists.txt index db4e8e74..cd99f931 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -85,12 +85,6 @@ if(RESOLVE_USE_HIP) enable_language(HIP) check_language(HIP) include(ReSolveFindHipLibraries) - - # This is just an agly hack to make HIP build work - get_target_property(hip_includes hip::device INTERFACE_INCLUDE_DIRECTORIES) - message(STATUS "HIP include directories: ${hip_includes}") - # TODO - use targets properly - include_directories(${hip_includes}) else() message(STATUS "Not using HIP") endif(RESOLVE_USE_HIP) @@ -100,18 +94,11 @@ endif(RESOLVE_USE_HIP) configure_file( ${CMAKE_SOURCE_DIR}/resolve/resolve_defs.hpp.in ${CMAKE_BINARY_DIR}/resolve/resolve_defs.hpp) - -# include build directory for Fortran name mangling header -# TODO - target based includes -include_directories(${CMAKE_BINARY_DIR}) - install( FILES ${CMAKE_BINARY_DIR}/resolve/resolve_defs.hpp DESTINATION include/resolve ) -# TODO - fix this -include_directories(${CMAKE_SOURCE_DIR}) # Enable testing enable_testing() diff --git a/CMakePresets.json b/CMakePresets.json index c00f9919..4809aca5 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -17,6 +17,17 @@ "RESOLVE_USE_CUDA": "ON" } }, + { + "name": "rocm", + "displayName": "ROCM build", + "description": "Base config to build with ROCM", + "binaryDir": "${sourceDir}/build", + "installDir": "${sourceDir}/install", + "generator": "Unix Makefiles", + "cacheVariables": { + "RESOLVE_USE_HIP": "ON" + } + }, { "name": "cpu", "displayName": "CPU only build", @@ -25,7 +36,7 @@ "installDir": "${sourceDir}/install", "generator": "Unix Makefiles" }, - { + { "name": "ascent", "inherits": "cuda", "displayName": "Ascent Build", @@ -43,9 +54,18 @@ }, { "name": "incline", - "inherits": "cpu", - 
"displayName": "Incline CPU only Build", - "description": "Custom changes specific for Incline" + "inherits": "rocm", + "displayName": "Incline Build with rocm", + "description": "Custom changes specific for Incline", + "cacheVariables": { + "CMAKE_HIP_ARCHITECTURES" : "gfx908", + "CMAKE_BUILD_TYPE" : "Debug" + }, + "environment": { + "CC" : "clang", + "CXX" : "clang++", + "FC" : "gfortran" + } } ] diff --git a/buildsystem/incline-env.sh b/buildsystem/incline-env.sh index 3c4e2194..348139ff 100644 --- a/buildsystem/incline-env.sh +++ b/buildsystem/incline-env.sh @@ -1,5 +1,15 @@ +#!/bin/bash + +# Load system rocm source /etc/profile.d/modules.sh module purge module load gcc/8.4.0 -module load rocm/5.3.0 + +# These are necessary in order to see GPUs with sbatch +unset ROCR_VISIBLE_DEVICES +unset CUDA_VISIBLE_DEVICES +unset GPU_DEVICE_ORDINAL + +# Load spack generated modules source ./buildsystem/spack/incline/modules/dependencies.sh + diff --git a/buildsystem/spack/incline/env.sh b/buildsystem/spack/incline/env.sh index 31d03fa4..757cc090 100644 --- a/buildsystem/spack/incline/env.sh +++ b/buildsystem/spack/incline/env.sh @@ -3,19 +3,22 @@ source /etc/profile.d/modules.sh module purge -# Load system python +# Load system python and gcc module load python/miniconda4.12 source /share/apps/python/miniconda4.12/etc/profile.d/conda.sh +module load gcc/8.4.0 # Define environment variables for where spack stores key files # For now, SPACK_INSTALL is the path where everything spack related is installed # If you want to modify the module install path, edit the spack.yaml manually BASE=/qfs/projects/exasgd/resolve/spack-ci export SPACK_INSTALL=$BASE/install +export SPACK_MIRROR=$BASE/../$(whoami)/spack-mirror export SPACK_CACHE=$BASE/../$(whoami)/spack-cache export SPACK_DISABLE_LOCAL_CONFIG=1 -export SPACK_PYTHON=$(which python) +export SPACK_PYTHON=$(which python3) export tempdir=$SPACK_CACHE export TMP=$SPACK_CACHE export TMPDIR=$SPACK_CACHE + diff --git 
a/buildsystem/spack/incline/install.sh b/buildsystem/spack/incline/install.sh index 6494de6f..392562d8 100755 --- a/buildsystem/spack/incline/install.sh +++ b/buildsystem/spack/incline/install.sh @@ -9,8 +9,13 @@ #SBATCH -e spack_install.%J.output #SBTACH -t 240 +export HTTPS_PROXY=http://proxy01.pnl.gov:3128 +export https_proxy=http://proxy01.pnl.gov:3128 export MY_CLUSTER=incline . buildsystem/load-spack.sh && spack develop --no-clone --path=$(pwd) resolve@develop && +spack concretize -f && +spack install -j 64 llvm-amdgpu && +spack load llvm-amdgpu && ./buildsystem/configure-modules.sh 64 diff --git a/buildsystem/spack/incline/modules/dependencies.sh b/buildsystem/spack/incline/modules/dependencies.sh index 75b564ff..75cf6209 100644 --- a/buildsystem/spack/incline/modules/dependencies.sh +++ b/buildsystem/spack/incline/modules/dependencies.sh @@ -1,24 +1,170 @@ module use -a /qfs/projects/exasgd/resolve/spack-ci/install/modules/linux-centos7-zen +# curl@=7.29.0%gcc@=8.4.0~gssapi~ldap~libidn2~librtmp~libssh2+nghttp2 build_system=autotools libs=shared,static tls=openssl arch=linux-centos7-zen +module load curl/7.29.0-gcc-8.4.0-3emq5yx +# gmake@=4.4.1%gcc@=8.4.0~guile build_system=generic arch=linux-centos7-zen +module load gmake/4.4.1-gcc-8.4.0-l7nyr34 # pkgconf@=1.9.5%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen -module load pkgconf/1.9.5-gcc-8.4.0-kl4sdjo -# nghttp2@=1.52.0%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen -module load nghttp2/1.52.0-gcc-8.4.0-pqmjl5g -# ca-certificates-mozilla@=2023-05-30%gcc@=8.4.0 build_system=generic arch=linux-centos7-zen -module load ca-certificates-mozilla/2023-05-30-gcc-8.4.0-txgcsig -# perl@=5.26.0%gcc@=8.4.0+cpanm+opcode+open+shared+threads build_system=generic patches=0eac10e,8cf4302 arch=linux-centos7-zen -module load perl/5.26.0-gcc-8.4.0-h324qox -# zlib-ng@=2.1.3%gcc@=8.4.0+compat+opt build_system=autotools patches=299b958,ae9077a,b692621 arch=linux-centos7-zen -module load 
zlib-ng/2.1.3-gcc-8.4.0-44tydhr -# openssl@=3.1.3%gcc@=8.4.0~docs+shared build_system=generic certs=mozilla arch=linux-centos7-zen -module load openssl/3.1.3-gcc-8.4.0-46yttzm -# curl@=8.4.0%gcc@=8.4.0~gssapi~ldap~libidn2~librtmp~libssh~libssh2+nghttp2 build_system=autotools libs=shared,static tls=openssl arch=linux-centos7-zen -module load curl/8.4.0-gcc-8.4.0-g2rrs23 +module load pkgconf/1.9.5-gcc-8.4.0-733ltud # ncurses@=6.4%gcc@=8.4.0~symlinks+termlib abi=none build_system=autotools arch=linux-centos7-zen -module load ncurses/6.4-gcc-8.4.0-jt7rpqq +module load ncurses/6.4-gcc-8.4.0-gwo76of +# zlib-ng@=2.1.4%gcc@=8.4.0+compat+opt build_system=autotools arch=linux-centos7-zen +module load zlib-ng/2.1.4-gcc-8.4.0-feah6zt # cmake@=3.27.7%gcc@=8.4.0~doc+ncurses+ownlibs build_system=generic build_type=Release arch=linux-centos7-zen -module load cmake/3.27.7-gcc-8.4.0-tu2rruq -# gmake@=4.4.1%gcc@=8.4.0~guile build_system=autotools arch=linux-centos7-zen -module load gmake/4.4.1-gcc-8.4.0-f23wik2 +module load cmake/3.27.7-gcc-8.4.0-rmou7zf +# gmake@=4.4.1%clang@=16.0.0-rocm5.6.0 cxxflags="--gcc-toolchain=/share/apps/gcc/8.4.0" ~guile build_system=generic arch=linux-centos7-zen +module load gmake/4.4.1-clang-16.0.0-rocm5.6.0-6c7b35p +# python@=3.9.12%gcc@=8.4.0+bz2+crypt+ctypes+dbm~debug+libxml2+lzma~nis~optimizations+pic+pyexpat+pythoncmd+readline+shared+sqlite3+ssl~tkinter+uuid+zlib build_system=generic patches=0d98e93,4c24573,ebdca64,f2fd060 arch=linux-centos7-zen +module load python/3.9.12-gcc-8.4.0-ob2n5zs +# re2c@=2.2%gcc@=8.4.0 build_system=generic arch=linux-centos7-zen +module load re2c/2.2-gcc-8.4.0-zmj4cst +# ninja@=1.11.1%gcc@=8.4.0+re2c build_system=generic arch=linux-centos7-zen +module load ninja/1.11.1-gcc-8.4.0-ofxvwff +# z3@=4.11.2%gcc@=8.4.0~gmp~ipo~python build_system=cmake build_type=Release generator=make arch=linux-centos7-zen +module load z3/4.11.2-gcc-8.4.0-363odap +# 
llvm-amdgpu@=5.6.1%gcc@=8.4.0~ipo~link_llvm_dylib~llvm_dylib~openmp+rocm-device-libs build_system=cmake build_type=Release generator=ninja patches=a08bbe1,b66529f,d35aec9 arch=linux-centos7-zen +module load llvm-amdgpu/5.6.1-gcc-8.4.0-vy3wrnq +# rocm-core@=5.6.1%gcc@=8.4.0~ipo build_system=cmake build_type=Release generator=make arch=linux-centos7-zen +module load rocm-core/5.6.1-gcc-8.4.0-llv2yv4 +# rocm-cmake@=5.6.1%gcc@=8.4.0~ipo build_system=cmake build_type=Release generator=make arch=linux-centos7-zen +module load rocm-cmake/5.6.1-gcc-8.4.0-klwq5kk +# comgr@=5.6.1%gcc@=8.4.0~ipo build_system=cmake build_type=Release generator=make arch=linux-centos7-zen +module load comgr/5.6.1-gcc-8.4.0-yl7z2re +# mesa@=23.0.2%gcc@=8.4.0+glx+llvm+opengl~opengles+osmesa~strip build_system=meson buildtype=release default_library=shared arch=linux-centos7-zen +module load mesa/23.0.2-gcc-8.4.0-xffioaq +# glx@=1.4%gcc@=8.4.0 build_system=bundle arch=linux-centos7-zen +module load glx/1.4-gcc-8.4.0-vh5g6sx +# hipify-clang@=5.6.1%gcc@=8.4.0~ipo build_system=cmake build_type=Release generator=make patches=54b8b39 arch=linux-centos7-zen +module load hipify-clang/5.6.1-gcc-8.4.0-e3jea5v +# libiconv@=1.17%gcc@=8.4.0 build_system=autotools libs=shared,static arch=linux-centos7-zen +module load libiconv/1.17-gcc-8.4.0-o2hwfiz +# diffutils@=3.9%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen +module load diffutils/3.9-gcc-8.4.0-7ceszkk +# bzip2@=1.0.8%gcc@=8.4.0~debug~pic+shared build_system=generic arch=linux-centos7-zen +module load bzip2/1.0.8-gcc-8.4.0-on73m5o +# xz@=5.4.1%gcc@=8.4.0~pic build_system=autotools libs=shared,static arch=linux-centos7-zen +module load xz/5.4.1-gcc-8.4.0-v5kymdq +# libxml2@=2.10.3%gcc@=8.4.0+pic~python+shared build_system=autotools arch=linux-centos7-zen +module load libxml2/2.10.3-gcc-8.4.0-6mgqxiy +# pigz@=2.7%gcc@=8.4.0 build_system=makefile arch=linux-centos7-zen +module load pigz/2.7-gcc-8.4.0-btbzuey +# zstd@=1.5.5%gcc@=8.4.0+programs 
build_system=makefile compression=none libs=shared,static arch=linux-centos7-zen +module load zstd/1.5.5-gcc-8.4.0-3ets7dy +# tar@=1.34%gcc@=8.4.0 build_system=autotools zip=pigz arch=linux-centos7-zen +module load tar/1.34-gcc-8.4.0-atzwdgy +# gettext@=0.22.3%gcc@=8.4.0+bzip2+curses+git~libunistring+libxml2+pic+shared+tar+xz build_system=autotools arch=linux-centos7-zen +module load gettext/0.22.3-gcc-8.4.0-m33ujza +# libsigsegv@=2.14%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen +module load libsigsegv/2.14-gcc-8.4.0-gzna4n3 +# m4@=1.4.19%gcc@=8.4.0+sigsegv build_system=autotools patches=9dc5fbd,bfdffa7 arch=linux-centos7-zen +module load m4/1.4.19-gcc-8.4.0-bwzchwl +# elfutils@=0.189%gcc@=8.4.0~debuginfod+exeprefix+nls build_system=autotools arch=linux-centos7-zen +module load elfutils/0.189-gcc-8.4.0-23kjwto +# libtool@=2.4.7%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen +module load libtool/2.4.7-gcc-8.4.0-2bmpsy4 +# util-macros@=1.19.3%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen +module load util-macros/1.19.3-gcc-8.4.0-64inrmm +# libpciaccess@=0.17%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen +module load libpciaccess/0.17-gcc-8.4.0-sh2c4la +# libpthread-stubs@=0.4%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen +module load libpthread-stubs/0.4-gcc-8.4.0-kcav646 +# py-pip@=23.1.2%gcc@=8.4.0 build_system=generic arch=linux-centos7-zen +module load py-pip/23.1.2-gcc-8.4.0-yajovh7 +# py-wheel@=0.41.2%gcc@=8.4.0 build_system=generic arch=linux-centos7-zen +module load py-wheel/0.41.2-gcc-8.4.0-dkkw2va +# py-setuptools@=68.0.0%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen +module load py-setuptools/68.0.0-gcc-8.4.0-ihu4sfq +# meson@=1.2.2%gcc@=8.4.0 build_system=python_pip patches=0f0b1bd,ae59765 arch=linux-centos7-zen +module load meson/1.2.2-gcc-8.4.0-vcdwjmb +# libdrm@=2.4.115%gcc@=8.4.0~docs build_system=generic arch=linux-centos7-zen +module load libdrm/2.4.115-gcc-8.4.0-6h77lxh +# 
perl@=5.26.0%gcc@=8.4.0+cpanm+opcode+open+shared+threads build_system=generic patches=0eac10e,8cf4302 arch=linux-centos7-zen +module load perl/5.26.0-gcc-8.4.0-6tdzqfd +# autoconf@=2.69%gcc@=8.4.0 build_system=autotools patches=35c4492,7793209,a49dd5b arch=linux-centos7-zen +module load autoconf/2.69-gcc-8.4.0-dcrbb7h +# automake@=1.16.5%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen +module load automake/1.16.5-gcc-8.4.0-tvi3cks +# numactl@=2.0.14%gcc@=8.4.0 build_system=autotools patches=4e1d78c,62fc8a8,ff37630 arch=linux-centos7-zen +module load numactl/2.0.14-gcc-8.4.0-7mpcwqq +# hsakmt-roct@=5.6.1%gcc@=8.4.0~ipo+shared build_system=cmake build_type=Release generator=make arch=linux-centos7-zen +module load hsakmt-roct/5.6.1-gcc-8.4.0-4on3xib +# hsa-rocr-dev@=5.6.1%gcc@=8.4.0~image~ipo+shared build_system=cmake build_type=Release generator=make patches=9267179 arch=linux-centos7-zen +module load hsa-rocr-dev/5.6.1-gcc-8.4.0-tdlpv7w +# perl-file-which@=1.27%gcc@=8.4.0 build_system=perl arch=linux-centos7-zen +module load perl-file-which/1.27-gcc-8.4.0-nix64yx +# perl-module-build@=0.4232%gcc@=8.4.0 build_system=perl arch=linux-centos7-zen +module load perl-module-build/0.4232-gcc-8.4.0-ayed35p +# perl-uri-encode@=1.1.1%gcc@=8.4.0 build_system=perl arch=linux-centos7-zen +module load perl-uri-encode/1.1.1-gcc-8.4.0-biqataj +# py-ply@=3.11%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen +module load py-ply/3.11-gcc-8.4.0-creftnl +# py-cppheaderparser@=2.7.4%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen +module load py-cppheaderparser/2.7.4-gcc-8.4.0-nw7554i +# rocminfo@=5.6.1%gcc@=8.4.0~ipo build_system=cmake build_type=Release generator=make arch=linux-centos7-zen +module load rocminfo/5.6.1-gcc-8.4.0-5shaxxj +# roctracer-dev-api@=5.6.1%gcc@=8.4.0 build_system=generic arch=linux-centos7-zen +module load roctracer-dev-api/5.6.1-gcc-8.4.0-gbaoh25 +# hip@=5.6.1%gcc@=8.4.0~cuda~ipo+rocm build_system=cmake build_type=Release 
generator=make patches=aee7249,c2ee21c,e73e91b arch=linux-centos7-zen +module load hip/5.6.1-gcc-8.4.0-zpa2j7f +# msgpack-c@=3.1.1%gcc@=8.4.0~ipo build_system=cmake build_type=Release generator=make arch=linux-centos7-zen +module load msgpack-c/3.1.1-gcc-8.4.0-buxbznu +# procps@=4.0.4%gcc@=8.4.0+nls build_system=autotools arch=linux-centos7-zen +module load procps/4.0.4-gcc-8.4.0-gyn6his +# py-joblib@=1.2.0%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen +module load py-joblib/1.2.0-gcc-8.4.0-ukcd432 +# py-cython@=0.29.36%gcc@=8.4.0 build_system=python_pip patches=c4369ad arch=linux-centos7-zen +module load py-cython/0.29.36-gcc-8.4.0-5f4zyzb +# py-msgpack@=1.0.5%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen +module load py-msgpack/1.0.5-gcc-8.4.0-2xh5udm +# libyaml@=0.2.5%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen +module load libyaml/0.2.5-gcc-8.4.0-hidc7bw +# py-pyyaml@=6.0%gcc@=8.4.0+libyaml build_system=python_pip arch=linux-centos7-zen +module load py-pyyaml/6.0-gcc-8.4.0-4mdsdw2 +# py-distlib@=0.3.7%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen +module load py-distlib/0.3.7-gcc-8.4.0-f25ay4b +# py-editables@=0.3%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen +module load py-editables/0.3-gcc-8.4.0-hrmamrk +# py-flit-core@=3.9.0%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen +module load py-flit-core/3.9.0-gcc-8.4.0-q3yng6k +# py-packaging@=23.1%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen +module load py-packaging/23.1-gcc-8.4.0-7krugqt +# py-pathspec@=0.11.1%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen +module load py-pathspec/0.11.1-gcc-8.4.0-vm5freh +# git@=2.42.0%gcc@=8.4.0+man+nls+perl+subtree~svn~tcltk build_system=autotools arch=linux-centos7-zen +module load git/2.42.0-gcc-8.4.0-k5crf2q +# py-tomli@=2.0.1%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen +module load py-tomli/2.0.1-gcc-8.4.0-m4gh2nb +# py-typing-extensions@=4.8.0%gcc@=8.4.0 
build_system=python_pip arch=linux-centos7-zen +module load py-typing-extensions/4.8.0-gcc-8.4.0-ovqdpbs +# py-setuptools-scm@=7.1.0%gcc@=8.4.0+toml build_system=python_pip arch=linux-centos7-zen +module load py-setuptools-scm/7.1.0-gcc-8.4.0-hqzn5lb +# py-pluggy@=1.0.0%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen +module load py-pluggy/1.0.0-gcc-8.4.0-lqpf66l +# py-calver@=2022.6.26%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen +module load py-calver/2022.6.26-gcc-8.4.0-pm6rj2c +# py-trove-classifiers@=2023.8.7%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen +module load py-trove-classifiers/2023.8.7-gcc-8.4.0-iy66qnh +# py-hatchling@=1.18.0%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen +module load py-hatchling/1.18.0-gcc-8.4.0-bjpjiiq +# py-hatch-vcs@=0.3.0%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen +module load py-hatch-vcs/0.3.0-gcc-8.4.0-hc6rq3a +# py-filelock@=3.12.4%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen +module load py-filelock/3.12.4-gcc-8.4.0-rzqmlrq +# py-platformdirs@=3.10.0%gcc@=8.4.0~wheel build_system=python_pip arch=linux-centos7-zen +module load py-platformdirs/3.10.0-gcc-8.4.0-6hnyp7h +# py-virtualenv@=20.24.5%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen +module load py-virtualenv/20.24.5-gcc-8.4.0-h4mzkzl +# rocblas@=5.6.1%gcc@=8.4.0~ipo+tensile amdgpu_target=auto build_system=cmake build_type=Release generator=make arch=linux-centos7-zen +module load rocblas/5.6.1-gcc-8.4.0-arsno2b +# fmt@=10.1.1%gcc@=8.4.0~ipo+pic~shared build_system=cmake build_type=Release cxxstd=11 generator=make arch=linux-centos7-zen +module load fmt/10.1.1-gcc-8.4.0-4d5ehr5 +# rocprim@=5.6.1%gcc@=8.4.0~ipo amdgpu_target=auto build_system=cmake build_type=Release generator=make arch=linux-centos7-zen +module load rocprim/5.6.1-gcc-8.4.0-nu465tt +# rocsparse@=5.6.1%gcc@=8.4.0~ipo~test amdgpu_target=auto build_system=cmake build_type=Release generator=make arch=linux-centos7-zen 
+module load rocsparse/5.6.1-gcc-8.4.0-wtmfgyn +# rocsolver@=5.6.1%gcc@=8.4.0~ipo+optimal amdgpu_target=auto build_system=cmake build_type=Release generator=make arch=linux-centos7-zen +module load rocsolver/5.6.1-gcc-8.4.0-wlgpkqj +# roctracer-dev@=5.6.1%gcc@=8.4.0~ipo~rocm build_system=cmake build_type=Release generator=make arch=linux-centos7-zen +module load roctracer-dev/5.6.1-gcc-8.4.0-lilld4h # libiconv@=1.17%gcc@=8.4.0 build_system=autotools libs=shared,static arch=linux-centos7-zen module load libiconv/1.17-gcc-8.4.0-wfdnlg6 # diffutils@=3.9%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen @@ -27,6 +173,8 @@ module load diffutils/3.9-gcc-8.4.0-qh566r6 module load libsigsegv/2.14-gcc-8.4.0-iutj4de # m4@=1.4.19%gcc@=8.4.0+sigsegv build_system=autotools patches=9dc5fbd,bfdffa7 arch=linux-centos7-zen module load m4/1.4.19-gcc-8.4.0-x7ktvaf +# perl@=5.26.0%gcc@=8.4.0+cpanm+opcode+open+shared+threads build_system=generic patches=0eac10e,8cf4302 arch=linux-centos7-zen +module load perl/5.26.0-gcc-8.4.0-h324qox # autoconf@=2.69%gcc@=8.4.0 build_system=autotools patches=35c4492,7793209,a49dd5b arch=linux-centos7-zen module load autoconf/2.69-gcc-8.4.0-npluk5j # automake@=1.16.5%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen @@ -35,6 +183,24 @@ module load automake/1.16.5-gcc-8.4.0-tgloywk module load libtool/2.4.7-gcc-8.4.0-gs6gyy3 # gmp@=6.2.1%gcc@=8.4.0+cxx build_system=autotools libs=shared,static patches=69ad2e2 arch=linux-centos7-zen module load gmp/6.2.1-gcc-8.4.0-ythx4o2 +# pkgconf@=1.9.5%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen +module load pkgconf/1.9.5-gcc-8.4.0-kl4sdjo +# nghttp2@=1.52.0%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen +module load nghttp2/1.52.0-gcc-8.4.0-pqmjl5g +# ca-certificates-mozilla@=2023-05-30%gcc@=8.4.0 build_system=generic arch=linux-centos7-zen +module load ca-certificates-mozilla/2023-05-30-gcc-8.4.0-txgcsig +# zlib-ng@=2.1.3%gcc@=8.4.0+compat+opt build_system=autotools 
patches=299b958,ae9077a,b692621 arch=linux-centos7-zen +module load zlib-ng/2.1.3-gcc-8.4.0-44tydhr +# openssl@=3.1.3%gcc@=8.4.0~docs+shared build_system=generic certs=mozilla arch=linux-centos7-zen +module load openssl/3.1.3-gcc-8.4.0-46yttzm +# curl@=8.4.0%gcc@=8.4.0~gssapi~ldap~libidn2~librtmp~libssh~libssh2+nghttp2 build_system=autotools libs=shared,static tls=openssl arch=linux-centos7-zen +module load curl/8.4.0-gcc-8.4.0-g2rrs23 +# ncurses@=6.4%gcc@=8.4.0~symlinks+termlib abi=none build_system=autotools arch=linux-centos7-zen +module load ncurses/6.4-gcc-8.4.0-jt7rpqq +# cmake@=3.27.7%gcc@=8.4.0~doc+ncurses+ownlibs build_system=generic build_type=Release arch=linux-centos7-zen +module load cmake/3.27.7-gcc-8.4.0-tu2rruq +# gmake@=4.4.1%gcc@=8.4.0~guile build_system=autotools arch=linux-centos7-zen +module load gmake/4.4.1-gcc-8.4.0-f23wik2 # metis@=5.1.0%gcc@=8.4.0~gdb~int64~ipo~real64+shared build_system=cmake build_type=Release generator=make patches=4991da9,93a7903,b1225da arch=linux-centos7-zen module load metis/5.1.0-gcc-8.4.0-gsllf6a # autoconf-archive@=2023.02.20%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen @@ -61,5 +227,5 @@ module load mpfr/4.2.0-gcc-8.4.0-cjhi2el module load openblas/0.3.24-gcc-8.4.0-4ei4hpg # suite-sparse@=5.13.0%gcc@=8.4.0~cuda~graphblas~openmp+pic build_system=generic arch=linux-centos7-zen module load suite-sparse/5.13.0-gcc-8.4.0-ivey23b -# resolve@=develop%gcc@=8.4.0~cuda~ipo+klu build_system=cmake build_type=Release dev_path=/people/ruth521/projects/resolve generator=make arch=linux-centos7-zen -## module load resolve/develop-gcc-8.4.0-ugoj3p3 +# resolve@=develop%clang@=16.0.0-rocm5.6.0 cxxflags="--gcc-toolchain=/share/apps/gcc/8.4.0" ~cuda~ipo+klu+rocm amdgpu_target=gfx908 build_system=cmake build_type=Release dev_path=/people/ruth521/projects/resolve generator=make arch=linux-centos7-zen +## module load resolve/develop-clang-16.0.0-rocm5.6.0-6kaaut4 diff --git a/buildsystem/spack/incline/spack.yaml 
b/buildsystem/spack/incline/spack.yaml index 36234ce0..894daf7c 100644 --- a/buildsystem/spack/incline/spack.yaml +++ b/buildsystem/spack/incline/spack.yaml @@ -1,10 +1,35 @@ spack: specs: - - resolve~cuda%gcc@8.4.0 + - resolve~cuda+rocm%clang@16.0.0-rocm5.6.0 amdgpu_target=gfx908 + ^ llvm-amdgpu%gcc + ^ hsa-rocr-dev~image view: false concretizer: - unify: when_possible reuse: true + unify: true + compilers: + - compiler: + spec: gcc@8.4.0 + paths: + cc: /share/apps/gcc/8.4.0/bin/gcc + cxx: /share/apps/gcc/8.4.0/bin/g++ + f77: /share/apps/gcc/8.4.0/bin/gfortran + fc: /share/apps/gcc/8.4.0/bin/gfortran + operating_system: centos7 + target: x86_64 + modules: [gcc/8.4.0] + - compiler: + spec: clang@16.0.0-rocm5.6.0 + paths: + cc: amdclang + cxx: amdclang++ + f77: /share/apps/gcc/8.4.0/bin/gfortran + fc: /share/apps/gcc/8.4.0/bin/gfortran + flags: + cxxflags: --gcc-toolchain=/share/apps/gcc/8.4.0 + operating_system: centos7 + target: x86_64 + modules: [] config: concretizer: clingo install_tree: @@ -23,20 +48,39 @@ spack: write: group read: world group: exasgd + mesa: + externals: + - spec: mesa@23.0.2+glx + prefix: /usr + buildable: false + curl: + externals: + - spec: curl@7.29.0 + prefix: /usr/bin/curl + buildable: false + git: + externals: + - spec: git@2.42.0 + prefix: /share/apps/git/2.42.0 + modules: + - git/2.42.0 + buildable: false + lua: + externals: + - spec: lua@5.4.2 + modules: + - lua/5.4.2 + buildable: false + python: + externals: + - spec: python@3.9.12%gcc + modules: + - python/miniconda4.12 + buildable: false perl: externals: - spec: perl@5.26.0 modules: - perl/5.26.0 buildable: false - compilers: - - compiler: - spec: gcc@8.4.0 - paths: - cc: /share/apps/gcc/8.4.0/bin/gcc - cxx: /share/apps/gcc/8.4.0/bin/g++ - f77: /share/apps/gcc/8.4.0/bin/gfortran - fc: /share/apps/gcc/8.4.0/bin/gfortran - operating_system: centos7 - target: x86_64 - modules: [gcc/8.4.0] + diff --git a/buildsystem/spack/spack b/buildsystem/spack/spack index 7e466f7d..f120cada 
160000 --- a/buildsystem/spack/spack +++ b/buildsystem/spack/spack @@ -1 +1 @@ -Subproject commit 7e466f7d22839f034b1e542daf5d2b6ef8c568c4 +Subproject commit f120cada59dbc5115d94c2fce3cbffc946b72bb0 diff --git a/cmake/ReSolveConfig.cmake.in b/cmake/ReSolveConfig.cmake.in index 47f9fe35..fd73d0c8 100644 --- a/cmake/ReSolveConfig.cmake.in +++ b/cmake/ReSolveConfig.cmake.in @@ -4,6 +4,10 @@ include("${CMAKE_CURRENT_LIST_DIR}/ReSolveTargets.cmake") +if(NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD @CMAKE_CXX_STANDARD@) +endif() + include(CheckLanguage) # This must come before enable_language(CUDA) if(@RESOLVE_USE_CUDA@) @@ -15,14 +19,19 @@ if(@RESOLVE_USE_CUDA@) add_library(ReSolve::CUDA ALIAS ReSolve::resolve_backend_cuda) endif() if(@RESOLVE_USE_HIP@) + # TODO - This is a bit heavy-handed, but otherwise you get gcc which is not ideal + # - if(NOT CMAKE_C_COMPILER) wasn't working at top of file... + set(CMAKE_C_COMPILER @CMAKE_C_COMPILER@) + set(CMAKE_CXX_COMPILER @CMAKE_CXX_COMPILER@) enable_language(HIP) check_language(HIP) find_package(hip REQUIRED) - find_package(hipblas REQUIRED) + find_package(rocblas REQUIRED) + find_package(rocsparse REQUIRED) + find_package(rocsolver REQUIRED) # This is just an agly hack to make HIP build work get_target_property(hip_includes hip::device INTERFACE_INCLUDE_DIRECTORIES) - message(STATUS "HIP include directories: ${hip_includes}") - include_directories(${hip_includes}) + target_include_directories(ReSolve::resolve_hip INTERFACE $) add_library(ReSolve::HIP ALIAS ReSolve::resolve_backend_hip) endif() diff --git a/cmake/ReSolveFindHipLibraries.cmake b/cmake/ReSolveFindHipLibraries.cmake index d0d22395..b23d8021 100644 --- a/cmake/ReSolveFindHipLibraries.cmake +++ b/cmake/ReSolveFindHipLibraries.cmake @@ -6,13 +6,20 @@ add_library(resolve_hip INTERFACE) find_package(hip REQUIRED) find_package(rocblas REQUIRED) find_package(rocsparse REQUIRED) +find_package(rocsolver REQUIRED) -target_link_libraries(resolve_hip INTERFACE 
+target_link_libraries(resolve_hip INTERFACE hip::host hip::device roc::rocblas roc::rocsparse - rocsolver + roc::rocsolver ) +get_target_property(hip_includes hip::device INTERFACE_INCLUDE_DIRECTORIES) + +target_include_directories(resolve_hip INTERFACE + $) + install(TARGETS resolve_hip EXPORT ReSolveTargets) + diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 9113ce17..faa53807 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -71,8 +71,11 @@ set(CONSUMER_PATH ${CMAKE_INSTALL_PREFIX}/share/examples) install(PROGRAMS test.sh DESTINATION ${CONSUMER_PATH}) # Select consumer app +# TODO - have an outer loop that adds a unique consumer test for each backend supproted if(RESOLVE_USE_CUDA) set(RESOLVE_CONSUMER_APP "testKLU_Rf_FGMRES.cpp") +elseif(RESOLVE_USE_HIP) + set(RESOLVE_CONSUMER_APP "testKLU_RocSolver.cpp") else() set(RESOLVE_CONSUMER_APP "testKLU.cpp") endif() diff --git a/resolve/CMakeLists.txt b/resolve/CMakeLists.txt index 47ce70de..b98c8234 100644 --- a/resolve/CMakeLists.txt +++ b/resolve/CMakeLists.txt @@ -36,6 +36,7 @@ set(ReSolve_HEADER_INSTALL LinSolver.hpp LinSolverDirectCuSolverGLU.hpp LinSolverDirectCuSolverRf.hpp + LinSolverDirectRocSolverRf.hpp LinSolverDirectKLU.hpp LinSolverIterativeFGMRES.hpp RefactorizationSolver.hpp diff --git a/resolve/LinSolverDirectRocSolverRf.hpp b/resolve/LinSolverDirectRocSolverRf.hpp index bb623fb2..97c95526 100644 --- a/resolve/LinSolverDirectRocSolverRf.hpp +++ b/resolve/LinSolverDirectRocSolverRf.hpp @@ -8,7 +8,6 @@ #include #include #include -#include namespace ReSolve { diff --git a/resolve/cpu/CMakeLists.txt b/resolve/cpu/CMakeLists.txt index 7105655c..16455315 100644 --- a/resolve/cpu/CMakeLists.txt +++ b/resolve/cpu/CMakeLists.txt @@ -19,10 +19,5 @@ set(ReSolve_CPU_HEADER_INSTALL add_library(resolve_backend_cpu SHARED ${ReSolve_CPU_SRC}) target_link_libraries(resolve_backend_cpu PRIVATE resolve_logger) -target_include_directories(resolve_backend_cpu INTERFACE - $ - $ -) - 
# install include headers install(FILES ${ReSolve_CPU_HEADER_INSTALL} DESTINATION include/resolve/cpu) diff --git a/resolve/cuda/CMakeLists.txt b/resolve/cuda/CMakeLists.txt index f97267bc..225ea3c6 100644 --- a/resolve/cuda/CMakeLists.txt +++ b/resolve/cuda/CMakeLists.txt @@ -27,10 +27,7 @@ set_source_files_properties(${ReSolve_CUDA_SRC} PROPERTIES LANGUAGE CUDA) add_library(resolve_backend_cuda SHARED ${ReSolve_CUDA_SRC}) target_link_libraries(resolve_backend_cuda PRIVATE resolve_logger) target_link_libraries(resolve_backend_cuda PUBLIC resolve_cuda) -target_include_directories(resolve_backend_cuda INTERFACE - $ - $ -) # install include headers install(FILES ${ReSolve_CUDA_HEADER_INSTALL} DESTINATION include/resolve/cuda) + diff --git a/resolve/hip/CMakeLists.txt b/resolve/hip/CMakeLists.txt index f8d7a457..fb71a3bd 100644 --- a/resolve/hip/CMakeLists.txt +++ b/resolve/hip/CMakeLists.txt @@ -13,10 +13,10 @@ set(ReSolve_HIP_SRC ) set(ReSolve_HIP_HEADER_INSTALL - # hipKernels.h + hipKernels.h hipVectorKernels.h HipMemory.hpp - # hip_check_errors.hpp + hip_check_errors.hpp ) set_source_files_properties(${ReSolve_HIP_SRC} PROPERTIES LANGUAGE HIP) @@ -27,11 +27,7 @@ set_source_files_properties(${ReSolve_HIP_SRC} PROPERTIES LANGUAGE HIP) add_library(resolve_backend_hip SHARED ${ReSolve_HIP_SRC}) target_link_libraries(resolve_backend_hip PRIVATE resolve_logger) target_link_libraries(resolve_backend_hip PUBLIC resolve_hip) -#target_include_directories(resolve_backend_hip PUBLIC ${hip_includes}) -target_include_directories(resolve_backend_hip INTERFACE - $ - $ -) # install include headers install(FILES ${ReSolve_HIP_HEADER_INSTALL} DESTINATION include/resolve/hip) + diff --git a/resolve/utilities/logger/CMakeLists.txt b/resolve/utilities/logger/CMakeLists.txt index 91b29dfc..29800942 100644 --- a/resolve/utilities/logger/CMakeLists.txt +++ b/resolve/utilities/logger/CMakeLists.txt @@ -17,8 +17,9 @@ set(Logger_HEADER_INSTALL # Build shared library ReSolve 
add_library(resolve_logger SHARED ${Logger_SRC}) -target_include_directories(resolve_logger INTERFACE - $ +target_include_directories(resolve_logger PUBLIC + $ + $ $ ) diff --git a/resolve/workspace/CMakeLists.txt b/resolve/workspace/CMakeLists.txt index a34c2191..a44f74f8 100644 --- a/resolve/workspace/CMakeLists.txt +++ b/resolve/workspace/CMakeLists.txt @@ -47,9 +47,10 @@ if(RESOLVE_USE_HIP) target_link_libraries(resolve_workspace PUBLIC resolve_backend_hip) endif(RESOLVE_USE_HIP) -target_include_directories(resolve_workspace INTERFACE - $ - $ +target_include_directories(resolve_workspace PUBLIC + $ + $ + $ ) # install include headers From 68cb4f7dc481dbd863cfac04f6be5f0863f7b573 Mon Sep 17 00:00:00 2001 From: Cameron Rutherford Date: Fri, 3 Nov 2023 12:31:14 -0400 Subject: [PATCH 12/12] Fix incline variables after merge --- .../spack/incline/modules/dependencies.sh | 29 ++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/buildsystem/spack/incline/modules/dependencies.sh b/buildsystem/spack/incline/modules/dependencies.sh index c1c55496..75cf6209 100644 --- a/buildsystem/spack/incline/modules/dependencies.sh +++ b/buildsystem/spack/incline/modules/dependencies.sh @@ -199,6 +199,33 @@ module load curl/8.4.0-gcc-8.4.0-g2rrs23 module load ncurses/6.4-gcc-8.4.0-jt7rpqq # cmake@=3.27.7%gcc@=8.4.0~doc+ncurses+ownlibs build_system=generic build_type=Release arch=linux-centos7-zen module load cmake/3.27.7-gcc-8.4.0-tu2rruq +# gmake@=4.4.1%gcc@=8.4.0~guile build_system=autotools arch=linux-centos7-zen +module load gmake/4.4.1-gcc-8.4.0-f23wik2 +# metis@=5.1.0%gcc@=8.4.0~gdb~int64~ipo~real64+shared build_system=cmake build_type=Release generator=make patches=4991da9,93a7903,b1225da arch=linux-centos7-zen +module load metis/5.1.0-gcc-8.4.0-gsllf6a +# autoconf-archive@=2023.02.20%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen +module load autoconf-archive/2023.02.20-gcc-8.4.0-ox4hxoe +# bzip2@=1.0.8%gcc@=8.4.0~debug~pic+shared 
build_system=generic arch=linux-centos7-zen +module load bzip2/1.0.8-gcc-8.4.0-3uzyl47 +# xz@=5.4.1%gcc@=8.4.0~pic build_system=autotools libs=shared,static arch=linux-centos7-zen +module load xz/5.4.1-gcc-8.4.0-dwmuagy +# libxml2@=2.10.3%gcc@=8.4.0+pic~python+shared build_system=autotools arch=linux-centos7-zen +module load libxml2/2.10.3-gcc-8.4.0-2hu4ayt +# pigz@=2.7%gcc@=8.4.0 build_system=makefile arch=linux-centos7-zen +module load pigz/2.7-gcc-8.4.0-lu7bjb6 +# zstd@=1.5.5%gcc@=8.4.0+programs build_system=makefile compression=none libs=shared,static arch=linux-centos7-zen +module load zstd/1.5.5-gcc-8.4.0-z7jmyvw +# tar@=1.34%gcc@=8.4.0 build_system=autotools zip=pigz arch=linux-centos7-zen +module load tar/1.34-gcc-8.4.0-wcgempy +# gettext@=0.22.3%gcc@=8.4.0+bzip2+curses+git~libunistring+libxml2+pic+shared+tar+xz build_system=autotools arch=linux-centos7-zen +module load gettext/0.22.3-gcc-8.4.0-f7dl6un +# texinfo@=7.0.3%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen +module load texinfo/7.0.3-gcc-8.4.0-jma4obj +# mpfr@=4.2.0%gcc@=8.4.0 build_system=autotools libs=shared,static arch=linux-centos7-zen +module load mpfr/4.2.0-gcc-8.4.0-cjhi2el +# openblas@=0.3.24%gcc@=8.4.0~bignuma~consistent_fpcsr+fortran~ilp64+locking+pic+shared build_system=makefile symbol_suffix=none threads=none arch=linux-centos7-zen +module load openblas/0.3.24-gcc-8.4.0-4ei4hpg +# suite-sparse@=5.13.0%gcc@=8.4.0~cuda~graphblas~openmp+pic build_system=generic arch=linux-centos7-zen +module load suite-sparse/5.13.0-gcc-8.4.0-ivey23b # resolve@=develop%clang@=16.0.0-rocm5.6.0 cxxflags="--gcc-toolchain=/share/apps/gcc/8.4.0" ~cuda~ipo+klu+rocm amdgpu_target=gfx908 build_system=cmake build_type=Release dev_path=/people/ruth521/projects/resolve generator=make arch=linux-centos7-zen ## module load resolve/develop-clang-16.0.0-rocm5.6.0-6kaaut4 -