diff --git a/cmake/ReSolveFindHipLibraries.cmake b/cmake/ReSolveFindHipLibraries.cmake index 83b7c220..e754da0d 100644 --- a/cmake/ReSolveFindHipLibraries.cmake +++ b/cmake/ReSolveFindHipLibraries.cmake @@ -9,6 +9,8 @@ find_package(hipblas REQUIRED) target_link_libraries(resolve_hip INTERFACE #hip::host hip::device + rocblas + rocsparse #roc::hipblas ) diff --git a/resolve/hip/CMakeLists.txt b/resolve/hip/CMakeLists.txt index f0a93b04..f8d7a457 100644 --- a/resolve/hip/CMakeLists.txt +++ b/resolve/hip/CMakeLists.txt @@ -7,14 +7,14 @@ ]] set(ReSolve_HIP_SRC - # hipKernels.cu + hipKernels.hip hipVectorKernels.hip MemoryUtils.hip ) set(ReSolve_HIP_HEADER_INSTALL # hipKernels.h - # hipVectorKernels.h + hipVectorKernels.h HipMemory.hpp # hip_check_errors.hpp ) diff --git a/resolve/hip/hipKernels.h b/resolve/hip/hipKernels.h new file mode 100644 index 00000000..9c48783a --- /dev/null +++ b/resolve/hip/hipKernels.h @@ -0,0 +1,14 @@ +void mass_inner_product_two_vectors(int n, + int i, + double* vec1, + double* vec2, + double* mvec, + double* result); +void mass_axpy(int n, int i, double* x, double* y, double* alpha); + +//needed for matrix inf nrm +void matrix_row_sums(int n, + int nnz, + int* a_ia, + double* a_val, + double* result); diff --git a/resolve/hip/hipKernels.hip b/resolve/hip/hipKernels.hip new file mode 100644 index 00000000..13f53d85 --- /dev/null +++ b/resolve/hip/hipKernels.hip @@ -0,0 +1,167 @@ +#include "hipKernels.h" +#define maxk 1024 +#define Tv5 1024 + +#include + +//computes V^T[u1 u2] where v is n x k and u1 and u2 are nx1 +__global__ void MassIPTwoVec_kernel(const double* __restrict__ u1, + const double* __restrict__ u2, + const double* __restrict__ v, + double* result, + const int k, + const int N) +{ + int t = threadIdx.x; + int bsize = blockDim.x; + + // assume T threads per thread block (and k reductions to be performed) + volatile __shared__ double s_tmp1[Tv5]; + + volatile __shared__ double s_tmp2[Tv5]; + // map between thread index space and the problem index space + int j = blockIdx.x; + s_tmp1[t] = 0.0f; + s_tmp2[t] = 0.0f; + int nn = t; + double can1, can2, cbn; + + while(nn < N) { + can1 = u1[nn]; + can2 = u2[nn]; + + cbn = v[N * j + nn]; + s_tmp1[t] += can1 * cbn; + s_tmp2[t] += can2 * cbn; + + nn += bsize; + } + + __syncthreads(); + + if(Tv5 >= 1024) { + if(t < 512) { + s_tmp1[t] += s_tmp1[t + 512]; + s_tmp2[t] += s_tmp2[t + 512]; + } + __syncthreads(); + } + if(Tv5 >= 512) { + if(t < 256) { + s_tmp1[t] += s_tmp1[t + 256]; + s_tmp2[t] += s_tmp2[t + 256]; + } + __syncthreads(); + } + { + if(t < 128) { + s_tmp1[t] += s_tmp1[t + 128]; + s_tmp2[t] += s_tmp2[t + 128]; + } + __syncthreads(); + } + { + if(t < 64) { + s_tmp1[t] += s_tmp1[t + 64]; + s_tmp2[t] += s_tmp2[t + 64]; + } + __syncthreads(); + } + + if(t < 32) { + s_tmp1[t] += s_tmp1[t + 32]; + s_tmp2[t] += s_tmp2[t + 32]; + + s_tmp1[t] += s_tmp1[t + 16]; + s_tmp2[t] += s_tmp2[t + 16]; + + s_tmp1[t] += s_tmp1[t + 8]; + s_tmp2[t] += s_tmp2[t + 8]; + + s_tmp1[t] += s_tmp1[t + 4]; + s_tmp2[t] += s_tmp2[t + 4]; + + s_tmp1[t] += s_tmp1[t + 2]; + s_tmp2[t] += s_tmp2[t + 2]; + + s_tmp1[t] += s_tmp1[t + 1]; + s_tmp2[t] += s_tmp2[t + 1]; + } + if(t == 0) { + result[blockIdx.x] = s_tmp1[0]; + result[blockIdx.x + k] = s_tmp2[0]; + } +} + + +//mass AXPY i.e y = y - x*alpha where alpha is [k x 1], needed in 1 and 2 synch GMRES + +__global__ void massAxpy3_kernel(int N, + int k, + const double* x_data, + double* y_data, + const double* alpha) { + + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + + unsigned int t = threadIdx.x; + + __shared__ double s_alpha[maxk]; + if(t < k) { + s_alpha[t] = alpha[t]; + } + __syncthreads(); + while (i < N){ + double temp = 0.0; + for(int j = 0; j < k; ++j) { + temp += x_data[j * N + i] * s_alpha[j]; + } + y_data[i] -= temp; + i += (blockDim.x*gridDim.x); + } +} +__global__ void matrixInfNormPart1(const int n, + const int nnz, + const int* a_ia, + const double* a_val, + double* result) { + + // one thread per row, pass through rows + // and sum + // can be done through atomics + //\sum_{j=1}^m abs(a_{ij}) + + int idx = blockIdx.x*blockDim.x + threadIdx.x; + while (idx < n){ + double sum = 0.0f; + for (int i = a_ia[idx]; i < a_ia[idx+1]; ++i) { + sum = sum + fabs(a_val[i]); + } + result[idx] = sum; + idx += (blockDim.x*gridDim.x); + } +} + + +void mass_inner_product_two_vectors(int n, + int i, + double* vec1, + double* vec2, + double* mvec, + double* result) +{ + hipLaunchKernelGGL(MassIPTwoVec_kernel, dim3(i + 1), dim3(1024), 0, 0, vec1, vec2, mvec, result, i + 1, n); +} +void mass_axpy(int n, int i, double* x, double* y, double* alpha) +{ + hipLaunchKernelGGL(massAxpy3_kernel, dim3((n + 384 - 1) / 384), dim3(384), 0, 0, n, i, x, y, alpha); +} + +void matrix_row_sums(int n, + int nnz, + int* a_ia, + double* a_val, + double* result) +{ + hipLaunchKernelGGL(matrixInfNormPart1,dim3(1000),dim3(1024), 0, 0, n, nnz, a_ia, a_val, result); +} diff --git a/resolve/hip/hipVectorKernels.hip b/resolve/hip/hipVectorKernels.hip index 3df2b84b..f68cd0b9 100644 --- a/resolve/hip/hipVectorKernels.hip +++ b/resolve/hip/hipVectorKernels.hip @@ -1,29 +1,28 @@ #include #include - -#include "hipVectorKernels.h" +#include namespace ReSolve { namespace vector { namespace kernels { -__global__ void set_const(index_type n, real_type val, real_type* arr) -{ - index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if(i < n) + __global__ void set_const(index_type n, real_type val, real_type* arr) { - arr[i] = val; + index_type i = blockIdx.x * blockDim.x + threadIdx.x; + while (i < n) + { + arr[i] = val; + i += blockDim.x * gridDim.x; + } } -} - } // namespace kernels -void set_array_const(index_type n, real_type val, real_type* arr) +void set_array_const(index_type n, real_type val, real_type* arr) { - index_type num_blocks; - index_type block_size = 512; - num_blocks = (n + block_size - 1) / block_size; - kernels::set_const<<>>(n, val, arr); + index_type num_blocks; + index_type block_size = 512; + num_blocks = (n + block_size - 1) / block_size; + hipLaunchKernelGGL( kernels::set_const, dim3(num_blocks), dim3(block_size), 0, 0, n, val, arr); } -}} // namespace ReSolve::vector \ No newline at end of file +}} // namespace ReSolve::vector diff --git a/resolve/matrix/CMakeLists.txt b/resolve/matrix/CMakeLists.txt index 554c0ba7..565fa7c9 100644 --- a/resolve/matrix/CMakeLists.txt +++ b/resolve/matrix/CMakeLists.txt @@ -22,6 +22,11 @@ set(Matrix_CUDASDK_SRC MatrixHandlerCuda.cpp ) +# and on HIP +set(Matrix_ROCM_SRC + MatrixHandlerHip.cpp +) + # Header files to be installed set(Matrix_HEADER_INSTALL io.hpp @@ -37,6 +42,10 @@ if(RESOLVE_USE_CUDA) set(Matrix_SRC ${Matrix_SRC} ${Matrix_CUDASDK_SRC}) endif() +if(RESOLVE_USE_HIP) + set(Matrix_SRC ${Matrix_SRC} ${Matrix_ROCM_SRC}) +endif() + # Build shared library ReSolve::matrix add_library(resolve_matrix SHARED ${Matrix_SRC}) @@ -47,6 +56,10 @@ if (RESOLVE_USE_CUDA) target_link_libraries(resolve_matrix PUBLIC resolve_backend_cuda) endif() +if (RESOLVE_USE_HIP) + target_link_libraries(resolve_matrix PUBLIC resolve_backend_hip) +endif() + # Link to dummy device backend if GPU support is not enabled if (NOT RESOLVE_USE_GPU) target_link_libraries(resolve_matrix PUBLIC resolve_backend_cpu) diff --git a/resolve/matrix/Coo.cpp b/resolve/matrix/Coo.cpp index c8caebf6..a91f94a9 100644 --- a/resolve/matrix/Coo.cpp +++ b/resolve/matrix/Coo.cpp @@ -33,8 +33,8 @@ namespace ReSolve copyData("cpu"); return this->h_row_data_; } else { - if (memspace == "cuda") { - copyData("cuda"); + if ((memspace == "cuda") || (memspace == "hip")) { + copyData(memspace); return this->d_row_data_; } else { return nullptr; @@ -48,8 +48,8 @@ namespace ReSolve copyData("cpu"); return this->h_col_data_; } else { - if (memspace == "cuda") { - copyData("cuda"); + if ((memspace == "cuda") || (memspace == "hip")) { + copyData(memspace); return this->d_col_data_; } else { return nullptr; @@ -63,8 +63,8 @@ namespace ReSolve copyData("cpu"); return this->h_val_data_; } else { - if (memspace == "cuda") { - copyData("cuda"); + if ((memspace == "cuda") || (memspace == "hip")) { + copyData(memspace); return this->d_val_data_; } else { return nullptr; @@ -81,9 +81,9 @@ namespace ReSolve setNotUpdated(); int control=-1; if ((memspaceIn == "cpu") && (memspaceOut == "cpu")){ control = 0;} - if ((memspaceIn == "cpu") && (memspaceOut == "cuda")){ control = 1;} - if ((memspaceIn == "cuda") && (memspaceOut == "cpu")){ control = 2;} - if ((memspaceIn == "cuda") && (memspaceOut == "cuda")){ control = 3;} + if ((memspaceIn == "cpu") && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 1;} + if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && (memspaceOut == "cpu")){ control = 2;} + if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 3;} if (memspaceOut == "cpu") { //check if cpu data allocated @@ -98,7 +98,7 @@ namespace ReSolve } } - if (memspaceOut == "cuda") { + if ((memspaceOut == "cuda") || (memspaceOut == "hip")) { //check if cuda data allocated if (d_row_data_ == nullptr) { mem_.allocateArrayOnDevice(&d_row_data_, nnz_current); @@ -120,7 +120,7 @@ namespace ReSolve owns_cpu_data_ = true; owns_cpu_vals_ = true; break; - case 2://cuda->cpu + case 2://gpu->cpu mem_.copyArrayDeviceToHost(h_row_data_, row_data, nnz_current); mem_.copyArrayDeviceToHost(h_col_data_, col_data, nnz_current); mem_.copyArrayDeviceToHost(h_val_data_, val_data, nnz_current); @@ -128,7 +128,7 @@ namespace ReSolve owns_cpu_data_ = true; owns_cpu_vals_ = true; break; - case 1://cpu->cuda + case 1://cpu->gpu mem_.copyArrayHostToDevice(d_row_data_, row_data, nnz_current); mem_.copyArrayHostToDevice(d_col_data_, col_data, nnz_current); mem_.copyArrayHostToDevice(d_val_data_, val_data, nnz_current); @@ -136,7 +136,7 @@ namespace ReSolve owns_gpu_data_ = true; owns_gpu_vals_ = true; break; - case 3://cuda->cuda + case 3://gpu->gpua mem_.copyArrayDeviceToDevice(d_row_data_, row_data, nnz_current); mem_.copyArrayDeviceToDevice(d_col_data_, col_data, nnz_current); mem_.copyArrayDeviceToDevice(d_val_data_, val_data, nnz_current); @@ -176,7 +176,7 @@ namespace ReSolve return 0; } - if (memspace == "cuda") { + if ((memspace == "cuda") || (memspace == "hip")) { mem_.allocateArrayOnDevice(&d_row_data_, nnz_current); mem_.allocateArrayOnDevice(&d_col_data_, nnz_current); mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); @@ -215,7 +215,7 @@ namespace ReSolve return 0; } - if (memspaceOut == "cuda") { + if ((memspaceOut == "cuda") || (memspaceOut == "hip")) { if ((d_data_updated_ == false) && (h_data_updated_ == true)) { if (d_row_data_ == nullptr) { mem_.allocateArrayOnDevice(&d_row_data_, nnz_current); diff --git a/resolve/matrix/Csc.cpp b/resolve/matrix/Csc.cpp index 1a305e03..e2ea765f 100644 --- a/resolve/matrix/Csc.cpp +++ b/resolve/matrix/Csc.cpp @@ -30,8 +30,8 @@ namespace ReSolve copyData("cpu"); return this->h_row_data_; } else { - if (memspace == "cuda") { - copyData("cuda"); + if ((memspace == "cuda") || (memspace == "hip")) { + copyData(memspace); return this->d_row_data_; } else { return nullptr; @@ -45,8 +45,8 @@ namespace ReSolve copyData("cpu"); return this->h_col_data_; } else { - if (memspace == "cuda") { - copyData("cuda"); + if ((memspace == "cuda") || (memspace == "hip")) { + copyData(memspace); return this->d_col_data_; } else { return nullptr; @@ -60,8 +60,8 @@ namespace ReSolve copyData("cpu"); return this->h_val_data_; } else { - if (memspace == "cuda") { - copyData("cuda"); + if ((memspace == "cuda") || (memspace == "hip")) { + copyData(memspace); return this->d_val_data_; } else { return nullptr; @@ -77,9 +77,9 @@ namespace ReSolve int control=-1; setNotUpdated(); if ((memspaceIn == "cpu") && (memspaceOut == "cpu")){ control = 0;} - if ((memspaceIn == "cpu") && (memspaceOut == "cuda")){ control = 1;} - if ((memspaceIn == "cuda") && (memspaceOut == "cpu")){ control = 2;} - if ((memspaceIn == "cuda") && (memspaceOut == "cuda")){ control = 3;} + if ((memspaceIn == "cpu") && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 1;} + if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && (memspaceOut == "cpu")){ control = 2;} + if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 3;} if (memspaceOut == "cpu") { //check if cpu data allocated @@ -94,7 +94,7 @@ namespace ReSolve } } - if (memspaceOut == "cuda") { + if ((memspaceOut == "cuda") || (memspaceOut == "hip")) { //check if cuda data allocated if (d_col_data_ == nullptr) { mem_.allocateArrayOnDevice(&d_col_data_, n_ + 1); @@ -116,7 +116,7 @@ namespace ReSolve owns_cpu_data_ = true; owns_cpu_vals_ = true; break; - case 2://cuda->cpu + case 2://gpu->cpu mem_.copyArrayDeviceToHost(h_col_data_, col_data, n_ + 1); mem_.copyArrayDeviceToHost(h_row_data_, row_data, nnz_current); mem_.copyArrayDeviceToHost(h_val_data_, val_data, nnz_current); @@ -124,7 +124,7 @@ namespace ReSolve owns_cpu_data_ = true; owns_cpu_vals_ = true; break; - case 1://cpu->cuda + case 1://cpu->gpu mem_.copyArrayHostToDevice(d_col_data_, col_data, n_ + 1); mem_.copyArrayHostToDevice(d_row_data_, row_data, nnz_current); mem_.copyArrayHostToDevice(d_val_data_, val_data, nnz_current); @@ -132,7 +132,7 @@ namespace ReSolve owns_gpu_data_ = true; owns_gpu_vals_ = true; break; - case 3://cuda->cuda + case 3://gpu->gpu mem_.copyArrayDeviceToDevice(d_col_data_, col_data, n_ + 1); mem_.copyArrayDeviceToDevice(d_row_data_, row_data, nnz_current); mem_.copyArrayDeviceToDevice(d_val_data_, val_data, nnz_current); @@ -173,7 +173,7 @@ namespace ReSolve return 0; } - if (memspace == "cuda") { + if ((memspace == "cuda") || (memspace == "hip")) { mem_.allocateArrayOnDevice(&d_col_data_, n_ + 1); mem_.allocateArrayOnDevice(&d_row_data_, nnz_current); mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); @@ -212,7 +212,7 @@ namespace ReSolve return 0; } - if (memspaceOut == "cuda") { + if ((memspaceOut == "cuda") || (memspaceOut == "hip")) { if ((d_data_updated_ == false) && (h_data_updated_ == true)) { if (d_col_data_ == nullptr) { mem_.allocateArrayOnDevice(&d_col_data_, n_ + 1); diff --git a/resolve/matrix/Csr.cpp b/resolve/matrix/Csr.cpp index f1ddd31f..dff33b48 100644 --- a/resolve/matrix/Csr.cpp +++ b/resolve/matrix/Csr.cpp @@ -30,8 +30,8 @@ namespace ReSolve copyData("cpu"); return this->h_row_data_; } else { - if (memspace == "cuda") { - copyData("cuda"); + if ((memspace == "cuda") || (memspace == "hip")) { + copyData(memspace); return this->d_row_data_; } else { return nullptr; @@ -45,8 +45,8 @@ namespace ReSolve copyData("cpu"); return this->h_col_data_; } else { - if (memspace == "cuda") { - copyData("cuda"); + if ((memspace == "cuda") || (memspace == "hip")) { + copyData(memspace); return this->d_col_data_; } else { return nullptr; @@ -60,8 +60,8 @@ namespace ReSolve copyData("cpu"); return this->h_val_data_; } else { - if (memspace == "cuda") { - copyData("cuda"); + if ((memspace == "cuda") || (memspace == "hip")) { + copyData(memspace); return this->d_val_data_; } else { return nullptr; @@ -77,9 +77,9 @@ namespace ReSolve setNotUpdated(); int control = -1; if ((memspaceIn == "cpu") && (memspaceOut == "cpu")){ control = 0;} - if ((memspaceIn == "cpu") && (memspaceOut == "cuda")){ control = 1;} - if ((memspaceIn == "cuda") && (memspaceOut == "cpu")){ control = 2;} - if ((memspaceIn == "cuda") && (memspaceOut == "cuda")){ control = 3;} + if ((memspaceIn == "cpu") && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 1;} + if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && (memspaceOut == "cpu")){ control = 2;} + if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 3;} if (memspaceOut == "cpu") { //check if cpu data allocated @@ -94,7 +94,7 @@ namespace ReSolve } } - if (memspaceOut == "cuda") { + if ((memspaceOut == "cuda") || (memspaceOut == "hip")) { //check if cuda data allocated if (d_row_data_ == nullptr) { mem_.allocateArrayOnDevice(&d_row_data_, n_ + 1); @@ -118,7 +118,7 @@ namespace ReSolve owns_cpu_data_ = true; owns_cpu_vals_ = true; break; - case 2://cuda->cpu + case 2://gpu->cpu mem_.copyArrayDeviceToHost(h_row_data_, row_data, n_ + 1); mem_.copyArrayDeviceToHost(h_col_data_, col_data, nnz_current); mem_.copyArrayDeviceToHost(h_val_data_, val_data, nnz_current); @@ -126,7 +126,7 @@ namespace ReSolve owns_cpu_data_ = true; owns_cpu_vals_ = true; break; - case 1://cpu->cuda + case 1://cpu->gpu mem_.copyArrayHostToDevice(d_row_data_, row_data, n_ + 1); mem_.copyArrayHostToDevice(d_col_data_, col_data, nnz_current); mem_.copyArrayHostToDevice(d_val_data_, val_data, nnz_current); @@ -134,7 +134,7 @@ namespace ReSolve owns_gpu_data_ = true; owns_gpu_vals_ = true; break; - case 3://cuda->cuda + case 3://gpu->gpu mem_.copyArrayDeviceToDevice(d_row_data_, row_data, n_ + 1); mem_.copyArrayDeviceToDevice(d_col_data_, col_data, nnz_current); mem_.copyArrayDeviceToDevice(d_val_data_, val_data, nnz_current); @@ -174,7 +174,7 @@ namespace ReSolve return 0; } - if (memspace == "cuda") { + if ((memspace == "cuda") || (memspace == "hip")) { mem_.allocateArrayOnDevice(&d_row_data_, n_ + 1); mem_.allocateArrayOnDevice(&d_col_data_, nnz_current); mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); @@ -212,7 +212,7 @@ namespace ReSolve return 0; } - if (memspaceOut == "cuda") { + if ((memspaceOut == "cuda") || (memspaceOut == "hip")) { if ((d_data_updated_ == false) && (h_data_updated_ == true)) { if (d_row_data_ == nullptr) { mem_.allocateArrayOnDevice(&d_row_data_, n_ + 1); diff --git a/resolve/matrix/MatrixHandler.cpp b/resolve/matrix/MatrixHandler.cpp index 8bf4302c..133a09f9 100644 --- a/resolve/matrix/MatrixHandler.cpp +++ b/resolve/matrix/MatrixHandler.cpp @@ -13,6 +13,9 @@ #ifdef RESOLVE_USE_CUDA #include "MatrixHandlerCuda.hpp" #endif +#ifdef RESOLVE_USE_HIP +#include "MatrixHandlerHip.hpp" +#endif namespace ReSolve { // Create a shortcut name for Logger static class @@ -41,6 +44,7 @@ namespace ReSolve { { if (isCpuEnabled_) delete cpuImpl_; if (isCudaEnabled_) delete cudaImpl_; + if (isHipEnabled_) delete hipImpl_; } /** @@ -74,12 +78,31 @@ namespace ReSolve { } #endif +#ifdef RESOLVE_USE_HIP + /** + * @brief Constructor taking pointer to the CUDA workspace as its parameter. + * + * @post A CPU implementation instance is created because it is cheap and + * it does not require a workspace. + * + * @post A HIP implementation instance is created with supplied workspace. + */ + MatrixHandler::MatrixHandler(LinAlgWorkspaceHIP* new_workspace) + { + cpuImpl_ = new MatrixHandlerCpu(); + hipImpl_ = new MatrixHandlerHip(new_workspace); + isCpuEnabled_ = true; + isHipEnabled_ = true; + } +#endif void MatrixHandler::setValuesChanged(bool isValuesChanged, std::string memspace) { if (memspace == "cpu") { cpuImpl_->setValuesChanged(isValuesChanged); } else if (memspace == "cuda") { cudaImpl_->setValuesChanged(isValuesChanged); + } else if (memspace == "hip") { + hipImpl_->setValuesChanged(isValuesChanged); } else { out::error() << "Unsupported device " << memspace << "\n"; } @@ -230,6 +253,8 @@ namespace ReSolve { } else { if (memspace == "cuda"){ A_csr->updateData(csr_ia, csr_ja, csr_a, "cpu", "cuda"); + } else if (memspace == "hip"){ + A_csr->updateData(csr_ia, csr_ja, csr_a, "cpu", "cuda"); } else { //display error } @@ -269,6 +294,9 @@ namespace ReSolve { return cudaImpl_->matvec(A, vec_x, vec_result, alpha, beta, matrixFormat); } else if (memspace == "cpu") { return cpuImpl_->matvec(A, vec_x, vec_result, alpha, beta, matrixFormat); + } else if (memspace == "hip") { + printf("about to run mv"); + return hipImpl_->matvec(A, vec_x, vec_result, alpha, beta, matrixFormat); } else { out::error() << "Support for device " << memspace << " not implemented (yet)" << std::endl; return 1; @@ -280,6 +308,8 @@ namespace ReSolve { { if (memspace == "cuda") { return cudaImpl_->csc2csr(A_csc, A_csr); + } else if (memspace == "hip") { + return hipImpl_->csc2csr(A_csc, A_csr); } else if (memspace == "cpu") { out::warning() << "Using untested csc2csr on CPU ..." << std::endl; return cpuImpl_->csc2csr(A_csc, A_csr); diff --git a/resolve/matrix/MatrixHandler.hpp b/resolve/matrix/MatrixHandler.hpp index 398a8039..cec61085 100644 --- a/resolve/matrix/MatrixHandler.hpp +++ b/resolve/matrix/MatrixHandler.hpp @@ -18,6 +18,7 @@ namespace ReSolve } class LinAlgWorkspaceCpu; class LinAlgWorkspaceCUDA; + class LinAlgWorkspaceHIP; class MatrixHandlerImpl; } @@ -48,6 +49,7 @@ namespace ReSolve { MatrixHandler(); MatrixHandler(LinAlgWorkspaceCpu* workspace); MatrixHandler(LinAlgWorkspaceCUDA* workspace); + MatrixHandler(LinAlgWorkspaceHIP* workspace); ~MatrixHandler(); int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr, std::string memspace); @@ -70,9 +72,11 @@ namespace ReSolve { MemoryHandler mem_; ///< Device memory manager object MatrixHandlerImpl* cpuImpl_{nullptr}; ///< Pointer to CPU implementation MatrixHandlerImpl* cudaImpl_{nullptr}; ///< Pointer to CUDA implementation + MatrixHandlerImpl* hipImpl_{nullptr}; ///< Pointer to HIP implementation bool isCpuEnabled_{false}; ///< true if CPU implementation is instantiated bool isCudaEnabled_{false}; ///< true if CUDA implementation is instantiated + bool isHipEnabled_{false}; ///< true if HIP implementation is instantiated }; } // namespace ReSolve diff --git a/resolve/matrix/MatrixHandlerHip.cpp b/resolve/matrix/MatrixHandlerHip.cpp new file mode 100644 index 00000000..370849fa --- /dev/null +++ b/resolve/matrix/MatrixHandlerHip.cpp @@ -0,0 +1,154 @@ +#include + +#include +#include +#include +#include +#include +#include +#include "MatrixHandlerHip.hpp" + +namespace ReSolve { + // Create a shortcut name for Logger static class + using out = io::Logger; + + MatrixHandlerHip::~MatrixHandlerHip() + { + } + + MatrixHandlerHip::MatrixHandlerHip(LinAlgWorkspaceHIP* new_workspace) + { + workspace_ = new_workspace; + } + + void MatrixHandlerHip::setValuesChanged(bool values_changed) + { + values_changed_ = values_changed; + } + + + int MatrixHandlerHip::matvec(matrix::Sparse* Ageneric, + vector_type* vec_x, + vector_type* vec_result, + const real_type* alpha, + const real_type* beta, + std::string matrixFormat) + { + using namespace constants; + int error_sum = 0; + if (matrixFormat == "csr") { + matrix::Csr* A = dynamic_cast(Ageneric); + //result = alpha *A*x + beta * result + rocsparse_status status; + LinAlgWorkspaceHIP* workspaceHIP = workspace_; + + rocsparse_handle handle_rocsparse = workspaceHIP->getRocsparseHandle(); + + rocsparse_mat_info infoA = workspaceHIP->getSpmvMatrixInfo(); + rocsparse_mat_descr descrA = workspaceHIP->getSpmvMatrixDescriptor(); + + if (!workspaceHIP->matvecSetup()) { + //setup first, allocate, etc. + + rocsparse_create_mat_descr(&(descrA)); + rocsparse_set_mat_index_base(descrA, rocsparse_index_base_zero); + rocsparse_set_mat_type(descrA, rocsparse_matrix_type_general); + + rocsparse_create_mat_info(&infoA); + + status = rocsparse_dcsrmv_analysis(handle_rocsparse, + rocsparse_operation_none, + A->getNumRows(), + A->getNumColumns(), + A->getNnzExpanded(), + descrA, + A->getValues("cuda"), + A->getRowData("cuda"), + A->getColData("cuda"), // cuda is used as "device" + infoA); + error_sum += status; + mem_.deviceSynchronize(); + + workspaceHIP->matvecSetupDone(); + } + + status = rocsparse_dcsrmv(handle_rocsparse, + rocsparse_operation_none, + A->getNumRows(), + A->getNumColumns(), + A->getNnzExpanded(), + alpha, + descrA, + A->getValues("cuda"), + A->getRowData("cuda"), + A->getColData("cuda"), + infoA, + vec_x->getData("cuda"), + beta, + vec_result->getData("cuda")); + + error_sum += status; + mem_.deviceSynchronize(); + if (status) + out::error() << "Matvec status: " << status + << "Last error code: " << mem_.getLastDeviceError() << std::endl; + vec_result->setDataUpdated("cuda"); + + return error_sum; + } else { + out::error() << "MatVec not implemented (yet) for " + << matrixFormat << " matrix format." << std::endl; + return 1; + } + } + + int MatrixHandlerHip::Matrix1Norm(matrix::Sparse* /* A */, real_type* /* norm */) + { + return -1; + } + + int MatrixHandlerHip::csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr) + { + index_type error_sum = 0; + LinAlgWorkspaceHIP* workspaceHIP = (LinAlgWorkspaceHIP*) workspace_; + + rocsparse_status status; + + A_csr->allocateMatrixData("cuda"); + index_type n = A_csc->getNumRows(); + index_type m = A_csc->getNumRows(); + index_type nnz = A_csc->getNnz(); + size_t bufferSize; + void* d_work; + + status = rocsparse_csr2csc_buffer_size(workspaceHIP->getRocsparseHandle(), + n, + m, + nnz, + A_csc->getColData("cuda"), + A_csc->getRowData("cuda"), + rocsparse_action_numeric, + &bufferSize); + + error_sum += status; + mem_.allocateBufferOnDevice(&d_work, bufferSize); + + status = rocsparse_dcsr2csc(workspaceHIP->getRocsparseHandle(), + n, + m, + nnz, + A_csc->getValues("cuda"), + A_csc->getColData("cuda"), + A_csc->getRowData("cuda"), + A_csr->getValues("cuda"), + A_csr->getRowData("cuda"), + A_csr->getColData("cuda"), + rocsparse_action_numeric, + rocsparse_index_base_zero, + d_work); + error_sum += status; + return error_sum; + mem_.deleteOnDevice(d_work); + } + +} // namespace ReSolve diff --git a/resolve/matrix/MatrixHandlerHip.hpp b/resolve/matrix/MatrixHandlerHip.hpp new file mode 100644 index 00000000..7f06f3bd --- /dev/null +++ b/resolve/matrix/MatrixHandlerHip.hpp @@ -0,0 +1,60 @@ +#pragma once +#include +#include +#include + +namespace ReSolve +{ + namespace vector + { + class Vector; + } + namespace matrix + { + class Sparse; + class Coo; + class Csc; + class Csr; + } + class LinAlgWorkspaceHIP; +} + + +namespace ReSolve { + /** + * @class MatrixHandlerHip + * + * @brief HIP implementation of the matrix handler. + */ + class MatrixHandlerHip : public MatrixHandlerImpl + { + using vector_type = vector::Vector; + + public: + + MatrixHandlerHip(LinAlgWorkspaceHIP* workspace); + virtual ~MatrixHandlerHip(); + + int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr); + + virtual int matvec(matrix::Sparse* A, + vector_type* vec_x, + vector_type* vec_result, + const real_type* alpha, + const real_type* beta, + std::string matrix_type); + + virtual int Matrix1Norm(matrix::Sparse *A, real_type* norm); + + void setValuesChanged(bool isValuesChanged); + + private: + + LinAlgWorkspaceHIP* workspace_{nullptr}; + bool values_changed_{true}; ///< needed for matvec + + MemoryHandler mem_; ///< Device memory manager object + }; + +} // namespace ReSolve + diff --git a/resolve/vector/CMakeLists.txt b/resolve/vector/CMakeLists.txt index 16d53010..89b1abc8 100644 --- a/resolve/vector/CMakeLists.txt +++ b/resolve/vector/CMakeLists.txt @@ -18,6 +18,13 @@ set(Vector_CUDASDK_SRC VectorHandlerCuda.cpp ) +#and hip + +set(Vector_ROCM_SRC + VectorHandlerHip.cpp +) + + # Header files to be installed set(Vector_HEADER_INSTALL Vector.hpp @@ -30,6 +37,11 @@ if(RESOLVE_USE_CUDA) set(Vector_SRC ${Vector_SRC} ${Vector_CUDASDK_SRC}) endif() +# and hip +if(RESOLVE_USE_HIP) + set(Vector_SRC ${Vector_SRC} ${Vector_ROCM_SRC}) +endif() + add_library(resolve_vector SHARED ${Vector_SRC}) target_link_libraries(resolve_vector PRIVATE resolve_logger) @@ -38,6 +50,10 @@ if (RESOLVE_USE_CUDA) target_link_libraries(resolve_vector PUBLIC resolve_backend_cuda) endif() +if (RESOLVE_USE_HIP) + target_link_libraries(resolve_vector PUBLIC resolve_backend_hip) +endif() + # If no GPU is enabled link to dummy device backend if(NOT RESOLVE_USE_GPU) target_link_libraries(resolve_vector PUBLIC resolve_backend_cpu) diff --git a/resolve/vector/Vector.cpp b/resolve/vector/Vector.cpp index 7934e8b0..37779ea5 100644 --- a/resolve/vector/Vector.cpp +++ b/resolve/vector/Vector.cpp @@ -60,7 +60,7 @@ namespace ReSolve { namespace vector { cpu_updated_ = true; gpu_updated_ = false; } else { - if (memspace == "cuda") { + if ((memspace == "cuda") || (memspace == "hip")) { d_data_ = data; gpu_updated_ = true; cpu_updated_ = false; @@ -76,7 +76,7 @@ namespace ReSolve { namespace vector { cpu_updated_ = true; gpu_updated_ = false; } else { - if (memspace == "cuda") { + if ((memspace == "cuda") || (memspace == "hip")) { gpu_updated_ = true; cpu_updated_ = false; } else { @@ -89,15 +89,15 @@ namespace ReSolve { namespace vector { { int control=-1; if ((memspaceIn == "cpu") && (memspaceOut == "cpu")){ control = 0;} - if ((memspaceIn == "cpu") && (memspaceOut == "cuda")){ control = 1;} - if ((memspaceIn == "cuda") && (memspaceOut == "cpu")){ control = 2;} - if ((memspaceIn == "cuda") && (memspaceOut == "cuda")){ control = 3;} + if ((memspaceIn == "cpu") && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 1;} + if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && (memspaceOut == "cpu")){ control = 2;} + if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 3;} if ((memspaceOut == "cpu") && (h_data_ == nullptr)){ //allocate first h_data_ = new real_type[n_ * k_]; } - if ((memspaceOut == "cuda") && (d_data_ == nullptr)){ + if (((memspaceOut == "cuda") || (memspaceOut == "hip")) && (d_data_ == nullptr)){ //allocate first mem_.allocateArrayOnDevice(&d_data_, n_ * k_); } @@ -109,19 +109,19 @@ namespace ReSolve { namespace vector { cpu_updated_ = true; gpu_updated_ = false; break; - case 2: //cuda->cpu + case 2: //gpu->cpu mem_.copyArrayDeviceToHost(h_data_, data, n_current_ * k_); owns_gpu_data_ = true; cpu_updated_ = true; gpu_updated_ = false; break; - case 1: //cpu->cuda + case 1: //cpu->gpu mem_.copyArrayHostToDevice(d_data_, data, n_current_ * k_); owns_gpu_data_ = true; gpu_updated_ = true; cpu_updated_ = false; break; - case 3: //cuda->cuda + case 3: //gpu->gpu mem_.copyArrayDeviceToDevice(d_data_, data, n_current_ * k_); owns_gpu_data_ = true; gpu_updated_ = true; @@ -141,18 +141,18 @@ namespace ReSolve { namespace vector { real_type* Vector::getData(index_type i, std::string memspace) { if ((memspace == "cpu") && (cpu_updated_ == false) && (gpu_updated_ == true )) { - copyData("cuda", "cpu"); + copyData(memspace, "cpu"); owns_cpu_data_ = true; } - if ((memspace == "cuda") && (gpu_updated_ == false) && (cpu_updated_ == true )) { - copyData("cpu", "cuda"); + if (((memspace == "cuda") || (memspace == "hip")) && (gpu_updated_ == false) && (cpu_updated_ == true )) { + copyData("cpu", memspace); owns_gpu_data_ = true; } if (memspace == "cpu") { return &h_data_[i * n_current_]; } else { - if (memspace == "cuda"){ + if ((memspace == "cuda") || (memspace == "hip")){ return &d_data_[i * n_current_]; } else { return nullptr; @@ -164,14 +164,14 @@ namespace ReSolve { namespace vector { int Vector::copyData(std::string memspaceIn, std::string memspaceOut) { int control=-1; - if ((memspaceIn == "cpu") && (memspaceOut == "cuda")){ control = 0;} - if ((memspaceIn == "cuda") && (memspaceOut == "cpu")){ control = 1;} + if ((memspaceIn == "cpu") && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 0;} + if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && (memspaceOut == "cpu")){ control = 1;} if ((memspaceOut == "cpu") && (h_data_ == nullptr)){ //allocate first h_data_ = new real_type[n_ * k_]; } - if ((memspaceOut == "cuda") && (d_data_ == nullptr)){ + if (((memspaceOut == "cuda") || (memspaceOut == "hip")) && (d_data_ == nullptr)){ //allocate first mem_.allocateArrayOnDevice(&d_data_, n_ * k_); } @@ -200,10 +200,12 @@ namespace ReSolve { namespace vector { h_data_ = new real_type[n_ * k_]; owns_cpu_data_ = true; } else { - if (memspace == "cuda") { + if ((memspace == "cuda") || (memspace == "hip")) { mem_.deleteOnDevice(d_data_); mem_.allocateArrayOnDevice(&d_data_, n_ * k_); owns_gpu_data_ = true; + } else { + std::cout<<"wrong memspace " < #endif +#ifdef RESOLVE_USE_HIP +#include +#endif namespace ReSolve { using out = io::Logger; @@ -50,6 +53,21 @@ namespace ReSolve { isCpuEnabled_ = true; } #endif +#ifdef RESOLVE_USE_HIP + /** + * @brief constructor + * + * @param new_workspace - workspace to be set + */ + VectorHandler::VectorHandler(LinAlgWorkspaceHIP* new_workspace) + { + hipImpl_ = new VectorHandlerHip(new_workspace); + cpuImpl_ = new VectorHandlerCpu(); + + isHipEnabled_ = true; + isCpuEnabled_ = true; + } +#endif /** * @brief destructor @@ -64,7 +82,7 @@ namespace ReSolve { * * @param[in] x The first vector * @param[in] y The second vector - * @param[in] memspace String containg memspace (cpu or cuda) + * @param[in] memspace String containg memspace (cpu or cuda or hip) * * @return dot product (real number) of _x_ and _y_ */ @@ -74,7 +92,9 @@ namespace ReSolve { if (memspace == "cuda" ) { return cudaImpl_->dot(x, y); } else { - if (memspace == "cpu") { + if (memspace == "hip") { + return hipImpl_->dot(x, y); + } else if (memspace == "cpu") { return cpuImpl_->dot(x, y); } else { out::error() << "Not implemented (yet)" << std::endl; @@ -88,13 +108,15 @@ namespace ReSolve { * * @param[in] alpha The constant * @param[in,out] x The vector - * @param memspace string containg memspace (cpu or cuda) + * @param memspace string containg memspace (cpu or cuda or hip) * */ void VectorHandler::scal(const real_type* alpha, vector::Vector* x, std::string memspace) { if (memspace == "cuda" ) { cudaImpl_->scal(alpha, x); + } else if (memspace == "hip") { + hipImpl_->scal(alpha, x); } else { if (memspace == "cpu") { cpuImpl_->scal(alpha, x); @@ -110,7 +132,7 @@ namespace ReSolve { * @param[in] alpha The constant * @param[in] x The first vector * @param[in,out] y The second vector (result is return in y) - * @param[in] memspace String containg memspace (cpu or cuda) + * @param[in] memspace String containg memspace (cpu or cuda or hip) * */ void VectorHandler::axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y, std::string memspace) @@ -119,10 +141,14 @@ namespace ReSolve { if (memspace == "cuda" ) { cudaImpl_->axpy(alpha, x, y); } else { - if (memspace == "cpu") { - cpuImpl_->axpy(alpha, x, y); + if (memspace == "hip" ) { + hipImpl_->axpy(alpha, x, y); } else { - out::error() <<"Not implemented (yet)" << std::endl; + if (memspace == "cpu") { + cpuImpl_->axpy(alpha, x, y); + } else { + out::error() <<"Not implemented (yet)" << std::endl; + } } } } @@ -139,7 +165,7 @@ namespace ReSolve { * @param[in] V Multivector containing the matrix, organized columnwise * @param[in] y Vector, k x 1 if N and n x 1 if T * @param[in,out] x Vector, n x 1 if N and k x 1 if T - * @param[in] memspace cpu or cuda (for now) + * @param[in] memspace cpu or cuda or hip (for now) * * @pre V is stored colum-wise, _n_ > 0, _k_ > 0 * @@ -148,6 +174,8 @@ namespace ReSolve { { if (memspace == "cuda") { cudaImpl_->gemv(transpose, n, k, alpha, beta, V, y, x); + } else if (memspace == "hip") { + hipImpl_->gemv(transpose, n, k, alpha, beta, V, y, x); } else if (memspace == "cpu") { cpuImpl_->gemv(transpose, n, k, alpha, beta, V, y, x); } else { @@ -162,7 +190,7 @@ namespace ReSolve { * @param[in] alpha vector size k x 1 * @param[in] x (multi)vector size size x k * @param[in,out] y vector size size x 1 (this is where the result is stored) - * @param[in] memspace string containg memspace (cpu or cuda) + * @param[in] memspace string containg memspace (cpu or cuda or hip) * * @pre _k_ > 0, _size_ > 0, _size_ = x->getSize() * @@ -172,6 +200,8 @@ namespace ReSolve { using namespace constants; if (memspace == "cuda") { cudaImpl_->massAxpy(size, alpha, k, x, y); + } else if (memspace == "hip") { + hipImpl_->massAxpy(size, alpha, k, x, y); } else if (memspace == "cpu") { cpuImpl_->massAxpy(size, alpha, k, x, y); } else { @@ -188,7 +218,7 @@ namespace ReSolve { * @param[in] k Number of vectors in V * @param[in] x Multivector; 2 vectors size n x 1 each * @param[out] res Multivector; 2 vectors size k x 1 each (result is returned in res) - * @param[in] memspace String containg memspace (cpu or cuda) + * @param[in] memspace String containg memspace (cpu or cuda or hip) * * @pre _size_ > 0, _k_ > 0, size = x->getSize(), _res_ needs to be allocated * @@ -197,6 +227,8 @@ namespace ReSolve { { if (memspace == "cuda") { cudaImpl_->massDot2Vec(size, V, k, x, res); + } else if (memspace == "hip") { + hipImpl_->massDot2Vec(size, V, k, x, res); } else if (memspace == "cpu") { cpuImpl_->massDot2Vec(size, V, k, x, res); } else { diff --git a/resolve/vector/VectorHandler.hpp b/resolve/vector/VectorHandler.hpp index c17d4688..02d426b5 100644 --- a/resolve/vector/VectorHandler.hpp +++ b/resolve/vector/VectorHandler.hpp @@ -10,6 +10,7 @@ namespace ReSolve class VectorHandlerImpl; class LinAlgWorkspaceCpu; class LinAlgWorkspaceCUDA; + class LinAlgWorkspaceHIP; } @@ -19,6 +20,7 @@ namespace ReSolve { //namespace vector { VectorHandler(); VectorHandler(LinAlgWorkspaceCpu* new_workspace); VectorHandler(LinAlgWorkspaceCUDA* new_workspace); + VectorHandler(LinAlgWorkspaceHIP* new_workspace); ~VectorHandler(); //y = alpha x + y @@ -55,9 +57,11 @@ namespace ReSolve { //namespace vector { private: VectorHandlerImpl* cpuImpl_{nullptr}; VectorHandlerImpl* cudaImpl_{nullptr}; + VectorHandlerImpl* hipImpl_{nullptr}; bool isCpuEnabled_{false}; bool isCudaEnabled_{false}; + bool isHipEnabled_{false}; }; } //} // namespace ReSolve::vector diff --git a/resolve/vector/VectorHandlerHip.cpp b/resolve/vector/VectorHandlerHip.cpp new file mode 100644 index 00000000..9f2927c7 --- /dev/null +++ b/resolve/vector/VectorHandlerHip.cpp @@ -0,0 +1,236 @@ +#include + +#include +#include +#include +#include +#include +#include "VectorHandlerHip.hpp" + +namespace ReSolve { + using out = io::Logger; + + /** + * @brief empty constructor that does absolutely nothing + */ + VectorHandlerHip::VectorHandlerHip() + { + } + + /** + * @brief constructor + * + * @param new_workspace - workspace to be set + */ + VectorHandlerHip:: VectorHandlerHip(LinAlgWorkspaceHIP* new_workspace) + { + workspace_ = new_workspace; + } + + /** + * @brief destructor + */ + VectorHandlerHip::~VectorHandlerHip() + { + //delete the workspace TODO + } + + /** + * @brief dot product of two vectors i.e, a = x^Ty + * + * @param[in] x The first vector + * @param[in] y The second vector + * @param[in] memspace String containg memspace (cpu or hip) + * + * @return dot product (real number) of _x_ and _y_ + */ + + real_type VectorHandlerHip::dot(vector::Vector* x, vector::Vector* y) + { + LinAlgWorkspaceHIP* workspaceHIP = workspace_; + rocblas_handle handle_rocblas = workspaceHIP->getRocblasHandle(); + double nrm = 0.0; + rocblas_status st= rocblas_ddot (handle_rocblas, x->getSize(), x->getData("hip"), 1, y->getData("hip"), 1, &nrm); + if (st!=0) {printf("dot product crashed with code %d \n", st);} + return nrm; + } + + /** + * @brief scale a vector by a constant i.e, x = alpha*x where alpha is a constant + * + * @param[in] alpha The constant + * @param[in,out] x The vector + * @param memspace string containg memspace (cpu or hip) + * + */ + void VectorHandlerHip::scal(const real_type* alpha, vector::Vector* x) + { + LinAlgWorkspaceHIP* workspaceHIP = workspace_; + rocblas_handle handle_rocblas = workspaceHIP->getRocblasHandle(); + rocblas_status st = rocblas_dscal(handle_rocblas, x->getSize(), alpha, x->getData("hip"), 1); + if (st!=0) { + ReSolve::io::Logger::error() << "scal crashed with code " << st << "\n"; + } + } + + /** + * @brief axpy i.e, y = alpha*x+y where alpha is a constant + * + * @param[in] alpha The constant + * @param[in] x The first vector + * @param[in,out] y The second vector (result is return in y) + * @param[in] memspace String containg memspace (cpu or hip) + * + */ + void VectorHandlerHip::axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y) + { + //AXPY: y = alpha * x + y + LinAlgWorkspaceHIP* workspaceHIP = workspace_; + rocblas_handle handle_rocblas = workspaceHIP->getRocblasHandle(); + rocblas_daxpy(handle_rocblas, + x->getSize(), + alpha, + x->getData("hip"), + 1, + y->getData("hip"), + 1); + } + + /** + * @brief gemv computes matrix-vector product where both matrix and vectors are dense. + * i.e., x = beta*x + alpha*V*y + * + * @param[in] Transpose - yes (T) or no (N) + * @param[in] n Number of rows in (non-transposed) matrix + * @param[in] k Number of columns in (non-transposed) + * @param[in] alpha Constant real number + * @param[in] beta Constant real number + * @param[in] V Multivector containing the matrix, organized columnwise + * @param[in] y Vector, k x 1 if N and n x 1 if T + * @param[in,out] x Vector, n x 1 if N and k x 1 if T + * @param[in] memspace cpu or hip (for now) + * + * @pre V is stored colum-wise, _n_ > 0, _k_ > 0 + * + */ + void VectorHandlerHip::gemv(std::string transpose, + index_type n, + index_type k, + const real_type* alpha, + const real_type* beta, + vector::Vector* V, + vector::Vector* y, + vector::Vector* x) + { + LinAlgWorkspaceHIP* workspaceHIP = workspace_; + rocblas_handle handle_rocblas = workspaceHIP->getRocblasHandle(); + if (transpose == "T") { + + rocblas_dgemv(handle_rocblas, + rocblas_operation_transpose, + n, + k, + alpha, + V->getData("hip"), + n, + y->getData("hip"), + 1, + beta, + x->getData("hip"), + 1); + + } else { + rocblas_dgemv(handle_rocblas, + rocblas_operation_none, + n, + k, + alpha, + V->getData("hip"), + n, + y->getData("hip"), + 1, + beta, + x->getData("hip"), + 1); + } + } + + /** + * @brief mass (bulk) axpy i.e, y = y - x*alpha where alpha is a vector + * + * @param[in] size number of elements in y + * @param[in] alpha vector size k x 1 + * @param[in] x (multi)vector size size x k + * @param[in,out] y vector size size x 1 (this is where the result is stored) + * @param[in] memspace string containg memspace (cpu or hip) + * + * @pre _k_ > 0, _size_ > 0, _size_ = x->getSize() + * + */ + void VectorHandlerHip::massAxpy(index_type size, vector::Vector* alpha, index_type k, vector::Vector* x, vector::Vector* y) + { + using namespace constants; + if (k < 200) { + mass_axpy(size, k, x->getData("hip"), y->getData("hip"),alpha->getData("hip")); + } else { + LinAlgWorkspaceHIP* workspaceHIP = workspace_; + rocblas_handle handle_rocblas = workspaceHIP->getRocblasHandle(); + rocblas_dgemm(handle_rocblas, + rocblas_operation_none, + rocblas_operation_none, + size, // m + 1, // n + k, // k + &MINUSONE, // alpha + x->getData("hip"), // A + size, // lda + alpha->getData("hip"), // B + k, // ldb + &ONE, + y->getData("hip"), // c + size); // ldc + } + } + + /** + * @brief mass (bulk) dot product i.e, V^T x, where V is n x k dense multivector + * (a dense multivector consisting of k vectors size n) and x is k x 2 dense + * multivector (a multivector consisiting of two vectors size n each) + * + * @param[in] size Number of elements in a single vector in V + * @param[in] V Multivector; k vectors size n x 1 each + * @param[in] k Number of vectors in V + * @param[in] x Multivector; 2 vectors size n x 1 each + * @param[out] res Multivector; 2 vectors size k x 1 each (result is returned in res) + * @param[in] memspace String containg memspace (cpu or hip) + * + * @pre _size_ > 0, _k_ > 0, size = x->getSize(), _res_ needs to be allocated + * + */ + void VectorHandlerHip::massDot2Vec(index_type size, vector::Vector* V, index_type k, vector::Vector* x, vector::Vector* res) + { + using namespace constants; + + if (k < 200) { + mass_inner_product_two_vectors(size, k, x->getData("hip") , x->getData(1, "hip"), V->getData("hip"), res->getData("hip")); + } else { + LinAlgWorkspaceHIP* workspaceHIP = workspace_; + rocblas_handle handle_rocblas = workspaceHIP->getRocblasHandle(); + rocblas_dgemm(handle_rocblas, + rocblas_operation_transpose, + rocblas_operation_none, + k + 1, //m + 2, //n + size, //k + &ONE, //alpha + V->getData("hip"), //A + size, //lda + x->getData("hip"), //B + size, //ldb + &ZERO, + res->getData("hip"), //c + k + 1); //ldc + } + } + +} // namespace ReSolve diff --git a/resolve/vector/VectorHandlerHip.hpp b/resolve/vector/VectorHandlerHip.hpp new file mode 100644 index 00000000..7e5085e3 --- /dev/null +++ b/resolve/vector/VectorHandlerHip.hpp @@ -0,0 +1,57 @@ +#pragma once +#include + +namespace ReSolve +{ + namespace vector + { + class Vector; + } + class LinAlgWorkspaceHIP; + class VectorHandlerImpl; +} + + +namespace ReSolve { //namespace vector { + class VectorHandlerHip : public VectorHandlerImpl + { + public: + VectorHandlerHip(); + VectorHandlerHip(LinAlgWorkspaceHIP* workspace); + virtual ~VectorHandlerHip(); + + //y = alpha x + y + virtual void axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y); + + //dot: x \cdot y + virtual real_type dot(vector::Vector* x, vector::Vector* y); + + //scal = alpha * x + virtual void scal(const real_type* alpha, vector::Vector* x); + + //mass axpy: x*alpha + y where x is [n x k] and alpha is [k x 1]; x is stored columnwise + virtual void massAxpy(index_type size, vector::Vector* alpha, index_type k, vector::Vector* x, vector::Vector* y); + + //mass dot: V^T x, where V is [n x k] and x is [k x 2], everything is stored and returned columnwise + //Size = n + virtual void massDot2Vec(index_type size, vector::Vector* V, index_type k, vector::Vector* x, vector::Vector* res); + + /** gemv: + * if `transpose = N` (no), `x = beta*x + alpha*V*y`, + * where `x` is `[n x 1]`, `V` is `[n x k]` and `y` is `[k x 1]`. + * if `transpose = T` (yes), `x = beta*x + alpha*V^T*y`, + * where `x` is `[k x 1]`, `V` is `[n x k]` and `y` is `[n x 1]`. + */ + virtual void gemv(std::string transpose, + index_type n, + index_type k, + const real_type* alpha, + const real_type* beta, + vector::Vector* V, + vector::Vector* y, + vector::Vector* x); + private: + LinAlgWorkspaceHIP* workspace_; + }; + +} //} // namespace ReSolve::vector diff --git a/resolve/workspace/CMakeLists.txt b/resolve/workspace/CMakeLists.txt index 673fac4b..a34c2191 100644 --- a/resolve/workspace/CMakeLists.txt +++ b/resolve/workspace/CMakeLists.txt @@ -16,10 +16,15 @@ set(ReSolve_Workspace_CUDASDK_SRC LinAlgWorkspaceCUDA.cpp ) +set(ReSolve_Workspace_ROCM_SRC + LinAlgWorkspaceHIP.cpp +) + set(ReSolve_Workspace_HEADER_INSTALL LinAlgWorkspace.hpp LinAlgWorkspaceCpu.hpp LinAlgWorkspaceCUDA.hpp + LinAlgWorkspaceHIP.hpp ) # If cuda is enabled, add CUDA SDK workspace files @@ -27,6 +32,10 @@ if(RESOLVE_USE_CUDA) set(ReSolve_Workspace_SRC ${ReSolve_Workspace_SRC} ${ReSolve_Workspace_CUDASDK_SRC}) endif() +if(RESOLVE_USE_HIP) + set(ReSolve_Workspace_SRC ${ReSolve_Workspace_SRC} ${ReSolve_Workspace_ROCM_SRC}) +endif() + add_library(resolve_workspace SHARED ${ReSolve_Workspace_SRC}) # If CUDA is enabled, link to ReSolve CUDA backend @@ -34,6 +43,10 @@ if(RESOLVE_USE_CUDA) target_link_libraries(resolve_workspace PUBLIC resolve_backend_cuda) endif(RESOLVE_USE_CUDA) +if(RESOLVE_USE_HIP) + target_link_libraries(resolve_workspace PUBLIC resolve_backend_hip) +endif(RESOLVE_USE_HIP) + target_include_directories(resolve_workspace INTERFACE $ $ diff --git a/resolve/workspace/LinAlgWorkspace.hpp b/resolve/workspace/LinAlgWorkspace.hpp index 6da58fda..4efe834e 100644 --- a/resolve/workspace/LinAlgWorkspace.hpp +++ b/resolve/workspace/LinAlgWorkspace.hpp @@ -6,3 +6,7 @@ #include #endif +#ifdef RESOLVE_USE_HIP +#include +#endif + diff --git a/resolve/workspace/LinAlgWorkspaceHIP.cpp b/resolve/workspace/LinAlgWorkspaceHIP.cpp new file mode 100644 index 00000000..e64dff17 --- /dev/null +++ b/resolve/workspace/LinAlgWorkspaceHIP.cpp @@ -0,0 +1,75 @@ +#include + +namespace ReSolve +{ + LinAlgWorkspaceHIP::LinAlgWorkspaceHIP() + { + handle_rocsparse_ = nullptr; + handle_rocblas_ = nullptr; + + matvec_setup_done_ = false; + } + + LinAlgWorkspaceHIP::~LinAlgWorkspaceHIP() + { + rocsparse_destroy_handle(handle_rocsparse_); + rocblas_destroy_handle(handle_rocblas_); + rocsparse_destroy_mat_descr(mat_A_); + } + + rocsparse_handle LinAlgWorkspaceHIP::getRocsparseHandle() + { + return handle_rocsparse_; + } + + void LinAlgWorkspaceHIP::setRocsparseHandle(rocsparse_handle handle) + { + handle_rocsparse_ = handle; + } + + rocblas_handle LinAlgWorkspaceHIP::getRocblasHandle() + { + return handle_rocblas_; + } + + void LinAlgWorkspaceHIP::setRocblasHandle(rocblas_handle handle) + { + handle_rocblas_ = handle; + } + + rocsparse_mat_descr LinAlgWorkspaceHIP::getSpmvMatrixDescriptor() + { + return mat_A_; + } + + void LinAlgWorkspaceHIP::setSpmvMatrixDescriptor(rocsparse_mat_descr mat) + { + mat_A_ = mat; + } + + rocsparse_mat_info LinAlgWorkspaceHIP::getSpmvMatrixInfo() + { + return info_A_; + } + + void LinAlgWorkspaceHIP::setSpmvMatrixInfo(rocsparse_mat_info info) + { + info_A_ = info; + } + + bool LinAlgWorkspaceHIP::matvecSetup() + { + return matvec_setup_done_; + } + + void LinAlgWorkspaceHIP::matvecSetupDone() + { + matvec_setup_done_ = true; + } + + void LinAlgWorkspaceHIP::initializeHandles() + { + rocsparse_create_handle(&handle_rocsparse_); + rocblas_create_handle(&handle_rocblas_); + } + } // namespace ReSolve diff --git a/resolve/workspace/LinAlgWorkspaceHIP.hpp b/resolve/workspace/LinAlgWorkspaceHIP.hpp new file mode 100644 index 00000000..fbb55349 --- /dev/null +++ b/resolve/workspace/LinAlgWorkspaceHIP.hpp @@ -0,0 +1,52 @@ +#pragma once + +#include +#include +#include + +#include + +namespace ReSolve +{ + class LinAlgWorkspaceHIP + { + public: + LinAlgWorkspaceHIP(); + ~LinAlgWorkspaceHIP(); + + rocblas_handle getRocblasHandle(); + rocsparse_handle getRocsparseHandle(); + rocsparse_mat_descr getSpmvMatrixDescriptor(); + rocsparse_mat_info getSpmvMatrixInfo(); + + void setRocblasHandle(rocblas_handle handle); + void setRocsparseHandle(rocsparse_handle handle); + void setSpmvMatrixDescriptor(rocsparse_mat_descr mat); + void setSpmvMatrixInfo(rocsparse_mat_info info); + + void initializeHandles(); + + bool matvecSetup(); + void matvecSetupDone(); + + private: + //handles + rocblas_handle handle_rocblas_; + rocsparse_handle handle_rocsparse_; + + //matrix descriptors + rocsparse_mat_descr mat_A_; + + //vector descriptors not needed, rocsparse uses RAW pointers. + + //buffers + // there is no buffer needed in matvec + bool matvec_setup_done_; //check if setup is done for matvec (note: no buffer but there is analysis) + + //info - but we need info + rocsparse_mat_info info_A_; + + MemoryHandler mem_; + }; + +} // namespace ReSolve diff --git a/tests/unit/matrix/CMakeLists.txt b/tests/unit/matrix/CMakeLists.txt index 8906c2c6..8476f181 100644 --- a/tests/unit/matrix/CMakeLists.txt +++ b/tests/unit/matrix/CMakeLists.txt @@ -20,4 +20,4 @@ install(TARGETS ${installable_tests} RUNTIME DESTINATION bin/resolve/tests/unit) add_test(NAME matrix_test COMMAND $) -add_test(NAME matrix_handler_test COMMAND $) \ No newline at end of file +add_test(NAME matrix_handler_test COMMAND $) diff --git a/tests/unit/matrix/MatrixHandlerTests.hpp b/tests/unit/matrix/MatrixHandlerTests.hpp index e203017a..d7fe8449 100644 --- a/tests/unit/matrix/MatrixHandlerTests.hpp +++ b/tests/unit/matrix/MatrixHandlerTests.hpp @@ -49,6 +49,7 @@ class MatrixHandlerTests : TestBase vector::Vector x(N); vector::Vector y(N); x.allocate(memspace_); + if (x.getData(memspace_) == NULL) printf("oups we have an issue \n"); y.allocate(memspace_); x.setToConst(1.0, memspace_); @@ -80,6 +81,12 @@ class MatrixHandlerTests : TestBase LinAlgWorkspaceCUDA* workspace = new LinAlgWorkspaceCUDA(); workspace->initializeHandles(); return new MatrixHandler(workspace); +#endif +#ifdef RESOLVE_USE_HIP + } else if (memspace_ == "hip") { + LinAlgWorkspaceHIP* workspace = new LinAlgWorkspaceHIP(); + workspace->initializeHandles(); + return new MatrixHandler(workspace); #endif } else { std::cout << "ReSolve not built with support for memory space " << memspace_ << "\n"; @@ -152,7 +159,7 @@ class MatrixHandlerTests : TestBase A->setUpdated("cpu"); // std::cout << rowptr[i] << "\n"; - if (memspace == "cuda") { + if ((memspace == "cuda") || (memspace == "hip")) { A->copyData(memspace); } diff --git a/tests/unit/matrix/runMatrixHandlerTests.cpp b/tests/unit/matrix/runMatrixHandlerTests.cpp index 6eee90d5..26ad70b0 100644 --- a/tests/unit/matrix/runMatrixHandlerTests.cpp +++ b/tests/unit/matrix/runMatrixHandlerTests.cpp @@ -33,5 +33,17 @@ int main(int, char**) } #endif +#ifdef RESOLVE_USE_HIP + { + std::cout << "Running tests with HIP backend:\n"; + ReSolve::tests::MatrixHandlerTests test("hip"); + + result += test.matrixHandlerConstructor(); + result += test.matrixOneNorm(); + result += test.matVec(50); + + std::cout << "\n"; + } +#endif return result.summary(); } diff --git a/tests/unit/vector/VectorHandlerTests.hpp b/tests/unit/vector/VectorHandlerTests.hpp index d2f8c73c..60020ec5 100644 --- a/tests/unit/vector/VectorHandlerTests.hpp +++ b/tests/unit/vector/VectorHandlerTests.hpp @@ -1,6 +1,7 @@ #pragma once #include #include +#include #include #include #include @@ -141,13 +142,13 @@ namespace ReSolve { } x->setToConst(ii, c, memspace_); } + index_type r = K % 2; real_type res = (real_type) ((floor((real_type) K / 2.0) + r) * 1.0 + floor((real_type) K / 2.0) * (-0.5)); handler->massAxpy(N, alpha, K, x, y, memspace_); status *= verifyAnswer(y, 2.0 - res, memspace_); - - + delete handler; delete x; delete y; @@ -229,6 +230,12 @@ namespace ReSolve { LinAlgWorkspaceCUDA* workspace = new LinAlgWorkspaceCUDA(); workspace->initializeHandles(); return new VectorHandler(workspace); +#endif +#ifdef RESOLVE_USE_HIP + } else if (memspace_ == "hip") { + LinAlgWorkspaceHIP* workspace = new LinAlgWorkspaceHIP(); + workspace->initializeHandles(); + return new VectorHandler(workspace); #endif } else { std::cout << "ReSolve not built with support for memory space " << memspace_ << "\n"; @@ -247,6 +254,7 @@ namespace ReSolve { for (index_type i = 0; i < x->getSize(); ++i) { // std::cout << x->getData("cpu")[i] << "\n"; if (!isEqual(x->getData("cpu")[i], answer)) { + std::cout << std::setprecision(16); status = false; std::cout << "Solution vector element x[" << i << "] = " << x->getData("cpu")[i] << ", expected: " << answer << "\n"; diff --git a/tests/unit/vector/runVectorHandlerTests.cpp b/tests/unit/vector/runVectorHandlerTests.cpp index 77e99471..9bb543a5 100644 --- a/tests/unit/vector/runVectorHandlerTests.cpp +++ b/tests/unit/vector/runVectorHandlerTests.cpp @@ -37,5 +37,22 @@ int main(int, char**) } #endif +#ifdef RESOLVE_USE_HIP + { + std::cout << "Running tests with HIP backend:\n"; + ReSolve::tests::VectorHandlerTests test("hip"); + + result += test.dot(5000); + result += test.axpy(5000); + result += test.scal(5000); + result += test.gemv(5000, 10); + result += test.massAxpy(100, 10); + result += test.massAxpy(1000, 300); + result += test.massDot(100, 10); + result += test.massDot(1000, 30); + + std::cout << "\n"; + } +#endif return result.summary(); }