First stab at hip linear algebra.

ORNL · Oct 28, 2023 · c0408b2 · c0408b2
1 parent 49145cc
commit c0408b2
Show file tree

Hide file tree

Showing 28 changed files with 1,077 additions and 99 deletions.
diff --git a/cmake/ReSolveFindHipLibraries.cmake b/cmake/ReSolveFindHipLibraries.cmake
@@ -9,6 +9,8 @@ find_package(hipblas REQUIRED)
 target_link_libraries(resolve_hip INTERFACE
   #hip::host 
   hip::device
+  rocblas
+  rocsparse
   #roc::hipblas
 )
 

diff --git a/resolve/hip/CMakeLists.txt b/resolve/hip/CMakeLists.txt
@@ -7,14 +7,14 @@
 ]]
 
 set(ReSolve_HIP_SRC
-    # hipKernels.cu
+    hipKernels.hip
     hipVectorKernels.hip
     MemoryUtils.hip
 )
 
 set(ReSolve_HIP_HEADER_INSTALL
     # hipKernels.h
-    # hipVectorKernels.h
+    hipVectorKernels.h
     HipMemory.hpp
     # hip_check_errors.hpp
 )

diff --git a/resolve/hip/hipKernels.h b/resolve/hip/hipKernels.h
@@ -0,0 +1,14 @@
+void mass_inner_product_two_vectors(int n, 
+                                    int i, 
+                                    double* vec1, 
+                                    double* vec2, 
+                                    double* mvec, 
+                                    double* result);
+void mass_axpy(int n, int i, double* x, double* y, double* alpha);
+
+//needed for matrix inf nrm
+void matrix_row_sums(int n, 
+                     int nnz, 
+                     int* a_ia,
+                     double* a_val, 
+                     double* result);
diff --git a/resolve/hip/hipKernels.hip b/resolve/hip/hipKernels.hip
@@ -0,0 +1,167 @@
+#include "hipKernels.h"
+#define maxk 1024
+#define Tv5 1024
+
+#include <hip/hip_runtime.h>
+
+//computes V^T[u1 u2] where v is n x k and u1 and u2 are nx1
+__global__ void MassIPTwoVec_kernel(const double* __restrict__ u1, 
+                                    const double* __restrict__ u2, 
+                                    const double* __restrict__ v, 
+                                    double* result,
+                                    const int k, 
+                                    const int N)
+{
+  int t = threadIdx.x;
+  int bsize = blockDim.x;
+
+  // assume T threads per thread block (and k reductions to be performed)
+  volatile __shared__ double s_tmp1[Tv5];
+
+  volatile __shared__ double s_tmp2[Tv5];
+  // map between thread index space and the problem index space
+  int j = blockIdx.x;
+  s_tmp1[t] = 0.0f;
+  s_tmp2[t] = 0.0f;
+  int nn = t;
+  double can1, can2, cbn;
+
+  while(nn < N) {
+    can1 = u1[nn];
+    can2 = u2[nn];
+
+    cbn = v[N * j + nn];
+    s_tmp1[t] += can1 * cbn;
+    s_tmp2[t] += can2 * cbn;
+
+    nn += bsize;
+  }
+
+  __syncthreads();
+
+  if(Tv5 >= 1024) {
+    if(t < 512) {
+      s_tmp1[t] += s_tmp1[t + 512];
+      s_tmp2[t] += s_tmp2[t + 512];
+    }
+    __syncthreads();
+  }
+  if(Tv5 >= 512) {
+    if(t < 256) {
+      s_tmp1[t] += s_tmp1[t + 256];
+      s_tmp2[t] += s_tmp2[t + 256];
+    }
+    __syncthreads();
+  }
+  {
+    if(t < 128) {
+      s_tmp1[t] += s_tmp1[t + 128];
+      s_tmp2[t] += s_tmp2[t + 128];
+    }
+    __syncthreads();
+  }
+  {
+    if(t < 64) {
+      s_tmp1[t] += s_tmp1[t + 64];
+      s_tmp2[t] += s_tmp2[t + 64];
+    }
+    __syncthreads();
+  }
+
+  if(t < 32) {
+    s_tmp1[t] += s_tmp1[t + 32];
+    s_tmp2[t] += s_tmp2[t + 32];
+
+    s_tmp1[t] += s_tmp1[t + 16];
+    s_tmp2[t] += s_tmp2[t + 16];
+
+    s_tmp1[t] += s_tmp1[t + 8];
+    s_tmp2[t] += s_tmp2[t + 8];
+
+    s_tmp1[t] += s_tmp1[t + 4];
+    s_tmp2[t] += s_tmp2[t + 4];
+
+    s_tmp1[t] += s_tmp1[t + 2];
+    s_tmp2[t] += s_tmp2[t + 2];
+
+    s_tmp1[t] += s_tmp1[t + 1];
+    s_tmp2[t] += s_tmp2[t + 1];
+  }
+  if(t == 0) {
+    result[blockIdx.x] = s_tmp1[0];
+    result[blockIdx.x + k] = s_tmp2[0];
+  }
+}
+
+
+//mass AXPY i.e y = y - x*alpha where alpha is [k x 1], needed in 1 and 2 synch GMRES
+
+__global__ void massAxpy3_kernel(int N,
+                                 int k,
+                                 const double* x_data,
+                                 double* y_data,
+                                 const double* alpha) {
+
+  unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+  unsigned int t = threadIdx.x;
+
+  __shared__ double s_alpha[maxk];
+  if(t < k) {
+    s_alpha[t] = alpha[t];
+  }
+  __syncthreads();
+  while (i < N){
+    double temp = 0.0;
+    for(int j = 0; j < k; ++j) {
+      temp += x_data[j * N + i] * s_alpha[j];
+    }
+    y_data[i] -= temp;
+    i += (blockDim.x*gridDim.x);
+  }
+}
+__global__ void matrixInfNormPart1(const int n, 
+                                   const int nnz, 
+                                   const int* a_ia,
+                                   const double* a_val, 
+                                   double* result) {
+
+  // one thread per row, pass through rows
+  // and sum
+  // can be done through atomics
+  //\sum_{j=1}^m abs(a_{ij})
+
+  int idx = blockIdx.x*blockDim.x + threadIdx.x;
+  while (idx < n){
+    double sum = 0.0f;
+    for (int i = a_ia[idx]; i < a_ia[idx+1]; ++i) {
+      sum = sum + fabs(a_val[i]);
+    }
+    result[idx] = sum;
+    idx += (blockDim.x*gridDim.x);
+  }
+}
+
+
+void mass_inner_product_two_vectors(int n, 
+                                    int i, 
+                                    double* vec1, 
+                                    double* vec2, 
+                                    double* mvec, 
+                                    double* result)
+{
+  hipLaunchKernelGGL(MassIPTwoVec_kernel, dim3(i + 1), dim3(1024), 0, 0, vec1, vec2, mvec, result, i + 1, n);
+}
+void mass_axpy(int n, int i, double* x, double* y, double* alpha)
+{
+  hipLaunchKernelGGL(massAxpy3_kernel, dim3((n + 384 - 1) / 384), dim3(384), 0, 0, n, i, x, y, alpha);
+}
+
+void matrix_row_sums(int n, 
+                     int nnz, 
+                     int* a_ia,
+                     double* a_val, 
+                     double* result)
+{
+  hipLaunchKernelGGL(matrixInfNormPart1,dim3(1000),dim3(1024), 0, 0, n, nnz, a_ia, a_val, result);
+}
diff --git a/resolve/hip/hipVectorKernels.hip b/resolve/hip/hipVectorKernels.hip
@@ -1,29 +1,28 @@
 #include <resolve/Common.hpp>
 #include <resolve/vector/VectorKernels.hpp>
-
-#include "hipVectorKernels.h"
+#include <hip/hip_runtime.h>
 
 namespace ReSolve { namespace vector {
 
 namespace kernels {
 
-__global__ void set_const(index_type n, real_type val, real_type* arr)
-{
-  index_type i = blockIdx.x * blockDim.x + threadIdx.x;
-  if(i < n)
+  __global__ void set_const(index_type n, real_type val, real_type* arr)
   {
-    arr[i] = val;
+    index_type i = blockIdx.x * blockDim.x + threadIdx.x;
+    while (i < n)
+    {
+      arr[i] = val;
+      i += blockDim.x * gridDim.x;
+    }
   }
-}
-
 } // namespace kernels
 
-void set_array_const(index_type n, real_type val, real_type* arr)
+void set_array_const(index_type  n, real_type val, real_type* arr)
 {
-  index_type num_blocks;
-  index_type block_size = 512;
-  num_blocks = (n + block_size - 1) / block_size;
-  kernels::set_const<<<num_blocks, block_size>>>(n, val, arr);
+   index_type num_blocks;
+   index_type block_size = 512;
+   num_blocks = (n + block_size - 1) / block_size;
+   hipLaunchKernelGGL( kernels::set_const, dim3(num_blocks), dim3(block_size), 0, 0, n, val, arr);
 }
 
-}} // namespace ReSolve::vector
+}} // namespace ReSolve::vector
diff --git a/resolve/matrix/CMakeLists.txt b/resolve/matrix/CMakeLists.txt
@@ -22,6 +22,11 @@ set(Matrix_CUDASDK_SRC
     MatrixHandlerCuda.cpp
 )
 
+# and on HIP
+set(Matrix_ROCM_SRC 
+  MatrixHandlerHip.cpp
+)
+
 # Header files to be installed
 set(Matrix_HEADER_INSTALL
     io.hpp
@@ -37,6 +42,10 @@ if(RESOLVE_USE_CUDA)
   set(Matrix_SRC ${Matrix_SRC} ${Matrix_CUDASDK_SRC})
 endif()
 
+if(RESOLVE_USE_HIP)
+  set(Matrix_SRC ${Matrix_SRC} ${Matrix_ROCM_SRC})
+endif()
+
 
 # Build shared library ReSolve::matrix
 add_library(resolve_matrix SHARED ${Matrix_SRC})
@@ -47,6 +56,10 @@ if (RESOLVE_USE_CUDA)
   target_link_libraries(resolve_matrix PUBLIC resolve_backend_cuda)
 endif()
 
+if (RESOLVE_USE_HIP)
+  target_link_libraries(resolve_matrix PUBLIC resolve_backend_hip)
+endif()
+
 # Link to dummy device backend if GPU support is not enabled
 if (NOT RESOLVE_USE_GPU)
   target_link_libraries(resolve_matrix PUBLIC resolve_backend_cpu)

diff --git a/resolve/matrix/Coo.cpp b/resolve/matrix/Coo.cpp
@@ -33,8 +33,8 @@ namespace ReSolve
       copyData("cpu");
       return this->h_row_data_;
     } else {
-      if (memspace == "cuda") {
-        copyData("cuda");
+      if ((memspace == "cuda") || (memspace == "hip")) {
+        copyData(memspace);
         return this->d_row_data_;
       } else {
         return nullptr;
@@ -48,8 +48,8 @@ namespace ReSolve
       copyData("cpu");
       return this->h_col_data_;
     } else {
-      if (memspace == "cuda") {
-        copyData("cuda");
+      if ((memspace == "cuda") || (memspace == "hip")) {
+        copyData(memspace);
         return this->d_col_data_;
       } else {
         return nullptr;
@@ -63,8 +63,8 @@ namespace ReSolve
       copyData("cpu");
       return this->h_val_data_;
     } else {
-      if (memspace == "cuda") {
-        copyData("cuda");
+      if ((memspace == "cuda") || (memspace == "hip")) {
+        copyData(memspace);
         return this->d_val_data_;
       } else {
         return nullptr;
@@ -81,9 +81,9 @@ namespace ReSolve
     setNotUpdated();
     int control=-1;
     if ((memspaceIn == "cpu") && (memspaceOut == "cpu")){ control = 0;}
-    if ((memspaceIn == "cpu") && (memspaceOut == "cuda")){ control = 1;}
-    if ((memspaceIn == "cuda") && (memspaceOut == "cpu")){ control = 2;}
-    if ((memspaceIn == "cuda") && (memspaceOut == "cuda")){ control = 3;}
+    if ((memspaceIn == "cpu") && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 1;}
+    if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && (memspaceOut == "cpu")){ control = 2;}
+    if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 3;}
 
     if (memspaceOut == "cpu") {
       //check if cpu data allocated	
@@ -98,7 +98,7 @@ namespace ReSolve
       }
     }
 
-    if (memspaceOut == "cuda") {
+    if ((memspaceOut == "cuda") || (memspaceOut == "hip")) {
       //check if cuda data allocated
       if (d_row_data_ == nullptr) {
         mem_.allocateArrayOnDevice(&d_row_data_, nnz_current);
@@ -120,23 +120,23 @@ namespace ReSolve
         owns_cpu_data_ = true;
         owns_cpu_vals_ = true;
         break;
-      case 2://cuda->cpu
+      case 2://gpu->cpu
         mem_.copyArrayDeviceToHost(h_row_data_, row_data, nnz_current);
         mem_.copyArrayDeviceToHost(h_col_data_, col_data, nnz_current);
         mem_.copyArrayDeviceToHost(h_val_data_, val_data, nnz_current);
         h_data_updated_ = true;
         owns_cpu_data_ = true;
         owns_cpu_vals_ = true;
         break;
-      case 1://cpu->cuda
+      case 1://cpu->gpu
         mem_.copyArrayHostToDevice(d_row_data_, row_data, nnz_current);
         mem_.copyArrayHostToDevice(d_col_data_, col_data, nnz_current);
         mem_.copyArrayHostToDevice(d_val_data_, val_data, nnz_current);
         d_data_updated_ = true;
         owns_gpu_data_ = true;
         owns_gpu_vals_ = true;
         break;
-      case 3://cuda->cuda
+      case 3://gpu->gpua
         mem_.copyArrayDeviceToDevice(d_row_data_, row_data, nnz_current);
         mem_.copyArrayDeviceToDevice(d_col_data_, col_data, nnz_current);
         mem_.copyArrayDeviceToDevice(d_val_data_, val_data, nnz_current);
@@ -176,7 +176,7 @@ namespace ReSolve
       return 0;
     }
 
-    if (memspace == "cuda") {
+    if ((memspace == "cuda") || (memspace == "hip")) {
       mem_.allocateArrayOnDevice(&d_row_data_, nnz_current); 
       mem_.allocateArrayOnDevice(&d_col_data_, nnz_current); 
       mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); 
@@ -215,7 +215,7 @@ namespace ReSolve
       return 0;
     }
 
-    if (memspaceOut == "cuda") {
+    if ((memspaceOut == "cuda") || (memspaceOut == "hip")) {
       if ((d_data_updated_ == false) && (h_data_updated_ == true)) {
         if (d_row_data_ == nullptr) {
           mem_.allocateArrayOnDevice(&d_row_data_, nnz_current);