Skip to content

Commit

Permalink
First stab at hip linear algebra.
Browse files Browse the repository at this point in the history
  • Loading branch information
kswirydo authored and pelesh committed Oct 28, 2023
1 parent 49145cc commit c0408b2
Show file tree
Hide file tree
Showing 28 changed files with 1,077 additions and 99 deletions.
2 changes: 2 additions & 0 deletions cmake/ReSolveFindHipLibraries.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ find_package(hipblas REQUIRED)
target_link_libraries(resolve_hip INTERFACE
#hip::host
hip::device
rocblas
rocsparse
#roc::hipblas
)

Expand Down
4 changes: 2 additions & 2 deletions resolve/hip/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,14 @@
]]

set(ReSolve_HIP_SRC
# hipKernels.cu
hipKernels.hip
hipVectorKernels.hip
MemoryUtils.hip
)

set(ReSolve_HIP_HEADER_INSTALL
# hipKernels.h
# hipVectorKernels.h
hipVectorKernels.h
HipMemory.hpp
# hip_check_errors.hpp
)
Expand Down
14 changes: 14 additions & 0 deletions resolve/hip/hipKernels.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
void mass_inner_product_two_vectors(int n,
int i,
double* vec1,
double* vec2,
double* mvec,
double* result);
void mass_axpy(int n, int i, double* x, double* y, double* alpha);

//needed for matrix inf nrm
void matrix_row_sums(int n,
int nnz,
int* a_ia,
double* a_val,
double* result);
167 changes: 167 additions & 0 deletions resolve/hip/hipKernels.hip
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
#include "hipKernels.h"
#define maxk 1024
#define Tv5 1024

#include <hip/hip_runtime.h>

//computes V^T[u1 u2] where v is n x k and u1 and u2 are nx1
__global__ void MassIPTwoVec_kernel(const double* __restrict__ u1,
const double* __restrict__ u2,
const double* __restrict__ v,
double* result,
const int k,
const int N)
{
int t = threadIdx.x;
int bsize = blockDim.x;

// assume T threads per thread block (and k reductions to be performed)
volatile __shared__ double s_tmp1[Tv5];

volatile __shared__ double s_tmp2[Tv5];
// map between thread index space and the problem index space
int j = blockIdx.x;
s_tmp1[t] = 0.0f;
s_tmp2[t] = 0.0f;
int nn = t;
double can1, can2, cbn;

while(nn < N) {
can1 = u1[nn];
can2 = u2[nn];

cbn = v[N * j + nn];
s_tmp1[t] += can1 * cbn;
s_tmp2[t] += can2 * cbn;

nn += bsize;
}

__syncthreads();

if(Tv5 >= 1024) {
if(t < 512) {
s_tmp1[t] += s_tmp1[t + 512];
s_tmp2[t] += s_tmp2[t + 512];
}
__syncthreads();
}
if(Tv5 >= 512) {
if(t < 256) {
s_tmp1[t] += s_tmp1[t + 256];
s_tmp2[t] += s_tmp2[t + 256];
}
__syncthreads();
}
{
if(t < 128) {
s_tmp1[t] += s_tmp1[t + 128];
s_tmp2[t] += s_tmp2[t + 128];
}
__syncthreads();
}
{
if(t < 64) {
s_tmp1[t] += s_tmp1[t + 64];
s_tmp2[t] += s_tmp2[t + 64];
}
__syncthreads();
}

if(t < 32) {
s_tmp1[t] += s_tmp1[t + 32];
s_tmp2[t] += s_tmp2[t + 32];

s_tmp1[t] += s_tmp1[t + 16];
s_tmp2[t] += s_tmp2[t + 16];

s_tmp1[t] += s_tmp1[t + 8];
s_tmp2[t] += s_tmp2[t + 8];

s_tmp1[t] += s_tmp1[t + 4];
s_tmp2[t] += s_tmp2[t + 4];

s_tmp1[t] += s_tmp1[t + 2];
s_tmp2[t] += s_tmp2[t + 2];

s_tmp1[t] += s_tmp1[t + 1];
s_tmp2[t] += s_tmp2[t + 1];
}
if(t == 0) {
result[blockIdx.x] = s_tmp1[0];
result[blockIdx.x + k] = s_tmp2[0];
}
}


//mass AXPY i.e y = y - x*alpha where alpha is [k x 1], needed in 1 and 2 synch GMRES

__global__ void massAxpy3_kernel(int N,
int k,
const double* x_data,
double* y_data,
const double* alpha) {

unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

unsigned int t = threadIdx.x;

__shared__ double s_alpha[maxk];
if(t < k) {
s_alpha[t] = alpha[t];
}
__syncthreads();
while (i < N){
double temp = 0.0;
for(int j = 0; j < k; ++j) {
temp += x_data[j * N + i] * s_alpha[j];
}
y_data[i] -= temp;
i += (blockDim.x*gridDim.x);
}
}
__global__ void matrixInfNormPart1(const int n,
const int nnz,
const int* a_ia,
const double* a_val,
double* result) {

// one thread per row, pass through rows
// and sum
// can be done through atomics
//\sum_{j=1}^m abs(a_{ij})

int idx = blockIdx.x*blockDim.x + threadIdx.x;
while (idx < n){
double sum = 0.0f;
for (int i = a_ia[idx]; i < a_ia[idx+1]; ++i) {
sum = sum + fabs(a_val[i]);
}
result[idx] = sum;
idx += (blockDim.x*gridDim.x);
}
}


void mass_inner_product_two_vectors(int n,
int i,
double* vec1,
double* vec2,
double* mvec,
double* result)
{
hipLaunchKernelGGL(MassIPTwoVec_kernel, dim3(i + 1), dim3(1024), 0, 0, vec1, vec2, mvec, result, i + 1, n);
}
void mass_axpy(int n, int i, double* x, double* y, double* alpha)
{
hipLaunchKernelGGL(massAxpy3_kernel, dim3((n + 384 - 1) / 384), dim3(384), 0, 0, n, i, x, y, alpha);
}

void matrix_row_sums(int n,
int nnz,
int* a_ia,
double* a_val,
double* result)
{
hipLaunchKernelGGL(matrixInfNormPart1,dim3(1000),dim3(1024), 0, 0, n, nnz, a_ia, a_val, result);
}
29 changes: 14 additions & 15 deletions resolve/hip/hipVectorKernels.hip
Original file line number Diff line number Diff line change
@@ -1,29 +1,28 @@
#include <resolve/Common.hpp>
#include <resolve/vector/VectorKernels.hpp>

#include "hipVectorKernels.h"
#include <hip/hip_runtime.h>

namespace ReSolve { namespace vector {

namespace kernels {

__global__ void set_const(index_type n, real_type val, real_type* arr)
{
index_type i = blockIdx.x * blockDim.x + threadIdx.x;
if(i < n)
__global__ void set_const(index_type n, real_type val, real_type* arr)
{
arr[i] = val;
index_type i = blockIdx.x * blockDim.x + threadIdx.x;
while (i < n)
{
arr[i] = val;
i += blockDim.x * gridDim.x;
}
}
}

} // namespace kernels

void set_array_const(index_type n, real_type val, real_type* arr)
void set_array_const(index_type n, real_type val, real_type* arr)
{
index_type num_blocks;
index_type block_size = 512;
num_blocks = (n + block_size - 1) / block_size;
kernels::set_const<<<num_blocks, block_size>>>(n, val, arr);
index_type num_blocks;
index_type block_size = 512;
num_blocks = (n + block_size - 1) / block_size;
hipLaunchKernelGGL( kernels::set_const, dim3(num_blocks), dim3(block_size), 0, 0, n, val, arr);
}

}} // namespace ReSolve::vector
}} // namespace ReSolve::vector
13 changes: 13 additions & 0 deletions resolve/matrix/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@ set(Matrix_CUDASDK_SRC
MatrixHandlerCuda.cpp
)

# and on HIP
set(Matrix_ROCM_SRC
MatrixHandlerHip.cpp
)

# Header files to be installed
set(Matrix_HEADER_INSTALL
io.hpp
Expand All @@ -37,6 +42,10 @@ if(RESOLVE_USE_CUDA)
set(Matrix_SRC ${Matrix_SRC} ${Matrix_CUDASDK_SRC})
endif()

if(RESOLVE_USE_HIP)
set(Matrix_SRC ${Matrix_SRC} ${Matrix_ROCM_SRC})
endif()


# Build shared library ReSolve::matrix
add_library(resolve_matrix SHARED ${Matrix_SRC})
Expand All @@ -47,6 +56,10 @@ if (RESOLVE_USE_CUDA)
target_link_libraries(resolve_matrix PUBLIC resolve_backend_cuda)
endif()

if (RESOLVE_USE_HIP)
target_link_libraries(resolve_matrix PUBLIC resolve_backend_hip)
endif()

# Link to dummy device backend if GPU support is not enabled
if (NOT RESOLVE_USE_GPU)
target_link_libraries(resolve_matrix PUBLIC resolve_backend_cpu)
Expand Down
30 changes: 15 additions & 15 deletions resolve/matrix/Coo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ namespace ReSolve
copyData("cpu");
return this->h_row_data_;
} else {
if (memspace == "cuda") {
copyData("cuda");
if ((memspace == "cuda") || (memspace == "hip")) {
copyData(memspace);
return this->d_row_data_;
} else {
return nullptr;
Expand All @@ -48,8 +48,8 @@ namespace ReSolve
copyData("cpu");
return this->h_col_data_;
} else {
if (memspace == "cuda") {
copyData("cuda");
if ((memspace == "cuda") || (memspace == "hip")) {
copyData(memspace);
return this->d_col_data_;
} else {
return nullptr;
Expand All @@ -63,8 +63,8 @@ namespace ReSolve
copyData("cpu");
return this->h_val_data_;
} else {
if (memspace == "cuda") {
copyData("cuda");
if ((memspace == "cuda") || (memspace == "hip")) {
copyData(memspace);
return this->d_val_data_;
} else {
return nullptr;
Expand All @@ -81,9 +81,9 @@ namespace ReSolve
setNotUpdated();
int control=-1;
if ((memspaceIn == "cpu") && (memspaceOut == "cpu")){ control = 0;}
if ((memspaceIn == "cpu") && (memspaceOut == "cuda")){ control = 1;}
if ((memspaceIn == "cuda") && (memspaceOut == "cpu")){ control = 2;}
if ((memspaceIn == "cuda") && (memspaceOut == "cuda")){ control = 3;}
if ((memspaceIn == "cpu") && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 1;}
if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && (memspaceOut == "cpu")){ control = 2;}
if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 3;}

if (memspaceOut == "cpu") {
//check if cpu data allocated
Expand All @@ -98,7 +98,7 @@ namespace ReSolve
}
}

if (memspaceOut == "cuda") {
if ((memspaceOut == "cuda") || (memspaceOut == "hip")) {
//check if cuda data allocated
if (d_row_data_ == nullptr) {
mem_.allocateArrayOnDevice(&d_row_data_, nnz_current);
Expand All @@ -120,23 +120,23 @@ namespace ReSolve
owns_cpu_data_ = true;
owns_cpu_vals_ = true;
break;
case 2://cuda->cpu
case 2://gpu->cpu
mem_.copyArrayDeviceToHost(h_row_data_, row_data, nnz_current);
mem_.copyArrayDeviceToHost(h_col_data_, col_data, nnz_current);
mem_.copyArrayDeviceToHost(h_val_data_, val_data, nnz_current);
h_data_updated_ = true;
owns_cpu_data_ = true;
owns_cpu_vals_ = true;
break;
case 1://cpu->cuda
case 1://cpu->gpu
mem_.copyArrayHostToDevice(d_row_data_, row_data, nnz_current);
mem_.copyArrayHostToDevice(d_col_data_, col_data, nnz_current);
mem_.copyArrayHostToDevice(d_val_data_, val_data, nnz_current);
d_data_updated_ = true;
owns_gpu_data_ = true;
owns_gpu_vals_ = true;
break;
case 3://cuda->cuda
case 3://gpu->gpua
mem_.copyArrayDeviceToDevice(d_row_data_, row_data, nnz_current);
mem_.copyArrayDeviceToDevice(d_col_data_, col_data, nnz_current);
mem_.copyArrayDeviceToDevice(d_val_data_, val_data, nnz_current);
Expand Down Expand Up @@ -176,7 +176,7 @@ namespace ReSolve
return 0;
}

if (memspace == "cuda") {
if ((memspace == "cuda") || (memspace == "hip")) {
mem_.allocateArrayOnDevice(&d_row_data_, nnz_current);
mem_.allocateArrayOnDevice(&d_col_data_, nnz_current);
mem_.allocateArrayOnDevice(&d_val_data_, nnz_current);
Expand Down Expand Up @@ -215,7 +215,7 @@ namespace ReSolve
return 0;
}

if (memspaceOut == "cuda") {
if ((memspaceOut == "cuda") || (memspaceOut == "hip")) {
if ((d_data_updated_ == false) && (h_data_updated_ == true)) {
if (d_row_data_ == nullptr) {
mem_.allocateArrayOnDevice(&d_row_data_, nnz_current);
Expand Down
Loading

0 comments on commit c0408b2

Please sign in to comment.