Skip to content

Commit

Permalink
Fixing minor leftovers from Frontier hackathon (#69)
Browse files Browse the repository at this point in the history
* Minor fixes: removing redundant calls and fixing code style.

* Fix memory leak in GS class.
  • Loading branch information
pelesh authored Nov 8, 2023
1 parent fcc0d67 commit 128ad65
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 53 deletions.
22 changes: 5 additions & 17 deletions resolve/GramSchmidt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,32 +36,20 @@ namespace ReSolve
delete h_L_;
delete h_rv_;

vec_rv_->setData(nullptr, memory::DEVICE);
vec_rv_->setData(nullptr, memory::HOST);
vec_Hcolumn_->setData(nullptr, memory::DEVICE);
vec_Hcolumn_->setData(nullptr, memory::HOST);

delete [] vec_rv_;
delete [] vec_Hcolumn_;;
delete vec_rv_;
delete vec_Hcolumn_;;
}

if(variant_ == cgs2) {
delete h_aux_;
vec_Hcolumn_->setData(nullptr, memory::DEVICE);
// vec_Hcolumn_->setData(nullptr, memory::HOST);
delete [] vec_Hcolumn_;
delete vec_Hcolumn_;
}
if(variant_ == mgs_pm) {
delete h_aux_;
}

vec_v_->setData(nullptr, memory::DEVICE);
vec_v_->setData(nullptr, memory::HOST);
vec_w_->setData(nullptr, memory::DEVICE);
vec_w_->setData(nullptr, memory::HOST);

delete [] vec_w_;
delete [] vec_v_;
delete vec_w_;
delete vec_v_;
}
}

Expand Down
63 changes: 31 additions & 32 deletions resolve/hip/hipKernels.hip
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
#include "hipKernels.h"
#define maxk 1024
#define Tv5 1024

#include <hip/hip_runtime.h>

// Computes V^T [u1 u2], where V (argument v) is n x k and u1 and u2 are each n x 1
template <size_t Tv5 = 1024>
__global__ void MassIPTwoVec_kernel(const double* __restrict__ u1,
const double* __restrict__ u2,
const double* __restrict__ v,
Expand All @@ -21,12 +20,12 @@ __global__ void MassIPTwoVec_kernel(const double* __restrict__ u1,
volatile __shared__ double s_tmp2[Tv5];
// map between thread index space and the problem index space
int j = blockIdx.x;
s_tmp1[t] = 0.0f;
s_tmp2[t] = 0.0f;
s_tmp1[t] = 0.0;
s_tmp2[t] = 0.0;
int nn = t;
double can1, can2, cbn;

while(nn < N) {
while (nn < N) {
can1 = u1[nn];
can2 = u2[nn];

Expand All @@ -39,36 +38,36 @@ __global__ void MassIPTwoVec_kernel(const double* __restrict__ u1,

__syncthreads();

if(Tv5 >= 1024) {
if (Tv5 >= 1024) {
if(t < 512) {
s_tmp1[t] += s_tmp1[t + 512];
s_tmp2[t] += s_tmp2[t + 512];
}
__syncthreads();
}
if(Tv5 >= 512) {
if (Tv5 >= 512) {
if(t < 256) {
s_tmp1[t] += s_tmp1[t + 256];
s_tmp2[t] += s_tmp2[t + 256];
}
__syncthreads();
}
{
if(t < 128) {
if (t < 128) {
s_tmp1[t] += s_tmp1[t + 128];
s_tmp2[t] += s_tmp2[t + 128];
}
__syncthreads();
}
{
if(t < 64) {
if (t < 64) {
s_tmp1[t] += s_tmp1[t + 64];
s_tmp2[t] += s_tmp2[t + 64];
}
__syncthreads();
}

if(t < 32) {
if (t < 32) {
s_tmp1[t] += s_tmp1[t + 32];
s_tmp2[t] += s_tmp2[t + 32];

Expand All @@ -87,31 +86,32 @@ __global__ void MassIPTwoVec_kernel(const double* __restrict__ u1,
s_tmp1[t] += s_tmp1[t + 1];
s_tmp2[t] += s_tmp2[t + 1];
}
if(t == 0) {
if (t == 0) {
result[blockIdx.x] = s_tmp1[0];
result[blockIdx.x + k] = s_tmp2[0];
}
}


//mass AXPY i.e y = y - x*alpha where alpha is [k x 1], needed in 1 and 2 synch GMRES

/// Mass AXPY, i.e. y = y - x*alpha, where alpha is [k x 1]; needed in one-sync and two-sync GMRES
template <size_t Tmaxk = 1024>
__global__ void massAxpy3_kernel(int N,
int k,
const double* x_data,
double* y_data,
const double* alpha) {
const double* alpha)
{

unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

unsigned int t = threadIdx.x;

__shared__ double s_alpha[maxk];
if(t < k) {
__shared__ double s_alpha[Tmaxk];
if (t < k) {
s_alpha[t] = alpha[t];
}
__syncthreads();
while (i < N){
while (i < N) {
double temp = 0.0;
for(int j = 0; j < k; ++j) {
temp += x_data[j * N + i] * s_alpha[j];
Expand All @@ -124,16 +124,16 @@ __global__ void matrixInfNormPart1(const int n,
const int nnz,
const int* a_ia,
const double* a_val,
double* result) {

double* result)
{
// one thread per row, pass through rows
// and sum
// can be done through atomics
//\sum_{j=1}^m abs(a_{ij})

int idx = blockIdx.x*blockDim.x + threadIdx.x;
while (idx < n){
double sum = 0.0f;
while (idx < n) {
double sum = 0.0;
for (int i = a_ia[idx]; i < a_ia[idx+1]; ++i) {
sum = sum + fabs(a_val[i]);
}
Expand All @@ -146,12 +146,11 @@ __global__ void matrixInfNormPart1(const int n,
__global__ void permuteVectorP_kernel(const int n,
const int* perm_vector,
const double* vec_in,
double* vec_out){

double* vec_out)
{
//one thread per vector entry, pass through rows

int idx = blockIdx.x*blockDim.x + threadIdx.x;
while (idx<n){
while (idx<n) {
vec_out[idx] = vec_in[perm_vector[idx]];
idx+= (blockDim.x*gridDim.x);
}
Expand All @@ -160,12 +159,11 @@ __global__ void permuteVectorP_kernel(const int n,
__global__ void permuteVectorQ_kernel(const int n,
const int* perm_vector,
const double* vec_in,
double* vec_out){

double* vec_out)
{
//one thread per vector entry, pass through rows

int idx = blockIdx.x*blockDim.x + threadIdx.x;
while (idx<n){
while (idx<n) {
vec_out[perm_vector[idx]] = vec_in[idx];
idx+= (blockDim.x*gridDim.x);
}
Expand All @@ -180,6 +178,7 @@ void mass_inner_product_two_vectors(int n,
{
hipLaunchKernelGGL(MassIPTwoVec_kernel, dim3(i + 1), dim3(1024), 0, 0, vec1, vec2, mvec, result, i + 1, n);
}

void mass_axpy(int n, int i, double* x, double* y, double* alpha)
{
hipLaunchKernelGGL(massAxpy3_kernel, dim3((n + 384 - 1) / 384), dim3(384), 0, 0, n, i, x, y, alpha);
Expand All @@ -191,21 +190,21 @@ void matrix_row_sums(int n,
double* a_val,
double* result)
{
hipLaunchKernelGGL(matrixInfNormPart1,dim3(1000),dim3(1024), 0, 0, n, nnz, a_ia, a_val, result);
hipLaunchKernelGGL(matrixInfNormPart1, dim3(1000), dim3(1024), 0, 0, n, nnz, a_ia, a_val, result);
}

// Host-side launcher for permuteVectorP_kernel, which writes
// vec_out[idx] = vec_in[perm_vector[idx]] for every idx in [0, n).
//
// n           - number of vector entries to process
// perm_vector - permutation indices read by the kernel
// vec_in      - input vector
// vec_out     - output vector
//
// The launch uses a fixed configuration of 1000 blocks of 1024 threads,
// 0 bytes of dynamic shared memory and stream 0. The kernel advances idx by
// blockDim.x*gridDim.x per iteration, so this fixed size does not limit n.
// No error check follows the launch.
// NOTE(review): the pointers are dereferenced inside the kernel, so they are
// presumably device-accessible buffers of at least n entries - confirm.
// NOTE(review): the two launch lines below look like the before/after
// formatting variants of a single call as shown in a diff view; only one of
// them should be present in the actual source file.
void permuteVectorP(int n,
int* perm_vector,
double* vec_in,
double* vec_out)
{
hipLaunchKernelGGL(permuteVectorP_kernel,dim3(1000), dim3(1024), 0, 0,n, perm_vector,vec_in, vec_out);
hipLaunchKernelGGL(permuteVectorP_kernel, dim3(1000), dim3(1024), 0, 0, n, perm_vector, vec_in, vec_out);
}

// Host-side launcher for permuteVectorQ_kernel, which writes
// vec_out[perm_vector[idx]] = vec_in[idx] for every idx in [0, n).
//
// n           - number of vector entries to process
// perm_vector - permutation indices read by the kernel
// vec_in      - input vector
// vec_out     - output vector
//
// The launch uses a fixed configuration of 1000 blocks of 1024 threads,
// 0 bytes of dynamic shared memory and stream 0. The kernel advances idx by
// blockDim.x*gridDim.x per iteration, so this fixed size does not limit n.
// No error check follows the launch.
// NOTE(review): the pointers are dereferenced inside the kernel, so they are
// presumably device-accessible buffers of at least n entries - confirm.
// NOTE(review): the two launch lines below look like the before/after
// formatting variants of a single call as shown in a diff view; only one of
// them should be present in the actual source file.
void permuteVectorQ(int n,
int* perm_vector,
double* vec_in,
double* vec_out)
{
hipLaunchKernelGGL(permuteVectorQ_kernel,dim3(1000), dim3(1024), 0, 0,n, perm_vector,vec_in, vec_out);
hipLaunchKernelGGL(permuteVectorQ_kernel, dim3(1000), dim3(1024), 0, 0, n, perm_vector, vec_in, vec_out);
}
2 changes: 1 addition & 1 deletion resolve/matrix/MatrixHandler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ namespace ReSolve {
*/
MatrixHandler::~MatrixHandler()
{
if (isCpuEnabled_) delete cpuImpl_;
delete cpuImpl_;
if (isCudaEnabled_) delete cudaImpl_;
if (isHipEnabled_) delete hipImpl_;
}
Expand Down
9 changes: 6 additions & 3 deletions resolve/vector/VectorHandler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,11 +61,11 @@ namespace ReSolve {
*/
VectorHandler::VectorHandler(LinAlgWorkspaceHIP* new_workspace)
{
hipImpl_ = new VectorHandlerHip(new_workspace);
cpuImpl_ = new VectorHandlerCpu();
hipImpl_ = new VectorHandlerHip(new_workspace);
cpuImpl_ = new VectorHandlerCpu();

isHipEnabled_ = true;
isCpuEnabled_ = true;
isCpuEnabled_ = true;
}
#endif

Expand All @@ -74,6 +74,9 @@ namespace ReSolve {
*/
VectorHandler::~VectorHandler()
{
// The CPU implementation is released unconditionally.
// NOTE(review): presumably every constructor allocates cpuImpl_ (the HIP
// constructor does so with new) - confirm, since deleting an uninitialized
// pointer would be undefined behavior.
delete cpuImpl_;
// The device implementations are released only when their enable flag is set.
if (isCudaEnabled_) delete cudaImpl_;
if (isHipEnabled_) delete hipImpl_;
// TODO: delete the workspace (it is not released here).
}

Expand Down

0 comments on commit 128ad65

Please sign in to comment.