Fixing minor leftovers from Frontier hackathon (#69)

* Minor fixes: removing redundant calls and fixing code style.
* Fix memory leak in GS class.
ORNL · Nov 8, 2023 · 128ad65 · 128ad65
1 parent fcc0d67
commit 128ad65
Show file tree

Hide file tree

Showing 4 changed files with 43 additions and 53 deletions.
diff --git a/resolve/GramSchmidt.cpp b/resolve/GramSchmidt.cpp
@@ -36,32 +36,20 @@ namespace ReSolve
         delete h_L_;    
         delete h_rv_;    
 
-        vec_rv_->setData(nullptr, memory::DEVICE);
-        vec_rv_->setData(nullptr, memory::HOST);
-        vec_Hcolumn_->setData(nullptr, memory::DEVICE);
-        vec_Hcolumn_->setData(nullptr, memory::HOST);
-
-        delete [] vec_rv_;    
-        delete [] vec_Hcolumn_;;    
+        delete vec_rv_;    
+        delete vec_Hcolumn_;;    
       }
 
       if(variant_ == cgs2) {
         delete h_aux_;
-        vec_Hcolumn_->setData(nullptr, memory::DEVICE);
-        //        vec_Hcolumn_->setData(nullptr, memory::HOST);
-        delete [] vec_Hcolumn_;    
+        delete vec_Hcolumn_;    
       }    
       if(variant_ == mgs_pm) {
         delete h_aux_;
       }
 
-      vec_v_->setData(nullptr, memory::DEVICE);
-      vec_v_->setData(nullptr, memory::HOST);
-      vec_w_->setData(nullptr, memory::DEVICE);
-      vec_w_->setData(nullptr, memory::HOST);
-
-      delete [] vec_w_;
-      delete [] vec_v_;   
+      delete vec_w_;
+      delete vec_v_;   
     }
   }
 

diff --git a/resolve/hip/hipKernels.hip b/resolve/hip/hipKernels.hip
@@ -1,10 +1,9 @@
 #include "hipKernels.h"
-#define maxk 1024
-#define Tv5 1024
 
 #include <hip/hip_runtime.h>
 
 //computes V^T[u1 u2] where v is n x k and u1 and u2 are nx1
+template <size_t Tv5 = 1024>
 __global__ void MassIPTwoVec_kernel(const double* __restrict__ u1, 
                                     const double* __restrict__ u2, 
                                     const double* __restrict__ v, 
@@ -21,12 +20,12 @@ __global__ void MassIPTwoVec_kernel(const double* __restrict__ u1,
   volatile __shared__ double s_tmp2[Tv5];
   // map between thread index space and the problem index space
   int j = blockIdx.x;
-  s_tmp1[t] = 0.0f;
-  s_tmp2[t] = 0.0f;
+  s_tmp1[t] = 0.0;
+  s_tmp2[t] = 0.0;
   int nn = t;
   double can1, can2, cbn;
 
-  while(nn < N) {
+  while (nn < N) {
     can1 = u1[nn];
     can2 = u2[nn];
 
@@ -39,36 +38,36 @@ __global__ void MassIPTwoVec_kernel(const double* __restrict__ u1,
 
   __syncthreads();
 
-  if(Tv5 >= 1024) {
+  if (Tv5 >= 1024) {
     if(t < 512) {
       s_tmp1[t] += s_tmp1[t + 512];
       s_tmp2[t] += s_tmp2[t + 512];
     }
     __syncthreads();
   }
-  if(Tv5 >= 512) {
+  if (Tv5 >= 512) {
     if(t < 256) {
       s_tmp1[t] += s_tmp1[t + 256];
       s_tmp2[t] += s_tmp2[t + 256];
     }
     __syncthreads();
   }
   {
-    if(t < 128) {
+    if (t < 128) {
       s_tmp1[t] += s_tmp1[t + 128];
       s_tmp2[t] += s_tmp2[t + 128];
     }
     __syncthreads();
   }
   {
-    if(t < 64) {
+    if (t < 64) {
       s_tmp1[t] += s_tmp1[t + 64];
       s_tmp2[t] += s_tmp2[t + 64];
     }
     __syncthreads();
   }
 
-  if(t < 32) {
+  if (t < 32) {
     s_tmp1[t] += s_tmp1[t + 32];
     s_tmp2[t] += s_tmp2[t + 32];
 
@@ -87,31 +86,32 @@ __global__ void MassIPTwoVec_kernel(const double* __restrict__ u1,
     s_tmp1[t] += s_tmp1[t + 1];
     s_tmp2[t] += s_tmp2[t + 1];
   }
-  if(t == 0) {
+  if (t == 0) {
     result[blockIdx.x] = s_tmp1[0];
     result[blockIdx.x + k] = s_tmp2[0];
   }
 }
 
 
-//mass AXPY i.e y = y - x*alpha where alpha is [k x 1], needed in 1 and 2 synch GMRES
-
+/// mass AXPY i.e y = y - x*alpha where alpha is [k x 1], needed in 1 and 2 synch GMRES
+template <size_t Tmaxk = 1024>
 __global__ void massAxpy3_kernel(int N,
                                  int k,
                                  const double* x_data,
                                  double* y_data,
-                                 const double* alpha) {
+                                 const double* alpha)
+{
 
   unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
 
   unsigned int t = threadIdx.x;
 
-  __shared__ double s_alpha[maxk];
-  if(t < k) {
+  __shared__ double s_alpha[Tmaxk];
+  if (t < k) {
     s_alpha[t] = alpha[t];
   }
   __syncthreads();
-  while (i < N){
+  while (i < N) {
     double temp = 0.0;
     for(int j = 0; j < k; ++j) {
       temp += x_data[j * N + i] * s_alpha[j];
@@ -124,16 +124,16 @@ __global__ void matrixInfNormPart1(const int n,
                                    const int nnz, 
                                    const int* a_ia,
                                    const double* a_val, 
-                                   double* result) {
-
+                                   double* result)
+{
   // one thread per row, pass through rows
   // and sum
   // can be done through atomics
   //\sum_{j=1}^m abs(a_{ij})
 
   int idx = blockIdx.x*blockDim.x + threadIdx.x;
-  while (idx < n){
-    double sum = 0.0f;
+  while (idx < n) {
+    double sum = 0.0;
     for (int i = a_ia[idx]; i < a_ia[idx+1]; ++i) {
       sum = sum + fabs(a_val[i]);
     }
@@ -146,12 +146,11 @@ __global__ void matrixInfNormPart1(const int n,
 __global__ void permuteVectorP_kernel(const int n, 
                                       const int* perm_vector,
                                       const double* vec_in, 
-                                      double* vec_out){
-
+                                      double* vec_out)
+{
   //one thread per vector entry, pass through rows
-
   int idx = blockIdx.x*blockDim.x + threadIdx.x;
-  while (idx<n){
+  while (idx<n) {
     vec_out[idx] = vec_in[perm_vector[idx]];
     idx+= (blockDim.x*gridDim.x);
   }
@@ -160,12 +159,11 @@ __global__ void permuteVectorP_kernel(const int n,
 __global__ void permuteVectorQ_kernel(const int n, 
                                       const int* perm_vector,
                                       const double* vec_in, 
-                                      double* vec_out){
-
+                                      double* vec_out)
+{
   //one thread per vector entry, pass through rows
-
   int idx = blockIdx.x*blockDim.x + threadIdx.x;
-  while (idx<n){
+  while (idx<n) {
     vec_out[perm_vector[idx]] = vec_in[idx];
     idx+= (blockDim.x*gridDim.x);
   }
@@ -180,6 +178,7 @@ void mass_inner_product_two_vectors(int n,
 {
   hipLaunchKernelGGL(MassIPTwoVec_kernel, dim3(i + 1), dim3(1024), 0, 0, vec1, vec2, mvec, result, i + 1, n);
 }
+
 void mass_axpy(int n, int i, double* x, double* y, double* alpha)
 {
   hipLaunchKernelGGL(massAxpy3_kernel, dim3((n + 384 - 1) / 384), dim3(384), 0, 0, n, i, x, y, alpha);
@@ -191,21 +190,21 @@ void matrix_row_sums(int n,
                      double* a_val, 
                      double* result)
 {
-  hipLaunchKernelGGL(matrixInfNormPart1,dim3(1000),dim3(1024), 0, 0, n, nnz, a_ia, a_val, result);
+  hipLaunchKernelGGL(matrixInfNormPart1, dim3(1000), dim3(1024), 0, 0, n, nnz, a_ia, a_val, result);
 }
 
 void permuteVectorP(int n, 
                     int* perm_vector,
                     double* vec_in, 
                     double* vec_out)
 {
-  hipLaunchKernelGGL(permuteVectorP_kernel,dim3(1000), dim3(1024), 0, 0,n, perm_vector,vec_in, vec_out);
+  hipLaunchKernelGGL(permuteVectorP_kernel, dim3(1000), dim3(1024), 0, 0, n, perm_vector, vec_in, vec_out);
 }
 
 void permuteVectorQ(int n, 
                     int* perm_vector,
                     double* vec_in, 
                     double* vec_out)
 {
-  hipLaunchKernelGGL(permuteVectorQ_kernel,dim3(1000), dim3(1024), 0, 0,n, perm_vector,vec_in, vec_out);
+  hipLaunchKernelGGL(permuteVectorQ_kernel, dim3(1000), dim3(1024), 0, 0, n, perm_vector, vec_in, vec_out);
 }
diff --git a/resolve/matrix/MatrixHandler.cpp b/resolve/matrix/MatrixHandler.cpp
@@ -42,7 +42,7 @@ namespace ReSolve {
    */
   MatrixHandler::~MatrixHandler()
   {
-    if (isCpuEnabled_)  delete cpuImpl_;
+    delete cpuImpl_;
     if (isCudaEnabled_) delete cudaImpl_;
     if (isHipEnabled_) delete hipImpl_;
   }

diff --git a/resolve/vector/VectorHandler.cpp b/resolve/vector/VectorHandler.cpp
@@ -61,11 +61,11 @@ namespace ReSolve {
    */
   VectorHandler::VectorHandler(LinAlgWorkspaceHIP* new_workspace)
   {
-    hipImpl_ = new VectorHandlerHip(new_workspace);
-    cpuImpl_  = new  VectorHandlerCpu();
+    hipImpl_  = new VectorHandlerHip(new_workspace);
+    cpuImpl_  = new VectorHandlerCpu();
 
     isHipEnabled_ = true;
-    isCpuEnabled_  = true;
+    isCpuEnabled_ = true;
   }
 #endif
 
@@ -74,6 +74,9 @@ namespace ReSolve {
    */
   VectorHandler::~VectorHandler()
   {
+    delete cpuImpl_;
+    if (isCudaEnabled_) delete cudaImpl_;
+    if (isHipEnabled_)  delete hipImpl_;
     //delete the workspace TODO
   }