diff --git a/resolve/cuda/CMakeLists.txt b/resolve/cuda/CMakeLists.txt
index 225ea3c6..a22fb2f1 100644
--- a/resolve/cuda/CMakeLists.txt
+++ b/resolve/cuda/CMakeLists.txt
@@ -8,13 +8,12 @@ set(ReSolve_CUDA_SRC
   cudaKernels.cu
-  cudaVectorKernels.cu
+  VectorKernels.cu
   MemoryUtils.cu
 )
 
 set(ReSolve_CUDA_HEADER_INSTALL
   cudaKernels.h
-  cudaVectorKernels.h
   CudaMemory.hpp
   cuda_check_errors.hpp
 )
diff --git a/resolve/cuda/VectorKernels.cu b/resolve/cuda/VectorKernels.cu
new file mode 100644
index 00000000..c10a4a7a
--- /dev/null
+++ b/resolve/cuda/VectorKernels.cu
@@ -0,0 +1,47 @@
+/**
+ * @file VectorKernels.cu
+ * @author Slaven Peles (peless@ornl.gov)
+ * @brief Contains implementation of CUDA vector kernels and their wrappers.
+ * @date 2023-12-08
+ *
+ * @note Kernel wrappers implemented here are intended for use in hardware
+ * agnostic code.
+ */
+
+#include
+
+
+namespace ReSolve { namespace vector {
+
+  namespace kernels {
+
+    /**
+     * @brief CUDA kernel that sets values of an array to a constant.
+     *
+     * @param[in] n - length of the array
+     * @param[in] val - the value the array is set to
+     * @param[out] arr - a pointer to the array
+     *
+     * @pre `arr` is allocated to size `n`
+     * @post `arr` elements are set to `val`
+     */
+    __global__ void set_const(index_type n, real_type val, real_type* arr)
+    {
+      index_type i = blockIdx.x * blockDim.x + threadIdx.x;
+      if(i < n)
+      {
+        arr[i] = val;
+      }
+    }
+
+  } // namespace kernels
+
+  void set_array_const(index_type n, real_type val, real_type* arr)
+  {
+    index_type num_blocks;
+    index_type block_size = 512;
+    num_blocks = (n + block_size - 1) / block_size;
+    kernels::set_const<<<num_blocks, block_size>>>(n, val, arr);
+  }
+
+}} // namespace ReSolve::vector
\ No newline at end of file
diff --git a/resolve/cuda/cudaKernels.cu b/resolve/cuda/cudaKernels.cu
index 4773464c..fe88bb91 100644
--- a/resolve/cuda/cudaKernels.cu
+++ b/resolve/cuda/cudaKernels.cu
@@ -1,372 +1,528 @@
+/**
+ * @file cudaKernels.cu
+ * @author Kasia Swirydowicz (kasia.swirydowicz@pnnl.gov)
+ * @brief
+ * @date 2023-12-08
+ *
+ *
+ */
+
 #include "cudaKernels.h"
 #include
-#define maxk 1024
-#define Tv5 1024
-//computes V^T[u1 u2] where v is n x k and u1 and u2 are nx1
-__global__ void MassIPTwoVec_kernel(const double* __restrict__ u1,
-                                    const double* __restrict__ u2,
-                                    const double* __restrict__ v,
-                                    double* result,
-                                    const int k,
-                                    const int N)
-{
-  int t = threadIdx.x;
-  int bsize = blockDim.x;
-
-  // assume T threads per thread block (and k reductions to be performed)
-  volatile __shared__ double s_tmp1[Tv5];
-
-  volatile __shared__ double s_tmp2[Tv5];
-  // map between thread index space and the problem index space
-  int j = blockIdx.x;
-  s_tmp1[t] = 0.0f;
-  s_tmp2[t] = 0.0f;
-  int nn = t;
-  double can1, can2, cbn;
-
-  while(nn < N) {
-    can1 = u1[nn];
-    can2 = u2[nn];
-
-    cbn = v[N * j + nn];
-    s_tmp1[t] += can1 * cbn;
-    s_tmp2[t] += can2 * cbn;
-
-    nn += bsize;
-  }
-  __syncthreads();
-  if(Tv5 >= 1024) {
-    if(t < 512) {
-      s_tmp1[t] += s_tmp1[t + 512];
-      s_tmp2[t] += s_tmp2[t + 512];
-    }
-    __syncthreads();
-  }
-  if(Tv5 >= 512) {
-    if(t < 256) {
-      s_tmp1[t] += s_tmp1[t + 256];
-      s_tmp2[t] += s_tmp2[t + 256];
+namespace ReSolve {
+  namespace kernels {
+
+    /**
+     * @brief Computes v^T * [u1 u2] where v is n x k multivector
+     * and u1 and u2 are n x 1 vectors.
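// Reference reading of the kernel that follows (an annotation, assuming v is
// stored column-major with leading dimension N): thread block j computes the
// two dot products against column j of v,
//   result[j]     = sum_{i=0..N-1} v[j*N + i] * u1[i],   j = 0, ..., k-1
//   result[j + k] = sum_{i=0..N-1} v[j*N + i] * u2[i],
// with each block's partial sums combined by the shared-memory tree reduction below.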
+ * + * @tparam Tv5 - Size of shared memory + * + * @param[in] u1 - (n x 1) vector + * @param[in] u2 - (n x 1) vector + * @param[in] v - (n x k) multivector + * @param[out] result - (k x 2) multivector + * @param[in] k - dimension of the subspace + * @param[in] N - size of vectors u1, u2 + */ + template + __global__ void MassIPTwoVec(const real_type* __restrict__ u1, + const real_type* __restrict__ u2, + const real_type* __restrict__ v, + real_type* result, + const index_type k, + const index_type N) + { + index_type t = threadIdx.x; + index_type bsize = blockDim.x; + + // assume T threads per thread block (and k reductions to be performed) + volatile __shared__ real_type s_tmp1[Tv5]; + volatile __shared__ real_type s_tmp2[Tv5]; + + // map between thread index space and the problem index space + index_type j = blockIdx.x; + s_tmp1[t] = 0.0; + s_tmp2[t] = 0.0; + index_type nn = t; + real_type can1, can2, cbn; + + while(nn < N) { + can1 = u1[nn]; + can2 = u2[nn]; + + cbn = v[N * j + nn]; + s_tmp1[t] += can1 * cbn; + s_tmp2[t] += can2 * cbn; + + nn += bsize; + } + + __syncthreads(); + + if(Tv5 >= 1024) { + if(t < 512) { + s_tmp1[t] += s_tmp1[t + 512]; + s_tmp2[t] += s_tmp2[t + 512]; + } + __syncthreads(); + } + if(Tv5 >= 512) { + if(t < 256) { + s_tmp1[t] += s_tmp1[t + 256]; + s_tmp2[t] += s_tmp2[t + 256]; + } + __syncthreads(); + } + { + if(t < 128) { + s_tmp1[t] += s_tmp1[t + 128]; + s_tmp2[t] += s_tmp2[t + 128]; + } + __syncthreads(); + } + { + if(t < 64) { + s_tmp1[t] += s_tmp1[t + 64]; + s_tmp2[t] += s_tmp2[t + 64]; + } + __syncthreads(); + } + + if(t < 32) { + s_tmp1[t] += s_tmp1[t + 32]; + s_tmp2[t] += s_tmp2[t + 32]; + + s_tmp1[t] += s_tmp1[t + 16]; + s_tmp2[t] += s_tmp2[t + 16]; + + s_tmp1[t] += s_tmp1[t + 8]; + s_tmp2[t] += s_tmp2[t + 8]; + + s_tmp1[t] += s_tmp1[t + 4]; + s_tmp2[t] += s_tmp2[t + 4]; + + s_tmp1[t] += s_tmp1[t + 2]; + s_tmp2[t] += s_tmp2[t + 2]; + + s_tmp1[t] += s_tmp1[t + 1]; + s_tmp2[t] += s_tmp2[t + 1]; + } + if(t == 0) { + result[blockIdx.x] = s_tmp1[0]; + result[blockIdx.x + k] = s_tmp2[0]; + } } - __syncthreads(); - } - { - if(t < 128) { - s_tmp1[t] += s_tmp1[t + 128]; - s_tmp2[t] += s_tmp2[t + 128]; - } - __syncthreads(); - } - { - if(t < 64) { - s_tmp1[t] += s_tmp1[t + 64]; - s_tmp2[t] += s_tmp2[t + 64]; - } - __syncthreads(); - } - if(t < 32) { - s_tmp1[t] += s_tmp1[t + 32]; - s_tmp2[t] += s_tmp2[t + 32]; - s_tmp1[t] += s_tmp1[t + 16]; - s_tmp2[t] += s_tmp2[t + 16]; + /** + * @brief AXPY y = y - x*alpha where alpha is [k x 1], and x is [N x k] needed in 1 and 2 synch GMRES + * + * @tparam Tmaxk + * + * @param[in] N - number of rows in x + * @param[in] k - number of columns in x + * @param[in] x_data - double array, size [N x k] + * @param[out] y_data - double array, size [N x 1] + * @param[in] alpha - doble array, size [k x 1] + */ + template + __global__ void massAxpy3(index_type N, + index_type k, + const real_type* x_data, + real_type* y_data, + const real_type* alpha) + { + index_type i = blockIdx.x * blockDim.x + threadIdx.x; + index_type t = threadIdx.x; + __shared__ real_type s_alpha[Tmaxk]; + + if(t < k) { + s_alpha[t] = alpha[t]; + } + __syncthreads(); + + if(i < N) { + real_type temp = 0.0; + for(index_type j = 0; j < k; ++j) { + temp += x_data[j * N + i] * s_alpha[j]; + } + y_data[i] -= temp; + } + } - s_tmp1[t] += s_tmp1[t + 8]; - s_tmp2[t] += s_tmp2[t + 8]; + /** + * @brief Pass through matrix rows and sum each as \sum_{j=1}^m abs(a_{ij}) + * + * @param[in] n - number of rows in the matrix. 
+ * @param[in] nnz - number of non-zero values in the matrix + * @param[in] a_ia - row pointers (CSR storage) + * @param[in] a_val - values (CSR storage) + * @param[out] result - array size [n x 1] containing sums of values in each row. + */ + __global__ void matrixInfNormPart1(const index_type n, + const index_type nnz, + const index_type* a_ia, + const real_type* a_val, + real_type* result) + { + index_type idx = blockIdx.x * blockDim.x + threadIdx.x; + while (idx < n) { + real_type sum = 0.0; + for (index_type i = a_ia[idx]; i < a_ia[idx + 1]; ++i) { + sum = sum + fabs(a_val[i]); + } + result[idx] = sum; + idx += (blockDim.x * gridDim.x); + } + } - s_tmp1[t] += s_tmp1[t + 4]; - s_tmp2[t] += s_tmp2[t + 4]; + /** + * @brief For count sketch in sketching random method + * + * @param[in] n - number of entries in input vector + * @param[in] k - number of entries in output vector (k < n) + * @param[in] labels - array size [n x 1] containing integers from 0 to k-1, assigned randomly. + * @param[in] flip - array size [n x 1] containing values `1` and `-1` + * @param[in] input - input vector, size [n x 1] + * @param[out] output - output vector, size [k x 1] (this vector must be allocated and initialized with `0`s prior to calling the kernel). + */ + __global__ void count_sketch(const index_type n, + const index_type k, + const index_type* labels, + const index_type* flip, + const real_type* input, + real_type* output) + { + index_type idx = blockIdx.x * blockDim.x + threadIdx.x; + while (idx < n) { + real_type val = input[idx]; + if (flip[idx] != 1) { + val *= -1.0; + } + atomicAdd(&output[labels[idx]], val); + idx += blockDim.x * gridDim.x; + } + } - s_tmp1[t] += s_tmp1[t + 2]; - s_tmp2[t] += s_tmp2[t + 2]; + /** + * @brief Walsh-Hadamard transform (select) + * + * @param[in] k - + * @param[in] perm - + * @param[in] input - + * @param[out] output - + */ + __global__ void select(const index_type k, + const index_type* perm, + const real_type* input, + real_type* output){ + + index_type idx = blockIdx.x * blockDim.x + threadIdx.x; + while (idx < k) { + output[idx] = input[perm[idx]]; + idx += blockDim.x * gridDim.x; + } + } - s_tmp1[t] += s_tmp1[t + 1]; - s_tmp2[t] += s_tmp2[t + 1]; - } - if(t == 0) { - result[blockIdx.x] = s_tmp1[0]; - result[blockIdx.x + k] = s_tmp2[0]; - } -} + /** + * @brief Walsh-Hadamard transform (scale) + * + * @param[in] n - + * @param[in] D - + * @param[in] x - + * @param[out] y - + */ + __global__ void scaleByD(const index_type n, + const index_type* D, + const real_type* x, + real_type* y) + { + index_type idx = blockIdx.x * blockDim.x + threadIdx.x; + + while (idx < n) { + + if (D[idx] == 1) { + y[idx] = x[idx]; + } else { + y[idx] = (-1.0) * x[idx]; + } + + idx += blockDim.x * gridDim.x; + } + } + /** + * @brief Single in-global memory radix-4 Fast Walsh Transform pass + * (for strides exceeding elementary vector size). 
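// Worked example of the index pattern used below (illustrative): with stride = 4
// and pos = 5, lo = 5 & 3 = 1 and i0 = ((5 - 1) << 2) + 1 = 17, so one thread
// updates elements 17, 21, 25, 29. The update is the 4-point Hadamard butterfly:
//   out[i0] = D0 + D1 + D2 + D3,   out[i1] = D0 - D1 + D2 - D3,
//   out[i2] = D0 + D1 - D2 - D3,   out[i3] = D0 - D1 - D2 + D3.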
+ * + * @param d_Output - + * @param d_Input - + * @param stride - + */ + __global__ void fwtBatch2Kernel(real_type* d_Output, real_type* d_Input, index_type stride) + { + const index_type pos = blockIdx.x * blockDim.x + threadIdx.x; + const index_type N = blockDim.x * gridDim.x * 4; + + real_type* d_Src = d_Input + blockIdx.y * N; + real_type* d_Dst = d_Output + blockIdx.y * N; + + index_type lo = pos & (stride - 1); + index_type i0 = ((pos - lo) << 2) + lo; + index_type i1 = i0 + stride; + index_type i2 = i1 + stride; + index_type i3 = i2 + stride; + + real_type D0 = d_Src[i0]; + real_type D1 = d_Src[i1]; + real_type D2 = d_Src[i2]; + real_type D3 = d_Src[i3]; + + real_type T; + T = D0; + D0 = D0 + D2; + D2 = T - D2; + T = D1; + D1 = D1 + D3; + D3 = T - D3; + T = D0; + d_Dst[i0] = D0 + D1; + d_Dst[i1] = T - D1; + T = D2; + d_Dst[i2] = D2 + D3; + d_Dst[i3] = T - D3; + } -//mass AXPY i.e y = y - x*alpha where alpha is [k x 1], needed in 1 and 2 synch GMRES -__global__ void massAxpy3_kernel(int N, - int k, - const double* x_data, - double* y_data, - const double* alpha) { + /** + * @brief + * + * @param d_Output - + * @param d_Input - + * @param log2N - + * + * @todo `d_Input` should be `const` parameter. + * + */ + __global__ void fwtBatch1Kernel(real_type* d_Output, real_type* d_Input, index_type log2N) + { + // Handle to thread block group + + cooperative_groups::thread_block cta = cooperative_groups::this_thread_block(); + const index_type N = 1 << log2N; + const index_type base = blockIdx.x << log2N; + + //(2 ** 11) * 4 bytes == 8KB -- maximum s_data[] size for G80 + extern __shared__ real_type s_data[]; + real_type* d_Src = d_Input + base; + real_type* d_Dst = d_Output + base; + + for (index_type pos = threadIdx.x; pos < N; pos += blockDim.x) { + s_data[pos] = d_Src[pos]; + } + + // Main radix-4 stages + const index_type pos = threadIdx.x; + + for (index_type stride = N >> 2; stride > 0; stride >>= 2) { + index_type lo = pos & (stride - 1); + index_type i0 = ((pos - lo) << 2) + lo; + index_type i1 = i0 + stride; + index_type i2 = i1 + stride; + index_type i3 = i2 + stride; + + cooperative_groups::sync(cta); + real_type D0 = s_data[i0]; + real_type D1 = s_data[i1]; + real_type D2 = s_data[i2]; + real_type D3 = s_data[i3]; + + real_type T; + T = D0; + D0 = D0 + D2; + D2 = T - D2; + T = D1; + D1 = D1 + D3; + D3 = T - D3; + T = D0; + s_data[i0] = D0 + D1; + s_data[i1] = T - D1; + T = D2; + s_data[i2] = D2 + D3; + s_data[i3] = T - D3; + } + + // Do single radix-2 stage for odd power of two + if (log2N & 1) { + + cooperative_groups::sync(cta); + + for (index_type pos = threadIdx.x; pos < N / 2; pos += blockDim.x) { + index_type i0 = pos << 1; + index_type i1 = i0 + 1; + + real_type D0 = s_data[i0]; + real_type D1 = s_data[i1]; + s_data[i0] = D0 + D1; + s_data[i1] = D0 - D1; + } + } + + cooperative_groups::sync(cta); + + for (index_type pos = threadIdx.x; pos < N; pos += blockDim.x) { + d_Dst[pos] = s_data[pos]; + } + } - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - unsigned int t = threadIdx.x; - __shared__ double s_alpha[maxk]; - if(t < k) { - s_alpha[t] = alpha[t]; + } // namespace kernels + + + // + // Kernel wrappers + // + + /** + * @brief Computes result = mvec^T * [vec1 vec2] + * + * @param n - size of vectors vec1, vec2 + * @param i - + * @param vec1 - (n x 1) vector + * @param vec2 - (n x 1) vector + * @param mvec - (n x (i+1)) multivector + * @param result - ((i+1) x 2) multivector + * + * @todo Input data should be `const`. 
+ * @todo Is it coincidence that the block size is equal to the default + * value of Tv5? + * @todo Should we use dynamic shared memory here instead? + */ + void mass_inner_product_two_vectors(index_type n, + index_type i, + const real_type* vec1, + const real_type* vec2, + const real_type* mvec, + real_type* result) + { + kernels::MassIPTwoVec<<>>(vec1, vec2, mvec, result, i + 1, n); } - __syncthreads(); - if(i < N) { - double temp = 0.0f; - for(int j = 0; j < k; ++j) { - temp += x_data[j * N + i] * s_alpha[j]; - } - y_data[i] -= temp; - } -} - -__global__ void matrixInfNormPart1(const int n, - const int nnz, - const int* a_ia, - const double* a_val, - double* result) { - - // one thread per row, pass through rows - // and sum - // can be done through atomics - //\sum_{j=1}^m abs(a_{ij}) - - int idx = blockIdx.x*blockDim.x + threadIdx.x; - while (idx < n){ - double sum = 0.0f; - for (int i = a_ia[idx]; i < a_ia[idx+1]; ++i) { - sum = sum + fabs(a_val[i]); - } - result[idx] = sum; - idx += (blockDim.x*gridDim.x); - } -} - -// for count sketch sketching random method -__global__ void count_sketch_kernel(const int n, - const int k, - const int* labels, - const int* flip, - const double* input, - double* output){ - - int idx = blockIdx.x * blockDim.x + threadIdx.x; - while (idx < n){ - //printf("in place %d, I am putting input[perm[%d]] = input[%d] = %f \n", idx,idx, perm[idx], input[perm[idx]] ); - double val = input[idx]; - if (flip[idx] != 1){ - val *= -1.0; - } - atomicAdd(&output[labels[idx]], val); - idx += blockDim.x * gridDim.x; + /** + * @brief Computes y := y - x*alpha + * + * @param[in] n - vector size + * @param[in] i - number of vectors in the multivector + * @param[in] x - (n x (i+1)) multivector + * @param[out] y - (n x (i+1)) multivector + * @param[in] alpha - ((i+1) x 1) vector + */ + void mass_axpy(index_type n, index_type i, const real_type* x, real_type* y, const real_type* alpha) + { + kernels::massAxpy3<<<(n + 384 - 1) / 384, 384>>>(n, i + 1, x, y, alpha); } -} - -// for Walsh-Hadamard transform -//kernel 1 -__global__ void select_kernel(const int k, - const int* perm, - const double* input, - double* output){ - - int idx = blockIdx.x * blockDim.x + threadIdx.x; - while (idx < k){ - //printf("in place %d, I am putting input[perm[%d]] = input[%d] = %f \n", idx,idx, perm[idx], input[perm[idx]] ); - output[idx] = input[perm[idx]]; - idx += blockDim.x * gridDim.x; + /** + * @brief + * + * @param[in] n - + * @param[in] nnz - + * @param[in] a_ia - + * @param[in] a_val - + * @param[out] result - + * + * @todo Decide how to allow user to configure grid and block sizes. 
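// One way the launch could be sized from n instead of the fixed 1000 blocks used
// below; a sketch toward the @todo above, not an API change (the grid-stride loop
// inside the kernel keeps either launch correct):
//   index_type block_size = 1024;
//   index_type num_blocks = (n + block_size - 1) / block_size;
//   kernels::matrixInfNormPart1<<<num_blocks, block_size>>>(n, nnz, a_ia, a_val, result);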
+ */ + void matrix_row_sums(index_type n, + index_type nnz, + const index_type* a_ia, + const real_type* a_val, + real_type* result) + { + kernels::matrixInfNormPart1<<<1000, 1024>>>(n, nnz, a_ia, a_val, result); } -} - -//kernel 2 -__global__ void scaleByD_kernel(const int n, - const int* D, - const double* x, - double* y){ - int idx = blockIdx.x * blockDim.x + threadIdx.x; - - while (idx < n){ - - if (D[idx] == 1) y[idx]=x[idx]; - else y[idx]= (-1.0)*x[idx]; - idx += blockDim.x * gridDim.x; - } -} - -//kernels 3 and 4 - - -#define ELEMENTARY_LOG2SIZE 11 -namespace cg = cooperative_groups; -//////////////////////////////////////////////////////////////////////////////// -// Single in-global memory radix-4 Fast Walsh Transform pass -// (for strides exceeding elementary vector size) -//////////////////////////////////////////////////////////////////////////////// - -__global__ void fwtBatch2Kernel(double* d_Output, double* d_Input, int stride) -{ - const int pos = blockIdx.x * blockDim.x + threadIdx.x; - const int N = blockDim.x * gridDim.x * 4; - - double* d_Src = d_Input + blockIdx.y * N; - double* d_Dst = d_Output + blockIdx.y * N; - - int lo = pos & (stride - 1); - int i0 = ((pos - lo) << 2) + lo; - int i1 = i0 + stride; - int i2 = i1 + stride; - int i3 = i2 + stride; - - double D0 = d_Src[i0]; - double D1 = d_Src[i1]; - double D2 = d_Src[i2]; - double D3 = d_Src[i3]; - - double T; - T = D0; - D0 = D0 + D2; - D2 = T - D2; - T = D1; - D1 = D1 + D3; - D3 = T - D3; - T = D0; - d_Dst[i0] = D0 + D1; - d_Dst[i1] = T - D1; - T = D2; - d_Dst[i2] = D2 + D3; - d_Dst[i3] = T - D3; -} - - -__global__ void fwtBatch1Kernel(double* d_Output, double* d_Input, int log2N) -{ - // Handle to thread block group - - cg::thread_block cta = cg::this_thread_block(); - const int N = 1 << log2N; - const int base = blockIdx.x << log2N; - - //(2 ** 11) * 4 bytes == 8KB -- maximum s_data[] size for G80 - extern __shared__ double s_data[]; - double* d_Src = d_Input + base; - double* d_Dst = d_Output + base; - - for (int pos = threadIdx.x; pos < N; pos += blockDim.x) { - s_data[pos] = d_Src[pos]; + /** + * @brief Kernel wrapper for + * + * @param[in] n - + * @param[in] k - + * @param[in] labels - + * @param[in] flip - + * @param[in] input - + * @param[out] output - + * + * @todo Decide how to allow user to configure grid and block sizes. + */ + void count_sketch_theta(index_type n, + index_type k, + const index_type* labels, + const index_type* flip, + const real_type* input, + real_type* output) + { + kernels::count_sketch<<<10000, 1024>>>(n, k, labels, flip, input, output); } - // Main radix-4 stages - const int pos = threadIdx.x; - - for (int stride = N >> 2; stride > 0; stride >>= 2) { - int lo = pos & (stride - 1); - int i0 = ((pos - lo) << 2) + lo; - int i1 = i0 + stride; - int i2 = i1 + stride; - int i3 = i2 + stride; - - cg::sync(cta); - double D0 = s_data[i0]; - double D1 = s_data[i1]; - double D2 = s_data[i2]; - double D3 = s_data[i3]; - - double T; - T = D0; - D0 = D0 + D2; - D2 = T - D2; - T = D1; - D1 = D1 + D3; - D3 = T - D3; - T = D0; - s_data[i0] = D0 + D1; - s_data[i1] = T - D1; - T = D2; - s_data[i2] = D2 + D3; - s_data[i3] = T - D3; + /** + * @brief Wrapper for `select` kernel, part of Walsh-Hadamard transform + * + * @param[in] k - + * @param[in] perm - + * @param[in] input - + * @param[out] output - + * + * @todo Decide how to allow user to configure grid and block sizes. 
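// Typical composition of the three FWHT_* wrappers into a subsampled randomized
// Hadamard sketch (an assumption about intended use; D, perm, log2n, d_x, d_y
// and d_out are placeholders prepared by the calling sketching class):
//   FWHT_scaleByD(n, D, d_x, d_y);      // flip signs using the +/-1 vector D
//   FWHT(1, log2n, d_y);                // in-place Walsh-Hadamard transform
//   FWHT_select(k, perm, d_y, d_out);   // keep k randomly selected entries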
+ */ + void FWHT_select(index_type k, + const index_type* perm, + const real_type* input, + real_type* output) + { + kernels::select<<<1000, 1024>>>(k, perm, input, output); } - // Do single radix-2 stage for odd power of two - if (log2N & 1) { - - cg::sync(cta); + /** + * @brief Wrapper for `scale` kernel, part of Walsh-Hadamard transform + * + * @param[in] n - + * @param[in] D - + * @param[in] x - + * @param[out] y - + * + * @todo Decide how to allow user to configure grid and block sizes. + */ + void FWHT_scaleByD(index_type n, + const index_type* D, + const real_type* x, + real_type* y) + { + kernels::scaleByD<<<1000, 1024>>>(n, D, x, y); + } - for (int pos = threadIdx.x; pos < N / 2; pos += blockDim.x) { - int i0 = pos << 1; - int i1 = i0 + 1; + /** + * @brief + * + * @param[in] M - + * @param[in] log2N - + * @param[out] d_Data - + * + * @todo Decide if and how user should configure log2size, thread_n, etc. + */ + void FWHT(index_type M, index_type log2N, real_type* d_Data) + { + const index_type ELEMENTARY_LOG2SIZE = 11; + const index_type THREAD_N = 1024; + index_type N = 1 << log2N; + dim3 grid((1 << log2N) / (4 * THREAD_N), M, 1); - double D0 = s_data[i0]; - double D1 = s_data[i1]; - s_data[i0] = D0 + D1; - s_data[i1] = D0 - D1; + for (; log2N > ELEMENTARY_LOG2SIZE; log2N -= 2, N >>= 2, M <<= 2) { + kernels::fwtBatch2Kernel<<>>(d_Data, d_Data, N / 4); } - } - cg::sync(cta); - - for (int pos = threadIdx.x; pos < N; pos += blockDim.x) { - d_Dst[pos] = s_data[pos]; - } -} - -void mass_inner_product_two_vectors(int n, - int i, - double* vec1, - double* vec2, - double* mvec, - double* result) -{ - MassIPTwoVec_kernel<<>>(vec1, vec2, mvec, result, i + 1, n); -} - -void mass_axpy(int n, int i, double* x, double* y, double* alpha) -{ - massAxpy3_kernel<<<(n + 384 - 1) / 384, 384>>>(n, i + 1, x, y, alpha); -} - -void matrix_row_sums(int n, - int nnz, - int* a_ia, - double* a_val, - double* result) -{ - matrixInfNormPart1<<<1000,1024>>>(n, nnz, a_ia, a_val, result); -} - -void count_sketch_theta(int n, - int k, - int* labels, - int* flip, - double* input, - double* output) -{ - - count_sketch_kernel<<<10000, 1024>>>(n, k, labels, flip, input, output); -} - -void FWHT_select(int k, - int* perm, - double* input, - double* output) -{ - select_kernel<<<1000,1024>>>(k, perm, input, output); -} - -void FWHT_scaleByD(int n, - int* D, - double* x, - double* y) -{ - scaleByD_kernel<<<1000,1024>>>(n, D, x, y); -} - -void FWHT(int M, int log2N, double* d_Data) { - - const int THREAD_N = 1024; - int N = 1 << log2N; - dim3 grid((1 << log2N) / (4 * THREAD_N), M, 1); - - for (; log2N > ELEMENTARY_LOG2SIZE; log2N -= 2, N >>= 2, M <<= 2) { - fwtBatch2Kernel<<>>(d_Data, d_Data, N / 4); + kernels::fwtBatch1Kernel<<>>(d_Data, d_Data, log2N); } - fwtBatch1Kernel<<>>(d_Data, d_Data, log2N); -} +} // namespace ReSolve diff --git a/resolve/cuda/cudaKernels.h b/resolve/cuda/cudaKernels.h index 460118f9..7b27b9ce 100644 --- a/resolve/cuda/cudaKernels.h +++ b/resolve/cuda/cudaKernels.h @@ -1,34 +1,52 @@ -void mass_inner_product_two_vectors(int n, - int i, - double* vec1, - double* vec2, - double* mvec, - double* result); -void mass_axpy(int n, int i, double* x, double* y, double* alpha); - -//needed for matrix inf nrm -void matrix_row_sums(int n, - int nnz, - int* a_ia, - double* a_val, - double* result); - -// needed for rand solver -void count_sketch_theta(int n, - int k, - int* labels, - int* flip, - double* input, - double* output); - -void FWHT_select(int k, - int* perm, - double* input, - double* output); - 
-void FWHT_scaleByD(int n, - int* D, - double* x, - double* y); - -void FWHT(int M, int log2N, double* d_Data); +/** + * @file cudaKernels.h + * @author Kasia Swirydowicz (kasia.swirydowicz@pnnl.gov) + * + * @brief Contains prototypes of CUDA kernels. + * + * @note These kernels will be used in CUDA specific code, only. + * + */ +#pragma once + +#include + +namespace ReSolve { + + void mass_inner_product_two_vectors(index_type n, + index_type i, + const real_type* vec1, + const real_type* vec2, + const real_type* mvec, + real_type* result); + + void mass_axpy(index_type n, index_type i, const real_type* x, real_type* y, const real_type* alpha); + + //needed for matrix inf nrm + void matrix_row_sums(index_type n, + index_type nnz, + const index_type* a_ia, + const real_type* a_val, + real_type* result); + + // needed for rand solver + void count_sketch_theta(index_type n, + index_type k, + const index_type* labels, + const index_type* flip, + const real_type* input, + real_type* output); + + void FWHT_select(index_type k, + const index_type* perm, + const real_type* input, + real_type* output); + + void FWHT_scaleByD(index_type n, + const index_type* D, + const real_type* x, + real_type* y); + + void FWHT(index_type M, index_type log2N, real_type* d_Data); + +} // namespace ReSolve diff --git a/resolve/cuda/cudaVectorKernels.cu b/resolve/cuda/cudaVectorKernels.cu deleted file mode 100644 index a1c53198..00000000 --- a/resolve/cuda/cudaVectorKernels.cu +++ /dev/null @@ -1,28 +0,0 @@ -#include "cudaVectorKernels.h" -#include - - -namespace ReSolve { namespace vector { - -namespace kernels { - -__global__ void set_const(index_type n, real_type val, real_type* arr) -{ - index_type i = blockIdx.x * blockDim.x + threadIdx.x; - if(i < n) - { - arr[i] = val; - } -} - -} // namespace kernels - -void set_array_const(index_type n, real_type val, real_type* arr) -{ - index_type num_blocks; - index_type block_size = 512; - num_blocks = (n + block_size - 1) / block_size; - kernels::set_const<<>>(n, val, arr); -} - -}} // namespace ReSolve::vector \ No newline at end of file diff --git a/resolve/cuda/cudaVectorKernels.h b/resolve/cuda/cudaVectorKernels.h deleted file mode 100644 index e85016e3..00000000 --- a/resolve/cuda/cudaVectorKernels.h +++ /dev/null @@ -1,60 +0,0 @@ -#pragma once - -#include -// #include -// #include -// #include - -#include - -//***************************************************************************// -//**** See VectorKernels.hpp for kernel wrapper functions documentation ****// -//***************************************************************************// - -namespace ReSolve { namespace vector { - -namespace kernels { - // __global__ void adapt_diag_scale(index_type, index_type, real_type*, index_type*, index_type*, real_type*, index_type*, - // index_type*, real_type*, index_type*, index_type*, real_type*, real_type*, real_type*, real_type*); - - // __global__ void adapt_row_max(index_type, index_type, real_type*, index_type*, index_type*, real_type*, index_type*, - // index_type*, real_type*, index_type*, index_type*, real_type*); - - // __global__ void add_const(index_type, index_type, index_type*); - - /** - * @brief CUDA kernel that sets values of an array to a constant. 
- * - * @param[in] n - length of the array - * @param[in] val - the value the array is set to - * @param[out] arr - a pointer to the array - * - * @pre `arr` is allocated to size `n` - * @post `arr` elements are set to `val` - */ - __global__ void set_const(index_type n, real_type val, real_type* arr); - - // __global__ void add_vecs(index_type, real_type*, real_type, real_type*); - - // __global__ void mult_const(index_type, real_type, real_type*); - - // __global__ void add_diag(index_type, real_type, index_type*, index_type*, real_type*); - - // __global__ void inv_vec_scale(index_type, real_type*, real_type*); - - // __global__ void vec_scale(index_type, real_type*, real_type*); - - // __global__ void concatenate(index_type, index_type, index_type, index_type, real_type*, index_type*, index_type*, - // real_type*, index_type*, index_type*, real_type*, index_type*, index_type*); - - // __global__ void row_scale(index_type, real_type*, index_type*, index_type*, real_type*, real_type*, - // real_type*, real_type*); - - // __global__ void diag_scale(index_type, index_type, real_type*, index_type*, index_type*, real_type*, index_type*, - // index_type*, real_type*, real_type*, real_type*, index_type); - - // __global__ void row_max(index_type, index_type, real_type*, index_type*, index_type*, real_type*, index_type*, index_type*, - // real_type* scale); -} // namespace kernels - -}} // namespace ReSolve::vector \ No newline at end of file diff --git a/resolve/hip/CMakeLists.txt b/resolve/hip/CMakeLists.txt index fb71a3bd..86755c0f 100644 --- a/resolve/hip/CMakeLists.txt +++ b/resolve/hip/CMakeLists.txt @@ -8,13 +8,12 @@ set(ReSolve_HIP_SRC hipKernels.hip - hipVectorKernels.hip + VectorKernels.hip MemoryUtils.hip ) set(ReSolve_HIP_HEADER_INSTALL hipKernels.h - hipVectorKernels.h HipMemory.hpp hip_check_errors.hpp ) diff --git a/resolve/hip/hipVectorKernels.hip b/resolve/hip/VectorKernels.hip similarity index 72% rename from resolve/hip/hipVectorKernels.hip rename to resolve/hip/VectorKernels.hip index 5b3ace30..30b5e982 100644 --- a/resolve/hip/hipVectorKernels.hip +++ b/resolve/hip/VectorKernels.hip @@ -1,3 +1,13 @@ +/** + * @file hipVectorKernels.hip + * @author Slaven Peles (peless@ornl.gov) + * @brief Contains implementation of CUDA vector kernels and their wrappers. + * @date 2023-12-08 + * + * @note Kernel wrappers implemented here are intended for use in hardware + * agnostic code. 
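// Example of the hardware-agnostic use this note refers to (illustrative only;
// n_ and d_data_ are placeholder names, not code from this repository):
//   ReSolve::vector::set_array_const(n_, 0.0, d_data_);  // zero a device array
// The calling code is identical whether this HIP file or the CUDA
// VectorKernels.cu provides the implementation; the build selects one backend.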
+ */ + #include #include #include diff --git a/resolve/hip/hipKernels.h b/resolve/hip/hipKernels.h index f7416b17..e6f547b6 100644 --- a/resolve/hip/hipKernels.h +++ b/resolve/hip/hipKernels.h @@ -1,51 +1,68 @@ -void mass_inner_product_two_vectors(int n, - int i, - double* vec1, - double* vec2, - double* mvec, - double* result); -void mass_axpy(int n, int i, double* x, double* y, double* alpha); - -//needed for matrix inf nrm -void matrix_row_sums(int n, - int nnz, - int* a_ia, - double* a_val, - double* result); - -// needed for triangular solve - -void permuteVectorP(int n, - int* perm_vector, - double* vec_in, - double* vec_out); - -void permuteVectorQ(int n, - int* perm_vector, - double* vec_in, - double* vec_out); - -// needed for rand solver -void count_sketch_theta(int n, - int k, - int* labels, - int* flip, - double* input, - double* output); - -void FWHT_select(int k, - int* perm, - double* input, - double* output); - -void FWHT_scaleByD(int n, - int* D, - double* x, - double* y); - -void FWHT(int M, int log2N, double* d_Data); - -void vector_inf_norm(int n, - double* input, - double * buffer, - double* result); +/** + * @file hipKernels.h + * @author Kasia Swirydowicz (kasia.swirydowicz@pnnl.gov) + * @brief Contains prototypes of HIP kernels. + * @date 2023-12-08 + * + * @note These kernels will be used in HIP specific code, only. + * + */ + +#pragma once + +#include + +namespace ReSolve { + + void mass_inner_product_two_vectors(index_type n, + index_type i, + real_type* vec1, + real_type* vec2, + real_type* mvec, + real_type* result); + void mass_axpy(index_type n, index_type i, real_type* x, real_type* y, real_type* alpha); + + //needed for matrix inf nrm + void matrix_row_sums(index_type n, + index_type nnz, + index_type* a_ia, + real_type* a_val, + real_type* result); + + // needed for triangular solve + + void permuteVectorP(index_type n, + index_type* perm_vector, + real_type* vec_in, + real_type* vec_out); + + void permuteVectorQ(index_type n, + index_type* perm_vector, + real_type* vec_in, + real_type* vec_out); + + // needed for rand solver + void count_sketch_theta(index_type n, + index_type k, + index_type* labels, + index_type* flip, + real_type* input, + real_type* output); + + void FWHT_select(index_type k, + index_type* perm, + real_type* input, + real_type* output); + + void FWHT_scaleByD(index_type n, + index_type* D, + real_type* x, + real_type* y); + + void FWHT(index_type M, index_type log2N, real_type* d_Data); + + void vector_inf_norm(index_type n, + real_type* input, + real_type * buffer, + real_type* result); +} // namespace ReSolve diff --git a/resolve/hip/hipKernels.hip b/resolve/hip/hipKernels.hip index 671311d4..71c8171c 100644 --- a/resolve/hip/hipKernels.hip +++ b/resolve/hip/hipKernels.hip @@ -1,504 +1,743 @@ +/** + * @file hipKernels.hip + * @author Kasia Swirydowicz (kasia.swirydowicz@pnnl.gov) + * @brief + * @date 2023-12-08 + * + * + */ + #include "hipKernels.h" #include #include -//computes V^T[u1 u2] where v is n x k and u1 and u2 are nx1 -template -__global__ void MassIPTwoVec_kernel(const double* __restrict__ u1, - const double* __restrict__ u2, - const double* __restrict__ v, - double* result, - const int k, - const int N) -{ - int t = threadIdx.x; - int bsize = blockDim.x; - - // assume T threads per thread block (and k reductions to be performed) - volatile __shared__ double s_tmp1[Tv5]; - - volatile __shared__ double s_tmp2[Tv5]; - // map between thread index space and the problem index space - int j = blockIdx.x; - s_tmp1[t] = 0.0; - 
s_tmp2[t] = 0.0; - int nn = t; - double can1, can2, cbn; - - while (nn < N) { - can1 = u1[nn]; - can2 = u2[nn]; - - cbn = v[N * j + nn]; - s_tmp1[t] += can1 * cbn; - s_tmp2[t] += can2 * cbn; - - nn += bsize; - } - - __syncthreads(); - - if (Tv5 >= 1024) { - if(t < 512) { - s_tmp1[t] += s_tmp1[t + 512]; - s_tmp2[t] += s_tmp2[t + 512]; +namespace ReSolve { + namespace kernels { + + /** + * @brief Computes v^T * [u1 u2] where v is n x k multivector + * and u1 and u2 are n x 1 vectors. + * + * @tparam Tv5 - Size of shared memory + * + * @param[in] u1 - (n x 1) vector + * @param[in] u2 - (n x 1) vector + * @param[in] v - (n x k) multivector + * @param[out] result - (k x 2) multivector + * @param[in] k - dimension of the subspace + * @param[in] N - size of vectors u1, u2 + */ + template + __global__ void MassIPTwoVec_kernel(const real_type* __restrict__ u1, + const real_type* __restrict__ u2, + const real_type* __restrict__ v, + real_type* result, + const index_type k, + const index_type N) + { + index_type t = threadIdx.x; + index_type bsize = blockDim.x; + + // assume T threads per thread block (and k reductions to be performed) + volatile __shared__ real_type s_tmp1[Tv5]; + + volatile __shared__ real_type s_tmp2[Tv5]; + // map between thread index space and the problem index space + index_type j = blockIdx.x; + s_tmp1[t] = 0.0; + s_tmp2[t] = 0.0; + index_type nn = t; + real_type can1, can2, cbn; + + while (nn < N) { + can1 = u1[nn]; + can2 = u2[nn]; + + cbn = v[N * j + nn]; + s_tmp1[t] += can1 * cbn; + s_tmp2[t] += can2 * cbn; + + nn += bsize; + } + + __syncthreads(); + + if (Tv5 >= 1024) { + if(t < 512) { + s_tmp1[t] += s_tmp1[t + 512]; + s_tmp2[t] += s_tmp2[t + 512]; + } + __syncthreads(); + } + if (Tv5 >= 512) { + if(t < 256) { + s_tmp1[t] += s_tmp1[t + 256]; + s_tmp2[t] += s_tmp2[t + 256]; + } + __syncthreads(); + } + { + if (t < 128) { + s_tmp1[t] += s_tmp1[t + 128]; + s_tmp2[t] += s_tmp2[t + 128]; + } + __syncthreads(); + } + { + if (t < 64) { + s_tmp1[t] += s_tmp1[t + 64]; + s_tmp2[t] += s_tmp2[t + 64]; + } + __syncthreads(); + } + + if (t < 32) { + s_tmp1[t] += s_tmp1[t + 32]; + s_tmp2[t] += s_tmp2[t + 32]; + + s_tmp1[t] += s_tmp1[t + 16]; + s_tmp2[t] += s_tmp2[t + 16]; + + s_tmp1[t] += s_tmp1[t + 8]; + s_tmp2[t] += s_tmp2[t + 8]; + + s_tmp1[t] += s_tmp1[t + 4]; + s_tmp2[t] += s_tmp2[t + 4]; + + s_tmp1[t] += s_tmp1[t + 2]; + s_tmp2[t] += s_tmp2[t + 2]; + + s_tmp1[t] += s_tmp1[t + 1]; + s_tmp2[t] += s_tmp2[t + 1]; + } + if (t == 0) { + result[blockIdx.x] = s_tmp1[0]; + result[blockIdx.x + k] = s_tmp2[0]; + } } - __syncthreads(); - } - if (Tv5 >= 512) { - if(t < 256) { - s_tmp1[t] += s_tmp1[t + 256]; - s_tmp2[t] += s_tmp2[t + 256]; - } - __syncthreads(); - } - { - if (t < 128) { - s_tmp1[t] += s_tmp1[t + 128]; - s_tmp2[t] += s_tmp2[t + 128]; - } - __syncthreads(); - } - { - if (t < 64) { - s_tmp1[t] += s_tmp1[t + 64]; - s_tmp2[t] += s_tmp2[t + 64]; - } - __syncthreads(); - } - - if (t < 32) { - s_tmp1[t] += s_tmp1[t + 32]; - s_tmp2[t] += s_tmp2[t + 32]; - s_tmp1[t] += s_tmp1[t + 16]; - s_tmp2[t] += s_tmp2[t + 16]; - s_tmp1[t] += s_tmp1[t + 8]; - s_tmp2[t] += s_tmp2[t + 8]; + /** + * @brief AXPY y = y - x*alpha where alpha is [k x 1], needed in 1 and 2 synch GMRES + * + * @tparam Tmaxk + * + * @param[in] N - + * @param[in] k - + * @param[in] x_data - + * @param[out] y_data - + * @param[in] alpha - + */ + template + __global__ void massAxpy3_kernel(index_type N, + index_type k, + const real_type* x_data, + real_type* y_data, + const real_type* alpha) + { + index_type i = blockIdx.x * 
blockDim.x + threadIdx.x; + index_type t = threadIdx.x; + + __shared__ real_type s_alpha[Tmaxk]; + if (t < k) { + s_alpha[t] = alpha[t]; + } + __syncthreads(); + while (i < N) { + real_type temp = 0.0; + for(index_type j = 0; j < k; ++j) { + temp += x_data[j * N + i] * s_alpha[j]; + } + y_data[i] -= temp; + i += (blockDim.x*gridDim.x); + } + } - s_tmp1[t] += s_tmp1[t + 4]; - s_tmp2[t] += s_tmp2[t + 4]; + /** + * @brief Pass through matrix rows and sum each as \sum_{j=1}^m abs(a_{ij}) + * + * @param[in] n - + * @param[in] nnz - + * @param[in] a_ia - + * @param[in] a_val - + * @param[out] result - + */ + __global__ void matrixInfNormPart1(const index_type n, + const index_type nnz, + const index_type* a_ia, + const real_type* a_val, + real_type* result) + { + index_type idx = blockIdx.x*blockDim.x + threadIdx.x; + while (idx < n) { + real_type sum = 0.0; + for (index_type i = a_ia[idx]; i < a_ia[idx+1]; ++i) { + sum = sum + fabs(a_val[i]); + } + result[idx] = sum; + idx += (blockDim.x * gridDim.x); + } + } - s_tmp1[t] += s_tmp1[t + 2]; - s_tmp2[t] += s_tmp2[t + 2]; + /** + * @brief + * + * @param[in] n - vector size + * @param[in] input - + * @param[out] result - + */ + __global__ void vectorInfNorm(const index_type n, + const real_type* input, + real_type* result) + { + + index_type idx = blockIdx.x * blockDim.x + threadIdx.x; + + volatile __shared__ real_type s_max[1024]; + index_type t = threadIdx.x; + index_type bsize = blockDim.x; + real_type local_max = 0.0; + if (idx < n) { + local_max = fabs(input[idx]); + } + + idx += (blockDim.x*gridDim.x); + + while (idx < n) { + local_max = fmax(fabs(input[idx]), local_max); + idx += (blockDim.x * gridDim.x); + } + s_max[t] = local_max; + __syncthreads(); + + // reduction + + if (bsize >= 1024) { + if(t < 512) { + s_max[t] = fmax(s_max[t], s_max[t + 512]); + } + __syncthreads(); + } + if (bsize >= 512) { + if(t < 256) { + s_max[t] = fmax(s_max[t], s_max[t + 256]); + } + __syncthreads(); + } + + if (bsize >= 256) { + if(t < 128) { + s_max[t] = fmax(s_max[t], s_max[t + 128]); + } + __syncthreads(); + } + + if (bsize >= 128) { + if(t < 64) { + s_max[t] = fmax(s_max[t], s_max[t + 64]); + } + __syncthreads(); + } + //unroll for last warp + if (t < 64) { + s_max[t] = fmax(s_max[t], s_max[t + 64]); + s_max[t] = fmax(s_max[t], s_max[t + 32]); + s_max[t] = fmax(s_max[t], s_max[t + 16]); + s_max[t] = fmax(s_max[t], s_max[t + 8]); + s_max[t] = fmax(s_max[t], s_max[t + 4]); + s_max[t] = fmax(s_max[t], s_max[t + 2]); + s_max[t] = fmax(s_max[t], s_max[t + 1]); + + } + if (t == 0) { + index_type bid = blockIdx.x; + index_type gid = gridDim.x; + result[blockIdx.x] = s_max[0]; + } + } - s_tmp1[t] += s_tmp1[t + 1]; - s_tmp2[t] += s_tmp2[t + 1]; - } - if (t == 0) { - result[blockIdx.x] = s_tmp1[0]; - result[blockIdx.x + k] = s_tmp2[0]; - } -} + /** + * @brief + * + * @param n - vector size + * @param perm_vector - permutation map + * @param vec_in - input vector + * @param vec_out - permuted vector + */ + __global__ void permuteVectorP_kernel(const index_type n, + const index_type* perm_vector, + const real_type* vec_in, + real_type* vec_out) + { + //one thread per vector entry, pass through rows + index_type idx = blockIdx.x*blockDim.x + threadIdx.x; + while (idx < n) { + vec_out[idx] = vec_in[perm_vector[idx]]; + idx += (blockDim.x * gridDim.x); + } + } -/// mass AXPY i.e y = y - x*alpha where alpha is [k x 1], needed in 1 and 2 synch GMRES -template -__global__ void massAxpy3_kernel(int N, - int k, - const double* x_data, - double* y_data, - const double* alpha) 
-{ + /** + * @brief + * + * @param n - vector size + * @param perm_vector - permutation map + * @param vec_in - input vector + * @param vec_out - permuted vector + */ + __global__ void permuteVectorQ_kernel(const index_type n, + const index_type* perm_vector, + const real_type* vec_in, + real_type* vec_out) + { + //one thread per vector entry, pass through rows + index_type idx = blockIdx.x*blockDim.x + threadIdx.x; + while (idx < n) { + vec_out[perm_vector[idx]] = vec_in[idx]; + idx += (blockDim.x * gridDim.x); + } + } - unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + /** + * @brief For count sketch in sketching random method + * + * @param[in] n - + * @param[in] k - + * @param[in] labels - + * @param[in] flip - + * @param[in] input - + * @param[out] output - + */ + __global__ void count_sketch_kernel(const index_type n, + const index_type k, + const index_type* labels, + const index_type* flip, + const real_type* input, + real_type* output){ + + index_type idx = blockIdx.x * blockDim.x + threadIdx.x; + while (idx < n) { + real_type val = input[idx]; + if (flip[idx] != 1){ + val *= -1.0; + } + atomicAdd(&output[labels[idx]], val); + idx += blockDim.x * gridDim.x; + } + } - unsigned int t = threadIdx.x; + /** + * @brief Walsh-Hadamard transform (select kernel) + * + * @param[in] k - + * @param[in] perm - + * @param[in] input - + * @param[out] output - + */ + __global__ void select_kernel(const index_type k, + const index_type* perm, + const real_type* input, + real_type* output){ + + index_type idx = blockIdx.x * blockDim.x + threadIdx.x; + while (idx < k) { + output[idx] = input[perm[idx]]; + idx += blockDim.x * gridDim.x; + } + } - __shared__ double s_alpha[Tmaxk]; - if (t < k) { - s_alpha[t] = alpha[t]; - } - __syncthreads(); - while (i < N) { - double temp = 0.0; - for(int j = 0; j < k; ++j) { - temp += x_data[j * N + i] * s_alpha[j]; + /** + * @brief Walsh-Hadamard transform (scale) + * + * @param[in] n - + * @param[in] D - + * @param[in] x - + * @param[out] y - + */ + __global__ void scaleByD_kernel(const index_type n, + const index_type* D, + const real_type* x, + real_type* y){ + index_type idx = blockIdx.x * blockDim.x + threadIdx.x; + + while (idx < n){ + + if (D[idx] == 1) y[idx]=x[idx]; + else y[idx]= (-1.0)*x[idx]; + + idx += blockDim.x * gridDim.x; + } } - y_data[i] -= temp; - i += (blockDim.x*gridDim.x); - } -} - -__global__ void matrixInfNormPart1(const int n, - const int nnz, - const int* a_ia, - const double* a_val, - double* result) -{ - // one thread per row, pass through rows - // and sum - // can be done through atomics - //\sum_{j=1}^m fabs(a_{ij}) - - int idx = blockIdx.x*blockDim.x + threadIdx.x; - while (idx < n) { - double sum = 0.0; - for (int i = a_ia[idx]; i < a_ia[idx+1]; ++i) { - sum = sum + fabs(a_val[i]); + + /** + * @brief Single in-global memory radix-4 Fast Walsh Transform pass + * (for strides exceeding elementary vector size). 
+ * + * @param d_Output - + * @param d_Input - + * @param stride - + */ + __global__ void fwtBatch2Kernel(real_type* d_Output, real_type* d_Input, index_type stride) + { + const index_type pos = blockIdx.x * blockDim.x + threadIdx.x; + const index_type N = blockDim.x * gridDim.x * 4; + + real_type* d_Src = d_Input + blockIdx.y * N; + real_type* d_Dst = d_Output + blockIdx.y * N; + + index_type lo = pos & (stride - 1); + index_type i0 = ((pos - lo) << 2) + lo; + index_type i1 = i0 + stride; + index_type i2 = i1 + stride; + index_type i3 = i2 + stride; + + real_type D0 = d_Src[i0]; + real_type D1 = d_Src[i1]; + real_type D2 = d_Src[i2]; + real_type D3 = d_Src[i3]; + + real_type T; + T = D0; + D0 = D0 + D2; + D2 = T - D2; + T = D1; + D1 = D1 + D3; + D3 = T - D3; + T = D0; + d_Dst[i0] = D0 + D1; + d_Dst[i1] = T - D1; + T = D2; + d_Dst[i2] = D2 + D3; + d_Dst[i3] = T - D3; } - result[idx] = sum; - idx += (blockDim.x*gridDim.x); - } -} -__global__ void vectorInfNorm(const int n, - const double* input, - double* result) -{ - int idx = blockIdx.x * blockDim.x + threadIdx.x; + /** + * @brief + * + * @param d_Output - + * @param d_Input - + * @param log2N - + * + * @todo `d_Input` should be `const` parameter. + * + */ + __global__ void fwtBatch1Kernel(real_type* d_Output, real_type* d_Input, index_type log2N) + { + // Handle to thread block group + + cooperative_groups::thread_block cta = cooperative_groups::this_thread_block(); + const index_type N = 1 << log2N; + const index_type base = blockIdx.x << log2N; + + //(2 ** 11) * 4 bytes == 8KB -- maximum s_data[] size for G80 + extern __shared__ real_type s_data[]; + real_type* d_Src = d_Input + base; + real_type* d_Dst = d_Output + base; + + for (index_type pos = threadIdx.x; pos < N; pos += blockDim.x) { + s_data[pos] = d_Src[pos]; + } + + // Main radix-4 stages + const index_type pos = threadIdx.x; + + for (index_type stride = N >> 2; stride > 0; stride >>= 2) { + index_type lo = pos & (stride - 1); + index_type i0 = ((pos - lo) << 2) + lo; + index_type i1 = i0 + stride; + index_type i2 = i1 + stride; + index_type i3 = i2 + stride; + + cooperative_groups::sync(cta); + real_type D0 = s_data[i0]; + real_type D1 = s_data[i1]; + real_type D2 = s_data[i2]; + real_type D3 = s_data[i3]; + + real_type T; + T = D0; + D0 = D0 + D2; + D2 = T - D2; + T = D1; + D1 = D1 + D3; + D3 = T - D3; + T = D0; + s_data[i0] = D0 + D1; + s_data[i1] = T - D1; + T = D2; + s_data[i2] = D2 + D3; + s_data[i3] = T - D3; + } + + // Do single radix-2 stage for odd power of two + if (log2N & 1) { + + cooperative_groups::sync(cta); + + for (index_type pos = threadIdx.x; pos < N / 2; pos += blockDim.x) { + index_type i0 = pos << 1; + index_type i1 = i0 + 1; + + real_type D0 = s_data[i0]; + real_type D1 = s_data[i1]; + s_data[i0] = D0 + D1; + s_data[i1] = D0 - D1; + } + } + + cooperative_groups::sync(cta); + + for (index_type pos = threadIdx.x; pos < N; pos += blockDim.x) { + d_Dst[pos] = s_data[pos]; + } + } - volatile __shared__ double s_max[1024]; - int t = threadIdx.x; - int bsize = blockDim.x; - double local_max = 0.0; - if (idx < n) { - local_max = fabs(input[idx]); + } // namespace kernels + + // + // Kernel wrappers + // + + /** + * @brief Computes result = mvec^T * [vec1 vec2] + * + * @param n - size of vectors vec1, vec2 + * @param i - + * @param vec1 - (n x 1) vector + * @param vec2 - (n x 1) vector + * @param mvec - (n x (i+1)) multivector + * @param result - ((i+1) x 2) multivector + * + * @todo Input data should be `const`. 
+ * @todo Is it coincidence that the block size is equal to the default + * value of Tv5? + * @todo Should we use dynamic shared memory here instead? + */ + void mass_inner_product_two_vectors(index_type n, + index_type i, + real_type* vec1, + real_type* vec2, + real_type* mvec, + real_type* result) + { + hipLaunchKernelGGL(kernels::MassIPTwoVec_kernel, + dim3(i + 1), + dim3(1024), + 0, + 0, + vec1, + vec2, + mvec, + result, + i + 1, + n); } - idx += (blockDim.x*gridDim.x); - - while (idx < n) { - local_max = fmax(fabs(input[idx]), local_max); - idx += (blockDim.x*gridDim.x); + /** + * @brief Computes y := y - x*alpha + * + * @param[in] n - vector size + * @param[in] i - number of vectors in the multivector + * @param[in] x - (n x (i+1)) multivector + * @param[out] y - (n x (i+1)) multivector + * @param[in] alpha - ((i+1) x 1) vector + */ + void mass_axpy(index_type n, index_type i, real_type* x, real_type* y, real_type* alpha) + { + hipLaunchKernelGGL(kernels::massAxpy3_kernel, + dim3((n + 384 - 1) / 384), + dim3(384), + 0, + 0, + n, + i, + x, + y, + alpha); } - s_max[t] = local_max; - __syncthreads(); - - // reduction - if (bsize >= 1024) { - if(t < 512) { - s_max[t] = fmax(s_max[t], s_max[t + 512]); - } - __syncthreads(); - } - if (bsize >= 512) { - if(t < 256) { - s_max[t] = fmax(s_max[t], s_max[t + 256]); - } - __syncthreads(); + /** + * @brief + * + * @param[in] n - + * @param[in] nnz - + * @param[in] a_ia - + * @param[in] a_val - + * @param[out] result - + * + * @todo Decide how to allow user to configure grid and block sizes. + */ + void matrix_row_sums(index_type n, + index_type nnz, + index_type* a_ia, + real_type* a_val, + real_type* result) + { + hipLaunchKernelGGL(kernels::matrixInfNormPart1, dim3(1000), dim3(1024), 0, 0, n, nnz, a_ia, a_val, result); } - if (bsize >= 256) { - if(t < 128) { - s_max[t] = fmax(s_max[t], s_max[t + 128]); - } - __syncthreads(); + /** + * @brief + * + * @param n - + * @param input - + * @param buffer - + * @param result - + * + * @todo Decide how to allow user to configure grid and block sizes. 
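// Reading of the two-stage launch below (descriptive note): the first launch
// reduces the n inputs to 1024 per-block maxima stored in `buffer`; the second
// launch reduces those 1024 partial maxima with a single block; the final value
// is then copied to `result` on the host. `buffer` must hold at least 1024 reals.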
+ */ + void vector_inf_norm(index_type n, + real_type* input, + real_type* buffer, + real_type* result) + { + hipLaunchKernelGGL(kernels::vectorInfNorm, dim3(1024), dim3(1024), 0, 0, n, input, buffer); + hipDeviceSynchronize(); + hipLaunchKernelGGL(kernels::vectorInfNorm, dim3(1), dim3(1024), 0, 0, 1024, buffer, buffer); + hipMemcpy(result, buffer, sizeof(real_type) * 1, hipMemcpyDeviceToHost); } - if (bsize >= 128) { - if(t < 64) { - s_max[t] = fmax(s_max[t], s_max[t + 64]); - } - __syncthreads(); + /** + * @brief + * + * @param n - vector size + * @param perm_vector - permutation map + * @param vec_in - input vector + * @param vec_out - permuted vector + */ + void permuteVectorP(index_type n, + index_type* perm_vector, + real_type* vec_in, + real_type* vec_out) + { + hipLaunchKernelGGL(kernels::permuteVectorP_kernel, + dim3(1000), + dim3(1024), + 0, + 0, + n, + perm_vector, + vec_in, + vec_out); } - //unroll for last warp - if (t < 64) { - s_max[t] = fmax(s_max[t], s_max[t + 64]); - s_max[t] = fmax(s_max[t], s_max[t + 32]); - s_max[t] = fmax(s_max[t], s_max[t + 16]); - s_max[t] = fmax(s_max[t], s_max[t + 8]); - s_max[t] = fmax(s_max[t], s_max[t + 4]); - s_max[t] = fmax(s_max[t], s_max[t + 2]); - s_max[t] = fmax(s_max[t], s_max[t + 1]); + /** + * @brief + * + * @param n - vector size + * @param perm_vector - permutation map + * @param vec_in - input vector + * @param vec_out - permuted vector + */ + void permuteVectorQ(index_type n, + index_type* perm_vector, + real_type* vec_in, + real_type* vec_out) + { + hipLaunchKernelGGL(kernels::permuteVectorQ_kernel, + dim3(1000), + dim3(1024), + 0, + 0, + n, + perm_vector, + vec_in, + vec_out); } - if (t == 0) { - int bid = blockIdx.x; - int gid = gridDim.x; - result[blockIdx.x] = s_max[0]; - } -} - - -__global__ void permuteVectorP_kernel(const int n, - const int* perm_vector, - const double* vec_in, - double* vec_out) -{ - //one thread per vector entry, pass through rows - int idx = blockIdx.x*blockDim.x + threadIdx.x; - while (idx> 2; stride > 0; stride >>= 2) { - int lo = pos & (stride - 1); - int i0 = ((pos - lo) << 2) + lo; - int i1 = i0 + stride; - int i2 = i1 + stride; - int i3 = i2 + stride; - - cg::sync(cta); - double D0 = s_data[i0]; - double D1 = s_data[i1]; - double D2 = s_data[i2]; - double D3 = s_data[i3]; - - double T; - T = D0; - D0 = D0 + D2; - D2 = T - D2; - T = D1; - D1 = D1 + D3; - D3 = T - D3; - T = D0; - s_data[i0] = D0 + D1; - s_data[i1] = T - D1; - T = D2; - s_data[i2] = D2 + D3; - s_data[i3] = T - D3; + /** + * @brief Wrapper for `scale` kernel, part of Walsh-Hadamard transform + * + * @param[in] n - + * @param[in] D - + * @param[in] x - + * @param[out] y - + * + * @todo Decide how to allow user to configure grid and block sizes. + */ + void FWHT_scaleByD(index_type n, + index_type* D, + real_type* x, + real_type* y) + { + hipLaunchKernelGGL(kernels::scaleByD_kernel, dim3(1000), dim3(1024), 0, 0, n, D, x, y); } - // Do single radix-2 stage for odd power of two - if (log2N & 1) { - - cg::sync(cta); - - for (int pos = threadIdx.x; pos < N / 2; pos += blockDim.x) { - int i0 = pos << 1; - int i1 = i0 + 1; + /** + * @brief + * + * @param[in] M - + * @param[in] log2N - + * @param[out] d_Data - + * + * @todo Decide if and how user should configure log2size, thread_n, etc. 
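// Worked launch-shape example for the wrapper below (illustrative): with M = 1
// and log2N = 14 (a 16384-entry signal), THREAD_N = 1024 gives an initial grid of
// 2^14 / (4 * 1024) = 4 blocks; each fwtBatch2Kernel pass then shrinks N by 4x and
// grows M by 4x until log2N <= ELEMENTARY_LOG2SIZE (2^11 = 2048 entries), after
// which fwtBatch1Kernel finishes each batch in shared memory.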
+ */ + void FWHT(index_type M, index_type log2N, real_type* d_Data) + { + const index_type ELEMENTARY_LOG2SIZE = 11; + const index_type THREAD_N = 1024; + index_type N = 1 << log2N; + dim3 grid((1 << log2N) / (4 * THREAD_N), M, 1); - double D0 = s_data[i0]; - double D1 = s_data[i1]; - s_data[i0] = D0 + D1; - s_data[i1] = D0 - D1; + for (; log2N > ELEMENTARY_LOG2SIZE; log2N -= 2, N >>= 2, M <<= 2) { + hipLaunchKernelGGL(kernels::fwtBatch2Kernel, grid, dim3(THREAD_N), 0, 0, d_Data, d_Data, N / 4); } - } - - cg::sync(cta); - for (int pos = threadIdx.x; pos < N; pos += blockDim.x) { - d_Dst[pos] = s_data[pos]; - } -} - - - -void mass_inner_product_two_vectors(int n, - int i, - double* vec1, - double* vec2, - double* mvec, - double* result) -{ - hipLaunchKernelGGL(MassIPTwoVec_kernel, dim3(i + 1), dim3(1024), 0, 0, vec1, vec2, mvec, result, i + 1, n); -} - -void mass_axpy(int n, int i, double* x, double* y, double* alpha) -{ - hipLaunchKernelGGL(massAxpy3_kernel, dim3((n + 384 - 1) / 384), dim3(384), 0, 0, n, i, x, y, alpha); -} - -void matrix_row_sums(int n, - int nnz, - int* a_ia, - double* a_val, - double* result) -{ - hipLaunchKernelGGL(matrixInfNormPart1, dim3(1000), dim3(1024), 0, 0, n, nnz, a_ia, a_val, result); -} - - -void vector_inf_norm(int n, - double* input, - double* buffer, - double* result) -{ - hipLaunchKernelGGL(vectorInfNorm, dim3(1024), dim3(1024), 0, 0, n, input, buffer); - hipDeviceSynchronize(); - hipLaunchKernelGGL(vectorInfNorm, dim3(1), dim3(1024), 0, 0, 1024, buffer, buffer); - hipMemcpy(result, buffer, sizeof(double) * 1, hipMemcpyDeviceToHost); -} - -void permuteVectorP(int n, - int* perm_vector, - double* vec_in, - double* vec_out) -{ - hipLaunchKernelGGL(permuteVectorP_kernel, dim3(1000), dim3(1024), 0, 0, n, perm_vector, vec_in, vec_out); -} - -void permuteVectorQ(int n, - int* perm_vector, - double* vec_in, - double* vec_out) -{ - hipLaunchKernelGGL(permuteVectorQ_kernel, dim3(1000), dim3(1024), 0, 0, n, perm_vector, vec_in, vec_out); -} - -void count_sketch_theta(int n, - int k, - int* labels, - int* flip, - double* input, - double* output) -{ - - hipLaunchKernelGGL(count_sketch_kernel, dim3(10000), dim3(1024), 0, 0, n, k, labels, flip, input, output); -} - -void FWHT_select(int k, - int* perm, - double* input, - double* output) -{ - hipLaunchKernelGGL(select_kernel, dim3(1000), dim3(1024), 0, 0,k, perm, input, output); -} - -void FWHT_scaleByD(int n, - int* D, - double* x, - double* y) -{ - hipLaunchKernelGGL(scaleByD_kernel, dim3(1000), dim3(1024), 0, 0, n, D, x, y); -} - -void FWHT(int M, int log2N, double* d_Data) { - - const int THREAD_N = 1024; - int N = 1 << log2N; - dim3 grid((1 << log2N) / (4 * THREAD_N), M, 1); - - for (; log2N > ELEMENTARY_LOG2SIZE; log2N -= 2, N >>= 2, M <<= 2) { - hipLaunchKernelGGL(fwtBatch2Kernel, grid, dim3(THREAD_N), 0, 0, d_Data, d_Data, N / 4); + hipLaunchKernelGGL(kernels::fwtBatch1Kernel, dim3(M), dim3(N / 4), N * sizeof(real_type), 0, d_Data, d_Data, log2N); } - hipLaunchKernelGGL(fwtBatch1Kernel, dim3(M), dim3(N / 4), N * sizeof(double), 0, d_Data, d_Data, log2N); -} +} // namespace ReSolve diff --git a/resolve/hip/hipVectorKernels.h b/resolve/hip/hipVectorKernels.h deleted file mode 100644 index cd23f822..00000000 --- a/resolve/hip/hipVectorKernels.h +++ /dev/null @@ -1,57 +0,0 @@ -#pragma once - -#include - -#include - -//***************************************************************************// -//**** See VectorKernels.hpp for kernel wrapper functions documentation ****// 
-//***************************************************************************// - -namespace ReSolve { namespace vector { - -namespace kernels { - // __global__ void adapt_diag_scale(index_type, index_type, real_type*, index_type*, index_type*, real_type*, index_type*, - // index_type*, real_type*, index_type*, index_type*, real_type*, real_type*, real_type*, real_type*); - - // __global__ void adapt_row_max(index_type, index_type, real_type*, index_type*, index_type*, real_type*, index_type*, - // index_type*, real_type*, index_type*, index_type*, real_type*); - - // __global__ void add_const(index_type, index_type, index_type*); - - /** - * @brief CUDA kernel that sets values of an array to a constant. - * - * @param[in] n - length of the array - * @param[in] val - the value the array is set to - * @param[out] arr - a pointer to the array - * - * @pre `arr` is allocated to size `n` - * @post `arr` elements are set to `val` - */ - __global__ void set_const(index_type n, real_type val, real_type* arr); - - // __global__ void add_vecs(index_type, real_type*, real_type, real_type*); - - // __global__ void mult_const(index_type, real_type, real_type*); - - // __global__ void add_diag(index_type, real_type, index_type*, index_type*, real_type*); - - // __global__ void inv_vec_scale(index_type, real_type*, real_type*); - - // __global__ void vec_scale(index_type, real_type*, real_type*); - - // __global__ void concatenate(index_type, index_type, index_type, index_type, real_type*, index_type*, index_type*, - // real_type*, index_type*, index_type*, real_type*, index_type*, index_type*); - - // __global__ void row_scale(index_type, real_type*, index_type*, index_type*, real_type*, real_type*, - // real_type*, real_type*); - - // __global__ void diag_scale(index_type, index_type, real_type*, index_type*, index_type*, real_type*, index_type*, - // index_type*, real_type*, real_type*, real_type*, index_type); - - // __global__ void row_max(index_type, index_type, real_type*, index_type*, index_type*, real_type*, index_type*, index_type*, - // real_type* scale); -} // namespace kernels - -}} // namespace ReSolve::vector \ No newline at end of file diff --git a/resolve/vector/VectorKernels.hpp b/resolve/vector/VectorKernels.hpp index 9f7d1bca..8e69ff38 100644 --- a/resolve/vector/VectorKernels.hpp +++ b/resolve/vector/VectorKernels.hpp @@ -1,4 +1,14 @@ +/** + * @file VectorKernels.hpp + * @author Slaven Peles (peless@ornl.gov) + * @brief Hardware agnostic prototypes of GPU vector kernels + * @date 2023-12-08 + * + * + */ + #pragma once +#include namespace ReSolve { namespace vector {
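// Caller-side sketch of the hardware-agnostic vector-kernel API introduced above.
// Illustrative only: the include path and the namespace-level index_type/real_type
// aliases are assumptions based on this patch, not verified against the tree.
#include <resolve/vector/VectorKernels.hpp>

void zero_device_array(ReSolve::index_type n, ReSolve::real_type* d_arr)
{
  // Backend-neutral call; links against either the CUDA or the HIP implementation.
  ReSolve::vector::set_array_const(n, 0.0, d_arr);
}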