From ea279a898c40fbad9d5a9adfb89b4bfd52a6854f Mon Sep 17 00:00:00 2001
From: Slaven Peles <peless@ornl.gov>
Date: Wed, 25 Oct 2023 22:18:26 -0400
Subject: [PATCH 01/12] First attempt at HIP implementation

---
 CMakeLists.txt                                |  27 +++-
 cmake/ReSolveFindHipLibraries.cmake           |  21 +++
 examples/r_KLU_GLU.cpp                        |   7 +-
 examples/r_KLU_GLU_matrix_values_update.cpp   |   7 +-
 examples/r_KLU_KLU.cpp                        |   7 +-
 examples/r_KLU_KLU_standalone.cpp             |   7 +-
 examples/r_KLU_rf.cpp                         |   7 +-
 examples/r_KLU_rf_FGMRES.cpp                  |   3 +
 .../r_KLU_rf_FGMRES_reuse_factorization.cpp   |   3 +
 resolve/CMakeLists.txt                        |  13 ++
 resolve/MemoryUtils.hpp                       |   3 +-
 resolve/hip/CMakeLists.txt                    |  37 +++++
 resolve/hip/HipMemory.hpp                     | 152 ++++++++++++++++++
 resolve/hip/MemoryUtils.hip                   |  40 +++++
 resolve/hip/hipVectorKernels.h                |  57 +++++++
 resolve/hip/hipVectorKernels.hip              |  29 ++++
 resolve/hip/hip_check_errors.hpp              |  28 ++++
 resolve/resolve_defs.hpp.in                   |  14 +-
 tests/functionality/testKLU.cpp               |   5 +-
 tests/functionality/testKLU_GLU.cpp           |   5 +-
 tests/functionality/testKLU_Rf.cpp            |   5 +-
 tests/functionality/testKLU_Rf_FGMRES.cpp     |   5 +-
 tests/unit/CMakeLists.txt                     |   1 +
 tests/unit/memory/CMakeLists.txt              |  21 +++
 tests/unit/memory/MemoryUtilsTests.hpp        | 110 +++++++++++++
 tests/unit/memory/runMemoryUtilsTests.cpp     |  36 +++++
 26 files changed, 620 insertions(+), 30 deletions(-)
 create mode 100644 cmake/ReSolveFindHipLibraries.cmake
 create mode 100644 resolve/hip/CMakeLists.txt
 create mode 100644 resolve/hip/HipMemory.hpp
 create mode 100644 resolve/hip/MemoryUtils.hip
 create mode 100644 resolve/hip/hipVectorKernels.h
 create mode 100644 resolve/hip/hipVectorKernels.hip
 create mode 100644 resolve/hip/hip_check_errors.hpp
 create mode 100644 tests/unit/memory/CMakeLists.txt
 create mode 100644 tests/unit/memory/MemoryUtilsTests.hpp
 create mode 100644 tests/unit/memory/runMemoryUtilsTests.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 13d65cfa..9f802231 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -23,14 +23,17 @@ endif()
 
 option(RESOLVE_TEST_WITH_BSUB "Use `jsrun` instead of `mpirun` commands when running tests" OFF)
 option(RESOLVE_USE_KLU  "Use KLU, AMD and COLAMD libraries from SuiteSparse" ON)
-option(RESOLVE_USE_GPU  "Use GPU device for computations" ON)
-option(RESOLVE_USE_CUDA "Use CUDA language and SDK" ON)
+option(RESOLVE_USE_GPU  "Use GPU device for computations" OFF)
+option(RESOLVE_USE_CUDA "Use CUDA language and SDK" OFF)
+option(RESOLVE_USE_HIP  "Use HIP language and ROCm library" OFF)
 set(RESOLVE_CTEST_OUTPUT_DIR ${PROJECT_BINARY_DIR} CACHE PATH "Directory where CTest outputs are saved")
 
 if(RESOLVE_USE_CUDA)
-  set(RESOLVE_USE_GPU On CACHE BOOL "Using GPU!" FORCE)
-else()
-  set(RESOLVE_USE_GPU Off CACHE BOOL "Using GPU!" FORCE)
+  set(RESOLVE_USE_GPU On CACHE BOOL "Using CUDA GPU!" FORCE)
+endif()
+
+if(RESOLVE_USE_HIP)
+  set(RESOLVE_USE_GPU On CACHE BOOL "Using HIP GPU!" FORCE)
 endif()
 
 
@@ -89,6 +92,20 @@ else()
   message(STATUS "Not using CUDA")
 endif()
 
+if(RESOLVE_USE_HIP)
+  check_language(HIP)   # probe for a HIP compiler before enabling the language
+  enable_language(HIP)  # hard error if no working HIP compiler is available
+  include(ReSolveFindHipLibraries) # creates the `resolve_hip` interface target
+
+  # This is just an ugly hack to make HIP build work: add hip::device includes directory-wide
+  get_target_property(hip_includes hip::device INTERFACE_INCLUDE_DIRECTORIES)
+  message(STATUS "HIP include directories: ${hip_includes}")
+  include_directories(${hip_includes})
+else()
+  message(STATUS "Not using HIP")
+endif()
+
+
 # The binary dir is already a global include directory
 configure_file(
   ${CMAKE_SOURCE_DIR}/resolve/resolve_defs.hpp.in
diff --git a/cmake/ReSolveFindHipLibraries.cmake b/cmake/ReSolveFindHipLibraries.cmake
new file mode 100644
index 00000000..83b7c220
--- /dev/null
+++ b/cmake/ReSolveFindHipLibraries.cmake
@@ -0,0 +1,21 @@
+# Exports target `resolve_hip` which finds all hip libraries needed by resolve.
+
+
+add_library(resolve_hip INTERFACE)
+
+find_package(hip REQUIRED)
+find_package(hipblas REQUIRED)
+
+target_link_libraries(resolve_hip INTERFACE
+  #hip::host 
+  hip::device
+  #roc::hipblas
+)
+
+# get_target_property(hip_includes hip::device INTERFACE_INCLUDE_DIRECTORIES)
+# message(STATUS "HIP include directories: ${hip_includes}")
+
+# get_target_property(resolve_hip_includes resolve_hip INTERFACE_INCLUDE_DIRECTORIES)
+# message(STATUS "ReSolve HIP include directories: ${resolve_hip_includes}")
+
+install(TARGETS resolve_hip EXPORT ReSolveTargets)
diff --git a/examples/r_KLU_GLU.cpp b/examples/r_KLU_GLU.cpp
index e2cbfde4..e7b19f4e 100644
--- a/examples/r_KLU_GLU.cpp
+++ b/examples/r_KLU_GLU.cpp
@@ -41,8 +41,8 @@ int main(int argc, char *argv[])
   workspace_CUDA->initializeHandles();
   ReSolve::MatrixHandler* matrix_handler =  new ReSolve::MatrixHandler(workspace_CUDA);
   ReSolve::VectorHandler* vector_handler =  new ReSolve::VectorHandler(workspace_CUDA);
-  real_type* rhs;
-  real_type* x;
+  real_type* rhs = nullptr;
+  real_type* x   = nullptr;
 
   vector_type* vec_rhs;
   vector_type* vec_x;
@@ -159,7 +159,8 @@ int main(int argc, char *argv[])
   delete A;
   delete KLU;
   delete GLU;
-  delete x;
+  delete [] x;
+  delete [] rhs;
   delete vec_r;
   delete vec_x;
   delete workspace_CUDA;
diff --git a/examples/r_KLU_GLU_matrix_values_update.cpp b/examples/r_KLU_GLU_matrix_values_update.cpp
index 7d1bb141..ee99f0a0 100644
--- a/examples/r_KLU_GLU_matrix_values_update.cpp
+++ b/examples/r_KLU_GLU_matrix_values_update.cpp
@@ -44,8 +44,8 @@ int main(int argc, char *argv[])
   workspace_CUDA->initializeHandles();
   ReSolve::MatrixHandler* matrix_handler =  new ReSolve::MatrixHandler(workspace_CUDA);
   ReSolve::VectorHandler* vector_handler =  new ReSolve::VectorHandler(workspace_CUDA);
-  real_type* rhs;
-  real_type* x;
+  real_type* rhs = nullptr;
+  real_type* x   = nullptr;
 
   vector_type* vec_rhs;
   vector_type* vec_x;
@@ -170,7 +170,8 @@ int main(int argc, char *argv[])
     delete A;
     delete KLU;
     delete GLU;
-    delete x;
+    delete [] x;
+    delete [] rhs;
     delete vec_r;
     delete vec_x;
     delete workspace_CUDA;
diff --git a/examples/r_KLU_KLU.cpp b/examples/r_KLU_KLU.cpp
index 8b0ea59a..b9328e8a 100644
--- a/examples/r_KLU_KLU.cpp
+++ b/examples/r_KLU_KLU.cpp
@@ -40,8 +40,8 @@ int main(int argc, char *argv[])
   ReSolve::LinAlgWorkspaceCpu* workspace = new ReSolve::LinAlgWorkspaceCpu();
   ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace);
   ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace);
-  real_type* rhs;
-  real_type* x;
+  real_type* rhs = nullptr;
+  real_type* x   = nullptr;
 
   vector_type* vec_rhs;
   vector_type* vec_x;
@@ -148,7 +148,8 @@ int main(int argc, char *argv[])
   //now DELETE
   delete A;
   delete KLU;
-  delete x;
+  delete [] x;
+  delete [] rhs;
   delete vec_r;
   delete vec_x;
   delete matrix_handler;
diff --git a/examples/r_KLU_KLU_standalone.cpp b/examples/r_KLU_KLU_standalone.cpp
index 77e5b97a..0b8f6114 100644
--- a/examples/r_KLU_KLU_standalone.cpp
+++ b/examples/r_KLU_KLU_standalone.cpp
@@ -36,8 +36,8 @@ int main(int argc, char *argv[])
   ReSolve::LinAlgWorkspaceCpu* workspace = new ReSolve::LinAlgWorkspaceCpu();
   ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace);
   ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace);
-  real_type* rhs;
-  real_type* x;
+  real_type* rhs = nullptr;
+  real_type* x   = nullptr;
 
   vector_type* vec_rhs;
   vector_type* vec_x;
@@ -111,7 +111,8 @@ int main(int argc, char *argv[])
   //now DELETE
   delete A;
   delete KLU;
-  delete x;
+  delete [] x;
+  delete [] rhs;
   delete vec_r;
   delete vec_x;
   delete matrix_handler;
diff --git a/examples/r_KLU_rf.cpp b/examples/r_KLU_rf.cpp
index 01fa0f3c..7369af18 100644
--- a/examples/r_KLU_rf.cpp
+++ b/examples/r_KLU_rf.cpp
@@ -42,8 +42,8 @@ int main(int argc, char *argv[] )
   workspace_CUDA->initializeHandles();
   ReSolve::MatrixHandler* matrix_handler =  new ReSolve::MatrixHandler(workspace_CUDA);
   ReSolve::VectorHandler* vector_handler =  new ReSolve::VectorHandler(workspace_CUDA);
-  real_type* rhs;
-  real_type* x;
+  real_type* rhs = nullptr;
+  real_type* x   = nullptr;
 
   vector_type* vec_rhs;
   vector_type* vec_x;
@@ -173,7 +173,8 @@ int main(int argc, char *argv[] )
   delete A;
   delete KLU;
   delete Rf;
-  delete x;
+  delete [] x;
+  delete [] rhs;
   delete vec_r;
   delete vec_x;
   delete workspace_CUDA;
diff --git a/examples/r_KLU_rf_FGMRES.cpp b/examples/r_KLU_rf_FGMRES.cpp
index ee674869..07839cbb 100644
--- a/examples/r_KLU_rf_FGMRES.cpp
+++ b/examples/r_KLU_rf_FGMRES.cpp
@@ -189,5 +189,8 @@ int main(int argc, char *argv[])
 
   } // for (int i = 0; i < numSystems; ++i)
 
+  delete [] x;
+  delete [] rhs;
+
   return 0;
 }
diff --git a/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp b/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp
index 6a520a7a..56ab43fe 100644
--- a/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp
+++ b/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp
@@ -217,5 +217,8 @@ int main(int argc, char *argv[])
 
   }
 
+  delete [] x;
+  delete [] rhs;
+
   return 0;
 }
diff --git a/resolve/CMakeLists.txt b/resolve/CMakeLists.txt
index 8dbcc467..fa6c9cd5 100644
--- a/resolve/CMakeLists.txt
+++ b/resolve/CMakeLists.txt
@@ -48,6 +48,10 @@ if(RESOLVE_USE_CUDA)
     add_subdirectory(cuda)
 endif()
 
+if(RESOLVE_USE_HIP)
+    add_subdirectory(hip)
+endif()
+
 # Now, build workspaces
 add_subdirectory(workspace)
 
@@ -67,6 +71,10 @@ if(RESOLVE_USE_CUDA)
     target_link_libraries(resolve_tpl INTERFACE resolve_cuda)
 endif(RESOLVE_USE_CUDA)
 
+if(RESOLVE_USE_HIP)
+    target_link_libraries(resolve_tpl INTERFACE resolve_hip)
+endif(RESOLVE_USE_HIP)
+
 
 set(ReSolve_Targets_List
     resolve_matrix
@@ -82,6 +90,11 @@ if(RESOLVE_USE_CUDA)
   set(ReSolve_Targets_List ${ReSolve_Targets_List} resolve_backend_cuda)
 endif()
 
+# If HIP support is enabled add HIP SDK specific code and dependencies
+if(RESOLVE_USE_HIP)
+  set(ReSolve_Targets_List ${ReSolve_Targets_List} resolve_backend_hip)
+endif()
+
 # If no GPU support is enabled, link to dummy device backend
 if(NOT RESOLVE_USE_GPU)
     set(ReSolve_Targets_List ${ReSolve_Targets_List} resolve_backend_cpu)
diff --git a/resolve/MemoryUtils.hpp b/resolve/MemoryUtils.hpp
index 00f3d653..976279d9 100644
--- a/resolve/MemoryUtils.hpp
+++ b/resolve/MemoryUtils.hpp
@@ -55,7 +55,8 @@ namespace ReSolve
 #include <resolve/cuda/CudaMemory.hpp>
 using MemoryHandler = ReSolve::MemoryUtils<ReSolve::memory::Cuda>;
 #elif defined RESOLVE_USE_HIP
-#error HIP support requested, but not available! Probably a bug in CMake configuration.
+#include <resolve/hip/HipMemory.hpp>
+using MemoryHandler = ReSolve::MemoryUtils<ReSolve::memory::Hip>;
 #else
 #error Unrecognized device, probably bug in CMake configuration
 #endif
diff --git a/resolve/hip/CMakeLists.txt b/resolve/hip/CMakeLists.txt
new file mode 100644
index 00000000..f0a93b04
--- /dev/null
+++ b/resolve/hip/CMakeLists.txt
@@ -0,0 +1,37 @@
+#[[
+
+@brief Build ReSolve HIP backend
+
+@author Slaven Peles <peless@ornl.gov>
+
+]]
+
+set(ReSolve_HIP_SRC
+    # hipKernels.cu
+    hipVectorKernels.hip
+    MemoryUtils.hip
+)
+
+set(ReSolve_HIP_HEADER_INSTALL
+    # hipKernels.h
+    # hipVectorKernels.h
+    HipMemory.hpp
+    # hip_check_errors.hpp
+)
+
+set_source_files_properties(${ReSolve_HIP_SRC} PROPERTIES LANGUAGE HIP)
+
+# First create HIP backend 
+# (this should really be HIP _API_ backend, 
+# separate backend will be needed for HIP SDK)
+add_library(resolve_backend_hip SHARED ${ReSolve_HIP_SRC})
+target_link_libraries(resolve_backend_hip PRIVATE resolve_logger)
+target_link_libraries(resolve_backend_hip PUBLIC resolve_hip)
+#target_include_directories(resolve_backend_hip PUBLIC ${hip_includes})
+target_include_directories(resolve_backend_hip INTERFACE
+    $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}>
+    $<INSTALL_INTERFACE:include>
+)
+
+# install include headers
+install(FILES ${ReSolve_HIP_HEADER_INSTALL} DESTINATION include/resolve/hip)
diff --git a/resolve/hip/HipMemory.hpp b/resolve/hip/HipMemory.hpp
new file mode 100644
index 00000000..a6a482a5
--- /dev/null
+++ b/resolve/hip/HipMemory.hpp
@@ -0,0 +1,152 @@
+#pragma once
+
+#include <iostream>
+#include <hip/hip_runtime.h>
+
+#include "hip_check_errors.hpp"
+
+namespace ReSolve
+{
+  namespace memory
+  {
+    /**
+     * @brief Class containing wrappers for HIP API functions.
+     * 
+     * All wrappers are implemented as static functions returning integer
+     * error code from HIP API functions.
+     * 
+     * @author Slaven Peles <peless@ornl.gov>
+     */
+    struct Hip
+    {
+      static void deviceSynchronize()
+      {
+        hipDeviceSynchronize();
+      }
+      
+      static int getLastDeviceError()
+      {
+        return static_cast<int>(hipGetLastError());
+      }
+      
+      /** 
+       * @brief deletes variable from device
+       *
+       * @param v - a variable on the device
+       *
+       * @post v is freed from the device
+       */
+      static int deleteOnDevice(void* v)
+      {
+        return checkHipErrors(hipFree(v));
+      }
+
+      /**
+       * @brief allocates array v onto device
+       *
+       * @param v - pointer to the array to be allocated on the device
+       * @param n - number of array elements (int, size_t)
+       * 
+       * @tparam T - Array element type
+       * @tparam I - Array index type
+       *
+       * @post v is now an array with size n on the device
+       */
+      template <typename I, typename T>
+      static int allocateArrayOnDevice(T** v, I n)
+      {
+        return checkHipErrors(hipMalloc((void**) v, sizeof(T) * n));
+      }
+
+      /**
+       * @brief allocates buffer v onto device.
+       * 
+       * The difference from the array is that buffer size is required in bytes,
+       * not number of elements.
+       *
+       * @param v - pointer to the buffer to be allocated on the device
+       * @param n - size of the buffer in bytes
+       * 
+       * @tparam T - Buffer element data type (typically void)
+       * @tparam I - Buffer size type (typically size_t)
+       *
+       * @post v is now a buffer of n bytes
+       */
+      template <typename I, typename T>
+      static int allocateBufferOnDevice(T** v, I n)
+      {
+        return checkHipErrors(hipMalloc((void**) v, n));
+      }
+
+      /**
+       * @brief Sets elements of device array v to zero
+       *
+       * @param v - pointer to the device array whose elements are set to zero
+       * @param n - number of the array elements to be set to zero
+       * 
+       * @tparam T - Array element type
+       * @tparam I - Array index type
+       *
+       * @post First n elements of array v are set to zero
+       */
+      template <typename I, typename T>
+      static int setZeroArrayOnDevice(T* v, I n)
+      {
+        return checkHipErrors(hipMemset(v, 0, sizeof(T) * n));
+      }
+
+      /** 
+       * @brief Copies array `src` from device to the array `dst` on the host.
+       *
+       * @param[in]    n - size of src array
+       * @param[in]  src - array on device
+       * @param[out] dst - array on host
+       *
+       * @pre `src` is a pointer to an allocated array on the device
+       * @pre `dst` is allocated to size >= n on the host
+       * @post Content of `dst` is overwritten by the content of `src`
+       */
+      template <typename I, typename T>
+      static int copyArrayDeviceToHost(T* dst, const T* src, I n)
+      {
+        return checkHipErrors(hipMemcpy(dst, src, sizeof(T) * n, hipMemcpyDeviceToHost));
+      }
+
+      /**
+       * @brief Copies array `src` to the array `dst` on the device.
+       *
+       * @param n - size of src array
+       * @param src - array on device to be copied
+       * @param dst - array on device to be copied onto
+       *
+       * @pre `src` is a pointer to an allocated array on the device
+       * @pre `dst` is allocated to size >= n on the device
+       * @post Content of `dst` is overwritten by the content of `src`
+       */
+      template <typename I, typename T>
+      static int copyArrayDeviceToDevice(T* dst, const T* src, I n)
+      {
+        return checkHipErrors(hipMemcpy(dst, src, sizeof(T) * n, hipMemcpyDeviceToDevice));
+      }
+
+      /**
+       * @brief Copies array `src` from the host to the array `dst` on the device.
+       *
+       * @param n - size of src array
+       * @param src - array on the host to be copied
+       * @param dst - array on the device to be copied onto
+       *
+       * @pre `src` is a pointer to an allocated array on the host
+       * @pre `dst` is allocated to size >= n on the device
+       * @post Content of `dst` is overwritten by the content of `src`
+       */
+      template <typename I, typename T>
+      static int copyArrayHostToDevice(T* dst, const T* src, I n)
+      {
+        return checkHipErrors(hipMemcpy(dst, src, sizeof(T) * n, hipMemcpyHostToDevice));
+      }
+
+    };
+  }
+
+} //namespace ReSolve
diff --git a/resolve/hip/MemoryUtils.hip b/resolve/hip/MemoryUtils.hip
new file mode 100644
index 00000000..bd3c666d
--- /dev/null
+++ b/resolve/hip/MemoryUtils.hip
@@ -0,0 +1,40 @@
+/**
+ * @file MemoryUtils.hip
+ * 
+ * This file includes MemoryUtils.tpp and specifies what functions to
+ * instantiate from function templates.
+ * 
+ * @author Slaven Peles <peless@ornl.gov>
+ */
+
+
+#include <iostream>
+
+#include <resolve/Common.hpp>
+#include <resolve/MemoryUtils.hpp>
+
+#include <resolve/MemoryUtils.tpp>
+
+namespace ReSolve
+{
+  template void MemoryUtils<memory::Hip>::deviceSynchronize();
+  template int MemoryUtils<memory::Hip>::getLastDeviceError();
+  template int MemoryUtils<memory::Hip>::deleteOnDevice(void*);
+
+  template int MemoryUtils<memory::Hip>::allocateArrayOnDevice<index_type,  real_type>( real_type**, index_type);
+  template int MemoryUtils<memory::Hip>::allocateArrayOnDevice<index_type, index_type>(index_type**, index_type);
+
+  template int MemoryUtils<memory::Hip>::allocateBufferOnDevice<size_t, void>(void** v, size_t n);
+
+  template int MemoryUtils<memory::Hip>::setZeroArrayOnDevice<index_type, real_type>( real_type*, index_type);
+
+  template int MemoryUtils<memory::Hip>::copyArrayDeviceToHost<index_type,  real_type>( real_type*, const  real_type*, index_type);
+  template int MemoryUtils<memory::Hip>::copyArrayDeviceToHost<index_type, index_type>(index_type*, const index_type*, index_type);
+
+  template int MemoryUtils<memory::Hip>::copyArrayDeviceToDevice<index_type,  real_type>( real_type*, const  real_type*, index_type);
+  template int MemoryUtils<memory::Hip>::copyArrayDeviceToDevice<index_type, index_type>(index_type*, const index_type*, index_type);
+
+  template int MemoryUtils<memory::Hip>::copyArrayHostToDevice<index_type,  real_type>( real_type*, const  real_type*, index_type);
+  template int MemoryUtils<memory::Hip>::copyArrayHostToDevice<index_type, index_type>(index_type*, const index_type*, index_type);
+
+} //namespace ReSolve
diff --git a/resolve/hip/hipVectorKernels.h b/resolve/hip/hipVectorKernels.h
new file mode 100644
index 00000000..cd23f822
--- /dev/null
+++ b/resolve/hip/hipVectorKernels.h
@@ -0,0 +1,57 @@
+#pragma once
+
+#include <hip/hip_runtime.h>
+
+#include <resolve/Common.hpp>
+
+//***************************************************************************//
+//**** See VectorKernels.hpp for kernel wrapper functions documentation  ****//
+//***************************************************************************//
+
+namespace ReSolve { namespace vector {
+
+namespace kernels {
+  // __global__ void adapt_diag_scale(index_type, index_type, real_type*, index_type*, index_type*, real_type*, index_type*,
+  //    index_type*, real_type*, index_type*, index_type*, real_type*, real_type*, real_type*, real_type*);
+
+  // __global__ void adapt_row_max(index_type, index_type, real_type*, index_type*, index_type*, real_type*, index_type*,
+  //    index_type*, real_type*, index_type*, index_type*, real_type*);
+
+  // __global__ void add_const(index_type, index_type, index_type*);
+
+  /**
+   * @brief HIP kernel that sets values of an array to a constant.
+   *
+   * @param[in]  n   - length of the array
+   * @param[in]  val - the value the array is set to
+   * @param[out] arr - a pointer to the array
+   * 
+   * @pre  `arr` is allocated to size `n`
+   * @post `arr` elements are set to `val`
+   */
+  __global__ void set_const(index_type n, real_type val, real_type* arr);
+
+  // __global__ void add_vecs(index_type, real_type*, real_type, real_type*);
+
+  // __global__ void mult_const(index_type, real_type, real_type*);
+
+  // __global__ void add_diag(index_type, real_type, index_type*, index_type*, real_type*);
+
+  // __global__ void inv_vec_scale(index_type, real_type*, real_type*);
+
+  // __global__ void vec_scale(index_type, real_type*, real_type*);
+
+  // __global__ void concatenate(index_type, index_type, index_type, index_type, real_type*, index_type*, index_type*,
+  //   real_type*, index_type*, index_type*, real_type*, index_type*, index_type*);
+
+  // __global__ void row_scale(index_type, real_type*, index_type*, index_type*, real_type*, real_type*,
+  //     real_type*, real_type*);
+
+  // __global__ void diag_scale(index_type, index_type, real_type*, index_type*, index_type*, real_type*, index_type*,
+  //   index_type*, real_type*, real_type*, real_type*, index_type);
+
+  // __global__ void row_max(index_type, index_type, real_type*, index_type*, index_type*, real_type*, index_type*, index_type*,
+  //    real_type* scale);
+} // namespace kernels
+
+}} // namespace ReSolve::vector
\ No newline at end of file
diff --git a/resolve/hip/hipVectorKernels.hip b/resolve/hip/hipVectorKernels.hip
new file mode 100644
index 00000000..3df2b84b
--- /dev/null
+++ b/resolve/hip/hipVectorKernels.hip
@@ -0,0 +1,29 @@
+#include <resolve/Common.hpp>
+#include <resolve/vector/VectorKernels.hpp>
+
+#include "hipVectorKernels.h"
+
+namespace ReSolve { namespace vector {
+
+namespace kernels {
+
+__global__ void set_const(index_type n, real_type val, real_type* arr)
+{
+  index_type i = blockIdx.x * blockDim.x + threadIdx.x;
+  if(i < n)
+  {
+    arr[i] = val;
+  }
+}
+
+} // namespace kernels
+
+void set_array_const(index_type n, real_type val, real_type* arr) // fills arr[0..n-1] with val via kernels::set_const
+{
+  if (n <= 0) return; // nothing to set; avoids launching a kernel with a grid of zero blocks
+  const index_type block_size = 512; // threads per block
+  const index_type num_blocks = (n + block_size - 1) / block_size; // ceil(n / block_size)
+  kernels::set_const<<<num_blocks, block_size>>>(n, val, arr);
+}
+
+}} // namespace ReSolve::vector
\ No newline at end of file
diff --git a/resolve/hip/hip_check_errors.hpp b/resolve/hip/hip_check_errors.hpp
new file mode 100644
index 00000000..1f483d35
--- /dev/null
+++ b/resolve/hip/hip_check_errors.hpp
@@ -0,0 +1,28 @@
+/**
+ * @file hip_check_errors.hpp
+ * 
+ * Contains macro to get error code from HIP functions and to stream
+ * appropriate error output to Re::Solve's logger.
+ * 
+ * @author Kasia Swirydowicz <kasia.swirydowicz@pnnl.gov>
+ * @author Slaven Peles <peless@ornl.gov>
+ */
+#pragma once
+
+#include <resolve/utilities/logger/Logger.hpp>
+
+template <typename T>
+int  check(T result, 
+           char const *const func, 
+           const char *const file,
+           int const line) 
+{
+  if (result) {
+    ReSolve::io::Logger::error() << "HIP error in function "
+                                 << func << " at " << file << ":" << line 
+                                 << ", error# " << result << "\n";
+    return -1;
+  }
+  return 0;
+}
+#define checkHipErrors(val) check((val), #val, __FILE__, __LINE__)
diff --git a/resolve/resolve_defs.hpp.in b/resolve/resolve_defs.hpp.in
index 9756376c..15cd5791 100644
--- a/resolve/resolve_defs.hpp.in
+++ b/resolve/resolve_defs.hpp.in
@@ -1,4 +1,7 @@
-#pragma once
+// #pragma once
+
+#ifndef __RESOLVE_DEFINITIONS_HPP__
+#define __RESOLVE_DEFINITIONS_HPP__
 
 #cmakedefine RESOLVE_USE_GPU
 #cmakedefine RESOLVE_USE_CUDA
@@ -14,3 +17,12 @@
 
 // /// Date of build with the format "%Y-%m-%d"
 // #define RESOLVE_RELEASE_DATE "@RESOLVE_RELEASE_DATE@"
+
+#ifdef RESOLVE_USE_HIP
+#ifndef __HIP_PLATFORM_AMD__
+#define __HIP_PLATFORM_AMD__
+#endif
+#endif
+
+
+#endif // __RESOLVE_DEFINITIONS_HPP__
\ No newline at end of file
diff --git a/tests/functionality/testKLU.cpp b/tests/functionality/testKLU.cpp
index f3c1da57..b067f417 100644
--- a/tests/functionality/testKLU.cpp
+++ b/tests/functionality/testKLU.cpp
@@ -66,7 +66,7 @@ int main(int argc, char *argv[])
     return -1;
   }
   real_type* rhs = ReSolve::io::readRhsFromFile(rhs1_file);
-  real_type* x = new real_type[A->getNumRows()];
+  real_type* x   = new real_type[A->getNumRows()];
   vector_type* vec_rhs = new vector_type(A->getNumRows());
   vector_type* vec_x   = new vector_type(A->getNumRows());
   vector_type* vec_r   = new vector_type(A->getNumRows());
@@ -215,7 +215,8 @@ int main(int argc, char *argv[])
   //now DELETE
   delete A;
   delete KLU;
-  delete x;
+  delete [] x;
+  delete [] rhs;
   delete vec_r;
   delete vec_x;
   delete matrix_handler;
diff --git a/tests/functionality/testKLU_GLU.cpp b/tests/functionality/testKLU_GLU.cpp
index 0e9bb4bd..ddaf3b31 100644
--- a/tests/functionality/testKLU_GLU.cpp
+++ b/tests/functionality/testKLU_GLU.cpp
@@ -72,7 +72,7 @@ int main(int argc, char *argv[])
     return -1;
   }
   real_type* rhs = ReSolve::io::readRhsFromFile(rhs1_file);
-  real_type* x = new real_type[A->getNumRows()];
+  real_type* x   = new real_type[A->getNumRows()];
   vector_type* vec_rhs = new vector_type(A->getNumRows());
   vector_type* vec_x   = new vector_type(A->getNumRows());
   vec_x->allocate("cpu");//for KLU
@@ -239,7 +239,8 @@ int main(int argc, char *argv[])
   delete A;
   delete KLU;
   delete GLU;
-  delete x;
+  delete [] x;
+  delete [] rhs;
   delete vec_r;
   delete vec_x;
   delete workspace_CUDA;
diff --git a/tests/functionality/testKLU_Rf.cpp b/tests/functionality/testKLU_Rf.cpp
index 729968f5..124f07de 100644
--- a/tests/functionality/testKLU_Rf.cpp
+++ b/tests/functionality/testKLU_Rf.cpp
@@ -72,7 +72,7 @@ int main(int argc, char *argv[])
     return -1;
   }
   real_type* rhs = ReSolve::io::readRhsFromFile(rhs1_file);
-  real_type* x = new real_type[A->getNumRows()];
+  real_type* x   = new real_type[A->getNumRows()];
   vector_type* vec_rhs = new vector_type(A->getNumRows());
   vector_type* vec_x   = new vector_type(A->getNumRows());
   vector_type* vec_r   = new vector_type(A->getNumRows());
@@ -243,7 +243,8 @@ int main(int argc, char *argv[])
   delete A;
   delete KLU;
   delete Rf;
-  delete x;
+  delete [] x;
+  delete [] rhs;
   delete vec_r;
   delete vec_x;
   delete workspace_CUDA;
diff --git a/tests/functionality/testKLU_Rf_FGMRES.cpp b/tests/functionality/testKLU_Rf_FGMRES.cpp
index a474e406..6601a3ee 100644
--- a/tests/functionality/testKLU_Rf_FGMRES.cpp
+++ b/tests/functionality/testKLU_Rf_FGMRES.cpp
@@ -77,7 +77,7 @@ int main(int argc, char *argv[])
     return -1;
   }
   real_type* rhs = ReSolve::io::readRhsFromFile(rhs1_file);
-  real_type* x = new real_type[A->getNumRows()];
+  real_type* x   = new real_type[A->getNumRows()];
   vector_type* vec_rhs = new vector_type(A->getNumRows());
   vector_type* vec_x   = new vector_type(A->getNumRows());
   vector_type* vec_r   = new vector_type(A->getNumRows());
@@ -264,7 +264,8 @@ int main(int argc, char *argv[])
   delete GS;
   delete FGMRES;
   delete Rf;
-  delete x;
+  delete [] x;
+  delete [] rhs;
   delete vec_r;
   delete vec_x;
   delete workspace_CUDA;
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index f91c2ff7..a8586342 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -9,3 +9,4 @@
 add_subdirectory(matrix)
 add_subdirectory(vector)
 add_subdirectory(utilities)
+add_subdirectory(memory)
diff --git a/tests/unit/memory/CMakeLists.txt b/tests/unit/memory/CMakeLists.txt
new file mode 100644
index 00000000..01313e33
--- /dev/null
+++ b/tests/unit/memory/CMakeLists.txt
@@ -0,0 +1,21 @@
+#[[
+
+@brief Build ReSolve memory utilities unit tests
+
+@author Slaven Peles <peless@ornl.gov>
+
+]]
+
+# Build memory utilities tests
+add_executable(runMemoryUtilsTests.exe runMemoryUtilsTests.cpp)
+target_link_libraries(runMemoryUtilsTests.exe PRIVATE ReSolve)
+message(STATUS "Resolve libraries: ${resolve_backend_hip}")
+
+
+# Install tests
+set(installable_tests runMemoryUtilsTests.exe)
+install(TARGETS ${installable_tests} 
+        RUNTIME DESTINATION bin/resolve/tests/unit)
+
+# Add tests to run
+add_test(NAME memory_test COMMAND $<TARGET_FILE:runMemoryUtilsTests.exe>)
diff --git a/tests/unit/memory/MemoryUtilsTests.hpp b/tests/unit/memory/MemoryUtilsTests.hpp
new file mode 100644
index 00000000..4cc1ace8
--- /dev/null
+++ b/tests/unit/memory/MemoryUtilsTests.hpp
@@ -0,0 +1,110 @@
+#pragma once
+#include <string>
+#include <vector>
+#include <sstream>
+#include <iterator>
+#include <algorithm>
+#include <resolve/MemoryUtils.hpp>
+#include <tests/unit/TestBase.hpp>
+
+namespace ReSolve { namespace tests {
+
+/**
+ * @class Unit tests for memory handler class
+ */
+class MemoryUtilsTests : TestBase
+{
+public:
+  MemoryUtilsTests(std::string memspace) : memspace_(memspace) 
+  {}
+  virtual ~MemoryUtilsTests()
+  {}
+
+  TestOutcome allocateAndDelete()
+  {
+    TestStatus status;
+    status = true;
+
+    MemoryHandler mh;
+
+    index_type n = 1000;
+    size_t     m = 8000;
+    index_type* i = nullptr;
+    real_type*  r = nullptr;
+
+    mh.allocateArrayOnDevice(&i, n);
+    mh.allocateBufferOnDevice((void**) &r, m);
+
+    status *= (i != nullptr);
+    status *= (r != nullptr);
+
+    mh.deleteOnDevice(i);
+    mh.deleteOnDevice(r);
+
+    return status.report(__func__);
+  }
+
+  /** @brief Checks zero-fill and host/device/device copies by round-tripping values through the device. */
+  TestOutcome memsetAndMemcpy()
+  {
+    TestStatus status;
+    status = true;
+    MemoryHandler mh;
+    index_type n = 10;
+    real_type zero = 0.0;
+    real_type minusone = -1.0;
+    // Create raw arrays on the host and set their elements to -1
+    real_type* array1 = new real_type[n]{0};
+    real_type* array2 = new real_type[n]{0};
+    std::fill_n(array1, n, minusone);
+    std::fill_n(array2, n, minusone);
+
+    // Allocate arrays of size n on the device
+    real_type* devarray1 = nullptr;
+    real_type* devarray2 = nullptr;
+    mh.allocateArrayOnDevice(&devarray1, n);
+    mh.allocateArrayOnDevice(&devarray2, n);
+
+    // Set devarray1 elements to 0 and copy it to array1
+    mh.setZeroArrayOnDevice(devarray1, n);
+    mh.copyArrayDeviceToHost(array1, devarray1, n);
+    status *= verifyAnswer(array1, zero, n);
+    // Copy array2 (values -1) to devarray2 and then devarray2 to array1
+    mh.copyArrayHostToDevice(devarray2, array2, n);
+    mh.copyArrayDeviceToHost(array1, devarray2, n);
+    status *= verifyAnswer(array1, minusone, n);
+    // Copy devarray1 (values 0) to devarray2 and then to array2
+    mh.copyArrayDeviceToDevice(devarray2, devarray1, n);
+    mh.copyArrayDeviceToHost(array2, devarray2, n);
+    status *= verifyAnswer(array2, zero, n);
+
+    // Release the host and device arrays so the test does not leak memory
+    delete [] array1;
+    delete [] array2;
+    mh.deleteOnDevice(devarray1);
+    mh.deleteOnDevice(devarray2);
+    return status.report(__func__);
+  }
+
+
+private:
+  std::string memspace_{"cpu"};
+
+  bool verifyAnswer(real_type* x, real_type answer, index_type n)
+  {
+    bool status = true;
+
+    for (index_type i = 0; i < n; ++i) {
+      if (!isEqual(x[i], answer)) {
+        status = false;
+        std::cout << "Solution vector element x[" << i << "] = " << x[i]
+                  << ", expected: " << answer << "\n";
+        break; 
+      }
+    }
+    return status;
+  }
+
+}; // class MemoryUtilsTests
+
+}} // namespace ReSolve::tests
diff --git a/tests/unit/memory/runMemoryUtilsTests.cpp b/tests/unit/memory/runMemoryUtilsTests.cpp
new file mode 100644
index 00000000..00349c7c
--- /dev/null
+++ b/tests/unit/memory/runMemoryUtilsTests.cpp
@@ -0,0 +1,36 @@
+#include <string>
+#include <iostream>
+#include <fstream>
+
+#include "MemoryUtilsTests.hpp"
+
+// Runs the MemoryUtils unit tests for every GPU backend enabled at compile
+// time and returns the accumulated result summary as the exit code.
+int main(int, char**)
+{
+  ReSolve::tests::TestingResults result;
+
+#ifdef RESOLVE_USE_HIP
+  {
+    std::cout << "Running memory tests with HIP backend:\n";
+    ReSolve::tests::MemoryUtilsTests test("hip");
+    result += test.allocateAndDelete();
+    result += test.memsetAndMemcpy();
+
+    std::cout << "\n";
+  }
+#endif
+
+#ifdef RESOLVE_USE_CUDA
+  {
+    std::cout << "Running memory tests with CUDA backend:\n";
+    ReSolve::tests::MemoryUtilsTests test("cuda");
+    result += test.allocateAndDelete();
+    result += test.memsetAndMemcpy();
+
+    std::cout << "\n";
+  }
+#endif
+
+  return result.summary();
+}

From 3b1b78216bb9bf4579d9166bcd19bdeb45c9842b Mon Sep 17 00:00:00 2001
From: pelesh <peless@ornl.gov>
Date: Fri, 27 Oct 2023 21:56:34 -0400
Subject: [PATCH 02/12] First stab at hip linear algebra. (#39)

Co-authored-by: kswirydo <kasia.swirydowicz@gmail.com>
---
 cmake/ReSolveFindHipLibraries.cmake         |   2 +
 resolve/hip/CMakeLists.txt                  |   4 +-
 resolve/hip/hipKernels.h                    |  14 ++
 resolve/hip/hipKernels.hip                  | 167 ++++++++++++++
 resolve/hip/hipVectorKernels.hip            |  29 ++-
 resolve/matrix/CMakeLists.txt               |  13 ++
 resolve/matrix/Coo.cpp                      |  30 +--
 resolve/matrix/Csc.cpp                      |  30 +--
 resolve/matrix/Csr.cpp                      |  30 +--
 resolve/matrix/MatrixHandler.cpp            |  30 +++
 resolve/matrix/MatrixHandler.hpp            |   4 +
 resolve/matrix/MatrixHandlerHip.cpp         | 154 +++++++++++++
 resolve/matrix/MatrixHandlerHip.hpp         |  60 +++++
 resolve/vector/CMakeLists.txt               |  16 ++
 resolve/vector/Vector.cpp                   |  48 ++--
 resolve/vector/VectorHandler.cpp            |  52 ++++-
 resolve/vector/VectorHandler.hpp            |   4 +
 resolve/vector/VectorHandlerHip.cpp         | 236 ++++++++++++++++++++
 resolve/vector/VectorHandlerHip.hpp         |  57 +++++
 resolve/workspace/CMakeLists.txt            |  13 ++
 resolve/workspace/LinAlgWorkspace.hpp       |   4 +
 resolve/workspace/LinAlgWorkspaceHIP.cpp    |  75 +++++++
 resolve/workspace/LinAlgWorkspaceHIP.hpp    |  52 +++++
 tests/unit/matrix/CMakeLists.txt            |   2 +-
 tests/unit/matrix/MatrixHandlerTests.hpp    |   9 +-
 tests/unit/matrix/runMatrixHandlerTests.cpp |  12 +
 tests/unit/vector/VectorHandlerTests.hpp    |  12 +-
 tests/unit/vector/runVectorHandlerTests.cpp |  17 ++
 28 files changed, 1077 insertions(+), 99 deletions(-)
 create mode 100644 resolve/hip/hipKernels.h
 create mode 100644 resolve/hip/hipKernels.hip
 create mode 100644 resolve/matrix/MatrixHandlerHip.cpp
 create mode 100644 resolve/matrix/MatrixHandlerHip.hpp
 create mode 100644 resolve/vector/VectorHandlerHip.cpp
 create mode 100644 resolve/vector/VectorHandlerHip.hpp
 create mode 100644 resolve/workspace/LinAlgWorkspaceHIP.cpp
 create mode 100644 resolve/workspace/LinAlgWorkspaceHIP.hpp

diff --git a/cmake/ReSolveFindHipLibraries.cmake b/cmake/ReSolveFindHipLibraries.cmake
index 83b7c220..e754da0d 100644
--- a/cmake/ReSolveFindHipLibraries.cmake
+++ b/cmake/ReSolveFindHipLibraries.cmake
@@ -9,6 +9,8 @@ find_package(hipblas REQUIRED)
 target_link_libraries(resolve_hip INTERFACE
   #hip::host 
   hip::device
+  rocblas
+  rocsparse
   #roc::hipblas
 )
 
diff --git a/resolve/hip/CMakeLists.txt b/resolve/hip/CMakeLists.txt
index f0a93b04..f8d7a457 100644
--- a/resolve/hip/CMakeLists.txt
+++ b/resolve/hip/CMakeLists.txt
@@ -7,14 +7,14 @@
 ]]
 
 set(ReSolve_HIP_SRC
-    # hipKernels.cu
+    hipKernels.hip
     hipVectorKernels.hip
     MemoryUtils.hip
 )
 
 set(ReSolve_HIP_HEADER_INSTALL
     # hipKernels.h
-    # hipVectorKernels.h
+    hipVectorKernels.h
     HipMemory.hpp
     # hip_check_errors.hpp
 )
diff --git a/resolve/hip/hipKernels.h b/resolve/hip/hipKernels.h
new file mode 100644
index 00000000..9c48783a
--- /dev/null
+++ b/resolve/hip/hipKernels.h
@@ -0,0 +1,14 @@
+void mass_inner_product_two_vectors(int n, 
+                                    int i, 
+                                    double* vec1, 
+                                    double* vec2, 
+                                    double* mvec, 
+                                    double* result);
+void mass_axpy(int n, int i, double* x, double* y, double* alpha);
+
+//needed for matrix inf nrm
+void matrix_row_sums(int n, 
+                     int nnz, 
+                     int* a_ia,
+                     double* a_val, 
+                     double* result);
diff --git a/resolve/hip/hipKernels.hip b/resolve/hip/hipKernels.hip
new file mode 100644
index 00000000..13f53d85
--- /dev/null
+++ b/resolve/hip/hipKernels.hip
@@ -0,0 +1,167 @@
+#include "hipKernels.h"
+#define maxk 1024
+#define Tv5 1024
+
+#include <hip/hip_runtime.h>
+
+// Computes V^T [u1 u2], where v is n x k and u1, u2 are n x 1: block j writes dot(v_j, u1) to result[j] and dot(v_j, u2) to result[j + k].
+__global__ void MassIPTwoVec_kernel(const double* __restrict__ u1, 
+                                    const double* __restrict__ u2, 
+                                    const double* __restrict__ v, 
+                                    double* result,
+                                    const int k, 
+                                    const int N)
+{
+  int t = threadIdx.x;
+  int bsize = blockDim.x;
+  // Per-block scratch arrays: s_tmp1 collects the u1 products, s_tmp2 the u2 products.
+  // assume T threads per thread block (and k reductions to be performed)
+  volatile __shared__ double s_tmp1[Tv5];
+
+  volatile __shared__ double s_tmp2[Tv5];
+  // map between thread index space and the problem index space
+  int j = blockIdx.x;
+  s_tmp1[t] = 0.0f;
+  s_tmp2[t] = 0.0f;
+  int nn = t;
+  double can1, can2, cbn;
+  // Each thread accumulates its partial sums over entries t, t + bsize, ... of column j of v.
+  while(nn < N) {
+    can1 = u1[nn];
+    can2 = u2[nn];
+
+    cbn = v[N * j + nn];
+    s_tmp1[t] += can1 * cbn;
+    s_tmp2[t] += can2 * cbn;
+
+    nn += bsize;
+  }
+
+  __syncthreads();
+  // Pairwise reduction in shared memory. NOTE(review): reads indices up to Tv5 - 1, so this appears to require blockDim.x == Tv5.
+  if(Tv5 >= 1024) {
+    if(t < 512) {
+      s_tmp1[t] += s_tmp1[t + 512];
+      s_tmp2[t] += s_tmp2[t + 512];
+    }
+    __syncthreads();
+  }
+  if(Tv5 >= 512) {
+    if(t < 256) {
+      s_tmp1[t] += s_tmp1[t + 256];
+      s_tmp2[t] += s_tmp2[t + 256];
+    }
+    __syncthreads();
+  }
+  {
+    if(t < 128) {
+      s_tmp1[t] += s_tmp1[t + 128];
+      s_tmp2[t] += s_tmp2[t + 128];
+    }
+    __syncthreads();
+  }
+  {
+    if(t < 64) {
+      s_tmp1[t] += s_tmp1[t + 64];
+      s_tmp2[t] += s_tmp2[t + 64];
+    }
+    __syncthreads();
+  }
+  // No __syncthreads() below. NOTE(review): presumably relies on volatile storage and these 32 threads running in lockstep - confirm for the target GPU.
+  if(t < 32) {
+    s_tmp1[t] += s_tmp1[t + 32];
+    s_tmp2[t] += s_tmp2[t + 32];
+
+    s_tmp1[t] += s_tmp1[t + 16];
+    s_tmp2[t] += s_tmp2[t + 16];
+
+    s_tmp1[t] += s_tmp1[t + 8];
+    s_tmp2[t] += s_tmp2[t + 8];
+
+    s_tmp1[t] += s_tmp1[t + 4];
+    s_tmp2[t] += s_tmp2[t + 4];
+
+    s_tmp1[t] += s_tmp1[t + 2];
+    s_tmp2[t] += s_tmp2[t + 2];
+
+    s_tmp1[t] += s_tmp1[t + 1];
+    s_tmp2[t] += s_tmp2[t + 1];
+  }
+  if(t == 0) {
+    result[blockIdx.x] = s_tmp1[0];
+    result[blockIdx.x + k] = s_tmp2[0];
+  }
+}
+
+
+// Mass AXPY: y = y - X*alpha, where X holds k vectors of length N stored one after
+// another (x_data[j * N + i]) and alpha is [k x 1]; needed in 1 and 2 synch GMRES.
+__global__ void massAxpy3_kernel(int N,
+                                 int k,
+                                 const double* x_data,
+                                 double* y_data,
+                                 const double* alpha) {
+
+  unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Stage alpha in shared memory. The load strides by blockDim.x so that all k
+  // entries are filled even when k exceeds the block size (k must not exceed maxk).
+  __shared__ double s_alpha[maxk];
+  for(unsigned int t = threadIdx.x; t < k; t += blockDim.x) {
+    s_alpha[t] = alpha[t];
+  }
+  __syncthreads();
+  while (i < N){
+    double temp = 0.0;
+    for(int j = 0; j < k; ++j) {
+      temp += x_data[j * N + i] * s_alpha[j];
+    }
+    y_data[i] -= temp;
+    i += (blockDim.x*gridDim.x);
+  }
+}
+// Row-wise absolute sums: result[i] = \sum_j abs(a_{ij}), with the entries of
+// row i located at a_val[a_ia[i]] ... a_val[a_ia[i + 1] - 1]. Every thread
+// handles rows row, row + stride, ... The nnz argument is not used here.
+__global__ void matrixInfNormPart1(const int n, 
+                                   const int nnz, 
+                                   const int* a_ia,
+                                   const double* a_val, 
+                                   double* result) {
+  const int stride = blockDim.x * gridDim.x;
+
+  for (int row = blockIdx.x * blockDim.x + threadIdx.x; row < n; row += stride) {
+    const int row_begin = a_ia[row];
+    const int row_end   = a_ia[row + 1];
+
+    double abs_sum = 0.0;
+    for (int p = row_begin; p < row_end; ++p) {
+      abs_sum += fabs(a_val[p]);
+    }
+    result[row] = abs_sum;
+  }
+}
+
+
+// Host-side launchers for the kernels defined above. Every launch requests
+// no dynamic shared memory and uses stream 0.
+void mass_inner_product_two_vectors(int n, int i, double* vec1, double* vec2,
+                                    double* mvec, double* result)
+{
+  const int k = i + 1; // number of blocks, also passed to the kernel as k
+  hipLaunchKernelGGL(MassIPTwoVec_kernel, dim3(k), dim3(1024), 0, 0, vec1, vec2, mvec, result, k, n);
+}
+
+void mass_axpy(int n, int i, double* x, double* y, double* alpha)
+{
+  const int threads = 384;
+  const int blocks  = (n + threads - 1) / threads; // ceil(n / threads)
+  hipLaunchKernelGGL(massAxpy3_kernel, dim3(blocks), dim3(threads), 0, 0, n, i, x, y, alpha);
+}
+
+void matrix_row_sums(int n, int nnz, int* a_ia, double* a_val, double* result)
+{
+  const dim3 grid(1000);
+  const dim3 block(1024);
+  hipLaunchKernelGGL(matrixInfNormPart1, grid, block, 0, 0, n, nnz, a_ia, a_val, result);
+}
diff --git a/resolve/hip/hipVectorKernels.hip b/resolve/hip/hipVectorKernels.hip
index 3df2b84b..f68cd0b9 100644
--- a/resolve/hip/hipVectorKernels.hip
+++ b/resolve/hip/hipVectorKernels.hip
@@ -1,29 +1,28 @@
 #include <resolve/Common.hpp>
 #include <resolve/vector/VectorKernels.hpp>
-
-#include "hipVectorKernels.h"
+#include <hip/hip_runtime.h>
 
 namespace ReSolve { namespace vector {
 
 namespace kernels {
 
-__global__ void set_const(index_type n, real_type val, real_type* arr)
-{
-  index_type i = blockIdx.x * blockDim.x + threadIdx.x;
-  if(i < n)
+  __global__ void set_const(index_type n, real_type val, real_type* arr)
   {
-    arr[i] = val;
+    index_type i = blockIdx.x * blockDim.x + threadIdx.x;
+    while (i < n)
+    {
+      arr[i] = val;
+      i += blockDim.x * gridDim.x;
+    }
   }
-}
-
 } // namespace kernels
 
-void set_array_const(index_type n, real_type val, real_type* arr)
+void set_array_const(index_type  n, real_type val, real_type* arr)
 {
-  index_type num_blocks;
-  index_type block_size = 512;
-  num_blocks = (n + block_size - 1) / block_size;
-  kernels::set_const<<<num_blocks, block_size>>>(n, val, arr);
+   index_type num_blocks;
+   index_type block_size = 512;
+   num_blocks = (n + block_size - 1) / block_size;
+   hipLaunchKernelGGL( kernels::set_const, dim3(num_blocks), dim3(block_size), 0, 0, n, val, arr);
 }
 
-}} // namespace ReSolve::vector
\ No newline at end of file
+}} // namespace ReSolve::vector
diff --git a/resolve/matrix/CMakeLists.txt b/resolve/matrix/CMakeLists.txt
index 554c0ba7..565fa7c9 100644
--- a/resolve/matrix/CMakeLists.txt
+++ b/resolve/matrix/CMakeLists.txt
@@ -22,6 +22,11 @@ set(Matrix_CUDASDK_SRC
     MatrixHandlerCuda.cpp
 )
 
+# and on HIP
+set(Matrix_ROCM_SRC 
+  MatrixHandlerHip.cpp
+)
+
 # Header files to be installed
 set(Matrix_HEADER_INSTALL
     io.hpp
@@ -37,6 +42,10 @@ if(RESOLVE_USE_CUDA)
   set(Matrix_SRC ${Matrix_SRC} ${Matrix_CUDASDK_SRC})
 endif()
 
+if(RESOLVE_USE_HIP)
+  set(Matrix_SRC ${Matrix_SRC} ${Matrix_ROCM_SRC})
+endif()
+
 
 # Build shared library ReSolve::matrix
 add_library(resolve_matrix SHARED ${Matrix_SRC})
@@ -47,6 +56,10 @@ if (RESOLVE_USE_CUDA)
   target_link_libraries(resolve_matrix PUBLIC resolve_backend_cuda)
 endif()
 
+if (RESOLVE_USE_HIP)
+  target_link_libraries(resolve_matrix PUBLIC resolve_backend_hip)
+endif()
+
 # Link to dummy device backend if GPU support is not enabled
 if (NOT RESOLVE_USE_GPU)
   target_link_libraries(resolve_matrix PUBLIC resolve_backend_cpu)
diff --git a/resolve/matrix/Coo.cpp b/resolve/matrix/Coo.cpp
index c8caebf6..a91f94a9 100644
--- a/resolve/matrix/Coo.cpp
+++ b/resolve/matrix/Coo.cpp
@@ -33,8 +33,8 @@ namespace ReSolve
       copyData("cpu");
       return this->h_row_data_;
     } else {
-      if (memspace == "cuda") {
-        copyData("cuda");
+      if ((memspace == "cuda") || (memspace == "hip")) {
+        copyData(memspace);
         return this->d_row_data_;
       } else {
         return nullptr;
@@ -48,8 +48,8 @@ namespace ReSolve
       copyData("cpu");
       return this->h_col_data_;
     } else {
-      if (memspace == "cuda") {
-        copyData("cuda");
+      if ((memspace == "cuda") || (memspace == "hip")) {
+        copyData(memspace);
         return this->d_col_data_;
       } else {
         return nullptr;
@@ -63,8 +63,8 @@ namespace ReSolve
       copyData("cpu");
       return this->h_val_data_;
     } else {
-      if (memspace == "cuda") {
-        copyData("cuda");
+      if ((memspace == "cuda") || (memspace == "hip")) {
+        copyData(memspace);
         return this->d_val_data_;
       } else {
         return nullptr;
@@ -81,9 +81,9 @@ namespace ReSolve
     setNotUpdated();
     int control=-1;
     if ((memspaceIn == "cpu") && (memspaceOut == "cpu")){ control = 0;}
-    if ((memspaceIn == "cpu") && (memspaceOut == "cuda")){ control = 1;}
-    if ((memspaceIn == "cuda") && (memspaceOut == "cpu")){ control = 2;}
-    if ((memspaceIn == "cuda") && (memspaceOut == "cuda")){ control = 3;}
+    if ((memspaceIn == "cpu") && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 1;}
+    if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && (memspaceOut == "cpu")){ control = 2;}
+    if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 3;}
 
     if (memspaceOut == "cpu") {
       //check if cpu data allocated	
@@ -98,7 +98,7 @@ namespace ReSolve
       }
     }
 
-    if (memspaceOut == "cuda") {
+    if ((memspaceOut == "cuda") || (memspaceOut == "hip")) {
       //check if cuda data allocated
       if (d_row_data_ == nullptr) {
         mem_.allocateArrayOnDevice(&d_row_data_, nnz_current);
@@ -120,7 +120,7 @@ namespace ReSolve
         owns_cpu_data_ = true;
         owns_cpu_vals_ = true;
         break;
-      case 2://cuda->cpu
+      case 2://gpu->cpu
         mem_.copyArrayDeviceToHost(h_row_data_, row_data, nnz_current);
         mem_.copyArrayDeviceToHost(h_col_data_, col_data, nnz_current);
         mem_.copyArrayDeviceToHost(h_val_data_, val_data, nnz_current);
@@ -128,7 +128,7 @@ namespace ReSolve
         owns_cpu_data_ = true;
         owns_cpu_vals_ = true;
         break;
-      case 1://cpu->cuda
+      case 1://cpu->gpu
         mem_.copyArrayHostToDevice(d_row_data_, row_data, nnz_current);
         mem_.copyArrayHostToDevice(d_col_data_, col_data, nnz_current);
         mem_.copyArrayHostToDevice(d_val_data_, val_data, nnz_current);
@@ -136,7 +136,7 @@ namespace ReSolve
         owns_gpu_data_ = true;
         owns_gpu_vals_ = true;
         break;
-      case 3://cuda->cuda
+      case 3://gpu->gpu
         mem_.copyArrayDeviceToDevice(d_row_data_, row_data, nnz_current);
         mem_.copyArrayDeviceToDevice(d_col_data_, col_data, nnz_current);
         mem_.copyArrayDeviceToDevice(d_val_data_, val_data, nnz_current);
@@ -176,7 +176,7 @@ namespace ReSolve
       return 0;
     }
 
-    if (memspace == "cuda") {
+    if ((memspace == "cuda") || (memspace == "hip")) {
       mem_.allocateArrayOnDevice(&d_row_data_, nnz_current); 
       mem_.allocateArrayOnDevice(&d_col_data_, nnz_current); 
       mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); 
@@ -215,7 +215,7 @@ namespace ReSolve
       return 0;
     }
 
-    if (memspaceOut == "cuda") {
+    if ((memspaceOut == "cuda") || (memspaceOut == "hip")) {
       if ((d_data_updated_ == false) && (h_data_updated_ == true)) {
         if (d_row_data_ == nullptr) {
           mem_.allocateArrayOnDevice(&d_row_data_, nnz_current);
diff --git a/resolve/matrix/Csc.cpp b/resolve/matrix/Csc.cpp
index 1a305e03..e2ea765f 100644
--- a/resolve/matrix/Csc.cpp
+++ b/resolve/matrix/Csc.cpp
@@ -30,8 +30,8 @@ namespace ReSolve
       copyData("cpu");
       return this->h_row_data_;
     } else {
-      if (memspace == "cuda") {
-        copyData("cuda");
+      if ((memspace == "cuda") || (memspace == "hip")) {
+        copyData(memspace);
         return this->d_row_data_;
       } else {
         return nullptr;
@@ -45,8 +45,8 @@ namespace ReSolve
       copyData("cpu");
       return this->h_col_data_;
     } else {
-      if (memspace == "cuda") {
-        copyData("cuda");
+      if ((memspace == "cuda") || (memspace == "hip")) {
+        copyData(memspace);
         return this->d_col_data_;
       } else {
         return nullptr;
@@ -60,8 +60,8 @@ namespace ReSolve
       copyData("cpu");
       return this->h_val_data_;
     } else {
-      if (memspace == "cuda") {
-        copyData("cuda");
+      if ((memspace == "cuda") || (memspace == "hip")) {
+        copyData(memspace);
         return this->d_val_data_;
       } else {
         return nullptr;
@@ -77,9 +77,9 @@ namespace ReSolve
     int control=-1;
     setNotUpdated();
     if ((memspaceIn == "cpu") && (memspaceOut == "cpu")){ control = 0;}
-    if ((memspaceIn == "cpu") && (memspaceOut == "cuda")){ control = 1;}
-    if ((memspaceIn == "cuda") && (memspaceOut == "cpu")){ control = 2;}
-    if ((memspaceIn == "cuda") && (memspaceOut == "cuda")){ control = 3;}
+    if ((memspaceIn == "cpu") && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 1;}
+    if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && (memspaceOut == "cpu")){ control = 2;}
+    if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 3;}
 
     if (memspaceOut == "cpu") {
       //check if cpu data allocated
@@ -94,7 +94,7 @@ namespace ReSolve
       }
     }
 
-    if (memspaceOut == "cuda") {
+    if ((memspaceOut == "cuda") || (memspaceOut == "hip")) {
       //check if cuda data allocated
       if (d_col_data_ == nullptr) {
         mem_.allocateArrayOnDevice(&d_col_data_, n_ + 1); 
@@ -116,7 +116,7 @@ namespace ReSolve
         owns_cpu_data_ = true;
         owns_cpu_vals_ = true;
         break;
-      case 2://cuda->cpu
+      case 2://gpu->cpu
         mem_.copyArrayDeviceToHost(h_col_data_, col_data,      n_ + 1);
         mem_.copyArrayDeviceToHost(h_row_data_, row_data, nnz_current);
         mem_.copyArrayDeviceToHost(h_val_data_, val_data, nnz_current);
@@ -124,7 +124,7 @@ namespace ReSolve
         owns_cpu_data_ = true;
         owns_cpu_vals_ = true;
         break;
-      case 1://cpu->cuda
+      case 1://cpu->gpu
         mem_.copyArrayHostToDevice(d_col_data_, col_data,      n_ + 1);
         mem_.copyArrayHostToDevice(d_row_data_, row_data, nnz_current);
         mem_.copyArrayHostToDevice(d_val_data_, val_data, nnz_current);
@@ -132,7 +132,7 @@ namespace ReSolve
         owns_gpu_data_ = true;
         owns_gpu_vals_ = true;
         break;
-      case 3://cuda->cuda
+      case 3://gpu->gpu
         mem_.copyArrayDeviceToDevice(d_col_data_, col_data,      n_ + 1);
         mem_.copyArrayDeviceToDevice(d_row_data_, row_data, nnz_current);
         mem_.copyArrayDeviceToDevice(d_val_data_, val_data, nnz_current);
@@ -173,7 +173,7 @@ namespace ReSolve
       return 0;
     }
 
-    if (memspace == "cuda") {
+    if ((memspace == "cuda") || (memspace == "hip")) {
       mem_.allocateArrayOnDevice(&d_col_data_,      n_ + 1); 
       mem_.allocateArrayOnDevice(&d_row_data_, nnz_current); 
       mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); 
@@ -212,7 +212,7 @@ namespace ReSolve
       return 0;   
     }
 
-    if (memspaceOut == "cuda") {
+    if ((memspaceOut == "cuda") || (memspaceOut == "hip")) {
       if ((d_data_updated_ == false) && (h_data_updated_ == true)) {
         if (d_col_data_ == nullptr) {
           mem_.allocateArrayOnDevice(&d_col_data_, n_ + 1); 
diff --git a/resolve/matrix/Csr.cpp b/resolve/matrix/Csr.cpp
index f1ddd31f..dff33b48 100644
--- a/resolve/matrix/Csr.cpp
+++ b/resolve/matrix/Csr.cpp
@@ -30,8 +30,8 @@ namespace ReSolve
       copyData("cpu");
       return this->h_row_data_;
     } else {
-      if (memspace == "cuda") {
-        copyData("cuda");
+      if ((memspace == "cuda") || (memspace == "hip")) {
+        copyData(memspace);
         return this->d_row_data_;
       } else {
         return nullptr;
@@ -45,8 +45,8 @@ namespace ReSolve
       copyData("cpu");
       return this->h_col_data_;
     } else {
-      if (memspace == "cuda") {
-        copyData("cuda");
+      if ((memspace == "cuda") || (memspace == "hip")) {
+        copyData(memspace);
         return this->d_col_data_;
       } else {
         return nullptr;
@@ -60,8 +60,8 @@ namespace ReSolve
       copyData("cpu");
       return this->h_val_data_;
     } else {
-      if (memspace == "cuda") {
-        copyData("cuda");
+      if ((memspace == "cuda") || (memspace == "hip")) {
+        copyData(memspace);
         return this->d_val_data_;
       } else {
         return nullptr;
@@ -77,9 +77,9 @@ namespace ReSolve
     setNotUpdated();
     int control = -1;
     if ((memspaceIn == "cpu") && (memspaceOut == "cpu")){ control = 0;}
-    if ((memspaceIn == "cpu") && (memspaceOut == "cuda")){ control = 1;}
-    if ((memspaceIn == "cuda") && (memspaceOut == "cpu")){ control = 2;}
-    if ((memspaceIn == "cuda") && (memspaceOut == "cuda")){ control = 3;}
+    if ((memspaceIn == "cpu") && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 1;}
+    if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && (memspaceOut == "cpu")){ control = 2;}
+    if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 3;}
 
     if (memspaceOut == "cpu") {
       //check if cpu data allocated
@@ -94,7 +94,7 @@ namespace ReSolve
       }
     }
 
-    if (memspaceOut == "cuda") {
+    if ((memspaceOut == "cuda") || (memspaceOut == "hip")) {
       //check if cuda data allocated
       if (d_row_data_ == nullptr) {
         mem_.allocateArrayOnDevice(&d_row_data_, n_ + 1); 
@@ -118,7 +118,7 @@ namespace ReSolve
         owns_cpu_data_ = true;
         owns_cpu_vals_ = true;
         break;
-      case 2://cuda->cpu
+      case 2://gpu->cpu
         mem_.copyArrayDeviceToHost(h_row_data_, row_data,      n_ + 1);
         mem_.copyArrayDeviceToHost(h_col_data_, col_data, nnz_current);
         mem_.copyArrayDeviceToHost(h_val_data_, val_data, nnz_current);
@@ -126,7 +126,7 @@ namespace ReSolve
         owns_cpu_data_ = true;
         owns_cpu_vals_ = true;
         break;
-      case 1://cpu->cuda
+      case 1://cpu->gpu
         mem_.copyArrayHostToDevice(d_row_data_, row_data,      n_ + 1);
         mem_.copyArrayHostToDevice(d_col_data_, col_data, nnz_current);
         mem_.copyArrayHostToDevice(d_val_data_, val_data, nnz_current);
@@ -134,7 +134,7 @@ namespace ReSolve
         owns_gpu_data_ = true;
         owns_gpu_vals_ = true;
         break;
-      case 3://cuda->cuda
+      case 3://gpu->gpu
         mem_.copyArrayDeviceToDevice(d_row_data_, row_data,      n_ + 1);
         mem_.copyArrayDeviceToDevice(d_col_data_, col_data, nnz_current);
         mem_.copyArrayDeviceToDevice(d_val_data_, val_data, nnz_current);
@@ -174,7 +174,7 @@ namespace ReSolve
       return 0;   
     }
 
-    if (memspace == "cuda") {
+    if ((memspace == "cuda") || (memspace == "hip")) {
       mem_.allocateArrayOnDevice(&d_row_data_,      n_ + 1); 
       mem_.allocateArrayOnDevice(&d_col_data_, nnz_current); 
       mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); 
@@ -212,7 +212,7 @@ namespace ReSolve
       return 0;
     }
 
-    if (memspaceOut == "cuda") {
+    if ((memspaceOut == "cuda") || (memspaceOut == "hip")) {
       if ((d_data_updated_ == false) && (h_data_updated_ == true)) {
         if (d_row_data_ == nullptr) {
           mem_.allocateArrayOnDevice(&d_row_data_, n_ + 1); 
diff --git a/resolve/matrix/MatrixHandler.cpp b/resolve/matrix/MatrixHandler.cpp
index 8bf4302c..133a09f9 100644
--- a/resolve/matrix/MatrixHandler.cpp
+++ b/resolve/matrix/MatrixHandler.cpp
@@ -13,6 +13,9 @@
 #ifdef RESOLVE_USE_CUDA
 #include "MatrixHandlerCuda.hpp"
 #endif
+#ifdef RESOLVE_USE_HIP
+#include "MatrixHandlerHip.hpp"
+#endif
 
 namespace ReSolve {
   // Create a shortcut name for Logger static class
@@ -41,6 +44,7 @@ namespace ReSolve {
   {
     if (isCpuEnabled_)  delete cpuImpl_;
     if (isCudaEnabled_) delete cudaImpl_;
+    if (isHipEnabled_) delete hipImpl_;
   }
 
   /**
@@ -74,12 +78,31 @@ namespace ReSolve {
   }
 #endif
 
+#ifdef RESOLVE_USE_HIP
+  /**
+   * @brief Constructor taking pointer to the HIP workspace as its parameter.
+   * 
+   * @post A CPU implementation instance is created because it is cheap and
+   * it does not require a workspace.
+   * 
+   * @post A HIP implementation instance is created with supplied workspace.
+   */
+  MatrixHandler::MatrixHandler(LinAlgWorkspaceHIP* new_workspace)
+  {
+    cpuImpl_  = new MatrixHandlerCpu();
+    hipImpl_ = new MatrixHandlerHip(new_workspace);
+    isCpuEnabled_  = true;
+    isHipEnabled_ = true;
+  }
+#endif
   void MatrixHandler::setValuesChanged(bool isValuesChanged, std::string memspace)
   {
     if (memspace == "cpu") {
       cpuImpl_->setValuesChanged(isValuesChanged);
     } else if (memspace == "cuda") {
       cudaImpl_->setValuesChanged(isValuesChanged);
+    } else if (memspace == "hip") {
+      hipImpl_->setValuesChanged(isValuesChanged);
     } else {
       out::error() << "Unsupported device " << memspace << "\n";
     }
@@ -230,6 +253,8 @@ namespace ReSolve {
     } else {
       if (memspace == "cuda"){      
         A_csr->updateData(csr_ia, csr_ja, csr_a, "cpu", "cuda");
+      } else if (memspace == "hip"){      
+        A_csr->updateData(csr_ia, csr_ja, csr_a, "cpu", "cuda");
       } else {
         //display error
       }
@@ -269,6 +294,9 @@ namespace ReSolve {
       return cudaImpl_->matvec(A, vec_x, vec_result, alpha, beta, matrixFormat);
     } else if (memspace == "cpu") {
         return cpuImpl_->matvec(A, vec_x, vec_result, alpha, beta, matrixFormat);
+    } else if (memspace == "hip") {
+      printf("about to run mv");
+        return hipImpl_->matvec(A, vec_x, vec_result, alpha, beta, matrixFormat);
     } else {
         out::error() << "Support for device " << memspace << " not implemented (yet)" << std::endl;
         return 1;
@@ -280,6 +308,8 @@ namespace ReSolve {
   {
     if (memspace == "cuda") { 
       return cudaImpl_->csc2csr(A_csc, A_csr);
+    } else if (memspace == "hip") {
+      return hipImpl_->csc2csr(A_csc, A_csr);
     } else if (memspace == "cpu") { 
       out::warning() << "Using untested csc2csr on CPU ..." << std::endl;
       return cpuImpl_->csc2csr(A_csc, A_csr);
diff --git a/resolve/matrix/MatrixHandler.hpp b/resolve/matrix/MatrixHandler.hpp
index 398a8039..cec61085 100644
--- a/resolve/matrix/MatrixHandler.hpp
+++ b/resolve/matrix/MatrixHandler.hpp
@@ -18,6 +18,7 @@ namespace ReSolve
   }
   class LinAlgWorkspaceCpu;
   class LinAlgWorkspaceCUDA;
+  class LinAlgWorkspaceHIP;
   class MatrixHandlerImpl;
 }
 
@@ -48,6 +49,7 @@ namespace ReSolve {
       MatrixHandler();
       MatrixHandler(LinAlgWorkspaceCpu* workspace);
       MatrixHandler(LinAlgWorkspaceCUDA* workspace);
+      MatrixHandler(LinAlgWorkspaceHIP* workspace);
       ~MatrixHandler();
 
       int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr, std::string memspace);
@@ -70,9 +72,11 @@ namespace ReSolve {
       MemoryHandler mem_;      ///< Device memory manager object
       MatrixHandlerImpl*  cpuImpl_{nullptr}; ///< Pointer to CPU implementation
       MatrixHandlerImpl* cudaImpl_{nullptr}; ///< Pointer to CUDA implementation
+      MatrixHandlerImpl* hipImpl_{nullptr}; ///< Pointer to HIP implementation
 
       bool isCpuEnabled_{false};  ///< true if CPU  implementation is instantiated
       bool isCudaEnabled_{false}; ///< true if CUDA implementation is instantiated
+      bool isHipEnabled_{false}; ///< true if HIP implementation is instantiated
   };
 
 } // namespace ReSolve
diff --git a/resolve/matrix/MatrixHandlerHip.cpp b/resolve/matrix/MatrixHandlerHip.cpp
new file mode 100644
index 00000000..370849fa
--- /dev/null
+++ b/resolve/matrix/MatrixHandlerHip.cpp
@@ -0,0 +1,154 @@
+#include <algorithm>
+
+#include <resolve/utilities/logger/Logger.hpp>
+#include <resolve/vector/Vector.hpp>
+#include <resolve/matrix/Coo.hpp>
+#include <resolve/matrix/Csc.hpp>
+#include <resolve/matrix/Csr.hpp>
+#include <resolve/workspace/LinAlgWorkspaceHIP.hpp>
+#include "MatrixHandlerHip.hpp"
+
+namespace ReSolve {
+  // Create a shortcut name for Logger static class
+  using out = io::Logger;
+
+  // Nothing is released by the destructor; in particular workspace_ is
+  // left untouched.
+  MatrixHandlerHip::~MatrixHandlerHip() = default;
+
+  MatrixHandlerHip::MatrixHandlerHip(LinAlgWorkspaceHIP* new_workspace)
+    : workspace_(new_workspace) // keep the pointer supplied by the caller
+  {
+  }
+
+  void MatrixHandlerHip::setValuesChanged(bool values_changed)
+  {
+    this->values_changed_ = values_changed; // plain setter for the member flag
+  }
+
+  // Computes vec_result = alpha*A*vec_x + beta*vec_result with rocSPARSE csrmv; only matrixFormat "csr" is handled (any other value logs an error and returns 1).
+  int MatrixHandlerHip::matvec(matrix::Sparse* Ageneric, 
+                               vector_type* vec_x, 
+                               vector_type* vec_result, 
+                               const real_type* alpha, 
+                               const real_type* beta,
+                               std::string matrixFormat) 
+  {
+    using namespace constants;
+    int error_sum = 0;
+    if (matrixFormat == "csr") {
+      matrix::Csr* A = dynamic_cast<matrix::Csr*>(Ageneric); // NOTE(review): cast result is not checked for nullptr
+      //result = alpha *A*x + beta * result
+      rocsparse_status status;
+      LinAlgWorkspaceHIP* workspaceHIP = workspace_;
+
+      rocsparse_handle handle_rocsparse = workspaceHIP->getRocsparseHandle();
+      // Local copies of the SpMV info/descriptor handles returned by the workspace getters.
+      rocsparse_mat_info infoA = workspaceHIP->getSpmvMatrixInfo();
+      rocsparse_mat_descr descrA =  workspaceHIP->getSpmvMatrixDescriptor();
+      
+      if (!workspaceHIP->matvecSetup()) {
+        //setup first, allocate, etc.
+        // One-time setup: create the descriptor and info objects, then run the csrmv analysis.
+        rocsparse_create_mat_descr(&(descrA));
+        rocsparse_set_mat_index_base(descrA, rocsparse_index_base_zero);
+        rocsparse_set_mat_type(descrA, rocsparse_matrix_type_general);
+
+        rocsparse_create_mat_info(&infoA);
+        
+        status = rocsparse_dcsrmv_analysis(handle_rocsparse,
+                                           rocsparse_operation_none,
+                                           A->getNumRows(),
+                                           A->getNumColumns(),
+                                           A->getNnzExpanded(), 
+                                           descrA,
+                                           A->getValues("cuda"), 
+                                           A->getRowData("cuda"),
+                                           A->getColData("cuda"), // cuda is used as "device"
+                                           infoA);
+        error_sum += status;
+        mem_.deviceSynchronize();
+        // NOTE(review): descrA/infoA were created in local copies and no call visible here stores them back in the workspace - confirm later calls see them.
+        workspaceHIP->matvecSetupDone();
+      } 
+      
+      status = rocsparse_dcsrmv(handle_rocsparse,
+                                rocsparse_operation_none,
+                                A->getNumRows(),
+                                A->getNumColumns(),
+                                A->getNnzExpanded(),
+                                alpha, 
+                                descrA,
+                                A->getValues("cuda"), 
+                                A->getRowData("cuda"),
+                                A->getColData("cuda"),
+                                infoA,
+                                vec_x->getData("cuda"),
+                                beta,
+                                vec_result->getData("cuda"));
+
+      error_sum += status;
+      mem_.deviceSynchronize();
+      if (status)
+        out::error() << "Matvec status: " << status 
+                      << "Last error code: " << mem_.getLastDeviceError() << std::endl;
+      vec_result->setDataUpdated("cuda");
+      // The return value is the sum of the rocSPARSE status codes collected above.
+      return error_sum;
+    } else {
+      out::error() << "MatVec not implemented (yet) for " 
+                   << matrixFormat << " matrix format." << std::endl;
+      return 1;
+    }
+  }
+
+  int MatrixHandlerHip::Matrix1Norm(matrix::Sparse* /* A */, real_type* /* norm */)
+  {
+    return -1; // stub: both arguments are ignored and -1 is always returned
+  }
+
+  // Converts A_csc into CSR storage in A_csr on the device with rocSPARSE csr2csc.
+  // Returns the sum of the rocSPARSE status codes of the two library calls.
+  int MatrixHandlerHip::csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr)
+  {
+    index_type error_sum = 0;
+    LinAlgWorkspaceHIP* workspaceHIP = workspace_;
+    rocsparse_status status;
+
+    A_csr->allocateMatrixData("cuda");
+    index_type n = A_csc->getNumRows();
+    index_type m = A_csc->getNumRows(); // NOTE(review): same as n - confirm this is intended for non-square input
+    index_type nnz = A_csc->getNnz();
+    size_t bufferSize;
+    void* d_work;
+
+    status = rocsparse_csr2csc_buffer_size(workspaceHIP->getRocsparseHandle(),
+                                           n,
+                                           m,
+                                           nnz,
+                                           A_csc->getColData("cuda"), 
+                                           A_csc->getRowData("cuda"), 
+                                           rocsparse_action_numeric,
+                                           &bufferSize);
+    error_sum += status;
+    mem_.allocateBufferOnDevice(&d_work, bufferSize);
+
+    status = rocsparse_dcsr2csc(workspaceHIP->getRocsparseHandle(),
+                                n,
+                                m,
+                                nnz,
+                                A_csc->getValues("cuda"), 
+                                A_csc->getColData("cuda"), 
+                                A_csc->getRowData("cuda"), 
+                                A_csr->getValues("cuda"), 
+                                A_csr->getRowData("cuda"),
+                                A_csr->getColData("cuda"), 
+                                rocsparse_action_numeric,
+                                rocsparse_index_base_zero,
+                                d_work);
+    error_sum += status;
+    mem_.deleteOnDevice(d_work); // release the scratch buffer before returning
+    return error_sum;
+  }
+
+} // namespace ReSolve
diff --git a/resolve/matrix/MatrixHandlerHip.hpp b/resolve/matrix/MatrixHandlerHip.hpp
new file mode 100644
index 00000000..7f06f3bd
--- /dev/null
+++ b/resolve/matrix/MatrixHandlerHip.hpp
@@ -0,0 +1,60 @@
+#pragma once
+#include <resolve/Common.hpp>
+#include <resolve/MemoryUtils.hpp>
+#include <resolve/matrix/MatrixHandlerImpl.hpp>
+
+namespace ReSolve
+{ 
+  namespace vector
+  {
+    class Vector;
+  }
+  namespace matrix
+  {
+    class Sparse;
+    class Coo;
+    class Csc;
+    class Csr;
+  }
+  class LinAlgWorkspaceHIP;
+}
+
+
+namespace ReSolve {
+  /**
+   * @class MatrixHandlerHip
+   * 
+   * @brief HIP implementation of the matrix handler.
+   */
+  class MatrixHandlerHip : public MatrixHandlerImpl
+  {
+    using vector_type = vector::Vector;
+    
+    public:
+      /// @param workspace HIP workspace; csc2csr reads the rocSPARSE handle from it.
+      MatrixHandlerHip(LinAlgWorkspaceHIP* workspace);
+      virtual ~MatrixHandlerHip();
+      /// Converts CSC matrix A_csc to CSR matrix A_csr; returns 0 if all rocSPARSE calls succeed.
+      int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr);
+      /// Sparse matrix-vector product; presumably vec_result = alpha*A*vec_x + beta*vec_result -- TODO confirm.
+      virtual int matvec(matrix::Sparse* A,
+                 vector_type* vec_x,
+                 vector_type* vec_result,
+                 const real_type* alpha,
+                 const real_type* beta,
+                 std::string matrix_type);
+      /// Presumably writes the 1-norm of A to *norm -- TODO confirm against the implementation.
+      virtual int Matrix1Norm(matrix::Sparse *A, real_type* norm);
+      /// Presumably updates values_changed_ -- TODO confirm against the implementation.
+      void setValuesChanged(bool isValuesChanged); 
+    
+    private: 
+      
+      LinAlgWorkspaceHIP* workspace_{nullptr};
+      bool values_changed_{true}; ///< needed for matvec
+
+      MemoryHandler mem_; ///< Device memory manager object
+  };
+
+} // namespace ReSolve
+
diff --git a/resolve/vector/CMakeLists.txt b/resolve/vector/CMakeLists.txt
index 16d53010..89b1abc8 100644
--- a/resolve/vector/CMakeLists.txt
+++ b/resolve/vector/CMakeLists.txt
@@ -18,6 +18,13 @@ set(Vector_CUDASDK_SRC
     VectorHandlerCuda.cpp
 )
 
+# Source files specific to the HIP (ROCm) backend
+
+set(Vector_ROCM_SRC 
+  VectorHandlerHip.cpp
+)
+
+
 # Header files to be installed
 set(Vector_HEADER_INSTALL
     Vector.hpp
@@ -30,6 +37,11 @@ if(RESOLVE_USE_CUDA)
   set(Vector_SRC ${Vector_SRC} ${Vector_CUDASDK_SRC})
 endif()
 
+# If HIP is enabled, add the ROCm vector handler sources
+if(RESOLVE_USE_HIP)
+  set(Vector_SRC ${Vector_SRC} ${Vector_ROCM_SRC})
+endif()
+
 add_library(resolve_vector SHARED ${Vector_SRC})
 target_link_libraries(resolve_vector PRIVATE resolve_logger)
 
@@ -38,6 +50,10 @@ if (RESOLVE_USE_CUDA)
   target_link_libraries(resolve_vector PUBLIC resolve_backend_cuda)
 endif()
 
+if (RESOLVE_USE_HIP)
+  target_link_libraries(resolve_vector PUBLIC resolve_backend_hip)
+endif()
+
 # If no GPU is enabled link to dummy device backend
 if(NOT RESOLVE_USE_GPU)
   target_link_libraries(resolve_vector PUBLIC resolve_backend_cpu)
diff --git a/resolve/vector/Vector.cpp b/resolve/vector/Vector.cpp
index 7934e8b0..37779ea5 100644
--- a/resolve/vector/Vector.cpp
+++ b/resolve/vector/Vector.cpp
@@ -60,7 +60,7 @@ namespace ReSolve { namespace vector {
       cpu_updated_ = true;
       gpu_updated_ = false;
     } else {
-      if (memspace == "cuda") { 
+      if ((memspace == "cuda") || (memspace == "hip")) {
         d_data_ = data;
         gpu_updated_ = true;
         cpu_updated_ = false;
@@ -76,7 +76,7 @@ namespace ReSolve { namespace vector {
       cpu_updated_ = true;
       gpu_updated_ = false;
     } else {
-      if (memspace == "cuda") { 
+      if ((memspace == "cuda") || (memspace == "hip")) {
         gpu_updated_ = true;
         cpu_updated_ = false;
       } else {
@@ -89,15 +89,15 @@ namespace ReSolve { namespace vector {
   {
     int control=-1;
     if ((memspaceIn == "cpu") && (memspaceOut == "cpu")){ control = 0;}
-    if ((memspaceIn == "cpu") && (memspaceOut == "cuda")){ control = 1;}
-    if ((memspaceIn == "cuda") && (memspaceOut == "cpu")){ control = 2;}
-    if ((memspaceIn == "cuda") && (memspaceOut == "cuda")){ control = 3;}
+    if ((memspaceIn == "cpu") && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 1;}
+    if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && (memspaceOut == "cpu")){ control = 2;}
+    if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 3;}
 
     if ((memspaceOut == "cpu") && (h_data_ == nullptr)){
       //allocate first
       h_data_ = new real_type[n_ * k_]; 
     }
-    if ((memspaceOut == "cuda") && (d_data_ == nullptr)){
+    if (((memspaceOut == "cuda") || (memspaceOut == "hip")) && (d_data_ == nullptr)){
       //allocate first
       mem_.allocateArrayOnDevice(&d_data_, n_ * k_);
     } 
@@ -109,19 +109,19 @@ namespace ReSolve { namespace vector {
         cpu_updated_ = true;
         gpu_updated_ = false;
         break;
-      case 2: //cuda->cpu
+      case 2: //gpu->cpu
         mem_.copyArrayDeviceToHost(h_data_, data, n_current_ * k_);
         owns_gpu_data_ = true;
         cpu_updated_ = true;
         gpu_updated_ = false;
         break;
-      case 1: //cpu->cuda
+      case 1: //cpu->gpu
         mem_.copyArrayHostToDevice(d_data_, data, n_current_ * k_);
         owns_gpu_data_ = true;
         gpu_updated_ = true;
         cpu_updated_ = false;
         break;
-      case 3: //cuda->cuda
+      case 3: //gpu->gpu
         mem_.copyArrayDeviceToDevice(d_data_, data, n_current_ * k_);
         owns_gpu_data_ = true;
         gpu_updated_ = true;
@@ -141,18 +141,18 @@ namespace ReSolve { namespace vector {
   real_type* Vector::getData(index_type i, std::string memspace)
   {
     if ((memspace == "cpu") && (cpu_updated_ == false) && (gpu_updated_ == true )) {
-      copyData("cuda", "cpu");
+      copyData("cuda", "cpu"); // memspace is "cpu" here, so name a device space as the source ("cuda" and "hip" are handled alike)
       owns_cpu_data_ = true;
     } 
 
-    if ((memspace == "cuda") && (gpu_updated_ == false) && (cpu_updated_ == true )) {
-      copyData("cpu", "cuda");
+    if (((memspace == "cuda") || (memspace == "hip")) && (gpu_updated_ == false) && (cpu_updated_ == true )) {
+      copyData("cpu", memspace);
       owns_gpu_data_ = true;
     }
     if (memspace == "cpu") {
       return &h_data_[i * n_current_];
     } else {
-      if (memspace == "cuda"){
+      if ((memspace == "cuda") || (memspace == "hip")){
         return &d_data_[i * n_current_];
       } else {
         return nullptr;
@@ -164,14 +164,14 @@ namespace ReSolve { namespace vector {
   int Vector::copyData(std::string memspaceIn, std::string memspaceOut)
   {
     int control=-1;
-    if ((memspaceIn == "cpu") && (memspaceOut == "cuda")){ control = 0;}
-    if ((memspaceIn == "cuda") && (memspaceOut == "cpu")){ control = 1;}
+    if ((memspaceIn == "cpu") && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 0;}
+    if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && (memspaceOut == "cpu")){ control = 1;}
 
     if ((memspaceOut == "cpu") && (h_data_ == nullptr)){
       //allocate first
       h_data_ = new real_type[n_ * k_]; 
     }
-    if ((memspaceOut == "cuda") && (d_data_ == nullptr)){
+    if (((memspaceOut == "cuda") || (memspaceOut == "hip")) && (d_data_ == nullptr)){
       //allocate first
       mem_.allocateArrayOnDevice(&d_data_, n_ * k_);
     } 
@@ -200,10 +200,12 @@ namespace ReSolve { namespace vector {
       h_data_ = new real_type[n_ * k_]; 
       owns_cpu_data_ = true;
     } else {
-      if (memspace == "cuda") {
+      if ((memspace == "cuda") || (memspace == "hip")) {
         mem_.deleteOnDevice(d_data_);
         mem_.allocateArrayOnDevice(&d_data_, n_ * k_);
         owns_gpu_data_ = true;
+      } else {
+        std::cout<<"wrong memspace " <<memspace<<" "<<std::endl;
       }
     }
   }
@@ -220,7 +222,7 @@ namespace ReSolve { namespace vector {
         h_data_[i] = 0.0;
       }
     } else {
-      if (memspace == "cuda") {
+      if ((memspace == "cuda") || (memspace == "hip")) {
         if (d_data_ == nullptr) {
           mem_.allocateArrayOnDevice(&d_data_, n_ * k_);
           owns_gpu_data_ = true;
@@ -241,7 +243,7 @@ namespace ReSolve { namespace vector {
         h_data_[i] = 0.0;
       }
     } else {
-      if (memspace == "cuda") {
+      if ((memspace == "cuda") || (memspace == "hip")) {
         if (d_data_ == nullptr) {
           mem_.allocateArrayOnDevice(&d_data_, n_ * k_);
           owns_gpu_data_ = true;
@@ -263,7 +265,7 @@ namespace ReSolve { namespace vector {
         h_data_[i] = C;
       }
     } else {
-      if (memspace == "cuda") {
+      if ((memspace == "cuda") || (memspace == "hip")) {
         if (d_data_ == nullptr) {
           mem_.allocateArrayOnDevice(&d_data_, n_ * k_);
           owns_gpu_data_ = true;
@@ -284,7 +286,7 @@ namespace ReSolve { namespace vector {
         h_data_[i] = C;
       }
     } else {
-      if (memspace == "cuda") {
+      if ((memspace == "cuda") || (memspace == "hip")) {
         if (d_data_ == nullptr) {
           mem_.allocateArrayOnDevice(&d_data_, n_ * k_);
           owns_gpu_data_ = true;
@@ -322,7 +324,7 @@ namespace ReSolve { namespace vector {
       if (memspaceOut == "cpu") {
         std::memcpy(dest, data, n_current_ * sizeof(real_type));
       } else {
-        if (memspaceOut == "cuda") { 
+      if ((memspaceOut == "cuda") || (memspaceOut == "hip")) {
           mem_.copyArrayDeviceToDevice(dest, data, n_current_);
         } else {
           //error
@@ -338,7 +340,7 @@ namespace ReSolve { namespace vector {
     if (memspaceOut == "cpu") {
       std::memcpy(dest, data, n_current_ * k_ * sizeof(real_type));
     } else {
-      if (memspaceOut == "cuda") { 
+      if ((memspaceOut == "cuda") || (memspaceOut == "hip")) {
         mem_.copyArrayDeviceToDevice(dest, data, n_current_ * k_);
       } else {
         //error
diff --git a/resolve/vector/VectorHandler.cpp b/resolve/vector/VectorHandler.cpp
index 8c89cb2f..f797f483 100644
--- a/resolve/vector/VectorHandler.cpp
+++ b/resolve/vector/VectorHandler.cpp
@@ -11,6 +11,9 @@
 #ifdef RESOLVE_USE_CUDA
 #include <resolve/vector/VectorHandlerCuda.hpp>
 #endif
+#ifdef RESOLVE_USE_HIP
+#include <resolve/vector/VectorHandlerHip.hpp>
+#endif
 
 namespace ReSolve {
   using out = io::Logger;
@@ -50,6 +53,21 @@ namespace ReSolve {
     isCpuEnabled_  = true;
   }
 #endif
+#ifdef RESOLVE_USE_HIP
+  /** 
+   * @brief Constructor used when ReSolve is built with HIP support.
+   * 
+   * @param new_workspace - HIP workspace passed on to the HIP implementation
+   */
+  VectorHandler::VectorHandler(LinAlgWorkspaceHIP* new_workspace)
+  {
+    // Device implementation first, then the CPU one that is always available.
+    hipImpl_      = new VectorHandlerHip(new_workspace);
+    isHipEnabled_ = true;
+    cpuImpl_      = new VectorHandlerCpu();
+    isCpuEnabled_ = true;
+  }
+#endif
 
   /** 
    * @brief destructor     
@@ -64,7 +82,7 @@ namespace ReSolve {
    * 
    * @param[in] x The first vector
    * @param[in] y The second vector
-   * @param[in] memspace String containg memspace (cpu or cuda)
+   * @param[in] memspace String containing memspace (cpu, cuda or hip)
    * 
    * @return dot product (real number) of _x_ and _y_
    */
@@ -74,7 +92,9 @@ namespace ReSolve {
     if (memspace == "cuda" ) {
       return cudaImpl_->dot(x, y);
     } else {
-      if (memspace == "cpu") {
+      if (memspace == "hip") { 
+        return hipImpl_->dot(x, y);
+      } else if (memspace == "cpu") {
         return cpuImpl_->dot(x, y);
       } else {
         out::error() << "Not implemented (yet)" << std::endl;
@@ -88,13 +108,15 @@ namespace ReSolve {
    * 
    * @param[in] alpha The constant
    * @param[in,out] x The vector
-   * @param memspace string containg memspace (cpu or cuda)
+   * @param memspace string containing memspace (cpu, cuda or hip)
    * 
    */
   void VectorHandler::scal(const real_type* alpha, vector::Vector* x, std::string memspace)
   {
     if (memspace == "cuda" ) {
       cudaImpl_->scal(alpha, x);
+    } else if (memspace == "hip") { 
+      hipImpl_->scal(alpha, x);
     } else {
       if (memspace == "cpu") {
         cpuImpl_->scal(alpha, x);
@@ -110,7 +132,7 @@ namespace ReSolve {
    * @param[in] alpha The constant
    * @param[in] x The first vector
    * @param[in,out] y The second vector (result is return in y)
-   * @param[in]  memspace String containg memspace (cpu or cuda)
+   * @param[in]  memspace String containing memspace (cpu, cuda or hip)
    * 
    */
   void VectorHandler::axpy(const  real_type* alpha, vector::Vector* x, vector::Vector* y, std::string memspace)
@@ -119,10 +141,14 @@ namespace ReSolve {
     if (memspace == "cuda" ) {
       cudaImpl_->axpy(alpha, x, y);
     } else {
-      if (memspace == "cpu") {
-        cpuImpl_->axpy(alpha, x, y);
+      if (memspace == "hip" ) {
+        hipImpl_->axpy(alpha, x, y);      
       } else {
-        out::error() <<"Not implemented (yet)" << std::endl;
+        if (memspace == "cpu") {
+          cpuImpl_->axpy(alpha, x, y);
+        } else {
+          out::error() <<"Not implemented (yet)" << std::endl;
+        }
       }
     }
   }
@@ -139,7 +165,7 @@ namespace ReSolve {
    * @param[in] V Multivector containing the matrix, organized columnwise
    * @param[in] y Vector, k x 1 if N and n x 1 if T
    * @param[in,out] x Vector, n x 1 if N and k x 1 if T
-   * @param[in] memspace  cpu or cuda (for now)
+   * @param[in] memspace  cpu or cuda or hip (for now)
    *
    * @pre   V is stored colum-wise, _n_ > 0, _k_ > 0
    * 
@@ -148,6 +174,8 @@ namespace ReSolve {
   {
     if (memspace == "cuda") {
       cudaImpl_->gemv(transpose, n, k, alpha, beta, V, y, x);
+    } else if (memspace == "hip") {
+      hipImpl_->gemv(transpose, n, k, alpha, beta, V, y, x);
     } else if (memspace == "cpu") {
       cpuImpl_->gemv(transpose, n, k, alpha, beta, V, y, x);
     } else {
@@ -162,7 +190,7 @@ namespace ReSolve {
    * @param[in] alpha vector size k x 1
    * @param[in] x (multi)vector size size x k
    * @param[in,out] y vector size size x 1 (this is where the result is stored)
-   * @param[in] memspace string containg memspace (cpu or cuda)
+   * @param[in] memspace string containing memspace (cpu, cuda or hip)
    *
    * @pre   _k_ > 0, _size_ > 0, _size_ = x->getSize()
    *
@@ -172,6 +200,8 @@ namespace ReSolve {
     using namespace constants;
     if (memspace == "cuda") {
       cudaImpl_->massAxpy(size, alpha, k, x, y);
+    } else if (memspace == "hip") {
+      hipImpl_->massAxpy(size, alpha, k, x, y);
     } else if (memspace == "cpu") {
       cpuImpl_->massAxpy(size, alpha, k, x, y);
     } else {
@@ -188,7 +218,7 @@ namespace ReSolve {
    * @param[in] k Number of vectors in V
    * @param[in] x Multivector; 2 vectors size n x 1 each
    * @param[out] res Multivector; 2 vectors size k x 1 each (result is returned in res)
-   * @param[in] memspace String containg memspace (cpu or cuda)
+   * @param[in] memspace String containing memspace (cpu, cuda or hip)
    *
    * @pre   _size_ > 0, _k_ > 0, size = x->getSize(), _res_ needs to be allocated
    *
@@ -197,6 +227,8 @@ namespace ReSolve {
   {
     if (memspace == "cuda") {
       cudaImpl_->massDot2Vec(size, V, k, x, res);
+    } else if (memspace == "hip") {
+      hipImpl_->massDot2Vec(size, V, k, x, res);
     } else if (memspace == "cpu") {
       cpuImpl_->massDot2Vec(size, V, k, x, res);
     } else {
diff --git a/resolve/vector/VectorHandler.hpp b/resolve/vector/VectorHandler.hpp
index c17d4688..02d426b5 100644
--- a/resolve/vector/VectorHandler.hpp
+++ b/resolve/vector/VectorHandler.hpp
@@ -10,6 +10,7 @@ namespace ReSolve
   class VectorHandlerImpl;
   class LinAlgWorkspaceCpu;
   class LinAlgWorkspaceCUDA;
+  class LinAlgWorkspaceHIP;
 }
 
 
@@ -19,6 +20,7 @@ namespace ReSolve { //namespace vector {
       VectorHandler();
       VectorHandler(LinAlgWorkspaceCpu* new_workspace);
       VectorHandler(LinAlgWorkspaceCUDA* new_workspace);
+      VectorHandler(LinAlgWorkspaceHIP* new_workspace);
       ~VectorHandler();
 
       //y = alpha x + y
@@ -55,9 +57,11 @@ namespace ReSolve { //namespace vector {
     private:
       VectorHandlerImpl*  cpuImpl_{nullptr};
       VectorHandlerImpl* cudaImpl_{nullptr};
+      VectorHandlerImpl*  hipImpl_{nullptr};
 
       bool isCpuEnabled_{false};
       bool isCudaEnabled_{false};
+      bool isHipEnabled_{false};
   };
 
 } //} // namespace ReSolve::vector
diff --git a/resolve/vector/VectorHandlerHip.cpp b/resolve/vector/VectorHandlerHip.cpp
new file mode 100644
index 00000000..9f2927c7
--- /dev/null
+++ b/resolve/vector/VectorHandlerHip.cpp
@@ -0,0 +1,236 @@
+#include <iostream>
+
+#include <resolve/utilities/logger/Logger.hpp>
+#include <resolve/hip/hipKernels.h>
+#include <resolve/vector/Vector.hpp>
+#include <resolve/workspace/LinAlgWorkspace.hpp>
+#include <resolve/vector/VectorHandlerImpl.hpp>
+#include "VectorHandlerHip.hpp"
+
+namespace ReSolve {
+  using out = io::Logger;
+
+  /** 
+   * @brief default constructor; the handler starts without a workspace (nullptr)
+   */
+  VectorHandlerHip::VectorHandlerHip()
+    : workspace_(nullptr)
+  {
+  }
+
+  /** 
+   * @brief constructor
+   * @param new_workspace - workspace to be set     
+   */
+  VectorHandlerHip::VectorHandlerHip(LinAlgWorkspaceHIP* new_workspace)
+    : workspace_(new_workspace)
+  {
+  }
+
+  /** 
+   * @brief destructor     
+   */
+  VectorHandlerHip::~VectorHandlerHip()
+  {
+    //delete the workspace TODO
+  }
+
+  /** 
+   * @brief dot product of two vectors i.e, a = x^Ty
+   * 
+   * @param[in] x The first vector
+   * @param[in] y The second vector
+   * 
+   * @return dot product (real number) of _x_ and _y_
+   * @note Both vectors are accessed through their "hip" data; a rocBLAS failure is logged.
+   */
+  real_type VectorHandlerHip::dot(vector::Vector* x, vector::Vector* y)
+  { 
+    rocblas_handle handle_rocblas = workspace_->getRocblasHandle();
+    double nrm = 0.0;
+    rocblas_status st = rocblas_ddot(handle_rocblas, x->getSize(), x->getData("hip"), 1, y->getData("hip"), 1, &nrm);
+    if (st != rocblas_status_success) {
+      out::error() << "dot product crashed with code " << st << "\n";
+    }
+    return nrm;
+  }
+
+  /** 
+   * @brief Scales a vector in place by a constant, i.e. x = alpha*x
+   * 
+   * @param[in] alpha Pointer to the scaling constant
+   * @param[in,out] x The vector to scale (its "hip" data is used)
+   * 
+   * @note A nonzero rocBLAS status is reported through the logger.
+   */
+  void VectorHandlerHip::scal(const real_type* alpha, vector::Vector* x)
+  {
+    const rocblas_status st = rocblas_dscal(workspace_->getRocblasHandle(),
+                                            x->getSize(), alpha,
+                                            x->getData("hip"), 1);
+    if (st != rocblas_status_success) {
+      ReSolve::io::Logger::error() << "scal crashed with code " << st << "\n";
+    }
+  }
+
+  /** 
+   * @brief axpy i.e, y = alpha*x+y where alpha is a constant
+   * 
+   * @param[in] alpha The constant
+   * @param[in] x The first vector
+   * @param[in,out] y The second vector (result is returned in y)
+   * 
+   * @note Both vectors are accessed through their "hip" data; a rocBLAS failure is logged.
+   */
+  void VectorHandlerHip::axpy(const  real_type* alpha, vector::Vector* x, vector::Vector* y)
+  {
+    //AXPY:  y = alpha * x + y
+    rocblas_handle handle_rocblas = workspace_->getRocblasHandle();
+    rocblas_status st = rocblas_daxpy(handle_rocblas,
+                                      x->getSize(),
+                                      alpha,
+                                      x->getData("hip"), 1,
+                                      y->getData("hip"), 1);
+    if (st != rocblas_status_success) {
+      out::error() << "axpy crashed with code " << st << "\n";
+    }
+  }
+
+  /** 
+   * @brief gemv computes matrix-vector product where both matrix and vectors are dense.
+   *
+   * If `transpose` is "T", computes `x = beta*x + alpha*V^T*y`,
+   * where `x` is `[k x 1]`, `V` is `[n x k]` and `y` is `[n x 1]`.
+   * For any other value, computes `x = beta*x + alpha*V*y`,
+   * where `x` is `[n x 1]`, `V` is `[n x k]` and `y` is `[k x 1]`.
+   *
+   * @param[in] transpose - "T" for the transposed product, anything else for no transpose
+   * @param[in] n Number of rows in (non-transposed) matrix
+   * @param[in] k Number of columns in (non-transposed) matrix
+   * @param[in] alpha Constant real number
+   * @param[in] beta Constant real number
+   * @param[in] V Multivector containing the matrix, organized columnwise
+   * @param[in] y Vector, k x 1 if N and n x 1 if T
+   * @param[in,out] x Vector, n x 1 if N and k x 1 if T
+   *
+   * @pre   V is stored column-wise, _n_ > 0, _k_ > 0
+   *
+   * @note All operands are accessed through their "hip" data; a rocBLAS
+   *       failure is logged.
+   */  
+  void VectorHandlerHip::gemv(std::string transpose,
+                              index_type n,
+                              index_type k,
+                              const real_type* alpha,
+                              const real_type* beta,
+                              vector::Vector* V,
+                              vector::Vector* y,
+                              vector::Vector* x)
+  {
+    LinAlgWorkspaceHIP* workspaceHIP = workspace_;
+    rocblas_handle handle_rocblas = workspaceHIP->getRocblasHandle();
+
+    // The two variants differ only in the operation flag, so select the flag
+    // once and issue a single rocBLAS call.
+    rocblas_operation operation = rocblas_operation_none;
+    if (transpose == "T") {
+      operation = rocblas_operation_transpose;
+    }
+
+    rocblas_status st = rocblas_dgemv(handle_rocblas,
+                                      operation,
+                                      n,
+                                      k,
+                                      alpha,
+                                      V->getData("hip"),
+                                      n,   // lda
+                                      y->getData("hip"),
+                                      1,
+                                      beta,
+                                      x->getData("hip"),
+                                      1);
+
+    if (st != rocblas_status_success) {
+      out::error() << "gemv crashed with code " << st << "\n";
+    }
+  }
+
+  /** 
+   * @brief mass (bulk) axpy i.e, y = y - x*alpha where alpha is a vector
+   * 
+   * @param[in] size number of elements in y
+   * @param[in] alpha vector size k x 1
+   * @param[in] x (multi)vector size size x k
+   * @param[in,out] y vector size size x 1 (this is where the result is stored)
+   * @param[in] k number of columns in x and of entries in alpha
+   *
+   * @pre   _k_ > 0, _size_ > 0, _size_ = x->getSize()
+   *
+   */
+  void VectorHandlerHip::massAxpy(index_type size, vector::Vector* alpha, index_type k, vector::Vector* x, vector::Vector* y)
+  {
+    using namespace constants;
+    if (k < 200) { // k below 200: call mass_axpy; otherwise use rocBLAS dgemm below
+      mass_axpy(size, k, x->getData("hip"), y->getData("hip"),alpha->getData("hip"));
+    } else {
+      LinAlgWorkspaceHIP* workspaceHIP = workspace_;
+      rocblas_handle handle_rocblas =  workspaceHIP->getRocblasHandle();
+      rocblas_dgemm(handle_rocblas,
+                    rocblas_operation_none,
+                    rocblas_operation_none,
+                    size,       // m
+                    1,          // n
+                    k,      // k
+                    &MINUSONE, // alpha (-1 gives the subtraction in y = y - x*alpha)
+                    x->getData("hip"), // A
+                    size,       // lda
+                    alpha->getData("hip"), // B
+                    k,      // ldb
+                    &ONE,      // beta (keeps the existing y)
+                    y->getData("hip"),          // c
+                    size);      // ldc
+    }
+  }
+
+  /** 
+   * @brief mass (bulk) dot product i.e,  V^T x, where V is n x k dense multivector
+   * (a dense multivector consisting of k vectors size n) and x is n x 2 dense
+   * multivector (a multivector consisting of two vectors size n each)
+   * 
+   * @param[in] size Number of elements in a single vector in V
+   * @param[in] V Multivector; k vectors size n x 1 each
+   * @param[in] k Number of vectors in V
+   * @param[in] x Multivector; 2 vectors size n x 1 each
+   * @param[out] res Multivector; 2 vectors size k x 1 each (result is returned in res)
+   * @note All operands are accessed through their "hip" data.
+   *
+   * @pre   _size_ > 0, _k_ > 0, size = x->getSize(), _res_ needs to be allocated
+   *
+   */
+  void VectorHandlerHip::massDot2Vec(index_type size, vector::Vector* V, index_type k, vector::Vector* x, vector::Vector* res)
+  {
+    using namespace constants;
+
+    if (k < 200) { // k below 200: call mass_inner_product_two_vectors; otherwise use rocBLAS dgemm below
+      mass_inner_product_two_vectors(size, k, x->getData("hip") , x->getData(1, "hip"), V->getData("hip"), res->getData("hip"));
+    } else {
+      LinAlgWorkspaceHIP* workspaceHIP = workspace_;
+      rocblas_handle handle_rocblas =  workspaceHIP->getRocblasHandle();
+      rocblas_dgemm(handle_rocblas,
+                    rocblas_operation_transpose,
+                    rocblas_operation_none,
+                    k + 1,   //m  NOTE(review): k + 1 although V is documented as k vectors -- confirm
+                    2,       //n
+                    size,    //k
+                    &ONE,   //alpha
+                    V->getData("hip"),       //A
+                    size,    //lda
+                    x->getData("hip"),       //B
+                    size,    //ldb
+                    &ZERO,   //beta (res is overwritten)
+                    res->getData("hip"),     //c
+                    k + 1);  //ldc 
+    }
+  }
+
+} // namespace ReSolve
diff --git a/resolve/vector/VectorHandlerHip.hpp b/resolve/vector/VectorHandlerHip.hpp
new file mode 100644
index 00000000..7e5085e3
--- /dev/null
+++ b/resolve/vector/VectorHandlerHip.hpp
@@ -0,0 +1,57 @@
+#pragma once
+#include <string>
+#include <resolve/Common.hpp>
+#include <resolve/vector/VectorHandlerImpl.hpp>
+
+namespace ReSolve
+{ 
+  namespace vector
+  {
+    class Vector;
+  }
+  class LinAlgWorkspaceHIP;
+}
+
+namespace ReSolve { //namespace vector {
+  /// HIP implementation of the vector handler (dense vector operations through rocBLAS).
+  class VectorHandlerHip : public VectorHandlerImpl
+  { 
+    public:
+      VectorHandlerHip();
+      VectorHandlerHip(LinAlgWorkspaceHIP* workspace);
+      virtual ~VectorHandlerHip();
+
+      //y = alpha x + y
+      virtual void axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y);
+
+      //dot: x \cdot y
+      virtual real_type dot(vector::Vector* x, vector::Vector* y);
+
+      //scal: x = alpha * x
+      virtual void scal(const real_type* alpha, vector::Vector* x);
+
+      //mass axpy: y = y - x*alpha where x is [n x k] and alpha is [k x 1]; x is stored columnwise
+      virtual void massAxpy(index_type size, vector::Vector* alpha, index_type k, vector::Vector* x, vector::Vector* y);
+
+      //mass dot: V^T x, where V is [n x k] and x is [n x 2] (size = n); stored and returned columnwise
+      virtual void massDot2Vec(index_type size, vector::Vector* V, index_type k, vector::Vector* x, vector::Vector* res);
+
+      /** gemv:
+       * if `transpose = N` (no), `x = beta*x +  alpha*V*y`,
+       * where `x` is `[n x 1]`, `V` is `[n x k]` and `y` is `[k x 1]`.
+       * if `transpose = T` (yes), `x = beta*x + alpha*V^T*y`,
+       * where `x` is `[k x 1]`, `V` is `[n x k]` and `y` is `[n x 1]`.
+       */ 
+      virtual void gemv(std::string transpose,
+                        index_type n,
+                        index_type k,
+                        const real_type* alpha,
+                        const real_type* beta,
+                        vector::Vector* V,
+                        vector::Vector* y,
+                        vector::Vector* x);
+    private:
+      LinAlgWorkspaceHIP* workspace_{nullptr}; ///< not set by the default constructor
+  };
+
+} //} // namespace ReSolve::vector
diff --git a/resolve/workspace/CMakeLists.txt b/resolve/workspace/CMakeLists.txt
index 673fac4b..a34c2191 100644
--- a/resolve/workspace/CMakeLists.txt
+++ b/resolve/workspace/CMakeLists.txt
@@ -16,10 +16,15 @@ set(ReSolve_Workspace_CUDASDK_SRC
     LinAlgWorkspaceCUDA.cpp
 )
 
+set(ReSolve_Workspace_ROCM_SRC
+  LinAlgWorkspaceHIP.cpp
+)
+
 set(ReSolve_Workspace_HEADER_INSTALL
   LinAlgWorkspace.hpp
   LinAlgWorkspaceCpu.hpp
   LinAlgWorkspaceCUDA.hpp
+  LinAlgWorkspaceHIP.hpp
 )
 
 # If cuda is enabled, add CUDA SDK workspace files
@@ -27,6 +32,10 @@ if(RESOLVE_USE_CUDA)
   set(ReSolve_Workspace_SRC ${ReSolve_Workspace_SRC} ${ReSolve_Workspace_CUDASDK_SRC})
 endif()
 
+if(RESOLVE_USE_HIP)
+  set(ReSolve_Workspace_SRC ${ReSolve_Workspace_SRC} ${ReSolve_Workspace_ROCM_SRC})
+endif()
+
 add_library(resolve_workspace SHARED ${ReSolve_Workspace_SRC})
 
 # If CUDA is enabled, link to ReSolve CUDA backend
@@ -34,6 +43,10 @@ if(RESOLVE_USE_CUDA)
   target_link_libraries(resolve_workspace PUBLIC resolve_backend_cuda)
 endif(RESOLVE_USE_CUDA)  
 
+if(RESOLVE_USE_HIP)
+  target_link_libraries(resolve_workspace PUBLIC resolve_backend_hip)
+endif(RESOLVE_USE_HIP)  
+
 target_include_directories(resolve_workspace INTERFACE
     $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}>
     $<INSTALL_INTERFACE:include>
diff --git a/resolve/workspace/LinAlgWorkspace.hpp b/resolve/workspace/LinAlgWorkspace.hpp
index 6da58fda..4efe834e 100644
--- a/resolve/workspace/LinAlgWorkspace.hpp
+++ b/resolve/workspace/LinAlgWorkspace.hpp
@@ -6,3 +6,7 @@
 #include <resolve/workspace/LinAlgWorkspaceCUDA.hpp>
 #endif
 
+#ifdef RESOLVE_USE_HIP
+#include <resolve/workspace/LinAlgWorkspaceHIP.hpp>
+#endif
+
diff --git a/resolve/workspace/LinAlgWorkspaceHIP.cpp b/resolve/workspace/LinAlgWorkspaceHIP.cpp
new file mode 100644
index 00000000..e64dff17
--- /dev/null
+++ b/resolve/workspace/LinAlgWorkspaceHIP.cpp
@@ -0,0 +1,75 @@
+#include <resolve/workspace/LinAlgWorkspaceHIP.hpp>
+
+namespace ReSolve
+{
+  LinAlgWorkspaceHIP::LinAlgWorkspaceHIP()
+    : handle_rocblas_(nullptr), handle_rocsparse_(nullptr), mat_A_(nullptr),
+      matvec_setup_done_(false), info_A_(nullptr)
+  {
+    // Every handle and descriptor starts null so the destructor can skip unset ones.
+  }
+
+  LinAlgWorkspaceHIP::~LinAlgWorkspaceHIP()
+  {
+    // NOTE(review): info_A_ is not released here; confirm who owns it.
+    if (handle_rocsparse_ != nullptr) { rocsparse_destroy_handle(handle_rocsparse_); }
+    if (handle_rocblas_ != nullptr)   { rocblas_destroy_handle(handle_rocblas_); }
+    if (mat_A_ != nullptr)            { rocsparse_destroy_mat_descr(mat_A_); }
+  }
+
+  rocsparse_handle LinAlgWorkspaceHIP::getRocsparseHandle()
+  {
+    return handle_rocsparse_;
+  }
+
+  void LinAlgWorkspaceHIP::setRocsparseHandle(rocsparse_handle handle)
+  {
+    handle_rocsparse_ = handle;
+  }
+
+  rocblas_handle LinAlgWorkspaceHIP::getRocblasHandle()
+  {
+    return handle_rocblas_;
+  }
+
+  void LinAlgWorkspaceHIP::setRocblasHandle(rocblas_handle handle)
+  {
+    handle_rocblas_ = handle;
+  }
+
+  rocsparse_mat_descr LinAlgWorkspaceHIP::getSpmvMatrixDescriptor()
+  {
+    return mat_A_;
+  }
+
+  void LinAlgWorkspaceHIP::setSpmvMatrixDescriptor(rocsparse_mat_descr mat)
+  {
+    mat_A_ = mat;
+  }
+
+  rocsparse_mat_info  LinAlgWorkspaceHIP::getSpmvMatrixInfo()
+  {
+    return info_A_;
+  }
+
+  void LinAlgWorkspaceHIP::setSpmvMatrixInfo(rocsparse_mat_info  info)
+  {
+    info_A_ = info;
+  }
+
+  bool LinAlgWorkspaceHIP::matvecSetup()
+  {
+    return matvec_setup_done_;
+  }
+
+  void LinAlgWorkspaceHIP::matvecSetupDone()
+  {
+    matvec_setup_done_ = true;
+  }
+
+  void LinAlgWorkspaceHIP::initializeHandles()
+  {
+    rocsparse_create_handle(&handle_rocsparse_); // rocSPARSE handle
+    rocblas_create_handle(&handle_rocblas_);     // rocBLAS handle
+  }
+                            } // namespace ReSolve
diff --git a/resolve/workspace/LinAlgWorkspaceHIP.hpp b/resolve/workspace/LinAlgWorkspaceHIP.hpp
new file mode 100644
index 00000000..fbb55349
--- /dev/null
+++ b/resolve/workspace/LinAlgWorkspaceHIP.hpp
@@ -0,0 +1,52 @@
+#pragma once
+
+#include <rocsparse/rocsparse.h>
+#include <rocblas/rocblas.h>
+#include <hip/hip_runtime.h>
+
+#include <resolve/MemoryUtils.hpp>
+
+namespace ReSolve
+{
+  class LinAlgWorkspaceHIP // stores the rocBLAS/rocSPARSE handles and SpMV state listed below
+  {
+    public:
+      LinAlgWorkspaceHIP();
+      ~LinAlgWorkspaceHIP();
+      // Accessors return the stored handle, descriptor or info object as is.
+      rocblas_handle getRocblasHandle();
+      rocsparse_handle getRocsparseHandle();      
+      rocsparse_mat_descr getSpmvMatrixDescriptor();
+      rocsparse_mat_info getSpmvMatrixInfo();
+      // Setters overwrite the stored value; the previous value is not destroyed by them.
+      void setRocblasHandle(rocblas_handle handle);
+      void setRocsparseHandle(rocsparse_handle handle);
+      void setSpmvMatrixDescriptor(rocsparse_mat_descr mat);
+      void setSpmvMatrixInfo(rocsparse_mat_info info);
+      // Creates the rocSPARSE and rocBLAS handles.
+      void initializeHandles();
+      // matvecSetup() reports whether matvecSetupDone() has been called.
+      bool matvecSetup();
+      void matvecSetupDone();
+
+    private:
+      //handles (created in initializeHandles or supplied through the setters)
+      rocblas_handle handle_rocblas_;
+      rocsparse_handle handle_rocsparse_;
+
+      //matrix descriptor returned by getSpmvMatrixDescriptor()
+      rocsparse_mat_descr  mat_A_; 
+
+      //vector descriptors not needed, rocsparse uses RAW pointers.
+
+      //buffers
+      // there is no buffer needed in matvec
+      bool matvec_setup_done_; //check if setup is done for matvec (note: no buffer but there is analysis)
+
+      //matrix info object returned by getSpmvMatrixInfo()
+      rocsparse_mat_info  info_A_;
+
+      MemoryHandler mem_;
+  };
+
+} // namespace ReSolve
diff --git a/tests/unit/matrix/CMakeLists.txt b/tests/unit/matrix/CMakeLists.txt
index 8906c2c6..8476f181 100644
--- a/tests/unit/matrix/CMakeLists.txt
+++ b/tests/unit/matrix/CMakeLists.txt
@@ -20,4 +20,4 @@ install(TARGETS ${installable_tests}
         RUNTIME DESTINATION bin/resolve/tests/unit)
 
 add_test(NAME matrix_test         COMMAND $<TARGET_FILE:runMatrixIoTests.exe>)
-add_test(NAME matrix_handler_test COMMAND $<TARGET_FILE:runMatrixHandlerTests.exe>)
\ No newline at end of file
+add_test(NAME matrix_handler_test COMMAND $<TARGET_FILE:runMatrixHandlerTests.exe>)
diff --git a/tests/unit/matrix/MatrixHandlerTests.hpp b/tests/unit/matrix/MatrixHandlerTests.hpp
index e203017a..d7fe8449 100644
--- a/tests/unit/matrix/MatrixHandlerTests.hpp
+++ b/tests/unit/matrix/MatrixHandlerTests.hpp
@@ -49,6 +49,7 @@ class MatrixHandlerTests : TestBase
     vector::Vector x(N);
     vector::Vector y(N);
     x.allocate(memspace_);
+    if (x.getData(memspace_) == NULL) printf("oups we have an issue \n");
     y.allocate(memspace_);
 
     x.setToConst(1.0, memspace_);
@@ -80,6 +81,12 @@ class MatrixHandlerTests : TestBase
       LinAlgWorkspaceCUDA* workspace = new LinAlgWorkspaceCUDA();
       workspace->initializeHandles();
       return new MatrixHandler(workspace);
+#endif
+#ifdef RESOLVE_USE_HIP
+    } else if (memspace_ == "hip") {
+      LinAlgWorkspaceHIP* workspace = new LinAlgWorkspaceHIP();
+      workspace->initializeHandles();
+      return new MatrixHandler(workspace);
 #endif
     } else {
       std::cout << "ReSolve not built with support for memory space " << memspace_ << "\n";
@@ -152,7 +159,7 @@ class MatrixHandlerTests : TestBase
     A->setUpdated("cpu");
     // std::cout << rowptr[i] << "\n";
 
-    if (memspace == "cuda") {
+    if ((memspace == "cuda") || (memspace == "hip")) {
       A->copyData(memspace);
     }
 
diff --git a/tests/unit/matrix/runMatrixHandlerTests.cpp b/tests/unit/matrix/runMatrixHandlerTests.cpp
index 6eee90d5..26ad70b0 100644
--- a/tests/unit/matrix/runMatrixHandlerTests.cpp
+++ b/tests/unit/matrix/runMatrixHandlerTests.cpp
@@ -33,5 +33,17 @@ int main(int, char**)
   }
 #endif
 
+#ifdef RESOLVE_USE_HIP
+  {
+    std::cout << "Running tests with HIP backend:\n";
+    ReSolve::tests::MatrixHandlerTests test("hip");
+
+    result += test.matrixHandlerConstructor();
+    result += test.matrixOneNorm();
+    result += test.matVec(50);
+
+    std::cout << "\n";
+  }
+#endif
   return result.summary();
 }
diff --git a/tests/unit/vector/VectorHandlerTests.hpp b/tests/unit/vector/VectorHandlerTests.hpp
index d2f8c73c..60020ec5 100644
--- a/tests/unit/vector/VectorHandlerTests.hpp
+++ b/tests/unit/vector/VectorHandlerTests.hpp
@@ -1,6 +1,7 @@
 #pragma once
 #include <string>
 #include <vector>
+#include <iomanip>
 #include <sstream>
 #include <iterator>
 #include <algorithm>
@@ -141,13 +142,13 @@ namespace ReSolve {
             }
             x->setToConst(ii, c, memspace_);
           }
+
           index_type r = K % 2;
           real_type res = (real_type) ((floor((real_type) K / 2.0) + r) * 1.0 + floor((real_type) K / 2.0) * (-0.5));
 
           handler->massAxpy(N, alpha, K, x, y, memspace_);
           status *= verifyAnswer(y, 2.0 - res, memspace_);
-
-
+         
           delete handler;
           delete x;
           delete y;
@@ -229,6 +230,12 @@ namespace ReSolve {
             LinAlgWorkspaceCUDA* workspace = new LinAlgWorkspaceCUDA();
             workspace->initializeHandles();
             return new VectorHandler(workspace);
+#endif
+#ifdef RESOLVE_USE_HIP
+          } else if (memspace_ == "hip") {
+            LinAlgWorkspaceHIP* workspace = new LinAlgWorkspaceHIP();
+            workspace->initializeHandles();
+            return new VectorHandler(workspace);
 #endif
           } else {
             std::cout << "ReSolve not built with support for memory space " << memspace_ << "\n";
@@ -247,6 +254,7 @@ namespace ReSolve {
           for (index_type i = 0; i < x->getSize(); ++i) {
             // std::cout << x->getData("cpu")[i] << "\n";
             if (!isEqual(x->getData("cpu")[i], answer)) {
+              std::cout << std::setprecision(16);
               status = false;
               std::cout << "Solution vector element x[" << i << "] = " << x->getData("cpu")[i]
                 << ", expected: " << answer << "\n";
diff --git a/tests/unit/vector/runVectorHandlerTests.cpp b/tests/unit/vector/runVectorHandlerTests.cpp
index 77e99471..9bb543a5 100644
--- a/tests/unit/vector/runVectorHandlerTests.cpp
+++ b/tests/unit/vector/runVectorHandlerTests.cpp
@@ -37,5 +37,22 @@ int main(int, char**)
   }
 #endif
 
+#ifdef RESOLVE_USE_HIP
+  {
+    std::cout << "Running tests with HIP backend:\n";
+    ReSolve::tests::VectorHandlerTests test("hip");
+
+    result += test.dot(5000);
+    result += test.axpy(5000);
+    result += test.scal(5000);
+    result += test.gemv(5000, 10);
+    result += test.massAxpy(100, 10);
+    result += test.massAxpy(1000, 300);
+    result += test.massDot(100, 10);
+    result += test.massDot(1000, 30);
+
+    std::cout << "\n";
+  }
+#endif
   return result.summary();
 }

From 7ea6515adbd6b282e502598165a5ff043e02257f Mon Sep 17 00:00:00 2001
From: pelesh <peless@ornl.gov>
Date: Mon, 30 Oct 2023 20:26:10 -0400
Subject: [PATCH 03/12] Fix warnings in HIP branch. (#41)

* Fix warnings in HIP branch.
---
 resolve/LinSolverDirectKLU.cpp           |  6 ++++--
 resolve/MemoryUtils.hpp                  |  9 +++++++++
 resolve/matrix/Coo.cpp                   |  6 +++---
 resolve/matrix/Csc.cpp                   |  6 +++---
 resolve/matrix/Csr.cpp                   |  6 +++---
 resolve/matrix/MatrixHandlerCpu.hpp      |  2 +-
 resolve/matrix/Sparse.cpp                |  2 +-
 resolve/utilities/logger/Logger.cpp      |  2 +-
 resolve/vector/Vector.cpp                |  6 +++---
 resolve/workspace/LinAlgWorkspaceCpu.cpp |  1 +
 resolve/workspace/LinAlgWorkspaceCpu.hpp |  2 +-
 resolve/workspace/LinAlgWorkspaceHIP.hpp |  2 +-
 tests/unit/matrix/MatrixHandlerTests.hpp | 18 +++++++++---------
 13 files changed, 40 insertions(+), 28 deletions(-)

diff --git a/resolve/LinSolverDirectKLU.cpp b/resolve/LinSolverDirectKLU.cpp
index b3f670c4..43b612b3 100644
--- a/resolve/LinSolverDirectKLU.cpp
+++ b/resolve/LinSolverDirectKLU.cpp
@@ -157,7 +157,8 @@ namespace ReSolve
   {
     if (Numeric_ != nullptr){
       P_ = new index_type[A_->getNumRows()];
-      std::memcpy(P_, Numeric_->Pnum, A_->getNumRows() * sizeof(index_type));
+      size_t nrows = static_cast<size_t>(A_->getNumRows());
+      std::memcpy(P_, Numeric_->Pnum, nrows * sizeof(index_type));
       return P_;
     } else {
       return nullptr;
@@ -169,7 +170,8 @@ namespace ReSolve
   {
     if (Numeric_ != nullptr){
       Q_ = new index_type[A_->getNumRows()];
-      std::memcpy(Q_, Symbolic_->Q, A_->getNumRows() * sizeof(index_type));
+      size_t nrows = static_cast<size_t>(A_->getNumRows());
+      std::memcpy(Q_, Symbolic_->Q, nrows * sizeof(index_type));
       return Q_;
     } else {
       return nullptr;
diff --git a/resolve/MemoryUtils.hpp b/resolve/MemoryUtils.hpp
index 976279d9..5e2da403 100644
--- a/resolve/MemoryUtils.hpp
+++ b/resolve/MemoryUtils.hpp
@@ -44,6 +44,15 @@ namespace ReSolve
       
       template <typename I, typename T>
       int copyArrayHostToDevice(T* dst, const T* src, I n);
+
+      /// Copy `n` elements of type T from host array `src` to host array `dst`; implemented here as it is always needed
+      template <typename I, typename T>
+      int copyArrayHostToHost(T* dst, const T* src, I n)
+      {
+        size_t nelements = static_cast<size_t>(n); // explicit conversion of the index type; a negative n would wrap to a huge count
+        memcpy(dst, src, nelements * sizeof(T));   // raw byte copy; source and destination must not overlap
+        return 0;                                  // unconditionally reports success
+      }
   };
 
 } // namespace ReSolve
diff --git a/resolve/matrix/Coo.cpp b/resolve/matrix/Coo.cpp
index a91f94a9..eeff7b86 100644
--- a/resolve/matrix/Coo.cpp
+++ b/resolve/matrix/Coo.cpp
@@ -113,9 +113,9 @@ namespace ReSolve
 
     switch(control)  {
       case 0: //cpu->cpu
-        std::memcpy(h_row_data_, row_data, (nnz_current) * sizeof(index_type));
-        std::memcpy(h_col_data_, col_data, (nnz_current) * sizeof(index_type));
-        std::memcpy(h_val_data_, val_data, (nnz_current) * sizeof(real_type));
+        mem_.copyArrayHostToHost(h_row_data_, row_data, nnz_current);
+        mem_.copyArrayHostToHost(h_col_data_, col_data, nnz_current);
+        mem_.copyArrayHostToHost(h_val_data_, val_data, nnz_current);
         h_data_updated_ = true;
         owns_cpu_data_ = true;
         owns_cpu_vals_ = true;
diff --git a/resolve/matrix/Csc.cpp b/resolve/matrix/Csc.cpp
index e2ea765f..f6358df3 100644
--- a/resolve/matrix/Csc.cpp
+++ b/resolve/matrix/Csc.cpp
@@ -109,9 +109,9 @@ namespace ReSolve
 
     switch(control)  {
       case 0: //cpu->cpu
-        std::memcpy(h_col_data_, col_data, (n_ + 1) * sizeof(index_type));
-        std::memcpy(h_row_data_, row_data, (nnz_current) * sizeof(index_type));
-        std::memcpy(h_val_data_, val_data, (nnz_current) * sizeof(real_type));
+        mem_.copyArrayHostToHost(h_col_data_, col_data,      n_ + 1);
+        mem_.copyArrayHostToHost(h_row_data_, row_data, nnz_current);
+        mem_.copyArrayHostToHost(h_val_data_, val_data, nnz_current);
         h_data_updated_ = true;
         owns_cpu_data_ = true;
         owns_cpu_vals_ = true;
diff --git a/resolve/matrix/Csr.cpp b/resolve/matrix/Csr.cpp
index dff33b48..04e8dff1 100644
--- a/resolve/matrix/Csr.cpp
+++ b/resolve/matrix/Csr.cpp
@@ -111,9 +111,9 @@ namespace ReSolve
     //copy	
     switch(control)  {
       case 0: //cpu->cpu
-        std::memcpy(h_row_data_, row_data, (n_ + 1) * sizeof(index_type));
-        std::memcpy(h_col_data_, col_data, (nnz_current) * sizeof(index_type));
-        std::memcpy(h_val_data_, val_data, (nnz_current) * sizeof(real_type));
+        mem_.copyArrayHostToHost(h_row_data_, row_data,      n_ + 1);
+        mem_.copyArrayHostToHost(h_col_data_, col_data, nnz_current);
+        mem_.copyArrayHostToHost(h_val_data_, val_data, nnz_current);
         h_data_updated_ = true;
         owns_cpu_data_ = true;
         owns_cpu_vals_ = true;
diff --git a/resolve/matrix/MatrixHandlerCpu.hpp b/resolve/matrix/MatrixHandlerCpu.hpp
index 0b0afbd3..b6e66066 100644
--- a/resolve/matrix/MatrixHandlerCpu.hpp
+++ b/resolve/matrix/MatrixHandlerCpu.hpp
@@ -50,7 +50,7 @@ namespace ReSolve {
       LinAlgWorkspaceCpu* workspace_{nullptr};
       bool values_changed_{true}; ///< needed for matvec
 
-      MemoryHandler mem_; ///< Device memory manager object
+      // MemoryHandler mem_; ///< Device memory manager object not used for now
   };
 
 } // namespace ReSolve
diff --git a/resolve/matrix/Sparse.cpp b/resolve/matrix/Sparse.cpp
index 5c866386..4a16ec98 100644
--- a/resolve/matrix/Sparse.cpp
+++ b/resolve/matrix/Sparse.cpp
@@ -228,7 +228,7 @@ namespace ReSolve { namespace matrix {
 
     switch(control)  {
       case 0: //cpu->cpu
-        std::memcpy(h_val_data_, new_vals, (nnz_current) * sizeof(real_type));
+        mem_.copyArrayHostToHost(h_val_data_, new_vals, nnz_current);
         h_data_updated_ = true;
         owns_cpu_vals_ = true;
         break;
diff --git a/resolve/utilities/logger/Logger.cpp b/resolve/utilities/logger/Logger.cpp
index f2448179..7369978f 100644
--- a/resolve/utilities/logger/Logger.cpp
+++ b/resolve/utilities/logger/Logger.cpp
@@ -59,7 +59,7 @@ namespace ReSolve
      */
     void Logger::updateVerbosity(std::vector<std::ostream*>& output_streams)
     {
-      for (int i = NONE; i <= EVERYTHING; ++i)
+      for (std::size_t i = NONE; i <= EVERYTHING; ++i)
       {
         output_streams[i] = i > verbosity_ ? &nullstream_ : logger_;
       }
diff --git a/resolve/vector/Vector.cpp b/resolve/vector/Vector.cpp
index 37779ea5..df3c475d 100644
--- a/resolve/vector/Vector.cpp
+++ b/resolve/vector/Vector.cpp
@@ -104,7 +104,7 @@ namespace ReSolve { namespace vector {
 
     switch(control)  {
       case 0: //cpu->cpu
-        std::memcpy(h_data_, data, (n_current_ * k_) * sizeof(real_type));
+        mem_.copyArrayHostToHost(h_data_, data, n_current_ * k_);
         owns_cpu_data_ = true;
         cpu_updated_ = true;
         gpu_updated_ = false;
@@ -322,7 +322,7 @@ namespace ReSolve { namespace vector {
     } else {
       real_type* data = this->getData(i, memspaceOut);
       if (memspaceOut == "cpu") {
-        std::memcpy(dest, data, n_current_ * sizeof(real_type));
+        mem_.copyArrayHostToHost(dest, data, n_current_);
       } else {
       if ((memspaceOut == "cuda") || (memspaceOut == "hip")) {
           mem_.copyArrayDeviceToDevice(dest, data, n_current_);
@@ -338,7 +338,7 @@ namespace ReSolve { namespace vector {
   {
     real_type* data = this->getData(memspaceOut);
     if (memspaceOut == "cpu") {
-      std::memcpy(dest, data, n_current_ * k_ * sizeof(real_type));
+      mem_.copyArrayHostToHost(dest, data, n_current_ * k_);
     } else {
       if ((memspaceOut == "cuda") || (memspaceOut == "hip")) {
         mem_.copyArrayDeviceToDevice(dest, data, n_current_ * k_);
diff --git a/resolve/workspace/LinAlgWorkspaceCpu.cpp b/resolve/workspace/LinAlgWorkspaceCpu.cpp
index 3ed9aa43..c0f25248 100644
--- a/resolve/workspace/LinAlgWorkspaceCpu.cpp
+++ b/resolve/workspace/LinAlgWorkspaceCpu.cpp
@@ -1,3 +1,4 @@
+#include <cstddef>
 #include "LinAlgWorkspaceCpu.hpp"
 
 namespace ReSolve
diff --git a/resolve/workspace/LinAlgWorkspaceCpu.hpp b/resolve/workspace/LinAlgWorkspaceCpu.hpp
index 00e5f38e..3c056b73 100644
--- a/resolve/workspace/LinAlgWorkspaceCpu.hpp
+++ b/resolve/workspace/LinAlgWorkspaceCpu.hpp
@@ -12,7 +12,7 @@ namespace ReSolve
       ~LinAlgWorkspaceCpu();
       void initializeHandles();
     private:
-      MemoryHandler mem_;
+      // MemoryHandler mem_; ///< Memory handler not needed for now
   };
 
 }
diff --git a/resolve/workspace/LinAlgWorkspaceHIP.hpp b/resolve/workspace/LinAlgWorkspaceHIP.hpp
index fbb55349..abdc3e41 100644
--- a/resolve/workspace/LinAlgWorkspaceHIP.hpp
+++ b/resolve/workspace/LinAlgWorkspaceHIP.hpp
@@ -46,7 +46,7 @@ namespace ReSolve
       //info - but we need info
       rocsparse_mat_info  info_A_;
 
-      MemoryHandler mem_;
+      // MemoryHandler mem_; ///< Memory handler not needed for now
   };
 
 } // namespace ReSolve
diff --git a/tests/unit/matrix/MatrixHandlerTests.hpp b/tests/unit/matrix/MatrixHandlerTests.hpp
index d7fe8449..0bcfe544 100644
--- a/tests/unit/matrix/MatrixHandlerTests.hpp
+++ b/tests/unit/matrix/MatrixHandlerTests.hpp
@@ -125,13 +125,15 @@ class MatrixHandlerTests : TestBase
 
     // std::cout << N << "\n";
 
+    // First compute number of nonzeros
     index_type NNZ = 0;
     for (index_type i = 0; i < N; ++i)
     {
-      NNZ += static_cast<index_type>(data[i%5].size());
+      size_t reminder = static_cast<size_t>(i%5);
+      NNZ += static_cast<index_type>(data[reminder].size());
     }
-    // std::cout << NNZ << "\n";
 
+    // Allocate NxN CSR matrix with NNZ nonzeros
     matrix::Csr* A = new matrix::Csr(N, N, NNZ);
     A->allocateMatrixData("cpu");
 
@@ -139,25 +141,23 @@ class MatrixHandlerTests : TestBase
     index_type* colidx = A->getColData("cpu");
     real_type* val     = A->getValues("cpu"); 
 
+    // Populate CSR matrix using same row pattern as for NNZ calculation
     rowptr[0] = 0;
-    index_type i = 0;
-    for (i=0; i < N; ++i)
+    for (index_type i=0; i < N; ++i)
     {
-      const std::vector<real_type>& row_sample = data[i%5];
+      size_t reminder = static_cast<size_t>(i%5);
+      const std::vector<real_type>& row_sample = data[reminder];
       index_type nnz_per_row = static_cast<index_type>(row_sample.size());
-      // std::cout << nnz_per_row << "\n";
 
       rowptr[i+1] = rowptr[i] + nnz_per_row;
       for (index_type j = rowptr[i]; j < rowptr[i+1]; ++j)
       {
         colidx[j] = (j - rowptr[i]) * N/nnz_per_row + (N%(N/nnz_per_row));
         // evenly distribute nonzeros ^^^^             ^^^^^^^^ perturb offset
-        val[j] = row_sample[j - rowptr[i]];
-        // std::cout << i << " " << colidx[j] << "  " << val[j] << "\n";
+        val[j] = row_sample[static_cast<size_t>(j - rowptr[i])];
       }
     }
     A->setUpdated("cpu");
-    // std::cout << rowptr[i] << "\n";
 
     if ((memspace == "cuda") || (memspace == "hip")) {
       A->copyData(memspace);

From 949680f3b320f8cb4541d30148f7b3c7f670f7c2 Mon Sep 17 00:00:00 2001
From: pelesh <peless@ornl.gov>
Date: Tue, 31 Oct 2023 20:18:27 -0400
Subject: [PATCH 04/12] Use enums instead of strings to denote memory space
 (#42)

* Use enums for memory space ID in matrix classes.

* Use enums for vector class memory space IDs.
---
 examples/r_KLU_GLU.cpp                        |  12 +-
 examples/r_KLU_GLU_matrix_values_update.cpp   |  14 +-
 examples/r_KLU_KLU.cpp                        |   8 +-
 examples/r_KLU_KLU_standalone.cpp             |   6 +-
 examples/r_KLU_rf.cpp                         |   8 +-
 examples/r_KLU_rf_FGMRES.cpp                  |  18 +-
 .../r_KLU_rf_FGMRES_reuse_factorization.cpp   |  24 +-
 resolve/GramSchmidt.cpp                       |  72 +++---
 resolve/LinSolverDirectCuSolverGLU.cpp        |  44 ++--
 resolve/LinSolverDirectCuSolverRf.cpp         |  32 +--
 resolve/LinSolverDirectKLU.cpp                |  52 ++--
 resolve/LinSolverIterativeFGMRES.cpp          |  18 +-
 resolve/MemoryUtils.hpp                       |  10 +
 resolve/matrix/Coo.cpp                        | 159 ++++++------
 resolve/matrix/Coo.hpp                        |  14 +-
 resolve/matrix/Csc.cpp                        | 159 ++++++------
 resolve/matrix/Csc.hpp                        |  14 +-
 resolve/matrix/Csr.cpp                        | 161 ++++++------
 resolve/matrix/Csr.hpp                        |  14 +-
 resolve/matrix/MatrixHandler.cpp              |  12 +-
 resolve/matrix/MatrixHandlerCpu.cpp           |  24 +-
 resolve/matrix/MatrixHandlerCuda.cpp          |  38 +--
 resolve/matrix/MatrixHandlerHip.cpp           |  36 +--
 resolve/matrix/Sparse.cpp                     | 109 ++++----
 resolve/matrix/Sparse.hpp                     |  24 +-
 resolve/matrix/io.cpp                         |  10 +-
 resolve/vector/Vector.cpp                     | 239 +++++++++---------
 resolve/vector/Vector.hpp                     |  28 +-
 resolve/vector/VectorHandlerCpu.cpp           |  10 +-
 resolve/vector/VectorHandlerCuda.cpp          |  36 +--
 resolve/vector/VectorHandlerHip.cpp           |  36 +--
 tests/functionality/testKLU.cpp               |  24 +-
 tests/functionality/testKLU_GLU.cpp           |  28 +-
 tests/functionality/testKLU_Rf.cpp            |  22 +-
 tests/functionality/testKLU_Rf_FGMRES.cpp     |  28 +-
 tests/unit/matrix/MatrixHandlerTests.hpp      |  35 +--
 tests/unit/matrix/MatrixIoTests.hpp           |   8 +-
 tests/unit/vector/GramSchmidtTests.hpp        |  32 ++-
 tests/unit/vector/VectorHandlerTests.hpp      | 107 +++++---
 39 files changed, 898 insertions(+), 827 deletions(-)

diff --git a/examples/r_KLU_GLU.cpp b/examples/r_KLU_GLU.cpp
index e7b19f4e..9f271254 100644
--- a/examples/r_KLU_GLU.cpp
+++ b/examples/r_KLU_GLU.cpp
@@ -93,8 +93,8 @@ int main(int argc, char *argv[])
       x = new real_type[A->getNumRows()];
       vec_rhs = new vector_type(A->getNumRows());
       vec_x = new vector_type(A->getNumRows());
-      vec_x->allocate("cpu");//for KLU
-      vec_x->allocate("cuda");
+      vec_x->allocate(ReSolve::memory::HOST);//for KLU
+      vec_x->allocate(ReSolve::memory::DEVICE);
       vec_r = new vector_type(A->getNumRows());
     } else {
       ReSolve::io::readAndUpdateMatrix(mat_file, A_coo);
@@ -107,11 +107,11 @@ int main(int argc, char *argv[])
     //Now convert to CSR.
     if (i < 1) { 
       matrix_handler->coo2csr(A_coo, A,  "cpu");
-      vec_rhs->update(rhs, "cpu", "cpu");
-      vec_rhs->setDataUpdated("cpu");
+      vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST);
+      vec_rhs->setDataUpdated(ReSolve::memory::HOST);
     } else { 
       matrix_handler->coo2csr(A_coo, A, "cuda");
-      vec_rhs->update(rhs, "cpu", "cuda");
+      vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
     }
     std::cout<<"COO to CSR completed. Expanded NNZ: "<< A->getNnzExpanded()<<std::endl;
     //Now call direct solver
@@ -143,7 +143,7 @@ int main(int argc, char *argv[])
       status = GLU->solve(vec_rhs, vec_x);
       std::cout<<"CUSOLVER GLU solve status: "<<status<<std::endl;      
     }
-    vec_r->update(rhs, "cpu", "cuda");
+    vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
 
 
     matrix_handler->setValuesChanged(true, "cuda");
diff --git a/examples/r_KLU_GLU_matrix_values_update.cpp b/examples/r_KLU_GLU_matrix_values_update.cpp
index ee99f0a0..ded685ac 100644
--- a/examples/r_KLU_GLU_matrix_values_update.cpp
+++ b/examples/r_KLU_GLU_matrix_values_update.cpp
@@ -96,8 +96,8 @@ int main(int argc, char *argv[])
       x = new real_type[A->getNumRows()];
       vec_rhs = new vector_type(A->getNumRows());
       vec_x = new vector_type(A->getNumRows());
-      vec_x->allocate("cpu");//for KLU
-      vec_x->allocate("cuda");
+      vec_x->allocate(ReSolve::memory::HOST);//for KLU
+      vec_x->allocate(ReSolve::memory::DEVICE);
       vec_r = new vector_type(A->getNumRows());
     } else {
       if (i==1) {
@@ -106,7 +106,7 @@ int main(int argc, char *argv[])
         ReSolve::io::readAndUpdateMatrix(mat_file, A_exp_coo);
       }
       std::cout<<"Updating values of A_coo!"<<std::endl; 
-      A_coo->updateValues(A_exp_coo->getValues("cpu"), "cpu", "cpu");
+      A_coo->updateValues(A_exp_coo->getValues(ReSolve::memory::HOST), ReSolve::memory::HOST, ReSolve::memory::HOST);
       //ReSolve::io::readAndUpdateMatrix(mat_file, A_coo);
       ReSolve::io::readAndUpdateRhs(rhs_file, &rhs);
     }
@@ -117,11 +117,11 @@ int main(int argc, char *argv[])
       //Now convert to CSR.
       if (i < 1) { 
         matrix_handler->coo2csr(A_coo, A,  "cpu");
-        vec_rhs->update(rhs, "cpu", "cpu");
-        vec_rhs->setDataUpdated("cpu");
+        vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST);
+        vec_rhs->setDataUpdated(ReSolve::memory::HOST);
       } else { 
         matrix_handler->coo2csr(A_coo, A, "cuda");
-        vec_rhs->update(rhs, "cpu", "cuda");
+        vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
       }
       std::cout<<"COO to CSR completed. Expanded NNZ: "<< A->getNnzExpanded()<<std::endl;
       //Now call direct solver
@@ -153,7 +153,7 @@ int main(int argc, char *argv[])
         status = GLU->solve(vec_rhs, vec_x);
         std::cout<<"CUSOLVER GLU solve status: "<<status<<std::endl;      
       }
-      vec_r->update(rhs, "cpu", "cuda");
+      vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
 
 
       matrix_handler->setValuesChanged(true, "cuda");
diff --git a/examples/r_KLU_KLU.cpp b/examples/r_KLU_KLU.cpp
index b9328e8a..901e36a5 100644
--- a/examples/r_KLU_KLU.cpp
+++ b/examples/r_KLU_KLU.cpp
@@ -108,11 +108,11 @@ int main(int argc, char *argv[])
     //Now convert to CSR.
     if (i < 2) { 
       matrix_handler->coo2csr(A_coo, A, "cpu");
-      vec_rhs->update(rhs, "cpu", "cpu");
-      vec_rhs->setDataUpdated("cpu");
+      vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST);
+      vec_rhs->setDataUpdated(ReSolve::memory::HOST);
     } else { 
       matrix_handler->coo2csr(A_coo, A, "cpu");
-      vec_rhs->update(rhs, "cpu", "cpu");
+      vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST);
     }
     std::cout<<"COO to CSR completed. Expanded NNZ: "<< A->getNnzExpanded()<<std::endl;
     //Now call direct solver
@@ -134,7 +134,7 @@ int main(int argc, char *argv[])
       status = KLU->solve(vec_rhs, vec_x);
       std::cout<<"KLU solve status: "<<status<<std::endl;      
     }
-    vec_r->update(rhs, "cpu", "cpu");
+    vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST);
 
     matrix_handler->setValuesChanged(true, "cpu");
 
diff --git a/examples/r_KLU_KLU_standalone.cpp b/examples/r_KLU_KLU_standalone.cpp
index 0b8f6114..3dfaf716 100644
--- a/examples/r_KLU_KLU_standalone.cpp
+++ b/examples/r_KLU_KLU_standalone.cpp
@@ -83,8 +83,8 @@ int main(int argc, char *argv[])
 
   //Now convert to CSR.
   matrix_handler->coo2csr(A_coo, A, "cpu");
-  vec_rhs->update(rhs, "cpu", "cpu");
-  vec_rhs->setDataUpdated("cpu");
+  vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST);
+  vec_rhs->setDataUpdated(ReSolve::memory::HOST);
   std::cout << "COO to CSR completed. Expanded NNZ: " << A->getNnzExpanded() << std::endl;
   //Now call direct solver
   KLU->setupParameters(1, 0.1, false);
@@ -96,7 +96,7 @@ int main(int argc, char *argv[])
   std::cout << "KLU factorization status: " << status << std::endl;
   status = KLU->solve(vec_rhs, vec_x);
   std::cout << "KLU solve status: " << status << std::endl;      
-  vec_r->update(rhs, "cpu", "cpu");
+  vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST);
 
   matrix_handler->setValuesChanged(true, "cpu");
 
diff --git a/examples/r_KLU_rf.cpp b/examples/r_KLU_rf.cpp
index 7369af18..d9310773 100644
--- a/examples/r_KLU_rf.cpp
+++ b/examples/r_KLU_rf.cpp
@@ -107,11 +107,11 @@ int main(int argc, char *argv[] )
     //Now convert to CSR.
     if (i < 2) { 
       matrix_handler->coo2csr(A_coo, A, "cpu");
-      vec_rhs->update(rhs, "cpu", "cpu");
-      vec_rhs->setDataUpdated("cpu");
+      vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST);
+      vec_rhs->setDataUpdated(ReSolve::memory::HOST);
     } else { 
       matrix_handler->coo2csr(A_coo, A, "cuda");
-      vec_rhs->update(rhs, "cpu", "cuda");
+      vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
     }
     std::cout<<"COO to CSR completed. Expanded NNZ: "<< A->getNnzExpanded()<<std::endl;
     //Now call direct solver
@@ -157,7 +157,7 @@ int main(int argc, char *argv[] )
       //status = KLU->solve(vec_rhs, vec_x);
       //std::cout<<"KLU solve status: "<<status<<std::endl;      
     }
-    vec_r->update(rhs, "cpu", "cuda");
+    vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
 
     matrix_handler->setValuesChanged(true, "cuda");
 
diff --git a/examples/r_KLU_rf_FGMRES.cpp b/examples/r_KLU_rf_FGMRES.cpp
index 07839cbb..6df5419a 100644
--- a/examples/r_KLU_rf_FGMRES.cpp
+++ b/examples/r_KLU_rf_FGMRES.cpp
@@ -96,8 +96,8 @@ int main(int argc, char *argv[])
       x = new real_type[A->getNumRows()];
       vec_rhs = new vector_type(A->getNumRows());
       vec_x = new vector_type(A->getNumRows());
-      vec_x->allocate("cpu");//for KLU
-      vec_x->allocate("cuda");
+      vec_x->allocate(ReSolve::memory::HOST);//for KLU
+      vec_x->allocate(ReSolve::memory::DEVICE);
       vec_r = new vector_type(A->getNumRows());
     }
     else {
@@ -111,11 +111,11 @@ int main(int argc, char *argv[])
     //Now convert to CSR.
     if (i < 2) { 
       matrix_handler->coo2csr(A_coo, A, "cpu");
-      vec_rhs->update(rhs, "cpu", "cpu");
-      vec_rhs->setDataUpdated("cpu");
+      vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST);
+      vec_rhs->setDataUpdated(ReSolve::memory::HOST);
     } else { 
       matrix_handler->coo2csr(A_coo,A, "cuda");
-      vec_rhs->update(rhs, "cpu", "cuda");
+      vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
     }
     std::cout<<"COO to CSR completed. Expanded NNZ: "<< A->getNnzExpanded()<<std::endl;
     //Now call direct solver
@@ -133,7 +133,7 @@ int main(int argc, char *argv[])
       std::cout<<"KLU factorization status: "<<status<<std::endl;
       status = KLU->solve(vec_rhs, vec_x);
       std::cout<<"KLU solve status: "<<status<<std::endl;      
-      vec_r->update(rhs, "cpu", "cuda");
+      vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
       norm_b = vector_handler->dot(vec_r, vec_r, "cuda");
       norm_b = sqrt(norm_b);
       matrix_handler->setValuesChanged(true, "cuda");
@@ -162,8 +162,8 @@ int main(int argc, char *argv[])
       status = Rf->solve(vec_rhs, vec_x);
       std::cout<<"CUSOLVER RF solve status: "<<status<<std::endl;      
 
-      vec_r->update(rhs, "cpu", "cuda");
-       norm_b = vector_handler->dot(vec_r, vec_r, "cuda");
+      vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+      norm_b = vector_handler->dot(vec_r, vec_r, "cuda");
       norm_b = sqrt(norm_b);
 
       //matrix_handler->setValuesChanged(true, "cuda");
@@ -176,7 +176,7 @@ int main(int argc, char *argv[])
                 << std::scientific << std::setprecision(16) 
                 << sqrt(vector_handler->dot(vec_r, vec_r, "cuda"))/norm_b << "\n";
 
-      vec_rhs->update(rhs, "cpu", "cuda");
+      vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
       FGMRES->solve(vec_rhs, vec_x);
 
       std::cout << "FGMRES: init nrm: " 
diff --git a/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp b/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp
index 56ab43fe..5ead8186 100644
--- a/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp
+++ b/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp
@@ -98,8 +98,8 @@ int main(int argc, char *argv[])
       x = new real_type[A->getNumRows()];
       vec_rhs = new vector_type(A->getNumRows());
       vec_x = new vector_type(A->getNumRows());
-      vec_x->allocate("cpu");//for KLU
-      vec_x->allocate("cuda");
+      vec_x->allocate(ReSolve::memory::HOST);//for KLU
+      vec_x->allocate(ReSolve::memory::DEVICE);
       vec_r = new vector_type(A->getNumRows());
     }
     else {
@@ -113,11 +113,11 @@ int main(int argc, char *argv[])
     //Now convert to CSR.
     if (i < 2) { 
       matrix_handler->coo2csr(A_coo,A, "cpu");
-      vec_rhs->update(rhs, "cpu", "cpu");
-      vec_rhs->setDataUpdated("cpu");
+      vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST);
+      vec_rhs->setDataUpdated(ReSolve::memory::HOST);
     } else { 
       matrix_handler->coo2csr(A_coo, A, "cuda");
-      vec_rhs->update(rhs, "cpu", "cuda");
+      vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
     }
     std::cout<<"COO to CSR completed. Expanded NNZ: "<< A->getNnzExpanded()<<std::endl;
     //Now call direct solver
@@ -135,7 +135,7 @@ int main(int argc, char *argv[])
       std::cout<<"KLU factorization status: "<<status<<std::endl;
       status = KLU->solve(vec_rhs, vec_x);
       std::cout<<"KLU solve status: "<<status<<std::endl;      
-      vec_r->update(rhs, "cpu", "cuda");
+      vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
       norm_b = vector_handler->dot(vec_r, vec_r, "cuda");
       norm_b = sqrt(norm_b);
       matrix_handler->setValuesChanged(true, "cuda");
@@ -171,20 +171,20 @@ int main(int argc, char *argv[])
         status = Rf->refactorize();
         std::cout << "CUSOLVER RF, using REAL refactorization, refactorization status: "
                   << status << std::endl;    
-        vec_rhs->update(rhs, "cpu", "cuda");
+        vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
         status = Rf->solve(vec_rhs, vec_x);
         FGMRES->setupPreconditioner("CuSolverRf", Rf);
       }
-      //if (i%2!=0)  vec_x->setToZero("cuda");
+      //if (i%2!=0)  vec_x->setToZero(ReSolve::memory::DEVICE);
       real_type norm_x =  vector_handler->dot(vec_x, vec_x, "cuda");
       std::cout << "Norm of x (before solve): " 
                 << std::scientific << std::setprecision(16) 
                 << sqrt(norm_x) << "\n";
       std::cout<<"CUSOLVER RF solve status: "<<status<<std::endl;      
       
-      vec_rhs->update(rhs, "cpu", "cuda");
-      vec_r->update(rhs, "cpu", "cuda");
-       norm_b = vector_handler->dot(vec_r, vec_r, "cuda");
+      vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+      vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+      norm_b = vector_handler->dot(vec_r, vec_r, "cuda");
       norm_b = sqrt(norm_b);
 
       matrix_handler->setValuesChanged(true, "cuda");
@@ -199,7 +199,7 @@ int main(int argc, char *argv[])
                 << std::scientific << std::setprecision(16) 
                 << norm_b << "\n";
 
-      vec_rhs->update(rhs, "cpu", "cuda");
+      vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
       FGMRES->solve(vec_rhs, vec_x);
 
       std::cout << "FGMRES: init nrm: " 
diff --git a/resolve/GramSchmidt.cpp b/resolve/GramSchmidt.cpp
index b6a27b04..fb86fc8d 100644
--- a/resolve/GramSchmidt.cpp
+++ b/resolve/GramSchmidt.cpp
@@ -36,10 +36,10 @@ namespace ReSolve
         delete h_L_;    
         delete h_rv_;    
 
-        vec_rv_->setData(nullptr, "cuda");
-        vec_rv_->setData(nullptr, "cpu");
-        vec_Hcolumn_->setData(nullptr, "cuda");
-        vec_Hcolumn_->setData(nullptr, "cpu");
+        vec_rv_->setData(nullptr, memory::DEVICE);
+        vec_rv_->setData(nullptr, memory::HOST);
+        vec_Hcolumn_->setData(nullptr, memory::DEVICE);
+        vec_Hcolumn_->setData(nullptr, memory::HOST);
 
         delete [] vec_rv_;    
         delete [] vec_Hcolumn_;;    
@@ -47,18 +47,18 @@ namespace ReSolve
 
       if(variant_ == cgs2) {
         delete h_aux_;
-        vec_Hcolumn_->setData(nullptr, "cuda");
-        //        vec_Hcolumn_->setData(nullptr, "cpu");
+        vec_Hcolumn_->setData(nullptr, memory::DEVICE);
+        //        vec_Hcolumn_->setData(nullptr, memory::HOST);
         delete [] vec_Hcolumn_;    
       }    
       if(variant_ == mgs_pm) {
         delete h_aux_;
       }
 
-      vec_v_->setData(nullptr, "cuda");
-      vec_v_->setData(nullptr, "cpu");
-      vec_w_->setData(nullptr, "cuda");
-      vec_w_->setData(nullptr, "cpu");
+      vec_v_->setData(nullptr, memory::DEVICE);
+      vec_v_->setData(nullptr, memory::HOST);
+      vec_w_->setData(nullptr, memory::DEVICE);
+      vec_w_->setData(nullptr, memory::HOST);
 
       delete [] vec_w_;
       delete [] vec_v_;   
@@ -103,15 +103,15 @@ namespace ReSolve
         h_rv_ = new real_type[num_vecs_ + 1];
 
         vec_rv_ = new vector_type(num_vecs_ + 1, 2);
-        vec_rv_->allocate("cuda");      
+        vec_rv_->allocate(memory::DEVICE);      
 
         vec_Hcolumn_ = new vector_type(num_vecs_ + 1);
-        vec_Hcolumn_->allocate("cuda");      
+        vec_Hcolumn_->allocate(memory::DEVICE);      
       }
       if(variant_ == cgs2) {
         h_aux_ = new real_type[num_vecs_ + 1];
         vec_Hcolumn_ = new vector_type(num_vecs_ + 1);
-        vec_Hcolumn_->allocate("cuda");      
+        vec_Hcolumn_->allocate(memory::DEVICE);      
       }
 
       if(variant_ == mgs_pm) {
@@ -135,10 +135,10 @@ namespace ReSolve
       switch (variant_){
         case mgs: 
 
-          vec_w_->setData(V->getVectorData(i + 1, "cuda"), "cuda");
+          vec_w_->setData(V->getVectorData(i + 1, memory::DEVICE), memory::DEVICE);
           for(int j = 0; j <= i; ++j) {
             t = 0.0;
-            vec_v_->setData( V->getVectorData(j, "cuda"), "cuda");
+            vec_v_->setData( V->getVectorData(j, memory::DEVICE), memory::DEVICE);
             t = vector_handler_->dot(vec_v_, vec_w_, "cuda");  
             H[ idxmap(i, j, num_vecs_ + 1) ] = t; 
             t *= -1.0;
@@ -159,26 +159,26 @@ namespace ReSolve
           break;
         case cgs2:
 
-          vec_v_->setData(V->getVectorData(i + 1, "cuda"), "cuda");
-          vector_handler_->gemv("T", n, i + 1, &ONE, &ZERO, V,  vec_v_, vec_Hcolumn_,"cuda");
+          vec_v_->setData(V->getVectorData(i + 1, memory::DEVICE), memory::DEVICE);
+          vector_handler_->gemv("T", n, i + 1, &ONE, &ZERO, V,  vec_v_, vec_Hcolumn_, "cuda");
 
           // V(:,i+1) = V(:, i+1) -  V(:,1:i)*Hcol
           vector_handler_->gemv("N", n, i + 1, &ONE, &MINUSONE, V, vec_Hcolumn_, vec_v_, "cuda" );  
 
           // copy H_col to aux, we will need it later
-          vec_Hcolumn_->setDataUpdated("cuda");
+          vec_Hcolumn_->setDataUpdated(memory::DEVICE);
           vec_Hcolumn_->setCurrentSize(i + 1);
-          vec_Hcolumn_->deepCopyVectorData(h_aux_, 0, "cpu");
+          vec_Hcolumn_->deepCopyVectorData(h_aux_, 0, memory::HOST);
 
           //Hcol = V(:,1:i)^T*V(:,i+1);
-          vector_handler_->gemv("T", n, i + 1, &ONE, &ZERO, V,  vec_v_, vec_Hcolumn_,"cuda");
+          vector_handler_->gemv("T", n, i + 1, &ONE, &ZERO, V,  vec_v_, vec_Hcolumn_, "cuda");
 
           // V(:,i+1) = V(:, i+1) -  V(:,1:i)*Hcol
           vector_handler_->gemv("N", n, i + 1, &ONE, &MINUSONE, V, vec_Hcolumn_, vec_v_, "cuda" );  
 
           // copy H_col to H
-          vec_Hcolumn_->setDataUpdated("cuda");
-          vec_Hcolumn_->deepCopyVectorData(&H[ idxmap(i, 0, num_vecs_ + 1)], 0, "cpu");
+          vec_Hcolumn_->setDataUpdated(memory::DEVICE);
+          vec_Hcolumn_->deepCopyVectorData(&H[ idxmap(i, 0, num_vecs_ + 1)], 0, memory::HOST);
 
           // add both pieces together (unstable otherwise, careful here!!)
           t = 0.0;
@@ -201,16 +201,16 @@ namespace ReSolve
           break;
         case mgs_two_synch:
           // V[1:i]^T[V[i] w]
-          vec_v_->setData(V->getVectorData(i, "cuda"), "cuda");
-          vec_w_->setData(V->getVectorData(i + 1, "cuda"), "cuda");
+          vec_v_->setData(V->getVectorData(i, memory::DEVICE), memory::DEVICE);
+          vec_w_->setData(V->getVectorData(i + 1, memory::DEVICE), memory::DEVICE);
           vec_rv_->setCurrentSize(i + 1);
 
           vector_handler_->massDot2Vec(n, V, i, vec_v_, vec_rv_, "cuda");
-          vec_rv_->setDataUpdated("cuda");
-          vec_rv_->copyData("cuda", "cpu");
+          vec_rv_->setDataUpdated(memory::DEVICE);
+          vec_rv_->copyData(memory::DEVICE, memory::HOST);
 
-          vec_rv_->deepCopyVectorData(&h_L_[idxmap(i, 0, num_vecs_ + 1)], 0, "cpu");
-          h_rv_ = vec_rv_->getVectorData(1, "cpu");
+          vec_rv_->deepCopyVectorData(&h_L_[idxmap(i, 0, num_vecs_ + 1)], 0, memory::HOST);
+          h_rv_ = vec_rv_->getVectorData(1, memory::HOST);
 
           for(int j=0; j<=i; ++j) {
             H[ idxmap(i, j, num_vecs_ + 1) ] = 0.0;
@@ -225,7 +225,7 @@ namespace ReSolve
             H[ idxmap(i, j, num_vecs_ + 1) ] -= s; 
           }   // for j
           vec_Hcolumn_->setCurrentSize(i + 1);
-          vec_Hcolumn_->update(&H[ idxmap(i, 0, num_vecs_ + 1)], "cpu", "cuda"); 
+          vec_Hcolumn_->update(&H[ idxmap(i, 0, num_vecs_ + 1)], memory::HOST, memory::DEVICE); 
           vector_handler_->massAxpy(n, vec_Hcolumn_, i, V, vec_w_, "cuda");
 
           // normalize (second synch)
@@ -243,16 +243,16 @@ namespace ReSolve
           return 0;
           break;
         case mgs_pm:
-          vec_v_->setData(V->getVectorData(i, "cuda"), "cuda");
-          vec_w_->setData(V->getVectorData(i + 1, "cuda"), "cuda");
+          vec_v_->setData(V->getVectorData(i, memory::DEVICE), memory::DEVICE);
+          vec_w_->setData(V->getVectorData(i + 1, memory::DEVICE), memory::DEVICE);
           vec_rv_->setCurrentSize(i + 1);
 
           vector_handler_->massDot2Vec(n, V, i, vec_v_, vec_rv_, "cuda");
-          vec_rv_->setDataUpdated("cuda");
-          vec_rv_->copyData("cuda", "cpu");
+          vec_rv_->setDataUpdated(memory::DEVICE);
+          vec_rv_->copyData(memory::DEVICE, memory::HOST);
 
-          vec_rv_->deepCopyVectorData(&h_L_[idxmap(i, 0, num_vecs_ + 1)], 0, "cpu");
-          h_rv_ = vec_rv_->getVectorData(1, "cpu");
+          vec_rv_->deepCopyVectorData(&h_L_[idxmap(i, 0, num_vecs_ + 1)], 0, memory::HOST);
+          h_rv_ = vec_rv_->getVectorData(1, memory::HOST);
 
           for(int j = 0; j <= i; ++j) {
             H[ idxmap(i, j, num_vecs_ + 1) ] = 0.0;
@@ -295,7 +295,7 @@ namespace ReSolve
           }
 
           vec_Hcolumn_->setCurrentSize(i + 1);
-          vec_Hcolumn_->update(&H[ idxmap(i, 0, num_vecs_ + 1)], "cpu", "cuda"); 
+          vec_Hcolumn_->update(&H[ idxmap(i, 0, num_vecs_ + 1)], memory::HOST, memory::DEVICE); 
 
           vector_handler_->massAxpy(n, vec_Hcolumn_, i, V,  vec_w_, "cuda");
           // normalize (second synch)
diff --git a/resolve/LinSolverDirectCuSolverGLU.cpp b/resolve/LinSolverDirectCuSolverGLU.cpp
index 75039ff4..0350efea 100644
--- a/resolve/LinSolverDirectCuSolverGLU.cpp
+++ b/resolve/LinSolverDirectCuSolverGLU.cpp
@@ -50,14 +50,14 @@ namespace ReSolve
                                            n,
                                            nnz, 
                                            descr_A_, 
-                                           A_->getRowData("cpu"), //kRowPtr_,
-                                           A_->getColData("cpu"), //jCol_, 
+                                           A_->getRowData(memory::HOST), //kRowPtr_,
+                                           A_->getColData(memory::HOST), //jCol_, 
                                            P, /* base-0 */
                                            Q,   /* base-0 */
                                            M_->getNnz(),           /* nnzM */
                                            descr_M_, 
-                                           M_->getRowData("cpu"), 
-                                           M_->getColData("cpu"), 
+                                           M_->getRowData(memory::HOST), 
+                                           M_->getColData(memory::HOST), 
                                            info_M_);
     error_sum += status_cusolver_; 
     //NOW the buffer 
@@ -77,9 +77,9 @@ namespace ReSolve
                                            /* A is original matrix */
                                            nnz, 
                                            descr_A_, 
-                                           A_->getValues("cuda"),  //da_, 
-                                           A_->getRowData("cuda"), //kRowPtr_,
-                                           A_->getColData("cuda"), //jCol_, 
+                                           A_->getValues( memory::DEVICE), //da_, 
+                                           A_->getRowData(memory::DEVICE), //kRowPtr_,
+                                           A_->getColData(memory::DEVICE), //jCol_, 
                                            info_M_);
     error_sum += status_cusolver_; 
 
@@ -93,15 +93,15 @@ namespace ReSolve
   {
 // L and U need to be in CSC format
     index_type n = L->getNumRows();
-    index_type* Lp = L->getColData("cpu"); 
-    index_type* Li = L->getRowData("cpu"); 
-    index_type* Up = U->getColData("cpu"); 
-    index_type* Ui = U->getRowData("cpu"); 
+    index_type* Lp = L->getColData(memory::HOST); 
+    index_type* Li = L->getRowData(memory::HOST); 
+    index_type* Up = U->getColData(memory::HOST); 
+    index_type* Ui = U->getRowData(memory::HOST); 
     index_type nnzM = ( L->getNnz() + U->getNnz() - n );
     M_ = new matrix::Csr(n, n, nnzM);
-    M_->allocateMatrixData("cpu");
-    index_type* mia = M_->getRowData("cpu");
-    index_type* mja = M_->getColData("cpu");
+    M_->allocateMatrixData(memory::HOST);
+    index_type* mia = M_->getRowData(memory::HOST);
+    index_type* mja = M_->getColData(memory::HOST);
     index_type row;
     for(index_type i = 0; i < n; ++i) {
       // go through EACH COLUMN OF L first
@@ -153,9 +153,9 @@ namespace ReSolve
                                             /* A is original matrix */
                                             A_->getNnzExpanded(),
                                             descr_A_,
-                                            A_->getValues("cuda"),  //da_, 
-                                            A_->getRowData("cuda"), //kRowPtr_,
-                                            A_->getColData("cuda"), //jCol_, 
+                                            A_->getValues( memory::DEVICE), //da_, 
+                                            A_->getRowData(memory::DEVICE), //kRowPtr_,
+                                            A_->getColData(memory::DEVICE), //jCol_, 
                                             info_M_);
     error_sum += status_cusolver_;
 
@@ -173,11 +173,11 @@ namespace ReSolve
                                             /* A is original matrix */
                                             A_->getNnz(),
                                             descr_A_,
-                                            A_->getValues("cuda"),  //da_, 
-                                            A_->getRowData("cuda"), //kRowPtr_,
-                                            A_->getColData("cuda"), //jCol_, 
-                                            rhs->getData("cuda"),/* right hand side */
-                                            x->getData("cuda"),/* left hand side */
+                                            A_->getValues( memory::DEVICE), //da_, 
+                                            A_->getRowData(memory::DEVICE), //kRowPtr_,
+                                            A_->getColData(memory::DEVICE), //jCol_, 
+                                            rhs->getData(memory::DEVICE),/* right hand side */
+                                            x->getData(memory::DEVICE),/* left hand side */
                                             &ite_refine_succ_,
                                             &r_nrminf_,
                                             info_M_,
diff --git a/resolve/LinSolverDirectCuSolverRf.cpp b/resolve/LinSolverDirectCuSolverRf.cpp
index d51218cc..37a3ffda 100644
--- a/resolve/LinSolverDirectCuSolverRf.cpp
+++ b/resolve/LinSolverDirectCuSolverRf.cpp
@@ -35,17 +35,17 @@ namespace ReSolve
     error_sum += status_cusolverrf_;
     status_cusolverrf_ = cusolverRfSetupDevice(n, 
                                                A_->getNnzExpanded(),
-                                               A_->getRowData("cuda"), //dia_,
-                                               A_->getColData("cuda"), //dja_,
-                                               A_->getValues("cuda"),  //da_,
+                                               A_->getRowData(memory::DEVICE), //dia_,
+                                               A_->getColData(memory::DEVICE), //dja_,
+                                               A_->getValues( memory::DEVICE), //da_,
                                                L->getNnz(),
-                                               L->getRowData("cuda"),
-                                               L->getColData("cuda"),
-                                               L->getValues("cuda"),
+                                               L->getRowData(memory::DEVICE),
+                                               L->getColData(memory::DEVICE),
+                                               L->getValues( memory::DEVICE),
                                                U->getNnz(),
-                                               U->getRowData("cuda"),
-                                               U->getColData("cuda"),
-                                               U->getValues("cuda"),
+                                               U->getRowData(memory::DEVICE),
+                                               U->getColData(memory::DEVICE),
+                                               U->getValues( memory::DEVICE),
                                                d_P_,
                                                d_Q_,
                                                handle_cusolverrf_);
@@ -76,9 +76,9 @@ namespace ReSolve
     int error_sum = 0;
     status_cusolverrf_ = cusolverRfResetValues(A_->getNumRows(), 
                                                A_->getNnzExpanded(), 
-                                               A_->getRowData("cuda"), //dia_,
-                                               A_->getColData("cuda"), //dja_,
-                                               A_->getValues("cuda"),  //da_,
+                                               A_->getRowData(memory::DEVICE), //dia_,
+                                               A_->getColData(memory::DEVICE), //dja_,
+                                               A_->getValues( memory::DEVICE), //da_,
                                                d_P_,
                                                d_Q_,
                                                handle_cusolverrf_);
@@ -100,22 +100,22 @@ namespace ReSolve
                                           1,
                                           d_T_,
                                           A_->getNumRows(),
-                                          rhs->getData("cuda"),
+                                          rhs->getData(memory::DEVICE),
                                           A_->getNumRows());
     return status_cusolverrf_;
   }
 
   int LinSolverDirectCuSolverRf::solve(vector_type* rhs, vector_type* x)
   {
-    x->update(rhs->getData("cuda"), "cuda", "cuda");
-    x->setDataUpdated("cuda");
+    x->update(rhs->getData(memory::DEVICE), memory::DEVICE, memory::DEVICE); // copy rhs into x; source and destination are both DEVICE
+    x->setDataUpdated(memory::DEVICE);
     status_cusolverrf_ =  cusolverRfSolve(handle_cusolverrf_,
                                           d_P_,
                                           d_Q_,
                                           1,
                                           d_T_,
                                           A_->getNumRows(),
-                                          x->getData("cuda"),
+                                          x->getData(memory::DEVICE), // in: the copy of rhs made above; out: solution (cusolverRfSolve overwrites this buffer)
                                           A_->getNumRows());
     return status_cusolverrf_;
   }
diff --git a/resolve/LinSolverDirectKLU.cpp b/resolve/LinSolverDirectKLU.cpp
index 43b612b3..6af27d10 100644
--- a/resolve/LinSolverDirectKLU.cpp
+++ b/resolve/LinSolverDirectKLU.cpp
@@ -35,7 +35,7 @@ namespace ReSolve
 
   int LinSolverDirectKLU::analyze() 
   {
-    Symbolic_ = klu_analyze(A_->getNumRows(), A_->getRowData("cpu"), A_->getColData("cpu"), &Common_) ;
+    Symbolic_ = klu_analyze(A_->getNumRows(), A_->getRowData(memory::HOST), A_->getColData(memory::HOST), &Common_) ;
 
     if (Symbolic_ == nullptr){
       printf("Symbolic_ factorization crashed withCommon_.status = %d \n", Common_.status);
@@ -46,7 +46,7 @@ namespace ReSolve
 
   int LinSolverDirectKLU::factorize() 
   {
-    Numeric_ = klu_factor(A_->getRowData("cpu"), A_->getColData("cpu"),A_->getValues("cpu"), Symbolic_, &Common_);
+    Numeric_ = klu_factor(A_->getRowData(memory::HOST), A_->getColData(memory::HOST), A_->getValues(memory::HOST), Symbolic_, &Common_);
 
     if (Numeric_ == nullptr){
       return 1;
@@ -56,7 +56,7 @@ namespace ReSolve
 
   int  LinSolverDirectKLU::refactorize() 
   {
-    int kluStatus = klu_refactor (A_->getRowData("cpu"), A_->getColData("cpu"), A_->getValues("cpu"), Symbolic_, Numeric_, &Common_);
+    int kluStatus = klu_refactor (A_->getRowData(memory::HOST), A_->getColData(memory::HOST), A_->getValues(memory::HOST), Symbolic_, Numeric_, &Common_);
 
     if (!kluStatus){
       //display error
@@ -71,10 +71,10 @@ namespace ReSolve
 
     //  std::memcpy(x, rhs, A->getNumRows() * sizeof(real_type));
 
-    x->update(rhs->getData("cpu"), "cpu", "cpu");
-    x->setDataUpdated("cpu");
+    x->update(rhs->getData(memory::HOST), memory::HOST, memory::HOST);
+    x->setDataUpdated(memory::HOST);
 
-    int kluStatus = klu_solve(Symbolic_, Numeric_, A_->getNumRows(), 1, x->getData("cpu"), &Common_);
+    int kluStatus = klu_solve(Symbolic_, Numeric_, A_->getNumRows(), 1, x->getData(memory::HOST), &Common_);
 
     if (!kluStatus){
       return 1;
@@ -90,16 +90,16 @@ namespace ReSolve
 
       L_ = new matrix::Csc(A_->getNumRows(), A_->getNumColumns(), nnzL);
       U_ = new matrix::Csc(A_->getNumRows(), A_->getNumColumns(), nnzU);
-      L_->allocateMatrixData("cpu");
-      U_->allocateMatrixData("cpu");
+      L_->allocateMatrixData(memory::HOST);
+      U_->allocateMatrixData(memory::HOST);
       int ok = klu_extract(Numeric_, 
                            Symbolic_, 
-                           L_->getColData("cpu"), 
-                           L_->getRowData("cpu"), 
-                           L_->getValues("cpu"), 
-                           U_->getColData("cpu"), 
-                           U_->getRowData("cpu"), 
-                           U_->getValues("cpu"), 
+                           L_->getColData(memory::HOST), 
+                           L_->getRowData(memory::HOST), 
+                           L_->getValues( memory::HOST), 
+                           U_->getColData(memory::HOST), 
+                           U_->getRowData(memory::HOST), 
+                           U_->getValues( memory::HOST), 
                            nullptr, 
                            nullptr, 
                            nullptr, 
@@ -109,8 +109,8 @@ namespace ReSolve
                            nullptr,
                            &Common_);
 
-      L_->setUpdated("cpu");
-      U_->setUpdated("cpu");
+      L_->setUpdated(memory::HOST);
+      U_->setUpdated(memory::HOST);
       (void) ok; // TODO: Check status in ok before setting `factors_extracted_`
       factors_extracted_ = true;
     }
@@ -125,16 +125,16 @@ namespace ReSolve
 
       L_ = new matrix::Csc(A_->getNumRows(), A_->getNumColumns(), nnzL);
       U_ = new matrix::Csc(A_->getNumRows(), A_->getNumColumns(), nnzU);
-      L_->allocateMatrixData("cpu");
-      U_->allocateMatrixData("cpu");
+      L_->allocateMatrixData(memory::HOST);
+      U_->allocateMatrixData(memory::HOST);
       int ok = klu_extract(Numeric_, 
                            Symbolic_, 
-                           L_->getColData("cpu"), 
-                           L_->getRowData("cpu"), 
-                           L_->getValues("cpu"), 
-                           U_->getColData("cpu"), 
-                           U_->getRowData("cpu"), 
-                           U_->getValues("cpu"), 
+                           L_->getColData(memory::HOST), 
+                           L_->getRowData(memory::HOST), 
+                           L_->getValues( memory::HOST), 
+                           U_->getColData(memory::HOST), 
+                           U_->getRowData(memory::HOST), 
+                           U_->getValues( memory::HOST), 
                            nullptr, 
                            nullptr, 
                            nullptr, 
@@ -144,8 +144,8 @@ namespace ReSolve
                            nullptr,
                            &Common_);
 
-      L_->setUpdated("cpu");
-      U_->setUpdated("cpu");
+      L_->setUpdated(memory::HOST);
+      U_->setUpdated(memory::HOST);
 
       (void) ok; // TODO: Check status in ok before setting `factors_extracted_`
       factors_extracted_ = true;
diff --git a/resolve/LinSolverIterativeFGMRES.cpp b/resolve/LinSolverIterativeFGMRES.cpp
index fa63f2d5..0bf1720f 100644
--- a/resolve/LinSolverIterativeFGMRES.cpp
+++ b/resolve/LinSolverIterativeFGMRES.cpp
@@ -82,9 +82,9 @@ namespace ReSolve
     n_ = A_->getNumRows();
 
     d_V_ = new vector_type(n_, restart_ + 1);
-    d_V_->allocate("cuda");      
+    d_V_->allocate(memory::DEVICE);      
     d_Z_ = new vector_type(n_, restart_ + 1);
-    d_Z_->allocate("cuda");      
+    d_Z_->allocate(memory::DEVICE);      
     h_H_  = new real_type[restart_ * (restart_ + 1)];
     h_c_  = new real_type[restart_];      // needed for givens
     h_s_  = new real_type[restart_];      // same
@@ -114,7 +114,7 @@ namespace ReSolve
     vector_type* vec_z = new vector_type(n_);
     //V[0] = b-A*x_0
 
-    rhs->deepCopyVectorData(d_V_->getData("cuda"), 0, "cuda");  
+    rhs->deepCopyVectorData(d_V_->getData(memory::DEVICE), 0, memory::DEVICE);  
     matrix_handler_->matvec(A_, x, d_V_, &MINUSONE, &ONE, "csr", "cuda"); 
     rnorm = 0.0;
     bnorm = vector_handler_->dot(rhs, rhs, "cuda");
@@ -166,14 +166,14 @@ namespace ReSolve
 
         // Z_i = (LU)^{-1}*V_i
 
-        vec_v->setData( d_V_->getVectorData(i, "cuda"), "cuda");
-        vec_z->setData( d_Z_->getVectorData(i, "cuda"), "cuda");
+        vec_v->setData( d_V_->getVectorData(i, memory::DEVICE), memory::DEVICE);
+        vec_z->setData( d_Z_->getVectorData(i, memory::DEVICE), memory::DEVICE);
         this->precV(vec_v, vec_z);
         mem_.deviceSynchronize();
 
         // V_{i+1}=A*Z_i
 
-        vec_v->setData( d_V_->getVectorData(i + 1, "cuda"), "cuda");
+        vec_v->setData( d_V_->getVectorData(i + 1, memory::DEVICE), memory::DEVICE);
 
         matrix_handler_->matvec(A_, vec_z, vec_v, &ONE, &ZERO,"csr", "cuda"); 
 
@@ -228,7 +228,7 @@ namespace ReSolve
 
       // get solution
       for(j = 0; j <= i; j++) {
-        vec_z->setData( d_Z_->getVectorData(j, "cuda"), "cuda");
+        vec_z->setData( d_Z_->getVectorData(j, memory::DEVICE), memory::DEVICE);
         vector_handler_->axpy(&h_rs_[j], vec_z, x, "cuda");
       }
 
@@ -239,7 +239,7 @@ namespace ReSolve
         outer_flag = 0;
       }
 
-      rhs->deepCopyVectorData(d_V_->getData("cuda"), 0, "cuda");  
+      rhs->deepCopyVectorData(d_V_->getData(memory::DEVICE), 0, memory::DEVICE);  
       matrix_handler_->matvec(A_, x, d_V_, &MINUSONE, &ONE,"csr", "cuda"); 
       rnorm = vector_handler_->dot(d_V_, d_V_, "cuda");
       // rnorm = ||V_1||
@@ -317,7 +317,7 @@ namespace ReSolve
   void  LinSolverIterativeFGMRES::precV(vector_type* rhs, vector_type* x)
   { 
     LU_solver_->solve(rhs, x);
-    //  x->update(rhs->getData("cuda"), "cuda", "cuda");
+    //  x->update(rhs->getData(memory::DEVICE), memory::DEVICE, memory::DEVICE);
   }
 
   real_type LinSolverIterativeFGMRES::getFinalResidualNorm()
diff --git a/resolve/MemoryUtils.hpp b/resolve/MemoryUtils.hpp
index 5e2da403..d87c621f 100644
--- a/resolve/MemoryUtils.hpp
+++ b/resolve/MemoryUtils.hpp
@@ -2,6 +2,16 @@
 
 #include <resolve/resolve_defs.hpp>
 
+
+namespace ReSolve
+{
+  namespace memory // enumerations used to select memory spaces in ReSolve calls
+  {
+    enum MemorySpace{HOST = 0, DEVICE}; ///< Memory space selector: HOST is CPU memory, DEVICE is accelerator (CUDA/HIP) memory.
+    enum MemoryDirection{HOST_TO_HOST = 0, HOST_TO_DEVICE, DEVICE_TO_HOST, DEVICE_TO_DEVICE}; ///< Source/destination pair of memory spaces, named <from>_TO_<to> (presumably for copies -- confirm at use sites).
+  }
+}
+
 namespace ReSolve
 {
   /**
diff --git a/resolve/matrix/Coo.cpp b/resolve/matrix/Coo.cpp
index eeff7b86..326eba59 100644
--- a/resolve/matrix/Coo.cpp
+++ b/resolve/matrix/Coo.cpp
@@ -27,52 +27,49 @@ namespace ReSolve
   {
   }
 
-  index_type* matrix::Coo::getRowData(std::string memspace)
+  index_type* matrix::Coo::getRowData(memory::MemorySpace memspace) // returns the row-data pointer held in `memspace`
   {
-    if (memspace == "cpu") {
-      copyData("cpu");
-      return this->h_row_data_;
-    } else {
-      if ((memspace == "cuda") || (memspace == "hip")) {
-        copyData(memspace);
+    using namespace ReSolve::memory; // lets the case labels below name HOST/DEVICE unqualified
+    copyData(memspace); // sync `memspace` from the other space if needed; called before the value is validated
+    switch (memspace) {
+      case HOST:
+        return this->h_row_data_;
+      case DEVICE:
         return this->d_row_data_;
-      } else {
+      default: // value outside the MemorySpace enumerators
         return nullptr;
-      }
     }
   }
 
-  index_type* matrix::Coo::getColData(std::string memspace)
+  index_type* matrix::Coo::getColData(memory::MemorySpace memspace) // returns the column-data pointer held in `memspace`
   {
-    if (memspace == "cpu") {
-      copyData("cpu");
-      return this->h_col_data_;
-    } else {
-      if ((memspace == "cuda") || (memspace == "hip")) {
-        copyData(memspace);
+    using namespace ReSolve::memory; // lets the case labels below name HOST/DEVICE unqualified
+    copyData(memspace); // sync `memspace` from the other space if needed; called before the value is validated
+    switch (memspace) {
+      case HOST:
+        return this->h_col_data_;
+      case DEVICE:
         return this->d_col_data_;
-      } else {
+      default: // value outside the MemorySpace enumerators
         return nullptr;
-      }
     }
   }
 
-  real_type* matrix::Coo::getValues(std::string memspace)
+  real_type* matrix::Coo::getValues(memory::MemorySpace memspace) // returns the values pointer held in `memspace`
   {
-    if (memspace == "cpu") {
-      copyData("cpu");
-      return this->h_val_data_;
-    } else {
-      if ((memspace == "cuda") || (memspace == "hip")) {
-        copyData(memspace);
+    using namespace ReSolve::memory; // lets the case labels below name HOST/DEVICE unqualified
+    copyData(memspace); // sync `memspace` from the other space if needed; called before the value is validated
+    switch (memspace) {
+      case HOST:
+        return this->h_val_data_;
+      case DEVICE:
         return this->d_val_data_;
-      } else {
+      default: // value outside the MemorySpace enumerators
         return nullptr;
-      }
     }
   }
 
-  index_type matrix::Coo::updateData(index_type* row_data, index_type* col_data, real_type* val_data, std::string memspaceIn, std::string memspaceOut)
+  index_type matrix::Coo::updateData(index_type* row_data, index_type* col_data, real_type* val_data, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut)
   {
 
     //four cases (for now)
@@ -80,12 +77,12 @@ namespace ReSolve
     if (is_expanded_) {nnz_current = nnz_expanded_;}
     setNotUpdated();
     int control=-1;
-    if ((memspaceIn == "cpu") && (memspaceOut == "cpu")){ control = 0;}
-    if ((memspaceIn == "cpu") && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 1;}
-    if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && (memspaceOut == "cpu")){ control = 2;}
-    if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 3;}
+    if ((memspaceIn == memory::HOST) && (memspaceOut == memory::HOST)){ control = 0;}
+    if ((memspaceIn == memory::HOST) && ((memspaceOut == memory::DEVICE))){ control = 1;}
+    if (((memspaceIn == memory::DEVICE)) && (memspaceOut == memory::HOST)){ control = 2;}
+    if (((memspaceIn == memory::DEVICE)) && ((memspaceOut == memory::DEVICE))){ control = 3;}
 
-    if (memspaceOut == "cpu") {
+    if (memspaceOut == memory::HOST) {
       //check if cpu data allocated	
       if (h_row_data_ == nullptr) {
         this->h_row_data_ = new index_type[nnz_current];
@@ -98,7 +95,7 @@ namespace ReSolve
       }
     }
 
-    if ((memspaceOut == "cuda") || (memspaceOut == "hip")) {
+    if (memspaceOut == memory::DEVICE) {
       //check if cuda data allocated
       if (d_row_data_ == nullptr) {
         mem_.allocateArrayOnDevice(&d_row_data_, nnz_current);
@@ -150,7 +147,7 @@ namespace ReSolve
     return 0;
   } 
 
-  index_type matrix::Coo::updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, std::string memspaceIn, std::string memspaceOut)
+  index_type matrix::Coo::updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut)
   {
     this->destroyMatrixData(memspaceOut);
     this->nnz_ = new_nnz;
@@ -158,13 +155,13 @@ namespace ReSolve
     return i;
   } 
 
-  index_type matrix::Coo::allocateMatrixData(std::string memspace)
+  index_type matrix::Coo::allocateMatrixData(memory::MemorySpace memspace)
   {
     index_type nnz_current = nnz_;
     if (is_expanded_) {nnz_current = nnz_expanded_;}
     destroyMatrixData(memspace);//just in case
 
-    if (memspace == "cpu") {
+    if (memspace == memory::HOST) {
       this->h_row_data_ = new index_type[nnz_current];
       std::fill(h_row_data_, h_row_data_ + nnz_current, 0);  
       this->h_col_data_ = new index_type[nnz_current];
@@ -176,7 +173,7 @@ namespace ReSolve
       return 0;
     }
 
-    if ((memspace == "cuda") || (memspace == "hip")) {
+    if (memspace == memory::DEVICE) {
       mem_.allocateArrayOnDevice(&d_row_data_, nnz_current); 
       mem_.allocateArrayOnDevice(&d_col_data_, nnz_current); 
       mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); 
@@ -187,55 +184,57 @@ namespace ReSolve
     return -1;
   }
 
-  int matrix::Coo::copyData(std::string memspaceOut)
+  int matrix::Coo::copyData(memory::MemorySpace memspaceOut) // Brings the COO arrays up to date in memspaceOut; returns 0 for HOST/DEVICE, -1 otherwise.
   {
+    using namespace ReSolve::memory; // lets the switch below name HOST and DEVICE unqualified
 
     index_type nnz_current = nnz_;
-    if (is_expanded_) {nnz_current = nnz_expanded_;}
-
-    if (memspaceOut == "cpu") {
-      //check if we need to copy or not
-      if ((d_data_updated_ == true) && (h_data_updated_ == false)) {
-        if (h_row_data_ == nullptr) {
-          h_row_data_ = new index_type[nnz_current];      
-        }
-        if (h_col_data_ == nullptr) {
-          h_col_data_ = new index_type[nnz_current];      
-        }
-        if (h_val_data_ == nullptr) {
-          h_val_data_ = new real_type[nnz_current];      
-        }
-        mem_.copyArrayDeviceToHost(h_row_data_, d_row_data_, nnz_current);
-        mem_.copyArrayDeviceToHost(h_col_data_, d_col_data_, nnz_current);
-        mem_.copyArrayDeviceToHost(h_val_data_, d_val_data_, nnz_current);
-        h_data_updated_ = true;
-        owns_cpu_data_ = true;
-        owns_cpu_vals_ = true;
-      }
-      return 0;
+    if (is_expanded_) { // when flagged as expanded, size everything by the expanded nonzero count
+      nnz_current = nnz_expanded_;
     }
 
-    if ((memspaceOut == "cuda") || (memspaceOut == "hip")) {
-      if ((d_data_updated_ == false) && (h_data_updated_ == true)) {
-        if (d_row_data_ == nullptr) {
-          mem_.allocateArrayOnDevice(&d_row_data_, nnz_current);
-        }
-        if (d_col_data_ == nullptr) {
-          mem_.allocateArrayOnDevice(&d_col_data_, nnz_current);
+    switch (memspaceOut) {
+      case HOST: // host copy requested
+        if ((d_data_updated_ == true) && (h_data_updated_ == false)) { // copy only if device is flagged updated and host is not
+          if (h_row_data_ == nullptr) { // allocate each missing host array before copying into it
+            h_row_data_ = new index_type[nnz_current];      
+          }
+          if (h_col_data_ == nullptr) {
+            h_col_data_ = new index_type[nnz_current];      
+          }
+          if (h_val_data_ == nullptr) {
+            h_val_data_ = new real_type[nnz_current];      
+          }
+          mem_.copyArrayDeviceToHost(h_row_data_, d_row_data_, nnz_current); // all three COO arrays hold nnz_current entries
+          mem_.copyArrayDeviceToHost(h_col_data_, d_col_data_, nnz_current);
+          mem_.copyArrayDeviceToHost(h_val_data_, d_val_data_, nnz_current);
+          h_data_updated_ = true; // host side is now flagged as up to date
+          owns_cpu_data_ = true; // set unconditionally here, even if the host arrays already existed
+          owns_cpu_vals_ = true;
         }
-        if (d_val_data_ == nullptr) {
-          mem_.allocateArrayOnDevice(&d_val_data_, nnz_current);
+        return 0; // HOST reports success whether or not a copy was needed
+      case DEVICE: // device copy requested
+        if ((d_data_updated_ == false) && (h_data_updated_ == true)) { // copy only if host is flagged updated and device is not
+          if (d_row_data_ == nullptr) { // allocate each missing device array before copying into it
+            mem_.allocateArrayOnDevice(&d_row_data_, nnz_current);
+          }
+          if (d_col_data_ == nullptr) {
+            mem_.allocateArrayOnDevice(&d_col_data_, nnz_current);
+          }
+          if (d_val_data_ == nullptr) {
+            mem_.allocateArrayOnDevice(&d_val_data_, nnz_current);
+          }
+          mem_.copyArrayHostToDevice(d_row_data_, h_row_data_, nnz_current); // all three COO arrays hold nnz_current entries
+          mem_.copyArrayHostToDevice(d_col_data_, h_col_data_, nnz_current);
+          mem_.copyArrayHostToDevice(d_val_data_, h_val_data_, nnz_current);
+          d_data_updated_ = true; // device side is now flagged as up to date
+          owns_gpu_data_ = true; // set unconditionally here, even if the device arrays already existed
+          owns_gpu_vals_ = true;
         }
-        mem_.copyArrayHostToDevice(d_row_data_, h_row_data_, nnz_current);
-        mem_.copyArrayHostToDevice(d_col_data_, h_col_data_, nnz_current);
-        mem_.copyArrayHostToDevice(d_val_data_, h_val_data_, nnz_current);
-        d_data_updated_ = true;
-        owns_gpu_data_ = true;
-        owns_gpu_vals_ = true;
-      }
-      return 0;
-    }
-    return -1;
+        return 0; // DEVICE reports success whether or not a copy was needed
+      default: // any memory space other than HOST or DEVICE
+        return -1;
+    } // switch
   }
 
   void matrix::Coo::print()
diff --git a/resolve/matrix/Coo.hpp b/resolve/matrix/Coo.hpp
index 3ec045c3..bc67ceef 100644
--- a/resolve/matrix/Coo.hpp
+++ b/resolve/matrix/Coo.hpp
@@ -15,18 +15,18 @@ namespace ReSolve { namespace matrix {
                 bool expanded);
       ~Coo();
 
-      virtual index_type* getRowData(std::string memspace);
-      virtual index_type* getColData(std::string memspace);
-      virtual real_type* getValues(std::string memspace); 
+      virtual index_type* getRowData(memory::MemorySpace memspace);
+      virtual index_type* getColData(memory::MemorySpace memspace);
+      virtual real_type*  getValues( memory::MemorySpace memspace); 
 
-      virtual index_type updateData(index_type* row_data, index_type* col_data, real_type* val_data, std::string memspaceIn, std::string memspaceOut); 
-      virtual index_type updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, std::string memspaceIn, std::string memspaceOut); 
+      virtual index_type updateData(index_type* row_data, index_type* col_data, real_type* val_data, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut); 
+      virtual index_type updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut); 
 
-      virtual index_type allocateMatrixData(std::string memspace);
+      virtual index_type allocateMatrixData(memory::MemorySpace memspace);
 
       virtual void print();
 
-      virtual int copyData(std::string memspaceOut);
+      virtual int copyData(memory::MemorySpace memspaceOut);
   };
 
 }} // namespace ReSolve::matrix
diff --git a/resolve/matrix/Csc.cpp b/resolve/matrix/Csc.cpp
index f6358df3..e6fed07c 100644
--- a/resolve/matrix/Csc.cpp
+++ b/resolve/matrix/Csc.cpp
@@ -24,64 +24,61 @@ namespace ReSolve
   {
   }
 
-  index_type* matrix::Csc::getRowData(std::string memspace)
+  index_type* matrix::Csc::getRowData(memory::MemorySpace memspace) // Returns the CSC row-data array for memspace, or nullptr for an unhandled space.
   {
-    if (memspace == "cpu") {
-      copyData("cpu");
-      return this->h_row_data_;
-    } else {
-      if ((memspace == "cuda") || (memspace == "hip")) {
-        copyData(memspace);
+    using namespace ReSolve::memory; // lets the switch below name HOST and DEVICE unqualified
+    copyData(memspace); // called before the switch, so it also runs for unhandled spaces; its return code is not checked
+    switch (memspace) {
+      case HOST: // host-side array
+        return this->h_row_data_;
+      case DEVICE: // device-side array
         return this->d_row_data_;
-      } else {
+      default: // neither HOST nor DEVICE
         return nullptr;
-      }
     }
   }
 
-  index_type* matrix::Csc::getColData(std::string memspace)
+  index_type* matrix::Csc::getColData(memory::MemorySpace memspace) // Returns the CSC column-data array for memspace, or nullptr for an unhandled space.
   {
-    if (memspace == "cpu") {
-      copyData("cpu");
-      return this->h_col_data_;
-    } else {
-      if ((memspace == "cuda") || (memspace == "hip")) {
-        copyData(memspace);
+    using namespace ReSolve::memory; // lets the switch below name HOST and DEVICE unqualified
+    copyData(memspace); // called before the switch, so it also runs for unhandled spaces; its return code is not checked
+    switch (memspace) {
+      case HOST: // host-side array
+        return this->h_col_data_;
+      case DEVICE: // device-side array
         return this->d_col_data_;
-      } else {
+      default: // neither HOST nor DEVICE
         return nullptr;
-      }
     }
   }
 
-  real_type* matrix::Csc::getValues(std::string memspace)
+  real_type* matrix::Csc::getValues(memory::MemorySpace memspace) // Returns the CSC values array for memspace, or nullptr for an unhandled space.
   {
-    if (memspace == "cpu") {
-      copyData("cpu");
-      return this->h_val_data_;
-    } else {
-      if ((memspace == "cuda") || (memspace == "hip")) {
-        copyData(memspace);
+    using namespace ReSolve::memory; // lets the switch below name HOST and DEVICE unqualified
+    copyData(memspace); // called before the switch, so it also runs for unhandled spaces; its return code is not checked
+    switch (memspace) {
+      case HOST: // host-side array
+        return this->h_val_data_;
+      case DEVICE: // device-side array
         return this->d_val_data_;
-      } else {
+      default: // neither HOST nor DEVICE
         return nullptr;
-      }
     }
   }
 
-  int matrix::Csc::updateData(index_type* row_data, index_type* col_data, real_type* val_data, std::string memspaceIn, std::string memspaceOut)
+  int matrix::Csc::updateData(index_type* row_data, index_type* col_data, real_type* val_data, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut)
   {
     index_type nnz_current = nnz_;
     if (is_expanded_) {nnz_current = nnz_expanded_;}
     //four cases (for now)
     int control=-1;
     setNotUpdated();
-    if ((memspaceIn == "cpu") && (memspaceOut == "cpu")){ control = 0;}
-    if ((memspaceIn == "cpu") && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 1;}
-    if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && (memspaceOut == "cpu")){ control = 2;}
-    if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 3;}
+    if ((memspaceIn == memory::HOST)     && (memspaceOut == memory::HOST))    { control = 0;}
+    if ((memspaceIn == memory::HOST)     && ((memspaceOut == memory::DEVICE))){ control = 1;}
+    if (((memspaceIn == memory::DEVICE)) && (memspaceOut == memory::HOST))    { control = 2;}
+    if (((memspaceIn == memory::DEVICE)) && ((memspaceOut == memory::DEVICE))){ control = 3;}
 
-    if (memspaceOut == "cpu") {
+    if (memspaceOut == memory::HOST) {
       //check if cpu data allocated
       if (h_col_data_ == nullptr) {
         this->h_col_data_ = new index_type[n_ + 1];
@@ -94,7 +91,7 @@ namespace ReSolve
       }
     }
 
-    if ((memspaceOut == "cuda") || (memspaceOut == "hip")) {
+    if (memspaceOut == memory::DEVICE) {
       //check if cuda data allocated
       if (d_col_data_ == nullptr) {
         mem_.allocateArrayOnDevice(&d_col_data_, n_ + 1); 
@@ -147,7 +144,7 @@ namespace ReSolve
 
   } 
 
-  int matrix::Csc::updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, std::string memspaceIn, std::string memspaceOut)
+  int matrix::Csc::updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut)
   {
     this->destroyMatrixData(memspaceOut);
     this->nnz_ = new_nnz;
@@ -155,13 +152,13 @@ namespace ReSolve
     return i;
   }
 
-  int matrix::Csc::allocateMatrixData(std::string memspace)
+  int matrix::Csc::allocateMatrixData(memory::MemorySpace memspace)
   {
     index_type nnz_current = nnz_;
     if (is_expanded_) {nnz_current = nnz_expanded_;}
     destroyMatrixData(memspace);//just in case
 
-    if (memspace == "cpu") {
+    if (memspace == memory::HOST) {
       this->h_col_data_ = new index_type[n_ + 1];
       std::fill(h_col_data_, h_col_data_ + n_ + 1, 0);  
       this->h_row_data_ = new index_type[nnz_current];
@@ -173,7 +170,7 @@ namespace ReSolve
       return 0;
     }
 
-    if ((memspace == "cuda") || (memspace == "hip")) {
+    if (memspace == memory::DEVICE) {
       mem_.allocateArrayOnDevice(&d_col_data_,      n_ + 1); 
       mem_.allocateArrayOnDevice(&d_row_data_, nnz_current); 
       mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); 
@@ -184,54 +181,56 @@ namespace ReSolve
     return -1;
   }
 
-  int matrix::Csc::copyData(std::string memspaceOut)
+  int matrix::Csc::copyData(memory::MemorySpace memspaceOut) // Brings the CSC arrays up to date in memspaceOut; returns 0 for HOST/DEVICE, -1 otherwise.
   {
+    using namespace ReSolve::memory; // lets the switch below name HOST and DEVICE unqualified
 
     index_type nnz_current = nnz_;
-    if (is_expanded_) {nnz_current = nnz_expanded_;}
-
-    if (memspaceOut == "cpu") {
-      //check if we need to copy or not
-      if ((d_data_updated_ == true) && (h_data_updated_ == false)) {
-        if (h_col_data_ == nullptr) {
-          h_col_data_ = new index_type[n_ + 1];      
-        }
-        if (h_row_data_ == nullptr) {
-          h_row_data_ = new index_type[nnz_current];      
-        }
-        if (h_val_data_ == nullptr) {
-          h_val_data_ = new real_type[nnz_current];      
-        }
-        mem_.copyArrayDeviceToHost(h_col_data_, d_col_data_,      n_ + 1);
-        mem_.copyArrayDeviceToHost(h_row_data_, d_row_data_, nnz_current);
-        mem_.copyArrayDeviceToHost(h_val_data_, d_val_data_, nnz_current);
-        h_data_updated_ = true;
-        owns_cpu_data_ = true;
-        owns_cpu_vals_ = true;
-      }
-      return 0;   
+    if (is_expanded_) { // when flagged as expanded, size row/value arrays by the expanded nonzero count
+      nnz_current = nnz_expanded_;
     }
 
-    if ((memspaceOut == "cuda") || (memspaceOut == "hip")) {
-      if ((d_data_updated_ == false) && (h_data_updated_ == true)) {
-        if (d_col_data_ == nullptr) {
-          mem_.allocateArrayOnDevice(&d_col_data_, n_ + 1); 
+    switch(memspaceOut) {
+      case HOST: // host copy requested
+        if ((d_data_updated_ == true) && (h_data_updated_ == false)) { // copy only if device is flagged updated and host is not
+          if (h_col_data_ == nullptr) { // column array holds n_ + 1 entries, matching the copy size below
+            h_col_data_ = new index_type[n_ + 1];      
+          }
+          if (h_row_data_ == nullptr) { // row and value arrays hold nnz_current entries
+            h_row_data_ = new index_type[nnz_current];      
+          }
+          if (h_val_data_ == nullptr) {
+            h_val_data_ = new real_type[nnz_current];      
+          }
+          mem_.copyArrayDeviceToHost(h_col_data_, d_col_data_,      n_ + 1);
+          mem_.copyArrayDeviceToHost(h_row_data_, d_row_data_, nnz_current);
+          mem_.copyArrayDeviceToHost(h_val_data_, d_val_data_, nnz_current);
+          h_data_updated_ = true; // host side is now flagged as up to date
+          owns_cpu_data_ = true; // set unconditionally here, even if the host arrays already existed
+          owns_cpu_vals_ = true;
         }
-        if (d_row_data_ == nullptr) {
-          mem_.allocateArrayOnDevice(&d_row_data_, nnz_current); 
+        return 0;   // HOST reports success whether or not a copy was needed
+      case DEVICE: // device copy requested
+        if ((d_data_updated_ == false) && (h_data_updated_ == true)) { // copy only if host is flagged updated and device is not
+          if (d_col_data_ == nullptr) { // column array holds n_ + 1 entries, matching the copy size below
+            mem_.allocateArrayOnDevice(&d_col_data_, n_ + 1); 
+          }
+          if (d_row_data_ == nullptr) { // row and value arrays hold nnz_current entries
+            mem_.allocateArrayOnDevice(&d_row_data_, nnz_current); 
+          }
+          if (d_val_data_ == nullptr) {
+            mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); 
+          }
+          mem_.copyArrayHostToDevice(d_col_data_, h_col_data_,      n_ + 1);
+          mem_.copyArrayHostToDevice(d_row_data_, h_row_data_, nnz_current);
+          mem_.copyArrayHostToDevice(d_val_data_, h_val_data_, nnz_current);
+          d_data_updated_ = true; // device side is now flagged as up to date
+          owns_gpu_data_ = true; // set unconditionally here, even if the device arrays already existed
+          owns_gpu_vals_ = true;
         }
-        if (d_val_data_ == nullptr) {
-          mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); 
-        }
-        mem_.copyArrayHostToDevice(d_col_data_, h_col_data_,      n_ + 1);
-        mem_.copyArrayHostToDevice(d_row_data_, h_row_data_, nnz_current);
-        mem_.copyArrayHostToDevice(d_val_data_, h_val_data_, nnz_current);
-        d_data_updated_ = true;
-        owns_gpu_data_ = true;
-        owns_gpu_vals_ = true;
-      }
-      return 0; 
-    }
-    return -1;
+        return 0; // DEVICE reports success whether or not a copy was needed
+      default: // any memory space other than HOST or DEVICE
+        return -1;
+    } // switch
   }
 }
diff --git a/resolve/matrix/Csc.hpp b/resolve/matrix/Csc.hpp
index f0598314..8a5dc551 100644
--- a/resolve/matrix/Csc.hpp
+++ b/resolve/matrix/Csc.hpp
@@ -15,18 +15,18 @@ namespace ReSolve { namespace matrix {
                 bool expanded);
       ~Csc();
 
-      virtual index_type* getRowData(std::string memspace);
-      virtual index_type* getColData(std::string memspace);
-      virtual real_type*  getValues(std::string memspace); 
+      virtual index_type* getRowData(memory::MemorySpace memspace);
+      virtual index_type* getColData(memory::MemorySpace memspace);
+      virtual real_type*  getValues( memory::MemorySpace memspace); 
 
-      virtual int updateData(index_type* row_data, index_type* col_data, real_type* val_data, std::string memspaceIn, std::string memspaceOut); 
-      virtual int updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, std::string memspaceIn, std::string memspaceOut); 
+      virtual int updateData(index_type* row_data, index_type* col_data, real_type* val_data, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut); 
+      virtual int updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut); 
 
-      virtual int allocateMatrixData(std::string memspace);
+      virtual int allocateMatrixData(memory::MemorySpace memspace);
 
       virtual void print() {return;}
 
-      virtual int copyData(std::string memspaceOut);
+      virtual int copyData(memory::MemorySpace memspaceOut);
 
   };
 
diff --git a/resolve/matrix/Csr.cpp b/resolve/matrix/Csr.cpp
index 04e8dff1..0c08b641 100644
--- a/resolve/matrix/Csr.cpp
+++ b/resolve/matrix/Csr.cpp
@@ -24,64 +24,61 @@ namespace ReSolve
   {
   }
 
-  index_type* matrix::Csr::getRowData(std::string memspace)
+  index_type* matrix::Csr::getRowData(memory::MemorySpace memspace) // Returns the CSR row-data array for memspace, or nullptr for an unhandled space.
   {
-    if (memspace == "cpu") {
-      copyData("cpu");
-      return this->h_row_data_;
-    } else {
-      if ((memspace == "cuda") || (memspace == "hip")) {
-        copyData(memspace);
+    using namespace ReSolve::memory; // lets the switch below name HOST and DEVICE unqualified
+    copyData(memspace); // called before the switch, so it also runs for unhandled spaces; its return code is not checked
+    switch (memspace) {
+      case HOST: // host-side array
+        return this->h_row_data_;
+      case DEVICE: // device-side array
         return this->d_row_data_;
-      } else {
+      default: // neither HOST nor DEVICE
         return nullptr;
-      }
     }
   }
 
-  index_type* matrix::Csr::getColData(std::string memspace)
+  index_type* matrix::Csr::getColData(memory::MemorySpace memspace) // Returns the CSR column-data array for memspace, or nullptr for an unhandled space.
   {
-    if (memspace == "cpu") {
-      copyData("cpu");
-      return this->h_col_data_;
-    } else {
-      if ((memspace == "cuda") || (memspace == "hip")) {
-        copyData(memspace);
+    using namespace ReSolve::memory; // lets the switch below name HOST and DEVICE unqualified
+    copyData(memspace); // called before the switch, so it also runs for unhandled spaces; its return code is not checked
+    switch (memspace) {
+      case HOST: // host-side array
+        return this->h_col_data_;
+      case DEVICE: // device-side array
         return this->d_col_data_;
-      } else {
+      default: // neither HOST nor DEVICE
         return nullptr;
-      }
     }
   }
 
-  real_type* matrix::Csr::getValues(std::string memspace)
+  real_type* matrix::Csr::getValues(memory::MemorySpace memspace) // Returns the CSR values array for memspace, or nullptr for an unhandled space.
   {
-    if (memspace == "cpu") {
-      copyData("cpu");
-      return this->h_val_data_;
-    } else {
-      if ((memspace == "cuda") || (memspace == "hip")) {
-        copyData(memspace);
+    using namespace ReSolve::memory; // lets the switch below name HOST and DEVICE unqualified
+    copyData(memspace); // called before the switch, so it also runs for unhandled spaces; its return code is not checked
+    switch (memspace) {
+      case HOST: // host-side array
+        return this->h_val_data_;
+      case DEVICE: // device-side array
         return this->d_val_data_;
-      } else {
+      default: // neither HOST nor DEVICE
         return nullptr;
-      }
     }
   }
 
-  int matrix::Csr::updateData(index_type* row_data, index_type* col_data, real_type* val_data, std::string memspaceIn, std::string memspaceOut)
+  int matrix::Csr::updateData(index_type* row_data, index_type* col_data, real_type* val_data, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut)
   {
     //four cases (for now)
     index_type nnz_current = nnz_;
     if (is_expanded_) {nnz_current = nnz_expanded_;}
     setNotUpdated();
     int control = -1;
-    if ((memspaceIn == "cpu") && (memspaceOut == "cpu")){ control = 0;}
-    if ((memspaceIn == "cpu") && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 1;}
-    if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && (memspaceOut == "cpu")){ control = 2;}
-    if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 3;}
+    if ((memspaceIn == memory::HOST)     && (memspaceOut == memory::HOST))    { control = 0;}
+    if ((memspaceIn == memory::HOST)     && ((memspaceOut == memory::DEVICE))){ control = 1;}
+    if (((memspaceIn == memory::DEVICE)) && (memspaceOut == memory::HOST))    { control = 2;}
+    if (((memspaceIn == memory::DEVICE)) && ((memspaceOut == memory::DEVICE))){ control = 3;}
 
-    if (memspaceOut == "cpu") {
+    if (memspaceOut == memory::HOST) {
       //check if cpu data allocated
       if (h_row_data_ == nullptr) {
         this->h_row_data_ = new index_type[n_ + 1];
@@ -94,7 +91,7 @@ namespace ReSolve
       }
     }
 
-    if ((memspaceOut == "cuda") || (memspaceOut == "hip")) {
+    if (memspaceOut == memory::DEVICE) {
       //check if cuda data allocated
       if (d_row_data_ == nullptr) {
         mem_.allocateArrayOnDevice(&d_row_data_, n_ + 1); 
@@ -148,7 +145,7 @@ namespace ReSolve
     return 0;
   } 
 
-  int matrix::Csr::updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, std::string memspaceIn, std::string memspaceOut)
+  int matrix::Csr::updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut)
   {
     this->destroyMatrixData(memspaceOut);
     this->nnz_ = new_nnz;
@@ -156,13 +153,13 @@ namespace ReSolve
     return i;
   } 
 
-  int matrix::Csr::allocateMatrixData(std::string memspace)
+  int matrix::Csr::allocateMatrixData(memory::MemorySpace memspace)
   {
     index_type nnz_current = nnz_;
     if (is_expanded_) {nnz_current = nnz_expanded_;}
     destroyMatrixData(memspace);//just in case
 
-    if (memspace == "cpu") {
+    if (memspace == memory::HOST) {
       this->h_row_data_ = new index_type[n_ + 1];
       std::fill(h_row_data_, h_row_data_ + n_ + 1, 0);  
       this->h_col_data_ = new index_type[nnz_current];
@@ -174,7 +171,7 @@ namespace ReSolve
       return 0;   
     }
 
-    if ((memspace == "cuda") || (memspace == "hip")) {
+    if (memspace == memory::DEVICE) {
       mem_.allocateArrayOnDevice(&d_row_data_,      n_ + 1); 
       mem_.allocateArrayOnDevice(&d_col_data_, nnz_current); 
       mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); 
@@ -185,54 +182,58 @@ namespace ReSolve
     return -1;
   }
 
-  int matrix::Csr::copyData(std::string memspaceOut)
+  int matrix::Csr::copyData(memory::MemorySpace memspaceOut) // Brings the CSR arrays up to date in memspaceOut; returns 0 for HOST/DEVICE, -1 otherwise.
   {
-    index_type nnz_current = nnz_;
-    if (is_expanded_) {nnz_current = nnz_expanded_;}
+    using namespace ReSolve::memory; // lets the switch below name HOST and DEVICE unqualified
 
-    if (memspaceOut == "cpu") {
-      //check if we need to copy or not
-      if ((d_data_updated_ == true) && (h_data_updated_ == false)) {
-        if (h_row_data_ == nullptr) {
-          h_row_data_ = new index_type[n_ + 1];      
-        }
-        if (h_col_data_ == nullptr) {
-          h_col_data_ = new index_type[nnz_current];      
-        }
-        if (h_val_data_ == nullptr) {
-          h_val_data_ = new real_type[nnz_current];      
-        }
-        mem_.copyArrayDeviceToHost(h_row_data_, d_row_data_,      n_ + 1);
-        mem_.copyArrayDeviceToHost(h_col_data_, d_col_data_, nnz_current);
-        mem_.copyArrayDeviceToHost(h_val_data_, d_val_data_, nnz_current);
-        h_data_updated_ = true;
-        owns_cpu_data_ = true;
-        owns_cpu_vals_ = true;
-      }
-      return 0;
+    index_type nnz_current = nnz_; // default size for the column/value arrays
+    if (is_expanded_) { // when flagged as expanded, use the expanded nonzero count instead
+      nnz_current = nnz_expanded_;
     }
 
-    if ((memspaceOut == "cuda") || (memspaceOut == "hip")) {
-      if ((d_data_updated_ == false) && (h_data_updated_ == true)) {
-        if (d_row_data_ == nullptr) {
-          mem_.allocateArrayOnDevice(&d_row_data_, n_ + 1); 
-        }
-        if (d_col_data_ == nullptr) {
-          mem_.allocateArrayOnDevice(&d_col_data_, nnz_current); 
+    switch (memspaceOut) {
+      case HOST: // host copy requested
+        //check if we need to copy or not
+        if ((d_data_updated_ == true) && (h_data_updated_ == false)) { // copy only if device is flagged updated and host is not
+          if (h_row_data_ == nullptr) { // row array holds n_ + 1 entries, matching the copy size below
+            h_row_data_ = new index_type[n_ + 1];      
+          }
+          if (h_col_data_ == nullptr) { // column and value arrays hold nnz_current entries
+            h_col_data_ = new index_type[nnz_current];      
+          }
+          if (h_val_data_ == nullptr) {
+            h_val_data_ = new real_type[nnz_current];      
+          }
+          mem_.copyArrayDeviceToHost(h_row_data_, d_row_data_,      n_ + 1);
+          mem_.copyArrayDeviceToHost(h_col_data_, d_col_data_, nnz_current);
+          mem_.copyArrayDeviceToHost(h_val_data_, d_val_data_, nnz_current);
+          h_data_updated_ = true; // host side is now flagged as up to date
+          owns_cpu_data_ = true; // set unconditionally here, even if the host arrays already existed
+          owns_cpu_vals_ = true;
         }
-        if (d_val_data_ == nullptr) {
-          mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); 
+        return 0; // HOST reports success whether or not a copy was needed
+      case DEVICE: // device copy requested
+        if ((d_data_updated_ == false) && (h_data_updated_ == true)) { // copy only if host is flagged updated and device is not
+          if (d_row_data_ == nullptr) { // row array holds n_ + 1 entries, matching the copy size below
+            mem_.allocateArrayOnDevice(&d_row_data_, n_ + 1); 
+          }
+          if (d_col_data_ == nullptr) { // column and value arrays hold nnz_current entries
+            mem_.allocateArrayOnDevice(&d_col_data_, nnz_current); 
+          }
+          if (d_val_data_ == nullptr) {
+            mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); 
+          }
+          mem_.copyArrayHostToDevice(d_row_data_, h_row_data_,      n_ + 1);
+          mem_.copyArrayHostToDevice(d_col_data_, h_col_data_, nnz_current);
+          mem_.copyArrayHostToDevice(d_val_data_, h_val_data_, nnz_current);
+          d_data_updated_ = true; // device side is now flagged as up to date
+          owns_gpu_data_ = true; // set unconditionally here, even if the device arrays already existed
+          owns_gpu_vals_ = true;
         }
-        mem_.copyArrayHostToDevice(d_row_data_, h_row_data_,      n_ + 1);
-        mem_.copyArrayHostToDevice(d_col_data_, h_col_data_, nnz_current);
-        mem_.copyArrayHostToDevice(d_val_data_, h_val_data_, nnz_current);
-        d_data_updated_ = true;
-        owns_gpu_data_ = true;
-        owns_gpu_vals_ = true;
-      }
-      return 0;
-    }
-  return -1;  
+        return 0; // DEVICE reports success whether or not a copy was needed
+      default: // any memory space other than HOST or DEVICE
+        return -1;
+    } // switch
 }
 
 } // namespace ReSolve 
diff --git a/resolve/matrix/Csr.hpp b/resolve/matrix/Csr.hpp
index 43c317de..a5d8f682 100644
--- a/resolve/matrix/Csr.hpp
+++ b/resolve/matrix/Csr.hpp
@@ -18,18 +18,18 @@ namespace ReSolve { namespace matrix {
 
       ~Csr();
 
-      virtual index_type* getRowData(std::string memspace);
-      virtual index_type* getColData(std::string memspace);
-      virtual real_type*  getValues(std::string memspace); 
+      virtual index_type* getRowData(memory::MemorySpace memspace);
+      virtual index_type* getColData(memory::MemorySpace memspace);
+      virtual real_type*  getValues( memory::MemorySpace memspace); 
 
-      virtual int updateData(index_type* row_data, index_type* col_data, real_type* val_data, std::string memspaceIn, std::string memspaceOut); 
-      virtual int updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, std::string memspaceIn, std::string memspaceOut); 
+      virtual int updateData(index_type* row_data, index_type* col_data, real_type* val_data, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut); 
+      virtual int updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut); 
 
-      virtual int allocateMatrixData(std::string memspace); 
+      virtual int allocateMatrixData(memory::MemorySpace memspace); 
 
       virtual void print() {return;}
 
-      virtual int copyData(std::string memspaceOut);
+      virtual int copyData(memory::MemorySpace memspaceOut);
   };
 
 }} // namespace ReSolve::matrix
diff --git a/resolve/matrix/MatrixHandler.cpp b/resolve/matrix/MatrixHandler.cpp
index 133a09f9..0a7124da 100644
--- a/resolve/matrix/MatrixHandler.cpp
+++ b/resolve/matrix/MatrixHandler.cpp
@@ -124,9 +124,9 @@ namespace ReSolve {
 
     index_type* nnz_counts =  new index_type[n];
     std::fill_n(nnz_counts, n, 0);
-    index_type* coo_rows = A_coo->getRowData("cpu");
-    index_type* coo_cols = A_coo->getColData("cpu");
-    real_type* coo_vals = A_coo->getValues("cpu");
+    index_type* coo_rows = A_coo->getRowData(memory::HOST);
+    index_type* coo_cols = A_coo->getColData(memory::HOST);
+    real_type* coo_vals  = A_coo->getValues( memory::HOST);
 
     index_type* diag_control = new index_type[n]; //for DEDUPLICATION of the diagonal
     std::fill_n(diag_control, n, 0);
@@ -249,12 +249,12 @@ namespace ReSolve {
 #endif
     A_csr->setNnz(nnz_no_duplicates);
     if (memspace == "cpu"){
-      A_csr->updateData(csr_ia, csr_ja, csr_a, "cpu", "cpu");
+      A_csr->updateData(csr_ia, csr_ja, csr_a, memory::HOST, memory::HOST);
     } else {
       if (memspace == "cuda"){      
-        A_csr->updateData(csr_ia, csr_ja, csr_a, "cpu", "cuda");
+        A_csr->updateData(csr_ia, csr_ja, csr_a, memory::HOST, memory::DEVICE);
       } else if (memspace == "hip"){      
-        A_csr->updateData(csr_ia, csr_ja, csr_a, "cpu", "cuda");
+        A_csr->updateData(csr_ia, csr_ja, csr_a, memory::HOST, memory::DEVICE);
       } else {
         //display error
       }
diff --git a/resolve/matrix/MatrixHandlerCpu.cpp b/resolve/matrix/MatrixHandlerCpu.cpp
index 2c434dcb..d4799ffd 100644
--- a/resolve/matrix/MatrixHandlerCpu.cpp
+++ b/resolve/matrix/MatrixHandlerCpu.cpp
@@ -45,12 +45,12 @@ namespace ReSolve {
     // int error_sum = 0;
     if (matrixFormat == "csr") {
       matrix::Csr* A = (matrix::Csr*) Ageneric;
-      index_type* ia = A->getRowData("cpu");
-      index_type* ja = A->getColData("cpu");
-      real_type*   a = A->getValues("cpu");
+      index_type* ia = A->getRowData(memory::HOST);
+      index_type* ja = A->getColData(memory::HOST);
+      real_type*   a = A->getValues( memory::HOST);
 
-      real_type* x_data      = vec_x->getData("cpu");
-      real_type* result_data = vec_result->getData("cpu");
+      real_type* x_data      = vec_x->getData(memory::HOST);
+      real_type* result_data = vec_result->getData(memory::HOST);
       real_type sum;
       real_type y;
       real_type t;
@@ -70,7 +70,7 @@ namespace ReSolve {
         sum *= (*alpha);
         result_data[i] = result_data[i]*(*beta) + sum;
       } 
-      vec_result->setDataUpdated("cpu");
+      vec_result->setDataUpdated(memory::HOST);
       return 0;
     } else {
       out::error() << "MatVec not implemented (yet) for " 
@@ -100,13 +100,13 @@ namespace ReSolve {
     index_type nnz = A_csc->getNnz();
     index_type n   = A_csc->getNumColumns();
 
-    index_type* rowIdxCsc = A_csc->getRowData("cpu");
-    index_type* colPtrCsc = A_csc->getColData("cpu");
-    real_type*  valuesCsc = A_csc->getValues("cpu");
+    index_type* rowIdxCsc = A_csc->getRowData(memory::HOST);
+    index_type* colPtrCsc = A_csc->getColData(memory::HOST);
+    real_type*  valuesCsc = A_csc->getValues( memory::HOST);
 
-    index_type* rowPtrCsr = A_csr->getRowData("cpu");
-    index_type* colIdxCsr = A_csr->getColData("cpu");
-    real_type*  valuesCsr = A_csr->getValues("cpu");
+    index_type* rowPtrCsr = A_csr->getRowData(memory::HOST);
+    index_type* colIdxCsr = A_csr->getColData(memory::HOST);
+    real_type*  valuesCsr = A_csr->getValues( memory::HOST);
 
     // Set all CSR row pointers to zero
     for (index_type i = 0; i <= n; ++i) {
diff --git a/resolve/matrix/MatrixHandlerCuda.cpp b/resolve/matrix/MatrixHandlerCuda.cpp
index 3405ba8d..e0ac7bb4 100644
--- a/resolve/matrix/MatrixHandlerCuda.cpp
+++ b/resolve/matrix/MatrixHandlerCuda.cpp
@@ -42,11 +42,11 @@ namespace ReSolve {
       cusparseStatus_t status;
       LinAlgWorkspaceCUDA* workspaceCUDA = workspace_;
       cusparseDnVecDescr_t vecx = workspaceCUDA->getVecX();
-      cusparseCreateDnVec(&vecx, A->getNumRows(), vec_x->getData("cuda"), CUDA_R_64F);
+      cusparseCreateDnVec(&vecx, A->getNumRows(), vec_x->getData(memory::DEVICE), CUDA_R_64F);
 
 
       cusparseDnVecDescr_t vecAx = workspaceCUDA->getVecY();
-      cusparseCreateDnVec(&vecAx, A->getNumRows(), vec_result->getData("cuda"), CUDA_R_64F);
+      cusparseCreateDnVec(&vecAx, A->getNumRows(), vec_result->getData(memory::DEVICE), CUDA_R_64F);
 
       cusparseSpMatDescr_t matA = workspaceCUDA->getSpmvMatrixDescriptor();
 
@@ -57,9 +57,9 @@ namespace ReSolve {
                                    A->getNumRows(),
                                    A->getNumColumns(),
                                    A->getNnzExpanded(),
-                                   A->getRowData("cuda"),
-                                   A->getColData("cuda"),
-                                   A->getValues("cuda"), 
+                                   A->getRowData(memory::DEVICE),
+                                   A->getColData(memory::DEVICE),
+                                   A->getValues( memory::DEVICE), 
                                    CUSPARSE_INDEX_32I, 
                                    CUSPARSE_INDEX_32I,
                                    CUSPARSE_INDEX_BASE_ZERO,
@@ -105,7 +105,7 @@ namespace ReSolve {
       if (status)
         out::error() << "Matvec status: " << status 
                       << "Last error code: " << mem_.getLastDeviceError() << std::endl;
-      vec_result->setDataUpdated("cuda");
+      vec_result->setDataUpdated(memory::DEVICE);
 
       cusparseDestroyDnVec(vecx);
       cusparseDestroyDnVec(vecAx);
@@ -127,7 +127,7 @@ namespace ReSolve {
     index_type error_sum = 0;
     LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_;
 
-    A_csr->allocateMatrixData("cuda");
+    A_csr->allocateMatrixData(memory::DEVICE);
     index_type n = A_csc->getNumRows();
     index_type m = A_csc->getNumRows();
     index_type nnz = A_csc->getNnz();
@@ -137,12 +137,12 @@ namespace ReSolve {
                                                             n, 
                                                             m, 
                                                             nnz, 
-                                                            A_csc->getValues("cuda"), 
-                                                            A_csc->getColData("cuda"), 
-                                                            A_csc->getRowData("cuda"), 
-                                                            A_csr->getValues("cuda"), 
-                                                            A_csr->getRowData("cuda"),
-                                                            A_csr->getColData("cuda"), 
+                                                            A_csc->getValues( memory::DEVICE), 
+                                                            A_csc->getColData(memory::DEVICE), 
+                                                            A_csc->getRowData(memory::DEVICE), 
+                                                            A_csr->getValues( memory::DEVICE), 
+                                                            A_csr->getRowData(memory::DEVICE),
+                                                            A_csr->getColData(memory::DEVICE), 
                                                             CUDA_R_64F, 
                                                             CUSPARSE_ACTION_NUMERIC,
                                                             CUSPARSE_INDEX_BASE_ZERO, 
@@ -154,12 +154,12 @@ namespace ReSolve {
                                 n, 
                                 m, 
                                 nnz, 
-                                A_csc->getValues("cuda"), 
-                                A_csc->getColData("cuda"), 
-                                A_csc->getRowData("cuda"), 
-                                A_csr->getValues("cuda"), 
-                                A_csr->getRowData("cuda"),
-                                A_csr->getColData("cuda"), 
+                                A_csc->getValues( memory::DEVICE), 
+                                A_csc->getColData(memory::DEVICE), 
+                                A_csc->getRowData(memory::DEVICE), 
+                                A_csr->getValues( memory::DEVICE), 
+                                A_csr->getRowData(memory::DEVICE),
+                                A_csr->getColData(memory::DEVICE), 
                                 CUDA_R_64F,
                                 CUSPARSE_ACTION_NUMERIC,
                                 CUSPARSE_INDEX_BASE_ZERO,
diff --git a/resolve/matrix/MatrixHandlerHip.cpp b/resolve/matrix/MatrixHandlerHip.cpp
index 370849fa..b4f8e483 100644
--- a/resolve/matrix/MatrixHandlerHip.cpp
+++ b/resolve/matrix/MatrixHandlerHip.cpp
@@ -62,9 +62,9 @@ namespace ReSolve {
                                            A->getNumColumns(),
                                            A->getNnzExpanded(), 
                                            descrA,
-                                           A->getValues("cuda"), 
-                                           A->getRowData("cuda"),
-                                           A->getColData("cuda"), // cuda is used as "device"
+                                           A->getValues( memory::DEVICE), 
+                                           A->getRowData(memory::DEVICE),
+                                           A->getColData(memory::DEVICE), // matrix arrays are requested from device memory
                                            infoA);
         error_sum += status;
         mem_.deviceSynchronize();
@@ -79,20 +79,20 @@ namespace ReSolve {
                                 A->getNnzExpanded(),
                                 alpha, 
                                 descrA,
-                                A->getValues("cuda"), 
-                                A->getRowData("cuda"),
-                                A->getColData("cuda"),
+                                A->getValues( memory::DEVICE), 
+                                A->getRowData(memory::DEVICE),
+                                A->getColData(memory::DEVICE),
                                 infoA,
-                                vec_x->getData("cuda"),
+                                vec_x->getData(memory::DEVICE),
                                 beta,
-                                vec_result->getData("cuda"));
+                                vec_result->getData(memory::DEVICE));
 
       error_sum += status;
       mem_.deviceSynchronize();
       if (status)
         out::error() << "Matvec status: " << status 
                       << "Last error code: " << mem_.getLastDeviceError() << std::endl;
-      vec_result->setDataUpdated("cuda");
+      vec_result->setDataUpdated(memory::DEVICE);
 
       return error_sum;
     } else {
@@ -114,7 +114,7 @@ namespace ReSolve {
 
     rocsparse_status status;
     
-    A_csr->allocateMatrixData("cuda");
+    A_csr->allocateMatrixData(memory::DEVICE);
     index_type n = A_csc->getNumRows();
     index_type m = A_csc->getNumRows();
     index_type nnz = A_csc->getNnz();
@@ -125,8 +125,8 @@ namespace ReSolve {
                                            n,
                                            m,
                                            nnz,
-                                           A_csc->getColData("cuda"), 
-                                           A_csc->getRowData("cuda"), 
+                                           A_csc->getColData(memory::DEVICE), 
+                                           A_csc->getRowData(memory::DEVICE), 
                                            rocsparse_action_numeric,
                                            &bufferSize);
 
@@ -137,12 +137,12 @@ namespace ReSolve {
                                 n,
                                 m,
                                 nnz,
-                                A_csc->getValues("cuda"), 
-                                A_csc->getColData("cuda"), 
-                                A_csc->getRowData("cuda"), 
-                                A_csr->getValues("cuda"), 
-                                A_csr->getRowData("cuda"),
-                                A_csr->getColData("cuda"), 
+                                A_csc->getValues( memory::DEVICE), 
+                                A_csc->getColData(memory::DEVICE), 
+                                A_csc->getRowData(memory::DEVICE), 
+                                A_csr->getValues( memory::DEVICE), 
+                                A_csr->getRowData(memory::DEVICE),
+                                A_csr->getColData(memory::DEVICE), 
                                 rocsparse_action_numeric,
                                 rocsparse_index_base_zero,
                                 d_work);
diff --git a/resolve/matrix/Sparse.cpp b/resolve/matrix/Sparse.cpp
index 4a16ec98..faa86e11 100644
--- a/resolve/matrix/Sparse.cpp
+++ b/resolve/matrix/Sparse.cpp
@@ -73,8 +73,8 @@ namespace ReSolve { namespace matrix {
 
   Sparse::~Sparse()
   {
-    this->destroyMatrixData("cpu");
-    this->destroyMatrixData("cuda");
+    this->destroyMatrixData(memory::HOST);
+    this->destroyMatrixData(memory::DEVICE);
   }
 
   void Sparse::setNotUpdated()
@@ -133,58 +133,59 @@ namespace ReSolve { namespace matrix {
     this->nnz_ = nnz_new;
   }
 
-  int Sparse::setUpdated(std::string what)
+  int Sparse::setUpdated(memory::MemorySpace memspace)
   {
-    if (what == "cpu")
-    {
-      h_data_updated_ = true;
-      d_data_updated_ = false;
-    } else {
-      if (what == "cuda"){
+    using namespace ReSolve::memory;
+    switch (memspace) {
+      case HOST:
+        h_data_updated_ = true;
+        d_data_updated_ = false;
+        break;
+      case DEVICE:
         d_data_updated_ = true;
         h_data_updated_ = false;
-      } else {
-        return -1;
-      }
+        break;
     }
     return 0;
   }
 
-  int Sparse::setMatrixData(index_type* row_data, index_type* col_data, real_type* val_data, std::string memspace)
+  int Sparse::setMatrixData(index_type* row_data, index_type* col_data, real_type* val_data, memory::MemorySpace memspace)
   {
+    using namespace ReSolve::memory;
 
     setNotUpdated();
 
-    if (memspace == "cpu"){
-      this->h_row_data_ = row_data;
-      this->h_col_data_ = col_data;
-      this->h_val_data_ = val_data;	
-      h_data_updated_ = true;
-    } else {
-      if (memspace == "cuda"){ 
+    switch (memspace) {
+      case HOST:
+        this->h_row_data_ = row_data;
+        this->h_col_data_ = col_data;
+        this->h_val_data_ = val_data;	
+        h_data_updated_ = true;
+        break;
+      case DEVICE:
         this->d_row_data_ = row_data;
         this->d_col_data_ = col_data;
         this->d_val_data_ = val_data;	
         d_data_updated_ = true;
-      } else {
-        return -1;
-      }
+        break;
     }
     return 0;
   }
 
-  int Sparse::destroyMatrixData(std::string memspace)
-  { 
-    if (memspace == "cpu"){  
-      if (owns_cpu_data_) {
-        delete [] h_row_data_;
-        delete [] h_col_data_;
-      }
-      if (owns_cpu_vals_) {
-        delete [] h_val_data_;
-      }
-    } else {
-      if (memspace == "cuda"){ 
+  int Sparse::destroyMatrixData(memory::MemorySpace memspace)
+  {
+    using namespace ReSolve::memory;
+    switch (memspace) {
+      case HOST:
+        if (owns_cpu_data_) {
+          delete [] h_row_data_;
+          delete [] h_col_data_;
+        }
+        if (owns_cpu_vals_) {
+          delete [] h_val_data_;
+        }
+        return 0;
+      case DEVICE:
         if (owns_gpu_data_) {
           mem_.deleteOnDevice(d_row_data_);
           mem_.deleteOnDevice(d_col_data_);
@@ -192,14 +193,13 @@ namespace ReSolve { namespace matrix {
         if (owns_gpu_vals_) {
           mem_.deleteOnDevice(d_val_data_);
         }
-      } else {
+        return 0;
+      default:
         return -1;
-      }
     }
-    return 0;
   }
 
-  int Sparse::updateValues(real_type* new_vals, std::string memspaceIn, std::string memspaceOut)
+  int Sparse::updateValues(real_type* new_vals, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut)
   {
  
     index_type nnz_current = nnz_;
@@ -207,19 +207,19 @@ namespace ReSolve { namespace matrix {
     //four cases (for now)
     setNotUpdated();
     int control=-1;
-    if ((memspaceIn == "cpu") && (memspaceOut == "cpu")){ control = 0;}
-    if ((memspaceIn == "cpu") && (memspaceOut == "cuda")){ control = 1;}
-    if ((memspaceIn == "cuda") && (memspaceOut == "cpu")){ control = 2;}
-    if ((memspaceIn == "cuda") && (memspaceOut == "cuda")){ control = 3;}
+    if ((memspaceIn == memory::HOST)   && (memspaceOut == memory::HOST))  { control = 0;}
+    if ((memspaceIn == memory::HOST)   && (memspaceOut == memory::DEVICE)){ control = 1;}
+    if ((memspaceIn == memory::DEVICE) && (memspaceOut == memory::HOST))  { control = 2;}
+    if ((memspaceIn == memory::DEVICE) && (memspaceOut == memory::DEVICE)){ control = 3;}
    
-    if (memspaceOut == "cpu") {
+    if (memspaceOut == memory::HOST) {
       //check if cpu data allocated
       if (h_val_data_ == nullptr) {
         this->h_val_data_ = new real_type[nnz_current];
       }
     }
 
-    if (memspaceOut == "cuda") {
+    if (memspaceOut == memory::DEVICE) {
       //check if cuda data allocated
       if (d_val_data_ == nullptr) {
         mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); 
@@ -253,21 +253,22 @@ namespace ReSolve { namespace matrix {
     return 0;
   }
 
-  int Sparse::setNewValues(real_type* new_vals, std::string memspace)
+  int Sparse::setNewValues(real_type* new_vals, memory::MemorySpace memspace)
   {
-
+    using namespace ReSolve::memory;
     setNotUpdated();
 
-    if (memspace == "cpu"){
-      this->h_val_data_ = new_vals;	
-      h_data_updated_ = true;
-    } else {
-      if (memspace == "cuda"){ 
+    switch (memspace) {
+      case HOST:
+        this->h_val_data_ = new_vals;	
+        h_data_updated_ = true;
+        break;
+      case DEVICE:
         this->d_val_data_ = new_vals;	
         d_data_updated_ = true;
-      } else {
+        break;
+      default:
         return -1;
-      }
     }
     return 0;
   }
diff --git a/resolve/matrix/Sparse.hpp b/resolve/matrix/Sparse.hpp
index 1196c38e..96121acb 100644
--- a/resolve/matrix/Sparse.hpp
+++ b/resolve/matrix/Sparse.hpp
@@ -31,31 +31,31 @@ namespace ReSolve { namespace matrix {
       void setExpanded(bool expanded);
       void setNnzExpanded(index_type nnz_expanded_new);
       void setNnz(index_type nnz_new); // for resetting when removing duplicates
-      index_type setUpdated(std::string what);
+      index_type setUpdated(memory::MemorySpace what);
 
-      virtual index_type* getRowData(std::string memspace) = 0;
-      virtual index_type* getColData(std::string memspace) = 0;
-      virtual real_type*  getValues(std::string memspace)  = 0;
+      virtual index_type* getRowData(memory::MemorySpace memspace) = 0;
+      virtual index_type* getColData(memory::MemorySpace memspace) = 0;
+      virtual real_type*  getValues( memory::MemorySpace memspace) = 0;
 
-      virtual int updateData(index_type* row_data, index_type* col_data, real_type* val_data, std::string memspaceIn, std::string memspaceOut) = 0;
-      virtual int updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, std::string memspaceIn, std::string memspaceOut) = 0;
+      virtual int updateData(index_type* row_data, index_type* col_data, real_type* val_data, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut) = 0;
+      virtual int updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut) = 0;
 
-      virtual int allocateMatrixData(std::string memspace) = 0;
-      int setMatrixData(index_type* row_data, index_type* col_data, real_type* val_data, std::string memspace);
+      virtual int allocateMatrixData(memory::MemorySpace memspace) = 0;
+      int setMatrixData(index_type* row_data, index_type* col_data, real_type* val_data, memory::MemorySpace memspace);
 
-      int destroyMatrixData(std::string memspace);
+      int destroyMatrixData(memory::MemorySpace memspace);
 
       virtual void print() = 0;
 
-      virtual int copyData(std::string memspaceOut) = 0;
+      virtual int copyData(memory::MemorySpace memspaceOut) = 0;
 
 
       //update Values just updates values; it allocates if necessary.
       //values have the same dimensions between different formats 
-      virtual int updateValues(real_type* new_vals, std::string memspaceIn, std::string memspaceOut);
+      virtual int updateValues(real_type* new_vals, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut);
       
       //set new values just sets the pointer, use caution.   
-      virtual int setNewValues(real_type* new_vals, std::string memspace);
+      virtual int setNewValues(real_type* new_vals, memory::MemorySpace memspace);
     
     protected:
       //size
diff --git a/resolve/matrix/io.cpp b/resolve/matrix/io.cpp
index 36fb5f1b..0d96a5e1 100644
--- a/resolve/matrix/io.cpp
+++ b/resolve/matrix/io.cpp
@@ -53,7 +53,7 @@ namespace ReSolve { namespace io {
       coo_vals[i] = c;
       i++;
     }
-    A->setMatrixData(coo_rows, coo_cols, coo_vals, "cpu");
+    A->setMatrixData(coo_rows, coo_cols, coo_vals, memory::HOST);
     return A;
   }
 
@@ -116,9 +116,9 @@ namespace ReSolve { namespace io {
     }
     A->setNnz(nnz);
     //create coo arrays
-    index_type* coo_rows = A->getRowData("cpu");
-    index_type* coo_cols = A->getColData("cpu");
-    real_type* coo_vals = A->getValues("cpu");
+    index_type* coo_rows = A->getRowData(memory::HOST);
+    index_type* coo_cols = A->getColData(memory::HOST);
+    real_type* coo_vals  = A->getValues( memory::HOST);
     i = 0;
     index_type a, b;
     real_type c;
@@ -171,7 +171,7 @@ namespace ReSolve { namespace io {
 
   int writeVectorToFile(vector_type* vec_x, std::ostream& file_out)
   {
-    real_type* x_data = vec_x->getData("cpu");
+    real_type* x_data = vec_x->getData(memory::HOST);
     // std::ofstream file_out (filename, std::ofstream::out);
     file_out << "%%MatrixMarket matrix array real general \n";
     file_out << "% ID: XXX \n";
diff --git a/resolve/vector/Vector.cpp b/resolve/vector/Vector.cpp
index df3c475d..0a62bd02 100644
--- a/resolve/vector/Vector.cpp
+++ b/resolve/vector/Vector.cpp
@@ -52,52 +52,51 @@ namespace ReSolve { namespace vector {
     return k_;
   }
 
-  void Vector::setData(real_type* data, std::string memspace)
+  void Vector::setData(real_type* data, memory::MemorySpace memspace)
   {
-
-    if (memspace == "cpu") {
-      h_data_ = data;
-      cpu_updated_ = true;
-      gpu_updated_ = false;
-    } else {
-      if ((memspace == "cuda") || (memspace == "hip")) {
+    using namespace ReSolve::memory;
+    switch (memspace) {
+      case HOST:
+        h_data_ = data;
+        cpu_updated_ = true;
+        gpu_updated_ = false;
+        break;
+      case DEVICE:
         d_data_ = data;
         gpu_updated_ = true;
         cpu_updated_ = false;
-      } else {
-        //error
-      } 
+        break;
     }
   }
 
-  void Vector::setDataUpdated(std::string memspace)
+  void Vector::setDataUpdated(memory::MemorySpace memspace)
   { 
-    if (memspace == "cpu") {
-      cpu_updated_ = true;
-      gpu_updated_ = false;
-    } else {
-      if ((memspace == "cuda") || (memspace == "hip")) {
+    using namespace ReSolve::memory;
+    switch (memspace) {
+      case HOST:
+        cpu_updated_ = true;
+        gpu_updated_ = false;
+        break;
+      case DEVICE:
         gpu_updated_ = true;
         cpu_updated_ = false;
-      } else {
-        //error
-      } 
+        break;
     }
   }
 
-  int Vector::update(real_type* data, std::string memspaceIn, std::string memspaceOut)
+  int Vector::update(real_type* data, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut)
   {
     int control=-1;
-    if ((memspaceIn == "cpu") && (memspaceOut == "cpu")){ control = 0;}
-    if ((memspaceIn == "cpu") && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 1;}
-    if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && (memspaceOut == "cpu")){ control = 2;}
-    if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 3;}
+    if ((memspaceIn == memory::HOST)   && (memspaceOut == memory::HOST))  { control = 0;}
+    if ((memspaceIn == memory::HOST)   && (memspaceOut == memory::DEVICE)){ control = 1;}
+    if ((memspaceIn == memory::DEVICE) && (memspaceOut == memory::HOST))  { control = 2;}
+    if ((memspaceIn == memory::DEVICE) && (memspaceOut == memory::DEVICE)){ control = 3;}
 
-    if ((memspaceOut == "cpu") && (h_data_ == nullptr)){
+    if ((memspaceOut == memory::HOST) && (h_data_ == nullptr)) {
       //allocate first
       h_data_ = new real_type[n_ * k_]; 
     }
-    if (((memspaceOut == "cuda") || (memspaceOut == "hip")) && (d_data_ == nullptr)){
+    if ((memspaceOut == memory::DEVICE) && (d_data_ == nullptr)) {
       //allocate first
       mem_.allocateArrayOnDevice(&d_data_, n_ * k_);
     } 
@@ -133,26 +132,26 @@ namespace ReSolve { namespace vector {
     return 0;
   }
 
-  real_type* Vector::getData(std::string memspace)
+  real_type* Vector::getData(memory::MemorySpace memspace)
   {
     return this->getData(0, memspace);
   }
 
-  real_type* Vector::getData(index_type i, std::string memspace)
+  real_type* Vector::getData(index_type i, memory::MemorySpace memspace)
   {
-    if ((memspace == "cpu") && (cpu_updated_ == false) && (gpu_updated_ == true )) {
-      copyData(memspace, "cpu");
+    if ((memspace == memory::HOST) && (cpu_updated_ == false) && (gpu_updated_ == true )) {
+      copyData(memory::DEVICE, memory::HOST); // args are (source, destination): sync host from device
       owns_cpu_data_ = true;
     } 
 
-    if (((memspace == "cuda") || (memspace == "hip")) && (gpu_updated_ == false) && (cpu_updated_ == true )) {
-      copyData("cpu", memspace);
+    if ((memspace == memory::DEVICE) && (gpu_updated_ == false) && (cpu_updated_ == true )) {
+      copyData(memory::HOST, memspace);
       owns_gpu_data_ = true;
     }
-    if (memspace == "cpu") {
+    if (memspace == memory::HOST) {
       return &h_data_[i * n_current_];
     } else {
-      if ((memspace == "cuda") || (memspace == "hip")){
+      if (memspace == memory::DEVICE){
         return &d_data_[i * n_current_];
       } else {
         return nullptr;
@@ -161,17 +160,17 @@ namespace ReSolve { namespace vector {
   }
 
 
-  int Vector::copyData(std::string memspaceIn, std::string memspaceOut)
+  int Vector::copyData(memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut)
   {
     int control=-1;
-    if ((memspaceIn == "cpu") && ((memspaceOut == "cuda") || (memspaceOut == "hip"))){ control = 0;}
-    if (((memspaceIn == "cuda") || (memspaceIn == "hip")) && (memspaceOut == "cpu")){ control = 1;}
+    if ((memspaceIn == memory::HOST)   && (memspaceOut == memory::DEVICE)){ control = 0;}
+    if ((memspaceIn == memory::DEVICE) && (memspaceOut == memory::HOST))  { control = 1;}
 
-    if ((memspaceOut == "cpu") && (h_data_ == nullptr)){
+    if ((memspaceOut == memory::HOST) && (h_data_ == nullptr)) {
       //allocate first
       h_data_ = new real_type[n_ * k_]; 
     }
-    if (((memspaceOut == "cuda") || (memspaceOut == "hip")) && (d_data_ == nullptr)){
+    if ((memspaceOut == memory::DEVICE) && (d_data_ == nullptr)) {
       //allocate first
       mem_.allocateArrayOnDevice(&d_data_, n_ * k_);
     } 
@@ -193,110 +192,118 @@ namespace ReSolve { namespace vector {
     return 0;
   }
 
-  void Vector::allocate(std::string memspace) 
+  void Vector::allocate(memory::MemorySpace memspace) 
   {
-    if (memspace == "cpu") {
-      delete [] h_data_;
-      h_data_ = new real_type[n_ * k_]; 
-      owns_cpu_data_ = true;
-    } else {
-      if ((memspace == "cuda") || (memspace == "hip")) {
+    using namespace ReSolve::memory;
+    switch (memspace) {
+      case HOST:
+        delete [] h_data_;
+        h_data_ = new real_type[n_ * k_]; 
+        owns_cpu_data_ = true;
+        break;
+      case DEVICE:
         mem_.deleteOnDevice(d_data_);
         mem_.allocateArrayOnDevice(&d_data_, n_ * k_);
         owns_gpu_data_ = true;
-      } else {
-        std::cout<<"wrong memspace " <<memspace<<" "<<std::endl;
-      }
+        break;
     }
   }
 
 
-  void Vector::setToZero(std::string memspace) 
+  void Vector::setToZero(memory::MemorySpace memspace) 
   {
-    if (memspace == "cpu") {
-      if (h_data_ == nullptr) {
-        h_data_ = new real_type[n_ * k_]; 
-        owns_cpu_data_ = true;
-      }
-      for (int i = 0; i < n_ * k_; ++i){
-        h_data_[i] = 0.0;
-      }
-    } else {
-      if ((memspace == "cuda") || (memspace == "hip")) {
+    using namespace ReSolve::memory;
+    switch (memspace) {
+      case HOST:
+        if (h_data_ == nullptr) {
+          h_data_ = new real_type[n_ * k_]; 
+          owns_cpu_data_ = true;
+        }
+        for (int i = 0; i < n_ * k_; ++i){
+          h_data_[i] = 0.0;
+        }
+        break;
+      case DEVICE:
         if (d_data_ == nullptr) {
           mem_.allocateArrayOnDevice(&d_data_, n_ * k_);
           owns_gpu_data_ = true;
         }
         mem_.setZeroArrayOnDevice(d_data_, n_ * k_);
-      }
+        break;
     }
   }
 
-  void Vector::setToZero(index_type j, std::string memspace) 
+  void Vector::setToZero(index_type j, memory::MemorySpace memspace) 
   {
-    if (memspace == "cpu") {
-      if (h_data_ == nullptr) {
-        h_data_ = new real_type[n_ * k_]; 
-        owns_cpu_data_ = true;
-      }
-      for (int i = (n_current_) * j; i < n_current_ * (j + 1); ++i) {
-        h_data_[i] = 0.0;
-      }
-    } else {
-      if ((memspace == "cuda") || (memspace == "hip")) {
+    using namespace ReSolve::memory;
+    switch (memspace) {
+      case HOST:
+        if (h_data_ == nullptr) {
+          h_data_ = new real_type[n_ * k_]; 
+          owns_cpu_data_ = true;
+        }
+        for (int i = (n_current_) * j; i < n_current_ * (j + 1); ++i) {
+          h_data_[i] = 0.0;
+        }
+        break;
+      case DEVICE:
         if (d_data_ == nullptr) {
           mem_.allocateArrayOnDevice(&d_data_, n_ * k_);
           owns_gpu_data_ = true;
         }
         // TODO: We should not need to access raw data in this class
         mem_.setZeroArrayOnDevice(&d_data_[j * n_current_], n_current_);
-      }
+        break;
     }
   }
 
-  void Vector::setToConst(real_type C, std::string memspace) 
+  void Vector::setToConst(real_type C, memory::MemorySpace memspace) 
   {
-    if (memspace == "cpu") {
-      if (h_data_ == nullptr) {
-        h_data_ = new real_type[n_ * k_]; 
-        owns_cpu_data_ = true;
-      }
-      for (int i = 0; i < n_ * k_; ++i){
-        h_data_[i] = C;
-      }
-    } else {
-      if ((memspace == "cuda") || (memspace == "hip")) {
+    using namespace ReSolve::memory;
+    switch (memspace) {
+      case HOST:
+        if (h_data_ == nullptr) {
+          h_data_ = new real_type[n_ * k_]; 
+          owns_cpu_data_ = true;
+        }
+        for (int i = 0; i < n_ * k_; ++i){
+          h_data_[i] = C;
+        }
+        break;
+      case DEVICE:
         if (d_data_ == nullptr) {
           mem_.allocateArrayOnDevice(&d_data_, n_ * k_);
           owns_gpu_data_ = true;
         }
         set_array_const(n_ * k_, C, d_data_);
-      }
+        break;
     }
   }
 
-  void Vector::setToConst(index_type j, real_type C, std::string memspace) 
+  void Vector::setToConst(index_type j, real_type C, memory::MemorySpace memspace) 
   {
-    if (memspace == "cpu") {
-      if (h_data_ == nullptr) {
-        h_data_ = new real_type[n_ * k_]; 
-        owns_cpu_data_ = true;
-      }
-      for (int i = j * n_current_; i < (j + 1 ) * n_current_ * k_; ++i){
-        h_data_[i] = C;
-      }
-    } else {
-      if ((memspace == "cuda") || (memspace == "hip")) {
+    using namespace ReSolve::memory;
+    switch (memspace) {
+      case HOST:
+        if (h_data_ == nullptr) {
+          h_data_ = new real_type[n_ * k_]; 
+          owns_cpu_data_ = true;
+        }
+        for (int i = j * n_current_; i < (j + 1) * n_current_; ++i) { // j-th vector only
+          h_data_[i] = C;
+        }
+        break;
+      case DEVICE:
         if (d_data_ == nullptr) {
           mem_.allocateArrayOnDevice(&d_data_, n_ * k_);
           owns_gpu_data_ = true;
         }
         set_array_const(n_current_ * 1, C, &d_data_[n_current_ * j]);
-      }
+        break;
     }
   }
 
-  real_type* Vector::getVectorData(index_type i, std::string memspace)
+  real_type* Vector::getVectorData(index_type i, memory::MemorySpace memspace)
   {
     if (this->k_ < i){
       return nullptr;
@@ -315,38 +322,38 @@ namespace ReSolve { namespace vector {
     }
   }
 
-  int  Vector::deepCopyVectorData(real_type* dest, index_type i, std::string memspaceOut)
+  int  Vector::deepCopyVectorData(real_type* dest, index_type i, memory::MemorySpace memspaceOut)
   {
+    using namespace ReSolve::memory;
     if (i > this->k_) {
       return -1;
     } else {
       real_type* data = this->getData(i, memspaceOut);
-      if (memspaceOut == "cpu") {
-        mem_.copyArrayHostToHost(dest, data, n_current_);
-      } else {
-      if ((memspaceOut == "cuda") || (memspaceOut == "hip")) {
+      switch (memspaceOut) {
+        case HOST:
+          mem_.copyArrayHostToHost(dest, data, n_current_);
+          break;
+        case DEVICE:
           mem_.copyArrayDeviceToDevice(dest, data, n_current_);
-        } else {
-          //error
-        } 
+          break;
       }
       return 0;
     }
   }  
 
-  int  Vector::deepCopyVectorData(real_type* dest, std::string memspaceOut)
+  int  Vector::deepCopyVectorData(real_type* dest, memory::MemorySpace memspaceOut)
   {
+    using namespace ReSolve::memory;
     real_type* data = this->getData(memspaceOut);
-    if (memspaceOut == "cpu") {
-      mem_.copyArrayHostToHost(dest, data, n_current_ * k_);
-    } else {
-      if ((memspaceOut == "cuda") || (memspaceOut == "hip")) {
+    switch (memspaceOut) {
+      case HOST:
+        mem_.copyArrayHostToHost(dest, data, n_current_ * k_);
+        break;
+      case DEVICE:
         mem_.copyArrayDeviceToDevice(dest, data, n_current_ * k_);
-      } else {
-        //error
-      } 
+        break;
     }
     return 0;
-
   }
+
 }} // namespace ReSolve::vector
diff --git a/resolve/vector/Vector.hpp b/resolve/vector/Vector.hpp
index 9d1bd452..5f86ef7f 100644
--- a/resolve/vector/Vector.hpp
+++ b/resolve/vector/Vector.hpp
@@ -11,26 +11,26 @@ namespace ReSolve { namespace vector {
       Vector(index_type n, index_type k);
       ~Vector();
 
-      int update(real_type* data, std::string memspaceIn, std::string memspaceOut);
-      real_type* getData(std::string memspace);
-      real_type* getData(index_type i, std::string memspace); // get pointer to i-th vector in multivector
+      int update(real_type* data, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut);
+      real_type* getData(memory::MemorySpace memspace);
+      real_type* getData(index_type i, memory::MemorySpace memspace); // get pointer to i-th vector in multivector
 
       index_type getSize();
       index_type getCurrentSize();
       index_type getNumVectors();
 
-      void setDataUpdated(std::string memspace);
-      void setData(real_type* data, std::string memspace);
-      void allocate(std::string memspace);   
-      void setToZero(std::string memspace);
-      void setToZero(index_type i, std::string memspace); // set i-th ivector to 0
-      void setToConst(real_type C, std::string memspace);
-      void setToConst(index_type i, real_type C, std::string memspace); // set i-th vector to C  - needed for unit tests, Gram Schmidt tests
-      int copyData(std::string memspaceIn, std::string memspaceOut); 
+      void setDataUpdated(memory::MemorySpace memspace);
+      void setData(real_type* data, memory::MemorySpace memspace);
+      void allocate(memory::MemorySpace memspace);   
+      void setToZero(memory::MemorySpace memspace);
+      void setToZero(index_type i, memory::MemorySpace memspace); // set i-th ivector to 0
+      void setToConst(real_type C, memory::MemorySpace memspace);
+      void setToConst(index_type i, real_type C, memory::MemorySpace memspace); // set i-th vector to C  - needed for unit tests, Gram Schmidt tests
+      int copyData(memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut); 
       int setCurrentSize(index_type new_n_current);
-      real_type* getVectorData(index_type i, std::string memspace); // get ith vector data out of multivector   
-      int  deepCopyVectorData(real_type* dest, index_type i, std::string memspace);  
-      int  deepCopyVectorData(real_type* dest, std::string memspace);  //copy FULL multivector 
+      real_type* getVectorData(index_type i, memory::MemorySpace memspace); // get ith vector data out of multivector   
+      int  deepCopyVectorData(real_type* dest, index_type i, memory::MemorySpace memspace);  
+      int  deepCopyVectorData(real_type* dest, memory::MemorySpace memspace);  //copy FULL multivector 
     
     private:
       index_type n_; ///< size
diff --git a/resolve/vector/VectorHandlerCpu.cpp b/resolve/vector/VectorHandlerCpu.cpp
index f5cc463d..a8317a89 100644
--- a/resolve/vector/VectorHandlerCpu.cpp
+++ b/resolve/vector/VectorHandlerCpu.cpp
@@ -47,8 +47,8 @@ namespace ReSolve {
 
   real_type VectorHandlerCpu::dot(vector::Vector* x, vector::Vector* y)
   { 
-    real_type* x_data = x->getData("cpu");
-    real_type* y_data = y->getData("cpu");
+    real_type* x_data = x->getData(memory::HOST);
+    real_type* y_data = y->getData(memory::HOST);
     real_type sum = 0.0;
     real_type c = 0.0;
     // real_type t, y;
@@ -72,7 +72,7 @@ namespace ReSolve {
    */
   void VectorHandlerCpu::scal(const real_type* alpha, vector::Vector* x)
   {
-    real_type* x_data = x->getData("cpu");
+    real_type* x_data = x->getData(memory::HOST);
 
     for (int i = 0; i < x->getSize(); ++i){
       x_data[i] *= (*alpha);
@@ -91,8 +91,8 @@ namespace ReSolve {
   void VectorHandlerCpu::axpy(const  real_type* alpha, vector::Vector* x, vector::Vector* y)
   {
     //AXPY:  y = alpha * x + y
-    real_type* x_data = x->getData("cpu");
-    real_type* y_data = y->getData("cpu");
+    real_type* x_data = x->getData(memory::HOST);
+    real_type* y_data = y->getData(memory::HOST);
     for (int i = 0; i < x->getSize(); ++i) {
       y_data[i] = (*alpha) * x_data[i] + y_data[i];
     }
diff --git a/resolve/vector/VectorHandlerCuda.cpp b/resolve/vector/VectorHandlerCuda.cpp
index 3c887e85..5871fd5a 100644
--- a/resolve/vector/VectorHandlerCuda.cpp
+++ b/resolve/vector/VectorHandlerCuda.cpp
@@ -50,7 +50,7 @@ namespace ReSolve {
     LinAlgWorkspaceCUDA* workspaceCUDA = workspace_;
     cublasHandle_t handle_cublas =  workspaceCUDA->getCublasHandle();
     double nrm = 0.0;
-    cublasStatus_t st= cublasDdot (handle_cublas,  x->getSize(), x->getData("cuda"), 1, y->getData("cuda"), 1, &nrm);
+    cublasStatus_t st= cublasDdot (handle_cublas,  x->getSize(), x->getData(memory::DEVICE), 1, y->getData(memory::DEVICE), 1, &nrm);
     if (st!=0) {printf("dot product crashed with code %d \n", st);}
     return nrm;
   }
@@ -67,7 +67,7 @@ namespace ReSolve {
   {
     LinAlgWorkspaceCUDA* workspaceCUDA = workspace_;
     cublasHandle_t handle_cublas =  workspaceCUDA->getCublasHandle();
-    cublasStatus_t st = cublasDscal(handle_cublas, x->getSize(), alpha, x->getData("cuda"), 1);
+    cublasStatus_t st = cublasDscal(handle_cublas, x->getSize(), alpha, x->getData(memory::DEVICE), 1);
     if (st!=0) {
       ReSolve::io::Logger::error() << "scal crashed with code " << st << "\n";
     }
@@ -90,9 +90,9 @@ namespace ReSolve {
     cublasDaxpy(handle_cublas,
                 x->getSize(),
                 alpha,
-                x->getData("cuda"),
+                x->getData(memory::DEVICE),
                 1,
-                y->getData("cuda"),
+                y->getData(memory::DEVICE),
                 1);
   }
 
@@ -131,12 +131,12 @@ namespace ReSolve {
                   n,
                   k,
                   alpha,
-                  V->getData("cuda"),
+                  V->getData(memory::DEVICE),
                   n,
-                  y->getData("cuda"),
+                  y->getData(memory::DEVICE),
                   1,
                   beta,
-                  x->getData("cuda"),
+                  x->getData(memory::DEVICE),
                   1);
 
     } else {
@@ -145,12 +145,12 @@ namespace ReSolve {
                   n,
                   k,
                   alpha,
-                  V->getData("cuda"),
+                  V->getData(memory::DEVICE),
                   n,
-                  y->getData("cuda"),
+                  y->getData(memory::DEVICE),
                   1,
                   beta,
-                  x->getData("cuda"),
+                  x->getData(memory::DEVICE),
                   1);
     }
   }
@@ -171,7 +171,7 @@ namespace ReSolve {
   {
     using namespace constants;
     if (k < 200) {
-      mass_axpy(size, k, x->getData("cuda"), y->getData("cuda"),alpha->getData("cuda"));
+      mass_axpy(size, k, x->getData(memory::DEVICE), y->getData(memory::DEVICE),alpha->getData(memory::DEVICE));
     } else {
       LinAlgWorkspaceCUDA* workspaceCUDA = workspace_;
       cublasHandle_t handle_cublas =  workspaceCUDA->getCublasHandle();
@@ -182,12 +182,12 @@ namespace ReSolve {
                   1,          // n
                   k + 1,      // k
                   &MINUSONE, // alpha
-                  x->getData("cuda"), // A
+                  x->getData(memory::DEVICE), // A
                   size,       // lda
-                  alpha->getData("cuda"), // B
+                  alpha->getData(memory::DEVICE), // B
                   k + 1,      // ldb
                   &ONE,
-                  y->getData("cuda"),          // c
+                  y->getData(memory::DEVICE),          // c
                   size);      // ldc     
     }
   }
@@ -212,7 +212,7 @@ namespace ReSolve {
     using namespace constants;
 
     if (k < 200) {
-      mass_inner_product_two_vectors(size, k, x->getData("cuda") , x->getData(1, "cuda"), V->getData("cuda"), res->getData("cuda"));
+      mass_inner_product_two_vectors(size, k, x->getData(memory::DEVICE) , x->getData(1, memory::DEVICE), V->getData(memory::DEVICE), res->getData(memory::DEVICE));
     } else {
       LinAlgWorkspaceCUDA* workspaceCUDA = workspace_;
       cublasHandle_t handle_cublas =  workspaceCUDA->getCublasHandle();
@@ -223,12 +223,12 @@ namespace ReSolve {
                   2,       //n
                   size,    //k
                   &ONE,   //alpha
-                  V->getData("cuda"),       //A
+                  V->getData(memory::DEVICE),       //A
                   size,    //lda
-                  x->getData("cuda"),       //B
+                  x->getData(memory::DEVICE),       //B
                   size,    //ldb
                   &ZERO,
-                  res->getData("cuda"),     //c
+                  res->getData(memory::DEVICE),     //c
                   k + 1);  //ldc 
     }
   }
diff --git a/resolve/vector/VectorHandlerHip.cpp b/resolve/vector/VectorHandlerHip.cpp
index 9f2927c7..1e1195fc 100644
--- a/resolve/vector/VectorHandlerHip.cpp
+++ b/resolve/vector/VectorHandlerHip.cpp
@@ -50,7 +50,7 @@ namespace ReSolve {
     LinAlgWorkspaceHIP* workspaceHIP = workspace_;
     rocblas_handle  handle_rocblas =  workspaceHIP->getRocblasHandle();
     double nrm = 0.0;
-    rocblas_status st= rocblas_ddot (handle_rocblas,  x->getSize(), x->getData("hip"), 1, y->getData("hip"), 1, &nrm);
+    rocblas_status st= rocblas_ddot (handle_rocblas,  x->getSize(), x->getData(memory::DEVICE), 1, y->getData(memory::DEVICE), 1, &nrm);
     if (st!=0) {printf("dot product crashed with code %d \n", st);}
     return nrm;
   }
@@ -67,7 +67,7 @@ namespace ReSolve {
   {
     LinAlgWorkspaceHIP* workspaceHIP = workspace_;
     rocblas_handle handle_rocblas =  workspaceHIP->getRocblasHandle();
-    rocblas_status st = rocblas_dscal(handle_rocblas, x->getSize(), alpha, x->getData("hip"), 1);
+    rocblas_status st = rocblas_dscal(handle_rocblas, x->getSize(), alpha, x->getData(memory::DEVICE), 1);
     if (st!=0) {
       ReSolve::io::Logger::error() << "scal crashed with code " << st << "\n";
     }
@@ -90,9 +90,9 @@ namespace ReSolve {
     rocblas_daxpy(handle_rocblas,
                   x->getSize(),
                   alpha,
-                  x->getData("hip"),
+                  x->getData(memory::DEVICE),
                   1,
-                  y->getData("hip"),
+                  y->getData(memory::DEVICE),
                   1);
   }
 
@@ -131,12 +131,12 @@ namespace ReSolve {
                     n,
                     k,
                     alpha,
-                    V->getData("hip"),
+                    V->getData(memory::DEVICE),
                     n,
-                    y->getData("hip"),
+                    y->getData(memory::DEVICE),
                     1,
                     beta,
-                    x->getData("hip"),
+                    x->getData(memory::DEVICE),
                     1);
 
     } else {
@@ -145,12 +145,12 @@ namespace ReSolve {
                     n,
                     k,
                     alpha,
-                    V->getData("hip"),
+                    V->getData(memory::DEVICE),
                     n,
-                    y->getData("hip"),
+                    y->getData(memory::DEVICE),
                     1,
                     beta,
-                    x->getData("hip"),
+                    x->getData(memory::DEVICE),
                     1);
     }
   }
@@ -171,7 +171,7 @@ namespace ReSolve {
   {
     using namespace constants;
     if (k < 200) {
-      mass_axpy(size, k, x->getData("hip"), y->getData("hip"),alpha->getData("hip"));
+      mass_axpy(size, k, x->getData(memory::DEVICE), y->getData(memory::DEVICE),alpha->getData(memory::DEVICE));
     } else {
       LinAlgWorkspaceHIP* workspaceHIP = workspace_;
       rocblas_handle handle_rocblas =  workspaceHIP->getRocblasHandle();
@@ -182,12 +182,12 @@ namespace ReSolve {
                     1,          // n
                     k,      // k
                     &MINUSONE, // alpha
-                    x->getData("hip"), // A
+                    x->getData(memory::DEVICE), // A
                     size,       // lda
-                    alpha->getData("hip"), // B
+                    alpha->getData(memory::DEVICE), // B
                     k,      // ldb
                     &ONE,
-                    y->getData("hip"),          // c
+                    y->getData(memory::DEVICE),          // c
                     size);      // ldc     
     }
   }
@@ -212,7 +212,7 @@ namespace ReSolve {
     using namespace constants;
 
     if (k < 200) {
-      mass_inner_product_two_vectors(size, k, x->getData("hip") , x->getData(1, "hip"), V->getData("hip"), res->getData("hip"));
+      mass_inner_product_two_vectors(size, k, x->getData(memory::DEVICE) , x->getData(1, memory::DEVICE), V->getData(memory::DEVICE), res->getData(memory::DEVICE));
     } else {
       LinAlgWorkspaceHIP* workspaceHIP = workspace_;
       rocblas_handle handle_rocblas =  workspaceHIP->getRocblasHandle();
@@ -223,12 +223,12 @@ namespace ReSolve {
                     2,       //n
                     size,    //k
                     &ONE,   //alpha
-                    V->getData("hip"),       //A
+                    V->getData(memory::DEVICE),       //A
                     size,    //lda
-                    x->getData("hip"),       //B
+                    x->getData(memory::DEVICE),       //B
                     size,    //ldb
                     &ZERO,
-                    res->getData("hip"),     //c
+                    res->getData(memory::DEVICE),     //c
                     k + 1);  //ldc 
     }
   }
diff --git a/tests/functionality/testKLU.cpp b/tests/functionality/testKLU.cpp
index b067f417..083c11d1 100644
--- a/tests/functionality/testKLU.cpp
+++ b/tests/functionality/testKLU.cpp
@@ -74,8 +74,8 @@ int main(int argc, char *argv[])
 
   // Convert first matrix to CSR format
   matrix_handler->coo2csr(A_coo, A, "cpu");
-  vec_rhs->update(rhs, "cpu", "cpu");
-  vec_rhs->setDataUpdated("cpu");
+  vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST);
+  vec_rhs->setDataUpdated(ReSolve::memory::HOST);
 
   // Solve the first system using KLU
   status = KLU->setup(A);
@@ -100,11 +100,11 @@ int main(int argc, char *argv[])
     x_data[i] = 1.0;
   }
 
-  vec_test->setData(x_data, "cpu");
-  vec_r->update(rhs, "cpu", "cpu");
-  vec_diff->update(x_data, "cpu", "cpu");
+  vec_test->setData(x_data, ReSolve::memory::HOST);
+  vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST);
+  vec_diff->update(x_data, ReSolve::memory::HOST, ReSolve::memory::HOST);
 
-  // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, "cpu"));
+  // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, ReSolve::memory::HOST));
   matrix_handler->setValuesChanged(true, "cpu");
   status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr","cpu"); 
   error_sum += status;
@@ -123,13 +123,13 @@ int main(int argc, char *argv[])
   real_type normDiffMatrix1 = sqrt(vector_handler->dot(vec_diff, vec_diff, "cpu"));
  
   //compute the residual using exact solution
-  vec_r->update(rhs, "cpu", "cpu");
+  vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST);
   status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE,"csr", "cpu"); 
   error_sum += status;
   real_type exactSol_normRmatrix1 = sqrt(vector_handler->dot(vec_r, vec_r, "cpu"));
   //evaluate the residual ON THE CPU using COMPUTED solution
  
-  vec_r->update(rhs, "cpu", "cpu");
+  vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST);
 
   status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cpu");
   error_sum += status;
@@ -165,7 +165,7 @@ int main(int argc, char *argv[])
   rhs2_file.close();
 
   matrix_handler->coo2csr(A_coo, A, "cpu");
-  vec_rhs->update(rhs, "cpu", "cpu");
+  vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST);
 
   // and solve it too
   status =  KLU->refactorize();
@@ -174,7 +174,7 @@ int main(int argc, char *argv[])
   status = KLU->solve(vec_rhs, vec_x);
   error_sum += status;
 
-  vec_r->update(rhs, "cpu", "cpu");
+  vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST);
   matrix_handler->setValuesChanged(true, "cpu");
 
   status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cpu"); 
@@ -185,13 +185,13 @@ int main(int argc, char *argv[])
   //for testing only - control
   real_type normB2 = sqrt(vector_handler->dot(vec_rhs, vec_rhs, "cpu"));
   //compute x-x_true
-  vec_diff->update(x_data, "cpu", "cpu");
+  vec_diff->update(x_data, ReSolve::memory::HOST, ReSolve::memory::HOST);
   vector_handler->axpy(&MINUSONE, vec_x, vec_diff, "cpu");
   //evaluate its norm
   real_type normDiffMatrix2 = sqrt(vector_handler->dot(vec_diff, vec_diff, "cpu"));
  
   //compute the residual using exact solution
-  vec_r->update(rhs, "cpu", "cpu");
+  vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST);
   status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE, "csr", "cpu"); 
   error_sum += status;
   real_type exactSol_normRmatrix2 = sqrt(vector_handler->dot(vec_r, vec_r, "cpu"));
diff --git a/tests/functionality/testKLU_GLU.cpp b/tests/functionality/testKLU_GLU.cpp
index ddaf3b31..702141ec 100644
--- a/tests/functionality/testKLU_GLU.cpp
+++ b/tests/functionality/testKLU_GLU.cpp
@@ -75,15 +75,15 @@ int main(int argc, char *argv[])
   real_type* x   = new real_type[A->getNumRows()];
   vector_type* vec_rhs = new vector_type(A->getNumRows());
   vector_type* vec_x   = new vector_type(A->getNumRows());
-  vec_x->allocate("cpu");//for KLU
-  vec_x->allocate("cuda");
+  vec_x->allocate(ReSolve::memory::HOST);//for KLU
+  vec_x->allocate(ReSolve::memory::DEVICE);
   vector_type* vec_r   = new vector_type(A->getNumRows());
   rhs1_file.close();
 
   // Convert first matrix to CSR format
   matrix_handler->coo2csr(A_coo, A, "cpu");
-  vec_rhs->update(rhs, "cpu", "cpu");
-  vec_rhs->setDataUpdated("cpu");
+  vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST);
+  vec_rhs->setDataUpdated(ReSolve::memory::HOST);
 
   // Solve the first system using KLU
   status = KLU->setup(A);
@@ -106,7 +106,7 @@ int main(int argc, char *argv[])
   status = GLU->setup(A, L, U, P, Q); 
   error_sum += status;
   std::cout<<"GLU setup status: "<<status<<std::endl;      
-  vec_rhs->update(rhs, "cpu", "cuda");
+  vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
   status = GLU->solve(vec_rhs, vec_x);
   error_sum += status;
   std::cout<<"GLU solve status: "<<status<<std::endl;      
@@ -122,9 +122,9 @@ int main(int argc, char *argv[])
     x_data[i] = 1.0;
   }
 
-  vec_test->setData(x_data, "cpu");
-  vec_r->update(rhs, "cpu", "cuda");
-  vec_diff->update(x_data, "cpu", "cuda");
+  vec_test->setData(x_data, ReSolve::memory::HOST);
+  vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+  vec_diff->update(x_data, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
 
   // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, "cuda"));
   matrix_handler->setValuesChanged(true, "cuda");
@@ -145,13 +145,13 @@ int main(int argc, char *argv[])
   real_type normDiffMatrix1 = sqrt(vector_handler->dot(vec_diff, vec_diff, "cuda"));
  
   //compute the residual using exact solution
-  vec_x->update(vec_x->getData("cuda"), "cuda", "cpu");
+  vec_x->update(vec_x->getData(ReSolve::memory::DEVICE), ReSolve::memory::DEVICE, ReSolve::memory::HOST);
   status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE,"csr", "cuda"); 
   error_sum += status;
   real_type exactSol_normRmatrix1 = sqrt(vector_handler->dot(vec_r, vec_r, "cuda"));
   //evaluate the residual ON THE CPU using COMPUTED solution
  
-  vec_r->update(rhs, "cpu", "cpu");
+  vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST);
 
   status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cpu");
   error_sum += status;
@@ -188,7 +188,7 @@ int main(int argc, char *argv[])
   rhs2_file.close();
 
   matrix_handler->coo2csr(A_coo, A, "cuda");
-  vec_rhs->update(rhs, "cpu", "cuda");
+  vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
 
   status = GLU->refactorize();
   error_sum += status;
@@ -197,7 +197,7 @@ int main(int argc, char *argv[])
   status = GLU->solve(vec_rhs, vec_x);
   error_sum += status;
 
-   vec_r->update(rhs, "cpu", "cuda");
+   vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
    matrix_handler->setValuesChanged(true, "cuda");
 
   status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cuda"); 
@@ -208,13 +208,13 @@ int main(int argc, char *argv[])
   //for testing only - control
   real_type normB2 = sqrt(vector_handler->dot(vec_rhs, vec_rhs, "cuda"));
   //compute x-x_true
-  vec_diff->update(x_data, "cpu", "cuda");
+  vec_diff->update(x_data, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
   vector_handler->axpy(&MINUSONE, vec_x, vec_diff, "cuda");
   //evaluate its norm
   real_type normDiffMatrix2 = sqrt(vector_handler->dot(vec_diff, vec_diff, "cuda"));
  
   //compute the residual using exact solution
-  vec_r->update(rhs, "cpu", "cuda");
+  vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
   status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE, "csr", "cuda"); 
   error_sum += status;
   real_type exactSol_normRmatrix2 = sqrt(vector_handler->dot(vec_r, vec_r, "cuda"));
diff --git a/tests/functionality/testKLU_Rf.cpp b/tests/functionality/testKLU_Rf.cpp
index 124f07de..a136017e 100644
--- a/tests/functionality/testKLU_Rf.cpp
+++ b/tests/functionality/testKLU_Rf.cpp
@@ -80,8 +80,8 @@ int main(int argc, char *argv[])
 
   // Convert first matrix to CSR format
   matrix_handler->coo2csr(A_coo, A, "cpu");
-  vec_rhs->update(rhs, "cpu", "cpu");
-  vec_rhs->setDataUpdated("cpu");
+  vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST);
+  vec_rhs->setDataUpdated(ReSolve::memory::HOST);
 
   // Solve the first system using KLU
   status = KLU->setup(A);
@@ -106,9 +106,9 @@ int main(int argc, char *argv[])
     x_data[i] = 1.0;
   }
 
-  vec_test->setData(x_data, "cpu");
-  vec_r->update(rhs, "cpu", "cuda");
-  vec_diff->update(x_data, "cpu", "cuda");
+  vec_test->setData(x_data, ReSolve::memory::HOST);
+  vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+  vec_diff->update(x_data, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
 
   // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, "cuda"));
   matrix_handler->setValuesChanged(true, "cuda");
@@ -129,13 +129,13 @@ int main(int argc, char *argv[])
   real_type normDiffMatrix1 = sqrt(vector_handler->dot(vec_diff, vec_diff, "cuda"));
  
   //compute the residual using exact solution
-  vec_r->update(rhs, "cpu", "cuda");
+  vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
   status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE,"csr", "cuda"); 
   error_sum += status;
   real_type exactSol_normRmatrix1 = sqrt(vector_handler->dot(vec_r, vec_r, "cuda"));
   //evaluate the residual ON THE CPU using COMPUTED solution
  
-  vec_r->update(rhs, "cpu", "cpu");
+  vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST);
 
   status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cpu");
   error_sum += status;
@@ -186,7 +186,7 @@ int main(int argc, char *argv[])
   rhs2_file.close();
 
   matrix_handler->coo2csr(A_coo, A, "cuda");
-  vec_rhs->update(rhs, "cpu", "cuda");
+  vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
 
   status = Rf->refactorize();
   error_sum += status;
@@ -194,7 +194,7 @@ int main(int argc, char *argv[])
   status = Rf->solve(vec_rhs, vec_x);
   error_sum += status;
 
-  vec_r->update(rhs, "cpu", "cuda");
+  vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
   matrix_handler->setValuesChanged(true, "cuda");
 
   status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cuda"); 
@@ -205,13 +205,13 @@ int main(int argc, char *argv[])
   //for testing only - control
   real_type normB2 = sqrt(vector_handler->dot(vec_rhs, vec_rhs, "cuda"));
   //compute x-x_true
-  vec_diff->update(x_data, "cpu", "cuda");
+  vec_diff->update(x_data, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
   vector_handler->axpy(&MINUSONE, vec_x, vec_diff, "cuda");
   //evaluate its norm
   real_type normDiffMatrix2 = sqrt(vector_handler->dot(vec_diff, vec_diff, "cuda"));
  
   //compute the residual using exact solution
-  vec_r->update(rhs, "cpu", "cuda");
+  vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
   status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE, "csr", "cuda"); 
   error_sum += status;
   real_type exactSol_normRmatrix2 = sqrt(vector_handler->dot(vec_r, vec_r, "cuda"));
diff --git a/tests/functionality/testKLU_Rf_FGMRES.cpp b/tests/functionality/testKLU_Rf_FGMRES.cpp
index 6601a3ee..6a81dac1 100644
--- a/tests/functionality/testKLU_Rf_FGMRES.cpp
+++ b/tests/functionality/testKLU_Rf_FGMRES.cpp
@@ -85,8 +85,8 @@ int main(int argc, char *argv[])
 
   // Convert first matrix to CSR format
   matrix_handler->coo2csr(A_coo, A, "cpu");
-  vec_rhs->update(rhs, "cpu", "cpu");
-  vec_rhs->setDataUpdated("cpu");
+  vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST);
+  vec_rhs->setDataUpdated(ReSolve::memory::HOST);
 
   // Solve the first system using KLU
   status = KLU->setup(A);
@@ -112,11 +112,11 @@ int main(int argc, char *argv[])
     x_data[i] = 1.0;
   }
 
-  vec_test->setData(x_data, "cpu");
-  vec_r->update(rhs, "cpu", "cuda");
-  vec_diff->update(x_data, "cpu", "cuda");
+  vec_test->setData(x_data, ReSolve::memory::HOST);
+  vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+  vec_diff->update(x_data, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
 
-  // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, "cuda"));
+  // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, ReSolve::memory::DEVICE));
   matrix_handler->setValuesChanged(true, "cuda");
   //evaluate the residual ||b-Ax||
   status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr","cuda"); 
@@ -136,13 +136,13 @@ int main(int argc, char *argv[])
   real_type normDiffMatrix1 = sqrt(vector_handler->dot(vec_diff, vec_diff, "cuda"));
 
   //compute the residual using exact solution
-  vec_r->update(rhs, "cpu", "cuda");
+  vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
   status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE,"csr", "cuda"); 
   error_sum += status;
   real_type exactSol_normRmatrix1 = sqrt(vector_handler->dot(vec_r, vec_r, "cuda"));
   //evaluate the residual ON THE CPU using COMPUTED solution
 
-  vec_r->update(rhs, "cpu", "cpu");
+  vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST);
 
   status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cpu");
   error_sum += status;
@@ -202,13 +202,13 @@ int main(int argc, char *argv[])
   rhs2_file.close();
 
   matrix_handler->coo2csr(A_coo, A, "cuda");
-  vec_rhs->update(rhs, "cpu", "cuda");
+  vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
   Rf->setNumericalProperties(1e-12, 1e-1);
 
   status = Rf->refactorize();
   error_sum += status;
   
-  vec_x->update(rhs, "cpu", "cuda");
+  vec_x->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
   status = Rf->solve(vec_x);
   error_sum += status;
   
@@ -216,11 +216,11 @@ int main(int argc, char *argv[])
   status = FGMRES->setupPreconditioner("CuSolverRf", Rf);
   error_sum += status;
 
-  vec_rhs->update(rhs, "cpu", "cuda");
+  vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
   status = FGMRES->solve(vec_rhs, vec_x);
   error_sum += status;
 
-  vec_r->update(rhs, "cpu", "cuda");
+  vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
   matrix_handler->setValuesChanged(true, "cuda");
 
   //evaluate final residual
@@ -233,13 +233,13 @@ int main(int argc, char *argv[])
   //for testing only - control
   real_type normB2 = sqrt(vector_handler->dot(vec_rhs, vec_rhs, "cuda"));
   //compute x-x_true
-  vec_diff->update(x_data, "cpu", "cuda");
+  vec_diff->update(x_data, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
   vector_handler->axpy(&MINUSONE, vec_x, vec_diff, "cuda");
   //evaluate its norm
   real_type normDiffMatrix2 = sqrt(vector_handler->dot(vec_diff, vec_diff, "cuda"));
 
   //compute the residual using exact solution
-  vec_r->update(rhs, "cpu", "cuda");
+  vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
   status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE, "csr", "cuda"); 
   error_sum += status;
   real_type exactSol_normRmatrix2 = sqrt(vector_handler->dot(vec_r, vec_r, "cuda"));
diff --git a/tests/unit/matrix/MatrixHandlerTests.hpp b/tests/unit/matrix/MatrixHandlerTests.hpp
index 0bcfe544..63d2f49b 100644
--- a/tests/unit/matrix/MatrixHandlerTests.hpp
+++ b/tests/unit/matrix/MatrixHandlerTests.hpp
@@ -42,18 +42,23 @@ class MatrixHandlerTests : TestBase
   TestOutcome matVec(index_type N)
   {
     TestStatus status;
+    ReSolve::memory::MemorySpace ms; // enum counterpart of the string tag stored in memspace_
+    if (memspace_ == "cpu")
+      ms = memory::HOST;
+    else
+      ms = memory::DEVICE; // every tag other than "cpu" is treated as device memory
 
     ReSolve::MatrixHandler* handler = createMatrixHandler();
 
     matrix::Csr* A = createCsrMatrix(N, memspace_);
     vector::Vector x(N);
     vector::Vector y(N);
-    x.allocate(memspace_);
-    if (x.getData(memspace_) == NULL) printf("oups we have an issue \n");
-    y.allocate(memspace_);
+    x.allocate(ms);
+    if (x.getData(ms) == NULL) printf("oups we have an issue \n");
+    y.allocate(ms);
 
-    x.setToConst(1.0, memspace_);
-    y.setToConst(1.0, memspace_);
+    x.setToConst(1.0, ms);
+    y.setToConst(1.0, ms);
 
     real_type alpha = 2.0/30.0;
     real_type beta  = 2.0;
@@ -98,14 +103,14 @@ class MatrixHandlerTests : TestBase
   {
     bool status = true;
     if (memspace != "cpu") {
-      x.copyData(memspace, "cpu");
+      x.copyData(memory::DEVICE, memory::HOST);
     }
 
     for (index_type i = 0; i < x.getSize(); ++i) {
-      // std::cout << x.getData("cpu")[i] << "\n";
-      if (!isEqual(x.getData("cpu")[i], answer)) {
+      // std::cout << x.getData(memory::HOST)[i] << "\n";
+      if (!isEqual(x.getData(memory::HOST)[i], answer)) {
         status = false;
-        std::cout << "Solution vector element x[" << i << "] = " << x.getData("cpu")[i]
+        std::cout << "Solution vector element x[" << i << "] = " << x.getData(memory::HOST)[i]
                   << ", expected: " << answer << "\n";
         break; 
       }
@@ -135,11 +140,11 @@ class MatrixHandlerTests : TestBase
 
     // Allocate NxN CSR matrix with NNZ nonzeros
     matrix::Csr* A = new matrix::Csr(N, N, NNZ);
-    A->allocateMatrixData("cpu");
+    A->allocateMatrixData(memory::HOST);
 
-    index_type* rowptr = A->getRowData("cpu");
-    index_type* colidx = A->getColData("cpu");
-    real_type* val     = A->getValues("cpu"); 
+    index_type* rowptr = A->getRowData(memory::HOST);
+    index_type* colidx = A->getColData(memory::HOST);
+    real_type* val     = A->getValues( memory::HOST); 
 
     // Populate CSR matrix using same row pattern as for NNZ calculation
     rowptr[0] = 0;
@@ -157,10 +162,10 @@ class MatrixHandlerTests : TestBase
         val[j] = row_sample[static_cast<size_t>(j - rowptr[i])];
       }
     }
-    A->setUpdated("cpu");
+    A->setUpdated(memory::HOST);
 
     if ((memspace == "cuda") || (memspace == "hip")) {
-      A->copyData(memspace);
+      A->copyData(memory::DEVICE);
     }
 
     return A;
diff --git a/tests/unit/matrix/MatrixIoTests.hpp b/tests/unit/matrix/MatrixIoTests.hpp
index ad14f0a7..1ce23ae2 100644
--- a/tests/unit/matrix/MatrixIoTests.hpp
+++ b/tests/unit/matrix/MatrixIoTests.hpp
@@ -78,7 +78,7 @@ class MatrixIoTests : TestBase
 
     // Create a 5x5 COO matrix with 10 nonzeros
     ReSolve::matrix::Coo A(5, 5, 10);
-    A.allocateMatrixData("cpu");
+    A.allocateMatrixData(memory::HOST);
 
     // Read string into istream and status it to `readMatrixFromFile` function.
     std::istringstream file2(symmetric_coo_matrix_file_);
@@ -176,9 +176,9 @@ class MatrixIoTests : TestBase
                     const std::vector<real_type>& val_data)
   {
     for (size_t i = 0; i < val_data.size(); ++i) {
-      if ((answer.getRowData("cpu")[i] != row_data[i]) ||
-          (answer.getColData("cpu")[i] != col_data[i]) ||
-          (!isEqual(answer.getValues("cpu")[i], val_data[i])))
+      if ((answer.getRowData(memory::HOST)[i] != row_data[i]) ||
+          (answer.getColData(memory::HOST)[i] != col_data[i]) ||
+          (!isEqual(answer.getValues(memory::HOST)[i], val_data[i])))
       {
         std::cout << "Incorrect matrix value at storage element " << i << ".\n";
         return false;
diff --git a/tests/unit/vector/GramSchmidtTests.hpp b/tests/unit/vector/GramSchmidtTests.hpp
index 9981ea48..4837b57b 100644
--- a/tests/unit/vector/GramSchmidtTests.hpp
+++ b/tests/unit/vector/GramSchmidtTests.hpp
@@ -66,15 +66,21 @@ namespace ReSolve {
               break;
           }
 
+          ReSolve::memory::MemorySpace ms; // enum counterpart of the string tag stored in memspace_
+          if (memspace_ == "cpu")
+            ms = memory::HOST;
+          else
+            ms = memory::DEVICE; // every tag other than "cpu" is treated as device memory
+
           ReSolve::VectorHandler* handler = createVectorHandler();
 
           vector::Vector* V = new vector::Vector(N, 3); // we will be using a space of 3 vectors
           real_type* H = new real_type[6]; //in this case, Hessenberg matrix is 3 x 2
           real_type* aux_data; // needed for setup
 
-          V->allocate(memspace_);
-          if (memspace_ != "cpu") {
-            V->allocate("cpu");
+          V->allocate(ms);
+          if (ms != memory::HOST) {
+            V->allocate(memory::HOST);
           }
 
 
@@ -82,7 +88,7 @@ namespace ReSolve {
           GS->setup(N, 3);
           
           //fill 2nd and 3rd vector with values
-          aux_data = V->getVectorData(1, "cpu");
+          aux_data = V->getVectorData(1, memory::HOST);
           for (int i = 0; i < N; ++i) {
             if ( i % 2 == 0) {         
               aux_data[i] = constants::ONE;
@@ -90,7 +96,7 @@ namespace ReSolve {
               aux_data[i] = var1;
             }
           }
-          aux_data = V->getVectorData(2, "cpu");
+          aux_data = V->getVectorData(2, memory::HOST);
           for (int i = 0; i < N; ++i) {
             if ( i % 3 > 0) {         
               aux_data[i] = constants::ZERO;
@@ -98,11 +104,11 @@ namespace ReSolve {
               aux_data[i] = var2;
             }
           }
-          V->setDataUpdated("cpu"); 
-          V->copyData("cpu", memspace_);
+          V->setDataUpdated(memory::HOST); 
+          V->copyData(memory::HOST, ms);
 
           //set the first vector to all 1s, normalize 
-          V->setToConst(0, 1.0, memspace_);
+          V->setToConst(0, 1.0, ms);
           real_type nrm = handler->dot(V, V, memspace_);
           nrm = sqrt(nrm);
           nrm = 1.0 / nrm;
@@ -144,6 +150,12 @@ namespace ReSolve {
         // x is a multivector containing K vectors 
         bool verifyAnswer(vector::Vector* x, index_type K,  ReSolve::VectorHandler* handler, std::string memspace)
         {
+          ReSolve::memory::MemorySpace ms;
+          if (memspace == "cpu")
+            ms = memory::HOST;
+          else
+            ms = memory::DEVICE;
+
           vector::Vector* a = new vector::Vector(x->getSize()); 
           vector::Vector* b = new vector::Vector(x->getSize());
 
@@ -152,8 +164,8 @@ namespace ReSolve {
 
           for (index_type i = 0; i < K; ++i) {
             for (index_type j = 0; j < K; ++j) {
-              a->update(x->getVectorData(i, memspace), memspace, "cpu");
-              b->update(x->getVectorData(j, memspace), memspace, "cpu");
+              a->update(x->getVectorData(i, ms), ms, memory::HOST);
+              b->update(x->getVectorData(j, ms), ms, memory::HOST);
               ip = handler->dot(a, b, "cpu");
               
               if ( (i != j) && (abs(ip) > 1e-14)) {
diff --git a/tests/unit/vector/VectorHandlerTests.hpp b/tests/unit/vector/VectorHandlerTests.hpp
index 60020ec5..856bb84d 100644
--- a/tests/unit/vector/VectorHandlerTests.hpp
+++ b/tests/unit/vector/VectorHandlerTests.hpp
@@ -39,16 +39,22 @@ namespace ReSolve {
         {
           TestStatus status;
 
+          ReSolve::memory::MemorySpace ms;
+          if (memspace_ == "cpu")
+            ms = memory::HOST;
+          else
+            ms = memory::DEVICE;
+
           ReSolve::VectorHandler* handler = createVectorHandler();
 
           vector::Vector* x = new vector::Vector(N);
           vector::Vector* y = new vector::Vector(N);
 
-          x->allocate(memspace_);
-          y->allocate(memspace_);
+          x->allocate(ms);
+          y->allocate(ms);
 
-          x->setToConst(3.0, memspace_);
-          y->setToConst(1.0, memspace_);
+          x->setToConst(3.0, ms);
+          y->setToConst(1.0, ms);
 
           real_type alpha = 0.5;
           //the result is a vector with y[i] = 2.5;          
@@ -66,16 +72,22 @@ namespace ReSolve {
         {
           TestStatus status;
 
+          ReSolve::memory::MemorySpace ms;
+          if (memspace_ == "cpu")
+            ms = memory::HOST;
+          else
+            ms = memory::DEVICE;
+
           ReSolve::VectorHandler* handler = createVectorHandler();
 
           vector::Vector* x = new vector::Vector(N);
           vector::Vector* y = new vector::Vector(N);
 
-          x->allocate(memspace_);
-          y->allocate(memspace_);
+          x->allocate(ms);
+          y->allocate(ms);
 
-          x->setToConst(0.25, memspace_);
-          y->setToConst(4.0, memspace_);
+          x->setToConst(0.25, ms);
+          y->setToConst(4.0, ms);
           real_type ans;
           //the result is N
           ans = handler->dot(x, y, memspace_);
@@ -98,13 +110,19 @@ namespace ReSolve {
         {
           TestStatus status;
 
+          ReSolve::memory::MemorySpace ms;
+          if (memspace_ == "cpu")
+            ms = memory::HOST;
+          else
+            ms = memory::DEVICE;
+
           ReSolve::VectorHandler* handler = createVectorHandler();
 
           vector::Vector* x =  new vector::Vector(N);
 
-          x->allocate(memspace_);
+          x->allocate(ms);
 
-          x->setToConst(1.25, memspace_);
+          x->setToConst(1.25, ms);
 
           real_type alpha = 3.5;
 
@@ -122,17 +140,23 @@ namespace ReSolve {
         {
           TestStatus status;
 
+          ReSolve::memory::MemorySpace ms;
+          if (memspace_ == "cpu")
+            ms = memory::HOST;
+          else
+            ms = memory::DEVICE;
+
           ReSolve::VectorHandler* handler = createVectorHandler();
           
           vector::Vector* x =  new vector::Vector(N, K);
           vector::Vector* y =  new vector::Vector(N);
           vector::Vector* alpha = new vector::Vector(K);;
-          x->allocate(memspace_);
-          y->allocate(memspace_);
-          alpha->allocate(memspace_);
+          x->allocate(ms);
+          y->allocate(ms);
+          alpha->allocate(ms);
 
-          y->setToConst(2.0, memspace_);
-          alpha->setToConst(-1.0, memspace_);
+          y->setToConst(2.0, ms);
+          alpha->setToConst(-1.0, ms);
           for (int ii = 0; ii < K; ++ii) {
             real_type c;
             if (ii % 2 == 0) {
@@ -140,7 +164,7 @@ namespace ReSolve {
             } else {
               c = 0.5;
             }
-            x->setToConst(ii, c, memspace_);
+            x->setToConst(ii, c, ms);
           }
 
           index_type r = K % 2;
@@ -161,17 +185,23 @@ namespace ReSolve {
         {
           TestStatus status;
 
+          ReSolve::memory::MemorySpace ms;
+          if (memspace_ == "cpu")
+            ms = memory::HOST;
+          else
+            ms = memory::DEVICE;
+
           ReSolve::VectorHandler* handler = createVectorHandler();
           
           vector::Vector* x =  new vector::Vector(N, K);
           vector::Vector* y =  new vector::Vector(N, 2);
           vector::Vector* res = new vector::Vector(K, 2);
-          x->allocate(memspace_);
-          y->allocate(memspace_);
-          res->allocate(memspace_);
+          x->allocate(ms);
+          y->allocate(ms);
+          res->allocate(ms);
           
-          x->setToConst(1.0, memspace_);
-          y->setToConst(-1.0, memspace_);
+          x->setToConst(1.0, ms);
+          y->setToConst(-1.0, ms);
           handler->massDot2Vec(N, x, K, y, res, memspace_);
           
           status *= verifyAnswer(res, (-1.0) * (real_type) N, memspace_);
@@ -186,6 +216,13 @@ namespace ReSolve {
         TestOutcome gemv(index_type N,  index_type K)
         {
           TestStatus status;
+
+          ReSolve::memory::MemorySpace ms;
+          if (memspace_ == "cpu")
+            ms = memory::HOST;
+          else
+            ms = memory::DEVICE;
+
           ReSolve::VectorHandler* handler = createVectorHandler();
           vector::Vector* V = new vector::Vector(N, K);
           // for the test with NO TRANSPOSE
@@ -195,17 +232,17 @@ namespace ReSolve {
           vector::Vector* yT = new vector::Vector(N);
           vector::Vector* xT = new vector::Vector(K);
           
-          V->allocate(memspace_);
-          yN->allocate(memspace_);
-          xN->allocate(memspace_);
-          yT->allocate(memspace_);
-          xT->allocate(memspace_);
-
-          V->setToConst(1.0, memspace_);
-          yN->setToConst(-1.0, memspace_);
-          xN->setToConst(.5, memspace_);
-          yT->setToConst(-1.0, memspace_);
-          xT->setToConst(.5, memspace_);
+          V->allocate(ms);
+          yN->allocate(ms);
+          xN->allocate(ms);
+          yT->allocate(ms);
+          xT->allocate(ms);
+
+          V->setToConst(1.0, ms);
+          yN->setToConst(-1.0, ms);
+          xN->setToConst(.5, ms);
+          yT->setToConst(-1.0, ms);
+          xT->setToConst(.5, ms);
           
           real_type alpha = -1.0;
           real_type beta = 1.0;
@@ -248,15 +285,15 @@ namespace ReSolve {
         {
           bool status = true;
           if (memspace != "cpu") {
-            x->copyData(memspace, "cpu");
+            x->copyData(memory::DEVICE, memory::HOST);
           }
 
           for (index_type i = 0; i < x->getSize(); ++i) {
             // std::cout << x->getData("cpu")[i] << "\n";
-            if (!isEqual(x->getData("cpu")[i], answer)) {
+            if (!isEqual(x->getData(memory::HOST)[i], answer)) {
               std::cout << std::setprecision(16);
               status = false;
-              std::cout << "Solution vector element x[" << i << "] = " << x->getData("cpu")[i]
+              std::cout << "Solution vector element x[" << i << "] = " << x->getData(memory::HOST)[i]
                 << ", expected: " << answer << "\n";
               break; 
             }

From 9a5fd7adba72170f40a0e91c87171dc9c1ae3659 Mon Sep 17 00:00:00 2001
From: pelesh <peless@ornl.gov>
Date: Wed, 1 Nov 2023 13:51:40 -0400
Subject: [PATCH 05/12] Review CUDA and HIP configuration in CMake (#48)

* Add first pass CMake cleanup - need to fix include_directories and add custom library macro.

* Fix find HIP cmake function.

* Update CMakePresets.json

Co-authored-by: Nicholson Koukpaizan <72402802+nkoukpaizan@users.noreply.github.com>


---------

Co-authored-by: rcrutherford <robert.rutherford@pnnl.gov>
Co-authored-by: Nicholson Koukpaizan <72402802+nkoukpaizan@users.noreply.github.com>
---
 CMakeLists.txt                      | 30 ++++++++++---------------
 CMakePresets.json                   | 11 +++++-----
 cmake/ReSolveConfig.cmake.in        | 12 ++++++++++
 cmake/ReSolveFindHipLibraries.cmake | 16 +++++---------
 resolve/CMakeLists.txt              | 34 ++++++-----------------------
 5 files changed, 40 insertions(+), 63 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9f802231..db4e8e74 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -23,42 +23,31 @@ endif()
 
 option(RESOLVE_TEST_WITH_BSUB "Use `jsrun` instead of `mpirun` commands when running tests" OFF)
 option(RESOLVE_USE_KLU  "Use KLU, AMD and COLAMD libraries from SuiteSparse" ON)
-option(RESOLVE_USE_GPU  "Use GPU device for computations" OFF)
 option(RESOLVE_USE_CUDA "Use CUDA language and SDK" OFF)
 option(RESOLVE_USE_HIP  "Use HIP language and ROCm library" OFF)
-set(RESOLVE_CTEST_OUTPUT_DIR ${PROJECT_BINARY_DIR} CACHE PATH "Directory where CTest outputs are saved")
+
+option(RESOLVE_USE_GPU  "Use GPU device for computations" OFF)
+mark_as_advanced(FORCE RESOLVE_USE_GPU)
 
 if(RESOLVE_USE_CUDA)
-  set(RESOLVE_USE_GPU On CACHE BOOL "Using CUDA GPU!" FORCE)
+  set(RESOLVE_USE_GPU ON CACHE BOOL "Using CUDA GPU!" FORCE)
 endif()
 
 if(RESOLVE_USE_HIP)
-  set(RESOLVE_USE_GPU On CACHE BOOL "Using HIP GPU!" FORCE)
+  set(RESOLVE_USE_GPU ON CACHE BOOL "Using HIP GPU!" FORCE)
 endif()
 
-
+# MacOS specific things
 set(CMAKE_MACOSX_RPATH 1)
-# set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
-# https://gitlab.kitware.com/cmake/community/-/wikis/doc/cmake/RPATH-handling#always-full-rpath
-# use, i.e. don't skip the full RPATH for the build tree
-#set(CMAKE_SKIP_BUILD_RPATH FALSE)
 
-# when building, don't use the install RPATH already
-# (but later on when installing)
+# Install with RPATH but do not build with it
 set(CMAKE_BUILD_WITH_INSTALL_RPATH FALSE)
-
 set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
-
 set(CMAKE_INSTALL_RPATH ${CMAKE_INSTALL_PREFIX}/lib)
-#list(APPEND CMAKE_INSTALL_RPATH ${CMAKE_INSTALL_PREFIX}/lib)
 
 # Add CMake sources from `cmake` dir
 list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
 
-# Including clang-format cmake files to do automatic checking of formating
-# TODO: Set up clang-format
-#include(./cmake/clang-format)
-
 if (RESOLVE_USE_KLU)
   include(FindKLU)
   if(NOT KLU_LIBRARY)
@@ -100,6 +89,7 @@ if(RESOLVE_USE_HIP)
   # This is just an agly hack to make HIP build work
   get_target_property(hip_includes hip::device INTERFACE_INCLUDE_DIRECTORIES)
   message(STATUS "HIP include directories: ${hip_includes}")
+  # TODO - use targets properly
   include_directories(${hip_includes})
 else()
   message(STATUS "Not using HIP")
@@ -112,6 +102,7 @@ configure_file(
   ${CMAKE_BINARY_DIR}/resolve/resolve_defs.hpp)
 
 # include build directory for Fortran name mangling header
+# TODO - target based includes
 include_directories(${CMAKE_BINARY_DIR})
 
 install(
@@ -119,7 +110,7 @@ install(
   DESTINATION include/resolve
   )
 
-
+# TODO - fix this
 include_directories(${CMAKE_SOURCE_DIR})
 
 # Enable testing
@@ -158,4 +149,5 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/ReSolveConfig.cmake"
 add_subdirectory(examples)
 
 # Add tests
+set(RESOLVE_CTEST_OUTPUT_DIR ${PROJECT_BINARY_DIR} CACHE PATH "Directory where CTest outputs are saved")
 add_subdirectory(tests)
diff --git a/CMakePresets.json b/CMakePresets.json
index e4784095..c00f9919 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -12,7 +12,10 @@
             "description": "Base config to build with CUDA",
             "binaryDir": "${sourceDir}/build",
             "installDir": "${sourceDir}/install",
-            "generator": "Unix Makefiles"
+            "generator": "Unix Makefiles",
+            "cacheVariables": {
+                "RESOLVE_USE_CUDA": "ON"
+            }
         },
         {
             "name": "cpu",
@@ -20,11 +23,7 @@
             "description": "Base config to build without GPUs",
             "binaryDir": "${sourceDir}/build",
             "installDir": "${sourceDir}/install",
-            "generator": "Unix Makefiles",
-            "cacheVariables": {
-                "RESOLVE_USE_CUDA": "OFF",
-                "RESOLVE_USE_GPU": "OFF"
-            }
+            "generator": "Unix Makefiles"
         },
 	      {
             "name": "ascent",
diff --git a/cmake/ReSolveConfig.cmake.in b/cmake/ReSolveConfig.cmake.in
index 7a162d90..47f9fe35 100644
--- a/cmake/ReSolveConfig.cmake.in
+++ b/cmake/ReSolveConfig.cmake.in
@@ -12,6 +12,18 @@ if(@RESOLVE_USE_CUDA@)
    check_language(CUDA)
    set(CMAKE_CUDA_FLAGS "@CMAKE_CUDA_FLAGS@")
    find_package(CUDAToolkit REQUIRED)
+   add_library(ReSolve::CUDA ALIAS ReSolve::resolve_backend_cuda)
+endif()
+if(@RESOLVE_USE_HIP@)
+  enable_language(HIP)
+  find_package(hip REQUIRED)
+  find_package(rocblas REQUIRED)   # roc::rocblas is linked by the exported resolve_hip target
+  find_package(rocsparse REQUIRED) # roc::rocsparse is linked by the exported resolve_hip target
+  # This is just an ugly hack to make the HIP build work
+  get_target_property(hip_includes hip::device INTERFACE_INCLUDE_DIRECTORIES)
+  message(STATUS "HIP include directories: ${hip_includes}")
+  include_directories(${hip_includes})
+  add_library(ReSolve::HIP ALIAS ReSolve::resolve_backend_hip)
 endif()
 
 # Compute installation prefix relative to this file.
diff --git a/cmake/ReSolveFindHipLibraries.cmake b/cmake/ReSolveFindHipLibraries.cmake
index e754da0d..4cb0c443 100644
--- a/cmake/ReSolveFindHipLibraries.cmake
+++ b/cmake/ReSolveFindHipLibraries.cmake
@@ -4,20 +4,14 @@
 add_library(resolve_hip INTERFACE)
 
 find_package(hip REQUIRED)
-find_package(hipblas REQUIRED)
+find_package(rocblas REQUIRED)
+find_package(rocsparse REQUIRED)
 
 target_link_libraries(resolve_hip INTERFACE
-  #hip::host 
+  hip::host 
   hip::device
-  rocblas
-  rocsparse
-  #roc::hipblas
+  roc::rocblas
+  roc::rocsparse
 )
 
-# get_target_property(hip_includes hip::device INTERFACE_INCLUDE_DIRECTORIES)
-# message(STATUS "HIP include directories: ${hip_includes}")
-
-# get_target_property(resolve_hip_includes resolve_hip INTERFACE_INCLUDE_DIRECTORIES)
-# message(STATUS "ReSolve HIP include directories: ${resolve_hip_includes}")
-
 install(TARGETS resolve_hip EXPORT ReSolveTargets)
diff --git a/resolve/CMakeLists.txt b/resolve/CMakeLists.txt
index fa6c9cd5..68b557b8 100644
--- a/resolve/CMakeLists.txt
+++ b/resolve/CMakeLists.txt
@@ -37,21 +37,6 @@ set(ReSolve_HEADER_INSTALL
     MemoryUtils.hpp
 )
 
-# If GPU support is not enabled, add dummy device backend
-if(NOT RESOLVE_USE_GPU)
-    add_subdirectory(cpu)
-endif()
-
-# If CUDA support is enabled, create CUDA backend
-# (this should really be CUDA _API_ backend, separate backend will be needed for CUDA SDK)
-if(RESOLVE_USE_CUDA)
-    add_subdirectory(cuda)
-endif()
-
-if(RESOLVE_USE_HIP)
-    add_subdirectory(hip)
-endif()
-
 # Now, build workspaces
 add_subdirectory(workspace)
 
@@ -59,23 +44,13 @@ add_subdirectory(workspace)
 add_subdirectory(vector)
 add_subdirectory(matrix)
 
-
 # Build shared library ReSolve
 add_library(resolve_tpl INTERFACE)
 
 if(RESOLVE_USE_KLU)
-    target_link_libraries(resolve_tpl INTERFACE KLU)    
+  target_link_libraries(resolve_tpl INTERFACE KLU)    
 endif(RESOLVE_USE_KLU)
 
-if(RESOLVE_USE_CUDA)
-    target_link_libraries(resolve_tpl INTERFACE resolve_cuda)
-endif(RESOLVE_USE_CUDA)
-
-if(RESOLVE_USE_HIP)
-    target_link_libraries(resolve_tpl INTERFACE resolve_hip)
-endif(RESOLVE_USE_HIP)
-
-
 set(ReSolve_Targets_List
     resolve_matrix
     resolve_vector
@@ -86,18 +61,23 @@ set(ReSolve_Targets_List
 
 # If CUDA support is enabled add CUDA SDK specific code and dependencies
 if(RESOLVE_USE_CUDA)
+  add_subdirectory(cuda)
+  target_link_libraries(resolve_tpl INTERFACE resolve_cuda)
   set(ReSolve_SRC ${ReSolve_SRC} ${ReSolve_CUDASDK_SRC})
   set(ReSolve_Targets_List ${ReSolve_Targets_List} resolve_backend_cuda)
 endif()
 
 # If HIP support is enabled add HIP SDK specific code and dependencies
 if(RESOLVE_USE_HIP)
+  add_subdirectory(hip)
+  target_link_libraries(resolve_tpl INTERFACE resolve_hip)
   set(ReSolve_Targets_List ${ReSolve_Targets_List} resolve_backend_hip)
 endif()
 
 # If no GPU support is enabled, link to dummy device backend
 if(NOT RESOLVE_USE_GPU)
-    set(ReSolve_Targets_List ${ReSolve_Targets_List} resolve_backend_cpu)
+  add_subdirectory(cpu)
+  set(ReSolve_Targets_List ${ReSolve_Targets_List} resolve_backend_cpu)
 endif()
 
 # Set installable targets

From e01ba3c90ccc3d7d4ad5bf22ed36ab7eb4c1b93b Mon Sep 17 00:00:00 2001
From: pelesh <peless@ornl.gov>
Date: Wed, 1 Nov 2023 20:59:37 -0400
Subject: [PATCH 06/12] Fully functional ROCm-based LU solver (#52)

* rocsolver class

* rocsolver-rf functionality test runs

* rocsolver EXAMPLE

* Cleanup before exiting examples and avoid double delete. (#49)

* Enable HIP support in Gramm-Schmidt and FGMRES (#50)

* Enable HIP in GS and FGMRES

* Do not build FGMRES without GPU.

---------

Co-authored-by: kswirydo <kasia.swirydowicz@gmail.com>

---------

Co-authored-by: kswirydo <kasia.swirydowicz@gmail.com>
---
 examples/CMakeLists.txt                       |  13 +
 examples/r_KLU_rf.cpp                         |   4 -
 examples/r_KLU_rf_FGMRES.cpp                  |  10 +-
 .../r_KLU_rf_FGMRES_reuse_factorization.cpp   |  10 +-
 examples/r_KLU_rocSolverRf_FGMRES.cpp         | 199 ++++++++++++++
 examples/r_KLU_rocsolverrf.cpp                | 177 ++++++++++++
 resolve/CMakeLists.txt                        |  19 +-
 resolve/GramSchmidt.cpp                       |  39 ++-
 resolve/LinSolverDirectRocSolverRf.cpp        | 205 ++++++++++++++
 resolve/LinSolverDirectRocSolverRf.hpp        |  59 ++++
 resolve/LinSolverIterativeFGMRES.cpp          |  39 +--
 resolve/LinSolverIterativeFGMRES.hpp          |  10 +-
 resolve/matrix/MatrixHandler.cpp              |   1 -
 resolve/matrix/MatrixHandlerHip.cpp           |   3 +-
 resolve/matrix/MatrixHandlerHip.hpp           |  10 +-
 resolve/vector/Vector.cpp                     |   4 +-
 tests/functionality/CMakeLists.txt            |  18 ++
 tests/functionality/testKLU_Rf_FGMRES.cpp     |   2 +-
 tests/functionality/testKLU_RocSolver.cpp     | 251 ++++++++++++++++++
 19 files changed, 1015 insertions(+), 58 deletions(-)
 create mode 100644 examples/r_KLU_rocSolverRf_FGMRES.cpp
 create mode 100644 examples/r_KLU_rocsolverrf.cpp
 create mode 100644 resolve/LinSolverDirectRocSolverRf.cpp
 create mode 100644 resolve/LinSolverDirectRocSolverRf.hpp
 create mode 100644 tests/functionality/testKLU_RocSolver.cpp

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 8e8a2498..9113ce17 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -39,7 +39,16 @@ if(RESOLVE_USE_CUDA)
 
 endif(RESOLVE_USE_CUDA)
 
+# Create HIP examples
+if(RESOLVE_USE_HIP)
+  # Build example with KLU factorization and rocsolver Rf refactorization
+  add_executable(klu_rocsolverrf.exe r_KLU_rocsolverrf.cpp)
+  target_link_libraries(klu_rocsolverrf.exe PRIVATE ReSolve)
 
+  # Build example with KLU factorization, rocsolver Rf refactorization, and FGMRES iterative refinement
+  add_executable(klu_rocsolverrf_fgmres.exe r_KLU_rocSolverRf_FGMRES.cpp)
+  target_link_libraries(klu_rocsolverrf_fgmres.exe PRIVATE ReSolve)
+endif(RESOLVE_USE_HIP)
 
 # Install all examples in bin directory
 set(installable_executables klu_klu.exe klu_klu_standalone.exe)
@@ -48,6 +57,10 @@ if(RESOLVE_USE_CUDA)
   set(installable_executables ${installable_executables} klu_glu.exe klu_rf.exe klu_rf_fgmres.exe klu_glu_values_update.exe)        
 endif(RESOLVE_USE_CUDA)
 
+if(RESOLVE_USE_HIP) # install every HIP example built above, as the CUDA branch does
+  set(installable_executables ${installable_executables} klu_rocsolverrf.exe klu_rocsolverrf_fgmres.exe)
+endif(RESOLVE_USE_HIP)
+
 install(TARGETS ${installable_executables} 
         RUNTIME DESTINATION bin)
 
diff --git a/examples/r_KLU_rf.cpp b/examples/r_KLU_rf.cpp
index d9310773..b61029c5 100644
--- a/examples/r_KLU_rf.cpp
+++ b/examples/r_KLU_rf.cpp
@@ -139,12 +139,8 @@ int main(int argc, char *argv[] )
         index_type* Q = KLU->getQOrdering();
         Rf->setup(A, L, U, P, Q); 
 
-        delete [] P;
-        delete [] Q;
         delete L;
-        delete L_csc;
         delete U;
-        delete U_csc;
       }
     } else {
       //status =  KLU->refactorize();
diff --git a/examples/r_KLU_rf_FGMRES.cpp b/examples/r_KLU_rf_FGMRES.cpp
index 6df5419a..584fcd10 100644
--- a/examples/r_KLU_rf_FGMRES.cpp
+++ b/examples/r_KLU_rf_FGMRES.cpp
@@ -168,7 +168,7 @@ int main(int argc, char *argv[])
 
       //matrix_handler->setValuesChanged(true, "cuda");
       FGMRES->resetMatrix(A);
-      FGMRES->setupPreconditioner("CuSolverRf", Rf);
+      FGMRES->setupPreconditioner("LU", Rf);
       
       matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cuda"); 
 
@@ -189,8 +189,16 @@ int main(int argc, char *argv[])
 
   } // for (int i = 0; i < numSystems; ++i)
 
+  delete A;
+  delete KLU;
+  delete Rf;
   delete [] x;
   delete [] rhs;
+  delete vec_r;
+  delete vec_x;
+  delete workspace_CUDA;
+  delete matrix_handler;
+  delete vector_handler;
 
   return 0;
 }
diff --git a/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp b/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp
index 5ead8186..c4ab285b 100644
--- a/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp
+++ b/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp
@@ -173,7 +173,7 @@ int main(int argc, char *argv[])
                   << status << std::endl;    
         vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
         status = Rf->solve(vec_rhs, vec_x);
-        FGMRES->setupPreconditioner("CuSolverRf", Rf);
+        FGMRES->setupPreconditioner("LU", Rf);
       }
       //if (i%2!=0)  vec_x->setToZero(ReSolve::memory::DEVICE);
       real_type norm_x =  vector_handler->dot(vec_x, vec_x, "cuda");
@@ -217,8 +217,16 @@ int main(int argc, char *argv[])
 
   }
 
+  delete A;
+  delete KLU;
+  delete Rf;
   delete [] x;
   delete [] rhs;
+  delete vec_r;
+  delete vec_x;
+  delete workspace_CUDA;
+  delete matrix_handler;
+  delete vector_handler;
 
   return 0;
 }
diff --git a/examples/r_KLU_rocSolverRf_FGMRES.cpp b/examples/r_KLU_rocSolverRf_FGMRES.cpp
new file mode 100644
index 00000000..d2e5f7a6
--- /dev/null
+++ b/examples/r_KLU_rocSolverRf_FGMRES.cpp
@@ -0,0 +1,238 @@
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <string>
+#include <iostream>
+#include <iomanip>
+
+#include <resolve/matrix/Coo.hpp>
+#include <resolve/matrix/Csr.hpp>
+#include <resolve/matrix/Csc.hpp>
+#include <resolve/vector/Vector.hpp>
+#include <resolve/matrix/io.hpp>
+#include <resolve/matrix/MatrixHandler.hpp>
+#include <resolve/vector/VectorHandler.hpp>
+#include <resolve/LinSolverDirectKLU.hpp>
+#include <resolve/LinSolverDirectRocSolverRf.hpp>
+#include <resolve/LinSolverIterativeFGMRES.hpp>
+#include <resolve/workspace/LinAlgWorkspace.hpp>
+
+using namespace ReSolve::constants;
+
+/**
+ * @brief Solves a family of linear systems with KLU, rocSOLVER Rf and FGMRES.
+ *
+ * Command line: <matrix prefix> <rhs prefix> <num systems> followed by one
+ * <matrix id> <rhs id> pair per system; files are opened as prefix + id + ".mtx".
+ *
+ * The first two systems are factorized and solved with KLU. After the second
+ * one the KLU factors and orderings are handed to the Rf solver, and every
+ * later system is refactorized and solved with Rf, then refined with FGMRES
+ * using Rf as the preconditioner (only if the residual norm is finite).
+ *
+ * @return 0 on completion, -1 on bad arguments or if an input file cannot be opened.
+ */
+int main(int argc, char *argv[])
+{
+  // Use the same data types as those you specified in ReSolve build.
+  using index_type = ReSolve::index_type;
+  using real_type  = ReSolve::real_type;
+  using vector_type = ReSolve::vector::Vector;
+
+  // argv[1..3] are read unconditionally below, so reject short command lines.
+  if (argc < 4) {
+    std::cout << "Usage: <matrix prefix> <rhs prefix> <num systems> [<matrix id> <rhs id>]...\n";
+    return -1;
+  }
+  std::string  matrixFileName = argv[1];
+  std::string  rhsFileName = argv[2];
+
+  index_type numSystems = atoi(argv[3]);
+  // Each system consumes two more arguments (argv[4 + 2*i] and argv[5 + 2*i]).
+  if (argc < 4 + 2 * numSystems) {
+    std::cout << "Expected " << numSystems << " <matrix id> <rhs id> pairs after the first three arguments\n";
+    return -1;
+  }
+  std::cout<<"Family mtx file name: "<< matrixFileName << ", total number of matrices: "<<numSystems<<std::endl;
+  std::cout<<"Family rhs file name: "<< rhsFileName << ", total number of RHSes: " << numSystems<<std::endl;
+
+  std::string fileId;
+  std::string rhsId;
+  std::string matrixFileNameFull;
+  std::string rhsFileNameFull;
+
+  // Pointers assigned inside the loop start as nullptr so the cleanup at the
+  // end is well defined even when no system is processed.
+  ReSolve::matrix::Coo* A_coo = nullptr;
+  ReSolve::matrix::Csr* A = nullptr;
+  ReSolve::LinAlgWorkspaceHIP* workspace_HIP = new ReSolve::LinAlgWorkspaceHIP();
+  workspace_HIP->initializeHandles();
+  ReSolve::MatrixHandler* matrix_handler =  new ReSolve::MatrixHandler(workspace_HIP);
+  ReSolve::VectorHandler* vector_handler =  new ReSolve::VectorHandler(workspace_HIP);
+  real_type* rhs = nullptr;
+  real_type* x   = nullptr;
+
+  vector_type* vec_rhs = nullptr;
+  vector_type* vec_x = nullptr;
+  vector_type* vec_r = nullptr;
+
+  ReSolve::GramSchmidt* GS = new ReSolve::GramSchmidt(vector_handler, ReSolve::GramSchmidt::cgs2);
+  ReSolve::LinSolverDirectKLU* KLU = new ReSolve::LinSolverDirectKLU;
+  ReSolve::LinSolverDirectRocSolverRf* Rf = new ReSolve::LinSolverDirectRocSolverRf(workspace_HIP);
+  ReSolve::LinSolverIterativeFGMRES* FGMRES = new ReSolve::LinSolverIterativeFGMRES(matrix_handler, vector_handler, GS, "hip");
+
+  for (int i = 0; i < numSystems; ++i)
+  {
+    index_type j = 4 + i * 2;
+    fileId = argv[j];
+    rhsId = argv[j + 1];
+
+    matrixFileNameFull = "";
+    rhsFileNameFull = "";
+
+    // Read matrix first
+    matrixFileNameFull = matrixFileName + fileId + ".mtx";
+    rhsFileNameFull = rhsFileName + rhsId + ".mtx";
+    std::cout << std::endl << std::endl << std::endl;
+    std::cout << "========================================================================================================================"<<std::endl;
+    std::cout << "Reading: " << matrixFileNameFull << std::endl;
+    std::cout << "========================================================================================================================"<<std::endl;
+    std::cout << std::endl;
+    // Read first matrix
+    std::ifstream mat_file(matrixFileNameFull);
+    if(!mat_file.is_open())
+    {
+      std::cout << "Failed to open file " << matrixFileNameFull << "\n";
+      return -1;
+    }
+    std::ifstream rhs_file(rhsFileNameFull);
+    if(!rhs_file.is_open())
+    {
+      std::cout << "Failed to open file " << rhsFileNameFull << "\n";
+      return -1;
+    }
+    if (i == 0) {
+      A_coo = ReSolve::io::readMatrixFromFile(mat_file);
+      A = new ReSolve::matrix::Csr(A_coo->getNumRows(),
+                                   A_coo->getNumColumns(),
+                                   A_coo->getNnz(),
+                                   A_coo->symmetric(),
+                                   A_coo->expanded());
+
+      rhs = ReSolve::io::readRhsFromFile(rhs_file);
+      x = new real_type[A->getNumRows()];
+      vec_rhs = new vector_type(A->getNumRows());
+      vec_x = new vector_type(A->getNumRows());
+      vec_x->allocate(ReSolve::memory::HOST);//for KLU
+      vec_x->allocate(ReSolve::memory::DEVICE);
+      vec_r = new vector_type(A->getNumRows());
+    }
+    else {
+      ReSolve::io::readAndUpdateMatrix(mat_file, A_coo);
+      ReSolve::io::readAndUpdateRhs(rhs_file, &rhs);
+    }
+    std::cout<<"Finished reading the matrix and rhs, size: "<<A->getNumRows()<<" x "<<A->getNumColumns()<< ", nnz: "<< A->getNnz()<< ", symmetric? "<<A->symmetric()<< ", Expanded? "<<A->expanded()<<std::endl;
+    mat_file.close();
+    rhs_file.close();
+
+    //Now convert to CSR.
+    if (i < 2) { 
+      matrix_handler->coo2csr(A_coo, A, "cpu");
+      vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST);
+      vec_rhs->setDataUpdated(ReSolve::memory::HOST);
+    } else { 
+      matrix_handler->coo2csr(A_coo,A, "hip");
+      vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+    }
+    std::cout<<"COO to CSR completed. Expanded NNZ: "<< A->getNnzExpanded()<<std::endl;
+    //Now call direct solver
+    if (i == 0) {
+      KLU->setupParameters(1, 0.1, false);
+    }
+    int status;
+    real_type norm_b;
+    if (i < 2){
+      KLU->setup(A);
+      matrix_handler->setValuesChanged(true, "hip");
+      status = KLU->analyze();
+      std::cout<<"KLU analysis status: "<<status<<std::endl;
+      status = KLU->factorize();
+      std::cout<<"KLU factorization status: "<<status<<std::endl;
+      status = KLU->solve(vec_rhs, vec_x);
+      std::cout<<"KLU solve status: "<<status<<std::endl;      
+      vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+      norm_b = vector_handler->dot(vec_r, vec_r, "hip");
+      norm_b = sqrt(norm_b);
+      matrix_handler->setValuesChanged(true, "hip");
+      matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "hip"); 
+      printf("\t 2-Norm of the residual : %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "hip"))/norm_b);
+      if (i == 1) {
+        ReSolve::matrix::Csc* L /* _csc */ = (ReSolve::matrix::Csc*) KLU->getLFactor();
+        ReSolve::matrix::Csc* U /* _csc */ = (ReSolve::matrix::Csc*) KLU->getUFactor();
+        // ReSolve::matrix::Csr* L = new ReSolve::matrix::Csr(L_csc->getNumRows(), L_csc->getNumColumns(), L_csc->getNnz());
+        // ReSolve::matrix::Csr* U = new ReSolve::matrix::Csr(U_csc->getNumRows(), U_csc->getNumColumns(), U_csc->getNnz());
+        // matrix_handler->csc2csr(L_csc,L, "hip");
+        // matrix_handler->csc2csr(U_csc,U, "hip");
+        if (L == nullptr) {printf("ERROR");}
+        index_type* P = KLU->getPOrdering();
+        index_type* Q = KLU->getQOrdering();
+        Rf->setup(A, L, U, P, Q, vec_rhs);
+        Rf->refactorize();
+        std::cout<<"about to set FGMRES" <<std::endl;
+        GS->setup(A->getNumRows(), FGMRES->getRestart()); 
+        FGMRES->setup(A); 
+      }
+    } else {
+      //status =  KLU->refactorize();
+      std::cout<<"Using ROCSOLVER RF"<<std::endl;
+      status = Rf->refactorize();
+      std::cout<<"ROCSOLVER RF refactorization status: "<<status<<std::endl;      
+      status = Rf->solve(vec_rhs, vec_x);
+      std::cout<<"ROCSOLVER RF solve status: "<<status<<std::endl;      
+
+      vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+      norm_b = vector_handler->dot(vec_r, vec_r, "hip");
+      norm_b = sqrt(norm_b);
+
+      //matrix_handler->setValuesChanged(true, "hip");
+      FGMRES->resetMatrix(A);
+      FGMRES->setupPreconditioner("LU", Rf);
+      
+      matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "hip"); 
+      real_type rnrm = sqrt(vector_handler->dot(vec_r, vec_r, "hip"));
+      std::cout << "\t 2-Norm of the residual (before IR): " 
+                << std::scientific << std::setprecision(16) 
+                << rnrm/norm_b << "\n";
+
+      vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+     if(!std::isnan(rnrm) && !std::isinf(rnrm)) {
+      FGMRES->solve(vec_rhs, vec_x);
+
+      std::cout << "FGMRES: init nrm: " 
+                << std::scientific << std::setprecision(16) 
+                << FGMRES->getInitResidualNorm()/norm_b
+                << " final nrm: "
+                << FGMRES->getFinalResidualNorm()/norm_b
+                << " iter: " << FGMRES->getNumIter() << "\n";
+     }
+     }
+
+  } // for (int i = 0; i < numSystems; ++i)
+
+  // Release what this example allocated, mirroring the cleanup done by the
+  // CUDA examples. Handlers are deleted before the workspace they were built on.
+  // NOTE(review): assumes none of these objects owns another one - confirm.
+  delete A;
+  delete KLU;
+  delete Rf;
+  delete [] x;
+  delete [] rhs;
+  delete vec_r;
+  delete vec_x;
+  delete matrix_handler;
+  delete vector_handler;
+  delete workspace_HIP;
+
+  return 0;
+}
diff --git a/examples/r_KLU_rocsolverrf.cpp b/examples/r_KLU_rocsolverrf.cpp
new file mode 100644
index 00000000..b3ebbecf
--- /dev/null
+++ b/examples/r_KLU_rocsolverrf.cpp
@@ -0,0 +1,177 @@
+#include <string>
+#include <iostream>
+#include <iomanip>
+
+#include <resolve/matrix/Coo.hpp>
+#include <resolve/matrix/Csr.hpp>
+#include <resolve/matrix/Csc.hpp>
+#include <resolve/vector/Vector.hpp>
+#include <resolve/matrix/io.hpp>
+#include <resolve/matrix/MatrixHandler.hpp>
+#include <resolve/vector/VectorHandler.hpp>
+#include <resolve/LinSolverDirectKLU.hpp>
+#include <resolve/LinSolverDirectRocSolverRf.hpp>
+#include <resolve/workspace/LinAlgWorkspace.hpp>
+
+using namespace ReSolve::constants;
+
+int main(int argc, char *argv[] )
+{
+  // Use the same data types as those you specified in ReSolve build.
+  using index_type = ReSolve::index_type;
+  using real_type  = ReSolve::real_type;
+  using vector_type = ReSolve::vector::Vector;
+
+  // Arguments: <matrix prefix> <rhs prefix> <number of systems>, then one <matrix id> <rhs id> pair per system.
+  if ((argc < 4) || ((argc - 4) / 2 < atoi(argv[3]))) {
+    std::cout << "Usage: r_KLU_rocsolverrf <matrix prefix> <rhs prefix> <number of systems> [<matrix id> <rhs id>]...\n";
+    return -1;
+  }
+  std::string  matrixFileName = argv[1];
+  std::string  rhsFileName = argv[2];
+
+  index_type numSystems = atoi(argv[3]);
+  std::cout<<"Family mtx file name: "<< matrixFileName << ", total number of matrices: "<<numSystems<<std::endl;
+  std::cout<<"Family rhs file name: "<< rhsFileName << ", total number of RHSes: " << numSystems<<std::endl;
+
+  std::string fileId;
+  std::string rhsId;
+  std::string matrixFileNameFull;
+  std::string rhsFileNameFull;
+
+  ReSolve::matrix::Coo* A_coo = nullptr;
+  ReSolve::matrix::Csr* A = nullptr;
+
+  ReSolve::LinAlgWorkspaceHIP* workspace_HIP = new ReSolve::LinAlgWorkspaceHIP;
+  workspace_HIP->initializeHandles();
+  ReSolve::MatrixHandler* matrix_handler =  new ReSolve::MatrixHandler(workspace_HIP);
+  ReSolve::VectorHandler* vector_handler =  new ReSolve::VectorHandler(workspace_HIP);
+  real_type* rhs = nullptr;
+  real_type* x   = nullptr;
+
+  vector_type* vec_rhs = nullptr;
+  vector_type* vec_x = nullptr;
+  vector_type* vec_r = nullptr;
+
+  ReSolve::LinSolverDirectKLU* KLU = new ReSolve::LinSolverDirectKLU;
+  ReSolve::LinSolverDirectRocSolverRf* Rf = new ReSolve::LinSolverDirectRocSolverRf(workspace_HIP);
+
+  for (int i = 0; i < numSystems; ++i)
+  {
+    index_type j = 4 + i * 2;
+    fileId = argv[j];
+    rhsId = argv[j + 1];
+
+    matrixFileNameFull = "";
+    rhsFileNameFull = "";
+
+    // Read matrix first
+    matrixFileNameFull = matrixFileName + fileId + ".mtx";
+    rhsFileNameFull = rhsFileName + rhsId + ".mtx";
+    std::cout << std::endl << std::endl << std::endl;
+    std::cout << "========================================================================================================================"<<std::endl;
+    std::cout << "Reading: " << matrixFileNameFull << std::endl;
+    std::cout << "========================================================================================================================"<<std::endl;
+    std::cout << std::endl;
+    // Read first matrix
+    std::ifstream mat_file(matrixFileNameFull);
+    if(!mat_file.is_open())
+    {
+      std::cout << "Failed to open file " << matrixFileNameFull << "\n";
+      return -1;
+    }
+    std::ifstream rhs_file(rhsFileNameFull);
+    if(!rhs_file.is_open())
+    {
+      std::cout << "Failed to open file " << rhsFileNameFull << "\n";
+      return -1;
+    }
+    if (i == 0) {
+      A_coo = ReSolve::io::readMatrixFromFile(mat_file);
+      A = new ReSolve::matrix::Csr(A_coo->getNumRows(),
+                                   A_coo->getNumColumns(),
+                                   A_coo->getNnz(),
+                                   A_coo->symmetric(),
+                                   A_coo->expanded());
+
+      rhs = ReSolve::io::readRhsFromFile(rhs_file);
+      x = new real_type[A->getNumRows()];
+      vec_rhs = new vector_type(A->getNumRows());
+      vec_x = new vector_type(A->getNumRows());
+      vec_r = new vector_type(A->getNumRows());
+    }
+    else {
+      ReSolve::io::readAndUpdateMatrix(mat_file, A_coo);
+      ReSolve::io::readAndUpdateRhs(rhs_file, &rhs);
+    }
+    std::cout<<"Finished reading the matrix and rhs, size: "<<A->getNumRows()<<" x "<<A->getNumColumns()<< ", nnz: "<< A->getNnz()<< ", symmetric? "<<A->symmetric()<< ", Expanded? "<<A->expanded()<<std::endl;
+    mat_file.close();
+    rhs_file.close();
+
+    //Now convert to CSR.
+    if (i < 2) {
+      matrix_handler->coo2csr(A_coo, A, "cpu");
+      vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST);
+      vec_rhs->setDataUpdated(ReSolve::memory::HOST);
+    } else {
+      matrix_handler->coo2csr(A_coo, A, "hip");
+      vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+    }
+    std::cout<<"COO to CSR completed. Expanded NNZ: "<< A->getNnzExpanded()<<std::endl;
+    //Now call direct solver
+    if (i == 0) {
+      KLU->setupParameters(1, 0.1, false);
+    }
+    int status;
+    if (i < 2){
+      KLU->setup(A);
+      status = KLU->analyze();
+      std::cout<<"KLU analysis status: "<<status<<std::endl;
+      status = KLU->factorize();
+      std::cout<<"KLU factorization status: "<<status<<std::endl;
+      status = KLU->solve(vec_rhs, vec_x);
+      std::cout<<"KLU solve status: "<<status<<std::endl;
+      if (i == 1) {
+        ReSolve::matrix::Csc* L = (ReSolve::matrix::Csc*) KLU->getLFactor();
+        ReSolve::matrix::Csc* U = (ReSolve::matrix::Csc*) KLU->getUFactor();
+        index_type* P = KLU->getPOrdering();
+        index_type* Q = KLU->getQOrdering();
+        vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+        Rf->setup(A, L, U, P, Q, vec_rhs);
+        Rf->refactorize();
+        // Do not free P and Q here (they come from KLU's getPOrdering()/getQOrdering()).
+      }
+    } else {
+      std::cout<<"Using rocsolver rf"<<std::endl;
+      status = Rf->refactorize();
+      std::cout<<"rocsolver rf refactorization status: "<<status<<std::endl;
+      status = Rf->solve(vec_rhs, vec_x);
+      std::cout<<"rocsolver rf solve status: "<<status<<std::endl;
+    }
+    vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+
+    matrix_handler->setValuesChanged(true, "hip");
+
+    matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "hip");
+
+    std::cout << "\t 2-Norm of the residual: "
+              << std::scientific << std::setprecision(16)
+              << sqrt(vector_handler->dot(vec_r, vec_r, "hip")) << "\n";
+
+  } // for (int i = 0; i < numSystems; ++i)
+
+  //now DELETE
+  delete A;
+  delete A_coo;
+  delete KLU;
+  delete Rf;
+  delete [] x;
+  delete [] rhs;
+  delete vec_rhs;
+  delete vec_r;
+  delete vec_x;
+  delete workspace_HIP;
+  delete matrix_handler;
+  delete vector_handler;
+  return 0;
+}
diff --git a/resolve/CMakeLists.txt b/resolve/CMakeLists.txt
index 68b557b8..47ce70de 100644
--- a/resolve/CMakeLists.txt
+++ b/resolve/CMakeLists.txt
@@ -14,14 +14,21 @@ set(ReSolve_SRC
     LinSolverDirectKLU.cpp
 )
 
+# Sources added only when RESOLVE_USE_GPU is set; temporary until there is CPU-only option for FGMRES
+set(ReSolve_GPU_SRC
+    GramSchmidt.cpp
+    LinSolverIterativeFGMRES.cpp
+)
+
 # C++ code that links to CUDA SDK libraries
 set(ReSolve_CUDASDK_SRC
-    LinSolverIterativeFGMRES.cpp
-    GramSchmidt.cpp
     LinSolverDirectCuSolverGLU.cpp
     LinSolverDirectCuSolverRf.cpp
 )
-
+# C++ code that links to ROCm libraries
+set(ReSolve_ROCM_SRC
+    LinSolverDirectRocSolverRf.cpp
+)
 # Header files to be installed
 set(ReSolve_HEADER_INSTALL
     Common.hpp
@@ -59,6 +66,11 @@ set(ReSolve_Targets_List
     resolve_workspace
 )
 
+# Until FGMRES has a CPU-only option, its sources are built only for GPU-enabled configurations
+if(RESOLVE_USE_GPU)
+  list(APPEND ReSolve_SRC ${ReSolve_GPU_SRC})
+endif()
+
 # If CUDA support is enabled add CUDA SDK specific code and dependencies
 if(RESOLVE_USE_CUDA)
   add_subdirectory(cuda)
@@ -71,6 +83,7 @@ endif()
 if(RESOLVE_USE_HIP)
   add_subdirectory(hip)
   target_link_libraries(resolve_tpl INTERFACE resolve_hip)
+  set(ReSolve_SRC ${ReSolve_SRC} ${ReSolve_ROCM_SRC})
   set(ReSolve_Targets_List ${ReSolve_Targets_List} resolve_backend_hip)
 endif()
 
diff --git a/resolve/GramSchmidt.cpp b/resolve/GramSchmidt.cpp
index fb86fc8d..8f6f0850 100644
--- a/resolve/GramSchmidt.cpp
+++ b/resolve/GramSchmidt.cpp
@@ -127,7 +127,7 @@ namespace ReSolve
   {
     using namespace constants;
 
-    if (memspace == "cuda") { // or hip
+    if ((memspace == "cuda") || (memspace == "hip")) { // or hip
 
       double t;
       double s;
@@ -139,19 +139,19 @@ namespace ReSolve
           for(int j = 0; j <= i; ++j) {
             t = 0.0;
             vec_v_->setData( V->getVectorData(j, memory::DEVICE), memory::DEVICE);
-            t = vector_handler_->dot(vec_v_, vec_w_, "cuda");  
+            t = vector_handler_->dot(vec_v_, vec_w_, memspace);  
             H[ idxmap(i, j, num_vecs_ + 1) ] = t; 
             t *= -1.0;
-            vector_handler_->axpy(&t, vec_v_, vec_w_, "cuda");  
+            vector_handler_->axpy(&t, vec_v_, vec_w_, memspace);  
           }
           t = 0.0;
-          t = vector_handler_->dot(vec_w_, vec_w_, "cuda");  
+          t = vector_handler_->dot(vec_w_, vec_w_, memspace);
           //set the last entry in Hessenberg matrix
           t = sqrt(t);
           H[ idxmap(i, i + 1, num_vecs_ + 1) ] = t; 
           if(fabs(t) > EPSILON) {
             t = 1.0/t;
-            vector_handler_->scal(&t, vec_w_, "cuda");  
+            vector_handler_->scal(&t, vec_w_, memspace);  
           } else {
             assert(0 && "Gram-Schmidt failed, vector with ZERO norm\n");
             return -1;
@@ -160,10 +160,9 @@ namespace ReSolve
         case cgs2:
 
           vec_v_->setData(V->getVectorData(i + 1, memory::DEVICE), memory::DEVICE);
-          vector_handler_->gemv("T", n, i + 1, &ONE, &ZERO, V,  vec_v_, vec_Hcolumn_, "cuda");
-
+          vector_handler_->gemv("T", n, i + 1, &ONE, &ZERO, V,  vec_v_, vec_Hcolumn_, memspace);
           // V(:,i+1) = V(:, i+1) -  V(:,1:i)*Hcol
-          vector_handler_->gemv("N", n, i + 1, &ONE, &MINUSONE, V, vec_Hcolumn_, vec_v_, "cuda" );  
+          vector_handler_->gemv("N", n, i + 1, &ONE, &MINUSONE, V, vec_Hcolumn_, vec_v_, memspace );  
 
           // copy H_col to aux, we will need it later
           vec_Hcolumn_->setDataUpdated(memory::DEVICE);
@@ -171,10 +170,10 @@ namespace ReSolve
           vec_Hcolumn_->deepCopyVectorData(h_aux_, 0, memory::HOST);
 
           //Hcol = V(:,1:i)^T*V(:,i+1);
-          vector_handler_->gemv("T", n, i + 1, &ONE, &ZERO, V,  vec_v_, vec_Hcolumn_, "cuda");
+          vector_handler_->gemv("T", n, i + 1, &ONE, &ZERO, V,  vec_v_, vec_Hcolumn_, memspace);
 
           // V(:,i+1) = V(:, i+1) -  V(:,1:i)*Hcol
-          vector_handler_->gemv("N", n, i + 1, &ONE, &MINUSONE, V, vec_Hcolumn_, vec_v_, "cuda" );  
+          vector_handler_->gemv("N", n, i + 1, &ONE, &MINUSONE, V, vec_Hcolumn_, vec_v_, memspace );  
 
           // copy H_col to H
           vec_Hcolumn_->setDataUpdated(memory::DEVICE);
@@ -186,13 +185,13 @@ namespace ReSolve
             H[ idxmap(i, j, num_vecs_ + 1)] += h_aux_[j]; 
           }
 
-          t = vector_handler_->dot(vec_v_, vec_v_, "cuda");  
+          t = vector_handler_->dot(vec_v_, vec_v_, memspace);  
           //set the last entry in Hessenberg matrix
           t = sqrt(t);
           H[ idxmap(i, i + 1, num_vecs_ + 1) ] = t; 
           if(fabs(t) > EPSILON) {
             t = 1.0/t;
-            vector_handler_->scal(&t, vec_v_, "cuda");  
+            vector_handler_->scal(&t, vec_v_, memspace);  
           } else {
             assert(0 && "Gram-Schmidt failed, vector with ZERO norm\n");
             return -1;
@@ -205,7 +204,7 @@ namespace ReSolve
           vec_w_->setData(V->getVectorData(i + 1, memory::DEVICE), memory::DEVICE);
           vec_rv_->setCurrentSize(i + 1);
 
-          vector_handler_->massDot2Vec(n, V, i, vec_v_, vec_rv_, "cuda");
+          vector_handler_->massDot2Vec(n, V, i, vec_v_, vec_rv_, memspace);
           vec_rv_->setDataUpdated(memory::DEVICE);
           vec_rv_->copyData(memory::DEVICE, memory::HOST);
 
@@ -226,16 +225,16 @@ namespace ReSolve
           }   // for j
           vec_Hcolumn_->setCurrentSize(i + 1);
           vec_Hcolumn_->update(&H[ idxmap(i, 0, num_vecs_ + 1)], memory::HOST, memory::DEVICE); 
-          vector_handler_->massAxpy(n, vec_Hcolumn_, i, V, vec_w_, "cuda");
+          vector_handler_->massAxpy(n, vec_Hcolumn_, i, V, vec_w_, memspace);
 
           // normalize (second synch)
-          t = vector_handler_->dot(vec_w_, vec_w_, "cuda");  
+          t = vector_handler_->dot(vec_w_, vec_w_, memspace);  
           //set the last entry in Hessenberg matrix
           t = sqrt(t);
           H[ idxmap(i, i + 1, num_vecs_ + 1)] = t;    
           if(fabs(t) > EPSILON) {
             t = 1.0 / t;
-            vector_handler_->scal(&t, vec_w_, "cuda");  
+            vector_handler_->scal(&t, vec_w_, memspace);  
           } else {
             assert(0 && "Iterative refinement failed, Krylov vector with ZERO norm\n");
             return -1;
@@ -247,7 +246,7 @@ namespace ReSolve
           vec_w_->setData(V->getVectorData(i + 1, memory::DEVICE), memory::DEVICE);
           vec_rv_->setCurrentSize(i + 1);
 
-          vector_handler_->massDot2Vec(n, V, i, vec_v_, vec_rv_, "cuda");
+          vector_handler_->massDot2Vec(n, V, i, vec_v_, vec_rv_, memspace);
           vec_rv_->setDataUpdated(memory::DEVICE);
           vec_rv_->copyData(memory::DEVICE, memory::HOST);
 
@@ -297,15 +296,15 @@ namespace ReSolve
           vec_Hcolumn_->setCurrentSize(i + 1);
           vec_Hcolumn_->update(&H[ idxmap(i, 0, num_vecs_ + 1)], memory::HOST, memory::DEVICE); 
 
-          vector_handler_->massAxpy(n, vec_Hcolumn_, i, V,  vec_w_, "cuda");
+          vector_handler_->massAxpy(n, vec_Hcolumn_, i, V,  vec_w_, memspace);
           // normalize (second synch)
-          t = vector_handler_->dot(vec_w_, vec_w_, "cuda");  
+          t = vector_handler_->dot(vec_w_, vec_w_, memspace);  
           //set the last entry in Hessenberg matrix
           t = sqrt(t);
           H[ idxmap(i, i + 1, num_vecs_ + 1) ] = t;    
           if(fabs(t) > EPSILON) {
             t = 1.0 / t;
-            vector_handler_->scal(&t, vec_w_, "cuda");  
+            vector_handler_->scal(&t, vec_w_, memspace);  
           } else {
             assert(0 && "Iterative refinement failed, Krylov vector with ZERO norm\n");
             return -1;
diff --git a/resolve/LinSolverDirectRocSolverRf.cpp b/resolve/LinSolverDirectRocSolverRf.cpp
new file mode 100644
index 00000000..5869756d
--- /dev/null
+++ b/resolve/LinSolverDirectRocSolverRf.cpp
@@ -0,0 +1,205 @@
+#include <resolve/vector/Vector.hpp>
+#include <resolve/matrix/Csr.hpp>
+#include "LinSolverDirectRocSolverRf.hpp"
+
+namespace ReSolve 
+{
+  LinSolverDirectRocSolverRf::LinSolverDirectRocSolverRf(LinAlgWorkspaceHIP* workspace)
+    : status_rocblas_(rocblas_status_success), d_P_(nullptr), d_Q_(nullptr), // nothing is owned until setup() runs
+      workspace_(workspace), infoM_(nullptr), M_(nullptr), solve_mode_(0)    // solve mode 0 (slow mode) is the default
+  {}
+
+  LinSolverDirectRocSolverRf::~LinSolverDirectRocSolverRf()
+  {
+    if (infoM_ != nullptr) { rocsolver_destroy_rfinfo(infoM_); } // handle created in setup()
+    mem_.deleteOnDevice(d_P_);
+    mem_.deleteOnDevice(d_Q_);
+    delete M_; // combined L+U matrix allocated in addFactors()
+  }
+
+  // Prepares rocSOLVER refactorization: merges L and U into M_, copies the host
+  // permutations P and Q to the device and runs the rocSOLVER analysis step.
+  int LinSolverDirectRocSolverRf::setup(matrix::Sparse* A, matrix::Sparse* L, matrix::Sparse* U, index_type* P, index_type* Q, vector_type* rhs)
+  {
+    //remember - P and Q are generally CPU variables
+    int error_sum = 0;
+    this->A_ = (matrix::Csr*) A;
+    index_type n = A_->getNumRows();
+    //set matrix info; a failure here is now reported through the returned status
+    error_sum += rocsolver_create_rfinfo(&infoM_, workspace_->getRocblasHandle());
+    //create combined factor
+    addFactors(L,U);
+    M_->setUpdated(ReSolve::memory::HOST);
+    M_->copyData(ReSolve::memory::DEVICE);
+    mem_.allocateArrayOnDevice(&d_P_, n);
+    mem_.allocateArrayOnDevice(&d_Q_, n);
+    mem_.copyArrayHostToDevice(d_P_, P, n);
+    mem_.copyArrayHostToDevice(d_Q_, Q, n);
+
+    mem_.deviceSynchronize();
+    status_rocblas_ = rocsolver_dcsrrf_analysis(workspace_->getRocblasHandle(),
+                                                n,
+                                                1,
+                                                A_->getNnzExpanded(),
+                                                A_->getRowData(ReSolve::memory::DEVICE), //kRowPtr_,
+                                                A_->getColData(ReSolve::memory::DEVICE), //jCol_, 
+                                                A_->getValues(ReSolve::memory::DEVICE), //vals_, 
+                                                M_->getNnzExpanded(),
+                                                M_->getRowData(ReSolve::memory::DEVICE), 
+                                                M_->getColData(ReSolve::memory::DEVICE), 
+                                                M_->getValues(ReSolve::memory::DEVICE), //vals_, 
+                                                d_P_,
+                                                d_Q_,
+                                                rhs->getData(ReSolve::memory::DEVICE), 
+                                                n,
+                                                infoM_);
+
+    mem_.deviceSynchronize();
+    error_sum += status_rocblas_;
+
+    return error_sum;
+  }
+
+  int LinSolverDirectRocSolverRf::refactorize()
+  {
+    // Runs rocsolver_dcsrrf_refactlu on A_; the factor values are written into M_.
+    auto handle = workspace_->getRocblasHandle();
+    mem_.deviceSynchronize();
+    status_rocblas_ = rocsolver_dcsrrf_refactlu(handle,
+                                                A_->getNumRows(),
+                                                A_->getNnzExpanded(),
+                                                A_->getRowData(ReSolve::memory::DEVICE), // row pointers of A
+                                                A_->getColData(ReSolve::memory::DEVICE), // column indices of A
+                                                A_->getValues(ReSolve::memory::DEVICE),  // values of A
+                                                M_->getNnzExpanded(),
+                                                M_->getRowData(ReSolve::memory::DEVICE),
+                                                M_->getColData(ReSolve::memory::DEVICE),
+                                                M_->getValues(ReSolve::memory::DEVICE),  // output
+                                                d_P_,
+                                                d_Q_,
+                                                infoM_);
+    mem_.deviceSynchronize();
+
+    // The rocSOLVER status code is handed back to the caller as a plain int.
+    return static_cast<int>(status_rocblas_);
+  }
+
+  // solution is returned in RHS
+  // Returns the rocSOLVER status, or 1 when the selected solve mode is not implemented.
+  int LinSolverDirectRocSolverRf::solve(vector_type* rhs)
+  {
+    if (solve_mode_ != 0) {
+      return 1; // only mode 0 is implemented; do not report a stale status as success
+    }
+    mem_.deviceSynchronize();
+    status_rocblas_ =  rocsolver_dcsrrf_solve(workspace_->getRocblasHandle(),
+                                              A_->getNumRows(),
+                                              1,
+                                              M_->getNnz(),
+                                              M_->getRowData(ReSolve::memory::DEVICE),
+                                              M_->getColData(ReSolve::memory::DEVICE),
+                                              M_->getValues(ReSolve::memory::DEVICE),
+                                              d_P_,
+                                              d_Q_,
+                                              rhs->getData(ReSolve::memory::DEVICE),
+                                              A_->getNumRows(),
+                                              infoM_);
+    mem_.deviceSynchronize();
+    return status_rocblas_;
+  }
+
+  // Copies rhs into x and solves in place on x; returns the rocSOLVER status, or 1 for an unimplemented solve mode.
+  int LinSolverDirectRocSolverRf::solve(vector_type* rhs, vector_type* x)
+  {
+    x->update(rhs->getData(ReSolve::memory::DEVICE), ReSolve::memory::DEVICE, ReSolve::memory::DEVICE);
+    x->setDataUpdated(ReSolve::memory::DEVICE);
+
+    if (solve_mode_ != 0) {
+      return 1; // only mode 0 is implemented; do not report a stale status as success
+    }
+    mem_.deviceSynchronize();
+    status_rocblas_ =  rocsolver_dcsrrf_solve(workspace_->getRocblasHandle(),
+                                              A_->getNumRows(),
+                                              1,
+                                              M_->getNnz(),
+                                              M_->getRowData(ReSolve::memory::DEVICE),
+                                              M_->getColData(ReSolve::memory::DEVICE),
+                                              M_->getValues(ReSolve::memory::DEVICE),
+                                              d_P_,
+                                              d_Q_,
+                                              x->getData(ReSolve::memory::DEVICE),
+                                              A_->getNumRows(),
+                                              infoM_);
+    mem_.deviceSynchronize();
+    return status_rocblas_;
+  }
+
+  int LinSolverDirectRocSolverRf::setSolveMode(int mode)
+  {
+    solve_mode_ = mode; // stored without validation; solve() only performs work when the mode is 0
+    return 0; // always reports success
+  }
+
+  int LinSolverDirectRocSolverRf::getSolveMode()
+  {
+    return solve_mode_; // value last passed to setSolveMode(), or the constructor default
+  }
+
+  // Merges the CSC factors L and U into one CSR matrix M_ holding the pattern of L + U. Diagonal entries of L
+  // are skipped so the diagonal is stored once (from U); only row pointers and column indices are filled here.
+  void LinSolverDirectRocSolverRf::addFactors(matrix::Sparse* L, matrix::Sparse* U)
+  {
+    index_type n = L->getNumRows();
+    index_type* Lp = L->getColData(ReSolve::memory::HOST);
+    index_type* Li = L->getRowData(ReSolve::memory::HOST);
+    index_type* Up = U->getColData(ReSolve::memory::HOST);
+    index_type* Ui = U->getRowData(ReSolve::memory::HOST);
+    index_type nnzM = ( L->getNnz() + U->getNnz() - n );
+    M_ = new matrix::Csr(n, n, nnzM);
+    M_->allocateMatrixData(ReSolve::memory::DEVICE);
+    M_->allocateMatrixData(ReSolve::memory::HOST);
+    index_type* mia = M_->getRowData(ReSolve::memory::HOST);
+    index_type* mja = M_->getColData(ReSolve::memory::HOST);
+    // Row counts are accumulated with ++ below, so start from zeros instead of relying on the allocator.
+    for(index_type i = 0; i < n + 1; ++i) {
+      mia[i] = 0;
+    }
+    for(index_type i = 0; i < n; ++i) {
+      // go through EACH COLUMN OF L first
+      for(index_type j = Lp[i]; j < Lp[i + 1]; ++j) {
+        const index_type row = Li[j];
+        // BUT dont count diagonal twice, important
+        if(row != i) {
+          mia[row + 1]++;
+        }
+      }
+      // then each column of U
+      for(index_type j = Up[i]; j < Up[i + 1]; ++j) {
+        const index_type row = Ui[j];
+        mia[row + 1]++;
+      }
+    }
+    // prefix sum turns the per-row counts into CSR row pointers (mia[0] stays 0)
+    for(index_type i = 1; i < n + 1; i++) {
+      mia[i] += mia[i - 1];
+    }
+
+    std::vector<index_type> Mshifts(n, 0); // number of entries already placed in each row
+    for(index_type i = 0; i < n; ++i) {
+      // go through EACH COLUMN OF L first
+      for(index_type j = Lp[i]; j < Lp[i + 1]; ++j) {
+        const index_type row = Li[j];
+        if(row != i) {
+          mja[mia[row] + Mshifts[row]] = i; // place (row, i) where it belongs
+          Mshifts[row]++;
+        }
+      }
+      // each column of U next
+      for(index_type j = Up[i]; j < Up[i + 1]; ++j) {
+        const index_type row = Ui[j];
+        mja[mia[row] + Mshifts[row]] = i;
+        Mshifts[row]++;
+      }
+    }
+  }
+}// namespace resolve
diff --git a/resolve/LinSolverDirectRocSolverRf.hpp b/resolve/LinSolverDirectRocSolverRf.hpp
new file mode 100644
index 00000000..5804393f
--- /dev/null
+++ b/resolve/LinSolverDirectRocSolverRf.hpp
@@ -0,0 +1,59 @@
+#pragma once
+#include "Common.hpp"
+#include "LinSolver.hpp"
+#include <resolve/MemoryUtils.hpp>
+#include <resolve/workspace/LinAlgWorkspace.hpp>
+
+#include <rocsparse/rocsparse.h>
+#include <rocblas/rocblas.h>
+#include <rocsolver/rocsolver.h>
+#include <hip/hip_runtime.h>
+#include <roctracer/roctx.h>
+
+namespace ReSolve 
+{
+  // Forward declaration of vector::Vector class
+  namespace vector
+  {
+    class Vector;
+  }
+
+  // Forward declaration of matrix::Sparse class
+  namespace matrix
+  {
+    class Sparse;
+  }
+  
+  class LinSolverDirectRocSolverRf : public LinSolverDirect 
+  {
+    using vector_type = vector::Vector;
+    
+    public: 
+      LinSolverDirectRocSolverRf(LinAlgWorkspaceHIP* workspace);
+      ~LinSolverDirectRocSolverRf();
+      
+      int setup(matrix::Sparse* A, matrix::Sparse* L, matrix::Sparse* U, index_type* P, index_type* Q, vector_type* rhs);
+       
+      int refactorize();
+      int solve(vector_type* rhs, vector_type* x);
+      int solve(vector_type* rhs);// the solution is returned IN RHS (rhs is overwritten)
+    
+      int setSolveMode(int mode); // should probably be enum 
+      int getSolveMode(); //should be enum too
+
+    private:
+      rocblas_status status_rocblas_; ///< Status of the most recent rocSOLVER call
+      
+      index_type* d_P_; ///< Device copy of the P ordering passed to setup()
+      index_type* d_Q_; ///< Device copy of the Q ordering passed to setup()
+
+      MemoryHandler mem_; ///< Device memory manager object
+      LinAlgWorkspaceHIP* workspace_; 
+
+      // to be exported to matrix handler in a later time
+      void addFactors(matrix::Sparse* L, matrix::Sparse* U); //create L+U from separate L, U factors
+      rocsolver_rfinfo infoM_;
+      matrix::Sparse* M_;//the matrix that contains added factors
+      int solve_mode_;
+  };
+}
diff --git a/resolve/LinSolverIterativeFGMRES.cpp b/resolve/LinSolverIterativeFGMRES.cpp
index 0bf1720f..40fdb22c 100644
--- a/resolve/LinSolverIterativeFGMRES.cpp
+++ b/resolve/LinSolverIterativeFGMRES.cpp
@@ -10,8 +10,9 @@ namespace ReSolve
 {
   using out = io::Logger;
 
-  LinSolverIterativeFGMRES::LinSolverIterativeFGMRES()
+  LinSolverIterativeFGMRES::LinSolverIterativeFGMRES(std::string memspace)
   {
+    memspace_ = memspace;
     this->matrix_handler_ = nullptr;
     this->vector_handler_ = nullptr;
     tol_ = 1e-14; //default
@@ -25,8 +26,10 @@ namespace ReSolve
 
   LinSolverIterativeFGMRES::LinSolverIterativeFGMRES(MatrixHandler* matrix_handler,
                                                      VectorHandler* vector_handler,
-                                                     GramSchmidt*   gs)
+                                                     GramSchmidt*   gs,
+                                                     std::string memspace)
   {
+    memspace_ = memspace;
     this->matrix_handler_ = matrix_handler;
     this->vector_handler_ = vector_handler;
     this->GS_ = gs;
@@ -46,8 +49,10 @@ namespace ReSolve
                                                      index_type conv_cond,
                                                      MatrixHandler* matrix_handler,
                                                      VectorHandler* vector_handler,
-                                                     GramSchmidt*   gs)
+                                                     GramSchmidt*   gs,
+                                                     std::string memspace)
   {
+    memspace_ = memspace;
     this->matrix_handler_ = matrix_handler;
     this->vector_handler_ = vector_handler;
     this->GS_ = gs;
@@ -113,12 +118,15 @@ namespace ReSolve
     vector_type* vec_v = new vector_type(n_);
     vector_type* vec_z = new vector_type(n_);
     //V[0] = b-A*x_0
+    //debug
+    d_Z_->setToZero(memory::DEVICE);
+    d_V_->setToZero(memory::DEVICE);
 
     rhs->deepCopyVectorData(d_V_->getData(memory::DEVICE), 0, memory::DEVICE);  
-    matrix_handler_->matvec(A_, x, d_V_, &MINUSONE, &ONE, "csr", "cuda"); 
+    matrix_handler_->matvec(A_, x, d_V_, &MINUSONE, &ONE, "csr", memspace_); 
     rnorm = 0.0;
-    bnorm = vector_handler_->dot(rhs, rhs, "cuda");
-    rnorm = vector_handler_->dot(d_V_, d_V_, "cuda");
+    bnorm = vector_handler_->dot(rhs, rhs, memspace_);
+    rnorm = vector_handler_->dot(d_V_, d_V_, memspace_);
 
     //rnorm = ||V_1||
     rnorm = sqrt(rnorm);
@@ -154,7 +162,7 @@ namespace ReSolve
 
       // normalize first vector
       t = 1.0 / rnorm;
-      vector_handler_->scal(&t, d_V_, "cuda");
+      vector_handler_->scal(&t, d_V_, memspace_);
       // initialize norm history
       h_rs_[0] = rnorm;
       i = -1;
@@ -175,11 +183,11 @@ namespace ReSolve
 
         vec_v->setData( d_V_->getVectorData(i + 1, memory::DEVICE), memory::DEVICE);
 
-        matrix_handler_->matvec(A_, vec_z, vec_v, &ONE, &ZERO,"csr", "cuda"); 
+        matrix_handler_->matvec(A_, vec_z, vec_v, &ONE, &ZERO,"csr", memspace_); 
 
         // orthogonalize V[i+1], form a column of h_H_
 
-        GS_->orthogonalize(n_, d_V_, h_H_, i, "cuda");  ;
+        GS_->orthogonalize(n_, d_V_, h_H_, i, memspace_);  ;
         if(i != 0) {
           for(int k = 1; k <= i; k++) {
             k1 = k - 1;
@@ -188,7 +196,6 @@ namespace ReSolve
             h_H_[i * (restart_ + 1) + k] = -h_s_[k1] * t + h_c_[k1] * h_H_[i * (restart_ + 1) + k];
           }
         } // if i!=0
-
         double Hii = h_H_[i * (restart_ + 1) + i];
         double Hii1 = h_H_[(i) * (restart_ + 1) + i + 1];
         double gam = sqrt(Hii * Hii + Hii1 * Hii1);
@@ -229,7 +236,7 @@ namespace ReSolve
       // get solution
       for(j = 0; j <= i; j++) {
         vec_z->setData( d_Z_->getVectorData(j, memory::DEVICE), memory::DEVICE);
-        vector_handler_->axpy(&h_rs_[j], vec_z, x, "cuda");
+        vector_handler_->axpy(&h_rs_[j], vec_z, x, memspace_);
       }
 
       /* test solution */
@@ -240,8 +247,8 @@ namespace ReSolve
       }
 
       rhs->deepCopyVectorData(d_V_->getData(memory::DEVICE), 0, memory::DEVICE);  
-      matrix_handler_->matvec(A_, x, d_V_, &MINUSONE, &ONE,"csr", "cuda"); 
-      rnorm = vector_handler_->dot(d_V_, d_V_, "cuda");
+      matrix_handler_->matvec(A_, x, d_V_, &MINUSONE, &ONE,"csr", memspace_); 
+      rnorm = vector_handler_->dot(d_V_, d_V_, memspace_);
       // rnorm = ||V_1||
       rnorm = sqrt(rnorm);
 
@@ -253,9 +260,9 @@ namespace ReSolve
     return 0;
   }
 
-  int  LinSolverIterativeFGMRES::setupPreconditioner(std::string name, LinSolverDirect* LU_solver)
+  int  LinSolverIterativeFGMRES::setupPreconditioner(std::string type, LinSolverDirect* LU_solver)
   {
-    if (name != "CuSolverRf") {
+    if (type != "LU") {
       out::warning() << "Only cusolverRf tri solve can be used as a preconditioner at this time." << std::endl;
       return 1;
     } else {
@@ -308,7 +315,7 @@ namespace ReSolve
   int  LinSolverIterativeFGMRES::resetMatrix(matrix::Sparse* new_matrix)
   {
     A_ = new_matrix;
-    matrix_handler_->setValuesChanged(true, "cuda");
+    matrix_handler_->setValuesChanged(true, memspace_);
     return 0;
   }
 
diff --git a/resolve/LinSolverIterativeFGMRES.hpp b/resolve/LinSolverIterativeFGMRES.hpp
index 8b2c722d..a9fc5058 100644
--- a/resolve/LinSolverIterativeFGMRES.hpp
+++ b/resolve/LinSolverIterativeFGMRES.hpp
@@ -13,17 +13,19 @@ namespace ReSolve
     using vector_type = vector::Vector;
 
     public:
-    LinSolverIterativeFGMRES();
+    LinSolverIterativeFGMRES(std::string memspace = "cuda");
     LinSolverIterativeFGMRES( MatrixHandler* matrix_handler,
                               VectorHandler* vector_handler,
-                              GramSchmidt*   gs);
+                              GramSchmidt*   gs,
+                              std::string memspace = "cuda");
     LinSolverIterativeFGMRES(index_type restart,
                              real_type  tol,
                              index_type maxit,
                              index_type conv_cond,
                              MatrixHandler* matrix_handler,
                              VectorHandler* vector_handler,
-                             GramSchmidt*   gs);
+                             GramSchmidt*   gs,
+                             std::string memspace = "cuda");
     ~LinSolverIterativeFGMRES();
 
     int solve(vector_type* rhs, vector_type* x);
@@ -48,6 +50,8 @@ namespace ReSolve
     private:
     //remember matrix handler and vector handler are inherited.
 
+    std::string memspace_;
+
     real_type tol_;
     index_type maxit_;
     index_type restart_;
diff --git a/resolve/matrix/MatrixHandler.cpp b/resolve/matrix/MatrixHandler.cpp
index 0a7124da..b2d4339f 100644
--- a/resolve/matrix/MatrixHandler.cpp
+++ b/resolve/matrix/MatrixHandler.cpp
@@ -295,7 +295,6 @@ namespace ReSolve {
     } else if (memspace == "cpu") {
         return cpuImpl_->matvec(A, vec_x, vec_result, alpha, beta, matrixFormat);
     } else if (memspace == "hip") {
-      printf("about to run mv");
         return hipImpl_->matvec(A, vec_x, vec_result, alpha, beta, matrixFormat);
     } else {
         out::error() << "Support for device " << memspace << " not implemented (yet)" << std::endl;
diff --git a/resolve/matrix/MatrixHandlerHip.cpp b/resolve/matrix/MatrixHandlerHip.cpp
index b4f8e483..ff10e973 100644
--- a/resolve/matrix/MatrixHandlerHip.cpp
+++ b/resolve/matrix/MatrixHandlerHip.cpp
@@ -49,7 +49,6 @@ namespace ReSolve {
       
       if (!workspaceHIP->matvecSetup()) {
         //setup first, allocate, etc.
-        
         rocsparse_create_mat_descr(&(descrA));
         rocsparse_set_mat_index_base(descrA, rocsparse_index_base_zero);
         rocsparse_set_mat_type(descrA, rocsparse_matrix_type_general);
@@ -69,6 +68,8 @@ namespace ReSolve {
         error_sum += status;
         mem_.deviceSynchronize();
 
+        workspaceHIP->setSpmvMatrixDescriptor(descrA);
+        workspaceHIP->setSpmvMatrixInfo(infoA);
         workspaceHIP->matvecSetupDone();
       } 
       
diff --git a/resolve/matrix/MatrixHandlerHip.hpp b/resolve/matrix/MatrixHandlerHip.hpp
index 7f06f3bd..37f11a7b 100644
--- a/resolve/matrix/MatrixHandlerHip.hpp
+++ b/resolve/matrix/MatrixHandlerHip.hpp
@@ -38,11 +38,11 @@ namespace ReSolve {
       int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr);
       
       virtual int matvec(matrix::Sparse* A,
-                 vector_type* vec_x,
-                 vector_type* vec_result,
-                 const real_type* alpha,
-                 const real_type* beta,
-                 std::string matrix_type);
+                         vector_type* vec_x,
+                         vector_type* vec_result,
+                         const real_type* alpha,
+                         const real_type* beta,
+                         std::string matrix_type);
       
       virtual int Matrix1Norm(matrix::Sparse *A, real_type* norm);
       
diff --git a/resolve/vector/Vector.cpp b/resolve/vector/Vector.cpp
index 0a62bd02..3b4f9e72 100644
--- a/resolve/vector/Vector.cpp
+++ b/resolve/vector/Vector.cpp
@@ -140,7 +140,8 @@ namespace ReSolve { namespace vector {
   real_type* Vector::getData(index_type i, memory::MemorySpace memspace)
   {
     if ((memspace == memory::HOST) && (cpu_updated_ == false) && (gpu_updated_ == true )) {
-      copyData(memspace, memory::HOST);
+      // remember IN FIRST OUT SECOND!!!
+      copyData(memory::DEVICE, memspace);  
       owns_cpu_data_ = true;
     } 
 
@@ -174,7 +175,6 @@ namespace ReSolve { namespace vector {
       //allocate first
       mem_.allocateArrayOnDevice(&d_data_, n_ * k_);
     } 
-
     switch(control)  {
       case 0: //cpu->cuda
         mem_.copyArrayHostToDevice(d_data_, h_data_, n_current_ * k_);
diff --git a/tests/functionality/CMakeLists.txt b/tests/functionality/CMakeLists.txt
index a6652c26..acc5ce60 100644
--- a/tests/functionality/CMakeLists.txt
+++ b/tests/functionality/CMakeLists.txt
@@ -26,6 +26,15 @@ if(RESOLVE_USE_CUDA)
 
 endif(RESOLVE_USE_CUDA)
 
+
+if(RESOLVE_USE_HIP)
+        
+  # Build KLU+rocsolver test
+  add_executable(rocsolver_rf_test.exe testKLU_RocSolver.cpp)
+  target_link_libraries(rocsolver_rf_test.exe PRIVATE ReSolve)
+
+endif(RESOLVE_USE_HIP)
+
 # Install tests
 set(installable_tests klu_klu_test.exe)
 
@@ -36,6 +45,11 @@ if(RESOLVE_USE_CUDA)
                         klu_glu_test.exe)
 endif(RESOLVE_USE_CUDA)
 
+if(RESOLVE_USE_HIP)
+  set(installable_tests ${installable_tests}
+                        rocsolver_rf_test.exe)
+endif(RESOLVE_USE_HIP)
+
 install(TARGETS ${installable_tests} 
         RUNTIME DESTINATION bin/resolve/tests/functionality)
 
@@ -50,3 +64,7 @@ if(RESOLVE_USE_CUDA)
   add_test(NAME klu_rf_fgmres_test COMMAND $<TARGET_FILE:klu_rf_fgmres_test.exe> "${test_data_dir}")
   add_test(NAME klu_glu_test COMMAND $<TARGET_FILE:klu_glu_test.exe> "${test_data_dir}")
 endif(RESOLVE_USE_CUDA)
+
+if(RESOLVE_USE_HIP)
+  add_test(NAME rocsolver_rf_test  COMMAND $<TARGET_FILE:rocsolver_rf_test.exe>  "${test_data_dir}")
+endif(RESOLVE_USE_HIP)
diff --git a/tests/functionality/testKLU_Rf_FGMRES.cpp b/tests/functionality/testKLU_Rf_FGMRES.cpp
index 6a81dac1..2e582e02 100644
--- a/tests/functionality/testKLU_Rf_FGMRES.cpp
+++ b/tests/functionality/testKLU_Rf_FGMRES.cpp
@@ -213,7 +213,7 @@ int main(int argc, char *argv[])
   error_sum += status;
   
   FGMRES->resetMatrix(A);
-  status = FGMRES->setupPreconditioner("CuSolverRf", Rf);
+  status = FGMRES->setupPreconditioner("LU", Rf);
   error_sum += status;
 
   vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
diff --git a/tests/functionality/testKLU_RocSolver.cpp b/tests/functionality/testKLU_RocSolver.cpp
new file mode 100644
index 00000000..9fd43ac1
--- /dev/null
+++ b/tests/functionality/testKLU_RocSolver.cpp
@@ -0,0 +1,251 @@
+#include <string>
+#include <iostream>
+#include <iomanip>
+
+#include <resolve/vector/Vector.hpp>
+#include <resolve/matrix/io.hpp>
+#include <resolve/matrix/Coo.hpp>
+#include <resolve/matrix/Csr.hpp>
+#include <resolve/matrix/MatrixHandler.hpp>
+#include <resolve/vector/VectorHandler.hpp>
+#include <resolve/LinSolverDirectKLU.hpp>
+#include <resolve/LinSolverDirectRocSolverRf.hpp>
+#include <resolve/workspace/LinAlgWorkspace.hpp>
+//author: KS
+//functionality test to check whether rocsolver_rf works correctly.
+
+using namespace ReSolve::constants;
+
+int main(int argc, char *argv[])
+{
+  // Use ReSolve data types.
+  using index_type = ReSolve::index_type;
+  using real_type  = ReSolve::real_type;
+  using vector_type = ReSolve::vector::Vector;
+  using matrix_type = ReSolve::matrix::Sparse;
+
+  //we want error sum to be 0 at the end
+  //that means PASS.
+  //otherwise it is a FAIL.
+  int error_sum = 0;
+  int status = 0;
+
+  ReSolve::LinAlgWorkspaceHIP* workspace_HIP = new ReSolve::LinAlgWorkspaceHIP();
+  workspace_HIP->initializeHandles();
+  ReSolve::MatrixHandler* matrix_handler =  new ReSolve::MatrixHandler(workspace_HIP);
+  ReSolve::VectorHandler* vector_handler =  new ReSolve::VectorHandler(workspace_HIP);
+
+  ReSolve::LinSolverDirectKLU* KLU = new ReSolve::LinSolverDirectKLU;
+  KLU->setupParameters(1, 0.1, false);
+
+  ReSolve::LinSolverDirectRocSolverRf* Rf = new ReSolve::LinSolverDirectRocSolverRf(workspace_HIP);
+  // Input to this code is location of `data` directory where matrix files are stored
+  const std::string data_path = (argc == 2) ? argv[1] : "./";
+
+
+  std::string matrixFileName1 = data_path + "data/matrix_ACTIVSg200_AC_10.mtx";
+  std::string matrixFileName2 = data_path + "data/matrix_ACTIVSg200_AC_11.mtx";
+
+  std::string rhsFileName1 = data_path + "data/rhs_ACTIVSg200_AC_10.mtx.ones";
+  std::string rhsFileName2 = data_path + "data/rhs_ACTIVSg200_AC_11.mtx.ones";
+
+  // Read first matrix
+  std::ifstream mat1(matrixFileName1);
+  if(!mat1.is_open())
+  {
+    std::cout << "Failed to open file " << matrixFileName1 << "\n";
+    return -1;
+  }
+  ReSolve::matrix::Coo* A_coo = ReSolve::io::readMatrixFromFile(mat1);
+  ReSolve::matrix::Csr* A = new ReSolve::matrix::Csr(A_coo->getNumRows(),
+                                                     A_coo->getNumColumns(),
+                                                     A_coo->getNnz(),
+                                                     A_coo->symmetric(),
+                                                     A_coo->expanded());
+  mat1.close();
+
+  // Read first rhs vector
+  std::ifstream rhs1_file(rhsFileName1);
+  if(!rhs1_file.is_open())
+  {
+    std::cout << "Failed to open file " << rhsFileName1 << "\n";
+    return -1;
+  }
+  real_type* rhs = ReSolve::io::readRhsFromFile(rhs1_file);
+  real_type* x   = new real_type[A->getNumRows()];
+  vector_type* vec_rhs = new vector_type(A->getNumRows());
+  vector_type* vec_x   = new vector_type(A->getNumRows());
+  vec_x->allocate(ReSolve::memory::HOST);//for KLU
+  vec_x->allocate(ReSolve::memory::DEVICE);
+  vector_type* vec_r   = new vector_type(A->getNumRows());
+  rhs1_file.close();
+
+  // Convert first matrix to CSR format
+  matrix_handler->coo2csr(A_coo, A, "cpu");
+  vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST);
+  vec_rhs->setDataUpdated(ReSolve::memory::HOST);
+
+  // Solve the first system using KLU
+  status = KLU->setup(A);
+  error_sum += status;
+
+  status = KLU->analyze();
+  error_sum += status;
+
+  status = KLU->factorize();
+  error_sum += status;
+
+  status = KLU->solve(vec_rhs, vec_x);
+  error_sum += status;
+
+  std::cout<<"KLU solve status: "<<status<<std::endl;      
+
+  matrix_type* L = KLU->getLFactor();
+  matrix_type* U = KLU->getUFactor();
+  if (L == nullptr) {printf("ERROR");}
+  index_type* P = KLU->getPOrdering();
+  index_type* Q = KLU->getQOrdering();
+  vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+  vec_rhs->setDataUpdated(ReSolve::memory::DEVICE);
+
+  status = Rf->setup(A, L, U, P, Q, vec_rhs); 
+  error_sum += status;
+  std::cout<<"Rf setup status: "<<status<<std::endl;      
+
+  status = Rf->refactorize();
+  error_sum += status;
+  vector_type* vec_test;
+  vector_type* vec_diff;
+  vec_test  = new vector_type(A->getNumRows());
+  vec_diff  = new vector_type(A->getNumRows());
+  real_type* x_data = new real_type[A->getNumRows()];
+  for (int i=0; i<A->getNumRows(); ++i){
+    x_data[i] = 1.0;
+  }
+
+  vec_test->setData(x_data, ReSolve::memory::HOST);
+  vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+  vec_diff->update(x_data, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+
+  // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, "hip"));
+  matrix_handler->setValuesChanged(true, "hip");
+  status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr","hip"); 
+  error_sum += status;
+
+  real_type normRmatrix1 = sqrt(vector_handler->dot(vec_r, vec_r, "hip"));
+
+  //for testing only - control
+
+  real_type normXtrue = sqrt(vector_handler->dot(vec_x, vec_x, "hip"));
+  real_type normB1 = sqrt(vector_handler->dot(vec_rhs, vec_rhs, "hip"));
+
+  //compute x-x_true
+  vector_handler->axpy(&MINUSONE, vec_x, vec_diff, "hip");
+  //evaluate its norm
+  real_type normDiffMatrix1 = sqrt(vector_handler->dot(vec_diff, vec_diff, "hip"));
+
+  //compute the residual using exact solution
+  vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+  status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE,"csr", "hip"); 
+  error_sum += status;
+  real_type exactSol_normRmatrix1 = sqrt(vector_handler->dot(vec_r, vec_r, "hip"));
+  //evaluate the residual ON THE CPU using COMPUTED solution
+
+  vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST);
+
+  status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cpu");
+  error_sum += status;
+
+  real_type normRmatrix1CPU = sqrt(vector_handler->dot(vec_r, vec_r, "hip"));
+
+  std::cout<<"Results (first matrix): "<<std::endl<<std::endl;
+  std::cout<<"\t ||b-A*x||_2                 : " << std::setprecision(16) << normRmatrix1    << " (residual norm)" << std::endl;
+  std::cout<<"\t ||b-A*x||_2  (CPU)          : " << std::setprecision(16) << normRmatrix1CPU << " (residual norm)" << std::endl;
+  std::cout<<"\t ||b-A*x||_2/||b||_2         : " << normRmatrix1/normB1   << " (scaled residual norm)"             << std::endl;
+  std::cout<<"\t ||x-x_true||_2              : " << normDiffMatrix1       << " (solution error)"                   << std::endl;
+  std::cout<<"\t ||x-x_true||_2/||x_true||_2 : " << normDiffMatrix1/normXtrue << " (scaled solution error)"        << std::endl;
+  std::cout<<"\t ||b-A*x_exact||_2           : " << exactSol_normRmatrix1 << " (control; residual norm with exact solution)\n\n";
+
+
+  // Load the second matrix
+  std::ifstream mat2(matrixFileName2);
+  if(!mat2.is_open())
+  {
+    std::cout << "Failed to open file " << matrixFileName2 << "\n";
+    return -1;
+  }
+  ReSolve::io::readAndUpdateMatrix(mat2, A_coo);
+  mat2.close();
+
+  // Load the second rhs vector
+  std::ifstream rhs2_file(rhsFileName2);
+  if(!rhs2_file.is_open())
+  {
+    std::cout << "Failed to open file " << rhsFileName2 << "\n";
+    return -1;
+  }
+  ReSolve::io::readAndUpdateRhs(rhs2_file, &rhs);
+  rhs2_file.close();
+
+  matrix_handler->coo2csr(A_coo, A, "hip");
+  vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+
+  // this hangs up
+  status = Rf->refactorize();
+  error_sum += status;
+
+  std::cout<<"rocSolverRf refactorization status: "<<status<<std::endl;      
+  status = Rf->solve(vec_rhs, vec_x);
+  error_sum += status;
+
+  vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+  matrix_handler->setValuesChanged(true, "hip");
+
+  status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "hip"); 
+  error_sum += status;
+
+  real_type normRmatrix2 = sqrt(vector_handler->dot(vec_r, vec_r, "hip"));
+
+  //for testing only - control
+  real_type normB2 = sqrt(vector_handler->dot(vec_rhs, vec_rhs, "hip"));
+  //compute x-x_true
+  vec_diff->update(x_data, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+  vector_handler->axpy(&MINUSONE, vec_x, vec_diff, "hip");
+  //evaluate its norm
+  real_type normDiffMatrix2 = sqrt(vector_handler->dot(vec_diff, vec_diff, "hip"));
+
+  //compute the residual using exact solution
+  vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+  status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE, "csr", "hip"); 
+  error_sum += status;
+  real_type exactSol_normRmatrix2 = sqrt(vector_handler->dot(vec_r, vec_r, "hip"));
+
+  std::cout<<"Results (second matrix): "<<std::endl<<std::endl;
+  std::cout<<"\t ||b-A*x||_2                 : "<<normRmatrix2<<" (residual norm)"<<std::endl;
+  std::cout<<"\t ||b-A*x||_2/||b||_2         : "<<normRmatrix2/normB2<<" (scaled residual norm)"<<std::endl;
+  std::cout<<"\t ||x-x_true||_2              : "<<normDiffMatrix2<<" (solution error)"<<std::endl;
+  std::cout<<"\t ||x-x_true||_2/||x_true||_2 : "<<normDiffMatrix2/normXtrue<<" (scaled solution error)"<<std::endl;
+  std::cout<<"\t ||b-A*x_exact||_2           : "<<exactSol_normRmatrix2<<" (control; residual norm with exact solution)"<<std::endl<<std::endl;
+
+
+
+  if ((error_sum == 0) && (normRmatrix1/normB1 < 1e-16 ) && (normRmatrix2/normB2 < 1e-16)) {
+    std::cout<<"Test 3 (KLU with rocSolverRf refactorization) PASSED"<<std::endl;
+  } else {
+    std::cout<<"Test 3 (KLU with rocSolverRF refactorization) FAILED, error sum: "<<error_sum<<std::endl;
+    if (error_sum == 0) { error_sum = 1; } // residual-only (or NaN) failure: force a nonzero exit code so CTest reports it
+  }
+
+  //now DELETE
+  delete A;
+  delete KLU;
+  delete [] x;
+  delete [] rhs;
+  delete vec_r;
+  delete vec_x;
+  delete workspace_HIP;
+  delete matrix_handler;
+  delete vector_handler;
+  return error_sum;
+}
+

From 219e645692610a150f3038f87b6575c55c31dbf7 Mon Sep 17 00:00:00 2001
From: Slaven Peles <peless@ornl.gov>
Date: Wed, 1 Nov 2023 21:37:40 -0400
Subject: [PATCH 07/12] Apparently we need to add rocsolver library to the list
 of dependencies.

---
 cmake/ReSolveFindHipLibraries.cmake | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cmake/ReSolveFindHipLibraries.cmake b/cmake/ReSolveFindHipLibraries.cmake
index 4cb0c443..d0d22395 100644
--- a/cmake/ReSolveFindHipLibraries.cmake
+++ b/cmake/ReSolveFindHipLibraries.cmake
@@ -12,6 +12,7 @@ target_link_libraries(resolve_hip INTERFACE
   hip::device
   roc::rocblas
   roc::rocsparse
+  rocsolver
 )
 
 install(TARGETS resolve_hip EXPORT ReSolveTargets)

From 05a5b2e7eb06a07981a8344c9aebf80d011d0cbb Mon Sep 17 00:00:00 2001
From: pelesh <peless@ornl.gov>
Date: Thu, 2 Nov 2023 11:36:28 -0400
Subject: [PATCH 08/12] A working alternative triangular solver (faster) for
 rocsolverrf (#56)

* a WORKING alternative triangular solver (faster) for rocsolverrf

---------

Co-authored-by: kswirydo <kasia.swirydowicz@gmail.com>
---
 examples/r_KLU_rocSolverRf_FGMRES.cpp  |   3 +-
 resolve/LinSolverDirectRocSolverRf.cpp | 213 ++++++++++++++++++++++++-
 resolve/LinSolverDirectRocSolverRf.hpp |  22 ++-
 resolve/hip/hipKernels.h               |  11 ++
 resolve/hip/hipKernels.hip             |  44 +++++
 resolve/hip/hipVectorKernels.hip       |   1 +
 6 files changed, 287 insertions(+), 7 deletions(-)

diff --git a/examples/r_KLU_rocSolverRf_FGMRES.cpp b/examples/r_KLU_rocSolverRf_FGMRES.cpp
index d2e5f7a6..45fe4681 100644
--- a/examples/r_KLU_rocSolverRf_FGMRES.cpp
+++ b/examples/r_KLU_rocSolverRf_FGMRES.cpp
@@ -131,6 +131,7 @@ int main(int argc, char *argv[])
       std::cout<<"KLU analysis status: "<<status<<std::endl;
       status = KLU->factorize();
       std::cout<<"KLU factorization status: "<<status<<std::endl;
+     
       status = KLU->solve(vec_rhs, vec_x);
       std::cout<<"KLU solve status: "<<status<<std::endl;      
       vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
@@ -149,6 +150,7 @@ int main(int argc, char *argv[])
         if (L == nullptr) {printf("ERROR");}
         index_type* P = KLU->getPOrdering();
         index_type* Q = KLU->getQOrdering();
+        Rf->setSolveMode(1);
         Rf->setup(A, L, U, P, Q, vec_rhs);
         Rf->refactorize();
         std::cout<<"about to set FGMRES" <<std::endl;
@@ -162,7 +164,6 @@ int main(int argc, char *argv[])
       std::cout<<"ROCSOLVER RF refactorization status: "<<status<<std::endl;      
       status = Rf->solve(vec_rhs, vec_x);
       std::cout<<"ROCSOLVER RF solve status: "<<status<<std::endl;      
-
       vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
       norm_b = vector_handler->dot(vec_r, vec_r, "hip");
       norm_b = sqrt(norm_b);
diff --git a/resolve/LinSolverDirectRocSolverRf.cpp b/resolve/LinSolverDirectRocSolverRf.cpp
index 5869756d..f9f73b4a 100644
--- a/resolve/LinSolverDirectRocSolverRf.cpp
+++ b/resolve/LinSolverDirectRocSolverRf.cpp
@@ -1,6 +1,7 @@
 #include <resolve/vector/Vector.hpp>
 #include <resolve/matrix/Csr.hpp>
 #include "LinSolverDirectRocSolverRf.hpp"
+#include <resolve/hip/hipKernels.h>
 
 namespace ReSolve 
 {
@@ -15,6 +16,12 @@ namespace ReSolve
   {
     mem_.deleteOnDevice(d_P_);
     mem_.deleteOnDevice(d_Q_);
+
+    mem_.deleteOnDevice(d_aux1_);
+    mem_.deleteOnDevice(d_aux2_);
+
+    delete L_csr_;
+    delete U_csr_;
   }
 
   int LinSolverDirectRocSolverRf::setup(matrix::Sparse* A, matrix::Sparse* L, matrix::Sparse* U, index_type* P, index_type* Q, vector_type* rhs)
@@ -56,7 +63,109 @@ namespace ReSolve
     mem_.deviceSynchronize();
     error_sum += status_rocblas_;
 
+    // tri solve setup
+    if (solve_mode_ == 1) { // fast mode
+      L_csr_ = new ReSolve::matrix::Csr(L->getNumRows(), L->getNumColumns(), L->getNnz());
+      U_csr_ = new ReSolve::matrix::Csr(U->getNumRows(), U->getNumColumns(), U->getNnz());
+
+      L_csr_->allocateMatrixData(ReSolve::memory::DEVICE); 
+      U_csr_->allocateMatrixData(ReSolve::memory::DEVICE); 
+
+      rocsparse_create_mat_descr(&(descr_L_));
+      rocsparse_set_mat_fill_mode(descr_L_, rocsparse_fill_mode_lower);
+      rocsparse_set_mat_index_base(descr_L_, rocsparse_index_base_zero);
+
+      rocsparse_create_mat_descr(&(descr_U_));
+      rocsparse_set_mat_index_base(descr_U_, rocsparse_index_base_zero);
+      rocsparse_set_mat_fill_mode(descr_U_, rocsparse_fill_mode_upper);
+
+      rocsparse_create_mat_info(&info_L_);
+      rocsparse_create_mat_info(&info_U_);
+
+      // local variables
+      size_t L_buffer_size;  
+      size_t U_buffer_size;  
+
+      status_rocblas_ = rocsolver_dcsrrf_splitlu(workspace_->getRocblasHandle(),
+                                                 n,
+                                                 M_->getNnzExpanded(),
+                                                 M_->getRowData(ReSolve::memory::DEVICE), 
+                                                 M_->getColData(ReSolve::memory::DEVICE), 
+                                                 M_->getValues(ReSolve::memory::DEVICE), //vals_, 
+                                                 L_csr_->getRowData(ReSolve::memory::DEVICE), 
+                                                 L_csr_->getColData(ReSolve::memory::DEVICE), 
+                                                 L_csr_->getValues(ReSolve::memory::DEVICE), //vals_, 
+                                                 U_csr_->getRowData(ReSolve::memory::DEVICE), 
+                                                 U_csr_->getColData(ReSolve::memory::DEVICE), 
+                                                 U_csr_->getValues(ReSolve::memory::DEVICE));
+
+      error_sum += status_rocblas_;
+
+      status_rocsparse_ = rocsparse_dcsrsv_buffer_size(workspace_->getRocsparseHandle(), 
+                                                       rocsparse_operation_none, 
+                                                       n, 
+                                                       L_csr_->getNnz(), 
+                                                       descr_L_,
+                                                       L_csr_->getValues(ReSolve::memory::DEVICE), //vals_, 
+                                                       L_csr_->getRowData(ReSolve::memory::DEVICE), 
+                                                       L_csr_->getColData(ReSolve::memory::DEVICE), 
+                                                       info_L_, 
+                                                       &L_buffer_size);
+      error_sum += status_rocsparse_;
+
+      // L_buffer_size is a size_t, so it is printed with %zu (printing it with %d is undefined behaviour).
+      printf("buffer size for L %zu status %d \n", L_buffer_size, static_cast<int>(status_rocsparse_));
+
+      mem_.allocateBufferOnDevice(&L_buffer_, L_buffer_size);
+      status_rocsparse_ = rocsparse_dcsrsv_buffer_size(workspace_->getRocsparseHandle(),
+                                                       rocsparse_operation_none,
+                                                       n,
+                                                       U_csr_->getNnz(),
+                                                       descr_U_,
+                                                       U_csr_->getValues(ReSolve::memory::DEVICE), //vals_,
+                                                       U_csr_->getRowData(ReSolve::memory::DEVICE),
+                                                       U_csr_->getColData(ReSolve::memory::DEVICE),
+                                                       info_U_,
+                                                       &U_buffer_size);
+      error_sum += status_rocsparse_;
+      // Allocate the work buffer for the U solve using the size queried above.
+      mem_.allocateBufferOnDevice(&U_buffer_, U_buffer_size);
+      printf("buffer size for U %zu status %d \n", U_buffer_size, static_cast<int>(status_rocsparse_));
+
+      status_rocsparse_ = rocsparse_dcsrsv_analysis(workspace_->getRocsparseHandle(), 
+                                                    rocsparse_operation_none, 
+                                                    n, 
+                                                    L_csr_->getNnz(), 
+                                                    descr_L_,
+                                                    L_csr_->getValues(ReSolve::memory::DEVICE), //vals_, 
+                                                    L_csr_->getRowData(ReSolve::memory::DEVICE), 
+                                                    L_csr_->getColData(ReSolve::memory::DEVICE), 
+                                                    info_L_,   
+                                                    rocsparse_analysis_policy_force,
+                                                    rocsparse_solve_policy_auto,
+                                                    L_buffer_);
+      error_sum += status_rocsparse_;
+      if (status_rocsparse_!=0)printf("status after analysis 1 %d \n", status_rocsparse_);
+      status_rocsparse_ = rocsparse_dcsrsv_analysis(workspace_->getRocsparseHandle(), 
+                                                    rocsparse_operation_none, 
+                                                    n, 
+                                                    U_csr_->getNnz(), 
+                                                    descr_U_,
+                                                    U_csr_->getValues(ReSolve::memory::DEVICE), //vals_, 
+                                                    U_csr_->getRowData(ReSolve::memory::DEVICE), 
+                                                    U_csr_->getColData(ReSolve::memory::DEVICE), 
+                                                    info_U_,
+                                                    rocsparse_analysis_policy_force,
+                                                    rocsparse_solve_policy_auto,
+                                                    U_buffer_);
+      error_sum += status_rocsparse_;
+      if (status_rocsparse_!=0)printf("status after analysis 2 %d \n", status_rocsparse_);
+      //allocate aux data
+
+      mem_.allocateArrayOnDevice(&d_aux1_,n); 
+      mem_.allocateArrayOnDevice(&d_aux2_,n); 
 
+    }
     return error_sum;
   }
 
@@ -78,15 +187,38 @@ namespace ReSolve
                                                  d_Q_,
                                                  infoM_);
 
+
     mem_.deviceSynchronize();
     error_sum += status_rocblas_;
 
+    if (solve_mode_ == 1) {
+      //split M, fill L and U with correct values
+printf("solve mode 1, splitting the factors again \n");
+      status_rocblas_ = rocsolver_dcsrrf_splitlu(workspace_->getRocblasHandle(),
+                                                 A_->getNumRows(),
+                                                 M_->getNnzExpanded(),
+                                                 M_->getRowData(ReSolve::memory::DEVICE), 
+                                                 M_->getColData(ReSolve::memory::DEVICE), 
+                                                 M_->getValues(ReSolve::memory::DEVICE), //vals_, 
+                                                 L_csr_->getRowData(ReSolve::memory::DEVICE), 
+                                                 L_csr_->getColData(ReSolve::memory::DEVICE), 
+                                                 L_csr_->getValues(ReSolve::memory::DEVICE), //vals_, 
+                                                 U_csr_->getRowData(ReSolve::memory::DEVICE), 
+                                                 U_csr_->getColData(ReSolve::memory::DEVICE), 
+                                                 U_csr_->getValues(ReSolve::memory::DEVICE));
+
+      mem_.deviceSynchronize();
+      error_sum += status_rocblas_;
+
+    }
+
     return error_sum; 
   }
 
   // solution is returned in RHS
   int LinSolverDirectRocSolverRf::solve(vector_type* rhs)
   {
+    int error_sum = 0;
     if (solve_mode_ == 0) {
       mem_.deviceSynchronize();
       status_rocblas_ =  rocsolver_dcsrrf_solve(workspace_->getRocblasHandle(),
@@ -104,15 +236,51 @@ namespace ReSolve
       mem_.deviceSynchronize();
     } else {
       // not implemented yet
+      permuteVectorP(A_->getNumRows(), d_P_, rhs->getData(ReSolve::memory::DEVICE), d_aux1_);
+      mem_.deviceSynchronize();
+      status_rocsparse_ = rocsparse_dcsrsv_solve(workspace_->getRocsparseHandle(),
+                                                 rocsparse_operation_none,
+                                                 A_->getNumRows(),
+                                                 L_csr_->getNnz(),
+                                                 &(constants::ONE),
+                                                 descr_L_,
+                                                 L_csr_->getValues(ReSolve::memory::DEVICE), //vals_,
+                                                 L_csr_->getRowData(ReSolve::memory::DEVICE),
+                                                 L_csr_->getColData(ReSolve::memory::DEVICE),
+                                                 info_L_,
+                                                 d_aux1_,
+                                                 d_aux2_, //result
+                                                 rocsparse_solve_policy_auto,
+                                                 L_buffer_);
+      error_sum += status_rocsparse_;
+      // U solve must use U's own descriptor (descr_U_); each call's status is captured before it is summed.
+      status_rocsparse_ = rocsparse_dcsrsv_solve(workspace_->getRocsparseHandle(),
+                                                 rocsparse_operation_none,
+                                                 A_->getNumRows(),
+                                                 U_csr_->getNnz(),
+                                                 &(constants::ONE),
+                                                 descr_U_,
+                                                 U_csr_->getValues(ReSolve::memory::DEVICE), //vals_,
+                                                 U_csr_->getRowData(ReSolve::memory::DEVICE),
+                                                 U_csr_->getColData(ReSolve::memory::DEVICE),
+                                                 info_U_,
+                                                 d_aux2_, //input
+                                                 d_aux1_,//result
+                                                 rocsparse_solve_policy_auto,
+                                                 U_buffer_);
+      error_sum += status_rocsparse_;
+
+      permuteVectorQ(A_->getNumRows(), d_Q_,d_aux1_,rhs->getData(ReSolve::memory::DEVICE));
+      mem_.deviceSynchronize();
     }
-    return status_rocblas_;
+    return (solve_mode_ == 0) ? static_cast<int>(status_rocblas_) : error_sum; // mode 0 never adds its rocsolver status to error_sum
   }
 
   int LinSolverDirectRocSolverRf::solve(vector_type* rhs, vector_type* x)
   {
     x->update(rhs->getData(ReSolve::memory::DEVICE), ReSolve::memory::DEVICE, ReSolve::memory::DEVICE);
     x->setDataUpdated(ReSolve::memory::DEVICE);
-
+    int error_sum = 0;
     if (solve_mode_ == 0) {
       mem_.deviceSynchronize();
       status_rocblas_ =  rocsolver_dcsrrf_solve(workspace_->getRocblasHandle(),
@@ -127,11 +295,50 @@ namespace ReSolve
                                                 x->getData(ReSolve::memory::DEVICE),
                                                 A_->getNumRows(),
                                                 infoM_);
+      error_sum += status_rocblas_;
       mem_.deviceSynchronize();
     } else {
       // not implemented yet
+
+      permuteVectorP(A_->getNumRows(), d_P_, rhs->getData(ReSolve::memory::DEVICE), d_aux1_);
+      mem_.deviceSynchronize();
+
+      status_rocsparse_ = rocsparse_dcsrsv_solve(workspace_->getRocsparseHandle(),
+                                                 rocsparse_operation_none,
+                                                 A_->getNumRows(),
+                                                 L_csr_->getNnz(),
+                                                 &(constants::ONE),
+                                                 descr_L_,
+                                                 L_csr_->getValues(ReSolve::memory::DEVICE), //vals_,
+                                                 L_csr_->getRowData(ReSolve::memory::DEVICE),
+                                                 L_csr_->getColData(ReSolve::memory::DEVICE),
+                                                 info_L_,
+                                                 d_aux1_,
+                                                 d_aux2_, //result
+                                                 rocsparse_solve_policy_auto,
+                                                 L_buffer_);
+      error_sum += status_rocsparse_;
+      // Capture the status of the U solve as well, so error_sum reflects this call and not a stale value.
+      status_rocsparse_ = rocsparse_dcsrsv_solve(workspace_->getRocsparseHandle(),
+                                                 rocsparse_operation_none,
+                                                 A_->getNumRows(),
+                                                 U_csr_->getNnz(),
+                                                 &(constants::ONE),
+                                                 descr_U_,
+                                                 U_csr_->getValues(ReSolve::memory::DEVICE), //vals_,
+                                                 U_csr_->getRowData(ReSolve::memory::DEVICE),
+                                                 U_csr_->getColData(ReSolve::memory::DEVICE),
+                                                 info_U_,
+                                                 d_aux2_, //input
+                                                 d_aux1_,//result
+                                                 rocsparse_solve_policy_auto,
+                                                 U_buffer_);
+      error_sum += status_rocsparse_;
+
+      permuteVectorQ(A_->getNumRows(), d_Q_,d_aux1_,x->getData(ReSolve::memory::DEVICE));
+      mem_.deviceSynchronize();
     }
-    return status_rocblas_;
+    return error_sum;
   }
 
   int LinSolverDirectRocSolverRf::setSolveMode(int mode)
diff --git a/resolve/LinSolverDirectRocSolverRf.hpp b/resolve/LinSolverDirectRocSolverRf.hpp
index 5804393f..eb3a11a6 100644
--- a/resolve/LinSolverDirectRocSolverRf.hpp
+++ b/resolve/LinSolverDirectRocSolverRf.hpp
@@ -42,8 +42,8 @@ namespace ReSolve
       int getSolveMode(); //should be enum too
 
     private:
-      rocblas_status status_rocblas_;
-      
+      rocblas_status status_rocblas_; 
+      rocsparse_status status_rocsparse_;
       index_type* d_P_;
       index_type* d_Q_;
 
@@ -54,6 +54,22 @@ namespace ReSolve
       void addFactors(matrix::Sparse* L, matrix::Sparse* U); //create L+U from sepeate L, U factors
       rocsolver_rfinfo infoM_;
       matrix::Sparse* M_;//the matrix that contains added factors
-      int solve_mode_;
+      int solve_mode_; // 0 is default and 1 is fast
+
+      // not used by default - for fast solve
+      rocsparse_mat_descr descr_L_{nullptr};
+      rocsparse_mat_descr descr_U_{nullptr};
+
+      rocsparse_mat_info  info_L_{nullptr};
+      rocsparse_mat_info  info_U_{nullptr};
+
+      void* L_buffer_{nullptr};
+      void* U_buffer_{nullptr};
+      // Null-initialized like the members above: the .cpp calls delete on U_csr_.
+      ReSolve::matrix::Csr* L_csr_{nullptr};
+      ReSolve::matrix::Csr* U_csr_{nullptr};
+
+      real_type* d_aux1_{nullptr};
+      real_type* d_aux2_{nullptr};
   };
 }
diff --git a/resolve/hip/hipKernels.h b/resolve/hip/hipKernels.h
index 9c48783a..986efc84 100644
--- a/resolve/hip/hipKernels.h
+++ b/resolve/hip/hipKernels.h
@@ -12,3 +12,14 @@ void matrix_row_sums(int n,
                      int* a_ia,
                      double* a_val, 
                      double* result);
+
+// Vector permutations needed for the triangular solve.
+// P gathers (vec_out[i] = vec_in[perm_vector[i]]); Q scatters (vec_out[perm_vector[i]] = vec_in[i]).
+void permuteVectorP(int n, 
+                    int* perm_vector,
+                    double* vec_in, 
+                    double* vec_out);
+void permuteVectorQ(int n, 
+                    int* perm_vector,
+                    double* vec_in, 
+                    double* vec_out);
diff --git a/resolve/hip/hipKernels.hip b/resolve/hip/hipKernels.hip
index 13f53d85..abad5b39 100644
--- a/resolve/hip/hipKernels.hip
+++ b/resolve/hip/hipKernels.hip
@@ -143,6 +143,34 @@ __global__ void matrixInfNormPart1(const int n,
 }
 
 
+__global__ void permuteVectorP_kernel(const int n,
+                                      const int* perm_vector,
+                                      const double* vec_in,
+                                      double* vec_out)
+{
+  // Gather: entry i of the output is read from vec_in[perm_vector[i]].
+  // Every thread begins at its global index and then advances by the
+  // total number of launched threads until it runs past n.
+  const int first = blockIdx.x * blockDim.x + threadIdx.x;
+  for (int i = first; i < n; i += (blockDim.x * gridDim.x)) {
+    vec_out[i] = vec_in[perm_vector[i]];
+  }
+}
+
+__global__ void permuteVectorQ_kernel(const int n,
+                                      const int* perm_vector,
+                                      const double* vec_in,
+                                      double* vec_out)
+{
+  // Scatter: entry i of the input is written to vec_out[perm_vector[i]].
+  // Every thread begins at its global index and then advances by the
+  // total number of launched threads until it runs past n.
+  const int first = blockIdx.x * blockDim.x + threadIdx.x;
+  for (int i = first; i < n; i += (blockDim.x * gridDim.x)) {
+    vec_out[perm_vector[i]] = vec_in[i];
+  }
+}
+
 void mass_inner_product_two_vectors(int n, 
                                     int i, 
                                     double* vec1, 
@@ -165,3 +193,19 @@ void matrix_row_sums(int n,
 {
   hipLaunchKernelGGL(matrixInfNormPart1,dim3(1000),dim3(1024), 0, 0, n, nnz, a_ia, a_val, result);
 }
+
+void permuteVectorP(int n, int* perm_vector, double* vec_in, double* vec_out)
+{
+  // Fixed launch shape; the kernel loops over the entries, so n may exceed it.
+  const dim3 grid_dim(1000);
+  const dim3 block_dim(1024);
+  hipLaunchKernelGGL(permuteVectorP_kernel, grid_dim, block_dim, 0, 0, n, perm_vector, vec_in, vec_out);
+}
+
+void permuteVectorQ(int n, int* perm_vector, double* vec_in, double* vec_out)
+{
+  // Fixed launch shape; the kernel loops over the entries, so n may exceed it.
+  const dim3 grid_dim(1000);
+  const dim3 block_dim(1024);
+  hipLaunchKernelGGL(permuteVectorQ_kernel, grid_dim, block_dim, 0, 0, n, perm_vector, vec_in, vec_out);
+}
diff --git a/resolve/hip/hipVectorKernels.hip b/resolve/hip/hipVectorKernels.hip
index f68cd0b9..5b3ace30 100644
--- a/resolve/hip/hipVectorKernels.hip
+++ b/resolve/hip/hipVectorKernels.hip
@@ -25,4 +25,5 @@ void set_array_const(index_type  n, real_type val, real_type* arr)
    hipLaunchKernelGGL( kernels::set_const, dim3(num_blocks), dim3(block_size), 0, 0, n, val, arr);
 }
 
+
 }} // namespace ReSolve::vector

From 1a6f9c17ec57c0e00ecb3da035a454c1afd1dfca Mon Sep 17 00:00:00 2001
From: pelesh <peless@ornl.gov>
Date: Thu, 2 Nov 2023 14:36:24 -0400
Subject: [PATCH 09/12] working rocsolver FGMRES test (#58)

Co-authored-by: kswirydo <kasia.swirydowicz@gmail.com>
---
 tests/functionality/CMakeLists.txt            |   8 +-
 .../testKLU_RocSolver_FGMRES.cpp              | 271 ++++++++++++++++++
 2 files changed, 278 insertions(+), 1 deletion(-)
 create mode 100644 tests/functionality/testKLU_RocSolver_FGMRES.cpp

diff --git a/tests/functionality/CMakeLists.txt b/tests/functionality/CMakeLists.txt
index acc5ce60..85b47fd7 100644
--- a/tests/functionality/CMakeLists.txt
+++ b/tests/functionality/CMakeLists.txt
@@ -33,6 +33,10 @@ if(RESOLVE_USE_HIP)
   add_executable(rocsolver_rf_test.exe testKLU_RocSolver.cpp)
   target_link_libraries(rocsolver_rf_test.exe PRIVATE ReSolve)
 
+  # And another one to test FGMRES version
+  add_executable(rocsolver_rf_fgmres_test.exe testKLU_RocSolver_FGMRES.cpp)
+  target_link_libraries(rocsolver_rf_fgmres_test.exe PRIVATE ReSolve)
+
 endif(RESOLVE_USE_HIP)
 
 # Install tests
@@ -47,7 +51,8 @@ endif(RESOLVE_USE_CUDA)
 
 if(RESOLVE_USE_HIP)
   set(installable_tests ${installable_tests}
-                        rocsolver_rf_test.exe)
+                        rocsolver_rf_test.exe
+                        rocsolver_rf_fgmres_test.exe)
 endif(RESOLVE_USE_HIP)
 
 install(TARGETS ${installable_tests} 
@@ -67,4 +72,5 @@ endif(RESOLVE_USE_CUDA)
 
 if(RESOLVE_USE_HIP)
   add_test(NAME rocsolver_rf_test  COMMAND $<TARGET_FILE:rocsolver_rf_test.exe>  "${test_data_dir}")
+  add_test(NAME rocsolver_rf_fgmres_test  COMMAND $<TARGET_FILE:rocsolver_rf_fgmres_test.exe>  "${test_data_dir}")
 endif(RESOLVE_USE_HIP)
diff --git a/tests/functionality/testKLU_RocSolver_FGMRES.cpp b/tests/functionality/testKLU_RocSolver_FGMRES.cpp
new file mode 100644
index 00000000..a544eb54
--- /dev/null
+++ b/tests/functionality/testKLU_RocSolver_FGMRES.cpp
@@ -0,0 +1,271 @@
+#include <string>
+#include <iostream>
+#include <iomanip>
+
+#include <resolve/vector/Vector.hpp>
+#include <resolve/matrix/io.hpp>
+#include <resolve/matrix/Coo.hpp>
+#include <resolve/matrix/Csr.hpp>
+#include <resolve/matrix/Csc.hpp>
+#include <resolve/matrix/MatrixHandler.hpp>
+#include <resolve/vector/VectorHandler.hpp>
+#include <resolve/LinSolverDirectKLU.hpp>
+#include <resolve/LinSolverDirectRocSolverRf.hpp>
+#include <resolve/LinSolverIterativeFGMRES.hpp>
+#include <resolve/workspace/LinAlgWorkspace.hpp>
+//author: KS
+//functionality test to check whether rocSolverRf/FGMRES works correctly.
+
+using namespace ReSolve::constants;
+
+int main(int argc, char *argv[])
+{
+  // Use ReSolve data types.
+  using index_type = ReSolve::index_type;
+  using real_type  = ReSolve::real_type;
+  using vector_type = ReSolve::vector::Vector;
+
+  //we want error sum to be 0 at the end
+  //that means PASS.
+  //otherwise it is a FAIL.
+  int error_sum = 0;
+  int status = 0;
+
+  ReSolve::LinAlgWorkspaceHIP* workspace_HIP = new ReSolve::LinAlgWorkspaceHIP();
+  workspace_HIP->initializeHandles();
+  ReSolve::MatrixHandler* matrix_handler =  new ReSolve::MatrixHandler(workspace_HIP);
+  ReSolve::VectorHandler* vector_handler =  new ReSolve::VectorHandler(workspace_HIP);
+
+  ReSolve::LinSolverDirectKLU* KLU = new ReSolve::LinSolverDirectKLU;
+  KLU->setupParameters(1, 0.1, false);
+
+  ReSolve::LinSolverDirectRocSolverRf* Rf = new ReSolve::LinSolverDirectRocSolverRf(workspace_HIP);
+  ReSolve::GramSchmidt* GS = new ReSolve::GramSchmidt(vector_handler, ReSolve::GramSchmidt::cgs2);
+  ReSolve::LinSolverIterativeFGMRES* FGMRES = new ReSolve::LinSolverIterativeFGMRES(matrix_handler, vector_handler, GS, "hip");
+  // Input to this code is location of `data` directory where matrix files are stored
+  const std::string data_path = (argc == 2) ? argv[1] : "./";
+
+  std::string matrixFileName1 = data_path + "data/matrix_ACTIVSg2000_AC_00.mtx";
+  std::string matrixFileName2 = data_path + "data/matrix_ACTIVSg2000_AC_02.mtx";
+
+  std::string rhsFileName1 = data_path + "data/rhs_ACTIVSg2000_AC_00.mtx.ones";
+  std::string rhsFileName2 = data_path + "data/rhs_ACTIVSg2000_AC_02.mtx.ones";
+
+  // Read first matrix
+  std::ifstream mat1(matrixFileName1);
+  if(!mat1.is_open())
+  {
+    std::cout << "Failed to open file " << matrixFileName1 << "\n";
+    return -1;
+  }
+  ReSolve::matrix::Coo* A_coo = ReSolve::io::readMatrixFromFile(mat1);
+  ReSolve::matrix::Csr* A = new ReSolve::matrix::Csr(A_coo->getNumRows(),
+                                                     A_coo->getNumColumns(),
+                                                     A_coo->getNnz(),
+                                                     A_coo->symmetric(),
+                                                     A_coo->expanded());
+  mat1.close();
+
+  // Read first rhs vector
+  std::ifstream rhs1_file(rhsFileName1);
+  if(!rhs1_file.is_open())
+  {
+    std::cout << "Failed to open file " << rhsFileName1 << "\n";
+    return -1;
+  }
+  real_type* rhs = ReSolve::io::readRhsFromFile(rhs1_file);
+  real_type* x   = new real_type[A->getNumRows()];
+  vector_type* vec_rhs = new vector_type(A->getNumRows());
+  vector_type* vec_x   = new vector_type(A->getNumRows());
+  vector_type* vec_r   = new vector_type(A->getNumRows());
+  rhs1_file.close();
+
+  // Convert first matrix to CSR format
+  matrix_handler->coo2csr(A_coo, A, "cpu");
+  vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST);
+  vec_rhs->setDataUpdated(ReSolve::memory::HOST);
+
+  // Solve the first system using KLU
+  status = KLU->setup(A);
+  error_sum += status;
+
+  status = KLU->analyze();
+  error_sum += status;
+
+  status = KLU->factorize();
+  error_sum += status;
+
+  status = KLU->solve(vec_rhs, vec_x);
+  error_sum += status;
+
+  vector_type* vec_test;
+  vector_type* vec_diff;
+
+  vec_test  = new vector_type(A->getNumRows());
+  vec_diff  = new vector_type(A->getNumRows());
+  real_type* x_data = new real_type[A->getNumRows()];
+
+  for (int i=0; i<A->getNumRows(); ++i){
+    x_data[i] = 1.0;
+  }
+
+  vec_test->setData(x_data, ReSolve::memory::HOST);
+  vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+  vec_diff->update(x_data, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+
+  // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, ReSolve::memory::DEVICE));
+  matrix_handler->setValuesChanged(true, "hip");
+  //evaluate the residual ||b-Ax||
+  status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr","hip"); 
+  error_sum += status;
+
+  real_type normRmatrix1 = sqrt(vector_handler->dot(vec_r, vec_r, "hip"));
+
+  //for testing only - control
+  real_type normXtrue = sqrt(vector_handler->dot(vec_x, vec_x, "hip"));
+  real_type normB1 = sqrt(vector_handler->dot(vec_rhs, vec_rhs, "hip"));
+
+  //compute x-x_true
+  vector_handler->axpy(&MINUSONE, vec_x, vec_diff, "hip");
+  //evaluate its norm
+  real_type normDiffMatrix1 = sqrt(vector_handler->dot(vec_diff, vec_diff, "hip"));
+
+  //compute the residual using exact solution
+  vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+  status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE,"csr", "hip"); 
+  error_sum += status;
+  real_type exactSol_normRmatrix1 = sqrt(vector_handler->dot(vec_r, vec_r, "hip"));
+  //evaluate the residual ON THE CPU using COMPUTED solution
+
+  vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST);
+
+  status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cpu");
+  error_sum += status;
+
+  real_type normRmatrix1CPU = sqrt(vector_handler->dot(vec_r, vec_r, "hip"));
+
+  std::cout<<"Results (first matrix): "<<std::endl<<std::endl;
+  std::cout<<"\t ||b-A*x||_2                 : " << std::setprecision(16) << normRmatrix1    << " (residual norm)" << std::endl;
+  std::cout<<"\t ||b-A*x||_2  (CPU)          : " << std::setprecision(16) << normRmatrix1CPU << " (residual norm)" << std::endl;
+  std::cout<<"\t ||b-A*x||_2/||b||_2         : " << normRmatrix1/normB1   << " (scaled residual norm)"             << std::endl;
+  std::cout<<"\t ||x-x_true||_2              : " << normDiffMatrix1       << " (solution error)"                   << std::endl;
+  std::cout<<"\t ||x-x_true||_2/||x_true||_2 : " << normDiffMatrix1/normXtrue << " (scaled solution error)"        << std::endl;
+  std::cout<<"\t ||b-A*x_exact||_2           : " << exactSol_normRmatrix1 << " (control; residual norm with exact solution)\n\n";
+
+  // Now prepare the Rf solver
+
+  ReSolve::matrix::Csc* L = (ReSolve::matrix::Csc*) KLU->getLFactor();
+  ReSolve::matrix::Csc* U = (ReSolve::matrix::Csc*) KLU->getUFactor();
+
+  if ((L == nullptr) || (U == nullptr)) {
+    std::cout << "ERROR: KLU did not return both L and U factors\n";
+    return -1;
+  }
+  index_type* P = KLU->getPOrdering();
+  index_type* Q = KLU->getQOrdering();
+  Rf->setSolveMode(1);
+  vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+  error_sum += Rf->setup(A, L, U, P, Q, vec_rhs); 
+  FGMRES->setMaxit(200); 
+  FGMRES->setRestart(100); 
+
+  GS->setup(A->getNumRows(), FGMRES->getRestart()); 
+  status =  FGMRES->setup(A); 
+  error_sum += status;
+
+  // Load the second matrix
+  std::ifstream mat2(matrixFileName2);
+  if(!mat2.is_open())
+  {
+    std::cout << "Failed to open file " << matrixFileName2 << "\n";
+    return -1;
+  }
+  ReSolve::io::readAndUpdateMatrix(mat2, A_coo);
+  mat2.close();
+
+  // Load the second rhs vector
+  std::ifstream rhs2_file(rhsFileName2);
+  if(!rhs2_file.is_open())
+  {
+    std::cout << "Failed to open file " << rhsFileName2 << "\n";
+    return -1;
+  }
+  ReSolve::io::readAndUpdateRhs(rhs2_file, &rhs);
+  rhs2_file.close();
+
+  matrix_handler->coo2csr(A_coo, A, "hip");
+  vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+
+  status = Rf->refactorize();
+  error_sum += status;
+
+  vec_x->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+  status = Rf->solve(vec_x);
+  error_sum += status;
+
+  FGMRES->resetMatrix(A);
+  status = FGMRES->setupPreconditioner("LU", Rf);
+  error_sum += status;
+
+  vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+  status = FGMRES->solve(vec_rhs, vec_x);
+  error_sum += status;
+
+  vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+  matrix_handler->setValuesChanged(true, "hip");
+
+  //evaluate final residual
+  status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "hip"); 
+  error_sum += status;
+
+  real_type normRmatrix2 = sqrt(vector_handler->dot(vec_r, vec_r, "hip"));
+
+  //for testing only - control
+  real_type normB2 = sqrt(vector_handler->dot(vec_rhs, vec_rhs, "hip"));
+  //compute x-x_true
+  vec_diff->update(x_data, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+  vector_handler->axpy(&MINUSONE, vec_x, vec_diff, "hip");
+  //evaluate its norm
+  real_type normDiffMatrix2 = sqrt(vector_handler->dot(vec_diff, vec_diff, "hip"));
+
+  //compute the residual using exact solution
+  vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+  status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE, "csr", "hip"); 
+  error_sum += status;
+  real_type exactSol_normRmatrix2 = sqrt(vector_handler->dot(vec_r, vec_r, "hip"));
+  std::cout<<"Results (second matrix): "<<std::endl<<std::endl;
+  std::cout<<"\t ||b-A*x||_2                 : "<<normRmatrix2<<" (residual norm)"<<std::endl;
+  std::cout<<"\t ||b-A*x||_2/||b||_2         : "<<normRmatrix2/normB2<<" (scaled residual norm)"<<std::endl;
+  std::cout<<"\t ||x-x_true||_2              : "<<normDiffMatrix2<<" (solution error)"<<std::endl;
+  std::cout<<"\t ||x-x_true||_2/||x_true||_2 : "<<normDiffMatrix2/normXtrue<<" (scaled solution error)"<<std::endl;
+  std::cout<<"\t ||b-A*x_exact||_2           : "<<exactSol_normRmatrix2<<" (control; residual norm with exact solution)"<<std::endl;
+  std::cout<<"\t IR iterations               : "<<FGMRES->getNumIter()<<" (max 200, restart 100)"<<std::endl;
+  std::cout<<"\t IR starting res. norm       : "<<FGMRES->getInitResidualNorm()<<" "<<std::endl;
+  std::cout<<"\t IR final res. norm          : "<<FGMRES->getFinalResidualNorm()<<" (tol 1e-14)"<<std::endl<<std::endl;
+  // The accuracy targets are part of the verdict, so the exit code must reflect them as well.
+  const bool passed = (error_sum == 0) && (normRmatrix1/normB1 < 1e-12 ) && (normRmatrix2/normB2 < 1e-9);
+  if (passed) {
+    std::cout<<"Test 4 (KLU with rocsolverrf refactorization + IR) PASSED"<<std::endl<<std::endl;;
+  } else {
+    std::cout<<"Test 4 (KLU with rocsolverrf refactorization + IR) FAILED, error sum: "<<error_sum<<std::endl<<std::endl;;
+  }
+
+  delete A;
+  delete A_coo;
+  delete KLU;
+  delete GS;
+  delete FGMRES;
+  delete Rf;
+  delete [] x;
+  delete [] rhs;
+  delete vec_r;
+  delete vec_x;
+  delete vec_rhs;
+  delete vec_diff;
+  // NOTE(review): vec_test wraps x_data via setData(); confirm ownership before freeing either.
+  delete workspace_HIP;
+  delete matrix_handler;
+  delete vector_handler;
+
+  return passed ? 0 : ((error_sum != 0) ? error_sum : 1);
+}

From 1ed777fad7290fa117397c8160b624665cd375e3 Mon Sep 17 00:00:00 2001
From: pelesh <peless@ornl.gov>
Date: Thu, 2 Nov 2023 21:15:19 -0400
Subject: [PATCH 10/12] Fix warnings in solver classes. (#62)

---
 examples/r_KLU_rocSolverRf_FGMRES.cpp  | 17 +++++++++++------
 examples/r_KLU_rocsolverrf.cpp         |  8 +-------
 resolve/LinSolver.cpp                  | 22 ++++++++++++++++------
 resolve/LinSolver.hpp                  |  9 ++++++++-
 resolve/LinSolverDirectCuSolverGLU.cpp |  9 ++++++++-
 resolve/LinSolverDirectCuSolverGLU.hpp |  7 ++++++-
 resolve/LinSolverDirectCuSolverRf.cpp  |  7 ++++++-
 resolve/LinSolverDirectCuSolverRf.hpp  |  7 ++++++-
 resolve/LinSolverDirectKLU.cpp         |  7 ++++++-
 resolve/LinSolverDirectKLU.hpp         |  8 +++++++-
 resolve/LinSolverDirectRocSolverRf.cpp | 22 +++++++++++-----------
 resolve/LinSolverDirectRocSolverRf.hpp |  7 ++++++-
 12 files changed, 92 insertions(+), 38 deletions(-)

diff --git a/examples/r_KLU_rocSolverRf_FGMRES.cpp b/examples/r_KLU_rocSolverRf_FGMRES.cpp
index 45fe4681..32d1865f 100644
--- a/examples/r_KLU_rocSolverRf_FGMRES.cpp
+++ b/examples/r_KLU_rocSolverRf_FGMRES.cpp
@@ -141,12 +141,8 @@ int main(int argc, char *argv[])
       matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "hip"); 
       printf("\t 2-Norm of the residual : %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "hip"))/norm_b);
       if (i == 1) {
-        ReSolve::matrix::Csc* L /* _csc */ = (ReSolve::matrix::Csc*) KLU->getLFactor();
-        ReSolve::matrix::Csc* U /* _csc */ = (ReSolve::matrix::Csc*) KLU->getUFactor();
-        // ReSolve::matrix::Csr* L = new ReSolve::matrix::Csr(L_csc->getNumRows(), L_csc->getNumColumns(), L_csc->getNnz());
-        // ReSolve::matrix::Csr* U = new ReSolve::matrix::Csr(U_csc->getNumRows(), U_csc->getNumColumns(), U_csc->getNnz());
-        // matrix_handler->csc2csr(L_csc,L, "hip");
-        // matrix_handler->csc2csr(U_csc,U, "hip");
+        ReSolve::matrix::Csc* L = (ReSolve::matrix::Csc*) KLU->getLFactor();
+        ReSolve::matrix::Csc* U = (ReSolve::matrix::Csc*) KLU->getUFactor();
         if (L == nullptr) {printf("ERROR");}
         index_type* P = KLU->getPOrdering();
         index_type* Q = KLU->getQOrdering();
@@ -193,8 +189,17 @@ int main(int argc, char *argv[])
 
   } // for (int i = 0; i < numSystems; ++i)
 
+  delete A;
+  delete A_coo;
+  delete KLU;
+  delete Rf;
   delete [] x;
   delete [] rhs;
+  delete vec_r;
+  delete vec_x;
+  delete workspace_HIP;
+  delete matrix_handler;
+  delete vector_handler;
 
   return 0;
 }
diff --git a/examples/r_KLU_rocsolverrf.cpp b/examples/r_KLU_rocsolverrf.cpp
index b3ebbecf..5651ed56 100644
--- a/examples/r_KLU_rocsolverrf.cpp
+++ b/examples/r_KLU_rocsolverrf.cpp
@@ -135,20 +135,13 @@ int main(int argc, char *argv[] )
         vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
         Rf->setup(A, L, U, P, Q, vec_rhs); 
         Rf->refactorize();
-       //dont do it here 
-      //  delete [] P;
-      //  delete [] Q;
       }
     } else {
-      //status =  KLU->refactorize();
       std::cout<<"Using rocsolver rf"<<std::endl;
       status = Rf->refactorize();
       std::cout<<"rocsolver rf refactorization status: "<<status<<std::endl;      
       status = Rf->solve(vec_rhs, vec_x);
       std::cout<<"rocsolver rf solve status: "<<status<<std::endl;      
-      //std::cout<<"KLU re-factorization status: "<<status<<std::endl;
-      //status = KLU->solve(vec_rhs, vec_x);
-      //std::cout<<"KLU solve status: "<<status<<std::endl;      
     }
     vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
 
@@ -164,6 +157,7 @@ int main(int argc, char *argv[] )
 
   //now DELETE
   delete A;
+  delete A_coo;
   delete KLU;
   delete Rf;
   delete [] x;
diff --git a/resolve/LinSolver.cpp b/resolve/LinSolver.cpp
index 558a6500..5682ec40 100644
--- a/resolve/LinSolver.cpp
+++ b/resolve/LinSolver.cpp
@@ -13,12 +13,6 @@ namespace ReSolve
     //destroy the matrix and hadlers
   }
 
-  int LinSolver::setup(matrix::Sparse* A)
-  {
-    this->A_ = A;
-    return 0;
-  }
-
   real_type LinSolver::evaluateResidual()
   {
     //to be implemented
@@ -42,6 +36,17 @@ namespace ReSolve
     delete [] Q_;
   }
 
+  int LinSolverDirect::setup(matrix::Sparse* A,
+                             matrix::Sparse* /* L */,
+                             matrix::Sparse* /* U */,
+                             index_type*     /* P */,
+                             index_type*     /* Q */,
+                             vector_type*  /* rhs */)
+  {
+    this->A_ = A;
+    return 0;
+  }
+
   int LinSolverDirect::analyze()
   {
     return 0;
@@ -92,6 +97,11 @@ namespace ReSolve
   {
   }
 
+  int LinSolverIterative::setup(matrix::Sparse* A)
+  {
+    this->A_ = A;
+    return 0;
+  }
 
   int LinSolverIterative::solve(vector_type* /* rhs */, vector_type* /* init_guess */)
   {
diff --git a/resolve/LinSolver.hpp b/resolve/LinSolver.hpp
index 8c9ca5c9..a34aeba0 100644
--- a/resolve/LinSolver.hpp
+++ b/resolve/LinSolver.hpp
@@ -31,7 +31,6 @@ namespace ReSolve
       LinSolver();
       virtual ~LinSolver();
 
-      virtual int setup(matrix::Sparse* A);
       real_type evaluateResidual();
         
     protected:  
@@ -49,6 +48,13 @@ namespace ReSolve
       LinSolverDirect();
       virtual ~LinSolverDirect();
       //return 0 if successful!
+      virtual int setup(matrix::Sparse* A,
+                        matrix::Sparse* L,
+                        matrix::Sparse* U,
+                        index_type*     P,
+                        index_type*     Q,
+                        vector_type*  rhs);
+                        
       virtual int analyze(); //the same as symbolic factorization
       virtual int factorize();
       virtual int refactorize();
@@ -72,6 +78,7 @@ namespace ReSolve
     public:
       LinSolverIterative();
       ~LinSolverIterative();
+      virtual int setup(matrix::Sparse* A);
 
       virtual int  solve(vector_type* rhs, vector_type* init_guess);
   };
diff --git a/resolve/LinSolverDirectCuSolverGLU.cpp b/resolve/LinSolverDirectCuSolverGLU.cpp
index 0350efea..65af5812 100644
--- a/resolve/LinSolverDirectCuSolverGLU.cpp
+++ b/resolve/LinSolverDirectCuSolverGLU.cpp
@@ -8,6 +8,8 @@
 
 namespace ReSolve
 {
+  using vector_type = vector::Vector;
+
   LinSolverDirectCuSolverGLU::LinSolverDirectCuSolverGLU(LinAlgWorkspaceCUDA* workspace)
   {
     this->workspace_ = workspace;
@@ -22,7 +24,12 @@ namespace ReSolve
     delete M_;
   }
 
-  int LinSolverDirectCuSolverGLU::setup(matrix::Sparse* A, matrix::Sparse* L, matrix::Sparse* U, index_type* P, index_type* Q)
+  int LinSolverDirectCuSolverGLU::setup(matrix::Sparse* A,
+                                        matrix::Sparse* L,
+                                        matrix::Sparse* U,
+                                        index_type* P,
+                                        index_type* Q,
+                                        vector_type* /* rhs */)
   {
     int error_sum = 0;
 
diff --git a/resolve/LinSolverDirectCuSolverGLU.hpp b/resolve/LinSolverDirectCuSolverGLU.hpp
index a48c8cba..899f52e3 100644
--- a/resolve/LinSolverDirectCuSolverGLU.hpp
+++ b/resolve/LinSolverDirectCuSolverGLU.hpp
@@ -32,7 +32,12 @@ namespace ReSolve
       int refactorize();
       int solve(vector_type* rhs, vector_type* x);
 
-      int setup(matrix::Sparse* A, matrix::Sparse* L, matrix::Sparse* U, index_type* P, index_type* Q);
+      int setup(matrix::Sparse* A,
+                matrix::Sparse* L,
+                matrix::Sparse* U,
+                index_type*     P,
+                index_type*     Q,
+                vector_type* rhs = nullptr);
     
     private:
       void addFactors(matrix::Sparse* L, matrix::Sparse* U); //create L+U from sepeate L, U factors
diff --git a/resolve/LinSolverDirectCuSolverRf.cpp b/resolve/LinSolverDirectCuSolverRf.cpp
index 37a3ffda..905a0e6e 100644
--- a/resolve/LinSolverDirectCuSolverRf.cpp
+++ b/resolve/LinSolverDirectCuSolverRf.cpp
@@ -17,7 +17,12 @@ namespace ReSolve
     mem_.deleteOnDevice(d_T_);
   }
 
-  int LinSolverDirectCuSolverRf::setup(matrix::Sparse* A, matrix::Sparse* L, matrix::Sparse* U, index_type* P, index_type* Q)
+  int LinSolverDirectCuSolverRf::setup(matrix::Sparse* A,
+                                       matrix::Sparse* L,
+                                       matrix::Sparse* U,
+                                       index_type* P,
+                                       index_type* Q,
+                                       vector_type* /* rhs */)
   {
     //remember - P and Q are generally CPU variables
     int error_sum = 0;
diff --git a/resolve/LinSolverDirectCuSolverRf.hpp b/resolve/LinSolverDirectCuSolverRf.hpp
index f0ee755e..77e8b94f 100644
--- a/resolve/LinSolverDirectCuSolverRf.hpp
+++ b/resolve/LinSolverDirectCuSolverRf.hpp
@@ -26,7 +26,12 @@ namespace ReSolve
       LinSolverDirectCuSolverRf();
       ~LinSolverDirectCuSolverRf();
       
-      int setup(matrix::Sparse* A, matrix::Sparse* L, matrix::Sparse* U, index_type* P, index_type* Q);
+      int setup(matrix::Sparse* A,
+                matrix::Sparse* L,
+                matrix::Sparse* U,
+                index_type*     P,
+                index_type*     Q,
+                vector_type* rhs = nullptr);
 
       void setAlgorithms(cusolverRfFactorization_t fact_alg,  cusolverRfTriangularSolve_t solve_alg);
       
diff --git a/resolve/LinSolverDirectKLU.cpp b/resolve/LinSolverDirectKLU.cpp
index 6af27d10..6336e9e9 100644
--- a/resolve/LinSolverDirectKLU.cpp
+++ b/resolve/LinSolverDirectKLU.cpp
@@ -18,7 +18,12 @@ namespace ReSolve
     klu_free_numeric(&Numeric_, &Common_);
   }
 
-  int LinSolverDirectKLU::setup(matrix::Sparse* A)
+  int LinSolverDirectKLU::setup(matrix::Sparse* A,
+                                matrix::Sparse* /* L */,
+                                matrix::Sparse* /* U */,
+                                index_type*     /* P */,
+                                index_type*     /* Q */,    
+                                vector_type*  /* rhs */)
   {
     this->A_ = A;
     return 0;
diff --git a/resolve/LinSolverDirectKLU.hpp b/resolve/LinSolverDirectKLU.hpp
index 13e27b47..b4edadb1 100644
--- a/resolve/LinSolverDirectKLU.hpp
+++ b/resolve/LinSolverDirectKLU.hpp
@@ -24,7 +24,13 @@ namespace ReSolve
     public:
       LinSolverDirectKLU();
       ~LinSolverDirectKLU();
-      int setup(matrix::Sparse* A);
+
+      int setup(matrix::Sparse* A,
+                matrix::Sparse* L = nullptr,
+                matrix::Sparse* U = nullptr,
+                index_type*     P = nullptr,
+                index_type*     Q = nullptr,
+                vector_type*  rhs = nullptr);
      
       void setupParameters(int ordering, double KLU_threshold, bool halt_if_singular);
 
diff --git a/resolve/LinSolverDirectRocSolverRf.cpp b/resolve/LinSolverDirectRocSolverRf.cpp
index f9f73b4a..96d1da79 100644
--- a/resolve/LinSolverDirectRocSolverRf.cpp
+++ b/resolve/LinSolverDirectRocSolverRf.cpp
@@ -24,7 +24,12 @@ namespace ReSolve
     delete U_csr_;
   }
 
-  int LinSolverDirectRocSolverRf::setup(matrix::Sparse* A, matrix::Sparse* L, matrix::Sparse* U, index_type* P, index_type* Q, vector_type* rhs)
+  int LinSolverDirectRocSolverRf::setup(matrix::Sparse* A,
+                                        matrix::Sparse* L,
+                                        matrix::Sparse* U,
+                                        index_type* P,
+                                        index_type* Q,
+                                        vector_type* rhs)
   {
     //remember - P and Q are generally CPU variables
     int error_sum = 0;
@@ -113,9 +118,6 @@ namespace ReSolve
                                                        &L_buffer_size);
       error_sum += status_rocsparse_;
 
-      printf("buffer size for L %d status %d \n", L_buffer_size, status_rocsparse_);
-      // hipMalloc((void**)&(L_buffer), L_buffer_size);
-
       mem_.allocateBufferOnDevice(&L_buffer_, L_buffer_size);
       status_rocsparse_ = rocsparse_dcsrsv_buffer_size(workspace_->getRocsparseHandle(), 
                                                        rocsparse_operation_none, 
@@ -128,9 +130,7 @@ namespace ReSolve
                                                        info_U_,
                                                        &U_buffer_size);
       error_sum += status_rocsparse_;
-      //      hipMalloc((void**)&(U_buffer), U_buffer_size);
       mem_.allocateBufferOnDevice(&U_buffer_, U_buffer_size);
-      printf("buffer size for U %d status %d \n", U_buffer_size, status_rocsparse_);
 
       status_rocsparse_ = rocsparse_dcsrsv_analysis(workspace_->getRocsparseHandle(), 
                                                     rocsparse_operation_none, 
@@ -389,22 +389,22 @@ printf("solve mode 1, splitting the factors again \n");
       mia[i] += mia[i - 1];
     }
 
-    std::vector<int> Mshifts(n, 0);
+    std::vector<int> Mshifts(static_cast<size_t>(n), 0);
     for(index_type i = 0; i < n; ++i) {
       // go through EACH COLUMN OF L first
       for(int j = Lp[i]; j < Lp[i + 1]; ++j) {
         row = Li[j];
         if(row != i) {
           // place (row, i) where it belongs!
-          mja[mia[row] + Mshifts[row]] = i;
-          Mshifts[row]++;
+          mja[mia[row] + Mshifts[static_cast<size_t>(row)]] = i;
+          Mshifts[static_cast<size_t>(row)]++;
         }
       }
       // each column of U next
       for(index_type j = Up[i]; j < Up[i + 1]; ++j) {
         row = Ui[j];
-        mja[mia[row] + Mshifts[row]] = i;
-        Mshifts[row]++;
+        mja[mia[row] + Mshifts[static_cast<size_t>(row)]] = i;
+        Mshifts[static_cast<size_t>(row)]++;
       }
     }
     //Mshifts.~vector(); 
diff --git a/resolve/LinSolverDirectRocSolverRf.hpp b/resolve/LinSolverDirectRocSolverRf.hpp
index eb3a11a6..bb623fb2 100644
--- a/resolve/LinSolverDirectRocSolverRf.hpp
+++ b/resolve/LinSolverDirectRocSolverRf.hpp
@@ -32,7 +32,12 @@ namespace ReSolve
       LinSolverDirectRocSolverRf(LinAlgWorkspaceHIP* workspace);
       ~LinSolverDirectRocSolverRf();
       
-      int setup(matrix::Sparse* A, matrix::Sparse* L, matrix::Sparse* U, index_type* P, index_type* Q, vector_type* rhs);
+      int setup(matrix::Sparse* A,
+                matrix::Sparse* L,
+                matrix::Sparse* U,
+                index_type*     P,
+                index_type*     Q,
+                vector_type*  rhs);
        
       int refactorize();
       int solve(vector_type* rhs, vector_type* x);

From 4b3bcf5629346a7da457aec75cbbd4687f15e801 Mon Sep 17 00:00:00 2001
From: Cameron Rutherford <cameron.rutherford@me.com>
Date: Thu, 2 Nov 2023 21:52:26 -0400
Subject: [PATCH 11/12] Incline rocm 5.6.0 dev (#53)

* Add working incline build

* Update spack submodule with rocsolver.

* Update spack and add rocsolver/roctracer to CMake.

* Remove several blanket include_directories.

* Fix exported CMake bug in CXX_STANDARD

* Update examples/resolve_consumer/CMakeLists.txt

---------

Co-authored-by: Ryan Danehy <dane678@deception01.pnl.gov>
Co-authored-by: pelesh <peless@ornl.gov>
---
 .gitlab/pnnl/base.gitlab-ci.yml               |   2 +-
 .gitlab/pnnl/incline.gitlab-ci.yml            |   6 +-
 CMakeLists.txt                                |  13 --
 CMakePresets.json                             |  28 ++-
 buildsystem/incline-env.sh                    |  12 +-
 buildsystem/spack/incline/env.sh              |   7 +-
 buildsystem/spack/incline/install.sh          |   5 +
 .../spack/incline/modules/dependencies.sh     | 204 ++++++++++++++++--
 buildsystem/spack/incline/spack.yaml          |  70 ++++--
 buildsystem/spack/spack                       |   2 +-
 cmake/ReSolveConfig.cmake.in                  |  15 +-
 cmake/ReSolveFindHipLibraries.cmake           |  11 +-
 examples/CMakeLists.txt                       |   3 +
 resolve/CMakeLists.txt                        |   1 +
 resolve/LinSolverDirectRocSolverRf.hpp        |   1 -
 resolve/cpu/CMakeLists.txt                    |   5 -
 resolve/cuda/CMakeLists.txt                   |   5 +-
 resolve/hip/CMakeLists.txt                    |  10 +-
 resolve/utilities/logger/CMakeLists.txt       |   5 +-
 resolve/workspace/CMakeLists.txt              |   7 +-
 20 files changed, 328 insertions(+), 84 deletions(-)

diff --git a/.gitlab/pnnl/base.gitlab-ci.yml b/.gitlab/pnnl/base.gitlab-ci.yml
index 092c8f19..4b5954cf 100644
--- a/.gitlab/pnnl/base.gitlab-ci.yml
+++ b/.gitlab/pnnl/base.gitlab-ci.yml
@@ -269,4 +269,4 @@ stages:
   variables:
     WORKDIR_SUFFIX: "x86_64-clang-hip-build"
     MY_CLUSTER: "incline"
-    SLURM_ARGS: " --exclusive --ntasks=3 "
+    SLURM_ARGS: " -N 1 --ntasks=3 "
diff --git a/.gitlab/pnnl/incline.gitlab-ci.yml b/.gitlab/pnnl/incline.gitlab-ci.yml
index afc4fd05..f62b3ad0 100644
--- a/.gitlab/pnnl/incline.gitlab-ci.yml
+++ b/.gitlab/pnnl/incline.gitlab-ci.yml
@@ -3,15 +3,15 @@ Incline Build:
     - .cluster_build
     - .incline
   variables:
-    SCRIPT_ARGS: " --build-only " #--job=clang-hip "
+    SCRIPT_ARGS: " --build-only "
 
 Incline Test:
   extends:
     - .cluster_test
     - .incline
   variables:
-    SCRIPT_ARGS: " --test-only " #--job=clang-hip "
-    CTESTARGS: " --timeout 240 --output-on-failure -LE incline-skip "
+    SCRIPT_ARGS: " --test-only "
+    CTESTARGS: " --timeout 240 --output-on-failure "
   needs: ['Incline Build']
 
 pending:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index db4e8e74..cd99f931 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -85,12 +85,6 @@ if(RESOLVE_USE_HIP)
   enable_language(HIP)
   check_language(HIP)
   include(ReSolveFindHipLibraries)
-
-  # This is just an agly hack to make HIP build work
-  get_target_property(hip_includes hip::device INTERFACE_INCLUDE_DIRECTORIES)
-  message(STATUS "HIP include directories: ${hip_includes}")
-  # TODO - use targets properly
-  include_directories(${hip_includes})
 else()
   message(STATUS "Not using HIP")
 endif(RESOLVE_USE_HIP)
@@ -100,18 +94,11 @@ endif(RESOLVE_USE_HIP)
 configure_file(
   ${CMAKE_SOURCE_DIR}/resolve/resolve_defs.hpp.in
   ${CMAKE_BINARY_DIR}/resolve/resolve_defs.hpp)
-
-# include build directory for Fortran name mangling header
-# TODO - target based includes
-include_directories(${CMAKE_BINARY_DIR})
-
 install(
   FILES ${CMAKE_BINARY_DIR}/resolve/resolve_defs.hpp
   DESTINATION include/resolve
   )
 
-# TODO - fix this
-include_directories(${CMAKE_SOURCE_DIR})
 
 # Enable testing
 enable_testing()
diff --git a/CMakePresets.json b/CMakePresets.json
index c00f9919..4809aca5 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -17,6 +17,17 @@
                 "RESOLVE_USE_CUDA": "ON"
             }
         },
+        {
+            "name": "rocm",
+            "displayName": "ROCM  build",
+            "description": "Base config to build with ROCM",
+            "binaryDir": "${sourceDir}/build",
+            "installDir": "${sourceDir}/install",
+            "generator": "Unix Makefiles",
+            "cacheVariables": {
+                "RESOLVE_USE_HIP": "ON"
+            }
+        },
         {
             "name": "cpu",
             "displayName": "CPU only build",
@@ -25,7 +36,7 @@
             "installDir": "${sourceDir}/install",
             "generator": "Unix Makefiles"
         },
-	      {
+	    {
             "name": "ascent",
             "inherits": "cuda",
             "displayName": "Ascent Build",
@@ -43,9 +54,18 @@
         },
         {
             "name": "incline",
-            "inherits": "cpu",
-            "displayName": "Incline CPU only Build",
-            "description": "Custom changes specific for Incline"
+            "inherits": "rocm",
+            "displayName": "Incline Build with rocm",
+            "description": "Custom changes specific for Incline",
+            "cacheVariables": {
+                "CMAKE_HIP_ARCHITECTURES" : "gfx908",
+                "CMAKE_BUILD_TYPE" : "Debug"
+            },
+            "environment": {
+                "CC" : "clang",
+                "CXX" : "clang++",
+                "FC" : "gfortran"
+            }
         }
 
     ]
diff --git a/buildsystem/incline-env.sh b/buildsystem/incline-env.sh
index 3c4e2194..348139ff 100644
--- a/buildsystem/incline-env.sh
+++ b/buildsystem/incline-env.sh
@@ -1,5 +1,15 @@
+#!/bin/bash
+
+# Load system modules (gcc)
 source /etc/profile.d/modules.sh
 module purge
 module load gcc/8.4.0
-module load rocm/5.3.0
+
+# These are necessary in order to see GPUs with sbatch
+unset ROCR_VISIBLE_DEVICES
+unset CUDA_VISIBLE_DEVICES
+unset GPU_DEVICE_ORDINAL
+
+# Load spack generated modules
 source ./buildsystem/spack/incline/modules/dependencies.sh
+
diff --git a/buildsystem/spack/incline/env.sh b/buildsystem/spack/incline/env.sh
index 31d03fa4..757cc090 100644
--- a/buildsystem/spack/incline/env.sh
+++ b/buildsystem/spack/incline/env.sh
@@ -3,19 +3,22 @@
 source /etc/profile.d/modules.sh
 module purge
 
-# Load system python
+# Load system python and gcc
 module load python/miniconda4.12
 source /share/apps/python/miniconda4.12/etc/profile.d/conda.sh
+module load gcc/8.4.0
 
 # Define environment variables for where spack stores key files
 # For now, SPACK_INSTALL is the path where everything spack related is installed
 # If you want to modify the module install path, edit the spack.yaml manually
 BASE=/qfs/projects/exasgd/resolve/spack-ci
 export SPACK_INSTALL=$BASE/install
+export SPACK_MIRROR=$BASE/../$(whoami)/spack-mirror
 export SPACK_CACHE=$BASE/../$(whoami)/spack-cache
 export SPACK_DISABLE_LOCAL_CONFIG=1
-export SPACK_PYTHON=$(which python)
+export SPACK_PYTHON=$(which python3)
 
 export tempdir=$SPACK_CACHE
 export TMP=$SPACK_CACHE
 export TMPDIR=$SPACK_CACHE
+
diff --git a/buildsystem/spack/incline/install.sh b/buildsystem/spack/incline/install.sh
index 6494de6f..392562d8 100755
--- a/buildsystem/spack/incline/install.sh
+++ b/buildsystem/spack/incline/install.sh
@@ -9,8 +9,13 @@
 #SBATCH -e spack_install.%J.output
 #SBTACH -t 240
 
+export HTTPS_PROXY=http://proxy01.pnl.gov:3128
+export https_proxy=http://proxy01.pnl.gov:3128
 export MY_CLUSTER=incline
 . buildsystem/load-spack.sh &&
 spack develop --no-clone --path=$(pwd) resolve@develop &&
+spack concretize -f &&
+spack install -j 64 llvm-amdgpu &&
+spack load llvm-amdgpu &&
 ./buildsystem/configure-modules.sh 64
 
diff --git a/buildsystem/spack/incline/modules/dependencies.sh b/buildsystem/spack/incline/modules/dependencies.sh
index 75b564ff..75cf6209 100644
--- a/buildsystem/spack/incline/modules/dependencies.sh
+++ b/buildsystem/spack/incline/modules/dependencies.sh
@@ -1,24 +1,170 @@
 module use -a /qfs/projects/exasgd/resolve/spack-ci/install/modules/linux-centos7-zen
+# curl@=7.29.0%gcc@=8.4.0~gssapi~ldap~libidn2~librtmp~libssh2+nghttp2 build_system=autotools libs=shared,static tls=openssl arch=linux-centos7-zen
+module load curl/7.29.0-gcc-8.4.0-3emq5yx
+# gmake@=4.4.1%gcc@=8.4.0~guile build_system=generic arch=linux-centos7-zen
+module load gmake/4.4.1-gcc-8.4.0-l7nyr34
 # pkgconf@=1.9.5%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen
-module load pkgconf/1.9.5-gcc-8.4.0-kl4sdjo
-# nghttp2@=1.52.0%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen
-module load nghttp2/1.52.0-gcc-8.4.0-pqmjl5g
-# ca-certificates-mozilla@=2023-05-30%gcc@=8.4.0 build_system=generic arch=linux-centos7-zen
-module load ca-certificates-mozilla/2023-05-30-gcc-8.4.0-txgcsig
-# perl@=5.26.0%gcc@=8.4.0+cpanm+opcode+open+shared+threads build_system=generic patches=0eac10e,8cf4302 arch=linux-centos7-zen
-module load perl/5.26.0-gcc-8.4.0-h324qox
-# zlib-ng@=2.1.3%gcc@=8.4.0+compat+opt build_system=autotools patches=299b958,ae9077a,b692621 arch=linux-centos7-zen
-module load zlib-ng/2.1.3-gcc-8.4.0-44tydhr
-# openssl@=3.1.3%gcc@=8.4.0~docs+shared build_system=generic certs=mozilla arch=linux-centos7-zen
-module load openssl/3.1.3-gcc-8.4.0-46yttzm
-# curl@=8.4.0%gcc@=8.4.0~gssapi~ldap~libidn2~librtmp~libssh~libssh2+nghttp2 build_system=autotools libs=shared,static tls=openssl arch=linux-centos7-zen
-module load curl/8.4.0-gcc-8.4.0-g2rrs23
+module load pkgconf/1.9.5-gcc-8.4.0-733ltud
 # ncurses@=6.4%gcc@=8.4.0~symlinks+termlib abi=none build_system=autotools arch=linux-centos7-zen
-module load ncurses/6.4-gcc-8.4.0-jt7rpqq
+module load ncurses/6.4-gcc-8.4.0-gwo76of
+# zlib-ng@=2.1.4%gcc@=8.4.0+compat+opt build_system=autotools arch=linux-centos7-zen
+module load zlib-ng/2.1.4-gcc-8.4.0-feah6zt
 # cmake@=3.27.7%gcc@=8.4.0~doc+ncurses+ownlibs build_system=generic build_type=Release arch=linux-centos7-zen
-module load cmake/3.27.7-gcc-8.4.0-tu2rruq
-# gmake@=4.4.1%gcc@=8.4.0~guile build_system=autotools arch=linux-centos7-zen
-module load gmake/4.4.1-gcc-8.4.0-f23wik2
+module load cmake/3.27.7-gcc-8.4.0-rmou7zf
+# gmake@=4.4.1%clang@=16.0.0-rocm5.6.0 cxxflags="--gcc-toolchain=/share/apps/gcc/8.4.0" ~guile build_system=generic arch=linux-centos7-zen
+module load gmake/4.4.1-clang-16.0.0-rocm5.6.0-6c7b35p
+# python@=3.9.12%gcc@=8.4.0+bz2+crypt+ctypes+dbm~debug+libxml2+lzma~nis~optimizations+pic+pyexpat+pythoncmd+readline+shared+sqlite3+ssl~tkinter+uuid+zlib build_system=generic patches=0d98e93,4c24573,ebdca64,f2fd060 arch=linux-centos7-zen
+module load python/3.9.12-gcc-8.4.0-ob2n5zs
+# re2c@=2.2%gcc@=8.4.0 build_system=generic arch=linux-centos7-zen
+module load re2c/2.2-gcc-8.4.0-zmj4cst
+# ninja@=1.11.1%gcc@=8.4.0+re2c build_system=generic arch=linux-centos7-zen
+module load ninja/1.11.1-gcc-8.4.0-ofxvwff
+# z3@=4.11.2%gcc@=8.4.0~gmp~ipo~python build_system=cmake build_type=Release generator=make arch=linux-centos7-zen
+module load z3/4.11.2-gcc-8.4.0-363odap
+# llvm-amdgpu@=5.6.1%gcc@=8.4.0~ipo~link_llvm_dylib~llvm_dylib~openmp+rocm-device-libs build_system=cmake build_type=Release generator=ninja patches=a08bbe1,b66529f,d35aec9 arch=linux-centos7-zen
+module load llvm-amdgpu/5.6.1-gcc-8.4.0-vy3wrnq
+# rocm-core@=5.6.1%gcc@=8.4.0~ipo build_system=cmake build_type=Release generator=make arch=linux-centos7-zen
+module load rocm-core/5.6.1-gcc-8.4.0-llv2yv4
+# rocm-cmake@=5.6.1%gcc@=8.4.0~ipo build_system=cmake build_type=Release generator=make arch=linux-centos7-zen
+module load rocm-cmake/5.6.1-gcc-8.4.0-klwq5kk
+# comgr@=5.6.1%gcc@=8.4.0~ipo build_system=cmake build_type=Release generator=make arch=linux-centos7-zen
+module load comgr/5.6.1-gcc-8.4.0-yl7z2re
+# mesa@=23.0.2%gcc@=8.4.0+glx+llvm+opengl~opengles+osmesa~strip build_system=meson buildtype=release default_library=shared arch=linux-centos7-zen
+module load mesa/23.0.2-gcc-8.4.0-xffioaq
+# glx@=1.4%gcc@=8.4.0 build_system=bundle arch=linux-centos7-zen
+module load glx/1.4-gcc-8.4.0-vh5g6sx
+# hipify-clang@=5.6.1%gcc@=8.4.0~ipo build_system=cmake build_type=Release generator=make patches=54b8b39 arch=linux-centos7-zen
+module load hipify-clang/5.6.1-gcc-8.4.0-e3jea5v
+# libiconv@=1.17%gcc@=8.4.0 build_system=autotools libs=shared,static arch=linux-centos7-zen
+module load libiconv/1.17-gcc-8.4.0-o2hwfiz
+# diffutils@=3.9%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen
+module load diffutils/3.9-gcc-8.4.0-7ceszkk
+# bzip2@=1.0.8%gcc@=8.4.0~debug~pic+shared build_system=generic arch=linux-centos7-zen
+module load bzip2/1.0.8-gcc-8.4.0-on73m5o
+# xz@=5.4.1%gcc@=8.4.0~pic build_system=autotools libs=shared,static arch=linux-centos7-zen
+module load xz/5.4.1-gcc-8.4.0-v5kymdq
+# libxml2@=2.10.3%gcc@=8.4.0+pic~python+shared build_system=autotools arch=linux-centos7-zen
+module load libxml2/2.10.3-gcc-8.4.0-6mgqxiy
+# pigz@=2.7%gcc@=8.4.0 build_system=makefile arch=linux-centos7-zen
+module load pigz/2.7-gcc-8.4.0-btbzuey
+# zstd@=1.5.5%gcc@=8.4.0+programs build_system=makefile compression=none libs=shared,static arch=linux-centos7-zen
+module load zstd/1.5.5-gcc-8.4.0-3ets7dy
+# tar@=1.34%gcc@=8.4.0 build_system=autotools zip=pigz arch=linux-centos7-zen
+module load tar/1.34-gcc-8.4.0-atzwdgy
+# gettext@=0.22.3%gcc@=8.4.0+bzip2+curses+git~libunistring+libxml2+pic+shared+tar+xz build_system=autotools arch=linux-centos7-zen
+module load gettext/0.22.3-gcc-8.4.0-m33ujza
+# libsigsegv@=2.14%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen
+module load libsigsegv/2.14-gcc-8.4.0-gzna4n3
+# m4@=1.4.19%gcc@=8.4.0+sigsegv build_system=autotools patches=9dc5fbd,bfdffa7 arch=linux-centos7-zen
+module load m4/1.4.19-gcc-8.4.0-bwzchwl
+# elfutils@=0.189%gcc@=8.4.0~debuginfod+exeprefix+nls build_system=autotools arch=linux-centos7-zen
+module load elfutils/0.189-gcc-8.4.0-23kjwto
+# libtool@=2.4.7%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen
+module load libtool/2.4.7-gcc-8.4.0-2bmpsy4
+# util-macros@=1.19.3%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen
+module load util-macros/1.19.3-gcc-8.4.0-64inrmm
+# libpciaccess@=0.17%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen
+module load libpciaccess/0.17-gcc-8.4.0-sh2c4la
+# libpthread-stubs@=0.4%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen
+module load libpthread-stubs/0.4-gcc-8.4.0-kcav646
+# py-pip@=23.1.2%gcc@=8.4.0 build_system=generic arch=linux-centos7-zen
+module load py-pip/23.1.2-gcc-8.4.0-yajovh7
+# py-wheel@=0.41.2%gcc@=8.4.0 build_system=generic arch=linux-centos7-zen
+module load py-wheel/0.41.2-gcc-8.4.0-dkkw2va
+# py-setuptools@=68.0.0%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen
+module load py-setuptools/68.0.0-gcc-8.4.0-ihu4sfq
+# meson@=1.2.2%gcc@=8.4.0 build_system=python_pip patches=0f0b1bd,ae59765 arch=linux-centos7-zen
+module load meson/1.2.2-gcc-8.4.0-vcdwjmb
+# libdrm@=2.4.115%gcc@=8.4.0~docs build_system=generic arch=linux-centos7-zen
+module load libdrm/2.4.115-gcc-8.4.0-6h77lxh
+# perl@=5.26.0%gcc@=8.4.0+cpanm+opcode+open+shared+threads build_system=generic patches=0eac10e,8cf4302 arch=linux-centos7-zen
+module load perl/5.26.0-gcc-8.4.0-6tdzqfd
+# autoconf@=2.69%gcc@=8.4.0 build_system=autotools patches=35c4492,7793209,a49dd5b arch=linux-centos7-zen
+module load autoconf/2.69-gcc-8.4.0-dcrbb7h
+# automake@=1.16.5%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen
+module load automake/1.16.5-gcc-8.4.0-tvi3cks
+# numactl@=2.0.14%gcc@=8.4.0 build_system=autotools patches=4e1d78c,62fc8a8,ff37630 arch=linux-centos7-zen
+module load numactl/2.0.14-gcc-8.4.0-7mpcwqq
+# hsakmt-roct@=5.6.1%gcc@=8.4.0~ipo+shared build_system=cmake build_type=Release generator=make arch=linux-centos7-zen
+module load hsakmt-roct/5.6.1-gcc-8.4.0-4on3xib
+# hsa-rocr-dev@=5.6.1%gcc@=8.4.0~image~ipo+shared build_system=cmake build_type=Release generator=make patches=9267179 arch=linux-centos7-zen
+module load hsa-rocr-dev/5.6.1-gcc-8.4.0-tdlpv7w
+# perl-file-which@=1.27%gcc@=8.4.0 build_system=perl arch=linux-centos7-zen
+module load perl-file-which/1.27-gcc-8.4.0-nix64yx
+# perl-module-build@=0.4232%gcc@=8.4.0 build_system=perl arch=linux-centos7-zen
+module load perl-module-build/0.4232-gcc-8.4.0-ayed35p
+# perl-uri-encode@=1.1.1%gcc@=8.4.0 build_system=perl arch=linux-centos7-zen
+module load perl-uri-encode/1.1.1-gcc-8.4.0-biqataj
+# py-ply@=3.11%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen
+module load py-ply/3.11-gcc-8.4.0-creftnl
+# py-cppheaderparser@=2.7.4%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen
+module load py-cppheaderparser/2.7.4-gcc-8.4.0-nw7554i
+# rocminfo@=5.6.1%gcc@=8.4.0~ipo build_system=cmake build_type=Release generator=make arch=linux-centos7-zen
+module load rocminfo/5.6.1-gcc-8.4.0-5shaxxj
+# roctracer-dev-api@=5.6.1%gcc@=8.4.0 build_system=generic arch=linux-centos7-zen
+module load roctracer-dev-api/5.6.1-gcc-8.4.0-gbaoh25
+# hip@=5.6.1%gcc@=8.4.0~cuda~ipo+rocm build_system=cmake build_type=Release generator=make patches=aee7249,c2ee21c,e73e91b arch=linux-centos7-zen
+module load hip/5.6.1-gcc-8.4.0-zpa2j7f
+# msgpack-c@=3.1.1%gcc@=8.4.0~ipo build_system=cmake build_type=Release generator=make arch=linux-centos7-zen
+module load msgpack-c/3.1.1-gcc-8.4.0-buxbznu
+# procps@=4.0.4%gcc@=8.4.0+nls build_system=autotools arch=linux-centos7-zen
+module load procps/4.0.4-gcc-8.4.0-gyn6his
+# py-joblib@=1.2.0%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen
+module load py-joblib/1.2.0-gcc-8.4.0-ukcd432
+# py-cython@=0.29.36%gcc@=8.4.0 build_system=python_pip patches=c4369ad arch=linux-centos7-zen
+module load py-cython/0.29.36-gcc-8.4.0-5f4zyzb
+# py-msgpack@=1.0.5%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen
+module load py-msgpack/1.0.5-gcc-8.4.0-2xh5udm
+# libyaml@=0.2.5%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen
+module load libyaml/0.2.5-gcc-8.4.0-hidc7bw
+# py-pyyaml@=6.0%gcc@=8.4.0+libyaml build_system=python_pip arch=linux-centos7-zen
+module load py-pyyaml/6.0-gcc-8.4.0-4mdsdw2
+# py-distlib@=0.3.7%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen
+module load py-distlib/0.3.7-gcc-8.4.0-f25ay4b
+# py-editables@=0.3%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen
+module load py-editables/0.3-gcc-8.4.0-hrmamrk
+# py-flit-core@=3.9.0%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen
+module load py-flit-core/3.9.0-gcc-8.4.0-q3yng6k
+# py-packaging@=23.1%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen
+module load py-packaging/23.1-gcc-8.4.0-7krugqt
+# py-pathspec@=0.11.1%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen
+module load py-pathspec/0.11.1-gcc-8.4.0-vm5freh
+# git@=2.42.0%gcc@=8.4.0+man+nls+perl+subtree~svn~tcltk build_system=autotools arch=linux-centos7-zen
+module load git/2.42.0-gcc-8.4.0-k5crf2q
+# py-tomli@=2.0.1%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen
+module load py-tomli/2.0.1-gcc-8.4.0-m4gh2nb
+# py-typing-extensions@=4.8.0%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen
+module load py-typing-extensions/4.8.0-gcc-8.4.0-ovqdpbs
+# py-setuptools-scm@=7.1.0%gcc@=8.4.0+toml build_system=python_pip arch=linux-centos7-zen
+module load py-setuptools-scm/7.1.0-gcc-8.4.0-hqzn5lb
+# py-pluggy@=1.0.0%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen
+module load py-pluggy/1.0.0-gcc-8.4.0-lqpf66l
+# py-calver@=2022.6.26%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen
+module load py-calver/2022.6.26-gcc-8.4.0-pm6rj2c
+# py-trove-classifiers@=2023.8.7%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen
+module load py-trove-classifiers/2023.8.7-gcc-8.4.0-iy66qnh
+# py-hatchling@=1.18.0%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen
+module load py-hatchling/1.18.0-gcc-8.4.0-bjpjiiq
+# py-hatch-vcs@=0.3.0%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen
+module load py-hatch-vcs/0.3.0-gcc-8.4.0-hc6rq3a
+# py-filelock@=3.12.4%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen
+module load py-filelock/3.12.4-gcc-8.4.0-rzqmlrq
+# py-platformdirs@=3.10.0%gcc@=8.4.0~wheel build_system=python_pip arch=linux-centos7-zen
+module load py-platformdirs/3.10.0-gcc-8.4.0-6hnyp7h
+# py-virtualenv@=20.24.5%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen
+module load py-virtualenv/20.24.5-gcc-8.4.0-h4mzkzl
+# rocblas@=5.6.1%gcc@=8.4.0~ipo+tensile amdgpu_target=auto build_system=cmake build_type=Release generator=make arch=linux-centos7-zen
+module load rocblas/5.6.1-gcc-8.4.0-arsno2b
+# fmt@=10.1.1%gcc@=8.4.0~ipo+pic~shared build_system=cmake build_type=Release cxxstd=11 generator=make arch=linux-centos7-zen
+module load fmt/10.1.1-gcc-8.4.0-4d5ehr5
+# rocprim@=5.6.1%gcc@=8.4.0~ipo amdgpu_target=auto build_system=cmake build_type=Release generator=make arch=linux-centos7-zen
+module load rocprim/5.6.1-gcc-8.4.0-nu465tt
+# rocsparse@=5.6.1%gcc@=8.4.0~ipo~test amdgpu_target=auto build_system=cmake build_type=Release generator=make arch=linux-centos7-zen
+module load rocsparse/5.6.1-gcc-8.4.0-wtmfgyn
+# rocsolver@=5.6.1%gcc@=8.4.0~ipo+optimal amdgpu_target=auto build_system=cmake build_type=Release generator=make arch=linux-centos7-zen
+module load rocsolver/5.6.1-gcc-8.4.0-wlgpkqj
+# roctracer-dev@=5.6.1%gcc@=8.4.0~ipo~rocm build_system=cmake build_type=Release generator=make arch=linux-centos7-zen
+module load roctracer-dev/5.6.1-gcc-8.4.0-lilld4h
 # libiconv@=1.17%gcc@=8.4.0 build_system=autotools libs=shared,static arch=linux-centos7-zen
 module load libiconv/1.17-gcc-8.4.0-wfdnlg6
 # diffutils@=3.9%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen
@@ -27,6 +173,8 @@ module load diffutils/3.9-gcc-8.4.0-qh566r6
 module load libsigsegv/2.14-gcc-8.4.0-iutj4de
 # m4@=1.4.19%gcc@=8.4.0+sigsegv build_system=autotools patches=9dc5fbd,bfdffa7 arch=linux-centos7-zen
 module load m4/1.4.19-gcc-8.4.0-x7ktvaf
+# perl@=5.26.0%gcc@=8.4.0+cpanm+opcode+open+shared+threads build_system=generic patches=0eac10e,8cf4302 arch=linux-centos7-zen
+module load perl/5.26.0-gcc-8.4.0-h324qox
 # autoconf@=2.69%gcc@=8.4.0 build_system=autotools patches=35c4492,7793209,a49dd5b arch=linux-centos7-zen
 module load autoconf/2.69-gcc-8.4.0-npluk5j
 # automake@=1.16.5%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen
@@ -35,6 +183,24 @@ module load automake/1.16.5-gcc-8.4.0-tgloywk
 module load libtool/2.4.7-gcc-8.4.0-gs6gyy3
 # gmp@=6.2.1%gcc@=8.4.0+cxx build_system=autotools libs=shared,static patches=69ad2e2 arch=linux-centos7-zen
 module load gmp/6.2.1-gcc-8.4.0-ythx4o2
+# pkgconf@=1.9.5%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen
+module load pkgconf/1.9.5-gcc-8.4.0-kl4sdjo
+# nghttp2@=1.52.0%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen
+module load nghttp2/1.52.0-gcc-8.4.0-pqmjl5g
+# ca-certificates-mozilla@=2023-05-30%gcc@=8.4.0 build_system=generic arch=linux-centos7-zen
+module load ca-certificates-mozilla/2023-05-30-gcc-8.4.0-txgcsig
+# zlib-ng@=2.1.3%gcc@=8.4.0+compat+opt build_system=autotools patches=299b958,ae9077a,b692621 arch=linux-centos7-zen
+module load zlib-ng/2.1.3-gcc-8.4.0-44tydhr
+# openssl@=3.1.3%gcc@=8.4.0~docs+shared build_system=generic certs=mozilla arch=linux-centos7-zen
+module load openssl/3.1.3-gcc-8.4.0-46yttzm
+# curl@=8.4.0%gcc@=8.4.0~gssapi~ldap~libidn2~librtmp~libssh~libssh2+nghttp2 build_system=autotools libs=shared,static tls=openssl arch=linux-centos7-zen
+module load curl/8.4.0-gcc-8.4.0-g2rrs23
+# ncurses@=6.4%gcc@=8.4.0~symlinks+termlib abi=none build_system=autotools arch=linux-centos7-zen
+module load ncurses/6.4-gcc-8.4.0-jt7rpqq
+# cmake@=3.27.7%gcc@=8.4.0~doc+ncurses+ownlibs build_system=generic build_type=Release arch=linux-centos7-zen
+module load cmake/3.27.7-gcc-8.4.0-tu2rruq
+# gmake@=4.4.1%gcc@=8.4.0~guile build_system=autotools arch=linux-centos7-zen
+module load gmake/4.4.1-gcc-8.4.0-f23wik2
 # metis@=5.1.0%gcc@=8.4.0~gdb~int64~ipo~real64+shared build_system=cmake build_type=Release generator=make patches=4991da9,93a7903,b1225da arch=linux-centos7-zen
 module load metis/5.1.0-gcc-8.4.0-gsllf6a
 # autoconf-archive@=2023.02.20%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen
@@ -61,5 +227,5 @@ module load mpfr/4.2.0-gcc-8.4.0-cjhi2el
 module load openblas/0.3.24-gcc-8.4.0-4ei4hpg
 # suite-sparse@=5.13.0%gcc@=8.4.0~cuda~graphblas~openmp+pic build_system=generic arch=linux-centos7-zen
 module load suite-sparse/5.13.0-gcc-8.4.0-ivey23b
-# resolve@=develop%gcc@=8.4.0~cuda~ipo+klu build_system=cmake build_type=Release dev_path=/people/ruth521/projects/resolve generator=make arch=linux-centos7-zen
-## module load resolve/develop-gcc-8.4.0-ugoj3p3
+# resolve@=develop%clang@=16.0.0-rocm5.6.0 cxxflags="--gcc-toolchain=/share/apps/gcc/8.4.0" ~cuda~ipo+klu+rocm amdgpu_target=gfx908 build_system=cmake build_type=Release dev_path=/people/ruth521/projects/resolve generator=make arch=linux-centos7-zen
+## module load resolve/develop-clang-16.0.0-rocm5.6.0-6kaaut4
diff --git a/buildsystem/spack/incline/spack.yaml b/buildsystem/spack/incline/spack.yaml
index 36234ce0..894daf7c 100644
--- a/buildsystem/spack/incline/spack.yaml
+++ b/buildsystem/spack/incline/spack.yaml
@@ -1,10 +1,35 @@
 spack:
   specs:
-    - resolve~cuda%gcc@8.4.0
+  - resolve~cuda+rocm%clang@16.0.0-rocm5.6.0 amdgpu_target=gfx908
+    ^ llvm-amdgpu%gcc
+    ^ hsa-rocr-dev~image
   view: false
   concretizer:
-    unify: when_possible
     reuse: true
+    unify: true
+  compilers:
+  - compiler:
+      spec: gcc@8.4.0
+      paths:
+        cc: /share/apps/gcc/8.4.0/bin/gcc
+        cxx: /share/apps/gcc/8.4.0/bin/g++
+        f77: /share/apps/gcc/8.4.0/bin/gfortran
+        fc: /share/apps/gcc/8.4.0/bin/gfortran
+      operating_system: centos7
+      target: x86_64
+      modules: [gcc/8.4.0]
+  - compiler:
+      spec: clang@16.0.0-rocm5.6.0
+      paths:
+        cc: amdclang
+        cxx: amdclang++
+        f77: /share/apps/gcc/8.4.0/bin/gfortran
+        fc: /share/apps/gcc/8.4.0/bin/gfortran
+      flags:
+        cxxflags: --gcc-toolchain=/share/apps/gcc/8.4.0
+      operating_system: centos7
+      target: x86_64
+      modules: []
   config:
     concretizer: clingo
     install_tree:
@@ -23,20 +48,39 @@ spack:
         write: group
         read: world
         group: exasgd
+    mesa:
+      externals:
+      - spec: mesa@23.0.2+glx
+        prefix: /usr
+      buildable: false
+    curl:
+      externals:
+      - spec: curl@7.29.0
+        prefix: /usr/bin/curl
+      buildable: false
+    git:
+      externals:
+      - spec: git@2.42.0
+        prefix: /share/apps/git/2.42.0
+        modules:
+        - git/2.42.0
+      buildable: false
+    lua:
+      externals:
+      - spec: lua@5.4.2
+        modules:
+        - lua/5.4.2
+      buildable: false
+    python:
+      externals:
+      - spec: python@3.9.12%gcc
+        modules:
+        - python/miniconda4.12 
+      buildable: false
     perl:
       externals:
       - spec: perl@5.26.0
         modules:
         - perl/5.26.0
       buildable: false
-  compilers:
-  - compiler:
-      spec: gcc@8.4.0
-      paths:
-        cc: /share/apps/gcc/8.4.0/bin/gcc
-        cxx: /share/apps/gcc/8.4.0/bin/g++
-        f77: /share/apps/gcc/8.4.0/bin/gfortran
-        fc: /share/apps/gcc/8.4.0/bin/gfortran
-      operating_system: centos7
-      target: x86_64
-      modules: [gcc/8.4.0]
+
diff --git a/buildsystem/spack/spack b/buildsystem/spack/spack
index 7e466f7d..f120cada 160000
--- a/buildsystem/spack/spack
+++ b/buildsystem/spack/spack
@@ -1 +1 @@
-Subproject commit 7e466f7d22839f034b1e542daf5d2b6ef8c568c4
+Subproject commit f120cada59dbc5115d94c2fce3cbffc946b72bb0
diff --git a/cmake/ReSolveConfig.cmake.in b/cmake/ReSolveConfig.cmake.in
index 47f9fe35..fd73d0c8 100644
--- a/cmake/ReSolveConfig.cmake.in
+++ b/cmake/ReSolveConfig.cmake.in
@@ -4,6 +4,10 @@
 
 include("${CMAKE_CURRENT_LIST_DIR}/ReSolveTargets.cmake")
 
+if(NOT CMAKE_CXX_STANDARD)
+  set(CMAKE_CXX_STANDARD @CMAKE_CXX_STANDARD@)
+endif()
+
 include(CheckLanguage)
 # This must come before enable_language(CUDA)
 if(@RESOLVE_USE_CUDA@)
@@ -15,14 +19,19 @@ if(@RESOLVE_USE_CUDA@)
    add_library(ReSolve::CUDA ALIAS ReSolve::resolve_backend_cuda)
 endif()
 if(@RESOLVE_USE_HIP@)
+  # TODO - This is a bit heavy-handed, but otherwise you get gcc which is not ideal
+  #      - if(NOT CMAKE_C_COMPILER) wasn't working at top of file...
+  set(CMAKE_C_COMPILER @CMAKE_C_COMPILER@)
+  set(CMAKE_CXX_COMPILER @CMAKE_CXX_COMPILER@)
   enable_language(HIP)
   check_language(HIP)
   find_package(hip REQUIRED)
-  find_package(hipblas REQUIRED)
+  find_package(rocblas REQUIRED)
+  find_package(rocsparse REQUIRED)
+  find_package(rocsolver REQUIRED)
   # This is just an agly hack to make HIP build work
   get_target_property(hip_includes hip::device INTERFACE_INCLUDE_DIRECTORIES)
-  message(STATUS "HIP include directories: ${hip_includes}")
-  include_directories(${hip_includes})
+  target_include_directories(ReSolve::resolve_hip INTERFACE $<BUILD_INTERFACE:${hip_includes}>)
   add_library(ReSolve::HIP ALIAS ReSolve::resolve_backend_hip)
 endif()
 
diff --git a/cmake/ReSolveFindHipLibraries.cmake b/cmake/ReSolveFindHipLibraries.cmake
index d0d22395..b23d8021 100644
--- a/cmake/ReSolveFindHipLibraries.cmake
+++ b/cmake/ReSolveFindHipLibraries.cmake
@@ -6,13 +6,20 @@ add_library(resolve_hip INTERFACE)
 find_package(hip REQUIRED)
 find_package(rocblas REQUIRED)
 find_package(rocsparse REQUIRED)
+find_package(rocsolver REQUIRED)
 
-target_link_libraries(resolve_hip INTERFACE
+target_link_libraries(resolve_hip INTERFACE 
   hip::host 
   hip::device
   roc::rocblas
   roc::rocsparse
-  rocsolver
+  roc::rocsolver
 )
 
+get_target_property(hip_includes hip::device INTERFACE_INCLUDE_DIRECTORIES)
+
+target_include_directories(resolve_hip INTERFACE 
+  $<BUILD_INTERFACE:${hip_includes}>)
+
 install(TARGETS resolve_hip EXPORT ReSolveTargets)
+
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 9113ce17..faa53807 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -71,8 +71,11 @@ set(CONSUMER_PATH ${CMAKE_INSTALL_PREFIX}/share/examples)
 install(PROGRAMS test.sh DESTINATION ${CONSUMER_PATH})
 
 # Select consumer app
+# TODO - have an outer loop that adds a unique consumer test for each backend supported
 if(RESOLVE_USE_CUDA)
   set(RESOLVE_CONSUMER_APP "testKLU_Rf_FGMRES.cpp")
+elseif(RESOLVE_USE_HIP)
+  set(RESOLVE_CONSUMER_APP "testKLU_RocSolver.cpp")
 else()
   set(RESOLVE_CONSUMER_APP "testKLU.cpp")
 endif()
diff --git a/resolve/CMakeLists.txt b/resolve/CMakeLists.txt
index 47ce70de..b98c8234 100644
--- a/resolve/CMakeLists.txt
+++ b/resolve/CMakeLists.txt
@@ -36,6 +36,7 @@ set(ReSolve_HEADER_INSTALL
     LinSolver.hpp
     LinSolverDirectCuSolverGLU.hpp
     LinSolverDirectCuSolverRf.hpp
+    LinSolverDirectRocSolverRf.hpp
     LinSolverDirectKLU.hpp
     LinSolverIterativeFGMRES.hpp
     RefactorizationSolver.hpp
diff --git a/resolve/LinSolverDirectRocSolverRf.hpp b/resolve/LinSolverDirectRocSolverRf.hpp
index bb623fb2..97c95526 100644
--- a/resolve/LinSolverDirectRocSolverRf.hpp
+++ b/resolve/LinSolverDirectRocSolverRf.hpp
@@ -8,7 +8,6 @@
 #include <rocblas/rocblas.h>
 #include <rocsolver/rocsolver.h>
 #include <hip/hip_runtime.h>
-#include <roctracer/roctx.h>
 
 namespace ReSolve 
 {
diff --git a/resolve/cpu/CMakeLists.txt b/resolve/cpu/CMakeLists.txt
index 7105655c..16455315 100644
--- a/resolve/cpu/CMakeLists.txt
+++ b/resolve/cpu/CMakeLists.txt
@@ -19,10 +19,5 @@ set(ReSolve_CPU_HEADER_INSTALL
 add_library(resolve_backend_cpu SHARED ${ReSolve_CPU_SRC})
 target_link_libraries(resolve_backend_cpu PRIVATE resolve_logger)
 
-target_include_directories(resolve_backend_cpu INTERFACE
-    $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}>
-    $<INSTALL_INTERFACE:include>
-)
-
 # install include headers
 install(FILES ${ReSolve_CPU_HEADER_INSTALL} DESTINATION include/resolve/cpu)
diff --git a/resolve/cuda/CMakeLists.txt b/resolve/cuda/CMakeLists.txt
index f97267bc..225ea3c6 100644
--- a/resolve/cuda/CMakeLists.txt
+++ b/resolve/cuda/CMakeLists.txt
@@ -27,10 +27,7 @@ set_source_files_properties(${ReSolve_CUDA_SRC} PROPERTIES LANGUAGE CUDA)
 add_library(resolve_backend_cuda SHARED ${ReSolve_CUDA_SRC})
 target_link_libraries(resolve_backend_cuda PRIVATE resolve_logger)
 target_link_libraries(resolve_backend_cuda PUBLIC resolve_cuda)
-target_include_directories(resolve_backend_cuda INTERFACE
-    $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}>
-    $<INSTALL_INTERFACE:include>
-)
 
 # install include headers
 install(FILES ${ReSolve_CUDA_HEADER_INSTALL} DESTINATION include/resolve/cuda)
+
diff --git a/resolve/hip/CMakeLists.txt b/resolve/hip/CMakeLists.txt
index f8d7a457..fb71a3bd 100644
--- a/resolve/hip/CMakeLists.txt
+++ b/resolve/hip/CMakeLists.txt
@@ -13,10 +13,10 @@ set(ReSolve_HIP_SRC
 )
 
 set(ReSolve_HIP_HEADER_INSTALL
-    # hipKernels.h
+    hipKernels.h
     hipVectorKernels.h
     HipMemory.hpp
-    # hip_check_errors.hpp
+    hip_check_errors.hpp
 )
 
 set_source_files_properties(${ReSolve_HIP_SRC} PROPERTIES LANGUAGE HIP)
@@ -27,11 +27,7 @@ set_source_files_properties(${ReSolve_HIP_SRC} PROPERTIES LANGUAGE HIP)
 add_library(resolve_backend_hip SHARED ${ReSolve_HIP_SRC})
 target_link_libraries(resolve_backend_hip PRIVATE resolve_logger)
 target_link_libraries(resolve_backend_hip PUBLIC resolve_hip)
-#target_include_directories(resolve_backend_hip PUBLIC ${hip_includes})
-target_include_directories(resolve_backend_hip INTERFACE
-    $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}>
-    $<INSTALL_INTERFACE:include>
-)
 
 # install include headers
 install(FILES ${ReSolve_HIP_HEADER_INSTALL} DESTINATION include/resolve/hip)
+
diff --git a/resolve/utilities/logger/CMakeLists.txt b/resolve/utilities/logger/CMakeLists.txt
index 91b29dfc..29800942 100644
--- a/resolve/utilities/logger/CMakeLists.txt
+++ b/resolve/utilities/logger/CMakeLists.txt
@@ -17,8 +17,9 @@ set(Logger_HEADER_INSTALL
 # Build shared library ReSolve
 add_library(resolve_logger SHARED ${Logger_SRC})
 
-target_include_directories(resolve_logger INTERFACE
-    $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}>
+target_include_directories(resolve_logger PUBLIC
+    $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}>
+    $<BUILD_INTERFACE:${CMAKE_BINARY_DIR}>
     $<INSTALL_INTERFACE:include>
 )
 
diff --git a/resolve/workspace/CMakeLists.txt b/resolve/workspace/CMakeLists.txt
index a34c2191..a44f74f8 100644
--- a/resolve/workspace/CMakeLists.txt
+++ b/resolve/workspace/CMakeLists.txt
@@ -47,9 +47,10 @@ if(RESOLVE_USE_HIP)
   target_link_libraries(resolve_workspace PUBLIC resolve_backend_hip)
 endif(RESOLVE_USE_HIP)  
 
-target_include_directories(resolve_workspace INTERFACE
-    $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}>
-    $<INSTALL_INTERFACE:include>
+target_include_directories(resolve_workspace PUBLIC
+  $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}>
+  $<BUILD_INTERFACE:${CMAKE_BINARY_DIR}>
+  $<INSTALL_INTERFACE:include>
 )
 
 # install include headers

From 68cb4f7dc481dbd863cfac04f6be5f0863f7b573 Mon Sep 17 00:00:00 2001
From: Cameron Rutherford <cameron.rutherford@me.com>
Date: Fri, 3 Nov 2023 12:31:14 -0400
Subject: [PATCH 12/12] Fix incline variables after merge

---
 .../spack/incline/modules/dependencies.sh     | 29 ++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/buildsystem/spack/incline/modules/dependencies.sh b/buildsystem/spack/incline/modules/dependencies.sh
index c1c55496..75cf6209 100644
--- a/buildsystem/spack/incline/modules/dependencies.sh
+++ b/buildsystem/spack/incline/modules/dependencies.sh
@@ -199,6 +199,33 @@ module load curl/8.4.0-gcc-8.4.0-g2rrs23
 module load ncurses/6.4-gcc-8.4.0-jt7rpqq
 # cmake@=3.27.7%gcc@=8.4.0~doc+ncurses+ownlibs build_system=generic build_type=Release arch=linux-centos7-zen
 module load cmake/3.27.7-gcc-8.4.0-tu2rruq
+# gmake@=4.4.1%gcc@=8.4.0~guile build_system=autotools arch=linux-centos7-zen
+module load gmake/4.4.1-gcc-8.4.0-f23wik2
+# metis@=5.1.0%gcc@=8.4.0~gdb~int64~ipo~real64+shared build_system=cmake build_type=Release generator=make patches=4991da9,93a7903,b1225da arch=linux-centos7-zen
+module load metis/5.1.0-gcc-8.4.0-gsllf6a
+# autoconf-archive@=2023.02.20%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen
+module load autoconf-archive/2023.02.20-gcc-8.4.0-ox4hxoe
+# bzip2@=1.0.8%gcc@=8.4.0~debug~pic+shared build_system=generic arch=linux-centos7-zen
+module load bzip2/1.0.8-gcc-8.4.0-3uzyl47
+# xz@=5.4.1%gcc@=8.4.0~pic build_system=autotools libs=shared,static arch=linux-centos7-zen
+module load xz/5.4.1-gcc-8.4.0-dwmuagy
+# libxml2@=2.10.3%gcc@=8.4.0+pic~python+shared build_system=autotools arch=linux-centos7-zen
+module load libxml2/2.10.3-gcc-8.4.0-2hu4ayt
+# pigz@=2.7%gcc@=8.4.0 build_system=makefile arch=linux-centos7-zen
+module load pigz/2.7-gcc-8.4.0-lu7bjb6
+# zstd@=1.5.5%gcc@=8.4.0+programs build_system=makefile compression=none libs=shared,static arch=linux-centos7-zen
+module load zstd/1.5.5-gcc-8.4.0-z7jmyvw
+# tar@=1.34%gcc@=8.4.0 build_system=autotools zip=pigz arch=linux-centos7-zen
+module load tar/1.34-gcc-8.4.0-wcgempy
+# gettext@=0.22.3%gcc@=8.4.0+bzip2+curses+git~libunistring+libxml2+pic+shared+tar+xz build_system=autotools arch=linux-centos7-zen
+module load gettext/0.22.3-gcc-8.4.0-f7dl6un
+# texinfo@=7.0.3%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen
+module load texinfo/7.0.3-gcc-8.4.0-jma4obj
+# mpfr@=4.2.0%gcc@=8.4.0 build_system=autotools libs=shared,static arch=linux-centos7-zen
+module load mpfr/4.2.0-gcc-8.4.0-cjhi2el
+# openblas@=0.3.24%gcc@=8.4.0~bignuma~consistent_fpcsr+fortran~ilp64+locking+pic+shared build_system=makefile symbol_suffix=none threads=none arch=linux-centos7-zen
+module load openblas/0.3.24-gcc-8.4.0-4ei4hpg
+# suite-sparse@=5.13.0%gcc@=8.4.0~cuda~graphblas~openmp+pic build_system=generic arch=linux-centos7-zen
+module load suite-sparse/5.13.0-gcc-8.4.0-ivey23b
 # resolve@=develop%clang@=16.0.0-rocm5.6.0 cxxflags="--gcc-toolchain=/share/apps/gcc/8.4.0" ~cuda~ipo+klu+rocm amdgpu_target=gfx908 build_system=cmake build_type=Release dev_path=/people/ruth521/projects/resolve generator=make arch=linux-centos7-zen
 ## module load resolve/develop-clang-16.0.0-rocm5.6.0-6kaaut4
-