From 93f5ac11a46e554825ea5daf3a3219598ef061ee Mon Sep 17 00:00:00 2001
From: "Peles, Slaven" <peless@ornl.gov>
Date: Sat, 14 Oct 2023 02:00:08 +0000
Subject: [PATCH] Create backends for GPU memory management

---
 CMakeLists.txt                           |  56 ++++++---
 resolve/CMakeLists.txt                   |  52 +++++---
 resolve/LinAlgWorkspace.cpp              |   5 +-
 resolve/LinAlgWorkspace.hpp              |   5 +-
 resolve/LinSolverDirectCuSolverGLU.cpp   |   6 +-
 resolve/LinSolverDirectCuSolverGLU.hpp   |   3 +
 resolve/LinSolverDirectCuSolverRf.cpp    |  21 ++--
 resolve/LinSolverDirectCuSolverRf.hpp    |   3 +
 resolve/LinSolverIterativeFGMRES.cpp     |   3 +-
 resolve/LinSolverIterativeFGMRES.hpp     |   2 +
 resolve/MemoryUtils.hpp                  |  68 ++++++++++
 resolve/MemoryUtils.tpp                  |  75 +++++++++++
 resolve/cpu/CMakeLists.txt               |  25 ++++
 resolve/cpu/CpuMemory.hpp                | 134 ++++++++++++++++++++
 resolve/cpu/MemoryUtils.cpp              |  40 ++++++
 resolve/cuda/CMakeLists.txt              |  31 +++++
 resolve/cuda/CudaMemory.hpp              | 152 +++++++++++++++++++++++
 resolve/cuda/MemoryUtils.cu              |  40 ++++++
 resolve/{ => cuda}/cudaKernels.cu        |   0
 resolve/{ => cuda}/cudaKernels.h         |   0
 resolve/{ => cuda}/cuda_check_errors.hpp |   9 ++
 resolve/hip/.gitkeep                     |   0
 resolve/matrix/CMakeLists.txt            |   8 +-
 resolve/matrix/Coo.cpp                   |  49 ++++----
 resolve/matrix/Csc.cpp                   |  49 ++++----
 resolve/matrix/Csr.cpp                   |  50 ++++----
 resolve/matrix/MatrixHandler.cpp         |  13 +-
 resolve/matrix/MatrixHandler.hpp         |   3 +
 resolve/matrix/Sparse.cpp                |  16 +--
 resolve/matrix/Sparse.hpp                |   4 +
 resolve/memoryUtils.cu                   | 146 ----------------------
 resolve/memoryUtils.hpp                  | 117 -----------------
 resolve/resolve_defs.hpp.in              |  16 +++
 resolve/utilities/logger/CMakeLists.txt  |   2 +-
 resolve/vector/CMakeLists.txt            |  18 +--
 resolve/vector/Vector.cpp                |  37 +++---
 resolve/vector/Vector.hpp                |   3 +
 resolve/vector/VectorHandler.cpp         |   2 +-
 tests/CMakeLists.txt                     |   6 +-
 tests/unit/CMakeLists.txt                |   8 +-
 tests/unit/vector/GramSchmidtTests.hpp   |   4 +-
 41 files changed, 837 insertions(+), 444 deletions(-)
 create mode 100644 resolve/MemoryUtils.hpp
 create mode 100644 resolve/MemoryUtils.tpp
 create mode 100644 resolve/cpu/CMakeLists.txt
 create mode 100644 resolve/cpu/CpuMemory.hpp
 create mode 100644 resolve/cpu/MemoryUtils.cpp
 create mode 100644 resolve/cuda/CMakeLists.txt
 create mode 100644 resolve/cuda/CudaMemory.hpp
 create mode 100644 resolve/cuda/MemoryUtils.cu
 rename resolve/{ => cuda}/cudaKernels.cu (100%)
 rename resolve/{ => cuda}/cudaKernels.h (100%)
 rename resolve/{ => cuda}/cuda_check_errors.hpp (66%)
 create mode 100644 resolve/hip/.gitkeep
 delete mode 100644 resolve/memoryUtils.cu
 delete mode 100644 resolve/memoryUtils.hpp
 create mode 100644 resolve/resolve_defs.hpp.in

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 36ab2f37..5715e6e2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,7 +22,9 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}")
 endif()
 
 option(RESOLVE_TEST_WITH_BSUB "Use `jsrun` instead of `mpirun` commands when running tests" OFF)
-option(RESOLVE_USE_KLU "Use KLU, AMD and COLAMD libraries from SuiteSparse" ON)
+option(RESOLVE_USE_KLU  "Use KLU, AMD and COLAMD libraries from SuiteSparse" ON)
+option(RESOLVE_USE_GPU  "Use GPU device for computations" ON)
+option(RESOLVE_USE_CUDA "Use CUDA language and SDK" ON)
 set(RESOLVE_CTEST_OUTPUT_DIR ${PROJECT_BINARY_DIR} CACHE PATH "Directory where CTest outputs are saved")
 
 set(CMAKE_MACOSX_RPATH 1)
@@ -47,12 +49,6 @@ list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
 # TODO: Set up clang-format
 #include(./cmake/clang-format)
 
-# Configure CUDA
-include(CheckLanguage)
-enable_language(CUDA)
-check_language(CUDA)
-
-
 if (RESOLVE_USE_KLU)
   include(FindKLU)
   if(NOT KLU_LIBRARY)
@@ -63,22 +59,42 @@ else()
   message(STATUS "Not using SuiteSparse KLU")
 endif()
 
-if(NOT DEFINED CMAKE_CUDA_STANDARD)
-  set(CMAKE_CUDA_STANDARD 11)
-  set(CMAKE_CUDA_STANDARD_REQUIRED ON)
-endif()
+include(CheckLanguage)
+
+# Configure CUDA
+if(RESOLVE_USE_CUDA)
+  enable_language(CUDA)
+  check_language(CUDA)
+
+  if(NOT DEFINED CMAKE_CUDA_STANDARD)
+    set(CMAKE_CUDA_STANDARD 11)
+    set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+  endif()
 
-if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-  set(CMAKE_CUDA_ARCHITECTURES 60 CACHE STRING "Selects CUDA architectures")
+  if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
+    set(CMAKE_CUDA_ARCHITECTURES 60 CACHE STRING "Selects CUDA architectures")
+  endif()
+
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda")
+
+  include(ReSolveFindCudaLibraries)
+else()
+  message(STATUS "Not using CUDA")
 endif()
 
-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda")
+# The binary dir is already a global include directory
+configure_file(
+  ${CMAKE_SOURCE_DIR}/resolve/resolve_defs.hpp.in
+  ${CMAKE_BINARY_DIR}/resolve/resolve_defs.hpp)
 
+# include build directory for Fortran name mangling header
+include_directories(${CMAKE_BINARY_DIR})
 
-# Link in required cuda dependencies
-#find_package(CUDAToolkit REQUIRED)
+install(
+  FILES ${CMAKE_BINARY_DIR}/resolve/resolve_defs.hpp
+  DESTINATION include/resolve
+  )
 
-include(ReSolveFindCudaLibraries)
 
 include_directories(${CMAKE_SOURCE_DIR})
 
@@ -114,8 +130,10 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/ReSolveConfig.cmake"
         "${CMAKE_CURRENT_BINARY_DIR}/ReSolveConfigVersion.cmake"
         DESTINATION share/resolve/cmake)
     
-# Add examples
-add_subdirectory(examples)
+# Add examples (for now only CUDA examples are available)
+if(RESOLVE_USE_CUDA)
+  add_subdirectory(examples)
+endif(RESOLVE_USE_CUDA)
 
 # Add tests
 add_subdirectory(tests)
diff --git a/resolve/CMakeLists.txt b/resolve/CMakeLists.txt
index 49b50997..675fc11f 100644
--- a/resolve/CMakeLists.txt
+++ b/resolve/CMakeLists.txt
@@ -19,14 +19,8 @@ set(ReSolve_SRC
 )
 
 
-set(ReSolve_SRC_CUDA
-    cudaKernels.cu
-    memoryUtils.cu
-)
-
 set(ReSolve_HEADER_INSTALL
     Common.hpp
-    cudaKernels.h
     cusolver_defs.hpp
     LinAlgWorkspace.hpp
     LinSolver.hpp
@@ -37,18 +31,18 @@ set(ReSolve_HEADER_INSTALL
     RefactorizationSolver.hpp
     SystemSolver.hpp
     GramSchmidt.hpp
-    memoryUtils.hpp
+    MemoryUtils.hpp
 )
 
-set_source_files_properties(${ReSolve_SRC_CUDA} PROPERTIES LANGUAGE CUDA)
+# If GPU support is not enabled, add dummy device backend
+if(NOT RESOLVE_USE_GPU)
+    add_subdirectory(cpu)
+endif()
 
 # First create CUDA backend (this should really be CUDA _API_ backend, separate backend will be needed for CUDA SDK)
-add_library(resolve_backend_cuda SHARED ${ReSolve_SRC_CUDA})
-target_link_libraries(resolve_backend_cuda PUBLIC resolve_cuda)
-target_include_directories(resolve_backend_cuda INTERFACE
-    $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}>
-    $<INSTALL_INTERFACE:include>
-)
+if(RESOLVE_USE_CUDA)
+    add_subdirectory(cuda)
+endif()
 
 # Next build vector and matrix objects that may use this backend.
 add_subdirectory(vector)
@@ -57,9 +51,33 @@ add_subdirectory(matrix)
 
 # Build shared library ReSolve
 add_library(resolve_tpl INTERFACE)
-target_link_libraries(resolve_tpl INTERFACE resolve_cuda KLU)
 
-install(TARGETS resolve_matrix resolve_vector resolve_backend_cuda resolve_logger resolve_tpl EXPORT ReSolveTargets)
+if(RESOLVE_USE_KLU)
+    target_link_libraries(resolve_tpl INTERFACE KLU)    
+endif(RESOLVE_USE_KLU)
+
+if(RESOLVE_USE_CUDA)
+    target_link_libraries(resolve_tpl INTERFACE resolve_cuda)
+endif(RESOLVE_USE_CUDA)
+
+
+set(ReSolve_Targets_List
+    resolve_matrix
+    resolve_vector
+    resolve_logger
+    resolve_tpl
+)
+
+if(RESOLVE_USE_GPU)
+    if(RESOLVE_USE_CUDA)
+        set(ReSolve_Targets_List ${ReSolve_Targets_List} resolve_backend_cuda)
+    endif()
+else(RESOLVE_USE_GPU)
+    set(ReSolve_Targets_List ${ReSolve_Targets_List} resolve_backend_cpu)
+endif(RESOLVE_USE_GPU)
+
+
+install(TARGETS ${ReSolve_Targets_List} EXPORT ReSolveTargets)
 
 add_library(ReSolve SHARED ${ReSolve_SRC})
 
@@ -69,7 +87,7 @@ target_include_directories(ReSolve INTERFACE
 )
 
 # TODO: Make this PRIVATE dependency (requires refactoring ReSolve code)
-target_link_libraries(ReSolve PUBLIC resolve_matrix resolve_vector resolve_backend_cuda resolve_logger resolve_tpl)
+target_link_libraries(ReSolve PUBLIC ${ReSolve_Targets_List})
 
 install(TARGETS ReSolve
         EXPORT ReSolveTargets
diff --git a/resolve/LinAlgWorkspace.cpp b/resolve/LinAlgWorkspace.cpp
index fc38a2ec..9dee78c5 100644
--- a/resolve/LinAlgWorkspace.cpp
+++ b/resolve/LinAlgWorkspace.cpp
@@ -1,4 +1,3 @@
-#include <resolve/memoryUtils.hpp>
 #include "LinAlgWorkspace.hpp"
 
 namespace ReSolve
@@ -24,8 +23,8 @@ namespace ReSolve
 
   LinAlgWorkspaceCUDA::~LinAlgWorkspaceCUDA()
   {
-    if (buffer_spmv_ != nullptr)  deleteOnDevice(buffer_spmv_);
-    if (buffer_1norm_ != nullptr) deleteOnDevice(buffer_1norm_);
+    if (buffer_spmv_ != nullptr)  mem_.deleteOnDevice(buffer_spmv_);
+    if (buffer_1norm_ != nullptr) mem_.deleteOnDevice(buffer_1norm_);
     cusparseDestroy(handle_cusparse_);
     cusolverSpDestroy(handle_cusolversp_);
     cublasDestroy(handle_cublas_);
diff --git a/resolve/LinAlgWorkspace.hpp b/resolve/LinAlgWorkspace.hpp
index 7fe83aa9..e5c79580 100644
--- a/resolve/LinAlgWorkspace.hpp
+++ b/resolve/LinAlgWorkspace.hpp
@@ -4,6 +4,8 @@
 #include "cusparse.h"
 #include "cusolverSp.h"
 
+#include <resolve/MemoryUtils.hpp>
+
 namespace ReSolve
 {
   class LinAlgWorkspace
@@ -11,7 +13,8 @@ namespace ReSolve
     public:
       LinAlgWorkspace();
       ~LinAlgWorkspace();
-    private:
+    protected:
+      MemoryHandler mem_;
   };
 
 
diff --git a/resolve/LinSolverDirectCuSolverGLU.cpp b/resolve/LinSolverDirectCuSolverGLU.cpp
index e8130610..63413c5b 100644
--- a/resolve/LinSolverDirectCuSolverGLU.cpp
+++ b/resolve/LinSolverDirectCuSolverGLU.cpp
@@ -1,6 +1,6 @@
 #include <cstring> // includes memcpy
 #include <vector>
-#include <resolve/memoryUtils.hpp>
+
 #include <resolve/vector/Vector.hpp>
 #include <resolve/matrix/Csr.hpp>
 #include "LinSolverDirectCuSolverGLU.hpp"
@@ -14,7 +14,7 @@ namespace ReSolve
 
   LinSolverDirectCuSolverGLU::~LinSolverDirectCuSolverGLU()
   {
-    deleteOnDevice(glu_buffer_);
+    mem_.deleteOnDevice(glu_buffer_);
     cusparseDestroyMatDescr(descr_M_);
     cusparseDestroyMatDescr(descr_A_);
     cusolverSpDestroyGluInfo(info_M_);
@@ -64,7 +64,7 @@ namespace ReSolve
     status_cusolver_ = cusolverSpDgluBufferSize(handle_cusolversp_, info_M_, &buffer_size);
     error_sum += status_cusolver_; 
 
-    allocateBufferOnDevice(&glu_buffer_, buffer_size);
+    mem_.allocateBufferOnDevice(&glu_buffer_, buffer_size);
 
     status_cusolver_ = cusolverSpDgluAnalysis(handle_cusolversp_, info_M_, glu_buffer_);
     error_sum += status_cusolver_; 
diff --git a/resolve/LinSolverDirectCuSolverGLU.hpp b/resolve/LinSolverDirectCuSolverGLU.hpp
index 657118fd..d76e1921 100644
--- a/resolve/LinSolverDirectCuSolverGLU.hpp
+++ b/resolve/LinSolverDirectCuSolverGLU.hpp
@@ -3,6 +3,7 @@
 #include <resolve/LinAlgWorkspace.hpp>
 #include "LinSolver.hpp"
 #include "cusolver_defs.hpp"
+#include <resolve/MemoryUtils.hpp>
 
 namespace ReSolve 
 {
@@ -45,5 +46,7 @@ namespace ReSolve
       void* glu_buffer_;
       double r_nrminf_;
       int ite_refine_succ_; 
+
+      MemoryHandler mem_; ///< Device memory manager object
   };
 }
diff --git a/resolve/LinSolverDirectCuSolverRf.cpp b/resolve/LinSolverDirectCuSolverRf.cpp
index 602d6ec1..d51218cc 100644
--- a/resolve/LinSolverDirectCuSolverRf.cpp
+++ b/resolve/LinSolverDirectCuSolverRf.cpp
@@ -1,4 +1,3 @@
-#include <resolve/memoryUtils.hpp>
 #include <resolve/vector/Vector.hpp>
 #include <resolve/matrix/Csr.hpp>
 #include "LinSolverDirectCuSolverRf.hpp"
@@ -13,9 +12,9 @@ namespace ReSolve
   LinSolverDirectCuSolverRf::~LinSolverDirectCuSolverRf()
   {
     cusolverRfDestroy(handle_cusolverrf_);
-    deleteOnDevice(d_P_);
-    deleteOnDevice(d_Q_);
-    deleteOnDevice(d_T_);
+    mem_.deleteOnDevice(d_P_);
+    mem_.deleteOnDevice(d_Q_);
+    mem_.deleteOnDevice(d_T_);
   }
 
   int LinSolverDirectCuSolverRf::setup(matrix::Sparse* A, matrix::Sparse* L, matrix::Sparse* U, index_type* P, index_type* Q)
@@ -24,12 +23,12 @@ namespace ReSolve
     int error_sum = 0;
     this->A_ = (matrix::Csr*) A;
     index_type n = A_->getNumRows();
-    allocateArrayOnDevice(&d_P_, n); 
-    allocateArrayOnDevice(&d_Q_, n);
-    allocateArrayOnDevice(&d_T_, n);
+    mem_.allocateArrayOnDevice(&d_P_, n); 
+    mem_.allocateArrayOnDevice(&d_Q_, n);
+    mem_.allocateArrayOnDevice(&d_T_, n);
 
-    copyArrayHostToDevice(d_P_, P, n);
-    copyArrayHostToDevice(d_Q_, Q, n);
+    mem_.copyArrayHostToDevice(d_P_, P, n);
+    mem_.copyArrayHostToDevice(d_Q_, Q, n);
 
 
     status_cusolverrf_ = cusolverRfSetResetValuesFastMode(handle_cusolverrf_, CUSOLVERRF_RESET_VALUES_FAST_MODE_ON);
@@ -52,7 +51,7 @@ namespace ReSolve
                                                handle_cusolverrf_);
     error_sum += status_cusolverrf_;
 
-    deviceSynchronize();
+    mem_.deviceSynchronize();
     status_cusolverrf_ = cusolverRfAnalyze(handle_cusolverrf_);
     error_sum += status_cusolverrf_;
 
@@ -85,7 +84,7 @@ namespace ReSolve
                                                handle_cusolverrf_);
     error_sum += status_cusolverrf_;
 
-    deviceSynchronize();
+    mem_.deviceSynchronize();
     status_cusolverrf_ =  cusolverRfRefactor(handle_cusolverrf_);
     error_sum += status_cusolverrf_;
 
diff --git a/resolve/LinSolverDirectCuSolverRf.hpp b/resolve/LinSolverDirectCuSolverRf.hpp
index 478dfaa6..f0ee755e 100644
--- a/resolve/LinSolverDirectCuSolverRf.hpp
+++ b/resolve/LinSolverDirectCuSolverRf.hpp
@@ -2,6 +2,7 @@
 #include "Common.hpp"
 #include "LinSolver.hpp"
 #include "cusolverRf.h"
+#include <resolve/MemoryUtils.hpp>
 
 namespace ReSolve 
 {
@@ -40,5 +41,7 @@ namespace ReSolve
       index_type* d_P_;
       index_type* d_Q_;
       real_type* d_T_;
+
+      MemoryHandler mem_; ///< Device memory manager object
   };
 }
diff --git a/resolve/LinSolverIterativeFGMRES.cpp b/resolve/LinSolverIterativeFGMRES.cpp
index 28b89334..2a6da732 100644
--- a/resolve/LinSolverIterativeFGMRES.cpp
+++ b/resolve/LinSolverIterativeFGMRES.cpp
@@ -3,7 +3,6 @@
 #include <cmath>
 
 #include <resolve/utilities/logger/Logger.hpp>
-#include <resolve/memoryUtils.hpp>
 #include <resolve/matrix/MatrixHandler.hpp>
 #include "LinSolverIterativeFGMRES.hpp"
 
@@ -170,7 +169,7 @@ namespace ReSolve
         vec_v->setData( d_V_->getVectorData(i, "cuda"), "cuda");
         vec_z->setData( d_Z_->getVectorData(i, "cuda"), "cuda");
         this->precV(vec_v, vec_z);
-        deviceSynchronize();
+        mem_.deviceSynchronize();
 
         // V_{i+1}=A*Z_i
 
diff --git a/resolve/LinSolverIterativeFGMRES.hpp b/resolve/LinSolverIterativeFGMRES.hpp
index fb0baec9..8b2c722d 100644
--- a/resolve/LinSolverIterativeFGMRES.hpp
+++ b/resolve/LinSolverIterativeFGMRES.hpp
@@ -70,5 +70,7 @@ namespace ReSolve
     real_type final_residual_norm_;
     real_type initial_residual_norm_;
     index_type fgmres_iters_;
+
+    MemoryHandler mem_; ///< Device memory manager object
   };
 }
diff --git a/resolve/MemoryUtils.hpp b/resolve/MemoryUtils.hpp
new file mode 100644
index 00000000..77c2a7af
--- /dev/null
+++ b/resolve/MemoryUtils.hpp
@@ -0,0 +1,68 @@
+#pragma once
+
+#include <resolve/resolve_defs.hpp>
+
+namespace ReSolve
+{
+  /**
+   * @class MemoryUtils
+   * 
+   * @brief Provides basic memory allocation, free and copy functions.
+   * 
+   * This class provedes abstractions for memory management functiosn for
+   * different GPU programming models.
+   * 
+   * @tparam Policy - Memory management policy (vendor specific)
+   * 
+   * @author Slaven Peles <peless@ornl.gov>
+   */
+  template <class Policy>
+  class MemoryUtils
+  {
+    public:
+      MemoryUtils()  = default;
+      ~MemoryUtils() = default;
+
+      void deviceSynchronize();
+      int getLastDeviceError();
+      int deleteOnDevice(void* v);
+      
+      template <typename I, typename T>
+      int allocateArrayOnDevice(T** v, I n);
+      
+      template <typename I, typename T>
+      int allocateBufferOnDevice(T** v, I n);
+      
+      template <typename I, typename T>
+      int setZeroArrayOnDevice(T* v, I n);
+      
+      template <typename I, typename T>
+      int copyArrayDeviceToHost(T* dst, const T* src, I n);
+      
+      template <typename I, typename T>
+      int copyArrayDeviceToDevice(T* dst, const T* src, I n);
+      
+      template <typename I, typename T>
+      int copyArrayHostToDevice(T* dst, const T* src, I n);
+  };
+
+} // namespace ReSolve
+
+// Check if GPU support is enabled in Re::Solve and set appropriate device memory manager.
+#ifdef RESOLVE_USE_GPU
+
+#if defined RESOLVE_USE_CUDA
+#include <resolve/cuda/CudaMemory.hpp>
+using MemoryHandler = ReSolve::MemoryUtils<ReSolve::memory::Cuda>;
+#elif defined RESOLVE_USE_HIP
+#error HIP support requested, but not available! Probably a bug in CMake configuration.
+#endif
+
+#else
+
+// If no GPU support is present, set device memory manager to a dummy object.
+#include <resolve/cpu/CpuMemory.hpp>
+using MemoryHandler = ReSolve::MemoryUtils<ReSolve::memory::Cpu>;
+
+#endif
+
diff --git a/resolve/MemoryUtils.tpp b/resolve/MemoryUtils.tpp
new file mode 100644
index 00000000..936a3336
--- /dev/null
+++ b/resolve/MemoryUtils.tpp
@@ -0,0 +1,75 @@
+/**
+ * @file MemoryUtils.tpp
+ * 
+ * Contains implementation of memory utility functions wrappers.
+ * All it does it calls vendor specific functions frm an abstract interface.
+ * 
+ * @author Slaven Peles <peless@ornl.gov>
+ */
+
+#pragma once
+
+
+namespace ReSolve
+{
+    template <class Policy>
+    void MemoryUtils<Policy>::deviceSynchronize()
+    {
+      Policy::deviceSynchronize();
+    }
+
+    template <class Policy>
+    int MemoryUtils<Policy>::getLastDeviceError()
+    {
+      return Policy::getLastDeviceError();
+    }
+
+    template <class Policy>
+    int MemoryUtils<Policy>::deleteOnDevice(void* v)
+    {
+      return Policy::deleteOnDevice(v);
+    }
+    
+    template <class Policy>
+    template <typename I, typename T>
+    int MemoryUtils<Policy>::allocateArrayOnDevice(T** v, I n)
+    {
+      return Policy::template allocateArrayOnDevice<I, T>(v, n);
+    }
+    
+    template <class Policy>
+    template <typename I, typename T>
+    int MemoryUtils<Policy>::allocateBufferOnDevice(T** v, I n)
+    {
+      return Policy::template allocateBufferOnDevice<I, T>(v, n);
+    }
+    
+    template <class Policy>
+    template <typename I, typename T>
+    int MemoryUtils<Policy>::setZeroArrayOnDevice(T* v, I n)
+    {
+      return Policy::template setZeroArrayOnDevice<I, T>(v, n);
+    }
+    
+    template <class Policy>
+    template <typename I, typename T>
+    int MemoryUtils<Policy>::copyArrayDeviceToHost(T* dst, const T* src, I n)
+    {
+      return Policy::template copyArrayDeviceToHost<I, T>(dst, src, n);
+    }
+    
+    template <class Policy>
+    template <typename I, typename T>
+    int MemoryUtils<Policy>::copyArrayDeviceToDevice(T* dst, const T* src, I n)
+    {
+      return Policy::template copyArrayDeviceToDevice<I, T>(dst, src, n);
+    }
+    
+    template <class Policy>
+    template <typename I, typename T>
+    int MemoryUtils<Policy>::copyArrayHostToDevice(T* dst, const T* src, I n)
+    {
+      return Policy::template copyArrayHostToDevice<I, T>(dst, src, n);
+    }
+
+} // namespace ReSolve
\ No newline at end of file
diff --git a/resolve/cpu/CMakeLists.txt b/resolve/cpu/CMakeLists.txt
new file mode 100644
index 00000000..cb84dc55
--- /dev/null
+++ b/resolve/cpu/CMakeLists.txt
@@ -0,0 +1,25 @@
+#[[
+
+@brief Build ReSolve backend when there is no GPU support
+
+@author Slaven Peles <peless@ornl.gov>
+
+]]
+
+set(ReSolve_CPU_SRC
+    MemoryUtils.cpp
+)
+
+set(ReSolve_CPU_HEADER_INSTALL
+    CpuMemory.hpp
+)
+
+# First create dummy backend
+add_library(resolve_backend_cpu SHARED ${ReSolve_CPU_SRC})
+target_include_directories(resolve_backend_cpu INTERFACE
+    $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}>
+    $<INSTALL_INTERFACE:include>
+)
+
+# install include headers
+install(FILES ${ReSolve_CPU_HEADER_INSTALL} DESTINATION include/resolve/cpu)
diff --git a/resolve/cpu/CpuMemory.hpp b/resolve/cpu/CpuMemory.hpp
new file mode 100644
index 00000000..89a9982d
--- /dev/null
+++ b/resolve/cpu/CpuMemory.hpp
@@ -0,0 +1,134 @@
+#pragma once
+
+#include <resolve/utilities/logger/Logger.hpp>
+
+namespace ReSolve
+{
+  namespace memory
+  {
+    /**
+     * @brief Class containing dummy functions when there is no GPU support.
+     * 
+     * @author Slaven Peles <peless@ornl.gov>
+     */
+    struct Cpu
+    {
+      /**
+       * @brief Dummy function to stand in when GPU support is not enabled.
+       */
+      static void deviceSynchronize()
+      {
+        // Nothing to synchronize
+      }
+      
+      /**
+       * @brief Dummy function to stand in when GPU support is not enabled.
+       * 
+       * @return Allways return success!
+       */
+      static int getLastDeviceError()
+      {
+        // not on device, nothing to get
+        return 0;
+      }
+      
+      /** 
+       * @brief Dummy function to notify us something is wrong.
+       * 
+       * This will be called only if GPU device support is not built, so
+       * trying to access a device should indicate a bug in the code.
+       *
+       * @return Allways return failure!
+       */
+      static int deleteOnDevice(void* /* v */)
+      {
+        ReSolve::io::Logger::error() << "Trying to delete on a GPU device, but GPU support not available.\n";
+        return -1;
+      }
+
+      /** 
+       * @brief Dummy function to notify us something is wrong.
+       * 
+       * This will be called only if GPU device support is not built, so
+       * trying to access a device should indicate a bug in the code.
+       *
+       * @return Allways return failure!
+       */
+      template <typename I, typename T>
+      static int allocateArrayOnDevice(T** /* v */, I /* n */)
+      {
+        ReSolve::io::Logger::error() << "Trying to allocate on a GPU device, but GPU support not available.\n";
+        return -1;
+      }
+
+      /** 
+       * @brief Dummy function to notify us something is wrong.
+       * 
+       * This will be called only if GPU device support is not built, so
+       * trying to access a device should indicate a bug in the code.
+       *
+       * @return Allways return failure!
+       */
+      template <typename I, typename T>
+      static int allocateBufferOnDevice(T** /* v */, I /* n */)
+      {
+        ReSolve::io::Logger::error() << "Trying to allocate on a GPU device, but GPU support not available.\n";
+        return -1;
+      }
+
+      /** 
+       * @brief Dummy function to notify us something is wrong.
+       * 
+       * This will be called only if GPU device support is not built, so
+       * trying to access a device should indicate a bug in the code.
+       *
+       * @return Allways return failure!
+       */
+      template <typename I, typename T>
+      static int setZeroArrayOnDevice(T* /* v */, I /* n */)
+      {
+        ReSolve::io::Logger::error() << "Trying to initialize array on a GPU device, but GPU support not available.\n";
+        return -1;
+      }
+
+      /** 
+       * @brief Dummy function to notify us something is wrong.
+       * 
+       * This will be called only if GPU device support is not built, so
+       * trying to access a device should indicate a bug in the code.
+       *
+       * @return Allways return failure!
+       */
+      template <typename I, typename T>
+      static int copyArrayDeviceToHost(T* /* dst */, const T* /* src */, I /* n */)
+      {
+        ReSolve::io::Logger::error() << "Trying to copy from a GPU device, but GPU support not available.\n";
+        return -1;
+      }
+
+      /** 
+       * @brief Dummy function to notify us something is wrong.
+       * 
+       * This will be called only if GPU device support is not built, so
+       * trying to access a device should indicate a bug in the code.
+       *
+       * @return Allways return failure!
+       */
+      template <typename I, typename T>
+      static int copyArrayDeviceToDevice(T* /* dst */, const T* /* src */, I /* n */)
+      {
+        ReSolve::io::Logger::error() << "Trying to copy to a GPU device, but GPU support not available.\n";
+        return -1;
+      }
+
+      template <typename I, typename T>
+      static int copyArrayHostToDevice(T* /* dst */, const T* /* src */, I /* n */)
+      {
+        ReSolve::io::Logger::error() << "Trying to copy to a GPU device, but GPU support not available.\n";
+        return -1;
+      }
+
+    }; // struct Cuda
+  } // namespace memory
+
+} //namespace ReSolve
diff --git a/resolve/cpu/MemoryUtils.cpp b/resolve/cpu/MemoryUtils.cpp
new file mode 100644
index 00000000..03e913bc
--- /dev/null
+++ b/resolve/cpu/MemoryUtils.cpp
@@ -0,0 +1,40 @@
+/**
+ * @file MemoryUtils.cpp
+ * 
+ * This file includes MemoryUtils.tpp and specifies what functions to
+ * instantiate from function templates.
+ * 
+ * @author Slaven Peles <peless@ornl.gov>
+ */
+
+
+#include <iostream>
+
+#include <resolve/Common.hpp>
+#include <resolve/MemoryUtils.hpp>
+#include <resolve/cpu/CpuMemory.hpp>
+
+#include <resolve/MemoryUtils.tpp>
+
+namespace ReSolve
+{
+  template void MemoryUtils<memory::Cpu>::deviceSynchronize();
+  template int  MemoryUtils<memory::Cpu>::getLastDeviceError();
+  template int  MemoryUtils<memory::Cpu>::deleteOnDevice(void*);
+
+  template int  MemoryUtils<memory::Cpu>::allocateArrayOnDevice<index_type,  real_type>( real_type**, index_type);
+  template int  MemoryUtils<memory::Cpu>::allocateArrayOnDevice<index_type, index_type>(index_type**, index_type);
+
+  template int  MemoryUtils<memory::Cpu>::allocateBufferOnDevice<size_t, void>(void** v, size_t n);
+
+  template int  MemoryUtils<memory::Cpu>::setZeroArrayOnDevice<index_type, real_type>( real_type*, index_type);
+
+  template int  MemoryUtils<memory::Cpu>::copyArrayDeviceToHost<index_type,  real_type>( real_type*, const  real_type*, index_type);
+  template int  MemoryUtils<memory::Cpu>::copyArrayDeviceToHost<index_type, index_type>(index_type*, const index_type*, index_type);
+
+  template int  MemoryUtils<memory::Cpu>::copyArrayDeviceToDevice<index_type,  real_type>( real_type*, const  real_type*, index_type);
+  template int  MemoryUtils<memory::Cpu>::copyArrayDeviceToDevice<index_type, index_type>(index_type*, const index_type*, index_type);
+
+  template int  MemoryUtils<memory::Cpu>::copyArrayHostToDevice<index_type,  real_type>( real_type*, const  real_type*, index_type);
+  template int  MemoryUtils<memory::Cpu>::copyArrayHostToDevice<index_type, index_type>(index_type*, const index_type*, index_type);
+}
diff --git a/resolve/cuda/CMakeLists.txt b/resolve/cuda/CMakeLists.txt
new file mode 100644
index 00000000..d3ead313
--- /dev/null
+++ b/resolve/cuda/CMakeLists.txt
@@ -0,0 +1,31 @@
+#[[
+
+@brief Build ReSolve CUDA backend
+
+@author Slaven Peles <peless@ornl.gov>
+
+]]
+
+set(ReSolve_CUDA_SRC
+    cudaKernels.cu
+    MemoryUtils.cu
+)
+
+set(ReSolve_CUDA_HEADER_INSTALL
+    cudaKernels.h
+    CudaMemory.hpp
+    cuda_check_errors.hpp
+)
+
+set_source_files_properties(${ReSolve_CUDA_SRC} PROPERTIES LANGUAGE CUDA)
+
+# First create CUDA backend (this should really be CUDA _API_ backend, separate backend will be needed for CUDA SDK)
+add_library(resolve_backend_cuda SHARED ${ReSolve_CUDA_SRC})
+target_link_libraries(resolve_backend_cuda PUBLIC resolve_cuda)
+target_include_directories(resolve_backend_cuda INTERFACE
+    $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}>
+    $<INSTALL_INTERFACE:include>
+)
+
+# install include headers
+install(FILES ${ReSolve_CUDA_HEADER_INSTALL} DESTINATION include/resolve/cuda)
diff --git a/resolve/cuda/CudaMemory.hpp b/resolve/cuda/CudaMemory.hpp
new file mode 100644
index 00000000..a56ef37d
--- /dev/null
+++ b/resolve/cuda/CudaMemory.hpp
@@ -0,0 +1,152 @@
+#pragma once
+
+#include <iostream>
+#include <cuda_runtime.h>
+
+#include "cuda_check_errors.hpp"
+
+namespace ReSolve
+{
+  namespace memory
+  {
+    /**
+     * @brief Class containing wrappers for CUDA API functions.
+     * 
+     * All wrappers are implemented as static functions returning integer
+     * error code from CUDA API functions.
+     * 
+     * @author Slaven Peles <peless@ornl.gov>
+     */
+    struct Cuda
+    {
+      static void deviceSynchronize()
+      {
+        cudaDeviceSynchronize();
+      }
+      
+      static int getLastDeviceError()
+      {
+        return static_cast<int>(cudaGetLastError());
+      }
+      
+      /** 
+       * @brief deletes variable from device
+       *
+       * @param v - a variable on the device
+       *
+       * @post v is freed from the device
+       */
+      static int deleteOnDevice(void* v)
+      {
+        return checkCudaErrors(cudaFree(v));
+      }
+
+      /**
+       * @brief allocates array v onto device
+       *
+       * @param v - pointer to the array to be allocated on the device
+       * @param n - number of array elements (int, size_t)
+       * 
+       * @tparam T - Array element type
+       * @tparam I - Array index type
+       *
+       * @post v is now a array with size n on the device
+       */
+      template <typename I, typename T>
+      static int allocateArrayOnDevice(T** v, I n)
+      {
+        return checkCudaErrors(cudaMalloc((void**) v, sizeof(T) * n));
+      }
+
+      /**
+       * @brief allocates buffer v onto device.
+       * 
+       * The difference from the array is that buffer size is required in bytes,
+       * not number of elements.
+       *
+       * @param v - pointer to the buffer to be allocated on the device
+       * @param n - size of the buffer in bytes
+       * 
+       * @tparam T - Buffer element data type type (typically void)
+       * @tparam I - Buffer size type (typically size_t)
+       *
+       * @post v is now a buffer of n bytes
+       */
+      template <typename I, typename T>
+      static int allocateBufferOnDevice(T** v, I n)
+      {
+        return checkCudaErrors(cudaMalloc((void**) v, n));
+      }
+
+      /**
+       * @brief Sets elements of device array v to zero
+       *
+       * @param v - pointer to the array to be allocated on the device
+       * @param n - number of the array elements to be set to zero
+       * 
+       * @tparam T - Array element type
+       * @tparam I - Array index type
+       *
+       * @post First n elements of array v are set to zero
+       */
+      template <typename I, typename T>
+      static int setZeroArrayOnDevice(T* v, I n)
+      {
+        return checkCudaErrors(cudaMemset(v, 0, sizeof(T) * n));
+      }
+
+      /** 
+       * @brief Copies array `src` from device to the array `dst` on the host.
+       *
+       * @param[in]    n - size of src array
+       * @param[in]  src - array on device
+       * @param[out] dst - array on host
+       *
+       * @pre `src` is a pointer to an allocated array on the device
+       * @pre `dst` is allocated to size >= n on the host
+       * @post Content of `dst` is overwritten by the content of `src`
+       */
+      template <typename I, typename T>
+      static int copyArrayDeviceToHost(T* dst, const T* src, I n)
+      {
+        return checkCudaErrors(cudaMemcpy(dst, src, sizeof(T) * n, cudaMemcpyDeviceToHost));
+      }
+
+      /**
+       * @brief Copies array `src` to the array `dst` on the device.
+       *
+       * @param n - size of src array
+       * @param src - array on device to be copied
+       * @param dst - array on device to be copied onto
+       *
+       * @pre `src` is a pointer to an allocated array on the device
+       * @pre `dst` is allocated to size >= n on the device
+       * @post Content of `dst` is overwritten by the content of `src`
+       */
+      template <typename I, typename T>
+      static int copyArrayDeviceToDevice(T* dst, const T* src, I n)
+      {
+        return checkCudaErrors(cudaMemcpy(dst, src, sizeof(T) * n, cudaMemcpyDeviceToDevice));
+      }
+
+      /**
+       * @brief Copies array `src` from the host to the array `dst` on the device.
+       *
+       * @param n - size of src array
+       * @param src - array on the host to be copied
+       * @param dst - array on the device to be copied onto
+       *
+       * @pre `src` is a pointer to an allocated array on the host
+       * @pre `dst` is allocated to size >= n on the device
+       * @post Content of `dst` is overwritten by the content of `src`
+       */
+      template <typename I, typename T>
+      static int copyArrayHostToDevice(T* dst, const T* src, I n)
+      {
+        return checkCudaErrors(cudaMemcpy(dst, src, sizeof(T) * n, cudaMemcpyHostToDevice));
+      }
+
+    };
+  }
+
+} //namespace ReSolve
diff --git a/resolve/cuda/MemoryUtils.cu b/resolve/cuda/MemoryUtils.cu
new file mode 100644
index 00000000..880bff1c
--- /dev/null
+++ b/resolve/cuda/MemoryUtils.cu
@@ -0,0 +1,40 @@
+/**
+ * @file MemoryUtils.cu
+ * 
+ * This file includes MemoryUtils.tpp and specifies what functions to
+ * instantiate from function templates.
+ * 
+ * @author Slaven Peles <peless@ornl.gov>
+ */
+
+
+#include <iostream>
+
+#include <resolve/Common.hpp>
+#include <resolve/MemoryUtils.hpp>
+
+#include <resolve/MemoryUtils.tpp>
+
+namespace ReSolve
+{
+  template void MemoryUtils<memory::Cuda>::deviceSynchronize();
+  template int MemoryUtils<memory::Cuda>::getLastDeviceError();
+  template int MemoryUtils<memory::Cuda>::deleteOnDevice(void*);
+
+  template int MemoryUtils<memory::Cuda>::allocateArrayOnDevice<index_type,  real_type>( real_type**, index_type);
+  template int MemoryUtils<memory::Cuda>::allocateArrayOnDevice<index_type, index_type>(index_type**, index_type);
+
+  template int MemoryUtils<memory::Cuda>::allocateBufferOnDevice<size_t, void>(void** v, size_t n);
+
+  template int MemoryUtils<memory::Cuda>::setZeroArrayOnDevice<index_type, real_type>( real_type*, index_type);
+
+  template int MemoryUtils<memory::Cuda>::copyArrayDeviceToHost<index_type,  real_type>( real_type*, const  real_type*, index_type);
+  template int MemoryUtils<memory::Cuda>::copyArrayDeviceToHost<index_type, index_type>(index_type*, const index_type*, index_type);
+
+  template int MemoryUtils<memory::Cuda>::copyArrayDeviceToDevice<index_type,  real_type>( real_type*, const  real_type*, index_type);
+  template int MemoryUtils<memory::Cuda>::copyArrayDeviceToDevice<index_type, index_type>(index_type*, const index_type*, index_type);
+
+  template int MemoryUtils<memory::Cuda>::copyArrayHostToDevice<index_type,  real_type>( real_type*, const  real_type*, index_type);
+  template int MemoryUtils<memory::Cuda>::copyArrayHostToDevice<index_type, index_type>(index_type*, const index_type*, index_type);
+
+} //namespace ReSolve
diff --git a/resolve/cudaKernels.cu b/resolve/cuda/cudaKernels.cu
similarity index 100%
rename from resolve/cudaKernels.cu
rename to resolve/cuda/cudaKernels.cu
diff --git a/resolve/cudaKernels.h b/resolve/cuda/cudaKernels.h
similarity index 100%
rename from resolve/cudaKernels.h
rename to resolve/cuda/cudaKernels.h
diff --git a/resolve/cuda_check_errors.hpp b/resolve/cuda/cuda_check_errors.hpp
similarity index 66%
rename from resolve/cuda_check_errors.hpp
rename to resolve/cuda/cuda_check_errors.hpp
index 47ad0a1e..00a2029e 100644
--- a/resolve/cuda_check_errors.hpp
+++ b/resolve/cuda/cuda_check_errors.hpp
@@ -1,3 +1,12 @@
+/**
+ * @file cuda_check_errors.hpp
+ * 
+ * Contains macro to get error code from CUDA functions and to stream
+ * appropriate error output to Re::Solve's logger.
+ * 
+ * @author Kasia Swirydowicz <kasia.swirydowicz@pnnl.gov>
+ * @author Slaven Peles <peless@ornl.gov>
+ */
 #pragma once
 
 #include <resolve/utilities/logger/Logger.hpp>
diff --git a/resolve/hip/.gitkeep b/resolve/hip/.gitkeep
new file mode 100644
index 00000000..e69de29b
diff --git a/resolve/matrix/CMakeLists.txt b/resolve/matrix/CMakeLists.txt
index 142be6e9..89ac925d 100644
--- a/resolve/matrix/CMakeLists.txt
+++ b/resolve/matrix/CMakeLists.txt
@@ -20,9 +20,13 @@ set(Matrix_HEADER_INSTALL
 
 # Build shared library ReSolve::matrix
 add_library(resolve_matrix SHARED ${Matrix_SRC})
-target_link_libraries(resolve_matrix PUBLIC resolve_backend_cuda)
 
-#install(TARGETS resolve_matrix EXPORT ReSolveTargets)
+if (RESOLVE_USE_CUDA)
+    target_link_libraries(resolve_matrix PUBLIC resolve_backend_cuda)
+else()
+    target_link_libraries(resolve_matrix PUBLIC resolve_backend_cpu)
+endif()
+
 
 target_include_directories(resolve_matrix INTERFACE
     $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}>
diff --git a/resolve/matrix/Coo.cpp b/resolve/matrix/Coo.cpp
index 42454c34..c8caebf6 100644
--- a/resolve/matrix/Coo.cpp
+++ b/resolve/matrix/Coo.cpp
@@ -2,7 +2,6 @@
 #include <iostream>
 #include <iomanip> 
 
-#include <resolve/memoryUtils.hpp>
 #include "Coo.hpp"
 
 
@@ -102,13 +101,13 @@ namespace ReSolve
     if (memspaceOut == "cuda") {
       //check if cuda data allocated
       if (d_row_data_ == nullptr) {
-        allocateArrayOnDevice(&d_row_data_, nnz_current);
+        mem_.allocateArrayOnDevice(&d_row_data_, nnz_current);
       }
       if (d_col_data_ == nullptr) {
-        allocateArrayOnDevice(&d_col_data_, nnz_current);
+        mem_.allocateArrayOnDevice(&d_col_data_, nnz_current);
       }
       if (d_val_data_ == nullptr) {
-        allocateArrayOnDevice(&d_val_data_, nnz_current);
+        mem_.allocateArrayOnDevice(&d_val_data_, nnz_current);
       }
     }
 
@@ -122,25 +121,25 @@ namespace ReSolve
         owns_cpu_vals_ = true;
         break;
       case 2://cuda->cpu
-        copyArrayDeviceToHost(h_row_data_, row_data, nnz_current);
-        copyArrayDeviceToHost(h_col_data_, col_data, nnz_current);
-        copyArrayDeviceToHost(h_val_data_, val_data, nnz_current);
+        mem_.copyArrayDeviceToHost(h_row_data_, row_data, nnz_current);
+        mem_.copyArrayDeviceToHost(h_col_data_, col_data, nnz_current);
+        mem_.copyArrayDeviceToHost(h_val_data_, val_data, nnz_current);
         h_data_updated_ = true;
         owns_cpu_data_ = true;
         owns_cpu_vals_ = true;
         break;
       case 1://cpu->cuda
-        copyArrayHostToDevice(d_row_data_, row_data, nnz_current);
-        copyArrayHostToDevice(d_col_data_, col_data, nnz_current);
-        copyArrayHostToDevice(d_val_data_, val_data, nnz_current);
+        mem_.copyArrayHostToDevice(d_row_data_, row_data, nnz_current);
+        mem_.copyArrayHostToDevice(d_col_data_, col_data, nnz_current);
+        mem_.copyArrayHostToDevice(d_val_data_, val_data, nnz_current);
         d_data_updated_ = true;
         owns_gpu_data_ = true;
         owns_gpu_vals_ = true;
         break;
       case 3://cuda->cuda
-        copyArrayDeviceToDevice(d_row_data_, row_data, nnz_current);
-        copyArrayDeviceToDevice(d_col_data_, col_data, nnz_current);
-        copyArrayDeviceToDevice(d_val_data_, val_data, nnz_current);
+        mem_.copyArrayDeviceToDevice(d_row_data_, row_data, nnz_current);
+        mem_.copyArrayDeviceToDevice(d_col_data_, col_data, nnz_current);
+        mem_.copyArrayDeviceToDevice(d_val_data_, val_data, nnz_current);
         d_data_updated_ = true;
         owns_gpu_data_ = true;
         owns_gpu_vals_ = true;
@@ -178,9 +177,9 @@ namespace ReSolve
     }
 
     if (memspace == "cuda") {
-      allocateArrayOnDevice(&d_row_data_, nnz_current); 
-      allocateArrayOnDevice(&d_col_data_, nnz_current); 
-      allocateArrayOnDevice(&d_val_data_, nnz_current); 
+      mem_.allocateArrayOnDevice(&d_row_data_, nnz_current); 
+      mem_.allocateArrayOnDevice(&d_col_data_, nnz_current); 
+      mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); 
       owns_gpu_data_ = true;
       owns_gpu_vals_ = true;
       return 0;
@@ -206,9 +205,9 @@ namespace ReSolve
         if (h_val_data_ == nullptr) {
           h_val_data_ = new real_type[nnz_current];      
         }
-        copyArrayDeviceToHost(h_row_data_, d_row_data_, nnz_current);
-        copyArrayDeviceToHost(h_col_data_, d_col_data_, nnz_current);
-        copyArrayDeviceToHost(h_val_data_, d_val_data_, nnz_current);
+        mem_.copyArrayDeviceToHost(h_row_data_, d_row_data_, nnz_current);
+        mem_.copyArrayDeviceToHost(h_col_data_, d_col_data_, nnz_current);
+        mem_.copyArrayDeviceToHost(h_val_data_, d_val_data_, nnz_current);
         h_data_updated_ = true;
         owns_cpu_data_ = true;
         owns_cpu_vals_ = true;
@@ -219,17 +218,17 @@ namespace ReSolve
     if (memspaceOut == "cuda") {
       if ((d_data_updated_ == false) && (h_data_updated_ == true)) {
         if (d_row_data_ == nullptr) {
-          allocateArrayOnDevice(&d_row_data_, nnz_current);
+          mem_.allocateArrayOnDevice(&d_row_data_, nnz_current);
         }
         if (d_col_data_ == nullptr) {
-          allocateArrayOnDevice(&d_col_data_, nnz_current);
+          mem_.allocateArrayOnDevice(&d_col_data_, nnz_current);
         }
         if (d_val_data_ == nullptr) {
-          allocateArrayOnDevice(&d_val_data_, nnz_current);
+          mem_.allocateArrayOnDevice(&d_val_data_, nnz_current);
         }
-        copyArrayHostToDevice(d_row_data_, h_row_data_, nnz_current);
-        copyArrayHostToDevice(d_col_data_, h_col_data_, nnz_current);
-        copyArrayHostToDevice(d_val_data_, h_val_data_, nnz_current);
+        mem_.copyArrayHostToDevice(d_row_data_, h_row_data_, nnz_current);
+        mem_.copyArrayHostToDevice(d_col_data_, h_col_data_, nnz_current);
+        mem_.copyArrayHostToDevice(d_val_data_, h_val_data_, nnz_current);
         d_data_updated_ = true;
         owns_gpu_data_ = true;
         owns_gpu_vals_ = true;
diff --git a/resolve/matrix/Csc.cpp b/resolve/matrix/Csc.cpp
index 9a40408b..1a305e03 100644
--- a/resolve/matrix/Csc.cpp
+++ b/resolve/matrix/Csc.cpp
@@ -1,6 +1,5 @@
 #include <cstring>  // <-- includes memcpy
 
-#include <resolve/memoryUtils.hpp>
 #include "Csc.hpp"
 
 namespace ReSolve 
@@ -98,13 +97,13 @@ namespace ReSolve
     if (memspaceOut == "cuda") {
       //check if cuda data allocated
       if (d_col_data_ == nullptr) {
-        allocateArrayOnDevice(&d_col_data_, n_ + 1); 
+        mem_.allocateArrayOnDevice(&d_col_data_, n_ + 1); 
       }
       if (d_row_data_ == nullptr) {
-        allocateArrayOnDevice(&d_row_data_, nnz_current);
+        mem_.allocateArrayOnDevice(&d_row_data_, nnz_current);
       }
       if (d_val_data_ == nullptr) {
-        allocateArrayOnDevice(&d_val_data_, nnz_current); 
+        mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); 
       }
     }
 
@@ -118,25 +117,25 @@ namespace ReSolve
         owns_cpu_vals_ = true;
         break;
       case 2://cuda->cpu
-        copyArrayDeviceToHost(h_col_data_, col_data,      n_ + 1);
-        copyArrayDeviceToHost(h_row_data_, row_data, nnz_current);
-        copyArrayDeviceToHost(h_val_data_, val_data, nnz_current);
+        mem_.copyArrayDeviceToHost(h_col_data_, col_data,      n_ + 1);
+        mem_.copyArrayDeviceToHost(h_row_data_, row_data, nnz_current);
+        mem_.copyArrayDeviceToHost(h_val_data_, val_data, nnz_current);
         h_data_updated_ = true;
         owns_cpu_data_ = true;
         owns_cpu_vals_ = true;
         break;
       case 1://cpu->cuda
-        copyArrayHostToDevice(d_col_data_, col_data,      n_ + 1);
-        copyArrayHostToDevice(d_row_data_, row_data, nnz_current);
-        copyArrayHostToDevice(d_val_data_, val_data, nnz_current);
+        mem_.copyArrayHostToDevice(d_col_data_, col_data,      n_ + 1);
+        mem_.copyArrayHostToDevice(d_row_data_, row_data, nnz_current);
+        mem_.copyArrayHostToDevice(d_val_data_, val_data, nnz_current);
         d_data_updated_ = true;
         owns_gpu_data_ = true;
         owns_gpu_vals_ = true;
         break;
       case 3://cuda->cuda
-        copyArrayDeviceToDevice(d_col_data_, col_data,      n_ + 1);
-        copyArrayDeviceToDevice(d_row_data_, row_data, nnz_current);
-        copyArrayDeviceToDevice(d_val_data_, val_data, nnz_current);
+        mem_.copyArrayDeviceToDevice(d_col_data_, col_data,      n_ + 1);
+        mem_.copyArrayDeviceToDevice(d_row_data_, row_data, nnz_current);
+        mem_.copyArrayDeviceToDevice(d_val_data_, val_data, nnz_current);
         d_data_updated_ = true;
         owns_gpu_data_ = true;
         owns_gpu_vals_ = true;
@@ -175,9 +174,9 @@ namespace ReSolve
     }
 
     if (memspace == "cuda") {
-      allocateArrayOnDevice(&d_col_data_,      n_ + 1); 
-      allocateArrayOnDevice(&d_row_data_, nnz_current); 
-      allocateArrayOnDevice(&d_val_data_, nnz_current); 
+      mem_.allocateArrayOnDevice(&d_col_data_,      n_ + 1); 
+      mem_.allocateArrayOnDevice(&d_row_data_, nnz_current); 
+      mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); 
       owns_gpu_data_ = true;
       owns_gpu_vals_ = true;
       return 0;   
@@ -203,9 +202,9 @@ namespace ReSolve
         if (h_val_data_ == nullptr) {
           h_val_data_ = new real_type[nnz_current];      
         }
-        copyArrayDeviceToHost(h_col_data_, d_col_data_,      n_ + 1);
-        copyArrayDeviceToHost(h_row_data_, d_row_data_, nnz_current);
-        copyArrayDeviceToHost(h_val_data_, d_val_data_, nnz_current);
+        mem_.copyArrayDeviceToHost(h_col_data_, d_col_data_,      n_ + 1);
+        mem_.copyArrayDeviceToHost(h_row_data_, d_row_data_, nnz_current);
+        mem_.copyArrayDeviceToHost(h_val_data_, d_val_data_, nnz_current);
         h_data_updated_ = true;
         owns_cpu_data_ = true;
         owns_cpu_vals_ = true;
@@ -216,17 +215,17 @@ namespace ReSolve
     if (memspaceOut == "cuda") {
       if ((d_data_updated_ == false) && (h_data_updated_ == true)) {
         if (d_col_data_ == nullptr) {
-          allocateArrayOnDevice(&d_col_data_, n_ + 1); 
+          mem_.allocateArrayOnDevice(&d_col_data_, n_ + 1); 
         }
         if (d_row_data_ == nullptr) {
-          allocateArrayOnDevice(&d_row_data_, nnz_current); 
+          mem_.allocateArrayOnDevice(&d_row_data_, nnz_current); 
         }
         if (d_val_data_ == nullptr) {
-          allocateArrayOnDevice(&d_val_data_, nnz_current); 
+          mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); 
         }
-        copyArrayHostToDevice(d_col_data_, h_col_data_,      n_ + 1);
-        copyArrayHostToDevice(d_row_data_, h_row_data_, nnz_current);
-        copyArrayHostToDevice(d_val_data_, h_val_data_, nnz_current);
+        mem_.copyArrayHostToDevice(d_col_data_, h_col_data_,      n_ + 1);
+        mem_.copyArrayHostToDevice(d_row_data_, h_row_data_, nnz_current);
+        mem_.copyArrayHostToDevice(d_val_data_, h_val_data_, nnz_current);
         d_data_updated_ = true;
         owns_gpu_data_ = true;
         owns_gpu_vals_ = true;
diff --git a/resolve/matrix/Csr.cpp b/resolve/matrix/Csr.cpp
index 9decfd6f..f1ddd31f 100644
--- a/resolve/matrix/Csr.cpp
+++ b/resolve/matrix/Csr.cpp
@@ -1,5 +1,5 @@
 #include <cstring>  // <-- includes memcpy
-#include <resolve/memoryUtils.hpp>
+
 #include "Csr.hpp"
 
 namespace ReSolve 
@@ -97,13 +97,13 @@ namespace ReSolve
     if (memspaceOut == "cuda") {
       //check if cuda data allocated
       if (d_row_data_ == nullptr) {
-        allocateArrayOnDevice(&d_row_data_, n_ + 1); 
+        mem_.allocateArrayOnDevice(&d_row_data_, n_ + 1); 
       }
       if (d_col_data_ == nullptr) {
-        allocateArrayOnDevice(&d_col_data_, nnz_current);
+        mem_.allocateArrayOnDevice(&d_col_data_, nnz_current);
       }
       if (d_val_data_ == nullptr) {
-        allocateArrayOnDevice(&d_val_data_, nnz_current); 
+        mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); 
       }
     }
 
@@ -119,25 +119,25 @@ namespace ReSolve
         owns_cpu_vals_ = true;
         break;
       case 2://cuda->cpu
-        copyArrayDeviceToHost(h_row_data_, row_data,      n_ + 1);
-        copyArrayDeviceToHost(h_col_data_, col_data, nnz_current);
-        copyArrayDeviceToHost(h_val_data_, val_data, nnz_current);
+        mem_.copyArrayDeviceToHost(h_row_data_, row_data,      n_ + 1);
+        mem_.copyArrayDeviceToHost(h_col_data_, col_data, nnz_current);
+        mem_.copyArrayDeviceToHost(h_val_data_, val_data, nnz_current);
         h_data_updated_ = true;
         owns_cpu_data_ = true;
         owns_cpu_vals_ = true;
         break;
       case 1://cpu->cuda
-        copyArrayHostToDevice(d_row_data_, row_data,      n_ + 1);
-        copyArrayHostToDevice(d_col_data_, col_data, nnz_current);
-        copyArrayHostToDevice(d_val_data_, val_data, nnz_current);
+        mem_.copyArrayHostToDevice(d_row_data_, row_data,      n_ + 1);
+        mem_.copyArrayHostToDevice(d_col_data_, col_data, nnz_current);
+        mem_.copyArrayHostToDevice(d_val_data_, val_data, nnz_current);
         d_data_updated_ = true;
         owns_gpu_data_ = true;
         owns_gpu_vals_ = true;
         break;
       case 3://cuda->cuda
-        copyArrayDeviceToDevice(d_row_data_, row_data,      n_ + 1);
-        copyArrayDeviceToDevice(d_col_data_, col_data, nnz_current);
-        copyArrayDeviceToDevice(d_val_data_, val_data, nnz_current);
+        mem_.copyArrayDeviceToDevice(d_row_data_, row_data,      n_ + 1);
+        mem_.copyArrayDeviceToDevice(d_col_data_, col_data, nnz_current);
+        mem_.copyArrayDeviceToDevice(d_val_data_, val_data, nnz_current);
         d_data_updated_ = true;
         owns_gpu_data_ = true;
         owns_gpu_vals_ = true;
@@ -175,9 +175,9 @@ namespace ReSolve
     }
 
     if (memspace == "cuda") {
-      allocateArrayOnDevice(&d_row_data_,      n_ + 1); 
-      allocateArrayOnDevice(&d_col_data_, nnz_current); 
-      allocateArrayOnDevice(&d_val_data_, nnz_current); 
+      mem_.allocateArrayOnDevice(&d_row_data_,      n_ + 1); 
+      mem_.allocateArrayOnDevice(&d_col_data_, nnz_current); 
+      mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); 
       owns_gpu_data_ = true;
       owns_gpu_vals_ = true;
       return 0;  
@@ -202,9 +202,9 @@ namespace ReSolve
         if (h_val_data_ == nullptr) {
           h_val_data_ = new real_type[nnz_current];      
         }
-        copyArrayDeviceToHost(h_row_data_, d_row_data_,      n_ + 1);
-        copyArrayDeviceToHost(h_col_data_, d_col_data_, nnz_current);
-        copyArrayDeviceToHost(h_val_data_, d_val_data_, nnz_current);
+        mem_.copyArrayDeviceToHost(h_row_data_, d_row_data_,      n_ + 1);
+        mem_.copyArrayDeviceToHost(h_col_data_, d_col_data_, nnz_current);
+        mem_.copyArrayDeviceToHost(h_val_data_, d_val_data_, nnz_current);
         h_data_updated_ = true;
         owns_cpu_data_ = true;
         owns_cpu_vals_ = true;
@@ -215,17 +215,17 @@ namespace ReSolve
     if (memspaceOut == "cuda") {
       if ((d_data_updated_ == false) && (h_data_updated_ == true)) {
         if (d_row_data_ == nullptr) {
-          allocateArrayOnDevice(&d_row_data_, n_ + 1); 
+          mem_.allocateArrayOnDevice(&d_row_data_, n_ + 1); 
         }
         if (d_col_data_ == nullptr) {
-          allocateArrayOnDevice(&d_col_data_, nnz_current); 
+          mem_.allocateArrayOnDevice(&d_col_data_, nnz_current); 
         }
         if (d_val_data_ == nullptr) {
-          allocateArrayOnDevice(&d_val_data_, nnz_current); 
+          mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); 
         }
-        copyArrayHostToDevice(d_row_data_, h_row_data_,      n_ + 1);
-        copyArrayHostToDevice(d_col_data_, h_col_data_, nnz_current);
-        copyArrayHostToDevice(d_val_data_, h_val_data_, nnz_current);
+        mem_.copyArrayHostToDevice(d_row_data_, h_row_data_,      n_ + 1);
+        mem_.copyArrayHostToDevice(d_col_data_, h_col_data_, nnz_current);
+        mem_.copyArrayHostToDevice(d_val_data_, h_val_data_, nnz_current);
         d_data_updated_ = true;
         owns_gpu_data_ = true;
         owns_gpu_vals_ = true;
diff --git a/resolve/matrix/MatrixHandler.cpp b/resolve/matrix/MatrixHandler.cpp
index d240c77f..19119731 100644
--- a/resolve/matrix/MatrixHandler.cpp
+++ b/resolve/matrix/MatrixHandler.cpp
@@ -1,7 +1,6 @@
 #include <algorithm>
 
 #include <resolve/utilities/logger/Logger.hpp>
-#include <resolve/memoryUtils.hpp>
 #include <resolve/LinAlgWorkspace.hpp>
 #include <resolve/vector/Vector.hpp>
 #include <resolve/matrix/Coo.hpp>
@@ -290,8 +289,8 @@ namespace ReSolve {
                                            CUSPARSE_SPMV_CSR_ALG2, 
                                            &bufferSize);
           error_sum += status;
-          deviceSynchronize();
-          allocateBufferOnDevice(&buffer_spmv, bufferSize);
+          mem_.deviceSynchronize();
+          mem_.allocateBufferOnDevice(&buffer_spmv, bufferSize);
           workspaceCUDA->setSpmvMatrixDescriptor(matA);
           workspaceCUDA->setSpmvBuffer(buffer_spmv);
 
@@ -309,10 +308,10 @@ namespace ReSolve {
                               CUSPARSE_SPMV_CSR_ALG2, 
                               buffer_spmv);
         error_sum += status;
-        deviceSynchronize();
+        mem_.deviceSynchronize();
         if (status)
           out::error() << "Matvec status: " << status 
-                       << "Last error code: " << getLastDeviceError() << std::endl;
+                       << "Last error code: " << mem_.getLastDeviceError() << std::endl;
         vec_result->setDataUpdated("cuda");
 
         cusparseDestroyDnVec(vecx);
@@ -388,7 +387,7 @@ namespace ReSolve {
                                                               CUSPARSE_CSR2CSC_ALG1, 
                                                               &bufferSize);
       error_sum += status;
-      allocateBufferOnDevice(&d_work, bufferSize);
+      mem_.allocateBufferOnDevice(&d_work, bufferSize);
       status = cusparseCsr2cscEx2(workspaceCUDA->getCusparseHandle(),
                                   n, 
                                   m, 
@@ -406,7 +405,7 @@ namespace ReSolve {
                                   d_work);
       error_sum += status;
       return error_sum;
-      deleteOnDevice(d_work);
+      mem_.deleteOnDevice(d_work);
     } else { 
       out::error() << "Not implemented (yet)" << std::endl;
       return -1;
diff --git a/resolve/matrix/MatrixHandler.hpp b/resolve/matrix/MatrixHandler.hpp
index fe061f64..366b5422 100644
--- a/resolve/matrix/MatrixHandler.hpp
+++ b/resolve/matrix/MatrixHandler.hpp
@@ -5,6 +5,7 @@
 // (3) Matrix 1-norm
 #pragma once
 #include <resolve/Common.hpp>
+#include <resolve/MemoryUtils.hpp>
 
 namespace ReSolve
 { 
@@ -75,6 +76,8 @@ namespace ReSolve {
       LinAlgWorkspace* workspace_{nullptr};
       bool new_matrix_{true};     ///< if the structure changed, you need a new handler.
       bool values_changed_{true}; ///< needed for matvec
+
+      MemoryHandler mem_; ///< Device memory manager object
   };
 
 } // namespace ReSolve
diff --git a/resolve/matrix/Sparse.cpp b/resolve/matrix/Sparse.cpp
index d27cd7b1..5c866386 100644
--- a/resolve/matrix/Sparse.cpp
+++ b/resolve/matrix/Sparse.cpp
@@ -1,6 +1,6 @@
 #include <cstring>  // <-- includes memcpy
+
 #include "Sparse.hpp"
-#include <resolve/memoryUtils.hpp>
 
 namespace ReSolve { namespace matrix {
 
@@ -186,11 +186,11 @@ namespace ReSolve { namespace matrix {
     } else {
       if (memspace == "cuda"){ 
         if (owns_gpu_data_) {
-          deleteOnDevice(d_row_data_);
-          deleteOnDevice(d_col_data_);
+          mem_.deleteOnDevice(d_row_data_);
+          mem_.deleteOnDevice(d_col_data_);
         }
         if (owns_gpu_vals_) {
-          deleteOnDevice(d_val_data_);
+          mem_.deleteOnDevice(d_val_data_);
         }
       } else {
         return -1;
@@ -222,7 +222,7 @@ namespace ReSolve { namespace matrix {
     if (memspaceOut == "cuda") {
       //check if cuda data allocated
       if (d_val_data_ == nullptr) {
-        allocateArrayOnDevice(&d_val_data_, nnz_current); 
+        mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); 
       }
     }
 
@@ -233,17 +233,17 @@ namespace ReSolve { namespace matrix {
         owns_cpu_vals_ = true;
         break;
       case 2://cuda->cpu
-        copyArrayDeviceToHost(h_val_data_, new_vals, nnz_current);
+        mem_.copyArrayDeviceToHost(h_val_data_, new_vals, nnz_current);
         h_data_updated_ = true;
         owns_cpu_vals_ = true;
         break;
       case 1://cpu->cuda
-        copyArrayHostToDevice(d_val_data_, new_vals, nnz_current);
+        mem_.copyArrayHostToDevice(d_val_data_, new_vals, nnz_current);
         d_data_updated_ = true;
         owns_gpu_vals_ = true;
         break;
       case 3://cuda->cuda
-        copyArrayDeviceToDevice(d_val_data_, new_vals, nnz_current);
+        mem_.copyArrayDeviceToDevice(d_val_data_, new_vals, nnz_current);
         d_data_updated_ = true;
         owns_gpu_vals_ = true;
         break;
diff --git a/resolve/matrix/Sparse.hpp b/resolve/matrix/Sparse.hpp
index c44eb03c..1196c38e 100644
--- a/resolve/matrix/Sparse.hpp
+++ b/resolve/matrix/Sparse.hpp
@@ -3,6 +3,7 @@
 #pragma once
 #include <string>
 #include <resolve/Common.hpp>
+#include <resolve/MemoryUtils.hpp>
 
 namespace ReSolve { namespace matrix {
   class Sparse 
@@ -86,5 +87,8 @@ namespace ReSolve { namespace matrix {
 
       bool owns_gpu_data_{false}; ///< for row/col data
       bool owns_gpu_vals_{false}; ///< for values
+
+      MemoryHandler mem_; ///< Device memory manager object
+
   };
 }} // namespace ReSolve::matrix
diff --git a/resolve/memoryUtils.cu b/resolve/memoryUtils.cu
deleted file mode 100644
index f36668c3..00000000
--- a/resolve/memoryUtils.cu
+++ /dev/null
@@ -1,146 +0,0 @@
-#include <iostream>
-
-#include <resolve/Common.hpp>
-#include <resolve/memoryUtils.hpp>
-#include "cuda_check_errors.hpp"
-
-namespace ReSolve
-{
-  void deviceSynchronize()
-  {
-    cudaDeviceSynchronize();
-  }
-  
-  int getLastDeviceError()
-  {
-    return static_cast<int>(cudaGetLastError());
-  }
-  
-  /** 
-   * @brief deletes variable from device
-   *
-   * @param v - a variable on the device
-   *
-   * @post v is freed from the device
-   */
-  int deleteOnDevice(void* v)
-  {
-    return checkCudaErrors(cudaFree(v));
-  }
-
-  /**
-   * @brief allocates array v onto device
-   *
-   * @param v - pointer to the array to be allocated on the device
-   * @param n - number of array elements (int, size_t)
-   * 
-   * @tparam T - Array element type
-   * @tparam I - Array index type
-   *
-   * @post v is now a array with size n on the device
-   */
-  template <typename I, typename T>
-  int allocateArrayOnDevice(T** v, I n)
-  {
-    return checkCudaErrors(cudaMalloc((void**) v, sizeof(T) * n));
-  }
-  template int allocateArrayOnDevice<index_type,  real_type>( real_type**, index_type);
-  template int allocateArrayOnDevice<index_type, index_type>(index_type**, index_type);
-
-  /**
-   * @brief allocates buffer v onto device.
-   * 
-   * The difference from the array is that buffer size is required in bytes,
-   * not number of elements.
-   *
-   * @param v - pointer to the buffer to be allocated on the device
-   * @param n - size of the buffer in bytes
-   * 
-   * @tparam T - Buffer element data type type (typically void)
-   * @tparam I - Buffer size type (typically size_t)
-   *
-   * @post v is now a buffer of n bytes
-   */
-  template <typename I, typename T>
-  int allocateBufferOnDevice(T** v, I n)
-  {
-    return checkCudaErrors(cudaMalloc((void**) v, n));
-  }
-  template int allocateBufferOnDevice<size_t, void>(void** v, size_t n);
-
-  /**
-   * @brief Sets elements of device array v to zero
-   *
-   * @param v - pointer to the array to be allocated on the device
-   * @param n - number of the array elements to be set to zero
-   * 
-   * @tparam T - Array element type
-   * @tparam I - Array index type
-   *
-   * @post First n elements of array v are set to zero
-   */
-  template <typename I, typename T>
-  int setZeroArrayOnDevice(T* v, I n)
-  {
-    return checkCudaErrors(cudaMemset(v, 0, sizeof(T) * n));
-  }
-  template int setZeroArrayOnDevice<index_type,  real_type>( real_type*, index_type);
-
-  /** 
-   * @brief Copies array `src` from device to the array `dst` on the host.
-   *
-   * @param[in]    n - size of src array
-   * @param[in]  src - array on device
-   * @param[out] dst - array on host
-   *
-   * @pre `src` is a pointer to an allocated array on the device
-   * @pre `dst` is allocated to size >= n on the host
-   * @post Content of `dst` is overwritten by the content of `src`
-   */
-  template <typename I, typename T>
-  int copyArrayDeviceToHost(T* dst, const T* src, I n)
-  {
-    return checkCudaErrors(cudaMemcpy(dst, src, sizeof(T) * n, cudaMemcpyDeviceToHost));
-  }
-  template int copyArrayDeviceToHost<index_type,  real_type>( real_type*, const  real_type*, index_type);
-  template int copyArrayDeviceToHost<index_type, index_type>(index_type*, const index_type*, index_type);
-
-  /**
-   * @brief Copies array `src` to the array `dst` on the device.
-   *
-   * @param n - size of src array
-   * @param src - array on device to be copied
-   * @param dst - array on device to be copied onto
-   *
-   * @pre `src` is a pointer to an allocated array on the device
-   * @pre `dst` is allocated to size >= n on the device
-   * @post Content of `dst` is overwritten by the content of `src`
-   */
-  template <typename I, typename T>
-  int copyArrayDeviceToDevice(T* dst, const T* src, I n)
-  {
-    return checkCudaErrors(cudaMemcpy(dst, src, sizeof(T) * n, cudaMemcpyDeviceToDevice));
-  }
-  template int copyArrayDeviceToDevice<index_type,  real_type>( real_type*, const  real_type*, index_type);
-  template int copyArrayDeviceToDevice<index_type, index_type>(index_type*, const index_type*, index_type);
-
-  /**
-   * @brief Copies array `src` from the host to the array `dst` on the device.
-   *
-   * @param n - size of src array
-   * @param src - array on the host to be copied
-   * @param dst - array on the device to be copied onto
-   *
-   * @pre `src` is a pointer to an allocated array on the host
-   * @pre `dst` is allocated to size >= n on the device
-   * @post Content of `dst` is overwritten by the content of `src`
-   */
-  template <typename I, typename T>
-  int copyArrayHostToDevice(T* dst, const T* src, I n)
-  {
-    return checkCudaErrors(cudaMemcpy(dst, src, sizeof(T) * n, cudaMemcpyHostToDevice));
-  }
-  template int copyArrayHostToDevice<index_type,  real_type>( real_type*, const  real_type*, index_type);
-  template int copyArrayHostToDevice<index_type, index_type>(index_type*, const index_type*, index_type);
-
-} //namespace ReSolve
diff --git a/resolve/memoryUtils.hpp b/resolve/memoryUtils.hpp
deleted file mode 100644
index 0dcab1ba..00000000
--- a/resolve/memoryUtils.hpp
+++ /dev/null
@@ -1,117 +0,0 @@
-#pragma once
-
-#include <cstring>
-#include <iostream>
-
-namespace ReSolve
-{
-  void deviceSynchronize();
-
-  int getLastDeviceError();
-
-
-  int deleteOnDevice(void* v);
-
-  template <typename I, typename T>
-  int allocateArrayOnDevice(T** v, I n);
-
-  template <typename I, typename T>
-  int allocateBufferOnDevice(T** v, I n);
-
-  template <typename I, typename T>
-  int setZeroArrayOnDevice(T* v, I n);
-
-  template <typename I, typename T>
-  int copyArrayDeviceToHost(T* dst, const T* src, I n);
-
-  template <typename I, typename T>
-  int copyArrayDeviceToDevice(T* dst, const T* src, I n);
-
-  template <typename I, typename T>
-  int copyArrayHostToDevice(T* dst, const T* src, I n);
-
-  /**
-   * @brief copies a host array onto a newly allocated array on the device
-   *
-   * @param n - size of src array
-   * @param src - array on host to be cloned
-   * @param dst - array on device on which src is cloned
-   *
-   * @post dst is a clone of src on the device
-   */
-  template <typename I, typename T>
-  int cloneArrayHostToDevice(I n, T** src, T** dst)
-  {
-    allocateArrayOnDevice(n, dst);
-    copyArrayToDevice(n, *src, *dst);
-    return 0;
-  }
-
-  /**
-   * @brief prints array from host
-   *
-   * @param v - array on host
-   * @param display_n - number of elements to print
-   * @param label - name of array
-   *
-   * @pre display_n <= number of elements in v
-   * @post display_n elements of v printed
-   */
-  template <typename I, typename T>
-  void displayHostarray(T* v, 
-                        I start_i,
-                        I display_n, 
-                        std::string label = "array")
-  {
-    std::cout << "\n\n" << label << ": {";
-    for(int i = start_i; i < start_i + display_n - 1; i++){
-      std::cout << v[i] << ", ";
-    }
-    std::cout << v[display_n - 1] << "}\n" << std::endl; 
-  }
-
-  /**
-   * @brief prints array from device
-   *
-   * @param v - array on host
-   * @param display_n - number of elements to print
-   * @param n - number of elements in v
-   * @param label - name of array
-   *
-   * @pre display_n <= n
-   * @post display_n elements of v printed
-   */
-  template <typename I, typename T>
-  int displayDevicearray(T* v, 
-                         I n, 
-                         I start_i,
-                         I display_n, 
-                         std::string label = "array")
-  {
-    T* h_v = new T[n];
-    copyArrayDeviceToHost(n, v, h_v);
-    displayHostArray(h_v, start_i, display_n, label);
-    return 0;
-  }
-
-  /**
-   * @brief clones array of size n from src to dst
-   *
-   * @param n - size of array
-   * @param src - array to be cloned
-   * @param dst - clone target
-   *
-   * @pre n contain an int length
-   * @pre src is a valid array
-   *
-   * @post dst is a clone of src on device
-   */
-  template <typename I, typename T>
-  int cloneArrayDeviceToDevice(int n, T** src, T** dst)
-  {
-    allocateArrayOnDevice(n, dst);
-    copyArrayDeviceToDevice(n, *src, *dst);
-    return 0;
-  }
-
-} // namespace ReSolve
diff --git a/resolve/resolve_defs.hpp.in b/resolve/resolve_defs.hpp.in
new file mode 100644
index 00000000..9756376c
--- /dev/null
+++ b/resolve/resolve_defs.hpp.in
@@ -0,0 +1,16 @@
+#pragma once
+
+#cmakedefine RESOLVE_USE_GPU
+#cmakedefine RESOLVE_USE_CUDA
+#cmakedefine RESOLVE_USE_HIP
+#cmakedefine RESOLVE_USE_MPI
+#cmakedefine RESOLVE_USE_RAJA
+#cmakedefine RESOLVE_USE_EIGEN
+#cmakedefine RESOLVE_USE_KLU
+#define RESOLVE_VERSION  "@PROJECT_VERSION@"
+// #define RESOLVE_VERSION_MAJOR @PROJECT_VERSION_MAJOR@
+// #define RESOLVE_VERSION_MINOR @PROJECT_VERSION_MINOR@
+// #define RESOLVE_VERSION_PATCH @PROJECT_VERSION_PATCH@
+
+// /// Date of build with the format "%Y-%m-%d"
+// #define RESOLVE_RELEASE_DATE "@RESOLVE_RELEASE_DATE@"
diff --git a/resolve/utilities/logger/CMakeLists.txt b/resolve/utilities/logger/CMakeLists.txt
index cb1a9d89..91b29dfc 100644
--- a/resolve/utilities/logger/CMakeLists.txt
+++ b/resolve/utilities/logger/CMakeLists.txt
@@ -22,4 +22,4 @@ target_include_directories(resolve_logger INTERFACE
     $<INSTALL_INTERFACE:include>
 )
 
-install(FILES ${Matrix_HEADER_INSTALL} DESTINATION include/resolve/utilities/logger)
+install(FILES ${Logger_HEADER_INSTALL} DESTINATION include/resolve/utilities/logger)
diff --git a/resolve/vector/CMakeLists.txt b/resolve/vector/CMakeLists.txt
index 37e5564b..57feb5df 100644
--- a/resolve/vector/CMakeLists.txt
+++ b/resolve/vector/CMakeLists.txt
@@ -15,13 +15,17 @@ set(Vector_HEADER_INSTALL
     VectorHandler.hpp
 )
 
-set_source_files_properties(${Vector_SRC_CUDA} PROPERTIES LANGUAGE CUDA)
-
-# Build shared library ReSolve::vector
-add_library(resolve_vector SHARED ${Vector_SRC} ${Vector_SRC_CUDA})
-target_link_libraries(resolve_vector PUBLIC resolve_backend_cuda)
-
-#install(TARGETS resolve_vector EXPORT ReSolveTargets)
+if (RESOLVE_USE_CUDA)
+    set_source_files_properties(${Vector_SRC_CUDA} PROPERTIES LANGUAGE CUDA)
+
+    # Build shared library ReSolve::vector
+    add_library(resolve_vector SHARED ${Vector_SRC} ${Vector_SRC_CUDA})
+    target_link_libraries(resolve_vector PUBLIC resolve_backend_cuda)
+else()
+    # Build shared library ReSolve::vector
+    add_library(resolve_vector SHARED ${Vector_SRC})
+    target_link_libraries(resolve_vector PUBLIC resolve_backend_cpu)
+endif()
 
 target_include_directories(resolve_vector INTERFACE
     $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}>
diff --git a/resolve/vector/Vector.cpp b/resolve/vector/Vector.cpp
index e612794c..7934e8b0 100644
--- a/resolve/vector/Vector.cpp
+++ b/resolve/vector/Vector.cpp
@@ -1,5 +1,4 @@
 #include <cstring>
-#include <resolve/memoryUtils.hpp>
 #include <resolve/vector/Vector.hpp>
 #include <resolve/vector/VectorKernels.hpp>
 
@@ -34,7 +33,7 @@ namespace ReSolve { namespace vector {
   Vector::~Vector()
   {
     if (owns_cpu_data_) delete [] h_data_;
-    if (owns_gpu_data_) deleteOnDevice(d_data_);
+    if (owns_gpu_data_) mem_.deleteOnDevice(d_data_);
   }
 
 
@@ -100,7 +99,7 @@ namespace ReSolve { namespace vector {
     }
     if ((memspaceOut == "cuda") && (d_data_ == nullptr)){
       //allocate first
-      allocateArrayOnDevice(&d_data_, n_ * k_);
+      mem_.allocateArrayOnDevice(&d_data_, n_ * k_);
     } 
 
     switch(control)  {
@@ -111,19 +110,19 @@ namespace ReSolve { namespace vector {
         gpu_updated_ = false;
         break;
       case 2: //cuda->cpu
-        copyArrayDeviceToHost(h_data_, data, n_current_ * k_);
+        mem_.copyArrayDeviceToHost(h_data_, data, n_current_ * k_);
         owns_gpu_data_ = true;
         cpu_updated_ = true;
         gpu_updated_ = false;
         break;
       case 1: //cpu->cuda
-        copyArrayHostToDevice(d_data_, data, n_current_ * k_);
+        mem_.copyArrayHostToDevice(d_data_, data, n_current_ * k_);
         owns_gpu_data_ = true;
         gpu_updated_ = true;
         cpu_updated_ = false;
         break;
       case 3: //cuda->cuda
-        copyArrayDeviceToDevice(d_data_, data, n_current_ * k_);
+        mem_.copyArrayDeviceToDevice(d_data_, data, n_current_ * k_);
         owns_gpu_data_ = true;
         gpu_updated_ = true;
         cpu_updated_ = false;
@@ -174,16 +173,16 @@ namespace ReSolve { namespace vector {
     }
     if ((memspaceOut == "cuda") && (d_data_ == nullptr)){
       //allocate first
-      allocateArrayOnDevice(&d_data_, n_ * k_);
+      mem_.allocateArrayOnDevice(&d_data_, n_ * k_);
     } 
 
     switch(control)  {
       case 0: //cpu->cuda
-        copyArrayHostToDevice(d_data_, h_data_, n_current_ * k_);
+        mem_.copyArrayHostToDevice(d_data_, h_data_, n_current_ * k_);
         owns_gpu_data_ = true;
         break;
       case 1: //cuda->cpu
-        copyArrayDeviceToHost(h_data_, d_data_, n_current_ * k_);
+        mem_.copyArrayDeviceToHost(h_data_, d_data_, n_current_ * k_);
         owns_cpu_data_ = true;
         break;
       default:
@@ -202,8 +201,8 @@ namespace ReSolve { namespace vector {
       owns_cpu_data_ = true;
     } else {
       if (memspace == "cuda") {
-        deleteOnDevice(d_data_);
-        allocateArrayOnDevice(&d_data_, n_ * k_);
+        mem_.deleteOnDevice(d_data_);
+        mem_.allocateArrayOnDevice(&d_data_, n_ * k_);
         owns_gpu_data_ = true;
       }
     }
@@ -223,10 +222,10 @@ namespace ReSolve { namespace vector {
     } else {
       if (memspace == "cuda") {
         if (d_data_ == nullptr) {
-          allocateArrayOnDevice(&d_data_, n_ * k_);
+          mem_.allocateArrayOnDevice(&d_data_, n_ * k_);
           owns_gpu_data_ = true;
         }
-        setZeroArrayOnDevice(d_data_, n_ * k_);
+        mem_.setZeroArrayOnDevice(d_data_, n_ * k_);
       }
     }
   }
@@ -244,11 +243,11 @@ namespace ReSolve { namespace vector {
     } else {
       if (memspace == "cuda") {
         if (d_data_ == nullptr) {
-          allocateArrayOnDevice(&d_data_, n_ * k_);
+          mem_.allocateArrayOnDevice(&d_data_, n_ * k_);
           owns_gpu_data_ = true;
         }
         // TODO: We should not need to access raw data in this class
-        setZeroArrayOnDevice(&d_data_[j * n_current_], n_current_);
+        mem_.setZeroArrayOnDevice(&d_data_[j * n_current_], n_current_);
       }
     }
   }
@@ -266,7 +265,7 @@ namespace ReSolve { namespace vector {
     } else {
       if (memspace == "cuda") {
         if (d_data_ == nullptr) {
-          allocateArrayOnDevice(&d_data_, n_ * k_);
+          mem_.allocateArrayOnDevice(&d_data_, n_ * k_);
           owns_gpu_data_ = true;
         }
         set_array_const(n_ * k_, C, d_data_);
@@ -287,7 +286,7 @@ namespace ReSolve { namespace vector {
     } else {
       if (memspace == "cuda") {
         if (d_data_ == nullptr) {
-          allocateArrayOnDevice(&d_data_, n_ * k_);
+          mem_.allocateArrayOnDevice(&d_data_, n_ * k_);
           owns_gpu_data_ = true;
         }
         set_array_const(n_current_ * 1, C, &d_data_[n_current_ * j]);
@@ -324,7 +323,7 @@ namespace ReSolve { namespace vector {
         std::memcpy(dest, data, n_current_ * sizeof(real_type));
       } else {
         if (memspaceOut == "cuda") { 
-          copyArrayDeviceToDevice(dest, data, n_current_);
+          mem_.copyArrayDeviceToDevice(dest, data, n_current_);
         } else {
           //error
         } 
@@ -340,7 +339,7 @@ namespace ReSolve { namespace vector {
       std::memcpy(dest, data, n_current_ * k_ * sizeof(real_type));
     } else {
       if (memspaceOut == "cuda") { 
-        copyArrayDeviceToDevice(dest, data, n_current_ * k_);
+        mem_.copyArrayDeviceToDevice(dest, data, n_current_ * k_);
       } else {
         //error
       } 
diff --git a/resolve/vector/Vector.hpp b/resolve/vector/Vector.hpp
index c4d9ad80..9d1bd452 100644
--- a/resolve/vector/Vector.hpp
+++ b/resolve/vector/Vector.hpp
@@ -1,6 +1,7 @@
 #pragma once
 #include <string>
 #include <resolve/Common.hpp>
+#include <resolve/MemoryUtils.hpp>
 
 namespace ReSolve { namespace vector {
   class Vector 
@@ -42,5 +43,7 @@ namespace ReSolve { namespace vector {
 
       bool owns_gpu_data_{false};
       bool owns_cpu_data_{false};
+
+      MemoryHandler mem_; ///< Device memory manager object
   };
 }} // namespace ReSolve::vector
diff --git a/resolve/vector/VectorHandler.cpp b/resolve/vector/VectorHandler.cpp
index f695f533..032cc9b1 100644
--- a/resolve/vector/VectorHandler.cpp
+++ b/resolve/vector/VectorHandler.cpp
@@ -2,7 +2,7 @@
 
 #include <resolve/utilities/logger/Logger.hpp>
 #include <resolve/LinAlgWorkspace.hpp>
-#include <resolve/cudaKernels.h>
+#include <resolve/cuda/cudaKernels.h>
 #include <resolve/vector/Vector.hpp>
 #include "VectorHandler.hpp"
 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 62841b10..ae24f9db 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -5,6 +5,10 @@
 @author Slaven Peles <peless@ornl.gov>
 
 ]]
-add_subdirectory(functionality)
+
+if(RESOLVE_USE_CUDA)
+  add_subdirectory(functionality) 
+endif(RESOLVE_USE_CUDA)
+
 add_subdirectory(unit)
 
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index f91c2ff7..d61250fe 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -6,6 +6,8 @@
 
 ]]
 
-add_subdirectory(matrix)
-add_subdirectory(vector)
-add_subdirectory(utilities)
+if(RESOLVE_USE_CUDA)
+  add_subdirectory(matrix)
+  add_subdirectory(vector)
+  add_subdirectory(utilities)
+endif(RESOLVE_USE_CUDA)
diff --git a/tests/unit/vector/GramSchmidtTests.hpp b/tests/unit/vector/GramSchmidtTests.hpp
index 2ed889db..6f0d305b 100644
--- a/tests/unit/vector/GramSchmidtTests.hpp
+++ b/tests/unit/vector/GramSchmidtTests.hpp
@@ -1,6 +1,7 @@
 #pragma once
 #include <string>
 #include <vector>
+#include <iomanip>
 #include <resolve/GramSchmidt.hpp>
 #include <resolve/matrix/MatrixHandler.hpp>
 #include <resolve/vector/Vector.hpp>
@@ -145,8 +146,9 @@ namespace ReSolve {
                           << " Inner product computed: " << ip << ", expected: " << 0.0 << "\n";
                 break; 
               }
-              if ( (i == j) && (abs(sqrt(ip)) != 1.0)) {
+              if ( (i == j) && !isEqual(abs(sqrt(ip)), 1.0)) {
                 status = false;
+                std::cout << std::setprecision(16);
                 std::cout << "Vector " << i << " has norm: " << sqrt(ip) << " expected: "<< 1.0 <<"\n";
                 break; 
               }