diff --git a/resolve/vector/CMakeLists.txt b/resolve/vector/CMakeLists.txt index 8fa9cd59..f418bf55 100644 --- a/resolve/vector/CMakeLists.txt +++ b/resolve/vector/CMakeLists.txt @@ -10,6 +10,8 @@ set(Vector_SRC Vector.cpp VectorHandler.cpp + VectorHandlerCpu.cpp + VectorHandlerCuda.cpp ) diff --git a/resolve/vector/VectorHandler.cpp b/resolve/vector/VectorHandler.cpp index 71bfce9c..9648058e 100644 --- a/resolve/vector/VectorHandler.cpp +++ b/resolve/vector/VectorHandler.cpp @@ -4,6 +4,9 @@ #include #include #include +#include +#include +#include #include "VectorHandler.hpp" namespace ReSolve { @@ -14,6 +17,8 @@ namespace ReSolve { */ VectorHandler::VectorHandler() { + cpuImpl_ = new VectorHandlerCpu(); + isCpuEnabled_ = true; } /** @@ -21,9 +26,24 @@ namespace ReSolve { * * @param new_workspace - workspace to be set */ - VectorHandler:: VectorHandler(LinAlgWorkspace* new_workspace) + VectorHandler::VectorHandler(LinAlgWorkspace* new_workspace) { - workspace_ = new_workspace; + cpuImpl_ = new VectorHandlerCpu(new_workspace); + isCpuEnabled_ = true; + } + + /** + * @brief constructor + * + * @param new_workspace - workspace to be set + */ + VectorHandler::VectorHandler(LinAlgWorkspaceCUDA* new_workspace) + { + cudaImpl_ = new VectorHandlerCuda(new_workspace); + cpuImpl_ = new VectorHandlerCpu(); + + isCudaEnabled_ = true; + isCpuEnabled_ = true; } /** @@ -46,28 +66,11 @@ namespace ReSolve { real_type VectorHandler::dot(vector::Vector* x, vector::Vector* y, std::string memspace) { - if (memspace == "cuda" ){ - LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_; - cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); - double nrm = 0.0; - cublasStatus_t st= cublasDdot (handle_cublas, x->getSize(), x->getData("cuda"), 1, y->getData("cuda"), 1, &nrm); - if (st!=0) {printf("dot product crashed with code %d \n", st);} - return nrm; + if (memspace == "cuda" ) { + return cudaImpl_->dot(x, y); } else { if (memspace == "cpu") { - real_type* x_data = x->getData("cpu"); - real_type* y_data = y->getData("cpu"); - real_type sum = 0.0; - real_type c = 0.0; - real_type t, y; - for (int i = 0; i < x->getSize(); ++i){ - y = (x_data[i] * y_data[i]) - c; - t = sum + y; - c = (t - sum) - y; - sum = t; - // sum += (x_data[i] * y_data[i]); - } - return sum; + return cpuImpl_->dot(x, y); } else { out::error() << "Not implemented (yet)" << std::endl; return NAN; @@ -85,20 +88,11 @@ namespace ReSolve { */ void VectorHandler::scal(const real_type* alpha, vector::Vector* x, std::string memspace) { - if (memspace == "cuda" ) { - LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_; - cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); - cublasStatus_t st = cublasDscal(handle_cublas, x->getSize(), alpha, x->getData("cuda"), 1); - if (st!=0) { - ReSolve::io::Logger::error() << "scal crashed with code " << st << "\n"; - } + if (memspace == "cuda" ) { + cudaImpl_->scal(alpha, x); } else { if (memspace == "cpu") { - real_type* x_data = x->getData("cpu"); - - for (int i = 0; i < x->getSize(); ++i){ - x_data[i] *= (*alpha); - } + cpuImpl_->scal(alpha, x); } else { out::error() << "Not implemented (yet)" << std::endl; } @@ -114,26 +108,14 @@ namespace ReSolve { * @param[in] memspace String containg memspace (cpu or cuda) * */ - void VectorHandler::axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y, std::string memspace ) + void VectorHandler::axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y, std::string memspace) { //AXPY: y = alpha 
* x + y - if (memspace == "cuda" ) { - LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_; - cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); - cublasDaxpy(handle_cublas, - x->getSize(), - alpha, - x->getData("cuda"), - 1, - y->getData("cuda"), - 1); + if (memspace == "cuda" ) { + cudaImpl_->axpy(alpha, x, y); } else { if (memspace == "cpu") { - real_type* x_data = x->getData("cpu"); - real_type* y_data = y->getData("cpu"); - for (int i = 0; i < x->getSize(); ++i){ - y_data[i] = (*alpha) * x_data[i] + y_data[i]; - } + cpuImpl_->axpy(alpha, x, y); } else { out::error() <<"Not implemented (yet)" << std::endl; } @@ -160,38 +142,9 @@ namespace ReSolve { void VectorHandler::gemv(std::string transpose, index_type n, index_type k, const real_type* alpha, const real_type* beta, vector::Vector* V, vector::Vector* y, vector::Vector* x, std::string memspace) { if (memspace == "cuda") { - LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_; - cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); - if (transpose == "T") { - - cublasDgemv(handle_cublas, - CUBLAS_OP_T, - n, - k, - alpha, - V->getData("cuda"), - n, - y->getData("cuda"), - 1, - beta, - x->getData("cuda"), - 1); - - } else { - cublasDgemv(handle_cublas, - CUBLAS_OP_N, - n, - k, - alpha, - V->getData("cuda"), - n, - y->getData("cuda"), - 1, - beta, - x->getData("cuda"), - 1); - } - + cudaImpl_->gemv(transpose, n, k, alpha, beta, V, y, x); + } else if (memspace == "cpu") { + cpuImpl_->gemv(transpose, n, k, alpha, beta, V, y, x); } else { out::error() << "Not implemented (yet)" << std::endl; } @@ -213,26 +166,9 @@ namespace ReSolve { { using namespace constants; if (memspace == "cuda") { - if (k < 200) { - mass_axpy(size, k, x->getData("cuda"), y->getData("cuda"),alpha->getData("cuda")); - } else { - LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_; - cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); - cublasDgemm(handle_cublas, - CUBLAS_OP_N, - CUBLAS_OP_N, - size, // m - 1, // n - k + 1, // k - &MINUSONE, // alpha - x->getData("cuda"), // A - size, // lda - alpha->getData("cuda"), // B - k + 1, // ldb - &ONE, - y->getData("cuda"), // c - size); // ldc - } + cudaImpl_->massAxpy(size, alpha, k, x, y); + } else if (memspace == "cpu") { + cpuImpl_->massAxpy(size, alpha, k, x, y); } else { out::error() << "Not implemented (yet)" << std::endl; } @@ -254,29 +190,10 @@ namespace ReSolve { */ void VectorHandler::massDot2Vec(index_type size, vector::Vector* V, index_type k, vector::Vector* x, vector::Vector* res, std::string memspace) { - using namespace constants; - if (memspace == "cuda") { - if (k < 200) { - mass_inner_product_two_vectors(size, k, x->getData("cuda") , x->getData(1, "cuda"), V->getData("cuda"), res->getData("cuda")); - } else { - LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_; - cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); - cublasDgemm(handle_cublas, - CUBLAS_OP_T, - CUBLAS_OP_N, - k + 1, //m - 2, //n - size, //k - &ONE, //alpha - V->getData("cuda"), //A - size, //lda - x->getData("cuda"), //B - size, //ldb - &ZERO, - res->getData("cuda"), //c - k + 1); //ldc - } + cudaImpl_->massDot2Vec(size, V, k, x, res); + } else if (memspace == "cpu") { + cpuImpl_->massDot2Vec(size, V, k, x, res); } else { out::error() << "Not implemented (yet)" << std::endl; } diff --git a/resolve/vector/VectorHandler.hpp b/resolve/vector/VectorHandler.hpp index bf2d37d7..53ec0bca 100644 --- 
a/resolve/vector/VectorHandler.hpp +++ b/resolve/vector/VectorHandler.hpp @@ -7,7 +7,9 @@ namespace ReSolve { class Vector; } + class VectorHandlerImpl; class LinAlgWorkspace; + class LinAlgWorkspaceCUDA; } @@ -16,13 +18,14 @@ namespace ReSolve { //namespace vector { public: VectorHandler(); VectorHandler(LinAlgWorkspace* new_workspace); + VectorHandler(LinAlgWorkspaceCUDA* new_workspace); ~VectorHandler(); //y = alpha x + y - void axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y, std::string memspace ); + void axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y, std::string memspace); //dot: x \cdot y - real_type dot(vector::Vector* x, vector::Vector* y, std::string memspace ); + real_type dot(vector::Vector* x, vector::Vector* y, std::string memspace); //scal = alpha * x void scal(const real_type* alpha, vector::Vector* x, std::string memspace); @@ -40,9 +43,21 @@ namespace ReSolve { //namespace vector { * if `transpose = T` (yes), `x = beta*x + alpha*V^T*y`, * where `x` is `[k x 1]`, `V` is `[n x k]` and `y` is `[n x 1]`. */ - void gemv(std::string transpose, index_type n, index_type k, const real_type* alpha, const real_type* beta, vector::Vector* V, vector::Vector* y, vector::Vector* x, std::string memspace); + void gemv(std::string transpose, + index_type n, + index_type k, + const real_type* alpha, + const real_type* beta, + vector::Vector* V, + vector::Vector* y, + vector::Vector* x, + std::string memspace); private: - LinAlgWorkspace* workspace_; + VectorHandlerImpl* cpuImpl_{nullptr}; + VectorHandlerImpl* cudaImpl_{nullptr}; + + bool isCpuEnabled_{false}; + bool isCudaEnabled_{false}; }; } //} // namespace ReSolve::vector diff --git a/resolve/vector/VectorHandlerCpu.cpp b/resolve/vector/VectorHandlerCpu.cpp new file mode 100644 index 00000000..593ba5b0 --- /dev/null +++ b/resolve/vector/VectorHandlerCpu.cpp @@ -0,0 +1,159 @@ +#include + +#include +#include +#include +#include +#include +#include "VectorHandlerCpu.hpp" + +namespace ReSolve { + using out = io::Logger; + + /** + * @brief empty constructor that does absolutely nothing + */ + VectorHandlerCpu::VectorHandlerCpu() + { + } + + /** + * @brief constructor + * + * @param new_workspace - workspace to be set + */ + VectorHandlerCpu:: VectorHandlerCpu(LinAlgWorkspace* new_workspace) + { + workspace_ = new_workspace; + } + + /** + * @brief destructor + */ + VectorHandlerCpu::~VectorHandlerCpu() + { + //delete the workspace TODO + } + + /** + * @brief dot product of two vectors i.e, a = x^Ty + * + * @param[in] x The first vector + * @param[in] y The second vector + * @param[in] memspace String containg memspace (cpu or cuda) + * + * @return dot product (real number) of _x_ and _y_ + */ + + real_type VectorHandlerCpu::dot(vector::Vector* x, vector::Vector* y) + { + real_type* x_data = x->getData("cpu"); + real_type* y_data = y->getData("cpu"); + real_type sum = 0.0; + real_type c = 0.0; + // real_type t, y; + for (int i = 0; i < x->getSize(); ++i) { + real_type y = (x_data[i] * y_data[i]) - c; + real_type t = sum + y; + c = (t - sum) - y; + sum = t; + // sum += (x_data[i] * y_data[i]); + } + return sum; + } + + /** + * @brief scale a vector by a constant i.e, x = alpha*x where alpha is a constant + * + * @param[in] alpha The constant + * @param[in,out] x The vector + * @param memspace string containg memspace (cpu or cuda) + * + */ + void VectorHandlerCpu::scal(const real_type* alpha, vector::Vector* x) + { + real_type* x_data = x->getData("cpu"); + + for (int i = 0; i < x->getSize(); ++i){ + 
x_data[i] *= (*alpha); + } + } + + /** + * @brief axpy i.e, y = alpha*x+y where alpha is a constant + * + * @param[in] alpha The constant + * @param[in] x The first vector + * @param[in,out] y The second vector (result is return in y) + * @param[in] memspace String containg memspace (cpu or cuda) + * + */ + void VectorHandlerCpu::axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y) + { + //AXPY: y = alpha * x + y + real_type* x_data = x->getData("cpu"); + real_type* y_data = y->getData("cpu"); + for (int i = 0; i < x->getSize(); ++i) { + y_data[i] = (*alpha) * x_data[i] + y_data[i]; + } + } + + /** + * @brief gemv computes matrix-vector product where both matrix and vectors are dense. + * i.e., x = beta*x + alpha*V*y + * + * @param[in] Transpose - yes (T) or no (N) + * @param[in] n Number of rows in (non-transposed) matrix + * @param[in] k Number of columns in (non-transposed) + * @param[in] alpha Constant real number + * @param[in] beta Constant real number + * @param[in] V Multivector containing the matrix, organized columnwise + * @param[in] y Vector, k x 1 if N and n x 1 if T + * @param[in,out] x Vector, n x 1 if N and k x 1 if T + * @param[in] memspace cpu or cuda (for now) + * + * @pre V is stored colum-wise, _n_ > 0, _k_ > 0 + * + */ + void VectorHandlerCpu::gemv(std::string transpose, index_type n, index_type k, const real_type* alpha, const real_type* beta, vector::Vector* V, vector::Vector* y, vector::Vector* x) + { + out::error() << "Not implemented (yet)" << std::endl; + } + + /** + * @brief mass (bulk) axpy i.e, y = y - x*alpha where alpha is a vector + * + * @param[in] size number of elements in y + * @param[in] alpha vector size k x 1 + * @param[in] x (multi)vector size size x k + * @param[in,out] y vector size size x 1 (this is where the result is stored) + * @param[in] memspace string containg memspace (cpu or cuda) + * + * @pre _k_ > 0, _size_ > 0, _size_ = x->getSize() + * + */ + void VectorHandlerCpu::massAxpy(index_type size, vector::Vector* alpha, index_type k, vector::Vector* x, vector::Vector* y) + { + out::error() << "Not implemented (yet)" << std::endl; + } + + /** + * @brief mass (bulk) dot product i.e, V^T x, where V is n x k dense multivector (a dense multivector consisting of k vectors size n) + * and x is k x 2 dense multivector (a multivector consisiting of two vectors size n each) + * + * @param[in] size Number of elements in a single vector in V + * @param[in] V Multivector; k vectors size n x 1 each + * @param[in] k Number of vectors in V + * @param[in] x Multivector; 2 vectors size n x 1 each + * @param[out] res Multivector; 2 vectors size k x 1 each (result is returned in res) + * @param[in] memspace String containg memspace (cpu or cuda) + * + * @pre _size_ > 0, _k_ > 0, size = x->getSize(), _res_ needs to be allocated + * + */ + void VectorHandlerCpu::massDot2Vec(index_type size, vector::Vector* V, index_type k, vector::Vector* x, vector::Vector* res) + { + out::error() << "Not implemented (yet)" << std::endl; + } + +} // namespace ReSolve diff --git a/resolve/vector/VectorHandlerCpu.hpp b/resolve/vector/VectorHandlerCpu.hpp new file mode 100644 index 00000000..a5f09162 --- /dev/null +++ b/resolve/vector/VectorHandlerCpu.hpp @@ -0,0 +1,57 @@ +#pragma once +#include + +namespace ReSolve +{ + namespace vector + { + class Vector; + } + class LinAlgWorkspace; + class VectorHandlerImpl; +} + + +namespace ReSolve { //namespace vector { + class VectorHandlerCpu : public VectorHandlerImpl + { + public: + VectorHandlerCpu(); + 
VectorHandlerCpu(LinAlgWorkspace* workspace); + virtual ~VectorHandlerCpu(); + + //y = alpha x + y + virtual void axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y); + + //dot: x \cdot y + virtual real_type dot(vector::Vector* x, vector::Vector* y); + + //scal = alpha * x + virtual void scal(const real_type* alpha, vector::Vector* x); + + //mass axpy: x*alpha + y where x is [n x k] and alpha is [k x 1]; x is stored columnwise + virtual void massAxpy(index_type size, vector::Vector* alpha, index_type k, vector::Vector* x, vector::Vector* y); + + //mass dot: V^T x, where V is [n x k] and x is [k x 2], everything is stored and returned columnwise + //Size = n + virtual void massDot2Vec(index_type size, vector::Vector* V, index_type k, vector::Vector* x, vector::Vector* res); + + /** gemv: + * if `transpose = N` (no), `x = beta*x + alpha*V*y`, + * where `x` is `[n x 1]`, `V` is `[n x k]` and `y` is `[k x 1]`. + * if `transpose = T` (yes), `x = beta*x + alpha*V^T*y`, + * where `x` is `[k x 1]`, `V` is `[n x k]` and `y` is `[n x 1]`. + */ + virtual void gemv(std::string transpose, + index_type n, + index_type k, + const real_type* alpha, + const real_type* beta, + vector::Vector* V, + vector::Vector* y, + vector::Vector* x); + private: + LinAlgWorkspace* workspace_; + }; + +} //} // namespace ReSolve::vector diff --git a/resolve/vector/VectorHandlerCuda.cpp b/resolve/vector/VectorHandlerCuda.cpp new file mode 100644 index 00000000..69044f37 --- /dev/null +++ b/resolve/vector/VectorHandlerCuda.cpp @@ -0,0 +1,236 @@ +#include + +#include +#include +#include +#include +#include +#include "VectorHandlerCuda.hpp" + +namespace ReSolve { + using out = io::Logger; + + /** + * @brief empty constructor that does absolutely nothing + */ + VectorHandlerCuda::VectorHandlerCuda() + { + } + + /** + * @brief constructor + * + * @param new_workspace - workspace to be set + */ + VectorHandlerCuda:: VectorHandlerCuda(LinAlgWorkspaceCUDA* new_workspace) + { + workspace_ = new_workspace; + } + + /** + * @brief destructor + */ + VectorHandlerCuda::~VectorHandlerCuda() + { + //delete the workspace TODO + } + + /** + * @brief dot product of two vectors i.e, a = x^Ty + * + * @param[in] x The first vector + * @param[in] y The second vector + * @param[in] memspace String containg memspace (cpu or cuda) + * + * @return dot product (real number) of _x_ and _y_ + */ + + real_type VectorHandlerCuda::dot(vector::Vector* x, vector::Vector* y) + { + LinAlgWorkspaceCUDA* workspaceCUDA = workspace_; + cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); + double nrm = 0.0; + cublasStatus_t st= cublasDdot (handle_cublas, x->getSize(), x->getData("cuda"), 1, y->getData("cuda"), 1, &nrm); + if (st!=0) {printf("dot product crashed with code %d \n", st);} + return nrm; + } + + /** + * @brief scale a vector by a constant i.e, x = alpha*x where alpha is a constant + * + * @param[in] alpha The constant + * @param[in,out] x The vector + * @param memspace string containg memspace (cpu or cuda) + * + */ + void VectorHandlerCuda::scal(const real_type* alpha, vector::Vector* x) + { + LinAlgWorkspaceCUDA* workspaceCUDA = workspace_; + cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); + cublasStatus_t st = cublasDscal(handle_cublas, x->getSize(), alpha, x->getData("cuda"), 1); + if (st!=0) { + ReSolve::io::Logger::error() << "scal crashed with code " << st << "\n"; + } + } + + /** + * @brief axpy i.e, y = alpha*x+y where alpha is a constant + * + * @param[in] alpha The constant + * @param[in] x The 
first vector + * @param[in,out] y The second vector (result is return in y) + * @param[in] memspace String containg memspace (cpu or cuda) + * + */ + void VectorHandlerCuda::axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y) + { + //AXPY: y = alpha * x + y + LinAlgWorkspaceCUDA* workspaceCUDA = workspace_; + cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); + cublasDaxpy(handle_cublas, + x->getSize(), + alpha, + x->getData("cuda"), + 1, + y->getData("cuda"), + 1); + } + + /** + * @brief gemv computes matrix-vector product where both matrix and vectors are dense. + * i.e., x = beta*x + alpha*V*y + * + * @param[in] Transpose - yes (T) or no (N) + * @param[in] n Number of rows in (non-transposed) matrix + * @param[in] k Number of columns in (non-transposed) + * @param[in] alpha Constant real number + * @param[in] beta Constant real number + * @param[in] V Multivector containing the matrix, organized columnwise + * @param[in] y Vector, k x 1 if N and n x 1 if T + * @param[in,out] x Vector, n x 1 if N and k x 1 if T + * @param[in] memspace cpu or cuda (for now) + * + * @pre V is stored colum-wise, _n_ > 0, _k_ > 0 + * + */ + void VectorHandlerCuda::gemv(std::string transpose, + index_type n, + index_type k, + const real_type* alpha, + const real_type* beta, + vector::Vector* V, + vector::Vector* y, + vector::Vector* x) + { + LinAlgWorkspaceCUDA* workspaceCUDA = workspace_; + cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); + if (transpose == "T") { + + cublasDgemv(handle_cublas, + CUBLAS_OP_T, + n, + k, + alpha, + V->getData("cuda"), + n, + y->getData("cuda"), + 1, + beta, + x->getData("cuda"), + 1); + + } else { + cublasDgemv(handle_cublas, + CUBLAS_OP_N, + n, + k, + alpha, + V->getData("cuda"), + n, + y->getData("cuda"), + 1, + beta, + x->getData("cuda"), + 1); + } + } + + /** + * @brief mass (bulk) axpy i.e, y = y - x*alpha where alpha is a vector + * + * @param[in] size number of elements in y + * @param[in] alpha vector size k x 1 + * @param[in] x (multi)vector size size x k + * @param[in,out] y vector size size x 1 (this is where the result is stored) + * @param[in] memspace string containg memspace (cpu or cuda) + * + * @pre _k_ > 0, _size_ > 0, _size_ = x->getSize() + * + */ + void VectorHandlerCuda::massAxpy(index_type size, vector::Vector* alpha, index_type k, vector::Vector* x, vector::Vector* y) + { + using namespace constants; + if (k < 200) { + mass_axpy(size, k, x->getData("cuda"), y->getData("cuda"),alpha->getData("cuda")); + } else { + LinAlgWorkspaceCUDA* workspaceCUDA = workspace_; + cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); + cublasDgemm(handle_cublas, + CUBLAS_OP_N, + CUBLAS_OP_N, + size, // m + 1, // n + k + 1, // k + &MINUSONE, // alpha + x->getData("cuda"), // A + size, // lda + alpha->getData("cuda"), // B + k + 1, // ldb + &ONE, + y->getData("cuda"), // c + size); // ldc + } + } + + /** + * @brief mass (bulk) dot product i.e, V^T x, where V is n x k dense multivector + * (a dense multivector consisting of k vectors size n) and x is k x 2 dense + * multivector (a multivector consisiting of two vectors size n each) + * + * @param[in] size Number of elements in a single vector in V + * @param[in] V Multivector; k vectors size n x 1 each + * @param[in] k Number of vectors in V + * @param[in] x Multivector; 2 vectors size n x 1 each + * @param[out] res Multivector; 2 vectors size k x 1 each (result is returned in res) + * @param[in] memspace String containg memspace (cpu or cuda) + * + * @pre _size_ > 0, 
_k_ > 0, size = x->getSize(), _res_ needs to be allocated + * + */ + void VectorHandlerCuda::massDot2Vec(index_type size, vector::Vector* V, index_type k, vector::Vector* x, vector::Vector* res) + { + using namespace constants; + + if (k < 200) { + mass_inner_product_two_vectors(size, k, x->getData("cuda") , x->getData(1, "cuda"), V->getData("cuda"), res->getData("cuda")); + } else { + LinAlgWorkspaceCUDA* workspaceCUDA = workspace_; + cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); + cublasDgemm(handle_cublas, + CUBLAS_OP_T, + CUBLAS_OP_N, + k + 1, //m + 2, //n + size, //k + &ONE, //alpha + V->getData("cuda"), //A + size, //lda + x->getData("cuda"), //B + size, //ldb + &ZERO, + res->getData("cuda"), //c + k + 1); //ldc + } + } + +} // namespace ReSolve diff --git a/resolve/vector/VectorHandlerCuda.hpp b/resolve/vector/VectorHandlerCuda.hpp new file mode 100644 index 00000000..0ee2752d --- /dev/null +++ b/resolve/vector/VectorHandlerCuda.hpp @@ -0,0 +1,57 @@ +#pragma once +#include + +namespace ReSolve +{ + namespace vector + { + class Vector; + } + class LinAlgWorkspaceCUDA; + class VectorHandlerImpl; +} + + +namespace ReSolve { //namespace vector { + class VectorHandlerCuda : public VectorHandlerImpl + { + public: + VectorHandlerCuda(); + VectorHandlerCuda(LinAlgWorkspaceCUDA* workspace); + virtual ~VectorHandlerCuda(); + + //y = alpha x + y + virtual void axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y); + + //dot: x \cdot y + virtual real_type dot(vector::Vector* x, vector::Vector* y); + + //scal = alpha * x + virtual void scal(const real_type* alpha, vector::Vector* x); + + //mass axpy: x*alpha + y where x is [n x k] and alpha is [k x 1]; x is stored columnwise + virtual void massAxpy(index_type size, vector::Vector* alpha, index_type k, vector::Vector* x, vector::Vector* y); + + //mass dot: V^T x, where V is [n x k] and x is [k x 2], everything is stored and returned columnwise + //Size = n + virtual void massDot2Vec(index_type size, vector::Vector* V, index_type k, vector::Vector* x, vector::Vector* res); + + /** gemv: + * if `transpose = N` (no), `x = beta*x + alpha*V*y`, + * where `x` is `[n x 1]`, `V` is `[n x k]` and `y` is `[k x 1]`. + * if `transpose = T` (yes), `x = beta*x + alpha*V^T*y`, + * where `x` is `[k x 1]`, `V` is `[n x k]` and `y` is `[n x 1]`. 
+ */ + virtual void gemv(std::string transpose, + index_type n, + index_type k, + const real_type* alpha, + const real_type* beta, + vector::Vector* V, + vector::Vector* y, + vector::Vector* x); + private: + LinAlgWorkspaceCUDA* workspace_; + }; + +} //} // namespace ReSolve::vector diff --git a/resolve/vector/VectorHandlerImpl.hpp b/resolve/vector/VectorHandlerImpl.hpp new file mode 100644 index 00000000..229a7461 --- /dev/null +++ b/resolve/vector/VectorHandlerImpl.hpp @@ -0,0 +1,57 @@ +#pragma once +#include + +namespace ReSolve +{ + namespace vector + { + class Vector; + } + class VectorHandlerCpu; + class VectorHandlerCuda; +} + + +namespace ReSolve +{ + class VectorHandlerImpl + { + public: + VectorHandlerImpl() + {} + virtual ~VectorHandlerImpl() + {} + + //y = alpha x + y + virtual void axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y ) = 0; + + //dot: x \cdot y + virtual real_type dot(vector::Vector* x, vector::Vector* y ) = 0; + + //scal = alpha * x + virtual void scal(const real_type* alpha, vector::Vector* x) = 0; + + //mass axpy: x*alpha + y where x is [n x k] and alpha is [k x 1]; x is stored columnwise + virtual void massAxpy(index_type size, vector::Vector* alpha, index_type k, vector::Vector* x, vector::Vector* y) = 0; + + //mass dot: V^T x, where V is [n x k] and x is [k x 2], everything is stored and returned columnwise + //Size = n + virtual void massDot2Vec(index_type size, vector::Vector* V, index_type k, vector::Vector* x, vector::Vector* res) = 0; + + /** gemv: + * if `transpose = N` (no), `x = beta*x + alpha*V*y`, + * where `x` is `[n x 1]`, `V` is `[n x k]` and `y` is `[k x 1]`. + * if `transpose = T` (yes), `x = beta*x + alpha*V^T*y`, + * where `x` is `[k x 1]`, `V` is `[n x k]` and `y` is `[n x 1]`. + */ + virtual void gemv(std::string transpose, + index_type n, + index_type k, + const real_type* alpha, + const real_type* beta, + vector::Vector* V, + vector::Vector* y, + vector::Vector* x) = 0; + }; + +} //} // namespace ReSolve::vector diff --git a/tests/unit/vector/GramSchmidtTests.hpp b/tests/unit/vector/GramSchmidtTests.hpp index 2d801c49..eddd800e 100644 --- a/tests/unit/vector/GramSchmidtTests.hpp +++ b/tests/unit/vector/GramSchmidtTests.hpp @@ -66,8 +66,7 @@ namespace ReSolve { break; } - ReSolve::LinAlgWorkspace* workspace = createLinAlgWorkspace(memspace_); - ReSolve::VectorHandler* handler = new ReSolve::VectorHandler(workspace); + ReSolve::VectorHandler* handler = createVectorHandler(); vector::Vector* V = new vector::Vector(N, 3); // we will be using a space of 3 vectors real_type* H = new real_type[6]; //in this case, Hessenberg matrix is 3 x 2 @@ -114,7 +113,7 @@ namespace ReSolve { status *= verifyAnswer(V, 3, handler, memspace_); - delete workspace; + delete handler; delete [] H; delete V; delete GS; @@ -125,6 +124,21 @@ namespace ReSolve { private: std::string memspace_{"cuda"}; + ReSolve::VectorHandler* createVectorHandler() + { + if (memspace_ == "cpu") { // TODO: Fix memory leak here + LinAlgWorkspace* workspace = new LinAlgWorkspace(); + return new VectorHandler(workspace); + } else if (memspace_ == "cuda") { + LinAlgWorkspaceCUDA* workspace = new LinAlgWorkspaceCUDA(); + workspace->initializeHandles(); + return new VectorHandler(workspace); + } else { + std::cout << "Invalid memory space " << memspace_ << "\n"; + } + return nullptr; + } + + // x is a multivector containing K vectors bool verifyAnswer(vector::Vector* x, index_type K, ReSolve::VectorHandler* handler, std::string memspace) { diff --git 
a/tests/unit/vector/VectorHandlerTests.hpp b/tests/unit/vector/VectorHandlerTests.hpp index ad0d3cfb..cc84f25b 100644 --- a/tests/unit/vector/VectorHandlerTests.hpp +++ b/tests/unit/vector/VectorHandlerTests.hpp @@ -38,8 +38,7 @@ namespace ReSolve { { TestStatus status; - ReSolve::LinAlgWorkspace* workspace = createLinAlgWorkspace(memspace_); - ReSolve::VectorHandler handler(workspace); + ReSolve::VectorHandler* handler = createVectorHandler(); vector::Vector* x = new vector::Vector(N); vector::Vector* y = new vector::Vector(N); @@ -52,10 +51,10 @@ namespace ReSolve { real_type alpha = 0.5; //the result is a vector with y[i] = 2.5; - handler.axpy(&alpha, x, y, memspace_); + handler->axpy(&alpha, x, y, memspace_); status *= verifyAnswer(y, 2.5, memspace_); - delete workspace; + delete handler; delete x; delete y; @@ -66,8 +65,7 @@ namespace ReSolve { { TestStatus status; - ReSolve::LinAlgWorkspace* workspace = createLinAlgWorkspace(memspace_); - ReSolve::VectorHandler handler(workspace); + ReSolve::VectorHandler* handler = createVectorHandler(); vector::Vector* x = new vector::Vector(N); vector::Vector* y = new vector::Vector(N); @@ -79,7 +77,7 @@ namespace ReSolve { y->setToConst(4.0, memspace_); real_type ans; //the result is N - ans = handler.dot(x, y, memspace_); + ans = handler->dot(x, y, memspace_); bool st = true;; if (ans != (real_type) N) { @@ -88,7 +86,7 @@ namespace ReSolve { } status *= st; - delete workspace; + delete handler; delete x; delete y; @@ -99,8 +97,7 @@ namespace ReSolve { { TestStatus status; - ReSolve::LinAlgWorkspace* workspace = createLinAlgWorkspace(memspace_); - ReSolve::VectorHandler handler(workspace); + ReSolve::VectorHandler* handler = createVectorHandler(); vector::Vector* x = new vector::Vector(N); @@ -111,10 +108,10 @@ namespace ReSolve { real_type alpha = 3.5; //the answer is x[i] = 4.375; - handler.scal(&alpha, x, memspace_); + handler->scal(&alpha, x, memspace_); status *= verifyAnswer(x, 4.375, memspace_); - delete workspace; + delete handler; delete x; return status.report(__func__); @@ -124,8 +121,7 @@ namespace ReSolve { { TestStatus status; - ReSolve::LinAlgWorkspace* workspace = createLinAlgWorkspace(memspace_); - ReSolve::VectorHandler handler(workspace); + ReSolve::VectorHandler* handler = createVectorHandler(); vector::Vector* x = new vector::Vector(N, K); vector::Vector* y = new vector::Vector(N); @@ -148,11 +144,11 @@ namespace ReSolve { index_type r = K % 2; real_type res = (real_type) ((floor((real_type) K / 2.0) + r) * 1.0 + floor((real_type) K / 2.0) * (-0.5)); - handler.massAxpy(N, alpha, K, x, y, memspace_); + handler->massAxpy(N, alpha, K, x, y, memspace_); status *= verifyAnswer(y, 2.0 - res, memspace_); - delete workspace; + delete handler; delete x; delete y; delete alpha; @@ -164,8 +160,7 @@ namespace ReSolve { { TestStatus status; - ReSolve::LinAlgWorkspace* workspace = createLinAlgWorkspace(memspace_); - ReSolve::VectorHandler handler(workspace); + ReSolve::VectorHandler* handler = createVectorHandler(); vector::Vector* x = new vector::Vector(N, K); vector::Vector* y = new vector::Vector(N, 2); @@ -176,11 +171,11 @@ namespace ReSolve { x->setToConst(1.0, memspace_); y->setToConst(-1.0, memspace_); - handler.massDot2Vec(N, x, K, y, res, memspace_); + handler->massDot2Vec(N, x, K, y, res, memspace_); status *= verifyAnswer(res, (-1.0) * (real_type) N, memspace_); - delete workspace; + delete handler; delete x; delete y; delete res; @@ -190,8 +185,8 @@ namespace ReSolve { TestOutcome gemv(index_type N, index_type K) { TestStatus 
status; - ReSolve::LinAlgWorkspace* workspace = createLinAlgWorkspace(memspace_); - ReSolve::VectorHandler handler(workspace); + // ReSolve::LinAlgWorkspace* workspace = createLinAlgWorkspace(memspace_); + ReSolve::VectorHandler* handler = createVectorHandler(); vector::Vector* V = new vector::Vector(N, K); // for the test with NO TRANSPOSE vector::Vector* yN = new vector::Vector(K); @@ -214,9 +209,9 @@ namespace ReSolve { real_type alpha = -1.0; real_type beta = 1.0; - handler.gemv("N", N, K, &alpha, &beta, V, yN, xN, memspace_); + handler->gemv("N", N, K, &alpha, &beta, V, yN, xN, memspace_); status *= verifyAnswer(xN, (real_type) (K) + 0.5, memspace_); - handler.gemv("T", N, K, &alpha, &beta, V, yT, xT, memspace_); + handler->gemv("T", N, K, &alpha, &beta, V, yT, xT, memspace_); status *= verifyAnswer(xT, (real_type) (N) + 0.5, memspace_); return status.report(__func__); @@ -225,7 +220,22 @@ namespace ReSolve { private: std::string memspace_{"cpu"}; - // we can verify through norm but that would defeat the purpose of testing vector handler... + ReSolve::VectorHandler* createVectorHandler() + { + if (memspace_ == "cpu") { + LinAlgWorkspace* workspace = new LinAlgWorkspace(); + return new VectorHandler(workspace); + } else if (memspace_ == "cuda") { + LinAlgWorkspaceCUDA* workspace = new LinAlgWorkspaceCUDA(); + workspace->initializeHandles(); + return new VectorHandler(workspace); + } else { + std::cout << "Invalid memory space " << memspace_ << "\n"; + } + return nullptr; + } + + // we can verify through norm but that would defeat the purpose of testing vector handler ... bool verifyAnswer(vector::Vector* x, real_type answer, std::string memspace) { bool status = true;