From fc7c3b6455f57909998222a3158625d04448db96 Mon Sep 17 00:00:00 2001 From: David Beckingsale Date: Fri, 18 May 2018 16:16:32 -0700 Subject: [PATCH 01/58] Add RAJA plugin --- CMakeLists.txt | 1 + cmake/thirdparty/SetupChaiThirdparty.cmake | 11 + src/ArrayManager.hpp | 2 + src/CMakeLists.txt | 15 + src/RajaExecutionSpacePlugin.cpp | 89 +++++ src/RajaExecutionSpacePlugin.hpp | 70 ++++ src/pluginLinker.hpp | 56 +++ src/tests/CMakeLists.txt | 4 + src/tests/integration/CMakeLists.txt | 31 ++ src/tests/integration/chai-nested.cpp | 420 +++++++++++++++++++++ src/tests/integration/raja-chai-tests.cpp | 128 +++++++ 11 files changed, 827 insertions(+) create mode 100644 src/RajaExecutionSpacePlugin.cpp create mode 100644 src/RajaExecutionSpacePlugin.hpp create mode 100644 src/pluginLinker.hpp create mode 100644 src/tests/integration/CMakeLists.txt create mode 100644 src/tests/integration/chai-nested.cpp create mode 100644 src/tests/integration/raja-chai-tests.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 9b97c911..f0b13b51 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -52,6 +52,7 @@ option(ENABLE_IMPLICIT_CONVERSIONS "Enable implicit conversions to-from raw poin option(DISABLE_RM "Make ManagedArray a thin wrapper" Off) mark_as_advanced(DISABLE_RM) option(ENABLE_UM "Use CUDA unified (managed) memory" Off) +option(ENABLE_RAJA_PLUGIN "Build plugin to set RAJA execution spaces" On) if (ENABLE_UM AND NOT ENABLE_CUDA) message(FATAL_ERROR "Option ENABLE_UM requires ENABLE_CUDA") diff --git a/cmake/thirdparty/SetupChaiThirdparty.cmake b/cmake/thirdparty/SetupChaiThirdparty.cmake index 20f12b7e..030d1249 100644 --- a/cmake/thirdparty/SetupChaiThirdparty.cmake +++ b/cmake/thirdparty/SetupChaiThirdparty.cmake @@ -47,3 +47,14 @@ blt_register_library( NAME umpire INCLUDES ${UMPIRE_INCLUDE_DIRS} LIBRARIES umpire) + +if (ENABLE_RAJA_PLUGIN) + find_package(RAJA REQUIRED) + + blt_register_library( + NAME raja + INCLUDES ${RAJA_INCLUDE_DIR} + LIBRARIES RAJA) + + message(STATUS "RAJA: ${RAJA_INCLUDE_DIR}") +endif () diff --git a/src/ArrayManager.hpp b/src/ArrayManager.hpp index 3ddfc369..4088b17d 100644 --- a/src/ArrayManager.hpp +++ b/src/ArrayManager.hpp @@ -47,6 +47,8 @@ #include "chai/PointerRecord.hpp" #include "chai/Types.hpp" +#include "chai/pluginLinker.hpp" + #include #include "umpire/Allocator.hpp" diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index f4837f45..6c1a60bc 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -54,6 +54,7 @@ set (chai_headers ArrayManager.hpp ArrayManager.inl ChaiMacros.hpp + pluginLinker.hpp ExecutionSpaces.hpp ManagedArray.hpp ManagedArray.inl @@ -78,6 +79,20 @@ if (ENABLE_CUDA) cuda_runtime) endif () +if (ENABLE_RAJA_PLUGIN) + set (chai_headers + ${chai_headers} + RajaExecutionSpacePlugin.hpp) + + set (chai_sources + ${chai_sources} + RajaExecutionSpacePlugin.cpp) + + set (chai_depends + ${chai_depends} + raja) +endif () + blt_add_library( NAME chai SOURCES ${chai_sources} diff --git a/src/RajaExecutionSpacePlugin.cpp b/src/RajaExecutionSpacePlugin.cpp new file mode 100644 index 00000000..e46279da --- /dev/null +++ b/src/RajaExecutionSpacePlugin.cpp @@ -0,0 +1,89 @@ +// --------------------------------------------------------------------- +// Copyright (c) 2016, Lawrence Livermore National Security, LLC. All +// rights reserved. +// +// Produced at the Lawrence Livermore National Laboratory. +// +// This file is part of CHAI. 
+// +// LLNL-CODE-705877 +// +// For details, see https:://github.com/LLNL/CHAI +// Please also see the NOTICE and LICENSE files. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// - Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the +// distribution. +// +// - Neither the name of the LLNS/LLNL nor the names of its contributors +// may be used to endorse or promote products derived from this +// software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS +// OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED +// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY +// WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// --------------------------------------------------------------------- +#include "chai/config.hpp" + +#include "chai/RajaExecutionSpacePlugin.hpp" + +#include "chai/ArrayManager.hpp" + +namespace chai { + +RajaExecutionSpacePlugin::RajaExecutionSpacePlugin() : + m_arraymanager(chai::ArrayManager::getInstance()) +{ +} + +void +RajaExecutionSpacePlugin::preLaunch(RAJA::util::PluginContext p) +{ + switch (p.platform) { + case RAJA::Platform::host: + m_arraymanager->setExecutionSpace(chai::CPU); break; +#if defined(CHAI_ENABLE_CUDA) + case RAJA::Platform::cuda: + m_arraymanager->setExecutionSpace(chai::GPU); break; +#endif + default: + m_arraymanager->setExecutionSpace(chai::NONE); + } + +} + +void +RajaExecutionSpacePlugin::postLaunch(RAJA::util::PluginContext) +{ + m_arraymanager->setExecutionSpace(chai::NONE); +} + +} + +// Register plugin with RAJA +RAJA::util::PluginRegistry::Add P( + "RajaExecutionSpacePlugin", + "Plugin to set CHAI execution space based on RAJA execution platform"); + +namespace chai { + + void linkRajaPlugin() {} + +} diff --git a/src/RajaExecutionSpacePlugin.hpp b/src/RajaExecutionSpacePlugin.hpp new file mode 100644 index 00000000..dfbba300 --- /dev/null +++ b/src/RajaExecutionSpacePlugin.hpp @@ -0,0 +1,70 @@ +// --------------------------------------------------------------------- +// Copyright (c) 2016, Lawrence Livermore National Security, LLC. All +// rights reserved. +// +// Produced at the Lawrence Livermore National Laboratory. +// +// This file is part of CHAI. +// +// LLNL-CODE-705877 +// +// For details, see https:://github.com/LLNL/CHAI +// Please also see the NOTICE and LICENSE files. 
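The RajaExecutionSpacePlugin above sets CHAI's execution space in preLaunch and clears it in postLaunch, so a kernel that captures a chai::ManagedArray gets its data moved without any manual ArrayManager calls. A minimal sketch of the intended usage, assuming CHAI and RAJA are built with ENABLE_RAJA_PLUGIN=On (the array size and policy are only illustrative):

    #include "chai/ManagedArray.hpp"
    #include "RAJA/RAJA.hpp"

    int main()
    {
      chai::ManagedArray<float> data(10);

      // preLaunch() runs before the loop body: RAJA reports Platform::host
      // for a sequential policy, so the plugin selects chai::CPU and the
      // array is made available there.
      RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, 10), [=](int i) {
        data[i] = static_cast<float>(i);
      });
      // postLaunch() resets the space to chai::NONE.

      data.free();
      return 0;
    }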
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// - Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the +// distribution. +// +// - Neither the name of the LLNS/LLNL nor the names of its contributors +// may be used to endorse or promote products derived from this +// software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS +// OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED +// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY +// WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// --------------------------------------------------------------------- +#ifndef CHAI_RajaExecutionSpacePlugin_HPP +#define CHAI_RajaExecutionSpacePlugin_HPP + +#include "RAJA/util/PluginStrategy.hpp" + +namespace chai { + +class ArrayManager; + +class RajaExecutionSpacePlugin : + public RAJA::util::PluginStrategy +{ + public: + RajaExecutionSpacePlugin(); + + void preLaunch(RAJA::util::PluginContext p); + + void postLaunch(RAJA::util::PluginContext p); + + private: + chai::ArrayManager* m_arraymanager; +}; + +void linkRajaPlugin(); + +} + +#endif // CHAI_RajaExecutionSpacePlugin_HPP diff --git a/src/pluginLinker.hpp b/src/pluginLinker.hpp new file mode 100644 index 00000000..9e8484a2 --- /dev/null +++ b/src/pluginLinker.hpp @@ -0,0 +1,56 @@ +// --------------------------------------------------------------------- +// Copyright (c) 2016, Lawrence Livermore National Security, LLC. All +// rights reserved. +// +// Produced at the Lawrence Livermore National Laboratory. +// +// This file is part of CHAI. +// +// LLNL-CODE-705877 +// +// For details, see https:://github.com/LLNL/CHAI +// Please also see the NOTICE and LICENSE files. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// - Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the +// distribution. +// +// - Neither the name of the LLNS/LLNL nor the names of its contributors +// may be used to endorse or promote products derived from this +// software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS +// OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED +// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY +// WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// --------------------------------------------------------------------- +#ifndef CHAI_pluginLinker_HPP +#define CHAI_pluginLinker_HPP + +#include "chai/RajaExecutionSpacePlugin.hpp" + +namespace { + struct pluginLinker { + pluginLinker() { + (void) chai::linkRajaPlugin(); + } + } pluginLinker; +} + +#endif // CHAI_pluginLinker_HPP diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index 7f472780..5a575abb 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -62,3 +62,7 @@ target_include_directories( blt_add_test( NAME managed_array_test COMMAND managed_array_tests) + +if (ENABLE_RAJA_PLUGIN) + add_subdirectory(integration) +endif () diff --git a/src/tests/integration/CMakeLists.txt b/src/tests/integration/CMakeLists.txt new file mode 100644 index 00000000..dc34cbfb --- /dev/null +++ b/src/tests/integration/CMakeLists.txt @@ -0,0 +1,31 @@ +############################################################################### +# +# Copyright (c) 2016-18, Lawrence Livermore National Security, LLC. +# +# Produced at the Lawrence Livermore National Laboratory +# +# LLNL-CODE-689114 +# +# All rights reserved. +# +# This file is part of RAJA. +# +# For details about use and distribution, please read RAJA/LICENSE. +# +############################################################################### + +set (raja_test_depends + chai umpire raja openmp gtest) + +blt_add_executable( + NAME raja-chai-tests + SOURCES raja-chai-tests.cpp + DEPENDS_ON ${raja_test_depends}) + +blt_add_test( + NAME raja-chai-tests + COMMAND raja-chai-tests) + +target_include_directories( + raja-chai-tests + PUBLIC ${PROJECT_BINARY_DIR}/include) diff --git a/src/tests/integration/chai-nested.cpp b/src/tests/integration/chai-nested.cpp new file mode 100644 index 00000000..ff50af96 --- /dev/null +++ b/src/tests/integration/chai-nested.cpp @@ -0,0 +1,420 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-18, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-689114 +// +// All rights reserved. +// +// This file is part of RAJA. +// +// For details about use and distribution, please read RAJA/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Source file containing tests for CHAI in RAJA nested loops. 
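The pluginLinker.hpp header above (now pulled in by chai/ArrayManager.hpp) uses a force-link idiom: every translation unit that includes it creates a static object whose constructor references chai::linkRajaPlugin(), so the plugin's object file, and with it the static PluginRegistry::Add registrar, is not dropped by the linker when CHAI is built as a static library. A self-contained sketch of the same idiom; Registry, Add, and the names below are hypothetical stand-ins, not RAJA or CHAI APIs:

    #include <iostream>
    #include <string>
    #include <vector>

    // Hypothetical registry standing in for RAJA's plugin registry.
    struct Registry {
      static std::vector<std::string>& plugins() {
        static std::vector<std::string> p;
        return p;
      }
    };

    // "Plugin" side: a static object performs the registration at startup...
    struct Add {
      explicit Add(const std::string& name) { Registry::plugins().push_back(name); }
    };
    static Add registrar("ExamplePlugin");

    // ...and an empty, externally visible function gives other translation
    // units something to reference.
    void linkExamplePlugin() {}

    // "pluginLinker.hpp" side: a static object in an anonymous namespace
    // forces a reference to linkExamplePlugin() from every includer.
    namespace {
    struct exampleLinker {
      exampleLinker() { (void)linkExamplePlugin(); }
    } exampleLinker;
    }

    int main()
    {
      for (const auto& name : Registry::plugins()) {
        std::cout << name << "\n";  // prints "ExamplePlugin"
      }
      return 0;
    }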
+/// +/// +#include +#include +#include + +#include +#include +#include + +#include "RAJA/RAJA.hpp" +#include "RAJA_gtest.hpp" + +using namespace RAJA; +using namespace std; + +#include "chai/ArrayManager.hpp" +#include "chai/ManagedArray.hpp" + +/* + * Simple tests using forallN and View + */ +CUDA_TEST(Chai, NestedSimpleOld) { + typedef RAJA::NestedPolicy< RAJA::ExecList< RAJA::seq_exec, RAJA::seq_exec> > POLICY; + typedef RAJA::NestedPolicy< RAJA::ExecList< RAJA::seq_exec, RAJA::cuda_thread_y_exec > > POLICY_GPU; + + const int X = 16; + const int Y = 16; + + chai::ManagedArray v1(X*Y); + chai::ManagedArray v2(X*Y); + + RAJA::forallN(RangeSegment(0,Y), RangeSegment(0,X), [=] (int i, int j) { + int index = j*X + i; + v1[index] = index; + }); + + RAJA::forallN(RangeSegment(0,Y), RangeSegment(0,X), [=] __device__ (int i, int j) { + int index = j*X + i; + v2[index] = v1[index]*2.0f; + }); + cudaDeviceSynchronize(); + + RAJA::forallN(RangeSegment(0,Y), RangeSegment(0,X), [=] (int i, int j) { + int index = j*X + i; + ASSERT_FLOAT_EQ(v1[index], index*1.0f); + ASSERT_FLOAT_EQ(v2[index], index*2.0f); + }); +} + + +/* + * Simple tests using nested::forall and View + */ +CUDA_TEST(Chai, NestedSimple) { + typedef RAJA::KernelPolicy< + RAJA::statement::For<0, RAJA::seq_exec, + RAJA::statement::For<1, RAJA::seq_exec> > > POLICY; + typedef RAJA::KernelPolicy< + RAJA::statement::For<0, RAJA::seq_exec, + RAJA::statement::CudaKernel< + RAJA::statement::For<1, RAJA::cuda_threadblock_exec<32> > > > >POLICY_GPU; + + const int X = 16; + const int Y = 16; + + chai::ManagedArray v1(X*Y); + chai::ManagedArray v2(X*Y); + + RAJA::kernel( + + RAJA::make_tuple(RAJA::RangeSegment(0,Y), RAJA::RangeSegment(0,X) ), + + [=] (int i, int j) { + int index = j*X + i; + v1[index] = index; + }); + + RAJA::kernel( + + RAJA::make_tuple(RangeSegment(0,Y), RangeSegment(0,X) ), + + [=] __host__ __device__ (int i, int j) { + int index = j*X + i; + v2[index] = v1[index]*2.0f; + }); + + cudaDeviceSynchronize(); + + RAJA::kernel( + + RAJA::make_tuple(RAJA::RangeSegment(0,Y), RAJA::RangeSegment(0,X) ), + + [=] (int i, int j) { + int index = j*X + i; + ASSERT_FLOAT_EQ(v1[index], index*1.0f); + ASSERT_FLOAT_EQ(v2[index], index*2.0f); + }); +} + +CUDA_TEST(Chai, NestedView) { + typedef RAJA::NestedPolicy< RAJA::ExecList< RAJA::seq_exec, RAJA::seq_exec> > POLICY; + typedef RAJA::NestedPolicy< RAJA::ExecList< RAJA::seq_exec, RAJA::cuda_thread_y_exec > > POLICY_GPU; + + const int X = 16; + const int Y = 16; + + chai::ManagedArray v1_array(X*Y); + chai::ManagedArray v2_array(X*Y); + + typedef RAJA::ManagedArrayView > view; + + view v1(v1_array, X, Y); + view v2(v2_array, X, Y); + + RAJA::forallN(RangeSegment(0,Y), RangeSegment(0,X), [=] (int i, int j) { + v1(i,j) = (i+(j*X)) * 1.0f; + }); + + RAJA::forallN(RangeSegment(0,Y), RangeSegment(0,X), [=] __device__ (int i, int j) { + v2(i,j) = v1(i,j)*2.0f; + }); + + RAJA::forallN(RangeSegment(0,Y), RangeSegment(0,X), [=] (int i, int j) { + ASSERT_FLOAT_EQ(v2(i,j), v1(i,j)*2.0f); + }); +} + +CUDA_TEST(Chai, NestedView2) { + typedef RAJA::NestedPolicy< RAJA::ExecList< RAJA::seq_exec, RAJA::seq_exec> > POLICY; + +#if defined (RAJA_ENABLE_OPENMP) + typedef RAJA::NestedPolicy< RAJA::ExecList< RAJA::omp_for_nowait_exec, RAJA::cuda_thread_x_exec >, RAJA::OMP_Parallel<> > POLICY_GPU; +#else + typedef RAJA::NestedPolicy< RAJA::ExecList< RAJA::seq_exec, RAJA::cuda_thread_x_exec > > POLICY_GPU; +#endif + + const int X = 16; + const int Y = 16; + + chai::ManagedArray v1_array(X*Y); + chai::ManagedArray 
v2_array(X*Y); + + typedef RAJA::ManagedArrayView > view; + + view v1(v1_array, X, Y); + view v2(v2_array, X, Y); + + RAJA::forallN(RangeSegment(0,Y), RangeSegment(0,X), [=] (int i, int j) { + v1(i,j) = (i+(j*X)) * 1.0f; + }); + + RAJA::forallN(RangeSegment(0,Y), RangeSegment(0,X), [=] __device__ (int i, int j) { + v2(i,j) = v1(i,j)*2.0f; + }); + + RAJA::forallN(RangeSegment(0,Y), RangeSegment(0,X), [=] (int i, int j) { + ASSERT_FLOAT_EQ(v2(i,j), v1(i,j)*2.0f); + }); +} + +/////////////////////////////////////////////////////////////////////////// +// +// Example LTimes kernel test routines +// +// Demonstrates a 4-nested loop, the use of complex nested policies and +// the use of strongly-typed indices +// +// This routine computes phi(m, g, z) = SUM_d { ell(m, d)*psi(d,g,z) } +// +/////////////////////////////////////////////////////////////////////////// +typedef struct { + double val; + int idx; +} minmaxloc_t; + +// block_size is needed by the reduction variables to setup shared memory +// Care should be used here to cover the maximum block dimensions used by this +// test +const size_t block_size = 256; + +RAJA_INDEX_VALUE(IMoment, "IMoment"); +RAJA_INDEX_VALUE(IDirection, "IDirection"); +RAJA_INDEX_VALUE(IGroup, "IGroup"); +RAJA_INDEX_VALUE(IZone, "IZone"); + +template +void runLTimesTest(std::string const &policy, + Index_type num_moments, + Index_type num_directions, + Index_type num_groups, + Index_type num_zones) +{ + // cout << "\n TestLTimes " << num_moments << " moments, " << num_directions + // << " directions, " << num_groups << " groups, and " << num_zones + // << " zones" + // << " with policy " << policy << endl; + + // allocate data + // phi is initialized to all zeros, the others are randomized + chai::ManagedArray ell_data(num_moments * num_directions); + chai::ManagedArray psi_data(num_directions * num_groups * num_zones); + //chai::ManagedArray phi_data(num_moments * num_groups * num_zones, 0.0); + chai::ManagedArray phi_data(num_moments * num_groups * num_zones); + + // setup CUDA Reduction variables to be exercised + ReduceSum, double> pdsum(0.0); + ReduceMin, double> pdmin(DBL_MAX); + ReduceMax, double> pdmax(-DBL_MAX); + ReduceMinLoc, double> pdminloc(DBL_MAX, -1); + ReduceMaxLoc, double> pdmaxloc(-DBL_MAX, -1); + + + // data setup using RAJA to ensure that chai is activated + RAJA::forall(0, (num_moments*num_directions), [=] (int i) { + ell_data[i] = drand48(); + }); + + RAJA::forall(0, (num_directions*num_groups*num_zones), [=] (int i) { + psi_data[i] = drand48(); + }); + + RAJA::forall(0, (num_moments*num_groups*num_zones), [=] (int i) { + phi_data[i] = 0.0; + }); + + typename POL::ELL_VIEW ell(ell_data, RAJA::make_permuted_layout({num_moments, num_directions}, RAJA::as_array::get())); + typename POL::PSI_VIEW psi(psi_data, RAJA::make_permuted_layout({num_directions, num_groups, num_zones}, RAJA::as_array::get())); + typename POL::PHI_VIEW phi(phi_data, RAJA::make_permuted_layout({num_moments, num_groups, num_zones}, RAJA::as_array::get())); + + using EXEC = typename POL::EXEC; + + // do calculation using RAJA + forallN( + RangeSegment(0, num_moments), + RangeSegment(0, num_directions), + RangeSegment(0, num_groups), + RangeSegment(0, num_zones), + [=] __device__(IMoment m, IDirection d, IGroup g, IZone z) { + double val = ell(m, d) * psi(d, g, z); + phi(m, g, z) += val; + pdsum += val; + pdmin.min(val); + pdmax.max(val); + + int index = *d + (*m * num_directions) + + (*g * num_directions * num_moments) + + (*z * num_directions * num_moments * num_groups); + 
+ pdminloc.minloc(val, index); + pdmaxloc.maxloc(val, index); + }); + + cudaDeviceSynchronize(); + + // Make sure data is copied to host for checking results. + chai::ArrayManager* rm = chai::ArrayManager::getInstance(); + rm->setExecutionSpace(chai::CPU); + // setup local Reduction variables as a crosscheck + double the_lsum = 0.0; + double the_lmin = DBL_MAX; + double the_lmax = -DBL_MAX; + + double* lsum = &the_lsum; + double* lmin = &the_lmin; + double* lmax = &the_lmax; + + forall(RangeSegment(0, num_zones), [=] (int z) { + for (IGroup g(0); g < num_groups; ++g) { + for (IMoment m(0); m < num_moments; ++m) { + double total = 0.0; + for (IDirection d(0); d < num_directions; ++d) { + double val = ell(m, d) * psi(d, g, IZone(z)); + total += val; + *lmin = RAJA_MIN(*lmin, val); + *lmax = RAJA_MAX(*lmax, val); + int index = *d + (*m * num_directions) + + (*g * num_directions * num_moments) + + (z * num_directions * num_moments * num_groups); + } + *lsum += total; + + // check answer with some reasonable tolerance + ASSERT_FLOAT_EQ(total, phi(m, g, IZone(z))); + } + } + }); + + rm->setExecutionSpace(chai::NONE); + + ASSERT_FLOAT_EQ(*lsum, pdsum.get()); + ASSERT_FLOAT_EQ(*lmin, pdmin.get()); + ASSERT_FLOAT_EQ(*lmax, pdmax.get()); +} + +// Use thread-block mappings +struct PolLTimesA_GPU { + // Loops: Moments, Directions, Groups, Zones + typedef NestedPolicy, + cuda_threadblock_y_exec<32>>> + EXEC; + + // psi[direction, group, zone] + typedef RAJA::TypedManagedArrayView, IDirection, IGroup, IZone> + PSI_VIEW; + + // phi[moment, group, zone] + typedef RAJA::TypedManagedArrayView, IMoment, IGroup, IZone> + PHI_VIEW; + + // ell[moment, direction] + typedef RAJA::TypedManagedArrayView, IMoment, IDirection> + ELL_VIEW; + + typedef RAJA::PERM_IJK PSI_PERM; + typedef RAJA::PERM_IJK PHI_PERM; + typedef RAJA::PERM_IJ ELL_PERM; +}; + +// Use thread and block mappings +struct PolLTimesB_GPU { + // Loops: Moments, Directions, Groups, Zones + typedef NestedPolicy, + Permute> + EXEC; + + // psi[direction, group, zone] + typedef RAJA::TypedManagedArrayView, IDirection, IGroup, IZone> + PSI_VIEW; + + // phi[moment, group, zone] + typedef RAJA::TypedManagedArrayView, IMoment, IGroup, IZone> + PHI_VIEW; + + // ell[moment, direction] + typedef RAJA::TypedManagedArrayView, IMoment, IDirection> + ELL_VIEW; + + typedef RAJA::PERM_IJK PSI_PERM; + typedef RAJA::PERM_IJK PHI_PERM; + typedef RAJA::PERM_IJ ELL_PERM; +}; + +// Combine OMP Parallel, omp nowait, and cuda thread-block launch +struct PolLTimesC_GPU { + // Loops: Moments, Directions, Groups, Zones +#if defined(RAJA_ENABLE_OPENMP) + typedef NestedPolicy>, + OMP_Parallel<>> + EXEC; +#else + typedef NestedPolicy> > + EXEC; +#endif + + // psi[direction, group, zone] + typedef RAJA::TypedManagedArrayView, IDirection, IGroup, IZone> + PSI_VIEW; + + // phi[moment, group, zone] + typedef RAJA::TypedManagedArrayView, IMoment, IGroup, IZone> + PHI_VIEW; + + // ell[moment, direction] + typedef RAJA::TypedManagedArrayView, IMoment, IDirection> + ELL_VIEW; + + typedef RAJA::PERM_IJK PSI_PERM; + typedef RAJA::PERM_IJK PHI_PERM; + typedef RAJA::PERM_IJ ELL_PERM; +}; + +void runLTimesTests(Index_type num_moments, + Index_type num_directions, + Index_type num_groups, + Index_type num_zones) +{ + runLTimesTest( + "PolLTimesA_GPU", num_moments, num_directions, num_groups, num_zones); + runLTimesTest( + "PolLTimesB_GPU", num_moments, num_directions, num_groups, num_zones); + runLTimesTest( + "PolLTimesC_GPU", num_moments, num_directions, num_groups, num_zones); +} + +TEST(Chai, 
Nested) { +// runLTimesTests(2, 0, 7, 3); + runLTimesTests(2, 3, 7, 3); + runLTimesTests(2, 3, 32, 4); + runLTimesTests(25, 96, 8, 32); + runLTimesTests(100, 15, 7, 13); +} diff --git a/src/tests/integration/raja-chai-tests.cpp b/src/tests/integration/raja-chai-tests.cpp new file mode 100644 index 00000000..aa3322ce --- /dev/null +++ b/src/tests/integration/raja-chai-tests.cpp @@ -0,0 +1,128 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-18, Lawrence Livermore National Security, LLC. +// +// Produced at the Lawrence Livermore National Laboratory +// +// LLNL-CODE-689114 +// +// All rights reserved. +// +// This file is part of RAJA. +// +// For details about use and distribution, please read RAJA/LICENSE. +// +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Source file containing tests for CHAI with basic RAJA constructs +/// + +#include "gtest/gtest.h" + +#include "chai/ManagedArray.hpp" +#include "chai/RajaExecutionSpacePlugin.hpp" + +#define CUDA_TEST(X, Y) \ +static void cuda_test_ ## X ## Y();\ +TEST(X,Y) { cuda_test_ ## X ## Y();}\ +static void cuda_test_ ## X ## Y() + +#include "RAJA/RAJA.hpp" + +// // Register plugin with RAJA +// static RAJA::util::PluginRegistry::Add P( +// "RajaExecutionSpacePlugin", + // "Plugin to set CHAI execution space based on RAJA execution platform"); + + +#include + +CUDA_TEST(ChaiTest, Simple) { + chai::ManagedArray v1(10); + chai::ManagedArray v2(10); + + RAJA::forall(RAJA::RangeSegment(0, 10), [=] (int i) { + v1[i] = static_cast(i * 1.0f); + }); + + std::cout << "end of loop 1" << std::endl; + + +#if defined(CHAI_ENABLE_CUDA) + RAJA::forall >(0, 10, [=] __device__ (int i) { + v2[i] = v1[i]*2.0f; + }); +#else + RAJA::forall(0, 10, [=] (int i) { + v2[i] = v1[i]*2.0f; + }); +#endif + + std::cout << "end of loop 2" << std::endl; + + RAJA::forall(0, 10, [=] (int i) { + ASSERT_FLOAT_EQ(v2[i], i*2.0f); + }); + + +#if defined(CHAI_ENABLE_CUDA) + RAJA::forall >(0, 10, [=] __device__ (int i) { + v2[i] *= 2.0f; + }); +#else + RAJA::forall(0, 10, [=] (int i) { + v2[i] *= 2.0f; + }); +#endif + + float * raw_v2 = v2; + for (int i = 0; i < 10; i++ ) { + ASSERT_FLOAT_EQ(raw_v2[i], i*2.0f*2.0f);; + } +} + +#if 0 +CUDA_TEST(ChaiTest, Views) { + chai::ManagedArray v1_array(10); + chai::ManagedArray v2_array(10); + + typedef RAJA::ManagedArrayView > view; + + view v1(v1_array, 10); + view v2(v2_array, 10); + + RAJA::forall(0, 10, [=] (int i) { + v1(i) = static_cast(i * 1.0f); + }); + +#if defined(CHAI_ENABLE_CUDA) + RAJA::forall >(0, 10, [=] __device__ (int i) { + v2(i) = v1(i)*2.0f; + }); +#else + RAJA::forall(0, 10, [=](int i) { + v2(i) = v1(i)*2.0f; + }); +#endif + + RAJA::forall(0, 10, [=] (int i) { + ASSERT_FLOAT_EQ(v2(i), i*2.0f); + }); + + +#if defined(CHAI_ENABLE_CUDA) + RAJA::forall >(0, 10, [=] __device__ (int i) { + v2(i) *= 2.0f; + }); +#else + RAJA::forall(0, 10, [=](int i) { + v2(i) *= 2.0f; + }); +#endif + + float * raw_v2 = v2.data; + for (int i = 0; i < 10; i++ ) { + ASSERT_FLOAT_EQ(raw_v2[i], i*1.0f*2.0f*2.0f);; + } +} +#endif From 7a9b3352e643a4b1a77419445cdccf53d70c19d0 Mon Sep 17 00:00:00 2001 From: David Beckingsale Date: Mon, 21 May 2018 14:08:16 -0700 Subject: [PATCH 02/58] Add ManagedArrayView to CHAI --- src/CMakeLists.txt | 2 ++ src/ManagedArrayView.hpp | 70 ++++++++++++++++++++++++++++++++++++++++ src/config.hpp.in | 1 + 3 files changed, 73 insertions(+) create mode 100644 src/ManagedArrayView.hpp diff --git a/src/CMakeLists.txt 
b/src/CMakeLists.txt index 6c1a60bc..af0593ac 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -45,6 +45,7 @@ set(CHAI_ENABLE_CUDA ${ENABLE_CUDA}) set(CHAI_ENABLE_IMPLICIT_CONVERSIONS ${ENABLE_IMPLICIT_CONVERSIONS}) set(CHAI_DISABLE_RM ${DISABLE_RM}) set(CHAI_ENABLE_UM ${ENABLE_UM}) +set(CHAI_ENABLE_RAJA_PLUGIN ${ENABLE_RAJA_PLUGIN}) configure_file( ${PROJECT_SOURCE_DIR}/src/config.hpp.in @@ -82,6 +83,7 @@ endif () if (ENABLE_RAJA_PLUGIN) set (chai_headers ${chai_headers} + ManagedArrayView.hpp RajaExecutionSpacePlugin.hpp) set (chai_sources diff --git a/src/ManagedArrayView.hpp b/src/ManagedArrayView.hpp new file mode 100644 index 00000000..1d7543b7 --- /dev/null +++ b/src/ManagedArrayView.hpp @@ -0,0 +1,70 @@ +// --------------------------------------------------------------------- +// Copyright (c) 2018, Lawrence Livermore National Security, LLC. All +// rights reserved. +// +// Produced at the Lawrence Livermore National Laboratory. +// +// This file is part of CHAI. +// +// LLNL-CODE-705877 +// +// For details, see https:://github.com/LLNL/CHAI +// Please also see the NOTICE and LICENSE files. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// - Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the +// distribution. +// +// - Neither the name of the LLNS/LLNL nor the names of its contributors +// may be used to endorse or promote products derived from this +// software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS +// OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED +// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY +// WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. 
+// --------------------------------------------------------------------- +#ifndef CHAI_ManagedArrayView_HPP +#define CHAI_ManagedArrayView_HPP + +#if defined(CHAI_ENABLE_RAJA_PLUGIN) + +#include "chai/config.hpp" +#include "chai/ManagedArray.hpp" + +#include "RAJA/util/View.hpp" + +namespace chai { + +template +using ManagedArrayView = + RAJA::View>; + + +template +using TypedManagedArrayView = RAJA::TypedViewBase, + LayoutType, + IndexTypes...>; + +} // end of namespace chai + +#endif // defined(CHAI_ENABLE_RAJA_PLUGIN) + +#endif // CHAI_ManagedArrayView_HPP diff --git a/src/config.hpp.in b/src/config.hpp.in index 75610721..22c3c63f 100644 --- a/src/config.hpp.in +++ b/src/config.hpp.in @@ -47,5 +47,6 @@ #cmakedefine CHAI_ENABLE_IMPLICIT_CONVERSIONS #cmakedefine CHAI_DISABLE_RM #cmakedefine CHAI_ENABLE_UM +#cmakedefine CHAI_ENABLE_RAJA_PLUGIN #endif // CHAI_config_HPP From 2bb71f9e1236e7a81438c571082774b5adc50bf0 Mon Sep 17 00:00:00 2001 From: David Beckingsale Date: Tue, 29 May 2018 10:15:52 -0700 Subject: [PATCH 03/58] Add ManagedArrayView --- src/ManagedArrayView.hpp | 4 ++-- src/tests/integration/chai-nested.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ManagedArrayView.hpp b/src/ManagedArrayView.hpp index 1d7543b7..afce47dd 100644 --- a/src/ManagedArrayView.hpp +++ b/src/ManagedArrayView.hpp @@ -43,11 +43,11 @@ #ifndef CHAI_ManagedArrayView_HPP #define CHAI_ManagedArrayView_HPP +#include "chai/config.hpp" + #if defined(CHAI_ENABLE_RAJA_PLUGIN) -#include "chai/config.hpp" #include "chai/ManagedArray.hpp" - #include "RAJA/util/View.hpp" namespace chai { diff --git a/src/tests/integration/chai-nested.cpp b/src/tests/integration/chai-nested.cpp index ff50af96..97790808 100644 --- a/src/tests/integration/chai-nested.cpp +++ b/src/tests/integration/chai-nested.cpp @@ -125,7 +125,7 @@ CUDA_TEST(Chai, NestedView) { chai::ManagedArray v1_array(X*Y); chai::ManagedArray v2_array(X*Y); - typedef RAJA::ManagedArrayView > view; + typedef chai::ManagedArrayView > view; view v1(v1_array, X, Y); view v2(v2_array, X, Y); From 754d6cecd726988c95149c686969c7d83532e63d Mon Sep 17 00:00:00 2001 From: David Beckingsale Date: Wed, 14 Aug 2019 12:18:56 -0700 Subject: [PATCH 04/58] Add check to guard examples --- CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a7baa16a..fefd839f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -111,7 +111,9 @@ if (ENABLE_BENCHMARKS) add_subdirectory(benchmarks) endif() -add_subdirectory(examples) +if (ENABLE_EXAMPLES) + add_subdirectory(examples) +endif () if (ENABLE_DOCUMENTATION) add_subdirectory(docs) From 36ec2131157b4491ff9e7315cb72ead056f8a451 Mon Sep 17 00:00:00 2001 From: David Beckingsale Date: Thu, 12 Sep 2019 11:13:47 -0700 Subject: [PATCH 05/58] More merging --- cmake/thirdparty/SetupChaiThirdparty.cmake | 1 + examples/CMakeLists.txt | 6 ++-- src/{ => chai}/ManagedArrayView.hpp | 0 src/{ => chai}/RajaExecutionSpacePlugin.cpp | 0 src/{ => chai}/RajaExecutionSpacePlugin.hpp | 0 src/{ => chai}/pluginLinker.hpp | 0 src/tests/integration/CMakeLists.txt | 31 ------------------- tests/integration/CMakeLists.txt | 16 ++++++++++ .../integration/chai-nested.cpp | 0 .../integration/raja-chai-tests.cpp | 0 10 files changed, 20 insertions(+), 34 deletions(-) rename src/{ => chai}/ManagedArrayView.hpp (100%) rename src/{ => chai}/RajaExecutionSpacePlugin.cpp (100%) rename src/{ => chai}/RajaExecutionSpacePlugin.hpp (100%) rename src/{ => chai}/pluginLinker.hpp (100%) 
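PATCH 02 defines chai::ManagedArrayView as a RAJA::View whose underlying pointer type is a chai::ManagedArray, so multi-dimensional indexing and CHAI's automatic data motion compose. A minimal usage sketch, assuming the alias takes a value type and a RAJA layout type as in the NestedView test elsewhere in this series (sizes and policy are illustrative):

    #include "RAJA/RAJA.hpp"
    #include "chai/ManagedArray.hpp"
    #include "chai/ManagedArrayView.hpp"

    int main()
    {
      const int X = 4;
      const int Y = 3;

      chai::ManagedArray<float> storage(X * Y);

      // A 2-D view over the managed storage; RAJA::Layout<2> maps (i, j)
      // onto the linear index, while the ManagedArray handles data motion.
      chai::ManagedArrayView<float, RAJA::Layout<2>> v(storage, X, Y);

      RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, X), [=](int i) {
        for (int j = 0; j < Y; ++j) {
          v(i, j) = static_cast<float>(i * Y + j);
        }
      });

      storage.free();
      return 0;
    }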
delete mode 100644 src/tests/integration/CMakeLists.txt rename {src/tests => tests}/integration/chai-nested.cpp (100%) rename {src/tests => tests}/integration/raja-chai-tests.cpp (100%) diff --git a/cmake/thirdparty/SetupChaiThirdparty.cmake b/cmake/thirdparty/SetupChaiThirdparty.cmake index 08ab19af..afc7ebf0 100644 --- a/cmake/thirdparty/SetupChaiThirdparty.cmake +++ b/cmake/thirdparty/SetupChaiThirdparty.cmake @@ -59,6 +59,7 @@ blt_register_library( LIBRARIES umpire) if (ENABLE_RAJA_PLUGIN) + find_package(camp REQUIRED) find_package(RAJA REQUIRED) blt_register_library( diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 47d8da4b..1c6e1b1b 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -71,6 +71,6 @@ if (ENABLE_CUDA OR ENABLE_HIP) DEPENDS_ON ${chai_umpire_example_depends}) endif () -if (ENABLE_RAJA_PLUGIN) - add_subdirectory(integration) -endif () +# if (ENABLE_RAJA_PLUGIN) +# add_subdirectory(integration) +# endif () diff --git a/src/ManagedArrayView.hpp b/src/chai/ManagedArrayView.hpp similarity index 100% rename from src/ManagedArrayView.hpp rename to src/chai/ManagedArrayView.hpp diff --git a/src/RajaExecutionSpacePlugin.cpp b/src/chai/RajaExecutionSpacePlugin.cpp similarity index 100% rename from src/RajaExecutionSpacePlugin.cpp rename to src/chai/RajaExecutionSpacePlugin.cpp diff --git a/src/RajaExecutionSpacePlugin.hpp b/src/chai/RajaExecutionSpacePlugin.hpp similarity index 100% rename from src/RajaExecutionSpacePlugin.hpp rename to src/chai/RajaExecutionSpacePlugin.hpp diff --git a/src/pluginLinker.hpp b/src/chai/pluginLinker.hpp similarity index 100% rename from src/pluginLinker.hpp rename to src/chai/pluginLinker.hpp diff --git a/src/tests/integration/CMakeLists.txt b/src/tests/integration/CMakeLists.txt deleted file mode 100644 index dc34cbfb..00000000 --- a/src/tests/integration/CMakeLists.txt +++ /dev/null @@ -1,31 +0,0 @@ -############################################################################### -# -# Copyright (c) 2016-18, Lawrence Livermore National Security, LLC. -# -# Produced at the Lawrence Livermore National Laboratory -# -# LLNL-CODE-689114 -# -# All rights reserved. -# -# This file is part of RAJA. -# -# For details about use and distribution, please read RAJA/LICENSE. 
-# -############################################################################### - -set (raja_test_depends - chai umpire raja openmp gtest) - -blt_add_executable( - NAME raja-chai-tests - SOURCES raja-chai-tests.cpp - DEPENDS_ON ${raja_test_depends}) - -blt_add_test( - NAME raja-chai-tests - COMMAND raja-chai-tests) - -target_include_directories( - raja-chai-tests - PUBLIC ${PROJECT_BINARY_DIR}/include) diff --git a/tests/integration/CMakeLists.txt b/tests/integration/CMakeLists.txt index 22cbdd04..021b5147 100644 --- a/tests/integration/CMakeLists.txt +++ b/tests/integration/CMakeLists.txt @@ -24,3 +24,19 @@ target_include_directories( blt_add_test( NAME managed_array_test COMMAND managed_array_tests) + +set (raja_test_depends + chai umpire raja openmp gtest) + +blt_add_executable( + NAME raja-chai-tests + SOURCES raja-chai-tests.cpp + DEPENDS_ON ${raja_test_depends}) + +blt_add_test( + NAME raja-chai-tests + COMMAND raja-chai-tests) + +target_include_directories( + raja-chai-tests + PUBLIC ${PROJECT_BINARY_DIR}/include) diff --git a/src/tests/integration/chai-nested.cpp b/tests/integration/chai-nested.cpp similarity index 100% rename from src/tests/integration/chai-nested.cpp rename to tests/integration/chai-nested.cpp diff --git a/src/tests/integration/raja-chai-tests.cpp b/tests/integration/raja-chai-tests.cpp similarity index 100% rename from src/tests/integration/raja-chai-tests.cpp rename to tests/integration/raja-chai-tests.cpp From c0f3a819d3951a7461b07833e4ce7e410686b50b Mon Sep 17 00:00:00 2001 From: David Beckingsale Date: Thu, 12 Sep 2019 12:52:09 -0700 Subject: [PATCH 06/58] Fixing up tests --- tests/integration/CMakeLists.txt | 12 ++++++++++-- tests/integration/raja-chai-tests.cpp | 10 +++++----- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/tests/integration/CMakeLists.txt b/tests/integration/CMakeLists.txt index 021b5147..64c2dfe9 100644 --- a/tests/integration/CMakeLists.txt +++ b/tests/integration/CMakeLists.txt @@ -6,12 +6,19 @@ if (ENABLE_CUDA) ${managed_array_test_depends} cuda) endif () + if (ENABLE_HIP) set (managed_array_test_depends ${managed_array_test_depends} hip) endif () +if (ENABLE_OPENMP) + set (managed_array_test_depends + ${managed_array_test_depends} + openmp) +endif () + blt_add_executable( NAME managed_array_tests SOURCES managed_array_tests.cpp @@ -25,8 +32,9 @@ blt_add_test( NAME managed_array_test COMMAND managed_array_tests) -set (raja_test_depends - chai umpire raja openmp gtest) +set(raja_test_depends + ${managed_array_test_depends} + raja) blt_add_executable( NAME raja-chai-tests diff --git a/tests/integration/raja-chai-tests.cpp b/tests/integration/raja-chai-tests.cpp index aa3322ce..713dd1ec 100644 --- a/tests/integration/raja-chai-tests.cpp +++ b/tests/integration/raja-chai-tests.cpp @@ -49,28 +49,28 @@ CUDA_TEST(ChaiTest, Simple) { #if defined(CHAI_ENABLE_CUDA) - RAJA::forall >(0, 10, [=] __device__ (int i) { + RAJA::forall >(RAJA::RangeSegment(0, 10), [=] __device__ (int i) { v2[i] = v1[i]*2.0f; }); #else - RAJA::forall(0, 10, [=] (int i) { + RAJA::forall(RAJA::RangeSegment(0, 10), [=] (int i) { v2[i] = v1[i]*2.0f; }); #endif std::cout << "end of loop 2" << std::endl; - RAJA::forall(0, 10, [=] (int i) { + RAJA::forall(RAJA::RangeSegment(0, 10), [=] (int i) { ASSERT_FLOAT_EQ(v2[i], i*2.0f); }); #if defined(CHAI_ENABLE_CUDA) - RAJA::forall >(0, 10, [=] __device__ (int i) { + RAJA::forall >(RAJA::RangeSegment(0, 10), [=] __device__ (int i) { v2[i] *= 2.0f; }); #else - RAJA::forall(0, 10, [=] (int i) { + 
RAJA::forall(RAJA::RangeSegment(0, 10), [=] (int i) { v2[i] *= 2.0f; }); #endif From 2fe6a3876b2f16c1bf235f43c6fde0e26596105b Mon Sep 17 00:00:00 2001 From: David Beckingsale Date: Thu, 12 Sep 2019 13:27:56 -0700 Subject: [PATCH 07/58] Finish test fixup --- src/chai/ManagedArrayView.hpp | 3 +- src/chai/RajaExecutionSpacePlugin.cpp | 1 - tests/integration/CMakeLists.txt | 41 ++- tests/integration/chai-nested.cpp | 420 ------------------------- tests/integration/raja-chai-nested.cpp | 255 +++++++++++++++ tests/integration/raja-chai-tests.cpp | 117 +++---- 6 files changed, 333 insertions(+), 504 deletions(-) delete mode 100644 tests/integration/chai-nested.cpp create mode 100644 tests/integration/raja-chai-nested.cpp diff --git a/src/chai/ManagedArrayView.hpp b/src/chai/ManagedArrayView.hpp index afce47dd..2d53b522 100644 --- a/src/chai/ManagedArrayView.hpp +++ b/src/chai/ManagedArrayView.hpp @@ -48,11 +48,12 @@ #if defined(CHAI_ENABLE_RAJA_PLUGIN) #include "chai/ManagedArray.hpp" + #include "RAJA/util/View.hpp" namespace chai { -template + template using ManagedArrayView = RAJA::View>; diff --git a/src/chai/RajaExecutionSpacePlugin.cpp b/src/chai/RajaExecutionSpacePlugin.cpp index e46279da..71fe7716 100644 --- a/src/chai/RajaExecutionSpacePlugin.cpp +++ b/src/chai/RajaExecutionSpacePlugin.cpp @@ -66,7 +66,6 @@ RajaExecutionSpacePlugin::preLaunch(RAJA::util::PluginContext p) default: m_arraymanager->setExecutionSpace(chai::NONE); } - } void diff --git a/tests/integration/CMakeLists.txt b/tests/integration/CMakeLists.txt index 64c2dfe9..03e6fd61 100644 --- a/tests/integration/CMakeLists.txt +++ b/tests/integration/CMakeLists.txt @@ -32,19 +32,34 @@ blt_add_test( NAME managed_array_test COMMAND managed_array_tests) -set(raja_test_depends - ${managed_array_test_depends} - raja) +if (ENABLE_RAJA_PLUGIN) + set(raja_test_depends + ${managed_array_test_depends} + raja) -blt_add_executable( - NAME raja-chai-tests - SOURCES raja-chai-tests.cpp - DEPENDS_ON ${raja_test_depends}) + blt_add_executable( + NAME raja-chai-tests + SOURCES raja-chai-tests.cpp + DEPENDS_ON ${raja_test_depends}) -blt_add_test( - NAME raja-chai-tests - COMMAND raja-chai-tests) + blt_add_test( + NAME raja-chai-tests + COMMAND raja-chai-tests) -target_include_directories( - raja-chai-tests - PUBLIC ${PROJECT_BINARY_DIR}/include) + target_include_directories( + raja-chai-tests + PUBLIC ${PROJECT_BINARY_DIR}/include) + + blt_add_executable( + NAME raja-chai-nested-tests + SOURCES raja-chai-nested.cpp + DEPENDS_ON ${raja_test_depends}) + + blt_add_test( + NAME raja-chai-nested-tests + COMMAND raja-chai-nested-tests) + + target_include_directories( + raja-chai-nested-tests + PUBLIC ${PROJECT_BINARY_DIR}/include) +endif () diff --git a/tests/integration/chai-nested.cpp b/tests/integration/chai-nested.cpp deleted file mode 100644 index 97790808..00000000 --- a/tests/integration/chai-nested.cpp +++ /dev/null @@ -1,420 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-18, Lawrence Livermore National Security, LLC. -// -// Produced at the Lawrence Livermore National Laboratory -// -// LLNL-CODE-689114 -// -// All rights reserved. -// -// This file is part of RAJA. -// -// For details about use and distribution, please read RAJA/LICENSE. -// -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -/// -/// Source file containing tests for CHAI in RAJA nested loops. 
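The "Fixing up tests" patch (PATCH 06) is largely mechanical: each call site moves from the older begin/end overload of RAJA::forall to the RangeSegment form. A sketch of the change at one call site, assuming a sequential policy (the buffer here is illustrative):

    #include <vector>
    #include "RAJA/RAJA.hpp"

    int main()
    {
      std::vector<float> data(10);
      float* ptr = data.data();

      // Previously written as RAJA::forall<RAJA::seq_exec>(0, 10, body);
      // the patch switches every such call to the RangeSegment overload.
      RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, 10), [=](int i) {
        ptr[i] = static_cast<float>(i);
      });

      return 0;
    }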
-/// -/// -#include -#include -#include - -#include -#include -#include - -#include "RAJA/RAJA.hpp" -#include "RAJA_gtest.hpp" - -using namespace RAJA; -using namespace std; - -#include "chai/ArrayManager.hpp" -#include "chai/ManagedArray.hpp" - -/* - * Simple tests using forallN and View - */ -CUDA_TEST(Chai, NestedSimpleOld) { - typedef RAJA::NestedPolicy< RAJA::ExecList< RAJA::seq_exec, RAJA::seq_exec> > POLICY; - typedef RAJA::NestedPolicy< RAJA::ExecList< RAJA::seq_exec, RAJA::cuda_thread_y_exec > > POLICY_GPU; - - const int X = 16; - const int Y = 16; - - chai::ManagedArray v1(X*Y); - chai::ManagedArray v2(X*Y); - - RAJA::forallN(RangeSegment(0,Y), RangeSegment(0,X), [=] (int i, int j) { - int index = j*X + i; - v1[index] = index; - }); - - RAJA::forallN(RangeSegment(0,Y), RangeSegment(0,X), [=] __device__ (int i, int j) { - int index = j*X + i; - v2[index] = v1[index]*2.0f; - }); - cudaDeviceSynchronize(); - - RAJA::forallN(RangeSegment(0,Y), RangeSegment(0,X), [=] (int i, int j) { - int index = j*X + i; - ASSERT_FLOAT_EQ(v1[index], index*1.0f); - ASSERT_FLOAT_EQ(v2[index], index*2.0f); - }); -} - - -/* - * Simple tests using nested::forall and View - */ -CUDA_TEST(Chai, NestedSimple) { - typedef RAJA::KernelPolicy< - RAJA::statement::For<0, RAJA::seq_exec, - RAJA::statement::For<1, RAJA::seq_exec> > > POLICY; - typedef RAJA::KernelPolicy< - RAJA::statement::For<0, RAJA::seq_exec, - RAJA::statement::CudaKernel< - RAJA::statement::For<1, RAJA::cuda_threadblock_exec<32> > > > >POLICY_GPU; - - const int X = 16; - const int Y = 16; - - chai::ManagedArray v1(X*Y); - chai::ManagedArray v2(X*Y); - - RAJA::kernel( - - RAJA::make_tuple(RAJA::RangeSegment(0,Y), RAJA::RangeSegment(0,X) ), - - [=] (int i, int j) { - int index = j*X + i; - v1[index] = index; - }); - - RAJA::kernel( - - RAJA::make_tuple(RangeSegment(0,Y), RangeSegment(0,X) ), - - [=] __host__ __device__ (int i, int j) { - int index = j*X + i; - v2[index] = v1[index]*2.0f; - }); - - cudaDeviceSynchronize(); - - RAJA::kernel( - - RAJA::make_tuple(RAJA::RangeSegment(0,Y), RAJA::RangeSegment(0,X) ), - - [=] (int i, int j) { - int index = j*X + i; - ASSERT_FLOAT_EQ(v1[index], index*1.0f); - ASSERT_FLOAT_EQ(v2[index], index*2.0f); - }); -} - -CUDA_TEST(Chai, NestedView) { - typedef RAJA::NestedPolicy< RAJA::ExecList< RAJA::seq_exec, RAJA::seq_exec> > POLICY; - typedef RAJA::NestedPolicy< RAJA::ExecList< RAJA::seq_exec, RAJA::cuda_thread_y_exec > > POLICY_GPU; - - const int X = 16; - const int Y = 16; - - chai::ManagedArray v1_array(X*Y); - chai::ManagedArray v2_array(X*Y); - - typedef chai::ManagedArrayView > view; - - view v1(v1_array, X, Y); - view v2(v2_array, X, Y); - - RAJA::forallN(RangeSegment(0,Y), RangeSegment(0,X), [=] (int i, int j) { - v1(i,j) = (i+(j*X)) * 1.0f; - }); - - RAJA::forallN(RangeSegment(0,Y), RangeSegment(0,X), [=] __device__ (int i, int j) { - v2(i,j) = v1(i,j)*2.0f; - }); - - RAJA::forallN(RangeSegment(0,Y), RangeSegment(0,X), [=] (int i, int j) { - ASSERT_FLOAT_EQ(v2(i,j), v1(i,j)*2.0f); - }); -} - -CUDA_TEST(Chai, NestedView2) { - typedef RAJA::NestedPolicy< RAJA::ExecList< RAJA::seq_exec, RAJA::seq_exec> > POLICY; - -#if defined (RAJA_ENABLE_OPENMP) - typedef RAJA::NestedPolicy< RAJA::ExecList< RAJA::omp_for_nowait_exec, RAJA::cuda_thread_x_exec >, RAJA::OMP_Parallel<> > POLICY_GPU; -#else - typedef RAJA::NestedPolicy< RAJA::ExecList< RAJA::seq_exec, RAJA::cuda_thread_x_exec > > POLICY_GPU; -#endif - - const int X = 16; - const int Y = 16; - - chai::ManagedArray v1_array(X*Y); - chai::ManagedArray 
v2_array(X*Y); - - typedef RAJA::ManagedArrayView > view; - - view v1(v1_array, X, Y); - view v2(v2_array, X, Y); - - RAJA::forallN(RangeSegment(0,Y), RangeSegment(0,X), [=] (int i, int j) { - v1(i,j) = (i+(j*X)) * 1.0f; - }); - - RAJA::forallN(RangeSegment(0,Y), RangeSegment(0,X), [=] __device__ (int i, int j) { - v2(i,j) = v1(i,j)*2.0f; - }); - - RAJA::forallN(RangeSegment(0,Y), RangeSegment(0,X), [=] (int i, int j) { - ASSERT_FLOAT_EQ(v2(i,j), v1(i,j)*2.0f); - }); -} - -/////////////////////////////////////////////////////////////////////////// -// -// Example LTimes kernel test routines -// -// Demonstrates a 4-nested loop, the use of complex nested policies and -// the use of strongly-typed indices -// -// This routine computes phi(m, g, z) = SUM_d { ell(m, d)*psi(d,g,z) } -// -/////////////////////////////////////////////////////////////////////////// -typedef struct { - double val; - int idx; -} minmaxloc_t; - -// block_size is needed by the reduction variables to setup shared memory -// Care should be used here to cover the maximum block dimensions used by this -// test -const size_t block_size = 256; - -RAJA_INDEX_VALUE(IMoment, "IMoment"); -RAJA_INDEX_VALUE(IDirection, "IDirection"); -RAJA_INDEX_VALUE(IGroup, "IGroup"); -RAJA_INDEX_VALUE(IZone, "IZone"); - -template -void runLTimesTest(std::string const &policy, - Index_type num_moments, - Index_type num_directions, - Index_type num_groups, - Index_type num_zones) -{ - // cout << "\n TestLTimes " << num_moments << " moments, " << num_directions - // << " directions, " << num_groups << " groups, and " << num_zones - // << " zones" - // << " with policy " << policy << endl; - - // allocate data - // phi is initialized to all zeros, the others are randomized - chai::ManagedArray ell_data(num_moments * num_directions); - chai::ManagedArray psi_data(num_directions * num_groups * num_zones); - //chai::ManagedArray phi_data(num_moments * num_groups * num_zones, 0.0); - chai::ManagedArray phi_data(num_moments * num_groups * num_zones); - - // setup CUDA Reduction variables to be exercised - ReduceSum, double> pdsum(0.0); - ReduceMin, double> pdmin(DBL_MAX); - ReduceMax, double> pdmax(-DBL_MAX); - ReduceMinLoc, double> pdminloc(DBL_MAX, -1); - ReduceMaxLoc, double> pdmaxloc(-DBL_MAX, -1); - - - // data setup using RAJA to ensure that chai is activated - RAJA::forall(0, (num_moments*num_directions), [=] (int i) { - ell_data[i] = drand48(); - }); - - RAJA::forall(0, (num_directions*num_groups*num_zones), [=] (int i) { - psi_data[i] = drand48(); - }); - - RAJA::forall(0, (num_moments*num_groups*num_zones), [=] (int i) { - phi_data[i] = 0.0; - }); - - typename POL::ELL_VIEW ell(ell_data, RAJA::make_permuted_layout({num_moments, num_directions}, RAJA::as_array::get())); - typename POL::PSI_VIEW psi(psi_data, RAJA::make_permuted_layout({num_directions, num_groups, num_zones}, RAJA::as_array::get())); - typename POL::PHI_VIEW phi(phi_data, RAJA::make_permuted_layout({num_moments, num_groups, num_zones}, RAJA::as_array::get())); - - using EXEC = typename POL::EXEC; - - // do calculation using RAJA - forallN( - RangeSegment(0, num_moments), - RangeSegment(0, num_directions), - RangeSegment(0, num_groups), - RangeSegment(0, num_zones), - [=] __device__(IMoment m, IDirection d, IGroup g, IZone z) { - double val = ell(m, d) * psi(d, g, z); - phi(m, g, z) += val; - pdsum += val; - pdmin.min(val); - pdmax.max(val); - - int index = *d + (*m * num_directions) - + (*g * num_directions * num_moments) - + (*z * num_directions * num_moments * num_groups); - 
- pdminloc.minloc(val, index); - pdmaxloc.maxloc(val, index); - }); - - cudaDeviceSynchronize(); - - // Make sure data is copied to host for checking results. - chai::ArrayManager* rm = chai::ArrayManager::getInstance(); - rm->setExecutionSpace(chai::CPU); - // setup local Reduction variables as a crosscheck - double the_lsum = 0.0; - double the_lmin = DBL_MAX; - double the_lmax = -DBL_MAX; - - double* lsum = &the_lsum; - double* lmin = &the_lmin; - double* lmax = &the_lmax; - - forall(RangeSegment(0, num_zones), [=] (int z) { - for (IGroup g(0); g < num_groups; ++g) { - for (IMoment m(0); m < num_moments; ++m) { - double total = 0.0; - for (IDirection d(0); d < num_directions; ++d) { - double val = ell(m, d) * psi(d, g, IZone(z)); - total += val; - *lmin = RAJA_MIN(*lmin, val); - *lmax = RAJA_MAX(*lmax, val); - int index = *d + (*m * num_directions) - + (*g * num_directions * num_moments) - + (z * num_directions * num_moments * num_groups); - } - *lsum += total; - - // check answer with some reasonable tolerance - ASSERT_FLOAT_EQ(total, phi(m, g, IZone(z))); - } - } - }); - - rm->setExecutionSpace(chai::NONE); - - ASSERT_FLOAT_EQ(*lsum, pdsum.get()); - ASSERT_FLOAT_EQ(*lmin, pdmin.get()); - ASSERT_FLOAT_EQ(*lmax, pdmax.get()); -} - -// Use thread-block mappings -struct PolLTimesA_GPU { - // Loops: Moments, Directions, Groups, Zones - typedef NestedPolicy, - cuda_threadblock_y_exec<32>>> - EXEC; - - // psi[direction, group, zone] - typedef RAJA::TypedManagedArrayView, IDirection, IGroup, IZone> - PSI_VIEW; - - // phi[moment, group, zone] - typedef RAJA::TypedManagedArrayView, IMoment, IGroup, IZone> - PHI_VIEW; - - // ell[moment, direction] - typedef RAJA::TypedManagedArrayView, IMoment, IDirection> - ELL_VIEW; - - typedef RAJA::PERM_IJK PSI_PERM; - typedef RAJA::PERM_IJK PHI_PERM; - typedef RAJA::PERM_IJ ELL_PERM; -}; - -// Use thread and block mappings -struct PolLTimesB_GPU { - // Loops: Moments, Directions, Groups, Zones - typedef NestedPolicy, - Permute> - EXEC; - - // psi[direction, group, zone] - typedef RAJA::TypedManagedArrayView, IDirection, IGroup, IZone> - PSI_VIEW; - - // phi[moment, group, zone] - typedef RAJA::TypedManagedArrayView, IMoment, IGroup, IZone> - PHI_VIEW; - - // ell[moment, direction] - typedef RAJA::TypedManagedArrayView, IMoment, IDirection> - ELL_VIEW; - - typedef RAJA::PERM_IJK PSI_PERM; - typedef RAJA::PERM_IJK PHI_PERM; - typedef RAJA::PERM_IJ ELL_PERM; -}; - -// Combine OMP Parallel, omp nowait, and cuda thread-block launch -struct PolLTimesC_GPU { - // Loops: Moments, Directions, Groups, Zones -#if defined(RAJA_ENABLE_OPENMP) - typedef NestedPolicy>, - OMP_Parallel<>> - EXEC; -#else - typedef NestedPolicy> > - EXEC; -#endif - - // psi[direction, group, zone] - typedef RAJA::TypedManagedArrayView, IDirection, IGroup, IZone> - PSI_VIEW; - - // phi[moment, group, zone] - typedef RAJA::TypedManagedArrayView, IMoment, IGroup, IZone> - PHI_VIEW; - - // ell[moment, direction] - typedef RAJA::TypedManagedArrayView, IMoment, IDirection> - ELL_VIEW; - - typedef RAJA::PERM_IJK PSI_PERM; - typedef RAJA::PERM_IJK PHI_PERM; - typedef RAJA::PERM_IJ ELL_PERM; -}; - -void runLTimesTests(Index_type num_moments, - Index_type num_directions, - Index_type num_groups, - Index_type num_zones) -{ - runLTimesTest( - "PolLTimesA_GPU", num_moments, num_directions, num_groups, num_zones); - runLTimesTest( - "PolLTimesB_GPU", num_moments, num_directions, num_groups, num_zones); - runLTimesTest( - "PolLTimesC_GPU", num_moments, num_directions, num_groups, num_zones); -} - -TEST(Chai, 
Nested) { -// runLTimesTests(2, 0, 7, 3); - runLTimesTests(2, 3, 7, 3); - runLTimesTests(2, 3, 32, 4); - runLTimesTests(25, 96, 8, 32); - runLTimesTests(100, 15, 7, 13); -} diff --git a/tests/integration/raja-chai-nested.cpp b/tests/integration/raja-chai-nested.cpp new file mode 100644 index 00000000..5caee295 --- /dev/null +++ b/tests/integration/raja-chai-nested.cpp @@ -0,0 +1,255 @@ +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/COPYRIGHT file for details. +// +// SPDX-License-Identifier: (BSD-3-Clause) +//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// + +/// +/// Source file containing tests for CHAI in RAJA nested loops. +/// +/// +#include +#include +#include + +#include +#include +#include + +#include "RAJA/RAJA.hpp" + +using namespace RAJA; +using namespace std; + +#include "chai/ArrayManager.hpp" +#include "chai/ManagedArrayView.hpp" +#include "chai/ManagedArray.hpp" + +#include "gtest/gtest.h" + +#define CUDA_TEST(X, Y) \ + static void cuda_test_##X##_##Y(); \ + TEST(X, Y) { cuda_test_##X##_##Y(); } \ + static void cuda_test_##X##_##Y() + +/* + * Simple tests using nested::forall and View + */ +CUDA_TEST(Chai, NestedSimple) +{ + using POLICY = + RAJA::KernelPolicy< + RAJA::statement::For<0, RAJA::seq_exec, + RAJA::statement::For<1, RAJA::seq_exec + > + > + >; + + using POLICY_GPU = RAJA::KernelPolicy< + RAJA::statement::CudaKernel< + RAJA::statement::For<1, RAJA::cuda_block_x_loop, + RAJA::statement::For<0, RAJA::cuda_thread_x_loop, + RAJA::statement::Lambda<0> + > + > + > + >; + + const int X = 16; + const int Y = 16; + + chai::ManagedArray v1(X * Y); + chai::ManagedArray v2(X * Y); + + RAJA::kernel( + + RAJA::make_tuple(RAJA::RangeSegment(0, Y), RAJA::RangeSegment(0, X)), + + [=](int i, int j) { + int index = j * X + i; + v1[index] = index; + }); + + RAJA::kernel( + RAJA::make_tuple(RangeSegment(0, Y), RangeSegment(0, X)), + + [=] __host__ __device__(int i, int j) { + int index = j * X + i; + v2[index] = v1[index] * 2.0f; + }); + + cudaDeviceSynchronize(); + + RAJA::kernel( + + RAJA::make_tuple(RAJA::RangeSegment(0, Y), RAJA::RangeSegment(0, X)), + + [=](int i, int j) { + int index = j * X + i; + ASSERT_FLOAT_EQ(v1[index], index * 1.0f); + ASSERT_FLOAT_EQ(v2[index], index * 2.0f); + }); +} + +CUDA_TEST(Chai, NestedView) +{ + using POLICY = + RAJA::KernelPolicy< + RAJA::statement::For<0, RAJA::seq_exec, + RAJA::statement::For<1, RAJA::seq_exec + > + > + >; + + using POLICY_GPU = + RAJA::KernelPolicy< + RAJA::statement::CudaKernel< + RAJA::statement::For<1, RAJA::cuda_block_x_loop, + RAJA::statement::For<0, RAJA::cuda_thread_x_loop + > + > + > + >; + + const int X = 16; + const int Y = 16; + + chai::ManagedArray v1_array(X * Y); + chai::ManagedArray v2_array(X * Y); + + using view = chai::ManagedArrayView>; + + view v1(v1_array, X, Y); + view v2(v2_array, X, Y); + + RAJA::kernel(RAJA::make_tuple(RangeSegment(0, Y), RangeSegment(0, X)), + [=](int i, int j) { v1(i, j) = (i + (j * X)) * 1.0f; }); + + RAJA::kernel(RAJA::make_tuple(RangeSegment(0, Y), RangeSegment(0, X)), + [=] __device__(int i, int j) { + v2(i, j) = v1(i, j) * 2.0f; + }); + + RAJA::kernel(RAJA::make_tuple(RangeSegment(0, Y), RangeSegment(0, X)), + [=](int i, int j) { + ASSERT_FLOAT_EQ(v2(i, j), v1(i, j) * 2.0f); + }); +} + +/////////////////////////////////////////////////////////////////////////// +// +// Example LTimes kernel test 
routines +// +// Demonstrates a 4-nested loop, the use of complex nested policies and +// the use of strongly-typed indices +// +// This routine computes phi(m, g, z) = SUM_d { ell(m, d)*psi(d,g,z) } +// +/////////////////////////////////////////////////////////////////////////// + +RAJA_INDEX_VALUE_T(IM, int, "IM"); +RAJA_INDEX_VALUE_T(ID, int, "ID"); +RAJA_INDEX_VALUE_T(IG, int, "IG"); +RAJA_INDEX_VALUE_T(IZ, int, "IZ"); + +void runLTimesTests(Index_type num_moments, + Index_type num_directions, + Index_type num_groups, + Index_type num_zones) +{ + // allocate data + // phi is initialized to all zeros, the others are randomized + chai::ManagedArray L_data(num_moments * num_directions); + chai::ManagedArray psi_data(num_directions * num_groups * num_zones); + chai::ManagedArray phi_data(num_moments * num_groups * num_zones); + + RAJA::forall( + RAJA::RangeSegment(0, (num_moments * num_directions)), + [=](int i) { + L_data[i] = i+2; + }); + + RAJA::forall( + RAJA::RangeSegment(0, (num_directions * num_groups * num_zones)), + [=](int i) { psi_data[i] = 2*i+1; }); + + RAJA::forall( + RAJA::RangeSegment(0, (num_moments * num_groups * num_zones)), + [=](int i) { phi_data[i] = 0.0; }); + + using LView = chai::TypedManagedArrayView, IM, ID>; + + // psi(d, g, z) : 2 -> z is stride-1 dimension + using PsiView = chai::TypedManagedArrayView, ID, IG, IZ>; + + // phi(m, g, z) : 2 -> z is stride-1 dimension + using PhiView = chai::TypedManagedArrayView, IM, IG, IZ>; + + std::array L_perm {{0, 1}}; + LView L(L_data, + RAJA::make_permuted_layout({{num_moments, num_directions}}, L_perm)); + + std::array psi_perm {{0, 1, 2}}; + PsiView psi(psi_data, + RAJA::make_permuted_layout({{num_directions, num_groups, num_zones}}, psi_perm)); + + std::array phi_perm {{0, 1, 2}}; + PhiView phi(phi_data, + RAJA::make_permuted_layout({{num_moments, num_groups, num_zones}}, phi_perm)); + + using EXECPOL = + RAJA::KernelPolicy< + statement::CudaKernelAsync< + statement::For<0, cuda_block_x_loop, // m + statement::For<2, cuda_block_y_loop, // g + statement::For<3, cuda_thread_x_loop, // z + statement::For<1, seq_exec, // d + statement::Lambda<0> + > + > + > + > + > + >; + + auto segments = RAJA::make_tuple(RAJA::TypedRangeSegment(0, num_moments), + RAJA::TypedRangeSegment(0, num_directions), + RAJA::TypedRangeSegment(0, num_groups), + RAJA::TypedRangeSegment(0, num_zones)); + + cudaErrchk( cudaDeviceSynchronize() ); + + RAJA::kernel( segments, + [=] RAJA_DEVICE (IM m, ID d, IG g, IZ z) { + phi(m, g, z) += L(m, d) * psi(d, g, z); + } + ); + + cudaErrchk( cudaDeviceSynchronize() ); + + RAJA::forall( + RAJA::TypedRangeSegment(0, num_moments), [=] (IM m) { + for (IG g(0); g < num_groups; ++g) { + for (IZ z(0); z < num_zones; ++z) { + double total = 0.0; + for (ID d(0); d < num_directions; ++d) { + double val = L(m, d) * psi(d, g, z); + total += val; + } + ASSERT_FLOAT_EQ(total, phi(m, g, z)); + } + } + }); + + //rm->setExecutionSpace(chai::NONE); +} + +TEST(Chai, LTimes) +{ + // runLTimesTests(2, 0, 7, 3); + runLTimesTests(2, 3, 7, 3); + runLTimesTests(2, 3, 32, 4); + runLTimesTests(25, 96, 8, 32); + runLTimesTests(100, 15, 7, 13); +} diff --git a/tests/integration/raja-chai-tests.cpp b/tests/integration/raja-chai-tests.cpp index 713dd1ec..c1e9416c 100644 --- a/tests/integration/raja-chai-tests.cpp +++ b/tests/integration/raja-chai-tests.cpp @@ -1,128 +1,107 @@ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-18, Lawrence Livermore National Security, LLC. 
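The LTimes kernel above leans on the strongly-typed indices declared with RAJA_INDEX_VALUE_T: IM, ID, IG, and IZ are distinct types, so the typed views catch arguments passed in the wrong order at compile time instead of silently mis-indexing. A minimal sketch of the idea, reusing the L view from the test (the values are arbitrary):

    IM m(0);
    ID d(1);
    double ok = L(m, d);     // compiles: L is indexed as (IM, ID)
    // double bad = L(d, m); // should be rejected at compile time, not at run time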
-// -// Produced at the Lawrence Livermore National Laboratory -// -// LLNL-CODE-689114 -// -// All rights reserved. -// -// This file is part of RAJA. -// -// For details about use and distribution, please read RAJA/LICENSE. +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC +// and RAJA project contributors. See the RAJA/COPYRIGHT file for details. // +// SPDX-License-Identifier: (BSD-3-Clause) //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// /// /// Source file containing tests for CHAI with basic RAJA constructs /// - -#include "gtest/gtest.h" +#include "RAJA/RAJA.hpp" #include "chai/ManagedArray.hpp" -#include "chai/RajaExecutionSpacePlugin.hpp" +#include "chai/ManagedArrayView.hpp" -#define CUDA_TEST(X, Y) \ -static void cuda_test_ ## X ## Y();\ -TEST(X,Y) { cuda_test_ ## X ## Y();}\ -static void cuda_test_ ## X ## Y() - -#include "RAJA/RAJA.hpp" +#include -// // Register plugin with RAJA -// static RAJA::util::PluginRegistry::Add P( -// "RajaExecutionSpacePlugin", - // "Plugin to set CHAI execution space based on RAJA execution platform"); +#include "gtest/gtest.h" +#define CUDA_TEST(X, Y) \ + static void cuda_test_##X##_##Y(); \ + TEST(X, Y) { cuda_test_##X##_##Y(); } \ + static void cuda_test_##X##_##Y() -#include - -CUDA_TEST(ChaiTest, Simple) { +CUDA_TEST(ChaiTest, Simple) +{ chai::ManagedArray v1(10); chai::ManagedArray v2(10); - RAJA::forall(RAJA::RangeSegment(0, 10), [=] (int i) { - v1[i] = static_cast(i * 1.0f); + RAJA::forall(RAJA::RangeSegment(0, 10), [=](int i) { + v1[i] = static_cast(i * 1.0f); }); std::cout << "end of loop 1" << std::endl; -#if defined(CHAI_ENABLE_CUDA) - RAJA::forall >(RAJA::RangeSegment(0, 10), [=] __device__ (int i) { - v2[i] = v1[i]*2.0f; +#if defined(RAJA_ENABLE_CUDA) + RAJA::forall >(RAJA::RangeSegment(0, 10), [=] __device__(int i) { + v2[i] = v1[i] * 2.0f; }); #else - RAJA::forall(RAJA::RangeSegment(0, 10), [=] (int i) { - v2[i] = v1[i]*2.0f; - }); + RAJA::forall(RAJA::RangeSegment(0, 10), [=](int i) { v2[i] = v1[i] * 2.0f; }); #endif std::cout << "end of loop 2" << std::endl; - RAJA::forall(RAJA::RangeSegment(0, 10), [=] (int i) { - ASSERT_FLOAT_EQ(v2[i], i*2.0f); + RAJA::forall(RAJA::RangeSegment(0, 10), [=](int i) { + ASSERT_FLOAT_EQ(v2[i], i * 2.0f); }); -#if defined(CHAI_ENABLE_CUDA) - RAJA::forall >(RAJA::RangeSegment(0, 10), [=] __device__ (int i) { - v2[i] *= 2.0f; +#if defined(RAJA_ENABLE_CUDA) + RAJA::forall >(RAJA::RangeSegment(0, 10), [=] __device__(int i) { + v2[i] *= 2.0f; }); #else - RAJA::forall(RAJA::RangeSegment(0, 10), [=] (int i) { - v2[i] *= 2.0f; - }); + RAJA::forall(RAJA::RangeSegment(0, 10), [=](int i) { v2[i] *= 2.0f; }); #endif - float * raw_v2 = v2; - for (int i = 0; i < 10; i++ ) { - ASSERT_FLOAT_EQ(raw_v2[i], i*2.0f*2.0f);; + float* raw_v2 = v2; + for (int i = 0; i < 10; i++) { + ASSERT_FLOAT_EQ(raw_v2[i], i * 2.0f * 2.0f); + ; } } -#if 0 -CUDA_TEST(ChaiTest, Views) { +CUDA_TEST(ChaiTest, Views) +{ chai::ManagedArray v1_array(10); chai::ManagedArray v2_array(10); - typedef RAJA::ManagedArrayView > view; + using view = chai::ManagedArrayView >; view v1(v1_array, 10); view v2(v2_array, 10); - RAJA::forall(0, 10, [=] (int i) { - v1(i) = static_cast(i * 1.0f); + RAJA::forall(RAJA::RangeSegment(0, 10), [=](int i) { + v1(i) = static_cast(i * 1.0f); }); -#if defined(CHAI_ENABLE_CUDA) - RAJA::forall >(0, 10, [=] __device__ (int i) { - v2(i) = v1(i)*2.0f; +#if defined(RAJA_ENABLE_CUDA) + RAJA::forall >(RAJA::RangeSegment(0, 10), [=] __device__(int i) { + v2(i) = v1(i) * 2.0f; 
}); #else - RAJA::forall(0, 10, [=](int i) { - v2(i) = v1(i)*2.0f; - }); + RAJA::forall(RAJA::RangeSegment(0, 10), [=](int i) { v2(i) = v1(i) * 2.0f; }); #endif - RAJA::forall(0, 10, [=] (int i) { - ASSERT_FLOAT_EQ(v2(i), i*2.0f); + RAJA::forall(RAJA::RangeSegment(0, 10), [=](int i) { + ASSERT_FLOAT_EQ(v2(i), i * 2.0f); }); -#if defined(CHAI_ENABLE_CUDA) - RAJA::forall >(0, 10, [=] __device__ (int i) { - v2(i) *= 2.0f; +#if defined(RAJA_ENABLE_CUDA) + RAJA::forall >(RAJA::RangeSegment(0, 10), [=] __device__(int i) { + v2(i) *= 2.0f; }); #else - RAJA::forall(0, 10, [=](int i) { - v2(i) *= 2.0f; - }); + RAJA::forall(RAJA::RangeSegment(0, 10), [=](int i) { v2(i) *= 2.0f; }); #endif - float * raw_v2 = v2.data; - for (int i = 0; i < 10; i++ ) { - ASSERT_FLOAT_EQ(raw_v2[i], i*1.0f*2.0f*2.0f);; + float* raw_v2 = v2.data; + for (int i = 0; i < 10; i++) { + ASSERT_FLOAT_EQ(raw_v2[i], i * 1.0f * 2.0f * 2.0f); + ; } } -#endif From ba44bcdb3c8ca2e0fa7ef38f609834a9041dfc65 Mon Sep 17 00:00:00 2001 From: Alan Dayton Date: Thu, 12 Sep 2019 17:54:49 -0700 Subject: [PATCH 08/58] Add ability to control callbacks --- src/chai/ArrayManager.cpp | 21 ++++++++++++--------- src/chai/ArrayManager.hpp | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 9 deletions(-) diff --git a/src/chai/ArrayManager.cpp b/src/chai/ArrayManager.cpp index ec16cde8..54a32176 100644 --- a/src/chai/ArrayManager.cpp +++ b/src/chai/ArrayManager.cpp @@ -61,7 +61,8 @@ ArrayManager* ArrayManager::getInstance() ArrayManager::ArrayManager() : m_pointer_map{}, m_allocators{}, - m_resource_manager{umpire::ResourceManager::getInstance()} + m_resource_manager{umpire::ResourceManager::getInstance()}, + m_callbacks_active{true} { m_pointer_map.clear(); m_current_execution_space = NONE; @@ -196,7 +197,7 @@ void ArrayManager::move(PointerRecord* record, ExecutionSpace space) if (!record->m_touched[record->m_last_space]) { return; } else { - record->m_user_callback(ACTION_MOVE, space, record->m_size); + callback(record, ACTION_MOVE, space, record->m_size); std::lock_guard lock(m_mutex); m_resource_manager.copy(dst_pointer, src_pointer); } @@ -211,7 +212,7 @@ void ArrayManager::allocate( auto size = pointer_record->m_size; auto alloc = m_resource_manager.getAllocator(pointer_record->m_allocators[space]); - pointer_record->m_user_callback(ACTION_ALLOC, space, size); + callback(pointer_record, ACTION_ALLOC, space, size); pointer_record->m_pointers[space] = alloc.allocate(size); registerPointer(pointer_record, space); @@ -229,9 +230,10 @@ void ArrayManager::free(PointerRecord* pointer_record) void* space_ptr = pointer_record->m_pointers[space]; #if defined(CHAI_ENABLE_UM) if (space_ptr == pointer_record->m_pointers[UM]) { - pointer_record->m_user_callback(ACTION_FREE, - ExecutionSpace(UM), - pointer_record->m_size); + callback(pointer_record, + ACTION_FREE, + ExecutionSpace(UM), + pointer_record->m_size); { std::lock_guard lock(m_mutex); m_pointer_map.erase(space_ptr); @@ -247,9 +249,10 @@ void ArrayManager::free(PointerRecord* pointer_record) } } else { #endif - pointer_record->m_user_callback(ACTION_FREE, - ExecutionSpace(space), - pointer_record->m_size); + callback(pointer_record, + ACTION_FREE, + ExecutionSpace(space), + pointer_record->m_size); { std::lock_guard lock(m_mutex); m_pointer_map.erase(space_ptr); diff --git a/src/chai/ArrayManager.hpp b/src/chai/ArrayManager.hpp index 7afc21ab..82221d24 100644 --- a/src/chai/ArrayManager.hpp +++ b/src/chai/ArrayManager.hpp @@ -258,6 +258,16 @@ class ArrayManager int 
getAllocatorId(ExecutionSpace space) const; + /*! + * \brief Turn callbacks on. + */ + void enableCallbacks() { m_callbacks_active = true; } + + /*! + * \brief Turn callbacks off. + */ + void disableCallbacks() { m_callbacks_active = false; } + protected: /*! * \brief Construct a new ArrayManager. @@ -293,6 +303,23 @@ class ArrayManager */ void move(PointerRecord* record, ExecutionSpace space); + /*! + * \brief Execute a user callback if callbacks are active + * + * \param record The pointer record containing the callback + * \param action The event that occurred + * \param space The space in which the event occurred + * \param size The number of bytes in the array associated with this pointer record + */ + inline void callback(PointerRecord* record, + Action action, + ExecutionSpace space, + size_t size) const { + if (m_callbacks_active && record) { + record->m_user_callback(action, space, size); + } + } + /*! * Current execution space. */ @@ -317,6 +344,11 @@ class ArrayManager umpire::ResourceManager& m_resource_manager; mutable std::mutex m_mutex; + + /*! + * \brief Controls whether or not callbacks are called. + */ + bool m_callbacks_active; }; } // end of namespace chai From 2708a6003caee54fa922d29234fe3d6906f1a4da Mon Sep 17 00:00:00 2001 From: Alan Dayton Date: Fri, 13 Sep 2019 09:38:57 -0700 Subject: [PATCH 09/58] Add tests for controlling callbacks --- tests/unit/array_manager_unit_tests.cpp | 59 ++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/tests/unit/array_manager_unit_tests.cpp b/tests/unit/array_manager_unit_tests.cpp index 0aff35d9..eba731af 100644 --- a/tests/unit/array_manager_unit_tests.cpp +++ b/tests/unit/array_manager_unit_tests.cpp @@ -53,6 +53,7 @@ TEST(ArrayManager, Constructor) } #ifndef CHAI_DISABLE_RM + TEST(ArrayManager, getPointerMap) { chai::ArrayManager* rm = chai::ArrayManager::getInstance(); @@ -108,4 +109,60 @@ TEST(ArrayManager, getPointerMap) ASSERT_EQ(rm->getTotalSize(), (sizeOfArray1 * sizeof(int)) + (sizeOfArray2 * sizeof(double))); } -#endif + +TEST(ArrayManager, controlCallbacks) +{ + // First check that callbacks are turned on by default + chai::ArrayManager* arrayManager = chai::ArrayManager::getInstance(); + + // Variable for testing if callbacks are on or off + bool callbacksAreOn = false; + + // Allocate one array and set a callback + size_t sizeOfArray = 5; + chai::ManagedArray array1(sizeOfArray, chai::CPU); + array1.setUserCallback([&] (chai::Action, chai::ExecutionSpace, std::size_t) { + callbacksAreOn = true; + }); + + // Make sure the callback is called with ACTION_FREE + array1.free(); + ASSERT_TRUE(callbacksAreOn); + + // Now turn off callbacks + arrayManager->disableCallbacks(); + + // Reset the variable for testing if callbacks are on or off + callbacksAreOn = false; + + // Allocate another array and set a callback + chai::ManagedArray array2(sizeOfArray, chai::CPU); + array2.setUserCallback([&] (chai::Action, chai::ExecutionSpace, std::size_t) { + callbacksAreOn = true; + }); + + // Make sure the callback is called with ACTION_FREE + array2.free(); + ASSERT_FALSE(callbacksAreOn); + + // Now make sure the order doesn't matter for when the callback is set compared + // to when callbacks are enabled + + // Reset the variable for testing if callbacks are on or off + callbacksAreOn = false; + + // Allocate a third array and set a callback + chai::ManagedArray array3(sizeOfArray, chai::CPU); + array3.setUserCallback([&] (chai::Action, chai::ExecutionSpace, std::size_t) { + callbacksAreOn = true; + 
}); + + // Turn on callbacks + arrayManager->enableCallbacks(); + + // Make sure the callback is called with ACTION_FREE + array3.free(); + ASSERT_TRUE(callbacksAreOn); +} + +#endif // !CHAI_DISABLE_RM From 969e4bcad7a8dd42451539814287996d9ef19c04 Mon Sep 17 00:00:00 2001 From: Alan Dayton Date: Fri, 13 Sep 2019 09:43:08 -0700 Subject: [PATCH 10/58] Add documentation for callback test --- tests/unit/array_manager_unit_tests.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/unit/array_manager_unit_tests.cpp b/tests/unit/array_manager_unit_tests.cpp index eba731af..66edf586 100644 --- a/tests/unit/array_manager_unit_tests.cpp +++ b/tests/unit/array_manager_unit_tests.cpp @@ -110,6 +110,9 @@ TEST(ArrayManager, getPointerMap) (sizeOfArray1 * sizeof(int)) + (sizeOfArray2 * sizeof(double))); } +/*! + * \brief Tests to see if callbacks can be turned on or off + */ TEST(ArrayManager, controlCallbacks) { // First check that callbacks are turned on by default From ef55f1057cee012fbe3fb1e3258f6fb55ecbc5ff Mon Sep 17 00:00:00 2001 From: David Beckingsale Date: Mon, 16 Sep 2019 15:10:02 -0700 Subject: [PATCH 11/58] Add docs links --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 959dc3bf..b45a98c6 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,8 @@ [![Azure Build Status](https://dev.azure.com/davidbeckingsale/CHAI/_apis/build/status/LLNL.CHAI?branchName=develop)](https://dev.azure.com/davidbeckingsale/CHAI/_build/latest?definitionId=2&branchName=develop) [![Build Status](https://travis-ci.org/LLNL/CHAI.svg?branch=develop)](https://travis-ci.org/LLNL/CHAI) +[![Documentation Status](https://readthedocs.org/projects/chai/badge/?version=develop)](https://chai.readthedocs.io/en/develop/?badge=develop) + CHAI is a library that handles automatic data migration to different memory spaces behind an array-style interface. It was designed to work with @@ -22,7 +24,7 @@ of CUDA was detected. Once CMake has completed, CHAI can be built with Make: For more advanced configuration you can use standard CMake variables. -More information is available in the CHAI documentation. +More information is available in the [CHAI documentation](https://chai.readthedocs.io/en/develop/). 
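As a quick taste of the array-style interface, here is a minimal sketch (not a complete program, and it assumes a CUDA-enabled build used together with RAJA): a `chai::ManagedArray` can be used from both host and device loops, and the data follows the execution space.

```cpp
#include "chai/ManagedArray.hpp"
#include "RAJA/RAJA.hpp"

chai::ManagedArray<float> v(10);

RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, 10), [=] (int i) {
  v[i] = static_cast<float>(i);   // runs on the host
});

RAJA::forall<RAJA::cuda_exec<16>>(RAJA::RangeSegment(0, 10), [=] __device__ (int i) {
  v[i] *= 2.0f;                   // data is migrated to the GPU automatically
});
```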
## Authors From d08ad65580645defc9275b860cf1825b461b47c5 Mon Sep 17 00:00:00 2001 From: Alan Dayton Date: Tue, 24 Sep 2019 10:02:32 -0700 Subject: [PATCH 12/58] Fix offsets in thin ManagedArray --- src/chai/ManagedArray_thin.inl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/chai/ManagedArray_thin.inl b/src/chai/ManagedArray_thin.inl index 9568af28..88c34dca 100644 --- a/src/chai/ManagedArray_thin.inl +++ b/src/chai/ManagedArray_thin.inl @@ -57,6 +57,7 @@ CHAI_HOST_DEVICE ManagedArray::ManagedArray(): m_active_pointer(nullptr), m_resource_manager(nullptr), m_elems(0), + m_offset(0), m_is_slice(false) { } @@ -68,6 +69,7 @@ CHAI_HOST_DEVICE ManagedArray::ManagedArray( m_active_pointer(nullptr), m_resource_manager(nullptr), m_elems(elems), + m_offset(0), m_is_slice(false) { this->allocate(elems, space); @@ -79,6 +81,7 @@ CHAI_HOST_DEVICE ManagedArray::ManagedArray(std::nullptr_t) : m_active_pointer(nullptr), m_resource_manager(nullptr), m_elems(0), + m_offset(0), m_is_slice(false) { } @@ -90,6 +93,7 @@ CHAI_HOST_DEVICE ManagedArray::ManagedArray(ManagedArray const& other): m_active_pointer(other.m_active_pointer), m_resource_manager(other.m_resource_manager), m_elems(other.m_elems), + m_offset(other.m_offset), m_is_slice(other.m_is_slice) { } @@ -100,6 +104,7 @@ CHAI_HOST_DEVICE ManagedArray::ManagedArray(T* data, ArrayManager* array_mana m_active_pointer(data), m_resource_manager(array_manager), m_elems(elems), + m_offset(0), m_is_slice(false) { } From 9dcc087ee1ecde1527c01ee00e37584180bb3cc5 Mon Sep 17 00:00:00 2001 From: Alan Dayton Date: Tue, 24 Sep 2019 10:08:51 -0700 Subject: [PATCH 13/58] Make sure m_active_base_pointer is set in thin version --- src/chai/ManagedArray_thin.inl | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/chai/ManagedArray_thin.inl b/src/chai/ManagedArray_thin.inl index 88c34dca..70529011 100644 --- a/src/chai/ManagedArray_thin.inl +++ b/src/chai/ManagedArray_thin.inl @@ -53,8 +53,9 @@ namespace chai { template CHAI_INLINE -CHAI_HOST_DEVICE ManagedArray::ManagedArray(): +CHAI_HOST_DEVICE ManagedArray::ManagedArray() : m_active_pointer(nullptr), + m_active_base_pointer(nullptr), m_resource_manager(nullptr), m_elems(0), m_offset(0), @@ -65,8 +66,9 @@ CHAI_HOST_DEVICE ManagedArray::ManagedArray(): template CHAI_INLINE CHAI_HOST_DEVICE ManagedArray::ManagedArray( - size_t elems, ExecutionSpace space): + size_t elems, ExecutionSpace space) : m_active_pointer(nullptr), + m_active_base_pointer(nullptr), m_resource_manager(nullptr), m_elems(elems), m_offset(0), @@ -79,6 +81,7 @@ template CHAI_INLINE CHAI_HOST_DEVICE ManagedArray::ManagedArray(std::nullptr_t) : m_active_pointer(nullptr), + m_active_base_pointer(nullptr), m_resource_manager(nullptr), m_elems(0), m_offset(0), @@ -89,8 +92,9 @@ CHAI_HOST_DEVICE ManagedArray::ManagedArray(std::nullptr_t) : template CHAI_INLINE -CHAI_HOST_DEVICE ManagedArray::ManagedArray(ManagedArray const& other): +CHAI_HOST_DEVICE ManagedArray::ManagedArray(ManagedArray const& other) : m_active_pointer(other.m_active_pointer), + m_active_base_pointer(other.m_active_base_pointer), m_resource_manager(other.m_resource_manager), m_elems(other.m_elems), m_offset(other.m_offset), @@ -102,6 +106,7 @@ template CHAI_INLINE CHAI_HOST_DEVICE ManagedArray::ManagedArray(T* data, ArrayManager* array_manager, size_t elems, PointerRecord* pointer_record) : m_active_pointer(data), + m_active_base_pointer(data), m_resource_manager(array_manager), m_elems(elems), m_offset(0), From 
bd06267152ffecce6bef250075d446ec3353f0bf Mon Sep 17 00:00:00 2001 From: Alan Dayton Date: Tue, 24 Sep 2019 10:12:51 -0700 Subject: [PATCH 14/58] Make sure pointer record gets set in thin version --- src/chai/ManagedArray_thin.inl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/chai/ManagedArray_thin.inl b/src/chai/ManagedArray_thin.inl index 70529011..a63b6ed7 100644 --- a/src/chai/ManagedArray_thin.inl +++ b/src/chai/ManagedArray_thin.inl @@ -57,6 +57,7 @@ CHAI_HOST_DEVICE ManagedArray::ManagedArray() : m_active_pointer(nullptr), m_active_base_pointer(nullptr), m_resource_manager(nullptr), + m_pointer_record(nullptr), m_elems(0), m_offset(0), m_is_slice(false) @@ -70,6 +71,7 @@ CHAI_HOST_DEVICE ManagedArray::ManagedArray( m_active_pointer(nullptr), m_active_base_pointer(nullptr), m_resource_manager(nullptr), + m_pointer_record(nullptr), m_elems(elems), m_offset(0), m_is_slice(false) @@ -83,6 +85,7 @@ CHAI_HOST_DEVICE ManagedArray::ManagedArray(std::nullptr_t) : m_active_pointer(nullptr), m_active_base_pointer(nullptr), m_resource_manager(nullptr), + m_pointer_record(nullptr), m_elems(0), m_offset(0), m_is_slice(false) @@ -96,6 +99,7 @@ CHAI_HOST_DEVICE ManagedArray::ManagedArray(ManagedArray const& other) : m_active_pointer(other.m_active_pointer), m_active_base_pointer(other.m_active_base_pointer), m_resource_manager(other.m_resource_manager), + m_pointer_record(other.m_pointer_record), m_elems(other.m_elems), m_offset(other.m_offset), m_is_slice(other.m_is_slice) @@ -108,6 +112,7 @@ CHAI_HOST_DEVICE ManagedArray::ManagedArray(T* data, ArrayManager* array_mana m_active_pointer(data), m_active_base_pointer(data), m_resource_manager(array_manager), + m_pointer_record(pointer_record), m_elems(elems), m_offset(0), m_is_slice(false) From e909a3f52860cfede85149bd70a9bc3c95691d1b Mon Sep 17 00:00:00 2001 From: Alan Dayton Date: Tue, 24 Sep 2019 10:34:39 -0700 Subject: [PATCH 15/58] Fix compiler warnings --- src/chai/ManagedArray.hpp | 10 +++++----- src/chai/ManagedArray_thin.inl | 13 +++++++------ 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/src/chai/ManagedArray.hpp b/src/chai/ManagedArray.hpp index 3bcabdad..c4593549 100644 --- a/src/chai/ManagedArray.hpp +++ b/src/chai/ManagedArray.hpp @@ -344,24 +344,24 @@ class ManagedArray : public CHAICopyable /*! * Currently active data pointer. */ - mutable T* m_active_pointer; - mutable T* m_active_base_pointer; + mutable T* m_active_pointer = nullptr; + mutable T* m_active_base_pointer = nullptr; /*! * Pointer to ArrayManager instance. */ - ArrayManager* m_resource_manager; + ArrayManager* m_resource_manager = nullptr; /*! * Number of elements in the ManagedArray. */ - size_t m_elems; + size_t m_elems = 0; size_t m_offset = 0; /*! * Pointer to PointerRecord data. 
*/ - PointerRecord* m_pointer_record; + PointerRecord* m_pointer_record = nullptr; bool m_is_slice = false; diff --git a/src/chai/ManagedArray_thin.inl b/src/chai/ManagedArray_thin.inl index a63b6ed7..25edc36f 100644 --- a/src/chai/ManagedArray_thin.inl +++ b/src/chai/ManagedArray_thin.inl @@ -57,9 +57,9 @@ CHAI_HOST_DEVICE ManagedArray::ManagedArray() : m_active_pointer(nullptr), m_active_base_pointer(nullptr), m_resource_manager(nullptr), - m_pointer_record(nullptr), m_elems(0), m_offset(0), + m_pointer_record(nullptr), m_is_slice(false) { } @@ -71,9 +71,9 @@ CHAI_HOST_DEVICE ManagedArray::ManagedArray( m_active_pointer(nullptr), m_active_base_pointer(nullptr), m_resource_manager(nullptr), - m_pointer_record(nullptr), m_elems(elems), m_offset(0), + m_pointer_record(nullptr), m_is_slice(false) { this->allocate(elems, space); @@ -85,9 +85,9 @@ CHAI_HOST_DEVICE ManagedArray::ManagedArray(std::nullptr_t) : m_active_pointer(nullptr), m_active_base_pointer(nullptr), m_resource_manager(nullptr), - m_pointer_record(nullptr), m_elems(0), m_offset(0), + m_pointer_record(nullptr), m_is_slice(false) { } @@ -99,9 +99,9 @@ CHAI_HOST_DEVICE ManagedArray::ManagedArray(ManagedArray const& other) : m_active_pointer(other.m_active_pointer), m_active_base_pointer(other.m_active_base_pointer), m_resource_manager(other.m_resource_manager), - m_pointer_record(other.m_pointer_record), m_elems(other.m_elems), m_offset(other.m_offset), + m_pointer_record(other.m_pointer_record), m_is_slice(other.m_is_slice) { } @@ -112,9 +112,9 @@ CHAI_HOST_DEVICE ManagedArray::ManagedArray(T* data, ArrayManager* array_mana m_active_pointer(data), m_active_base_pointer(data), m_resource_manager(array_manager), - m_pointer_record(pointer_record), m_elems(elems), m_offset(0), + m_pointer_record(pointer_record), m_is_slice(false) { } @@ -135,8 +135,9 @@ CHAI_HOST ManagedArray ManagedArray::slice(size_t offset, size_t elems) { template CHAI_INLINE -CHAI_HOST void ManagedArray::allocate(size_t elems, ExecutionSpace space, UserCallback const &cback) { +CHAI_HOST void ManagedArray::allocate(size_t elems, ExecutionSpace space, UserCallback const &) { if(!m_is_slice) { + (void) space; // Quiet compiler warning when CHAI_LOG does nothing CHAI_LOG("ManagedArray", "Allocating array of size " << elems << " in space " << space); m_elems = elems; From 9683d9d63657ab9c66df80fb3a6341ecb8035db4 Mon Sep 17 00:00:00 2001 From: Alan Dayton Date: Tue, 24 Sep 2019 12:19:46 -0700 Subject: [PATCH 16/58] Fix linking error --- src/chai/ManagedArray_thin.inl | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/chai/ManagedArray_thin.inl b/src/chai/ManagedArray_thin.inl index 25edc36f..6ac92bcb 100644 --- a/src/chai/ManagedArray_thin.inl +++ b/src/chai/ManagedArray_thin.inl @@ -51,6 +51,35 @@ namespace chai { +template +CHAI_INLINE +CHAI_HOST_DEVICE ManagedArray::ManagedArray( + std::initializer_list spaces, + std::initializer_list allocators) : + ManagedArray() +{ + if (m_pointer_record) { + int i = 0; + + for (auto& space : spaces) { + m_pointer_record->m_allocators[space] = allocators.begin()[i++].getId(); + } + } +} + +template +CHAI_INLINE +CHAI_HOST_DEVICE ManagedArray::ManagedArray( + size_t elems, + std::initializer_list spaces, + std::initializer_list allocators, + ExecutionSpace space) : + ManagedArray(spaces, allocators) +{ + m_elems = elems; + this->allocate(elems, space); +} + template CHAI_INLINE CHAI_HOST_DEVICE ManagedArray::ManagedArray() : From 03498099e157566ca4dc70b738e2f16cfb8e0cc0 Mon Sep 17 00:00:00 2001 
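The constructors added above let a caller pair each execution space with a specific umpire allocator. A hedged sketch of how they might be called ("HOST" and "DEVICE" are umpire's built-in resource allocators; the final argument chooses where the initial allocation is made):

    auto& rm = umpire::ResourceManager::getInstance();
    chai::ManagedArray<double> a(
        100,                                                   // number of elements
        {chai::CPU, chai::GPU},                                // execution spaces
        {rm.getAllocator("HOST"), rm.getAllocator("DEVICE")},  // matching allocators
        chai::CPU);                                            // space of the first allocation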
From: Alan Dayton Date: Tue, 24 Sep 2019 12:42:04 -0700 Subject: [PATCH 17/58] Add log messages for attempting to do certain operations on slices --- src/chai/.ManagedArray_thin.inl.swp | Bin 0 -> 28672 bytes src/chai/ManagedArray_thin.inl | 53 +++++++++++++++++++--------- 2 files changed, 37 insertions(+), 16 deletions(-) create mode 100644 src/chai/.ManagedArray_thin.inl.swp diff --git a/src/chai/.ManagedArray_thin.inl.swp b/src/chai/.ManagedArray_thin.inl.swp new file mode 100644 index 0000000000000000000000000000000000000000..e811814f55fef2bdd6220e283b0cf045c460b020 GIT binary patch literal 28672 zcmeI450D$xeaDvqA%Oy!G!0ENX`TggC4saKU?3zZkT5V!LX)OIN=s7yHYA0nWau>0(8*9T1ZXJ`m^S2JXPQ5R^z(bW zds^wF`(q5ubXGGT>$JP?-}n9Y?fd=SThDFWo|;ytkN&>F=dFfu;s+0$_t<574oiLF zFr!l1=QQeP$eCq7_C=+7%dDQZY1yj`W31h>$9CjT8S^q8D^<-?D`Vw_k~P+{%kCf3 zrFv<%S&7?rX_!TM;DiA6VDiA6VDiA6VDiA6VDiA6VDiA6V zDsT`Alq!ZHzWWQ}@U#B!`~UYGZx~fL7YguB_-**#;|$}c@H9LDx59h9Ngcj_E)4_m;2};}rUx6FpO1J{%;T>>190PBG zH^WZ|f;($|bnNJek(i37ic{Hqp*WG;o=PNDa1+Ls!gwqeJP>I)l~~M6U38)<=4Oz}q*Ixs z;h42%wd9yvoPABRUaFa@Fm4!^7)r65qqSkYRIN73oR64Qv(_3>t!AlgMmHN5b49Ia zwTi8(RW>8@rE1HJ>SJDoi?&&A*p)!NB|&3;zGXU*jWk1k>P+3Ny(6~SYP9XLS*+=u zCBb$FM42nK`Y98*VySG3Ps_T4J8))*H63J*lNPsNAG{yf4wN`_|hq z&LHBqt+h+vx#yv5=h*0+x!bBoqPwh*lky6`ftan?cVqOcQ z>SDvagn>y}WVlh1So12Hu&(W;lL?TV?kl-re3(Qelrt7aBUO>1;voU`+)V$NH2vl7v@N@n6))5+rObd((Y zq(ec$Mi-jRAx^rz#`P9}_I#t-IvXd%5 zo0w7)@#*-sl4>@ebpK4{RX#ge*b&bqM^tiqGNV$HDn7A2l}}E1 z7fbSVD(>bzqOv(v$mR+vlFB5~vlFSzHkC_lGj&nI*8gr~4HVoOFykeVPxJWVBMl8Kc3kvum! zJ(G^-cIpb!ihS~%SuWzF*CY9Wcq5X}R5m?94Jku6IzxnQvur~JL#eAs6C;!?(D9+CtqHd7Zgyk5g4Z$^UmIq zZK}PNv(RWejGkuMnzzcTX0UlOj!g491Z7EJ$Z6?+RvP7Y&8#~mnLDXcy)s624P9i7siMtO zwk4I^V^ye}v|kEyWvMqD=8Gd7uv$tU!%{jWS9mx#>MhAl*KEGQlWVV4->s-Xxmscj zwUt%(GuohXX0T>0IX7q3OZGm+ldGm`ZL+paS1MCyS(i&9Kn1vt#M2qX?`(co*NR`iR6aUH@^3b z(@*!07sOs9bH#VFWNW*Tgt{!bYDV7`sN+eJF736=I`jH8_lR9%x<5X~O@8ha;VpK7^l-eDN zhwcVL{Qqn5q5lEjTKxY>&tH55U;hsHB3uJk!2xK41;Ze5fXCrcxCicnJK(S2({MHH zf)s3nN!SFx3&+7D!~|}KTj4L^A~0bmtOJP|907;JOT-GEgh%0Ou;DB?5`Ij~;D_)7 zcpjdEZ^QlY88{z?Ap-XkQ@9uIfm`4PkXVBQaaa#0z)^4{904y%+yO+RheW6F6DklY z5GoKV5GoKV5GwG-D&U5R6jQ=wB00t-asjTNct_o#CO0^5n@-!Vt6_1YB>Lus(*0mu z@9F`^#Yfz11OrEUsi1*ZArs*WU!M41KH7DU_EvJ2bzYx(7S=D-&#&sP^W6JGr79OH z%d^lGE6v5Ll2Dscpho^8S=WsstBSICwid}ccC2uJ`^ljEOO%pe`GmQL*ie37y}V#I z>bkVeI%2-g51AK>iP?#GF-|jwtYmssbXhE-WAam6g#5pPwnqpVul9`q9F z$tS8c}xE?MA2d3d{5Pdhof3R2J5qJ>VAbSWdfJs;nL-0=cPx?gm z5;9B??TmudG0A%1yc!~W2vX9_qxE>Zj_7a#d0~7EH`vblSUxF{d z<**)!y3P6DCq9 z)*E&ASs!Jl(fuftq5L&$@8YCj$xs}q)zUGoX_YTn6ri`^fd}@=gWRBp-b^ZSb`(>S z8Oox$ss1CZ1XP_!bmhYIi0Uq8>pauhZIP~p(4i4E#8LziUY14XZ6eBg@5#C$y<}~& z0flFxzaHM(RamDtJ(TIPOqji{3~nU}Uc^=2JWMW+#iYRCG*wRnZL`Ljf;SbEXQXsl zaON7!J*w+@qEwX3-ACS}+&o0 zoNn=<>bN)y7qzA=UeI~AsrMi(*MEaeU{Ni0L-{$e>ZIfTrAh^RYsJ)%+Onn7$KJjO z^hb-qec#F1-E-ArDPAsOB<9{CrTgdg49Yv~&88?PU_IXTT*Azn^)J`aDPSm`5*-up z3Sq^B`A=g$Qi!TyU7pT3U_wgiw#dAx+lV@imA1-hC%@ z%xFd9?ZjVnB5$A(gIV(j%kwu`t~srrKwGaY^6HI^D_L&b-6S@^ZgqnU$vGlV_^6xd zW(iQ8)3!{9fC(Gvsw~g5-Jvd->|J1AgKrzE(lA?WI3QA0-%U1sj9>Cz)n}Hq*#x~k zS1MoN#%a`gd6umzv0T)>XU}a(Ad)d)3gg?af8%LIykyJ84D^h~n@H-Zii}9^3`hSQ z?L3h&YsYSz?vUl5)a7u-Gbw3+f2w-`=bBziRwpz1$X;7rCRup1qvb}hRGpbJUb0(= z$~2T)jPjp+tms?pFuPF6eijC|xkkHQX}R++Yd+!@NXc%+*%GOJc(;do=CE!d!LUQ` z1wFrsKj@vt-r(n!<=(IUj$SE9o81bXA_miuewn??Uc0@m%w9BX?OT}j!@%Fr+#OR3 z&JX>-T3~|WRlqx=^!oov_}hOA;{W^a_dkVC|8=+%Zh`&qL1;h?D>5N?1^ z!pA}ONL&PU*aTznF1VXGz+LbKxE#)b8Td86{wweldraQ5hO}1cH%dUG*CUl1{^^zkCl2|=iJ{79NhQul|3)F-#fmX z{0{nuhsY`G9n=;l1p{USen)ouMExBsHOOz%g{&}HUtH|f(|_&FXsLZq$bSVf^q_iI 
z0(yVHK^~M^+h@kT0dxB8(P7{9^j$Fb9k8|J==qC!AaGyKTSMyL-iTp0sOS4}fv~~q z-oy1bKpc#Vs$Z@MgbVt#BNnYn6clf9Fw`*8Xd{ zuDt%QUgdgTknW>W;0js)Z{st^@vFuEAMqTzXYlDCfV<$6a3x#;Wsr6Mv*0u^;AMRL zU%=1dS&(-E{s!)VFTw3_8_4?rd*K5302~9~!S}xpz6|@}kKrRwgA%+8o~3S&fW!tq z54XVQK;8qm6m;GDIilgQP=Qc^P=Qc^P=Qc^P=Qc^P=Qc^RVm::ManagedArray(ManagedArray const& other) : template CHAI_INLINE -CHAI_HOST_DEVICE ManagedArray::ManagedArray(T* data, ArrayManager* array_manager, size_t elems, PointerRecord* pointer_record) : +CHAI_HOST_DEVICE ManagedArray::ManagedArray(T* data, + ArrayManager* array_manager, + size_t elems, + PointerRecord* pointer_record) : m_active_pointer(data), m_active_base_pointer(data), m_resource_manager(array_manager), @@ -164,58 +167,76 @@ CHAI_HOST ManagedArray ManagedArray::slice(size_t offset, size_t elems) { template CHAI_INLINE -CHAI_HOST void ManagedArray::allocate(size_t elems, ExecutionSpace space, UserCallback const &) { - if(!m_is_slice) { +CHAI_HOST void ManagedArray::allocate(size_t elems, + ExecutionSpace space, + UserCallback const &) { + if (!m_is_slice) { (void) space; // Quiet compiler warning when CHAI_LOG does nothing - CHAI_LOG("ManagedArray", "Allocating array of size " << elems << " in space " << space); + CHAI_LOG("ManagedArray", "Allocating array of size " << elems + << " in space " + << space); m_elems = elems; #if defined(CHAI_ENABLE_UM) - cudaMallocManaged(&m_active_pointer, sizeof(T)*elems); + cudaMallocManaged(&m_active_pointer, sizeof(T) * elems); #else - m_active_pointer = static_cast(malloc(sizeof(T)*elems)); + m_active_pointer = static_cast(malloc(sizeof(T) * elems)); #endif CHAI_LOG("ManagedArray", "m_active_ptr allocated at address: " << m_active_pointer); } + else { + CHAI_LOG("ManagedArray", "Attempted to allocate slice!"); + } } template CHAI_INLINE CHAI_HOST void ManagedArray::reallocate(size_t new_elems) { - if(!m_is_slice) { - CHAI_LOG("ManagedArray", "Reallocating array of size " << m_elems << " with new size" << elems); + if (!m_is_slice) { + CHAI_LOG("ManagedArray", "Reallocating array of size " << m_elems + << " with new size" + << elems); T* new_ptr; - #if defined(CHAI_ENABLE_UM) - cudaMallocManaged(&new_ptr, sizeof(T)*new_elems); - - cudaMemcpy(new_ptr, m_active_pointer, sizeof(T)*m_elems, cudaMemcpyDefault); + #if defined(CHAI_ENABLE_UM) + cudaMallocManaged(&new_ptr, sizeof(T) * new_elems); + cudaMemcpy(new_ptr, m_active_pointer, sizeof(T) * m_elems, cudaMemcpyDefault); cudaFree(m_active_pointer); #else - new_ptr = static_cast(realloc(m_active_pointer, sizeof(T)*new_elems)); + new_ptr = static_cast(realloc(m_active_pointer, sizeof(T) * new_elems)); #endif m_elems = new_elems; m_active_pointer = new_ptr; + m_active_base_pointer = m_active_pointer; CHAI_LOG("ManagedArray", "m_active_ptr reallocated at address: " << m_active_pointer); } + else { + CHAI_LOG("ManagedArray", "Attempted to realloc slice!"); + } } template CHAI_INLINE CHAI_HOST void ManagedArray::free() { - if(!m_is_slice) { + if (!m_is_slice) { #if defined(CHAI_ENABLE_UM) - cudaFree(m_active_pointer); + cudaFree(m_active_base_pointer); #else - ::free(m_active_pointer); + ::free(m_active_base_pointer); #endif + + m_active_base_pointer = nullptr; + m_active_pointer = nullptr; + } + else { + CHAI_LOG("ManagedArray", "tried to free slice!"); } } From 9892e5afac9e0bea71e0e85b6b312c7468b56848 Mon Sep 17 00:00:00 2001 From: Alan Dayton Date: Tue, 24 Sep 2019 14:12:42 -0700 Subject: [PATCH 18/58] Use default initializers --- src/chai/.ManagedArray_thin.inl.swp | Bin 28672 -> 0 bytes 
src/chai/ManagedArray_thin.inl | 35 +++++----------------------- 2 files changed, 6 insertions(+), 29 deletions(-) delete mode 100644 src/chai/.ManagedArray_thin.inl.swp diff --git a/src/chai/.ManagedArray_thin.inl.swp b/src/chai/.ManagedArray_thin.inl.swp deleted file mode 100644 index e811814f55fef2bdd6220e283b0cf045c460b020..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 28672 zcmeI450D$xeaDvqA%Oy!G!0ENX`TggC4saKU?3zZkT5V!LX)OIN=s7yHYA0nWau>0(8*9T1ZXJ`m^S2JXPQ5R^z(bW zds^wF`(q5ubXGGT>$JP?-}n9Y?fd=SThDFWo|;ytkN&>F=dFfu;s+0$_t<574oiLF zFr!l1=QQeP$eCq7_C=+7%dDQZY1yj`W31h>$9CjT8S^q8D^<-?D`Vw_k~P+{%kCf3 zrFv<%S&7?rX_!TM;DiA6VDiA6VDiA6VDiA6VDiA6VDiA6V zDsT`Alq!ZHzWWQ}@U#B!`~UYGZx~fL7YguB_-**#;|$}c@H9LDx59h9Ngcj_E)4_m;2};}rUx6FpO1J{%;T>>190PBG zH^WZ|f;($|bnNJek(i37ic{Hqp*WG;o=PNDa1+Ls!gwqeJP>I)l~~M6U38)<=4Oz}q*Ixs z;h42%wd9yvoPABRUaFa@Fm4!^7)r65qqSkYRIN73oR64Qv(_3>t!AlgMmHN5b49Ia zwTi8(RW>8@rE1HJ>SJDoi?&&A*p)!NB|&3;zGXU*jWk1k>P+3Ny(6~SYP9XLS*+=u zCBb$FM42nK`Y98*VySG3Ps_T4J8))*H63J*lNPsNAG{yf4wN`_|hq z&LHBqt+h+vx#yv5=h*0+x!bBoqPwh*lky6`ftan?cVqOcQ z>SDvagn>y}WVlh1So12Hu&(W;lL?TV?kl-re3(Qelrt7aBUO>1;voU`+)V$NH2vl7v@N@n6))5+rObd((Y zq(ec$Mi-jRAx^rz#`P9}_I#t-IvXd%5 zo0w7)@#*-sl4>@ebpK4{RX#ge*b&bqM^tiqGNV$HDn7A2l}}E1 z7fbSVD(>bzqOv(v$mR+vlFB5~vlFSzHkC_lGj&nI*8gr~4HVoOFykeVPxJWVBMl8Kc3kvum! zJ(G^-cIpb!ihS~%SuWzF*CY9Wcq5X}R5m?94Jku6IzxnQvur~JL#eAs6C;!?(D9+CtqHd7Zgyk5g4Z$^UmIq zZK}PNv(RWejGkuMnzzcTX0UlOj!g491Z7EJ$Z6?+RvP7Y&8#~mnLDXcy)s624P9i7siMtO zwk4I^V^ye}v|kEyWvMqD=8Gd7uv$tU!%{jWS9mx#>MhAl*KEGQlWVV4->s-Xxmscj zwUt%(GuohXX0T>0IX7q3OZGm+ldGm`ZL+paS1MCyS(i&9Kn1vt#M2qX?`(co*NR`iR6aUH@^3b z(@*!07sOs9bH#VFWNW*Tgt{!bYDV7`sN+eJF736=I`jH8_lR9%x<5X~O@8ha;VpK7^l-eDN zhwcVL{Qqn5q5lEjTKxY>&tH55U;hsHB3uJk!2xK41;Ze5fXCrcxCicnJK(S2({MHH zf)s3nN!SFx3&+7D!~|}KTj4L^A~0bmtOJP|907;JOT-GEgh%0Ou;DB?5`Ij~;D_)7 zcpjdEZ^QlY88{z?Ap-XkQ@9uIfm`4PkXVBQaaa#0z)^4{904y%+yO+RheW6F6DklY z5GoKV5GoKV5GwG-D&U5R6jQ=wB00t-asjTNct_o#CO0^5n@-!Vt6_1YB>Lus(*0mu z@9F`^#Yfz11OrEUsi1*ZArs*WU!M41KH7DU_EvJ2bzYx(7S=D-&#&sP^W6JGr79OH z%d^lGE6v5Ll2Dscpho^8S=WsstBSICwid}ccC2uJ`^ljEOO%pe`GmQL*ie37y}V#I z>bkVeI%2-g51AK>iP?#GF-|jwtYmssbXhE-WAam6g#5pPwnqpVul9`q9F z$tS8c}xE?MA2d3d{5Pdhof3R2J5qJ>VAbSWdfJs;nL-0=cPx?gm z5;9B??TmudG0A%1yc!~W2vX9_qxE>Zj_7a#d0~7EH`vblSUxF{d z<**)!y3P6DCq9 z)*E&ASs!Jl(fuftq5L&$@8YCj$xs}q)zUGoX_YTn6ri`^fd}@=gWRBp-b^ZSb`(>S z8Oox$ss1CZ1XP_!bmhYIi0Uq8>pauhZIP~p(4i4E#8LziUY14XZ6eBg@5#C$y<}~& z0flFxzaHM(RamDtJ(TIPOqji{3~nU}Uc^=2JWMW+#iYRCG*wRnZL`Ljf;SbEXQXsl zaON7!J*w+@qEwX3-ACS}+&o0 zoNn=<>bN)y7qzA=UeI~AsrMi(*MEaeU{Ni0L-{$e>ZIfTrAh^RYsJ)%+Onn7$KJjO z^hb-qec#F1-E-ArDPAsOB<9{CrTgdg49Yv~&88?PU_IXTT*Azn^)J`aDPSm`5*-up z3Sq^B`A=g$Qi!TyU7pT3U_wgiw#dAx+lV@imA1-hC%@ z%xFd9?ZjVnB5$A(gIV(j%kwu`t~srrKwGaY^6HI^D_L&b-6S@^ZgqnU$vGlV_^6xd zW(iQ8)3!{9fC(Gvsw~g5-Jvd->|J1AgKrzE(lA?WI3QA0-%U1sj9>Cz)n}Hq*#x~k zS1MoN#%a`gd6umzv0T)>XU}a(Ad)d)3gg?af8%LIykyJ84D^h~n@H-Zii}9^3`hSQ z?L3h&YsYSz?vUl5)a7u-Gbw3+f2w-`=bBziRwpz1$X;7rCRup1qvb}hRGpbJUb0(= z$~2T)jPjp+tms?pFuPF6eijC|xkkHQX}R++Yd+!@NXc%+*%GOJc(;do=CE!d!LUQ` z1wFrsKj@vt-r(n!<=(IUj$SE9o81bXA_miuewn??Uc0@m%w9BX?OT}j!@%Fr+#OR3 z&JX>-T3~|WRlqx=^!oov_}hOA;{W^a_dkVC|8=+%Zh`&qL1;h?D>5N?1^ z!pA}ONL&PU*aTznF1VXGz+LbKxE#)b8Td86{wweldraQ5hO}1cH%dUG*CUl1{^^zkCl2|=iJ{79NhQul|3)F-#fmX z{0{nuhsY`G9n=;l1p{USen)ouMExBsHOOz%g{&}HUtH|f(|_&FXsLZq$bSVf^q_iI z0(yVHK^~M^+h@kT0dxB8(P7{9^j$Fb9k8|J==qC!AaGyKTSMyL-iTp0sOS4}fv~~q z-oy1bKpc#Vs$Z@MgbVt#BNnYn6clf9Fw`*8Xd{ zuDt%QUgdgTknW>W;0js)Z{st^@vFuEAMqTzXYlDCfV<$6a3x#;Wsr6Mv*0u^;AMRL 
zU%=1dS&(-E{s!)VFTw3_8_4?rd*K5302~9~!S}xpz6|@}kKrRwgA%+8o~3S&fW!tq z54XVQK;8qm6m;GDIilgQP=Qc^P=Qc^P=Qc^P=Qc^P=Qc^RVm::ManagedArray( template CHAI_INLINE -CHAI_HOST_DEVICE ManagedArray::ManagedArray() : - m_active_pointer(nullptr), - m_active_base_pointer(nullptr), - m_resource_manager(nullptr), - m_elems(0), - m_offset(0), - m_pointer_record(nullptr), - m_is_slice(false) +CHAI_HOST_DEVICE ManagedArray::ManagedArray() { } template CHAI_INLINE -CHAI_HOST_DEVICE ManagedArray::ManagedArray( - size_t elems, ExecutionSpace space) : - m_active_pointer(nullptr), - m_active_base_pointer(nullptr), - m_resource_manager(nullptr), - m_elems(elems), - m_offset(0), - m_pointer_record(nullptr), - m_is_slice(false) +CHAI_HOST_DEVICE ManagedArray::ManagedArray(size_t elems, ExecutionSpace space) : + m_elems(elems) { this->allocate(elems, space); } template CHAI_INLINE -CHAI_HOST_DEVICE ManagedArray::ManagedArray(std::nullptr_t) : - m_active_pointer(nullptr), - m_active_base_pointer(nullptr), - m_resource_manager(nullptr), - m_elems(0), - m_offset(0), - m_pointer_record(nullptr), - m_is_slice(false) +CHAI_HOST_DEVICE ManagedArray::ManagedArray(std::nullptr_t) { } @@ -145,9 +124,7 @@ CHAI_HOST_DEVICE ManagedArray::ManagedArray(T* data, m_active_base_pointer(data), m_resource_manager(array_manager), m_elems(elems), - m_offset(0), - m_pointer_record(pointer_record), - m_is_slice(false) + m_pointer_record(pointer_record) { } @@ -155,7 +132,7 @@ template CHAI_INLINE CHAI_HOST ManagedArray ManagedArray::slice(size_t offset, size_t elems) { ManagedArray slice; - if(offset + elems > size()) { + if (offset + elems > size()) { CHAI_LOG("ManagedArray", "Invalid slice. No active pointer or index out of bounds"); } else { slice.m_active_pointer = m_active_pointer + offset; From 7a70daf7d8164d6d7d3171d7b91728cf3f043780 Mon Sep 17 00:00:00 2001 From: Alan Dayton Date: Tue, 24 Sep 2019 14:23:10 -0700 Subject: [PATCH 19/58] Use implicitly generated constructors --- src/chai/ManagedArray_thin.inl | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/src/chai/ManagedArray_thin.inl b/src/chai/ManagedArray_thin.inl index bdf47069..528751ee 100644 --- a/src/chai/ManagedArray_thin.inl +++ b/src/chai/ManagedArray_thin.inl @@ -82,9 +82,7 @@ CHAI_HOST_DEVICE ManagedArray::ManagedArray( template CHAI_INLINE -CHAI_HOST_DEVICE ManagedArray::ManagedArray() -{ -} +CHAI_HOST_DEVICE ManagedArray::ManagedArray() = default; template CHAI_INLINE @@ -100,19 +98,9 @@ CHAI_HOST_DEVICE ManagedArray::ManagedArray(std::nullptr_t) { } - template CHAI_INLINE -CHAI_HOST_DEVICE ManagedArray::ManagedArray(ManagedArray const& other) : - m_active_pointer(other.m_active_pointer), - m_active_base_pointer(other.m_active_base_pointer), - m_resource_manager(other.m_resource_manager), - m_elems(other.m_elems), - m_offset(other.m_offset), - m_pointer_record(other.m_pointer_record), - m_is_slice(other.m_is_slice) -{ -} +CHAI_HOST_DEVICE ManagedArray::ManagedArray(ManagedArray const& other) = default; template CHAI_INLINE From 30a844c4150d89eb0b2e5fa20cf3fe1ce8019b17 Mon Sep 17 00:00:00 2001 From: "Adam J. 
Kunen" Date: Fri, 27 Sep 2019 14:08:56 -0700 Subject: [PATCH 20/58] Modified to allow bringing chai and umpire into a higher level project with add_subdirectory --- cmake/thirdparty/SetupChaiThirdparty.cmake | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/cmake/thirdparty/SetupChaiThirdparty.cmake b/cmake/thirdparty/SetupChaiThirdparty.cmake index 7c50a86e..ee8ffbb4 100644 --- a/cmake/thirdparty/SetupChaiThirdparty.cmake +++ b/cmake/thirdparty/SetupChaiThirdparty.cmake @@ -42,13 +42,16 @@ ####################################################################### set(ENABLE_FORTRAN Off CACHE BOOL "Enable Fortran in Umpire") -if (DEFINED umpire_DIR) - find_package(umpire REQUIRED) - blt_register_library( - NAME umpire - INCLUDES ${UMPIRE_INCLUDE_DIRS} - LIBRARIES umpire) -else () - add_subdirectory(${PROJECT_SOURCE_DIR}/src/tpl/umpire) +if (NOT TARGET umpire) + if (DEFINED umpire_DIR) + find_package(umpire REQUIRED) + + blt_register_library( + NAME umpire + INCLUDES ${UMPIRE_INCLUDE_DIRS} + LIBRARIES umpire) + else () + add_subdirectory(${PROJECT_SOURCE_DIR}/src/tpl/umpire) + endif() endif() From eb8b8da8145793b3a0ca1ab8a7cf2e486d4f3be8 Mon Sep 17 00:00:00 2001 From: Alan Dayton Date: Mon, 30 Sep 2019 12:20:38 -0700 Subject: [PATCH 21/58] Added managed_ptr --- src/chai/CMakeLists.txt | 1 + src/chai/managed_ptr.hpp | 1375 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 1376 insertions(+) create mode 100644 src/chai/managed_ptr.hpp diff --git a/src/chai/CMakeLists.txt b/src/chai/CMakeLists.txt index 7821fa3f..350cdf30 100644 --- a/src/chai/CMakeLists.txt +++ b/src/chai/CMakeLists.txt @@ -59,6 +59,7 @@ set (chai_headers ExecutionSpaces.hpp ManagedArray.hpp ManagedArray.inl + managed_ptr.hpp PointerRecord.hpp Types.hpp) diff --git a/src/chai/managed_ptr.hpp b/src/chai/managed_ptr.hpp new file mode 100644 index 00000000..36c25d95 --- /dev/null +++ b/src/chai/managed_ptr.hpp @@ -0,0 +1,1375 @@ +// --------------------------------------------------------------------- +// Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All +// rights reserved. +// +// Produced at the Lawrence Livermore National Laboratory. +// +// This file is part of CHAI. +// +// LLNL-CODE-705877 +// +// For details, see https:://github.com/LLNL/CHAI +// Please also see the NOTICE and LICENSE files. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// - Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the +// distribution. +// +// - Neither the name of the LLNS/LLNL nor the names of its contributors +// may be used to endorse or promote products derived from this +// software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS +// OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED +// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY +// WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// --------------------------------------------------------------------- + +#ifndef MANAGED_PTR_H_ +#define MANAGED_PTR_H_ + +#include "chai/config.hpp" + +#ifndef CHAI_DISABLE_RM +#include "chai/ArrayManager.hpp" +#endif + +#include "chai/ChaiMacros.hpp" +#include "chai/ExecutionSpaces.hpp" +#include "chai/ManagedArray.hpp" +#include "chai/Types.hpp" + +// Standard libary headers +#include +#include + +#ifdef __CUDACC__ + +inline void gpuErrorCheck(cudaError_t code, const char *file, int line, bool abort=true) +{ + if (code != cudaSuccess) { + fprintf(stderr, "[CHAI] GPU Error: %s %s %d\n", cudaGetErrorString(code), file, line); + if (abort) { + exit(code); + } + } +} + +#if DEBUG +#define GPU_ERROR_CHECK(code) { gpuErrorCheck((code), __FILE__, __LINE__); } +#else +#define GPU_ERROR_CHECK(code) code +#endif + +inline void debug_cudaDeviceSynchronize() { +#if DEBUG + GPU_ERROR_CHECK(cudaDeviceSynchronize()); +#endif +} + +#endif // __CUDACC__ + +namespace chai { + namespace detail { +#ifdef __CUDACC__ + template + __global__ void destroy_on_device(T* gpuPointer); +#endif + } + + struct managed_ptr_record { + managed_ptr_record() : + m_num_references(1), + m_callback() + { + } + + managed_ptr_record(std::function callback) : + m_num_references(1), + m_callback(callback) + { + } + + size_t use_count() { + return m_num_references; + } + + void addReference() { + m_num_references++; + } + + void removeReference() { + m_num_references--; + } + + ExecutionSpace getLastSpace() { + return m_last_space; + } + + void set_callback(std::function callback) { + m_callback = callback; + } + + size_t m_num_references = 1; /// The reference counter + ExecutionSpace m_last_space = NONE; /// The last space executed in + std::function m_callback; /// Callback to handle events + }; + + /// + /// @class managed_ptr + /// @author Alan Dayton + /// + /// This wrapper stores both host and device pointers so that polymorphism can be + /// used in both contexts with a single API. It is modeled after std::shared_ptr, + /// so it does reference counting and automatically cleans up when the last + /// reference is destroyed. If we ever do multi-threading on the CPU, locking will + /// need to be added to the reference counter. + /// The make_managed and make_managed_from_factory functions call new on both the + /// host and device so that polymorphism is valid in both contexts. Simply copying + /// an object to the device will not copy the vtable, so new must be called on + /// the device. + /// + /// Usage Requirements: + /// Methods that can be called on both the host and device must be declared + /// with the __host__ __device__ specifiers. This includes constructors + /// and destructors. Furthermore, destructors of base and child classes + /// must all be declared virtual. + /// This wrapper does NOT automatically sync the device object if the host object + /// is updated and vice versa. 
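  ///    For instance, after "ptr->setValue(1);" on the host, the device-side
  ///    instance still holds the old value. A sketch of updating both copies
  ///    (assuming chai::managed_ptr<MyClass> ptr, with MyClass::setValue
  ///    declared __host__ __device__):
  ///
  ///       ptr->setValue(1);                                // host instance
  ///       RAJA::forall<RAJA::cuda_exec<128>>(RAJA::RangeSegment(0, 1),
  ///          [=] __device__ (int) { ptr->setValue(1); });  // device instance
  ///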
If you wish to keep both instances in sync, + /// you must explicitly modify the object in both the host context and the + /// device context. + /// Raw array members of T need to be initialized correctly with a host or + /// device pointer. If a ManagedArray is passed to the make_managed or + /// make_managed_from_factory methods in place of a raw array, it will be + /// cast to the appropriate host or device pointer when passed to T's + /// constructor on the host and on the device. If it is desired that these + /// host and device pointers be kept in sync, define a callback that maintains + /// a copy of the ManagedArray and upon the ACTION_MOVE event calls the copy + /// constructor of that ManagedArray. + /// If a raw array is passed to make_managed, accessing that member will be + /// valid only in the correct context. To prevent the accidental use of that + /// member in the wrong context, any methods that access it should be __host__ + /// only or __device__ only. Special care should be taken when passing raw + /// arrays as arguments to member functions. + /// The same restrictions for raw array members also apply to raw pointer members. + /// A managed_ptr can be passed to the make_managed or make_managed_from_factory + /// methods in place of a raw pointer, and the host constructor of T will + /// be given the extracted host pointer, and likewise the device constructor + /// of T will be given the extracted device pointer. It is recommended that + /// a callback is defined that maintains a copy of the managed_ptr so that + /// the raw pointers are not accidentally destroyed prematurely (since + /// managed_ptr does reference counting). It is also recommended that the + /// callback calls the copy constructor of the managed_ptr on the ACTION_MOVE + /// event so that the ACTION_MOVE event is triggered also for the inner + /// managed_ptr. + /// Again, if a raw pointer is passed to make_managed, accessing that member will + /// only be valid in the correct context. Take care when passing raw pointers + /// as arguments to member functions. + /// Be aware that only the debug version of CHAI will check for GPU errors. So + /// if you are seeing strange behavior and/or your code crashes in the + /// constructor/destructor of T, then build CHAI as debug to see what is + /// going on. For example, the constructor of T might run out of per-thread + /// stack space on the GPU. If that happens, you can increase the device + /// limit of per-thread stack space. Alternatively, you could add a call + /// to cudaDeviceSynchronize after calling make_managed and check the return + /// code of cudaDeviceSynchronize. + /// + template + class managed_ptr { + public: + using element_type = T; + + /// + /// @author Alan Dayton + /// + /// Default constructor. + /// Initializes the reference count to 0. + /// + CHAI_HOST_DEVICE constexpr managed_ptr() noexcept {} + + /// + /// @author Alan Dayton + /// + /// Construct from nullptr. + /// Initializes the reference count to 0. + /// + CHAI_HOST_DEVICE constexpr managed_ptr(std::nullptr_t) noexcept {} + + /// + /// @author Alan Dayton + /// + /// Constructs a managed_ptr from the given pointers. U* must be convertible + /// to T*. 
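  ///
  /// A sketch of direct construction, given hypothetical Base and Derived types
  /// and pointers "host_ptr" and "device_ptr" of type Derived* that were already
  /// allocated and constructed in their respective spaces (the make_managed
  /// helpers described above are the usual way to create a managed_ptr):
  ///
  ///    chai::managed_ptr<Base> p({chai::CPU, chai::GPU}, {host_ptr, device_ptr});
  ///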
+ /// + /// @pre spaces.size() == pointers.size() + /// + /// @param[in] spaces A list of execution spaces + /// @param[in] pointers A list of pointers to take ownership of + /// + template + managed_ptr(std::initializer_list spaces, + std::initializer_list pointers) : + m_cpu_pointer(nullptr), + m_gpu_pointer(nullptr), + m_pointer_record(new managed_ptr_record()) + { + static_assert(std::is_convertible::value, + "U* must be convertible to T*."); + + // TODO: In c++14 convert to a static_assert + if (spaces.size() != pointers.size()) { + printf("[CHAI] WARNING: The number of spaces is different than the number of pointers given!\n"); + } + + int i = 0; + + for (const auto& space : spaces) { + switch (space) { + case CPU: + m_cpu_pointer = pointers.begin()[i++]; + break; +#ifdef __CUDACC__ + case GPU: + m_gpu_pointer = pointers.begin()[i++]; + break; +#endif + default: + ++i; + printf("[CHAI] WARNING: Execution space not supported by chai::managed_ptr!\n"); + break; + } + } + } + + /// + /// @author Alan Dayton + /// + /// Constructs a managed_ptr from the given pointers and callback function. + /// U* must be convertible to T*. + /// + /// @pre spaces.size() == pointers.size() + /// + /// @param[in] spaces A list of execution spaces + /// @param[in] pointers A list of pointers to take ownership of + /// @param[in] callback The user defined callback to call on trigger events + /// + template + CHAI_HOST managed_ptr(std::initializer_list spaces, + std::initializer_list pointers, + std::function callback) : + m_cpu_pointer(nullptr), + m_gpu_pointer(nullptr), + m_pointer_record(new managed_ptr_record(callback)) + { + static_assert(std::is_convertible::value, + "U* must be convertible to T*."); + + // TODO: In c++14 convert to a static_assert + if (spaces.size() != pointers.size()) { + printf("[CHAI] WARNING: The number of spaces is different than the number of pointers given.\n"); + } + + int i = 0; + + for (const auto& space : spaces) { + switch (space) { + case CPU: + m_cpu_pointer = pointers.begin()[i++]; + break; +#ifdef __CUDACC__ + case GPU: + m_gpu_pointer = pointers.begin()[i++]; + break; +#endif + default: + ++i; + printf("[CHAI] WARNING: Execution space not supported by chai::managed_ptr!\n"); + break; + } + } + } + + /// + /// @author Alan Dayton + /// + /// Copy constructor. + /// Constructs a copy of the given managed_ptr, increases the reference count, + /// and if the execution space is different, calls the user defined callback + /// with ACTION_MOVE for each of the execution spaces. + /// + /// @param[in] other The managed_ptr to copy + /// + CHAI_HOST_DEVICE managed_ptr(const managed_ptr& other) noexcept : + m_cpu_pointer(other.m_cpu_pointer), + m_gpu_pointer(other.m_gpu_pointer), + m_pointer_record(other.m_pointer_record) + { +#ifndef __CUDA_ARCH__ + addReference(); + move(); +#endif + } + + /// + /// @author Alan Dayton + /// + /// Converting constructor. + /// Constructs a copy of the given managed_ptr, increases the reference count, + /// and if the execution space is different, calls the user defined callback + /// with ACTION_MOVE for each of the execution spaces. U* must be convertible + /// to T*. 
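  ///
  /// For example (a sketch with hypothetical types, where Derived publicly
  /// inherits from Base and has a __host__ __device__ default constructor):
  ///
  ///    chai::managed_ptr<Derived> derived = chai::make_managed<Derived>();
  ///    chai::managed_ptr<Base> base = derived;   // uses this converting constructor
  ///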
+ /// + /// @param[in] other The managed_ptr to copy + /// + template + CHAI_HOST_DEVICE managed_ptr(const managed_ptr& other) noexcept : + m_cpu_pointer(other.m_cpu_pointer), + m_gpu_pointer(other.m_gpu_pointer), + m_pointer_record(other.m_pointer_record) + { + static_assert(std::is_convertible::value, + "U* must be convertible to T*."); + +#ifndef __CUDA_ARCH__ + addReference(); + move(); +#endif + } + + /// + /// @author Alan Dayton + /// + /// Aliasing constructor. + /// Has the same ownership information as other, but holds different pointers. + /// + /// @pre spaces.size() == pointers.size() + /// + /// @param[in] other The managed_ptr to copy ownership information from + /// @param[in] spaces A list of execution spaces + /// @param[in] pointers A list of pointers to maintain a reference to + /// + template + CHAI_HOST managed_ptr(const managed_ptr& other, + std::initializer_list spaces, + std::initializer_list pointers) noexcept : + m_pointer_record(other.m_pointer_record) + { + // TODO: In c++14 convert to a static_assert + if (spaces.size() != pointers.size()) { + printf("[CHAI] WARNING: The number of spaces is different than the number of pointers given.\n"); + } + + int i = 0; + + for (const auto& space : spaces) { + switch (space) { + case CPU: + m_cpu_pointer = pointers.begin()[i++]; + break; +#ifdef __CUDACC__ + case GPU: + m_gpu_pointer = pointers.begin()[i++]; + break; +#endif + default: + ++i; + printf("[CHAI] WARNING: Execution space not supported by chai::managed_ptr!\n"); + break; + } + } + + addReference(); + move(); + } + + /// + /// @author Alan Dayton + /// + /// Destructor. Decreases the reference count and if this is the last reference, + /// clean up. + /// + CHAI_HOST_DEVICE ~managed_ptr() { +#ifdef __CUDACC__ + // This trick came from Max Katz at Nvidia. + // Taking the address of this kernel ensures that it gets instantiated + // by the compiler and can be used within __CUDA_ARCH__. Without this, + // calling destroy_on_device within the confines of __CUDA_ARCH__ will + // always fail with error code 0x8 (invalid device function). + // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#restrictions + // From the CUDA Programming Guide Restrictions: + // "If a __global__ function template is instantiated and launched from + // the host, then the function template must be instantiated with the + // same template arguments irrespective of whether __CUDA_ARCH__ is + // defined and regardless of the value of __CUDA_ARCH__." + (void) &detail::destroy_on_device; +#endif + +#ifndef __CUDA_ARCH__ + removeReference(); +#endif + } + + /// + /// @author Alan Dayton + /// + /// Copy assignment operator. + /// Copies the given managed_ptr and increases the reference count. + /// + /// @param[in] other The managed_ptr to copy + /// + CHAI_HOST_DEVICE managed_ptr& operator=(const managed_ptr& other) noexcept { + if (this != &other) { +#ifndef __CUDA_ARCH__ + removeReference(); +#endif + + m_cpu_pointer = other.m_cpu_pointer; + m_gpu_pointer = other.m_gpu_pointer; + m_pointer_record = other.m_pointer_record; + +#ifndef __CUDA_ARCH__ + addReference(); + move(); +#endif + } + + return *this; + } + + /// + /// @author Alan Dayton + /// + /// Conversion copy assignment operator. + /// Copies the given managed_ptr and increases the reference count. + /// U* must be convertible to T*. 
+ /// + /// @param[in] other The managed_ptr to copy + /// + template + CHAI_HOST_DEVICE managed_ptr& operator=(const managed_ptr& other) noexcept { + static_assert(std::is_convertible::value, + "U* must be convertible to T*."); + +#ifndef __CUDA_ARCH__ + removeReference(); +#endif + + m_cpu_pointer = other.m_cpu_pointer; + m_gpu_pointer = other.m_gpu_pointer; + m_pointer_record = other.m_pointer_record; + +#ifndef __CUDA_ARCH__ + addReference(); + move(); +#endif + + return *this; + } + + /// + /// @author Alan Dayton + /// + /// Returns the CPU or GPU pointer depending on the calling context. + /// + CHAI_HOST_DEVICE inline T* get() const { +#ifndef __CUDA_ARCH__ + move(); + return m_cpu_pointer; +#else + return m_gpu_pointer; +#endif + } + + /// + /// @author Alan Dayton + /// + /// Returns the pointer corresponding to the given execution space. + /// + /// @param[in] space The execution space + /// @param[in] move Whether or not to trigger the move event (default is true) + /// + CHAI_HOST inline T* get(const ExecutionSpace space, const bool move=true) const { + if (move) { + this->move(); + } + + switch (space) { + case CPU: + return m_cpu_pointer; +#ifdef __CUDACC__ + case GPU: + return m_gpu_pointer; +#endif + default: + return nullptr; + } + } + + /// + /// @author Alan Dayton + /// + /// Returns the CPU or GPU pointer depending on the calling context. + /// + CHAI_HOST_DEVICE inline T* operator->() const { +#ifndef __CUDA_ARCH__ + return m_cpu_pointer; +#else + return m_gpu_pointer; +#endif + } + + /// + /// @author Alan Dayton + /// + /// Returns the CPU or GPU reference depending on the calling context. + /// + CHAI_HOST_DEVICE inline T& operator*() const { +#ifndef __CUDA_ARCH__ + return *m_cpu_pointer; +#else + return *m_gpu_pointer; +#endif + } + + /// + /// @author Alan Dayton + /// + /// Returns the number of managed_ptrs owning these pointers. + /// + CHAI_HOST std::size_t use_count() const { + if (m_pointer_record) { + return m_pointer_record->use_count(); + } + else { + return 0; + } + } + + /// + /// @author Alan Dayton + /// + /// Returns true if the contained pointer is not nullptr, false otherwise. + /// + CHAI_HOST_DEVICE inline explicit operator bool() const noexcept { + return get() != nullptr; + } + + /// + /// @author Alan Dayton + /// + /// Sets the callback, which can be used to handle specific actions. + /// ACTION_MOVE can be used to call the copy constructor for ManagedArrays. + /// ACTION_FREE can be used to provide a custom deleter operation. Use + /// ExecutionSpace::NONE if freeing anything other than the actual object + /// pointers. + /// + /// @param[in] callback The callback to call when certain actions occur + /// + CHAI_HOST void set_callback(std::function callback) { + if (m_pointer_record) { + m_pointer_record->set_callback(callback); + } + else { + printf("[CHAI] WARNING: No callback is allowed for managed_ptr that does not contain a valid pointer (i.e. the default or nullptr constructor was used)!\n"); + } + } + + private: + T* m_cpu_pointer = nullptr; /// The CPU pointer + T* m_gpu_pointer = nullptr; /// The GPU pointer + managed_ptr_record* m_pointer_record = nullptr; /// The pointer record + + /// Needed for the converting constructor + template + friend class managed_ptr; + + /// Needed to use the make_managed API + template + friend CHAI_HOST managed_ptr make_managed(Args... args); + + /// + /// @author Alan Dayton + /// + /// If the execution space has changed, calls the user provided callback + /// with the ACTION_MOVE event. 
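  /// The callback receives the Action, the ExecutionSpace involved, and a void*
  /// to the pointer for that space, and returns a bool (only consulted for
  /// ACTION_FREE). A sketch of a callback set through set_callback that keeps a
  /// captured ManagedArray moving with the object (inner_array is a hypothetical
  /// chai::ManagedArray<double> that was passed to the object's constructor):
  ///
  ///    ptr.set_callback([inner_array] (chai::Action action, chai::ExecutionSpace, void*) {
  ///       if (action == chai::ACTION_MOVE) {
  ///          chai::ManagedArray<double> copy(inner_array);  // copy constructor triggers the move
  ///       }
  ///       return false;  // let managed_ptr perform its default cleanup on ACTION_FREE
  ///    });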
+ ///
+ CHAI_HOST void move() const {
+#ifndef CHAI_DISABLE_RM
+ if (m_pointer_record) {
+ ExecutionSpace newSpace = ArrayManager::getInstance()->getExecutionSpace();
+
+ if (newSpace != NONE && newSpace != m_pointer_record->getLastSpace()) {
+ m_pointer_record->m_last_space = newSpace;
+
+ if (m_pointer_record->m_callback) {
+ for (int space = NONE; space < NUM_EXECUTION_SPACES; ++space) {
+ ExecutionSpace execSpace = static_cast(space);
+
+ T* pointer = get(execSpace, false);
+
+ using T_non_const = typename std::remove_const::type;
+
+ // We can use const_cast because managed_ptr can only
+ // be constructed with non const pointers.
+ T_non_const* temp = const_cast(pointer);
+
+ void* voidPointer = static_cast(temp);
+
+ m_pointer_record->m_callback(ACTION_MOVE, execSpace, voidPointer);
+ }
+ }
+ }
+ }
+#endif
+ }
+
+ ///
+ /// @author Alan Dayton
+ ///
+ /// Increments the reference count.
+ ///
+ CHAI_HOST void addReference() {
+ if (m_pointer_record) {
+ m_pointer_record->addReference();
+ }
+ }
+
+ ///
+ /// @author Alan Dayton
+ ///
+ /// Decrements the reference counter. If the resulting number of references
+ /// is 0, clean up the object.
+ ///
+ CHAI_HOST void removeReference() {
+ if (m_pointer_record) {
+ m_pointer_record->removeReference();
+
+ if (m_pointer_record->use_count() == 0) {
+ if (m_pointer_record->m_callback) {
+ // Destroy device pointer first to take advantage of asynchrony
+ for (int space = NUM_EXECUTION_SPACES-1; space >= NONE; --space) {
+ ExecutionSpace execSpace = static_cast(space);
+ T* pointer = get(execSpace, false);
+
+ using T_non_const = typename std::remove_const::type;
+
+ // We can use const_cast because managed_ptr can only
+ // be constructed with non const pointers.
+ T_non_const* temp = const_cast(pointer);
+ void* voidPointer = static_cast(temp);
+
+ if (!m_pointer_record->m_callback(ACTION_FREE,
+ execSpace,
+ voidPointer)) {
+ switch (execSpace) {
+ case CPU:
+ delete pointer;
+ break;
+#ifdef __CUDACC__
+ case GPU:
+ {
+ if (pointer) {
+ detail::destroy_on_device<<<1, 1>>>(temp);
+ debug_cudaDeviceSynchronize();
+ }
+
+ break;
+ }
+#endif
+ default:
+ break;
+ }
+ }
+ }
+ }
+ else {
+ // Destroy device pointer first to take advantage of asynchrony
+ for (int space = NUM_EXECUTION_SPACES-1; space >= NONE; --space) {
+ ExecutionSpace execSpace = static_cast(space);
+ T* pointer = get(execSpace, false);
+
+ switch (execSpace) {
+ case CPU:
+ delete pointer;
+ break;
+#ifdef __CUDACC__
+ case GPU:
+ {
+ if (pointer) {
+ detail::destroy_on_device<<<1, 1>>>(pointer);
+ debug_cudaDeviceSynchronize();
+ }
+
+ break;
+ }
+#endif
+ default:
+ break;
+ }
+ }
+ }
+
+ delete m_pointer_record;
+ }
+ }
+ }
+
+ };
+
+ namespace detail {
+ ///
+ /// @author Alan Dayton
+ ///
+ /// This implementation of getRawPointers handles every non-CHAI type.
+ ///
+ /// @param[in] arg The non-CHAI type, which will simply be returned
+ ///
+ /// @return arg
+ ///
+ template
+ CHAI_HOST_DEVICE T getRawPointers(T arg) {
+ return arg;
+ }
+
+ ///
+ /// @author Alan Dayton
+ ///
+ /// This implementation of getRawPointers handles the CHAI ManagedArray type.
+ ///
+ /// @param[in] arg The ManagedArray from which to extract a raw pointer
+ ///
+ /// @return arg cast to a raw pointer
+ ///
+ template
+ CHAI_HOST_DEVICE T* getRawPointers(ManagedArray arg) {
+ return (T*) arg;
+ }
+
+ ///
+ /// @author Alan Dayton
+ ///
+ /// This implementation of getRawPointers handles the CHAI managed_ptr type.
+ /// The managed_ptr type is not implicitly convertible to a raw pointer, so + /// when using the make_managed API, it is necessary to pull the raw pointers + /// out of the managed_ptr. + /// + /// @param[in] arg The managed_ptr from which to extract a raw pointer + /// + /// @return a raw pointer acquired from arg + /// + template + CHAI_HOST_DEVICE T* getRawPointers(managed_ptr arg) { + return arg.get(); + } + + /// + /// @author Alan Dayton + /// + /// Creates a new object on the host and returns a pointer to it. + /// This implementation of new_on_host is called when no arguments need to be + /// converted to raw pointers. + /// + /// @param[in] args The arguments to T's constructor + /// + /// @return a pointer to the new object on the host + /// + template ::value, int>::type = 0> + CHAI_HOST T* new_on_host(Args&&... args) { + return new T(args...); + } + + /// + /// @author Alan Dayton + /// + /// Creates a new object on the host and returns a pointer to it. + /// This implementation of new_on_host is called when arguments do need to be + /// converted to raw pointers. + /// + /// @param[in] args The arguments to T's constructor + /// + /// @return a pointer to the new object on the host + /// + template ::value, int>::type = 0> + CHAI_HOST T* new_on_host(Args&&... args) { + return new T(getRawPointers(args)...); + } + + /// + /// @author Alan Dayton + /// + /// Creates a new T on the host. + /// Sets the execution space to the CPU so that ManagedArrays and managed_ptrs + /// are moved to the host as necessary. + /// + /// @param[in] args The arguments to T's constructor + /// + /// @return The host pointer to the new T + /// + template + CHAI_HOST T* make_on_host(Args&&... args) { +#ifndef CHAI_DISABLE_RM + // Get the ArrayManager and save the current execution space + chai::ArrayManager* arrayManager = chai::ArrayManager::getInstance(); + ExecutionSpace currentSpace = arrayManager->getExecutionSpace(); + + // Set the execution space so that ManagedArrays and managed_ptrs + // are handled properly + arrayManager->setExecutionSpace(CPU); +#endif + + // Create on the host + T* cpuPointer = detail::new_on_host(args...); + +#ifndef CHAI_DISABLE_RM + // Set the execution space back to the previous value + arrayManager->setExecutionSpace(currentSpace); +#endif + + // Return the CPU pointer + return cpuPointer; + } + + /// + /// @author Alan Dayton + /// + /// Calls a factory method to create a new object on the host. + /// Sets the execution space to the CPU so that ManagedArrays and managed_ptrs + /// are moved to the host as necessary. + /// + /// @param[in] f The factory method + /// @param[in] args The arguments to the factory method + /// + /// @return The host pointer to the new object + /// + template + CHAI_HOST T* make_on_host_from_factory(F f, Args&&... 
args) {
+#ifndef CHAI_DISABLE_RM
+ // Get the ArrayManager and save the current execution space
+ chai::ArrayManager* arrayManager = chai::ArrayManager::getInstance();
+ ExecutionSpace currentSpace = arrayManager->getExecutionSpace();
+
+ // Set the execution space so that ManagedArrays and managed_ptrs
+ // are handled properly
+ arrayManager->setExecutionSpace(CPU);
+#endif
+
+ // Create the object on the host
+ T* cpuPointer = f(args...);
+
+#ifndef CHAI_DISABLE_RM
+ // Set the execution space back to the previous value
+ arrayManager->setExecutionSpace(currentSpace);
+#endif
+
+ // Return the CPU pointer
+ return cpuPointer;
+ }
+
+#ifdef __CUDACC__
+ ///
+ /// @author Alan Dayton
+ ///
+ /// Creates a new object on the device and stores a pointer to it in gpuPointer.
+ /// This implementation of new_on_device is called when no arguments need to be
+ /// converted to raw pointers.
+ ///
+ /// @param[out] gpuPointer Used to return the device pointer to the new object
+ /// @param[in] args The arguments to T's constructor
+ ///
+ template ::value, int>::type = 0>
+ CHAI_DEVICE void new_on_device(T** gpuPointer, Args&&... args) {
+ *gpuPointer = new T(args...);
+ }
+
+ ///
+ /// @author Alan Dayton
+ ///
+ /// Creates a new object on the device and stores a pointer to it in gpuPointer.
+ /// This implementation of new_on_device is called when arguments do need to be
+ /// converted to raw pointers.
+ ///
+ /// @param[out] gpuPointer Used to return the device pointer to the new object
+ /// @param[in] args The arguments to T's constructor
+ ///
+ template ::value, int>::type = 0>
+ CHAI_DEVICE void new_on_device(T** gpuPointer, Args&&... args) {
+ *gpuPointer = new T(getRawPointers(args)...);
+ }
+
+ ///
+ /// @author Alan Dayton
+ ///
+ /// Creates a new T on the device.
+ ///
+ /// @param[out] gpuPointer Used to return the device pointer to the new T
+ /// @param[in] args The arguments to T's constructor
+ ///
+ /// @note Cannot capture argument packs in an extended device lambda,
+ /// so explicit kernel is needed.
+ ///
+ template
+ __global__ void make_on_device(T** gpuPointer, Args... args)
+ {
+ new_on_device(gpuPointer, args...);
+ }
+
+ ///
+ /// @author Alan Dayton
+ ///
+ /// Creates a new object on the device by calling the given factory method.
+ ///
+ /// @param[out] gpuPointer Used to return the device pointer to the new object
+ /// @param[in] f The factory method (must be a __device__ or __host__ __device__
+ /// method)
+ /// @param[in] args The arguments to the factory method
+ ///
+ /// @note Cannot capture argument packs in an extended device lambda,
+ /// so explicit kernel is needed.
+ ///
+ template
+ __global__ void make_on_device_from_factory(T** gpuPointer, F f, Args... args)
+ {
+ *gpuPointer = f(args...);
+ }
+
+ ///
+ /// @author Alan Dayton
+ ///
+ /// Destroys the device pointer.
+ ///
+ /// @param[in] gpuPointer The device pointer to call delete on
+ ///
+ template
+ __global__ void destroy_on_device(T* gpuPointer)
+ {
+ if (gpuPointer) {
+ delete gpuPointer;
+ }
+ }
+
+ ///
+ /// @author Alan Dayton
+ ///
+ /// Creates a new T on the device.
+ ///
+ /// @param[in] args The arguments to T's constructor
+ ///
+ /// @return The device pointer to the new T
+ ///
+ template
+ CHAI_HOST T* make_on_device(Args...
args) { +#ifndef CHAI_DISABLE_RM + // Get the ArrayManager and save the current execution space + chai::ArrayManager* arrayManager = chai::ArrayManager::getInstance(); + ExecutionSpace currentSpace = arrayManager->getExecutionSpace(); + + // Set the execution space so that ManagedArrays and managed_ptrs + // are handled properly + arrayManager->setExecutionSpace(GPU); +#endif + + // Allocate space on the GPU to hold the pointer to the new object + T** gpuBuffer; + GPU_ERROR_CHECK(cudaMalloc(&gpuBuffer, sizeof(T*))); + + // Create the object on the device + make_on_device<<<1, 1>>>(gpuBuffer, args...); + debug_cudaDeviceSynchronize(); + + // Allocate space on the CPU for the pointer and copy the pointer to the CPU + T** cpuBuffer = (T**) malloc(sizeof(T*)); + GPU_ERROR_CHECK(cudaMemcpy(cpuBuffer, gpuBuffer, sizeof(T*), + cudaMemcpyDeviceToHost)); + + // Get the GPU pointer + T* gpuPointer = cpuBuffer[0]; + + // Free the host and device buffers + free(cpuBuffer); + GPU_ERROR_CHECK(cudaFree(gpuBuffer)); + +#ifndef CHAI_DISABLE_RM + // Set the execution space back to the previous value + arrayManager->setExecutionSpace(currentSpace); +#endif + + // Return the GPU pointer + return gpuPointer; + } + + /// + /// @author Alan Dayton + /// + /// Calls a factory method to create a new object on the device. + /// + /// @param[in] f The factory method + /// @param[in] args The arguments to the factory method + /// + /// @return The device pointer to the new object + /// + template + CHAI_HOST T* make_on_device_from_factory(F f, Args&&... args) { +#ifndef CHAI_DISABLE_RM + // Get the ArrayManager and save the current execution space + chai::ArrayManager* arrayManager = chai::ArrayManager::getInstance(); + ExecutionSpace currentSpace = arrayManager->getExecutionSpace(); + + // Set the execution space so that chai::ManagedArrays and + // chai::managed_ptrs are handled properly + arrayManager->setExecutionSpace(GPU); +#endif + + // Allocate space on the GPU to hold the pointer to the new object + T** gpuBuffer; + GPU_ERROR_CHECK(cudaMalloc(&gpuBuffer, sizeof(T*))); + + // Create the object on the device + make_on_device_from_factory<<<1, 1>>>(gpuBuffer, f, args...); + debug_cudaDeviceSynchronize(); + + // Allocate space on the CPU for the pointer and copy the pointer to the CPU + T** cpuBuffer = (T**) malloc(sizeof(T*)); + GPU_ERROR_CHECK(cudaMemcpy(cpuBuffer, gpuBuffer, sizeof(T*), + cudaMemcpyDeviceToHost)); + + // Get the GPU pointer + T* gpuPointer = cpuBuffer[0]; + + // Free the host and device buffers + free(cpuBuffer); + GPU_ERROR_CHECK(cudaFree(gpuBuffer)); + +#ifndef CHAI_DISABLE_RM + // Set the execution space back to the previous value + arrayManager->setExecutionSpace(currentSpace); +#endif + + // Return the GPU pointer + return gpuPointer; + } + +#endif + + // Adapted from "The C++ Programming Language," Fourth Edition, + // by Bjarne Stroustrup, pp. 814-816 + // Used to determine if a functor is callable with the given arguments + struct substitution_failure {}; + + template + struct substitution_succeeded : std::true_type {}; + + template<> + struct substitution_succeeded : std::false_type {}; + + template + struct is_invocable_impl { + private: + template + static auto check(X const& x, Ts&&... 
ts) -> decltype(x(ts...)); + static substitution_failure check(...); + public: + using type = decltype(check(std::declval(), std::declval()...)); + }; + + template + struct is_invocable : substitution_succeeded::type> {}; + } // namespace detail + + /// + /// @author Alan Dayton + /// + /// Makes a managed_ptr. + /// Factory function to create managed_ptrs. + /// + /// @param[in] args The arguments to T's constructor + /// + template + CHAI_HOST managed_ptr make_managed(Args... args) { +#ifdef __CUDACC__ + // Construct on the GPU first to take advantage of asynchrony + T* gpuPointer = detail::make_on_device(args...); +#endif + + // Construct on the CPU + T* cpuPointer = detail::make_on_host(args...); + + // Construct and return the managed_ptr +#ifdef __CUDACC__ + return managed_ptr({CPU, GPU}, {cpuPointer, gpuPointer}); +#else + return managed_ptr({CPU}, {cpuPointer}); +#endif + } + + /// + /// @author Alan Dayton + /// + /// Makes a managed_ptr. + /// Factory function to create managed_ptrs. + /// + /// @param[in] f The factory function that will create the object + /// @param[in] args The arguments to the factory function + /// + template + CHAI_HOST managed_ptr make_managed_from_factory(F&& f, Args&&... args) { + static_assert(detail::is_invocable::value, + "F is not invocable with the given arguments."); + + static_assert(std::is_pointer::type>::value, + "F does not return a pointer."); + + using R = typename std::remove_pointer::type>::type; + + static_assert(std::is_convertible::value, + "F does not return a pointer that is convertible to T*."); + +#ifdef __CUDACC__ + // Construct on the GPU first to take advantage of asynchrony + T* gpuPointer = detail::make_on_device_from_factory(f, args...); +#endif + + // Construct on the CPU + T* cpuPointer = detail::make_on_host_from_factory(f, args...); + + // Construct and return the managed_ptr +#ifdef __CUDACC__ + return managed_ptr({CPU, GPU}, {cpuPointer, gpuPointer}); +#else + return managed_ptr({CPU}, {cpuPointer}); +#endif + } + + /// + /// @author Alan Dayton + /// + /// Makes a new managed_ptr that shares ownership with the given managed_ptr, but + /// the underlying pointer is converted using static_cast. + /// + /// @param[in] other The managed_ptr to share ownership with and whose pointer to + /// convert using static_cast + /// + template + CHAI_HOST managed_ptr static_pointer_cast(const managed_ptr& other) noexcept { + T* cpuPointer = static_cast(other.get()); + +#ifdef __CUDACC__ + T* gpuPointer = static_cast(other.get(GPU, false)); + + return managed_ptr(other, {CPU, GPU}, {cpuPointer, gpuPointer}); +#else + return managed_ptr(other, {CPU}, {cpuPointer}); +#endif + } + + /// + /// @author Alan Dayton + /// + /// Makes a new managed_ptr that shares ownership with the given managed_ptr, but + /// the underlying pointer is converted using dynamic_cast. 
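A short usage sketch of the two factory entry points defined above; Base and Derived are hypothetical polymorphic types with CHAI_HOST_DEVICE constructors (in the spirit of the TestBase/TestDerived classes used by the tests later in this series), and the factory lambda must be annotated CHAI_HOST_DEVICE so it can run in both spaces:

chai::managed_ptr<Derived> buildWithConstructor(int value) {
  // Constructs Derived(value) on the host, and also on the device when compiled
  // with nvcc, and returns a managed_ptr that tracks both copies.
  return chai::make_managed<Derived>(value);
}

chai::managed_ptr<Base> buildWithFactory(int value) {
  // The factory must be invocable with the given arguments and must return a
  // pointer convertible to Base*, or the static_asserts above fire at compile time.
  auto factory = [] CHAI_HOST_DEVICE (int v) { return new Derived(v); };
  return chai::make_managed_from_factory<Base>(factory, value);
}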
+ /// + /// @param[in] other The managed_ptr to share ownership with and whose pointer to + /// convert using dynamic_cast + /// + template + CHAI_HOST managed_ptr dynamic_pointer_cast(const managed_ptr& other) noexcept { + T* cpuPointer = dynamic_cast(other.get()); + +#ifdef __CUDACC__ + T* gpuPointer = nullptr; + + if (cpuPointer) { + gpuPointer = static_cast(other.get(GPU, false)); + } + + return managed_ptr(other, {CPU, GPU}, {cpuPointer, gpuPointer}); +#else + return managed_ptr(other, {CPU}, {cpuPointer}); +#endif + } + + /// + /// @author Alan Dayton + /// + /// Makes a new managed_ptr that shares ownership with the given managed_ptr, but + /// the underlying pointer is converted using const_cast. + /// + /// @param[in] other The managed_ptr to share ownership with and whose pointer to + /// convert using const_cast + /// + template + CHAI_HOST managed_ptr const_pointer_cast(const managed_ptr& other) noexcept { + T* cpuPointer = const_cast(other.get()); + +#ifdef __CUDACC__ + T* gpuPointer = const_cast(other.get(GPU, false)); + + return managed_ptr(other, {CPU, GPU}, {cpuPointer, gpuPointer}); +#else + return managed_ptr(other, {CPU}, {cpuPointer}); +#endif + } + + /// + /// @author Alan Dayton + /// + /// Makes a new managed_ptr that shares ownership with the given managed_ptr, but + /// the underlying pointer is converted using reinterpret_cast. + /// + /// @param[in] other The managed_ptr to share ownership with and whose pointer to + /// convert using reinterpret_cast + /// + template + CHAI_HOST managed_ptr reinterpret_pointer_cast(const managed_ptr& other) noexcept { + T* cpuPointer = reinterpret_cast(other.get()); + +#ifdef __CUDACC__ + T* gpuPointer = reinterpret_cast(other.get(GPU, false)); + + return managed_ptr(other, {CPU, GPU}, {cpuPointer, gpuPointer}); +#else + return managed_ptr(other, {CPU}, {cpuPointer}); +#endif + } + + /// Comparison operators + + /// + /// @author Alan Dayton + /// + /// Equals comparison. + /// + /// @param[in] lhs The first managed_ptr to compare + /// @param[in] rhs The second managed_ptr to compare + /// + template + CHAI_HOST_DEVICE CHAI_INLINE + bool operator==(const managed_ptr& lhs, const managed_ptr& rhs) noexcept { + return lhs.get() == rhs.get(); + } + + /// + /// @author Alan Dayton + /// + /// Not equals comparison. + /// + /// @param[in] lhs The first managed_ptr to compare + /// @param[in] rhs The second managed_ptr to compare + /// + template + CHAI_HOST_DEVICE CHAI_INLINE + bool operator!=(const managed_ptr& lhs, const managed_ptr& rhs) noexcept { + return lhs.get() != rhs.get(); + } + + /// Comparison operators with nullptr + + /// + /// @author Alan Dayton + /// + /// Equals comparison with nullptr. + /// + /// @param[in] lhs The managed_ptr to compare to nullptr + /// + template + CHAI_HOST_DEVICE CHAI_INLINE + bool operator==(const managed_ptr& lhs, std::nullptr_t) noexcept { + return lhs.get() == nullptr; + } + + /// + /// @author Alan Dayton + /// + /// Equals comparison with nullptr. + /// + /// @param[in] rhs The managed_ptr to compare to nullptr + /// + template + CHAI_HOST_DEVICE CHAI_INLINE + bool operator==(std::nullptr_t, const managed_ptr& rhs) noexcept { + return nullptr == rhs.get(); + } + + /// + /// @author Alan Dayton + /// + /// Not equals comparison with nullptr. 
+ /// + /// @param[in] lhs The managed_ptr to compare to nullptr + /// + template + CHAI_HOST_DEVICE CHAI_INLINE + bool operator!=(const managed_ptr& lhs, std::nullptr_t) noexcept { + return lhs.get() != nullptr; + } + + /// + /// @author Alan Dayton + /// + /// Not equals comparison with nullptr. + /// + /// @param[in] rhs The managed_ptr to compare to nullptr + /// + template + CHAI_HOST_DEVICE CHAI_INLINE + bool operator!=(std::nullptr_t, const managed_ptr& rhs) noexcept { + return nullptr != rhs.get(); + } + + /// + /// @author Alan Dayton + /// + /// Not equals comparison. + /// + /// @param[in] lhs The first managed_ptr to swap + /// @param[in] rhs The second managed_ptr to swap + /// + template + void swap(managed_ptr& lhs, managed_ptr& rhs) noexcept { + std::swap(lhs.m_cpu_pointer, rhs.m_cpu_pointer); + std::swap(lhs.m_gpu_pointer, rhs.m_gpu_pointer); + std::swap(lhs.m_pointer_record, rhs.m_pointer_record); + } +} // namespace chai + +#endif // MANAGED_PTR + From 89288c5f2271801686c73782ea52a15906b47996 Mon Sep 17 00:00:00 2001 From: Alan Dayton Date: Mon, 30 Sep 2019 13:41:42 -0700 Subject: [PATCH 22/58] Add managed_ptr unit tests --- tests/unit/CMakeLists.txt | 44 +- tests/unit/managed_ptr_unit_tests.cpp | 1086 +++++++++++++++++++++++++ 2 files changed, 1107 insertions(+), 23 deletions(-) create mode 100644 tests/unit/managed_ptr_unit_tests.cpp diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index fea76c55..835aded5 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -41,33 +41,17 @@ # POSSIBILITY OF SUCH DAMAGE. ####################################################################### -set (managed_array_test_depends - chai umpire gtest) +# Unit test dependencies +set (chai_unit_test_depends + chai umpire gtest) -set (array_manager_test_depends - chai umpire gtest) - -if (ENABLE_CUDA) - set (managed_array_test_depends - ${managed_array_test_depends} - cuda) - set (array_manager_test_depends - ${array_manager_test_depends} - cuda) -endif () -if (ENABLE_HIP) - set (managed_array_test_depends - ${managed_array_test_depends} - hip) - set (array_manager_test_depends - ${array_manager_test_depends} - hip) -endif () +blt_list_append(TO chai_unit_test_depends ELEMENTS cuda IF ${ENABLE_CUDA}) +# ManagedArray tests blt_add_executable( NAME managed_array_unit_tests SOURCES managed_array_unit_tests.cpp - DEPENDS_ON ${managed_array_test_depends}) + DEPENDS_ON ${chai_unit_test_depends}) target_include_directories( managed_array_unit_tests @@ -81,7 +65,7 @@ blt_add_test( blt_add_executable( NAME array_manager_unit_tests SOURCES array_manager_unit_tests.cpp - DEPENDS_ON ${array_manager_test_depends}) + DEPENDS_ON ${chai_unit_test_depends}) target_include_directories( array_manager_unit_tests @@ -91,3 +75,17 @@ blt_add_test( NAME array_manager_unit_test COMMAND array_manager_unit_tests) +# managed_ptr tests +blt_add_executable( + NAME managed_ptr_unit_tests + SOURCES managed_ptr_unit_tests.cpp + DEPENDS_ON ${chai_unit_test_depends}) + +target_include_directories( + managed_ptr_unit_tests + PUBLIC ${PROJECT_BINARY_DIR}/include) + +blt_add_test( + NAME managed_ptr_unit_test + COMMAND managed_ptr_unit_tests) + diff --git a/tests/unit/managed_ptr_unit_tests.cpp b/tests/unit/managed_ptr_unit_tests.cpp new file mode 100644 index 00000000..c6abf86e --- /dev/null +++ b/tests/unit/managed_ptr_unit_tests.cpp @@ -0,0 +1,1086 @@ +// --------------------------------------------------------------------- +// Copyright (c) 2016-2018, Lawrence Livermore National Security, 
LLC. All +// rights reserved. +// +// Produced at the Lawrence Livermore National Laboratory. +// +// This file is part of CHAI. +// +// LLNL-CODE-705877 +// +// For details, see https:://github.com/LLNL/CHAI +// Please also see the NOTICE and LICENSE files. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// - Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the +// distribution. +// +// - Neither the name of the LLNS/LLNL nor the names of its contributors +// may be used to endorse or promote products derived from this +// software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS +// OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED +// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY +// WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. 
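One note on the test file that follows: the CUDA_TEST macro defined just below hoists each test body into a free function instead of writing device lambdas directly inside TEST. The usual motivation, offered here as background rather than something stated in this patch, is that nvcc does not allow extended __device__ lambdas to be defined inside private member functions, and gtest's TEST macro generates the body as exactly such a member (TestBody). The expansion looks approximately like this:

CUDA_TEST(managed_ptr, cuda_default_constructor)
{
  // body containing [=] __device__ lambdas ...
}

// expands to roughly:

static void cuda_test_managed_ptrcuda_default_constructor();

TEST(managed_ptr, cuda_default_constructor)
{
  cuda_test_managed_ptrcuda_default_constructor();
}

static void cuda_test_managed_ptrcuda_default_constructor()
{
  // body containing [=] __device__ lambdas ...
}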
+// ---------------------------------------------------------------------
+#include "gtest/gtest.h"
+
+#define CUDA_TEST(X, Y) \
+ static void cuda_test_##X##Y(); \
+ TEST(X, Y) { cuda_test_##X##Y(); } \
+ static void cuda_test_##X##Y()
+
+#include "chai/config.hpp"
+#include "chai/ManagedArray.hpp"
+#include "chai/managed_ptr.hpp"
+
+#include "../src/util/forall.hpp"
+
+// Standard library headers
+#include <cstdlib>   // provides rand() used by the tests below
+
+class Simple {
+ public:
+ CHAI_HOST_DEVICE Simple() : m_value(-1) {}
+ CHAI_HOST_DEVICE Simple(int value) : m_value(value) {}
+ CHAI_HOST_DEVICE ~Simple() {}
+
+ CHAI_HOST_DEVICE Simple(Simple const & other) : m_value(other.m_value) {}
+
+ CHAI_HOST_DEVICE Simple& operator=(Simple const & other) {
+ m_value = other.m_value;
+ return *this;
+ }
+
+ CHAI_HOST_DEVICE Simple(Simple&& other) : m_value(other.m_value) {
+ other.m_value = -1;
+ }
+
+ CHAI_HOST_DEVICE Simple& operator=(Simple&& other) {
+ m_value = other.m_value;
+ other.m_value = -1;
+ return *this;
+ }
+
+ CHAI_HOST_DEVICE int getValue() { return m_value; }
+
+ private:
+ int m_value;
+};
+
+class TestBase {
+ public:
+ CHAI_HOST_DEVICE TestBase() {}
+ CHAI_HOST_DEVICE virtual ~TestBase() {}
+
+ CHAI_HOST_DEVICE static TestBase* Factory(const int value);
+
+ CHAI_HOST_DEVICE virtual int getValue() const = 0;
+};
+
+class TestDerived : public TestBase {
+ public:
+ CHAI_HOST_DEVICE TestDerived() : TestBase(), m_value(0) {}
+ CHAI_HOST_DEVICE TestDerived(const int value) : TestBase(), m_value(value) {}
+
+ CHAI_HOST_DEVICE virtual ~TestDerived() {}
+
+ CHAI_HOST_DEVICE virtual int getValue() const { return m_value; }
+
+ private:
+ int m_value;
+};
+
+CHAI_HOST_DEVICE TestBase* TestBase::Factory(const int value) {
+ return new TestDerived(value);
+}
+
+CHAI_HOST_DEVICE TestBase* Factory(const int value) {
+ return new TestDerived(value);
+}
+
+CHAI_HOST_DEVICE TestBase* OverloadedFactory() {
+ return new TestDerived(-1);
+}
+
+CHAI_HOST_DEVICE TestBase* OverloadedFactory(const int value) {
+ return new TestDerived(value);
+}
+
+
+TEST(managed_ptr, default_constructor)
+{
+ chai::managed_ptr derived;
+ chai::managed_ptr otherDerived;
+
+ EXPECT_EQ(derived.get(), nullptr);
+ EXPECT_EQ(derived.use_count(), 0);
+ EXPECT_FALSE(derived);
+ EXPECT_TRUE(derived == nullptr);
+ EXPECT_TRUE(nullptr == derived);
+ EXPECT_FALSE(derived != nullptr);
+ EXPECT_FALSE(nullptr != derived);
+ EXPECT_TRUE(derived == otherDerived);
+ EXPECT_TRUE(otherDerived == derived);
+ EXPECT_FALSE(derived != otherDerived);
+ EXPECT_FALSE(otherDerived != derived);
+}
+
+TEST(managed_ptr, nullptr_constructor)
+{
+ chai::managed_ptr derived = nullptr;
+ chai::managed_ptr otherDerived = nullptr;
+
+ EXPECT_EQ(derived.get(), nullptr);
+ EXPECT_EQ(derived.use_count(), 0);
+ EXPECT_FALSE(derived);
+ EXPECT_TRUE(derived == nullptr);
+ EXPECT_TRUE(nullptr == derived);
+ EXPECT_FALSE(derived != nullptr);
+ EXPECT_FALSE(nullptr != derived);
+ EXPECT_TRUE(derived == otherDerived);
+ EXPECT_TRUE(otherDerived == derived);
+ EXPECT_FALSE(derived != otherDerived);
+ EXPECT_FALSE(otherDerived != derived);
+}
+
+TEST(managed_ptr, cpu_pointer_constructor)
+{
+ TestDerived* cpuPointer = new TestDerived(3);
+ chai::managed_ptr derived({chai::CPU}, {cpuPointer});
+
+ EXPECT_EQ(derived->getValue(), 3);
+
+ EXPECT_NE(derived.get(), nullptr);
+ EXPECT_EQ(derived.use_count(), 1);
+ EXPECT_TRUE(derived);
+ EXPECT_FALSE(derived == nullptr);
+ EXPECT_FALSE(nullptr == derived);
+ EXPECT_TRUE(derived != nullptr);
+ EXPECT_TRUE(nullptr != derived);
+}
+
+TEST(managed_ptr, make_managed)
+{ + const int expectedValue = rand(); + auto derived = chai::make_managed(expectedValue); + + EXPECT_EQ((*derived).getValue(), expectedValue); + + EXPECT_NE(derived.get(), nullptr); + EXPECT_EQ(derived.use_count(), 1); + EXPECT_TRUE(derived); + EXPECT_FALSE(derived == nullptr); + EXPECT_FALSE(nullptr == derived); + EXPECT_TRUE(derived != nullptr); + EXPECT_TRUE(nullptr != derived); +} + +TEST(managed_ptr, copy_constructor) +{ + const int expectedValue = rand(); + auto derived = chai::make_managed(expectedValue); + chai::managed_ptr otherDerived(derived); + + EXPECT_EQ(derived->getValue(), expectedValue); + EXPECT_EQ(otherDerived->getValue(), expectedValue); + + EXPECT_NE(derived.get(), nullptr); + EXPECT_EQ(derived.use_count(), 2); + EXPECT_TRUE(derived); + EXPECT_FALSE(derived == nullptr); + EXPECT_FALSE(nullptr == derived); + EXPECT_TRUE(derived != nullptr); + EXPECT_TRUE(nullptr != derived); + EXPECT_TRUE(derived == otherDerived); + EXPECT_FALSE(derived != otherDerived); + + EXPECT_NE(otherDerived.get(), nullptr); + EXPECT_EQ(otherDerived.use_count(), 2); + EXPECT_TRUE(otherDerived); + EXPECT_FALSE(otherDerived == nullptr); + EXPECT_FALSE(nullptr == otherDerived); + EXPECT_TRUE(otherDerived != nullptr); + EXPECT_TRUE(nullptr != otherDerived); + EXPECT_TRUE(otherDerived == derived); + EXPECT_FALSE(otherDerived != derived); +} + +TEST(managed_ptr, converting_constructor) +{ + const int expectedValue = rand(); + auto derived = chai::make_managed(expectedValue); + chai::managed_ptr base = derived; + + EXPECT_EQ(derived->getValue(), expectedValue); + EXPECT_EQ(base->getValue(), expectedValue); + + EXPECT_NE(derived.get(), nullptr); + EXPECT_EQ(derived.use_count(), 2); + EXPECT_TRUE(derived); + EXPECT_FALSE(derived == nullptr); + EXPECT_FALSE(nullptr == derived); + EXPECT_TRUE(derived != nullptr); + EXPECT_TRUE(nullptr != derived); + EXPECT_TRUE(derived == base); + EXPECT_FALSE(derived != base); + + EXPECT_NE(base.get(), nullptr); + EXPECT_EQ(base.use_count(), 2); + EXPECT_TRUE(base); + EXPECT_FALSE(base == nullptr); + EXPECT_FALSE(nullptr == base); + EXPECT_TRUE(base != nullptr); + EXPECT_TRUE(nullptr != base); + EXPECT_TRUE(base == derived); + EXPECT_FALSE(base != derived); +} + +TEST(managed_ptr, copy_assignment_operator) +{ + const int expectedValue = rand(); + auto derived = chai::make_managed(expectedValue); + chai::managed_ptr otherDerived; + otherDerived = derived; + + EXPECT_EQ(derived->getValue(), expectedValue); + EXPECT_EQ(otherDerived->getValue(), expectedValue); + + EXPECT_NE(derived.get(), nullptr); + EXPECT_EQ(derived.use_count(), 2); + EXPECT_TRUE(derived); + EXPECT_FALSE(derived == nullptr); + EXPECT_FALSE(nullptr == derived); + EXPECT_TRUE(derived != nullptr); + EXPECT_TRUE(nullptr != derived); + EXPECT_TRUE(derived == otherDerived); + EXPECT_FALSE(derived != otherDerived); + + EXPECT_NE(otherDerived.get(), nullptr); + EXPECT_EQ(otherDerived.use_count(), 2); + EXPECT_TRUE(otherDerived); + EXPECT_FALSE(otherDerived == nullptr); + EXPECT_FALSE(nullptr == otherDerived); + EXPECT_TRUE(otherDerived != nullptr); + EXPECT_TRUE(nullptr != otherDerived); + EXPECT_TRUE(otherDerived == derived); + EXPECT_FALSE(otherDerived != derived); +} + +TEST(managed_ptr, copy_constructor_from_default_constructed) +{ + chai::managed_ptr derived; + chai::managed_ptr otherDerived(derived); + + EXPECT_EQ(derived.get(), nullptr); + EXPECT_EQ(derived.use_count(), 0); + EXPECT_EQ(bool(derived), false); + EXPECT_EQ(derived, nullptr); + EXPECT_EQ(nullptr, derived); + + EXPECT_EQ(otherDerived.get(), 
nullptr); + EXPECT_EQ(otherDerived.use_count(), 0); + EXPECT_EQ(bool(otherDerived), false); + EXPECT_EQ(otherDerived, nullptr); + EXPECT_EQ(nullptr, otherDerived); +} + +TEST(managed_ptr, copy_assignment_operator_from_default_constructed) +{ + chai::managed_ptr derived; + chai::managed_ptr otherDerived; + otherDerived = derived; + + EXPECT_EQ(derived.get(), nullptr); + EXPECT_EQ(derived.use_count(), 0); + EXPECT_EQ(bool(derived), false); + EXPECT_EQ(derived, nullptr); + EXPECT_EQ(nullptr, derived); + + EXPECT_EQ(otherDerived.get(), nullptr); + EXPECT_EQ(otherDerived.use_count(), 0); + EXPECT_EQ(bool(otherDerived), false); + EXPECT_EQ(otherDerived, nullptr); + EXPECT_EQ(nullptr, otherDerived); +} + +TEST(managed_ptr, conversion_copy_constructor_from_default_constructed) +{ + chai::managed_ptr derived; + chai::managed_ptr otherDerived(derived); + + EXPECT_EQ(derived.get(), nullptr); + EXPECT_EQ(derived.use_count(), 0); + EXPECT_EQ(bool(derived), false); + EXPECT_EQ(derived, nullptr); + EXPECT_EQ(nullptr, derived); + + EXPECT_EQ(otherDerived.get(), nullptr); + EXPECT_EQ(otherDerived.use_count(), 0); + EXPECT_EQ(bool(otherDerived), false); + EXPECT_EQ(otherDerived, nullptr); + EXPECT_EQ(nullptr, otherDerived); +} + +TEST(managed_ptr, conversion_copy_assignment_operator_from_default_constructed) +{ + chai::managed_ptr derived; + chai::managed_ptr otherDerived; + otherDerived = derived; + + EXPECT_EQ(derived.get(), nullptr); + EXPECT_EQ(derived.use_count(), 0); + EXPECT_EQ(bool(derived), false); + EXPECT_EQ(derived, nullptr); + EXPECT_EQ(nullptr, derived); + + EXPECT_EQ(otherDerived.get(), nullptr); + EXPECT_EQ(otherDerived.use_count(), 0); + EXPECT_EQ(bool(otherDerived), false); + EXPECT_EQ(otherDerived, nullptr); + EXPECT_EQ(nullptr, otherDerived); +} + +TEST(managed_ptr, copy_assignment_operator_from_host_ptr_constructed) +{ + const int expectedValue1 = rand(); + const int expectedValue2 = rand(); + + chai::managed_ptr derived = chai::make_managed(expectedValue1); + chai::managed_ptr otherDerived = chai::make_managed(expectedValue2); + chai::managed_ptr thirdDerived(otherDerived); + + thirdDerived = derived; + + EXPECT_NE(derived.get(), nullptr); + EXPECT_EQ(derived.use_count(), 2); + EXPECT_EQ(bool(derived), true); + EXPECT_NE(derived, nullptr); + EXPECT_NE(nullptr, derived); + + EXPECT_NE(otherDerived.get(), nullptr); + EXPECT_EQ(otherDerived.use_count(), 1); + EXPECT_EQ(bool(otherDerived), true); + EXPECT_NE(otherDerived, nullptr); + EXPECT_NE(nullptr, otherDerived); + + EXPECT_NE(thirdDerived.get(), nullptr); + EXPECT_EQ(thirdDerived.use_count(), 2); + EXPECT_EQ(bool(thirdDerived), true); + EXPECT_NE(thirdDerived, nullptr); + EXPECT_NE(nullptr, thirdDerived); +} + +TEST(managed_ptr, conversion_copy_assignment_operator_from_host_ptr_constructed) +{ + const int expectedValue1 = rand(); + const int expectedValue2 = rand(); + + chai::managed_ptr derived = chai::make_managed(expectedValue1); + chai::managed_ptr otherDerived = chai::make_managed(expectedValue2); + chai::managed_ptr thirdDerived(otherDerived); + + thirdDerived = derived; + + EXPECT_NE(derived.get(), nullptr); + EXPECT_EQ(derived.use_count(), 2); + EXPECT_EQ(bool(derived), true); + EXPECT_NE(derived, nullptr); + EXPECT_NE(nullptr, derived); + + EXPECT_NE(otherDerived.get(), nullptr); + EXPECT_EQ(otherDerived.use_count(), 1); + EXPECT_EQ(bool(otherDerived), true); + EXPECT_NE(otherDerived, nullptr); + EXPECT_NE(nullptr, otherDerived); + + EXPECT_NE(thirdDerived.get(), nullptr); + EXPECT_EQ(thirdDerived.use_count(), 2); + 
EXPECT_EQ(bool(thirdDerived), true); + EXPECT_NE(thirdDerived, nullptr); + EXPECT_NE(nullptr, thirdDerived); +} + +TEST(managed_ptr, static_pointer_cast) +{ + TestDerived* cpuPointer = new TestDerived(3); + chai::managed_ptr derived({chai::CPU}, {cpuPointer}); + + auto base = chai::static_pointer_cast(derived); + + EXPECT_EQ(base->getValue(), 3); + + EXPECT_NE(base.get(), nullptr); + EXPECT_EQ(base.use_count(), 2); + EXPECT_TRUE(base); + EXPECT_FALSE(base == nullptr); + EXPECT_FALSE(nullptr == base); + EXPECT_TRUE(base != nullptr); + EXPECT_TRUE(nullptr != base); +} + +TEST(managed_ptr, dynamic_pointer_cast) +{ + TestDerived* cpuPointer = new TestDerived(3); + chai::managed_ptr base({chai::CPU}, {cpuPointer}); + + auto derived = chai::dynamic_pointer_cast(base); + + EXPECT_EQ(derived->getValue(), 3); + + EXPECT_NE(derived.get(), nullptr); + EXPECT_EQ(derived.use_count(), 2); + EXPECT_TRUE(derived); + EXPECT_FALSE(derived == nullptr); + EXPECT_FALSE(nullptr == derived); + EXPECT_TRUE(derived != nullptr); + EXPECT_TRUE(nullptr != derived); +} + +TEST(managed_ptr, const_pointer_cast) +{ + TestDerived* cpuPointer = new TestDerived(3); + chai::managed_ptr base({chai::CPU}, {cpuPointer}); + + auto nonConstBase = chai::const_pointer_cast(base); + + EXPECT_EQ(nonConstBase->getValue(), 3); + + EXPECT_NE(nonConstBase.get(), nullptr); + EXPECT_EQ(nonConstBase.use_count(), 2); + EXPECT_TRUE(nonConstBase); + EXPECT_FALSE(nonConstBase == nullptr); + EXPECT_FALSE(nullptr == nonConstBase); + EXPECT_TRUE(nonConstBase != nullptr); + EXPECT_TRUE(nullptr != nonConstBase); +} + +TEST(managed_ptr, reinterpret_pointer_cast) +{ + TestDerived* cpuPointer = new TestDerived(3); + chai::managed_ptr base({chai::CPU}, {cpuPointer}); + + auto derived = chai::reinterpret_pointer_cast(base); + + EXPECT_EQ(derived->getValue(), 3); + + EXPECT_NE(derived.get(), nullptr); + EXPECT_EQ(derived.use_count(), 2); + EXPECT_TRUE(derived); + EXPECT_FALSE(derived == nullptr); + EXPECT_FALSE(nullptr == derived); + EXPECT_TRUE(derived != nullptr); + EXPECT_TRUE(nullptr != derived); +} + +#ifdef __CUDACC__ + +CUDA_TEST(managed_ptr, cuda_default_constructor) +{ + chai::managed_ptr derived; + chai::managed_ptr otherDerived; + + chai::ManagedArray array(1, chai::GPU); + chai::ManagedArray array2(9, chai::GPU); + + forall(cuda(), 0, 1, [=] __device__ (int i) { + array[i] = derived.get(); + array2[0] = (bool) derived; + array2[1] = derived == nullptr; + array2[2] = nullptr == derived; + array2[3] = derived != nullptr; + array2[4] = nullptr != derived; + array2[5] = derived == otherDerived; + array2[6] = otherDerived == derived; + array2[7] = derived != otherDerived; + array2[8] = otherDerived != derived; + }); + + array.move(chai::CPU); + array2.move(chai::CPU); + + EXPECT_EQ(array[0], nullptr); + EXPECT_FALSE(array2[0]); + EXPECT_TRUE(array2[1]); + EXPECT_TRUE(array2[2]); + EXPECT_FALSE(array2[3]); + EXPECT_FALSE(array2[4]); + EXPECT_TRUE(array2[5]); + EXPECT_TRUE(array2[6]); + EXPECT_FALSE(array2[7]); + EXPECT_FALSE(array2[8]); +} + +CUDA_TEST(managed_ptr, cuda_nullptr_constructor) +{ + chai::managed_ptr derived = nullptr; + chai::managed_ptr otherDerived = nullptr; + + chai::ManagedArray array(1, chai::GPU); + chai::ManagedArray array2(9, chai::GPU); + + forall(cuda(), 0, 1, [=] __device__ (int i) { + array[i] = derived.get(); + array2[0] = (bool) derived; + array2[1] = derived == nullptr; + array2[2] = nullptr == derived; + array2[3] = derived != nullptr; + array2[4] = nullptr != derived; + array2[5] = derived == otherDerived; + array2[6] 
= otherDerived == derived; + array2[7] = derived != otherDerived; + array2[8] = otherDerived != derived; + }); + + array.move(chai::CPU); + array2.move(chai::CPU); + + EXPECT_EQ(array[0], nullptr); + EXPECT_FALSE(array2[0]); + EXPECT_TRUE(array2[1]); + EXPECT_TRUE(array2[2]); + EXPECT_FALSE(array2[3]); + EXPECT_FALSE(array2[4]); + EXPECT_TRUE(array2[5]); + EXPECT_TRUE(array2[6]); + EXPECT_FALSE(array2[7]); + EXPECT_FALSE(array2[8]); +} + +CUDA_TEST(managed_ptr, cuda_gpu_pointer_constructor) +{ + TestDerived* gpuPointer = chai::detail::make_on_device(3); + chai::managed_ptr derived({chai::GPU}, {gpuPointer}); + + EXPECT_EQ(derived.get(), nullptr); + EXPECT_EQ(derived.use_count(), 1); + EXPECT_FALSE(derived); + EXPECT_TRUE(derived == nullptr); + EXPECT_TRUE(nullptr == derived); + EXPECT_FALSE(derived != nullptr); + EXPECT_FALSE(nullptr != derived); + + chai::ManagedArray array1(1, chai::GPU); + chai::ManagedArray array2(1, chai::GPU); + chai::ManagedArray array3(5, chai::GPU); + + forall(cuda(), 0, 1, [=] __device__ (int i) { + array1[i] = derived->getValue(); + array2[i] = derived.get(); + array3[0] = (bool) derived; + array3[1] = derived == nullptr; + array3[2] = nullptr == derived; + array3[3] = derived != nullptr; + array3[4] = nullptr != derived; + }); + + array1.move(chai::CPU); + array2.move(chai::CPU); + array3.move(chai::CPU); + + EXPECT_EQ(array1[0], 3); + EXPECT_NE(array2[0], nullptr); + EXPECT_TRUE(array3[0]); + EXPECT_FALSE(array3[1]); + EXPECT_FALSE(array3[2]); + EXPECT_TRUE(array3[3]); + EXPECT_TRUE(array3[4]); +} + +CUDA_TEST(managed_ptr, cuda_new_and_delete_on_device) +{ + // Initialize host side memory to hold a pointer + Simple** cpuPointerHolder = (Simple**) malloc(sizeof(Simple*)); + cpuPointerHolder[0] = nullptr; + + // Initialize device side memory to hold a pointer + Simple** gpuPointerHolder = nullptr; + cudaMalloc(&gpuPointerHolder, sizeof(Simple*)); + + // Create on the device + chai::detail::make_on_device<<<1, 1>>>(gpuPointerHolder); + + // Copy to the host side memory + cudaMemcpy(cpuPointerHolder, gpuPointerHolder, sizeof(Simple*), cudaMemcpyDeviceToHost); + + // Free device side memory + cudaFree(gpuPointerHolder); + + // Save the pointer + ASSERT_NE(cpuPointerHolder[0], nullptr); + Simple* gpuPointer = cpuPointerHolder[0]; + + // Free host side memory + free(cpuPointerHolder); + + // Initialize more host side memory + Simple** cpuPointerHolder2 = (Simple**) malloc(sizeof(Simple*)); + cpuPointerHolder2[0] = gpuPointer; + + // Initialize more device side memory + Simple** gpuPointerHolder2 = nullptr; + cudaMalloc(&gpuPointerHolder2, sizeof(Simple*)); + + // Copy pointer back to the device + cudaMemcpy(gpuPointerHolder2, cpuPointerHolder2, sizeof(Simple*), + cudaMemcpyHostToDevice); + + chai::detail::destroy_on_device<<<1, 1>>>(gpuPointerHolder2); + + // Free host memory + free(cpuPointerHolder2); + + // Free device memory + cudaFree(gpuPointerHolder2); +} + +CUDA_TEST(managed_ptr, cuda_new_and_delete_on_device_2) +{ + // Initialize host side memory to hold a pointer + Simple** cpuPointerHolder = (Simple**) malloc(sizeof(Simple*)); + cpuPointerHolder[0] = nullptr; + + // Initialize device side memory to hold a pointer + Simple** gpuPointerHolder = nullptr; + cudaMalloc(&gpuPointerHolder, sizeof(Simple*)); + + // Create on the device + chai::detail::make_on_device<<<1, 1>>>(gpuPointerHolder); + + // Copy to the host side memory + cudaMemcpy(cpuPointerHolder, gpuPointerHolder, sizeof(Simple*), cudaMemcpyDeviceToHost); + + // Free device side memory + 
cudaFree(gpuPointerHolder); + + // Save the pointer + ASSERT_NE(cpuPointerHolder[0], nullptr); + Simple* gpuPointer = cpuPointerHolder[0]; + + // Free host side memory + free(cpuPointerHolder); + + chai::managed_ptr test({chai::GPU}, {gpuPointer}); +} + +CUDA_TEST(managed_ptr, simple_cuda_cpu_and_gpu_pointer_constructor) +{ + Simple* gpuPointer = chai::detail::make_on_device(3); + Simple* cpuPointer = new Simple(4); + + chai::managed_ptr simple({chai::GPU, chai::CPU}, {gpuPointer, cpuPointer}); + + EXPECT_EQ(simple->getValue(), 4); + + chai::ManagedArray array1(1, chai::GPU); + + forall(cuda(), 0, 1, [=] __device__ (int i) { + array1[i] = simple->getValue(); + }); + + array1.move(chai::CPU); + + cudaDeviceSynchronize(); + + EXPECT_EQ(array1[0], 3); +} + +CUDA_TEST(managed_ptr, cuda_cpu_and_gpu_pointer_constructor) +{ + TestDerived* gpuPointer = chai::detail::make_on_device(3); + TestDerived* cpuPointer = new TestDerived(4); + + chai::managed_ptr derived({chai::GPU, chai::CPU}, {gpuPointer, cpuPointer}); + + EXPECT_EQ(derived->getValue(), 4); + EXPECT_NE(derived.get(), nullptr); + EXPECT_EQ(derived.use_count(), 1); + EXPECT_TRUE(derived); + EXPECT_FALSE(derived == nullptr); + EXPECT_FALSE(nullptr == derived); + EXPECT_TRUE(derived != nullptr); + EXPECT_TRUE(nullptr != derived); + + chai::ManagedArray array1(1, chai::GPU); + chai::ManagedArray array2(1, chai::GPU); + chai::ManagedArray array3(5, chai::GPU); + + forall(cuda(), 0, 1, [=] __device__ (int i) { + array1[i] = derived->getValue(); + array2[i] = derived.get(); + array3[0] = (bool) derived; + array3[1] = derived == nullptr; + array3[2] = nullptr == derived; + array3[3] = derived != nullptr; + array3[4] = nullptr != derived; + }); + + array1.move(chai::CPU); + array2.move(chai::CPU); + array3.move(chai::CPU); + + EXPECT_EQ(array1[0], 3); + EXPECT_NE(array2[0], nullptr); + EXPECT_TRUE(array3[0]); + EXPECT_FALSE(array3[1]); + EXPECT_FALSE(array3[2]); + EXPECT_TRUE(array3[3]); + EXPECT_TRUE(array3[4]); +} + +CUDA_TEST(managed_ptr, cuda_make_managed) +{ + const int expectedValue = rand(); + auto derived = chai::make_managed(expectedValue); + + chai::ManagedArray array(1, chai::GPU); + chai::ManagedArray array2(1, chai::GPU); + chai::ManagedArray array3(7, chai::GPU); + + forall(cuda(), 0, 1, [=] __device__ (int i) { + array[i] = derived->getValue(); + array2[i] = derived.get(); + array3[0] = (bool) derived; + array3[1] = derived == nullptr; + array3[2] = nullptr == derived; + array3[3] = derived != nullptr; + array3[4] = nullptr != derived; + }); + + array.move(chai::CPU); + array2.move(chai::CPU); + array3.move(chai::CPU); + + EXPECT_EQ(array[0], expectedValue); + + EXPECT_NE(array2[0], nullptr); + EXPECT_EQ(derived.use_count(), 1); + EXPECT_TRUE(array3[0]); + EXPECT_FALSE(array3[1]); + EXPECT_FALSE(array3[2]); + EXPECT_TRUE(array3[3]); + EXPECT_TRUE(array3[4]); +} + +CUDA_TEST(managed_ptr, make_managed_from_factory_function) +{ + const int expectedValue = rand(); + + auto factory = [] CHAI_HOST_DEVICE (const int value) { + return Factory(value); + }; + + auto derived = chai::make_managed_from_factory(factory, expectedValue); + + EXPECT_EQ((*derived).getValue(), expectedValue); + + EXPECT_NE(derived.get(), nullptr); + EXPECT_EQ(derived.use_count(), 1); + EXPECT_TRUE(derived); + EXPECT_FALSE(derived == nullptr); + EXPECT_FALSE(nullptr == derived); + EXPECT_TRUE(derived != nullptr); + EXPECT_TRUE(nullptr != derived); +} + +CUDA_TEST(managed_ptr, make_managed_from_factory_lambda) +{ + const int expectedValue = rand(); + + auto factory = [] 
CHAI_HOST_DEVICE (const int value) { + return new TestDerived(value); + }; + + auto derived = chai::make_managed_from_factory(factory, expectedValue); + + EXPECT_EQ((*derived).getValue(), expectedValue); + + EXPECT_NE(derived.get(), nullptr); + EXPECT_EQ(derived.use_count(), 1); + EXPECT_TRUE(derived); + EXPECT_FALSE(derived == nullptr); + EXPECT_FALSE(nullptr == derived); + EXPECT_TRUE(derived != nullptr); + EXPECT_TRUE(nullptr != derived); +} + +CUDA_TEST(managed_ptr, make_managed_from_overloaded_factory_function) +{ + const int expectedValue = rand(); + + auto factory = [] CHAI_HOST_DEVICE (const int value) { + return OverloadedFactory(value); + }; + + auto derived = chai::make_managed_from_factory(factory, expectedValue); + + EXPECT_EQ((*derived).getValue(), expectedValue); + + EXPECT_NE(derived.get(), nullptr); + EXPECT_EQ(derived.use_count(), 1); + EXPECT_TRUE(derived); + EXPECT_FALSE(derived == nullptr); + EXPECT_FALSE(nullptr == derived); + EXPECT_TRUE(derived != nullptr); + EXPECT_TRUE(nullptr != derived); +} + +CUDA_TEST(managed_ptr, make_managed_from_factory_static_member_function) +{ + const int expectedValue = rand(); + + auto factory = [] CHAI_HOST_DEVICE (const int value) { + return TestBase::Factory(value); + }; + + auto derived = chai::make_managed_from_factory(factory, expectedValue); + + EXPECT_EQ((*derived).getValue(), expectedValue); + + EXPECT_NE(derived.get(), nullptr); + EXPECT_EQ(derived.use_count(), 1); + EXPECT_TRUE(derived); + EXPECT_FALSE(derived == nullptr); + EXPECT_FALSE(nullptr == derived); + EXPECT_TRUE(derived != nullptr); + EXPECT_TRUE(nullptr != derived); +} + +CUDA_TEST(managed_ptr, cuda_copy_constructor) +{ + const int expectedValue = rand(); + auto derived = chai::make_managed(expectedValue); + chai::managed_ptr otherDerived(derived); + + chai::ManagedArray array(2, chai::GPU); + chai::ManagedArray array2(2, chai::GPU); + chai::ManagedArray array3(14, chai::GPU); + + forall(cuda(), 0, 1, [=] __device__ (int i) { + array[i] = derived->getValue(); + array2[0] = derived.get(); + array3[0] = (bool) derived; + array3[1] = derived == nullptr; + array3[2] = nullptr == derived; + array3[3] = derived != nullptr; + array3[4] = nullptr != derived; + array3[5] = derived == otherDerived; + array3[6] = derived != otherDerived; + + array[1] = otherDerived->getValue(); + array2[1] = otherDerived.get(); + array3[7] = (bool) derived; + array3[8] = derived == nullptr; + array3[9] = nullptr == derived; + array3[10] = derived != nullptr; + array3[11] = nullptr != derived; + array3[12] = derived == otherDerived; + array3[13] = derived != otherDerived; + }); + + array.move(chai::CPU); + array2.move(chai::CPU); + array3.move(chai::CPU); + + EXPECT_EQ(array[0], expectedValue); + EXPECT_EQ(array[1], expectedValue); + + EXPECT_NE(array2[0], nullptr); + EXPECT_TRUE(array3[0]); + EXPECT_FALSE(array3[1]); + EXPECT_FALSE(array3[2]); + EXPECT_TRUE(array3[3]); + EXPECT_TRUE(array3[4]); + EXPECT_TRUE(array3[5]); + EXPECT_FALSE(array3[6]); + + EXPECT_NE(array2[1], nullptr); + EXPECT_TRUE(array3[7]); + EXPECT_FALSE(array3[8]); + EXPECT_FALSE(array3[9]); + EXPECT_TRUE(array3[10]); + EXPECT_TRUE(array3[11]); + EXPECT_TRUE(array3[12]); + EXPECT_FALSE(array3[13]); +} + +CUDA_TEST(managed_ptr, cuda_converting_constructor) +{ + const int expectedValue = rand(); + auto derived = chai::make_managed(expectedValue); + chai::managed_ptr base(derived); + + chai::ManagedArray array(2, chai::GPU); + chai::ManagedArray array2(2, chai::GPU); + chai::ManagedArray array3(14, chai::GPU); + + 
forall(cuda(), 0, 1, [=] __device__ (int i) { + array[i] = derived->getValue(); + array2[0] = derived.get(); + array3[0] = (bool) derived; + array3[1] = derived == nullptr; + array3[2] = nullptr == derived; + array3[3] = derived != nullptr; + array3[4] = nullptr != derived; + array3[5] = derived == base; + array3[6] = derived != base; + + array[1] = base->getValue(); + array2[1] = base.get(); + array3[7] = (bool) base; + array3[8] = base == nullptr; + array3[9] = nullptr == base; + array3[10] = base != nullptr; + array3[11] = nullptr != base; + array3[12] = base == derived; + array3[13] = base != derived; + }); + + array.move(chai::CPU); + array2.move(chai::CPU); + array3.move(chai::CPU); + + EXPECT_EQ(array[0], expectedValue); + EXPECT_EQ(array[1], expectedValue); + + EXPECT_NE(array2[0], nullptr); + EXPECT_TRUE(array3[0]); + EXPECT_FALSE(array3[1]); + EXPECT_FALSE(array3[2]); + EXPECT_TRUE(array3[3]); + EXPECT_TRUE(array3[4]); + EXPECT_TRUE(array3[5]); + EXPECT_FALSE(array3[6]); + + EXPECT_NE(array2[1], nullptr); + EXPECT_TRUE(array3[7]); + EXPECT_FALSE(array3[8]); + EXPECT_FALSE(array3[9]); + EXPECT_TRUE(array3[10]); + EXPECT_TRUE(array3[11]); + EXPECT_TRUE(array3[12]); + EXPECT_FALSE(array3[13]); +} + +CUDA_TEST(managed_ptr, cuda_copy_assignment_operator) +{ + const int expectedValue = rand(); + auto derived = chai::make_managed(expectedValue); + chai::managed_ptr otherDerived; + otherDerived = derived; + + chai::ManagedArray array(2, chai::GPU); + chai::ManagedArray array2(2, chai::GPU); + chai::ManagedArray array3(14, chai::GPU); + + forall(cuda(), 0, 1, [=] __device__ (int i) { + array[i] = derived->getValue(); + array2[0] = derived.get(); + array3[0] = (bool) derived; + array3[1] = derived == nullptr; + array3[2] = nullptr == derived; + array3[3] = derived != nullptr; + array3[4] = nullptr != derived; + array3[5] = derived == otherDerived; + array3[6] = derived != otherDerived; + + array[1] = otherDerived->getValue(); + array2[1] = otherDerived.get(); + array3[7] = (bool) derived; + array3[8] = derived == nullptr; + array3[9] = nullptr == derived; + array3[10] = derived != nullptr; + array3[11] = nullptr != derived; + array3[12] = derived == otherDerived; + array3[13] = derived != otherDerived; + }); + + array.move(chai::CPU); + array2.move(chai::CPU); + array3.move(chai::CPU); + + EXPECT_EQ(array[0], expectedValue); + EXPECT_EQ(array[1], expectedValue); + + EXPECT_NE(array2[0], nullptr); + EXPECT_TRUE(array3[0]); + EXPECT_FALSE(array3[1]); + EXPECT_FALSE(array3[2]); + EXPECT_TRUE(array3[3]); + EXPECT_TRUE(array3[4]); + EXPECT_TRUE(array3[5]); + EXPECT_FALSE(array3[6]); + + EXPECT_NE(array2[1], nullptr); + EXPECT_TRUE(array3[7]); + EXPECT_FALSE(array3[8]); + EXPECT_FALSE(array3[9]); + EXPECT_TRUE(array3[10]); + EXPECT_TRUE(array3[11]); + EXPECT_TRUE(array3[12]); + EXPECT_FALSE(array3[13]); +} + +#endif + +// Enable the following tests to ensure that proper compiler errors are given +// for bad arguments since otherwise it is difficult to make sure the template +// metaprogramming is correct. + +#if 0 + +// Should give something like the following: +// error: static assertion failed: F is not invocable with the given arguments. 
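As background for the two disabled tests below (the first swaps the factory and its argument, the second passes an extra argument), this standalone sketch, illustration only rather than part of the test file, shows the kind of answer detail::is_invocable gives and why the quoted static_assert message appears:

auto exampleFactory = [] (const int value) { return new TestDerived(value); };

// Invocable with a single int: make_managed_from_factory accepts this.
static_assert(chai::detail::is_invocable<decltype(exampleFactory), int>::value,
              "factory is invocable with (int)");

// Not invocable with two ints (and a plain int is not invocable at all), so the
// static_assert inside make_managed_from_factory fires for both disabled tests.
static_assert(!chai::detail::is_invocable<decltype(exampleFactory), int, int>::value,
              "factory is not invocable with (int, int)");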
+ +TEST(managed_ptr, bad_function_to_make_managed_from_factory_function) +{ + const int expectedValue = rand(); + + auto factory = [] CHAI_HOST (const int value) { + return new TestDerived(value); + }; + + auto derived = chai::make_managed_from_factory(expectedValue, factory); + + EXPECT_EQ((*derived).getValue(), expectedValue); +} + +#endif + +#if 0 + +// Should give something like the following: +// error: static assertion failed: F is not invocable with the given arguments. + +TEST(managed_ptr, bad_arguments_to_make_managed_from_factory_function) +{ + const int expectedValue = rand(); + + auto factory = [] CHAI_HOST (const int value) { + return new TestDerived(value); + }; + + auto derived = chai::make_managed_from_factory(factory, expectedValue, 3); + + EXPECT_EQ((*derived).getValue(), expectedValue); +} + +#endif + From 97da33ed7261506537e6ae6507d22791840914d8 Mon Sep 17 00:00:00 2001 From: Alan Dayton Date: Mon, 30 Sep 2019 17:30:08 -0700 Subject: [PATCH 23/58] Add managed_ptr integration tests --- tests/integration/CMakeLists.txt | 37 +- tests/integration/managed_ptr_tests.cpp | 717 ++++++++++++++++++++++++ 2 files changed, 740 insertions(+), 14 deletions(-) create mode 100644 tests/integration/managed_ptr_tests.cpp diff --git a/tests/integration/CMakeLists.txt b/tests/integration/CMakeLists.txt index 22cbdd04..509079a5 100644 --- a/tests/integration/CMakeLists.txt +++ b/tests/integration/CMakeLists.txt @@ -1,21 +1,15 @@ -set (managed_array_test_depends - chai umpire gtest) - -if (ENABLE_CUDA) - set (managed_array_test_depends - ${managed_array_test_depends} - cuda) -endif () -if (ENABLE_HIP) - set (managed_array_test_depends - ${managed_array_test_depends} - hip) -endif () +# Integration test dependencies +set (chai_integration_test_depends + chai umpire gtest) +blt_list_append(TO chai_integration_test_depends ELEMENTS cuda IF ${ENABLE_CUDA}) +blt_list_append(TO chai_integration_test_depends ELEMENTS hip IF ${ENABLE_HIP}) + +# ManagedArray tests blt_add_executable( NAME managed_array_tests SOURCES managed_array_tests.cpp - DEPENDS_ON ${managed_array_test_depends}) + DEPENDS_ON ${chai_integration_test_depends}) target_include_directories( managed_array_tests @@ -24,3 +18,18 @@ target_include_directories( blt_add_test( NAME managed_array_test COMMAND managed_array_tests) + +# managed_ptr tests +blt_add_executable( + NAME managed_ptr_tests + SOURCES managed_ptr_tests.cpp + DEPENDS_ON ${chai_integration_test_depends}) + +target_include_directories( + managed_ptr_tests + PUBLIC ${PROJECT_BINARY_DIR}/include) + +blt_add_test( + NAME managed_ptr_test + COMMAND managed_ptr_tests) + diff --git a/tests/integration/managed_ptr_tests.cpp b/tests/integration/managed_ptr_tests.cpp new file mode 100644 index 00000000..f5d22680 --- /dev/null +++ b/tests/integration/managed_ptr_tests.cpp @@ -0,0 +1,717 @@ +// --------------------------------------------------------------------- +// Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All +// rights reserved. +// +// Produced at the Lawrence Livermore National Laboratory. +// +// This file is part of CHAI. +// +// LLNL-CODE-705877 +// +// For details, see https:://github.com/LLNL/CHAI +// Please also see the NOTICE and LICENSE files. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// - Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the +// distribution. +// +// - Neither the name of the LLNS/LLNL nor the names of its contributors +// may be used to endorse or promote products derived from this +// software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS +// OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED +// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY +// WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// --------------------------------------------------------------------- +#include "gtest/gtest.h" + +#define CUDA_TEST(X, Y) \ + static void cuda_test_##X##Y(); \ + TEST(X, Y) { cuda_test_##X##Y(); } \ + static void cuda_test_##X##Y() + +#include "chai/config.hpp" +#include "chai/ManagedArray.hpp" +#include "chai/managed_ptr.hpp" + +#include "../src/util/forall.hpp" + +// Standard library headers +#include + +class Base1 { + public: + CHAI_HOST_DEVICE Base1() {} + CHAI_HOST_DEVICE virtual ~Base1() {} + + CHAI_HOST_DEVICE virtual bool isBase1() { return true; } +}; + +class Base2 { + public: + CHAI_HOST_DEVICE Base2() {} + CHAI_HOST_DEVICE virtual ~Base2() {} + + CHAI_HOST_DEVICE virtual bool isBase2() { return true; } +}; + +class ClassWithMultipleInheritance : public Base1, public Base2 { + public: + CHAI_HOST_DEVICE ClassWithMultipleInheritance() : Base1(), Base2() {} + CHAI_HOST_DEVICE virtual ~ClassWithMultipleInheritance() {} +}; + +class RawArrayClass { + public: + CHAI_HOST_DEVICE RawArrayClass() : m_values(nullptr) {} + CHAI_HOST_DEVICE RawArrayClass(int* values) : m_values(values) {} + + CHAI_HOST_DEVICE ~RawArrayClass() {} + + CHAI_HOST_DEVICE int getValue(const int i) const { return m_values[i]; } + + private: + int* m_values; +}; + +class RawPointerClass { + public: + CHAI_HOST_DEVICE RawPointerClass() : m_innerClass(nullptr) {} + CHAI_HOST_DEVICE RawPointerClass(RawArrayClass* innerClass) : m_innerClass(innerClass) {} + + CHAI_HOST_DEVICE ~RawPointerClass() {} + + CHAI_HOST_DEVICE int getValue(const int i) const { return m_innerClass->getValue(i); } + + private: + RawArrayClass* m_innerClass; +}; + +class TestBase { + public: + CHAI_HOST_DEVICE TestBase() {} + CHAI_HOST_DEVICE virtual ~TestBase() {} + + CHAI_HOST_DEVICE virtual int getValue(const int i) const = 0; +}; + +class TestDerived : public TestBase { + public: + CHAI_HOST_DEVICE TestDerived() : TestBase(), m_values(nullptr) {} + CHAI_HOST_DEVICE TestDerived(chai::ManagedArray values) : TestBase(), m_values(values) {} + CHAI_HOST_DEVICE virtual ~TestDerived() {} + + CHAI_HOST_DEVICE virtual int getValue(const int i) const { return m_values[i]; } + + private: + chai::ManagedArray m_values; +}; + +class TestInnerBase { + public: + CHAI_HOST_DEVICE TestInnerBase() {} + CHAI_HOST_DEVICE virtual ~TestInnerBase() {} 
+ + CHAI_HOST_DEVICE virtual int getValue() = 0; +}; + +class TestInner : public TestInnerBase { + public: + CHAI_HOST_DEVICE TestInner() : TestInnerBase(), m_value(0) {} + CHAI_HOST_DEVICE TestInner(int value) : TestInnerBase(), m_value(value) {} + CHAI_HOST_DEVICE virtual ~TestInner() {} + + CHAI_HOST_DEVICE virtual int getValue() { return m_value; } + + private: + int m_value; +}; + +class TestContainer { + public: + CHAI_HOST_DEVICE TestContainer() : m_innerType(nullptr) {} + CHAI_HOST_DEVICE TestContainer(chai::managed_ptr innerType) : m_innerType(innerType) {} + + CHAI_HOST_DEVICE ~TestContainer() {} + + CHAI_HOST_DEVICE int getValue() const { + return m_innerType->getValue(); + } + + private: + chai::managed_ptr m_innerType; +}; + +class MultipleRawArrayClass { + public: + CHAI_HOST_DEVICE MultipleRawArrayClass() : m_values1(nullptr), m_values2(nullptr) {} + CHAI_HOST_DEVICE MultipleRawArrayClass(int* values1, int* values2) : + m_values1(values1), + m_values2(values2) + {} + + CHAI_HOST_DEVICE ~MultipleRawArrayClass() {} + + CHAI_HOST_DEVICE int getValue(const int i, const int j) const { + if (i == 0) { + return m_values1[j]; + } + else if (i == 1) { + return m_values2[j]; + } + else { + return -1; + } + } + + private: + int* m_values1; + int* m_values2; +}; + +TEST(managed_ptr, class_with_raw_array) +{ + const int expectedValue = rand(); + + chai::ManagedArray array(1, chai::CPU); + + forall(sequential(), 0, 1, [=] (int i) { + array[i] = expectedValue; + }); + + auto rawArrayClass = chai::make_managed(array); + + ASSERT_EQ(rawArrayClass->getValue(0), expectedValue); + + array.free(); +} + +TEST(managed_ptr, class_with_multiple_raw_arrays) +{ + const int expectedValue1 = rand(); + const int expectedValue2 = rand(); + + chai::ManagedArray array1(1, chai::CPU); + chai::ManagedArray array2(1, chai::CPU); + + forall(sequential(), 0, 1, [=] (int i) { + array1[i] = expectedValue1; + array2[i] = expectedValue2; + }); + + auto multipleRawArrayClass = chai::make_managed(array1, array2); + + ASSERT_EQ(multipleRawArrayClass->getValue(0, 0), expectedValue1); + ASSERT_EQ(multipleRawArrayClass->getValue(1, 0), expectedValue2); +} + +TEST(managed_ptr, class_with_managed_array) +{ + const int expectedValue = rand(); + + chai::ManagedArray array(1, chai::CPU); + + forall(sequential(), 0, 1, [=] (int i) { + array[i] = expectedValue; + }); + + auto derived = chai::make_managed(array); + + ASSERT_EQ(derived->getValue(0), expectedValue); +} + +TEST(managed_ptr, class_with_raw_ptr) +{ + const int expectedValue = rand(); + + chai::ManagedArray array(1, chai::CPU); + + forall(sequential(), 0, 1, [=] (int i) { + array[i] = expectedValue; + }); + + auto rawArrayClass = chai::make_managed(array); + auto rawPointerClass = chai::make_managed(rawArrayClass); + + // This prevents the pointers contained by rawArrayClass from being deleted + // out from under us. Otherwise, rawArrayClass is the last remaining reference + // and if it is destroyed before rawPointerClass is, then we are in trouble. 
+ rawPointerClass.set_callback([=] (chai::Action, chai::ExecutionSpace, void*) { + (void) rawArrayClass; return false; + }); + rawArrayClass = nullptr; + + ASSERT_EQ((*rawPointerClass).getValue(0), expectedValue); +} + +TEST(managed_ptr, class_with_managed_ptr) +{ + const int expectedValue = rand(); + + auto derived = chai::make_managed(expectedValue); + TestContainer container(derived); + + ASSERT_EQ(container.getValue(), expectedValue); +} + +TEST(managed_ptr, nested_managed_ptr) +{ + const int expectedValue = rand(); + + auto derived = chai::make_managed(expectedValue); + auto container = chai::make_managed(derived); + + ASSERT_EQ(container->getValue(), expectedValue); +} + +#ifdef __CUDACC__ + +template +__global__ void deviceNew(T** arr) { + *arr = new T[5]; +} + +template +__global__ void deviceDelete(T** arr) { + delete[] *arr; +} + +__global__ void passObjectToKernel(chai::ManagedArray arr) { + arr[0] = -1; +} + +CUDA_TEST(managed_ptr, make_on_device) +{ + int** hostArray = (int**) malloc(sizeof(int*)); + hostArray[0] = nullptr; + + int** deviceArray = nullptr; + cudaMalloc(&deviceArray, sizeof(int*)); + + int** deviceArray2 = nullptr; + cudaMalloc(&deviceArray2, sizeof(int*)); + + deviceNew<<<1, 1>>>(deviceArray); + + cudaMemcpy(hostArray, deviceArray, sizeof(int*), cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + cudaMemcpy(deviceArray2, hostArray, sizeof(int*), cudaMemcpyHostToDevice); + ASSERT_NE(hostArray[0], nullptr); + + deviceDelete<<<1, 1>>>(deviceArray2); + cudaDeviceSynchronize(); + free(hostArray); + cudaFree(deviceArray); + cudaFree(deviceArray2); +} + +CUDA_TEST(managed_ptr, cuda_new_and_delete_on_device) +{ + // Initialize host side memory to hold a pointer + RawArrayClass** cpuPointerHolder = (RawArrayClass**) malloc(sizeof(RawArrayClass*)); + cpuPointerHolder[0] = nullptr; + + // Initialize device side memory to hold a pointer + RawArrayClass** gpuPointerHolder = nullptr; + cudaMalloc(&gpuPointerHolder, sizeof(RawArrayClass*)); + + // Create on the device + chai::detail::make_on_device<<<1, 1>>>(gpuPointerHolder); + + // Copy to the host side memory + cudaMemcpy(cpuPointerHolder, gpuPointerHolder, sizeof(RawArrayClass*), cudaMemcpyDeviceToHost); + + // Free device side memory + cudaFree(gpuPointerHolder); + + // Save the pointer + ASSERT_NE(cpuPointerHolder[0], nullptr); + RawArrayClass* gpuPointer = cpuPointerHolder[0]; + + // Free host side memory + free(cpuPointerHolder); + + chai::detail::destroy_on_device<<<1, 1>>>(gpuPointer); +} + +CUDA_TEST(managed_ptr, cuda_build_managed_ptr) +{ + // Initialize host side memory to hold a pointer + RawArrayClass** cpuPointerHolder = (RawArrayClass**) malloc(sizeof(RawArrayClass*)); + cpuPointerHolder[0] = nullptr; + + // Initialize device side memory to hold a pointer + RawArrayClass** gpuPointerHolder = nullptr; + cudaMalloc(&gpuPointerHolder, sizeof(RawArrayClass*)); + + // Create on the device + chai::detail::make_on_device<<<1, 1>>>(gpuPointerHolder); + + // Copy to the host side memory + cudaMemcpy(cpuPointerHolder, gpuPointerHolder, sizeof(RawArrayClass*), cudaMemcpyDeviceToHost); + + // Free device side memory + cudaFree(gpuPointerHolder); + + // Save the pointer + ASSERT_NE(cpuPointerHolder[0], nullptr); + RawArrayClass* gpuPointer = cpuPointerHolder[0]; + + // Free host side memory + free(cpuPointerHolder); + + chai::managed_ptr managedPtr({chai::GPU}, {gpuPointer}); +} + + +CUDA_TEST(managed_ptr, pass_object_to_kernel) +{ + const int expectedValue = rand(); + + chai::ManagedArray array(1, chai::CPU); + + 
forall(sequential(), 0, 1, [=] (int i) { + array[i] = expectedValue; + }); + + chai::ArrayManager* manager = chai::ArrayManager::getInstance(); + manager->setExecutionSpace(chai::GPU); + passObjectToKernel<<<1, 1>>>(array); + cudaDeviceSynchronize(); + array.move(chai::CPU); + cudaDeviceSynchronize(); + ASSERT_EQ(array[0], -1); +} + +CUDA_TEST(managed_ptr, cuda_class_with_raw_array) +{ + const int expectedValue = rand(); + + chai::ManagedArray array(1, chai::CPU); + + forall(sequential(), 0, 1, [=] (int i) { + array[i] = expectedValue; + }); + + auto rawArrayClass = chai::make_managed(array); + chai::ManagedArray results(1, chai::GPU); + + forall(cuda(), 0, 1, [=] __device__ (int i) { + results[i] = rawArrayClass->getValue(i); + }); + + results.move(chai::CPU); + ASSERT_EQ(results[0], expectedValue); +} + +CUDA_TEST(managed_ptr, cuda_class_with_raw_array_and_callback) +{ + const int expectedValue = rand(); + + chai::ManagedArray array(1, chai::CPU); + + forall(sequential(), 0, 1, [=] (int i) { + array[i] = expectedValue; + }); + + auto cpuPointer = new RawArrayClass(array); + auto gpuPointer = chai::detail::make_on_device(array); + + auto callback = [=] (chai::Action action, chai::ExecutionSpace space, void*) mutable -> bool { + switch (action) { + case chai::ACTION_FREE: + switch (space) { + case chai::NONE: + array.free(); + return true; + default: + return false; + } + default: + return false; + } + }; + + auto managedPointer = chai::managed_ptr({chai::CPU, chai::GPU}, + {cpuPointer, gpuPointer}, + callback); + + chai::ManagedArray results(1, chai::GPU); + + forall(cuda(), 0, 1, [=] __device__ (int i) { + results[i] = managedPointer->getValue(i); + }); + + results.move(chai::CPU); + ASSERT_EQ(results[0], expectedValue); +} + +CUDA_TEST(managed_ptr, cuda_class_with_managed_array) +{ + const int expectedValue = rand(); + + chai::ManagedArray array(1, chai::CPU); + + forall(sequential(), 0, 1, [=] (int i) { + array[0] = expectedValue; + }); + + chai::managed_ptr derived = chai::make_managed(array); + + chai::ManagedArray results(1, chai::GPU); + + forall(cuda(), 0, 1, [=] __device__ (int i) { + results[i] = derived->getValue(i); + }); + + results.move(chai::CPU); + + ASSERT_EQ(results[0], expectedValue); +} + +CUDA_TEST(managed_ptr, cuda_class_with_raw_ptr) +{ + const int expectedValue = rand(); + + chai::ManagedArray array(1, chai::CPU); + + forall(sequential(), 0, 1, [=] (int i) { + array[0] = expectedValue; + }); + + auto rawArrayClass = chai::make_managed(array); + auto rawPointerClass = chai::make_managed(rawArrayClass); + + chai::ManagedArray results(1, chai::GPU); + + forall(cuda(), 0, 1, [=] __device__ (int i) { + results[i] = (*rawPointerClass).getValue(i); + }); + + results.move(chai::CPU); + ASSERT_EQ(results[0], expectedValue); +} + +CUDA_TEST(managed_ptr, cuda_class_with_managed_ptr) +{ + const int expectedValue = rand(); + + auto derived = chai::make_managed(expectedValue); + TestContainer container(derived); + + chai::ManagedArray results(1, chai::GPU); + + forall(cuda(), 0, 1, [=] __device__ (int i) { + results[i] = container.getValue(); + }); + + results.move(chai::CPU); + ASSERT_EQ(results[0], expectedValue); +} + +CUDA_TEST(managed_ptr, cuda_nested_managed_ptr) +{ + const int expectedValue = rand(); + + auto derived = chai::make_managed(expectedValue); + auto container = chai::make_managed(derived); + + chai::ManagedArray results(1, chai::GPU); + + forall(cuda(), 0, 1, [=] __device__ (int i) { + results[i] = container->getValue(); + }); + + results.move(chai::CPU); + 
ASSERT_EQ(results[0], expectedValue); +} + +CUDA_TEST(managed_ptr, cuda_multiple_inheritance) +{ + auto derived = chai::make_managed(); + + chai::managed_ptr base1 = derived; + chai::managed_ptr base2 = derived; + + chai::ManagedArray results(2, chai::GPU); + + forall(cuda(), 0, 1, [=] __device__ (int i) { + results[i] = base1->isBase1(); + results[1] = base2->isBase2(); + }); + + results.move(chai::CPU); + cudaDeviceSynchronize(); + + ASSERT_EQ(results[0], true); + ASSERT_EQ(results[1], true); +} + +CUDA_TEST(managed_ptr, static_pointer_cast) +{ + const int expectedValue = rand(); + + chai::ManagedArray array(1, chai::CPU); + + forall(sequential(), 0, 1, [=] (int i) { + array[0] = expectedValue; + }); + + auto derived = chai::make_managed(array); + auto base = chai::static_pointer_cast(derived); + auto derivedFromBase = chai::static_pointer_cast(base); + + chai::ManagedArray results(3, chai::GPU); + + forall(cuda(), 0, 1, [=] __device__ (int i) { + results[i] = derived->getValue(i); + results[1] = base->getValue(i); + results[2] = derivedFromBase->getValue(i); + }); + + results.move(chai::CPU); + + ASSERT_EQ(results[0], expectedValue); + ASSERT_EQ(results[1], expectedValue); + ASSERT_EQ(results[2], expectedValue); +} + +CUDA_TEST(managed_ptr, dynamic_pointer_cast) +{ + const int expectedValue = rand(); + + chai::ManagedArray array(1, chai::CPU); + + forall(sequential(), 0, 1, [=] (int i) { + array[0] = expectedValue; + }); + + auto derived = chai::make_managed(array); + auto base = chai::dynamic_pointer_cast(derived); + auto derivedFromBase = chai::dynamic_pointer_cast(base); + + chai::ManagedArray results(3, chai::GPU); + + forall(cuda(), 0, 1, [=] __device__ (int i) { + results[i] = derived->getValue(i); + results[1] = base->getValue(i); + results[2] = derivedFromBase->getValue(i); + }); + + results.move(chai::CPU); + + ASSERT_EQ(results[0], expectedValue); + ASSERT_EQ(results[1], expectedValue); + ASSERT_EQ(results[2], expectedValue); +} + +CUDA_TEST(managed_ptr, const_pointer_cast) +{ + const int expectedValue = rand(); + + chai::ManagedArray array(1, chai::CPU); + + forall(sequential(), 0, 1, [=] (int i) { + array[0] = expectedValue; + }); + + auto derived = chai::make_managed(array); + auto constDerived = chai::const_pointer_cast(derived); + auto derivedFromConst = chai::const_pointer_cast(constDerived); + + chai::ManagedArray results(3, chai::GPU); + + forall(cuda(), 0, 1, [=] __device__ (int i) { + results[i] = derived->getValue(i); + results[1] = constDerived->getValue(i); + results[2] = derivedFromConst->getValue(i); + }); + + results.move(chai::CPU); + + ASSERT_EQ(results[0], expectedValue); + ASSERT_EQ(results[1], expectedValue); + ASSERT_EQ(results[2], expectedValue); +} + +CUDA_TEST(managed_ptr, reinterpret_pointer_cast) +{ + const int expectedValue = rand(); + + chai::ManagedArray array(1, chai::CPU); + + forall(sequential(), 0, 1, [=] (int i) { + array[0] = expectedValue; + }); + + auto derived = chai::make_managed(array); + auto base = chai::reinterpret_pointer_cast(derived); + auto derivedFromBase = chai::reinterpret_pointer_cast(base); + + chai::ManagedArray results(3, chai::GPU); + + forall(cuda(), 0, 1, [=] __device__ (int i) { + results[i] = derived->getValue(i); + results[1] = base->getValue(i); + results[2] = derivedFromBase->getValue(i); + }); + + results.move(chai::CPU); + + ASSERT_EQ(results[0], expectedValue); + ASSERT_EQ(results[1], expectedValue); + ASSERT_EQ(results[2], expectedValue); +} + +#endif + +#if 0 // TODO: Enable if/when ManagedArrays of 
managed_ptrs can be handled correctly. + +class RawArrayOfPointersClass { + public: + CHAI_HOST_DEVICE RawArrayOfPointersClass() = delete; + CHAI_HOST_DEVICE RawArrayOfPointersClass(RawArrayClass** arrayOfPointers) : + m_arrayOfPointers(arrayOfPointers) + {} + + CHAI_HOST_DEVICE int getValue(const int i, const int j) const { + return m_arrayOfPointers[i]->getValue(j); + } + + private: + RawArrayClass** m_arrayOfPointers = nullptr; +}; + +TEST(managed_ptr, class_with_raw_array_of_pointers) +{ + const int expectedValue = rand(); + + chai::ManagedArray array(1, chai::CPU); + array[0] = expectedValue; + + auto rawArrayClass = chai::make_managed(array); + chai::managed_ptr arrayOfPointers[1] = {rawArrayClass}; + + auto rawArrayOfPointersClass = chai::make_managed(arrayOfPointers); + ASSERT_EQ(rawArrayOfPointersClass->getValue(0, 0), expectedValue); +} + +#endif + From 9ec6d1a4efa69b98101d2b574944a3be43ff17fd Mon Sep 17 00:00:00 2001 From: Alan Dayton Date: Mon, 30 Sep 2019 17:33:15 -0700 Subject: [PATCH 24/58] Restore hip support to unit tests --- tests/unit/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index 835aded5..b4103bc8 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -46,6 +46,7 @@ set (chai_unit_test_depends chai umpire gtest) blt_list_append(TO chai_unit_test_depends ELEMENTS cuda IF ${ENABLE_CUDA}) +blt_list_append(TO chai_unit_test_depends ELEMENTS hip IF ${ENABLE_HIP}) # ManagedArray tests blt_add_executable( From 052973b106ecaf90a77bc81f683dbb69a92ff1b0 Mon Sep 17 00:00:00 2001 From: Alan Dayton Date: Tue, 1 Oct 2019 14:02:58 -0700 Subject: [PATCH 25/58] Remove reference counting from managed_ptr --- src/chai/managed_ptr.hpp | 271 +++++++++----------------- tests/unit/managed_ptr_unit_tests.cpp | 67 +++---- 2 files changed, 124 insertions(+), 214 deletions(-) diff --git a/src/chai/managed_ptr.hpp b/src/chai/managed_ptr.hpp index 36c25d95..c9c0de66 100644 --- a/src/chai/managed_ptr.hpp +++ b/src/chai/managed_ptr.hpp @@ -95,29 +95,15 @@ namespace chai { struct managed_ptr_record { managed_ptr_record() : - m_num_references(1), m_callback() { } managed_ptr_record(std::function callback) : - m_num_references(1), m_callback(callback) { } - size_t use_count() { - return m_num_references; - } - - void addReference() { - m_num_references++; - } - - void removeReference() { - m_num_references--; - } - ExecutionSpace getLastSpace() { return m_last_space; } @@ -126,7 +112,6 @@ namespace chai { m_callback = callback; } - size_t m_num_references = 1; /// The reference counter ExecutionSpace m_last_space = NONE; /// The last space executed in std::function m_callback; /// Callback to handle events }; @@ -136,10 +121,7 @@ namespace chai { /// @author Alan Dayton /// /// This wrapper stores both host and device pointers so that polymorphism can be - /// used in both contexts with a single API. It is modeled after std::shared_ptr, - /// so it does reference counting and automatically cleans up when the last - /// reference is destroyed. If we ever do multi-threading on the CPU, locking will - /// need to be added to the reference counter. + /// used in both contexts with a single API. /// The make_managed and make_managed_from_factory functions call new on both the /// host and device so that polymorphism is valid in both contexts. 
Simply copying /// an object to the device will not copy the vtable, so new must be called on @@ -173,11 +155,10 @@ namespace chai { /// be given the extracted host pointer, and likewise the device constructor /// of T will be given the extracted device pointer. It is recommended that /// a callback is defined that maintains a copy of the managed_ptr so that - /// the raw pointers are not accidentally destroyed prematurely (since - /// managed_ptr does reference counting). It is also recommended that the - /// callback calls the copy constructor of the managed_ptr on the ACTION_MOVE - /// event so that the ACTION_MOVE event is triggered also for the inner - /// managed_ptr. + /// the raw pointers are not accidentally destroyed prematurely. It is also + /// recommended that the callback calls the copy constructor of the managed_ptr + /// on the ACTION_MOVE event so that the ACTION_MOVE event is triggered also for + /// the inner managed_ptr. /// Again, if a raw pointer is passed to make_managed, accessing that member will /// only be valid in the correct context. Take care when passing raw pointers /// as arguments to member functions. @@ -199,7 +180,6 @@ namespace chai { /// @author Alan Dayton /// /// Default constructor. - /// Initializes the reference count to 0. /// CHAI_HOST_DEVICE constexpr managed_ptr() noexcept {} @@ -207,7 +187,6 @@ namespace chai { /// @author Alan Dayton /// /// Construct from nullptr. - /// Initializes the reference count to 0. /// CHAI_HOST_DEVICE constexpr managed_ptr(std::nullptr_t) noexcept {} @@ -309,9 +288,9 @@ namespace chai { /// @author Alan Dayton /// /// Copy constructor. - /// Constructs a copy of the given managed_ptr, increases the reference count, - /// and if the execution space is different, calls the user defined callback - /// with ACTION_MOVE for each of the execution spaces. + /// Constructs a copy of the given managed_ptr and if the execution space is + /// different, calls the user defined callback with ACTION_MOVE for each + /// of the execution spaces. /// /// @param[in] other The managed_ptr to copy /// @@ -321,7 +300,6 @@ namespace chai { m_pointer_record(other.m_pointer_record) { #ifndef __CUDA_ARCH__ - addReference(); move(); #endif } @@ -330,10 +308,9 @@ namespace chai { /// @author Alan Dayton /// /// Converting constructor. - /// Constructs a copy of the given managed_ptr, increases the reference count, - /// and if the execution space is different, calls the user defined callback - /// with ACTION_MOVE for each of the execution spaces. U* must be convertible - /// to T*. + /// Constructs a copy of the given managed_ptr and if the execution space is + /// different, calls the user defined callback with ACTION_MOVE for each + /// of the execution spaces. U* must be convertible to T*. /// /// @param[in] other The managed_ptr to copy /// @@ -347,7 +324,6 @@ namespace chai { "U* must be convertible to T*."); #ifndef __CUDA_ARCH__ - addReference(); move(); #endif } @@ -394,57 +370,30 @@ namespace chai { } } - addReference(); move(); } /// /// @author Alan Dayton /// - /// Destructor. Decreases the reference count and if this is the last reference, - /// clean up. + /// Destructor /// - CHAI_HOST_DEVICE ~managed_ptr() { -#ifdef __CUDACC__ - // This trick came from Max Katz at Nvidia. - // Taking the address of this kernel ensures that it gets instantiated - // by the compiler and can be used within __CUDA_ARCH__. 
Without this, - // calling destroy_on_device within the confines of __CUDA_ARCH__ will - // always fail with error code 0x8 (invalid device function). - // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#restrictions - // From the CUDA Programming Guide Restrictions: - // "If a __global__ function template is instantiated and launched from - // the host, then the function template must be instantiated with the - // same template arguments irrespective of whether __CUDA_ARCH__ is - // defined and regardless of the value of __CUDA_ARCH__." - (void) &detail::destroy_on_device; -#endif - -#ifndef __CUDA_ARCH__ - removeReference(); -#endif - } + CHAI_HOST_DEVICE ~managed_ptr() {} /// /// @author Alan Dayton /// - /// Copy assignment operator. - /// Copies the given managed_ptr and increases the reference count. + /// Copy assignment operator. Does a shallow copy. /// /// @param[in] other The managed_ptr to copy /// CHAI_HOST_DEVICE managed_ptr& operator=(const managed_ptr& other) noexcept { if (this != &other) { -#ifndef __CUDA_ARCH__ - removeReference(); -#endif - m_cpu_pointer = other.m_cpu_pointer; m_gpu_pointer = other.m_gpu_pointer; m_pointer_record = other.m_pointer_record; #ifndef __CUDA_ARCH__ - addReference(); move(); #endif } @@ -456,8 +405,8 @@ namespace chai { /// @author Alan Dayton /// /// Conversion copy assignment operator. - /// Copies the given managed_ptr and increases the reference count. - /// U* must be convertible to T*. + /// Copies the given managed_ptr. Does a shallow copy. U* must be convertible + /// to T*. /// /// @param[in] other The managed_ptr to copy /// @@ -466,16 +415,11 @@ namespace chai { static_assert(std::is_convertible::value, "U* must be convertible to T*."); -#ifndef __CUDA_ARCH__ - removeReference(); -#endif - m_cpu_pointer = other.m_cpu_pointer; m_gpu_pointer = other.m_gpu_pointer; m_pointer_record = other.m_pointer_record; #ifndef __CUDA_ARCH__ - addReference(); move(); #endif @@ -547,20 +491,6 @@ namespace chai { #endif } - /// - /// @author Alan Dayton - /// - /// Returns the number of managed_ptrs owning these pointers. - /// - CHAI_HOST std::size_t use_count() const { - if (m_pointer_record) { - return m_pointer_record->use_count(); - } - else { - return 0; - } - } - /// /// @author Alan Dayton /// @@ -590,6 +520,82 @@ namespace chai { } } + /// + /// @author Alan Dayton + /// + /// If a user callback is provided, calls the callback with the ACTION_FREE + /// event. Otherwise calls delete on the CPU and GPU pointers. + /// + CHAI_HOST void free() { + if (m_pointer_record) { + if (m_pointer_record->m_callback) { + // Destroy device pointer first to take advantage of asynchrony + for (int space = NUM_EXECUTION_SPACES-1; space >= NONE; --space) { + ExecutionSpace execSpace = static_cast(space); + T* pointer = get(execSpace, false); + + using T_non_const = typename std::remove_const::type; + + // We can use const_cast because can managed_ptr can only + // be constructed with non const pointers. 
+ T_non_const* temp = const_cast(pointer); + void* voidPointer = static_cast(temp); + + if (!m_pointer_record->m_callback(ACTION_FREE, + execSpace, + voidPointer)) { + switch (execSpace) { + case CPU: + delete pointer; + break; +#ifdef __CUDACC__ + case GPU: + { + if (pointer) { + detail::destroy_on_device<<<1, 1>>>(temp); + debug_cudaDeviceSynchronize(); + } + + break; + } +#endif + default: + break; + } + } + } + } + else { + // Destroy device pointer first to take advantage of asynchrony + for (int space = NUM_EXECUTION_SPACES-1; space >= NONE; --space) { + ExecutionSpace execSpace = static_cast(space); + T* pointer = get(execSpace, false); + + switch (execSpace) { + case CPU: + delete pointer; + break; +#ifdef __CUDACC__ + case GPU: + { + if (pointer) { + detail::destroy_on_device<<<1, 1>>>(pointer); + debug_cudaDeviceSynchronize(); + } + + break; + } +#endif + default: + break; + } + } + } + + delete m_pointer_record; + } + } + private: T* m_cpu_pointer = nullptr; /// The CPU pointer T* m_gpu_pointer = nullptr; /// The GPU pointer @@ -639,99 +645,6 @@ namespace chai { } #endif } - - /// - /// @author Alan Dayton - /// - /// Increments the reference count and calls the copy constructor to - /// trigger data movement. - /// - CHAI_HOST void addReference() { - if (m_pointer_record) { - m_pointer_record->addReference(); - } - } - - /// - /// @author Alan Dayton - /// - /// Decrements the reference counter. If the resulting number of references - /// is 0, clean up the object. - /// - CHAI_HOST void removeReference() { - if (m_pointer_record) { - m_pointer_record->removeReference(); - - if (m_pointer_record->use_count() == 0) { - if (m_pointer_record->m_callback) { - // Destroy device pointer first to take advantage of asynchrony - for (int space = NUM_EXECUTION_SPACES-1; space >= NONE; --space) { - ExecutionSpace execSpace = static_cast(space); - T* pointer = get(execSpace, false); - - using T_non_const = typename std::remove_const::type; - - // We can use const_cast because can managed_ptr can only - // be constructed with non const pointers. 
- T_non_const* temp = const_cast(pointer); - void* voidPointer = static_cast(temp); - - if (!m_pointer_record->m_callback(ACTION_FREE, - execSpace, - voidPointer)) { - switch (execSpace) { - case CPU: - delete pointer; - break; -#ifdef __CUDACC__ - case GPU: - { - if (pointer) { - detail::destroy_on_device<<<1, 1>>>(temp); - debug_cudaDeviceSynchronize(); - } - - break; - } -#endif - default: - break; - } - } - } - } - else { - // Destroy device pointer first to take advantage of asynchrony - for (int space = NUM_EXECUTION_SPACES-1; space >= NONE; --space) { - ExecutionSpace execSpace = static_cast(space); - T* pointer = get(execSpace, false); - - switch (execSpace) { - case CPU: - delete pointer; - break; -#ifdef __CUDACC__ - case GPU: - { - if (pointer) { - detail::destroy_on_device<<<1, 1>>>(pointer); - debug_cudaDeviceSynchronize(); - } - - break; - } -#endif - default: - break; - } - } - } - - delete m_pointer_record; - } - } - } - }; namespace detail { diff --git a/tests/unit/managed_ptr_unit_tests.cpp b/tests/unit/managed_ptr_unit_tests.cpp index c6abf86e..4bef0cef 100644 --- a/tests/unit/managed_ptr_unit_tests.cpp +++ b/tests/unit/managed_ptr_unit_tests.cpp @@ -131,7 +131,6 @@ TEST(managed_ptr, default_constructor) chai::managed_ptr otherDerived; EXPECT_EQ(derived.get(), nullptr); - EXPECT_EQ(derived.use_count(), 0); EXPECT_FALSE(derived); EXPECT_TRUE(derived == nullptr); EXPECT_TRUE(nullptr == derived); @@ -141,6 +140,10 @@ TEST(managed_ptr, default_constructor) EXPECT_TRUE(otherDerived == derived); EXPECT_FALSE(derived != otherDerived); EXPECT_FALSE(otherDerived != derived); + + // Make sure free is a no-op + derived.free(); + otherDerived.free(); } TEST(managed_ptr, nullptr_constructor) @@ -149,7 +152,6 @@ TEST(managed_ptr, nullptr_constructor) chai::managed_ptr otherDerived = nullptr; EXPECT_EQ(derived.get(), nullptr); - EXPECT_EQ(derived.use_count(), 0); EXPECT_FALSE(derived); EXPECT_TRUE(derived == nullptr); EXPECT_TRUE(nullptr == derived); @@ -159,6 +161,10 @@ TEST(managed_ptr, nullptr_constructor) EXPECT_TRUE(otherDerived == derived); EXPECT_FALSE(derived != otherDerived); EXPECT_FALSE(otherDerived != derived); + + // Make sure free is a no-op + derived.free(); + otherDerived.free(); } TEST(managed_ptr, cpu_pointer_constructor) @@ -169,12 +175,13 @@ TEST(managed_ptr, cpu_pointer_constructor) EXPECT_EQ(derived->getValue(), 3); EXPECT_NE(derived.get(), nullptr); - EXPECT_EQ(derived.use_count(), 1); EXPECT_TRUE(derived); EXPECT_FALSE(derived == nullptr); EXPECT_FALSE(nullptr == derived); EXPECT_TRUE(derived != nullptr); EXPECT_TRUE(nullptr != derived); + + derived.free(); } TEST(managed_ptr, make_managed) @@ -185,12 +192,13 @@ TEST(managed_ptr, make_managed) EXPECT_EQ((*derived).getValue(), expectedValue); EXPECT_NE(derived.get(), nullptr); - EXPECT_EQ(derived.use_count(), 1); EXPECT_TRUE(derived); EXPECT_FALSE(derived == nullptr); EXPECT_FALSE(nullptr == derived); EXPECT_TRUE(derived != nullptr); EXPECT_TRUE(nullptr != derived); + + derived.free(); } TEST(managed_ptr, copy_constructor) @@ -203,7 +211,6 @@ TEST(managed_ptr, copy_constructor) EXPECT_EQ(otherDerived->getValue(), expectedValue); EXPECT_NE(derived.get(), nullptr); - EXPECT_EQ(derived.use_count(), 2); EXPECT_TRUE(derived); EXPECT_FALSE(derived == nullptr); EXPECT_FALSE(nullptr == derived); @@ -213,7 +220,6 @@ TEST(managed_ptr, copy_constructor) EXPECT_FALSE(derived != otherDerived); EXPECT_NE(otherDerived.get(), nullptr); - EXPECT_EQ(otherDerived.use_count(), 2); EXPECT_TRUE(otherDerived); EXPECT_FALSE(otherDerived 
== nullptr); EXPECT_FALSE(nullptr == otherDerived); @@ -221,6 +227,8 @@ TEST(managed_ptr, copy_constructor) EXPECT_TRUE(nullptr != otherDerived); EXPECT_TRUE(otherDerived == derived); EXPECT_FALSE(otherDerived != derived); + + derived.free(); } TEST(managed_ptr, converting_constructor) @@ -233,7 +241,6 @@ TEST(managed_ptr, converting_constructor) EXPECT_EQ(base->getValue(), expectedValue); EXPECT_NE(derived.get(), nullptr); - EXPECT_EQ(derived.use_count(), 2); EXPECT_TRUE(derived); EXPECT_FALSE(derived == nullptr); EXPECT_FALSE(nullptr == derived); @@ -243,7 +250,6 @@ TEST(managed_ptr, converting_constructor) EXPECT_FALSE(derived != base); EXPECT_NE(base.get(), nullptr); - EXPECT_EQ(base.use_count(), 2); EXPECT_TRUE(base); EXPECT_FALSE(base == nullptr); EXPECT_FALSE(nullptr == base); @@ -251,6 +257,8 @@ TEST(managed_ptr, converting_constructor) EXPECT_TRUE(nullptr != base); EXPECT_TRUE(base == derived); EXPECT_FALSE(base != derived); + + base.free(); } TEST(managed_ptr, copy_assignment_operator) @@ -264,7 +272,6 @@ TEST(managed_ptr, copy_assignment_operator) EXPECT_EQ(otherDerived->getValue(), expectedValue); EXPECT_NE(derived.get(), nullptr); - EXPECT_EQ(derived.use_count(), 2); EXPECT_TRUE(derived); EXPECT_FALSE(derived == nullptr); EXPECT_FALSE(nullptr == derived); @@ -274,7 +281,6 @@ TEST(managed_ptr, copy_assignment_operator) EXPECT_FALSE(derived != otherDerived); EXPECT_NE(otherDerived.get(), nullptr); - EXPECT_EQ(otherDerived.use_count(), 2); EXPECT_TRUE(otherDerived); EXPECT_FALSE(otherDerived == nullptr); EXPECT_FALSE(nullptr == otherDerived); @@ -282,6 +288,8 @@ TEST(managed_ptr, copy_assignment_operator) EXPECT_TRUE(nullptr != otherDerived); EXPECT_TRUE(otherDerived == derived); EXPECT_FALSE(otherDerived != derived); + + derived.free(); } TEST(managed_ptr, copy_constructor_from_default_constructed) @@ -290,13 +298,11 @@ TEST(managed_ptr, copy_constructor_from_default_constructed) chai::managed_ptr otherDerived(derived); EXPECT_EQ(derived.get(), nullptr); - EXPECT_EQ(derived.use_count(), 0); EXPECT_EQ(bool(derived), false); EXPECT_EQ(derived, nullptr); EXPECT_EQ(nullptr, derived); EXPECT_EQ(otherDerived.get(), nullptr); - EXPECT_EQ(otherDerived.use_count(), 0); EXPECT_EQ(bool(otherDerived), false); EXPECT_EQ(otherDerived, nullptr); EXPECT_EQ(nullptr, otherDerived); @@ -309,13 +315,11 @@ TEST(managed_ptr, copy_assignment_operator_from_default_constructed) otherDerived = derived; EXPECT_EQ(derived.get(), nullptr); - EXPECT_EQ(derived.use_count(), 0); EXPECT_EQ(bool(derived), false); EXPECT_EQ(derived, nullptr); EXPECT_EQ(nullptr, derived); EXPECT_EQ(otherDerived.get(), nullptr); - EXPECT_EQ(otherDerived.use_count(), 0); EXPECT_EQ(bool(otherDerived), false); EXPECT_EQ(otherDerived, nullptr); EXPECT_EQ(nullptr, otherDerived); @@ -327,13 +331,11 @@ TEST(managed_ptr, conversion_copy_constructor_from_default_constructed) chai::managed_ptr otherDerived(derived); EXPECT_EQ(derived.get(), nullptr); - EXPECT_EQ(derived.use_count(), 0); EXPECT_EQ(bool(derived), false); EXPECT_EQ(derived, nullptr); EXPECT_EQ(nullptr, derived); EXPECT_EQ(otherDerived.get(), nullptr); - EXPECT_EQ(otherDerived.use_count(), 0); EXPECT_EQ(bool(otherDerived), false); EXPECT_EQ(otherDerived, nullptr); EXPECT_EQ(nullptr, otherDerived); @@ -346,13 +348,11 @@ TEST(managed_ptr, conversion_copy_assignment_operator_from_default_constructed) otherDerived = derived; EXPECT_EQ(derived.get(), nullptr); - EXPECT_EQ(derived.use_count(), 0); EXPECT_EQ(bool(derived), false); EXPECT_EQ(derived, nullptr); EXPECT_EQ(nullptr, 
derived); EXPECT_EQ(otherDerived.get(), nullptr); - EXPECT_EQ(otherDerived.use_count(), 0); EXPECT_EQ(bool(otherDerived), false); EXPECT_EQ(otherDerived, nullptr); EXPECT_EQ(nullptr, otherDerived); @@ -370,22 +370,22 @@ TEST(managed_ptr, copy_assignment_operator_from_host_ptr_constructed) thirdDerived = derived; EXPECT_NE(derived.get(), nullptr); - EXPECT_EQ(derived.use_count(), 2); EXPECT_EQ(bool(derived), true); EXPECT_NE(derived, nullptr); EXPECT_NE(nullptr, derived); EXPECT_NE(otherDerived.get(), nullptr); - EXPECT_EQ(otherDerived.use_count(), 1); EXPECT_EQ(bool(otherDerived), true); EXPECT_NE(otherDerived, nullptr); EXPECT_NE(nullptr, otherDerived); EXPECT_NE(thirdDerived.get(), nullptr); - EXPECT_EQ(thirdDerived.use_count(), 2); EXPECT_EQ(bool(thirdDerived), true); EXPECT_NE(thirdDerived, nullptr); EXPECT_NE(nullptr, thirdDerived); + + otherDerived.free(); + thirdDerived.free(); } TEST(managed_ptr, conversion_copy_assignment_operator_from_host_ptr_constructed) @@ -400,22 +400,22 @@ TEST(managed_ptr, conversion_copy_assignment_operator_from_host_ptr_constructed) thirdDerived = derived; EXPECT_NE(derived.get(), nullptr); - EXPECT_EQ(derived.use_count(), 2); EXPECT_EQ(bool(derived), true); EXPECT_NE(derived, nullptr); EXPECT_NE(nullptr, derived); EXPECT_NE(otherDerived.get(), nullptr); - EXPECT_EQ(otherDerived.use_count(), 1); EXPECT_EQ(bool(otherDerived), true); EXPECT_NE(otherDerived, nullptr); EXPECT_NE(nullptr, otherDerived); EXPECT_NE(thirdDerived.get(), nullptr); - EXPECT_EQ(thirdDerived.use_count(), 2); EXPECT_EQ(bool(thirdDerived), true); EXPECT_NE(thirdDerived, nullptr); EXPECT_NE(nullptr, thirdDerived); + + otherDerived.free(); + thirdDerived.free(); } TEST(managed_ptr, static_pointer_cast) @@ -428,12 +428,13 @@ TEST(managed_ptr, static_pointer_cast) EXPECT_EQ(base->getValue(), 3); EXPECT_NE(base.get(), nullptr); - EXPECT_EQ(base.use_count(), 2); EXPECT_TRUE(base); EXPECT_FALSE(base == nullptr); EXPECT_FALSE(nullptr == base); EXPECT_TRUE(base != nullptr); EXPECT_TRUE(nullptr != base); + + derived.free(); } TEST(managed_ptr, dynamic_pointer_cast) @@ -446,12 +447,13 @@ TEST(managed_ptr, dynamic_pointer_cast) EXPECT_EQ(derived->getValue(), 3); EXPECT_NE(derived.get(), nullptr); - EXPECT_EQ(derived.use_count(), 2); EXPECT_TRUE(derived); EXPECT_FALSE(derived == nullptr); EXPECT_FALSE(nullptr == derived); EXPECT_TRUE(derived != nullptr); EXPECT_TRUE(nullptr != derived); + + derived.free(); } TEST(managed_ptr, const_pointer_cast) @@ -464,12 +466,13 @@ TEST(managed_ptr, const_pointer_cast) EXPECT_EQ(nonConstBase->getValue(), 3); EXPECT_NE(nonConstBase.get(), nullptr); - EXPECT_EQ(nonConstBase.use_count(), 2); EXPECT_TRUE(nonConstBase); EXPECT_FALSE(nonConstBase == nullptr); EXPECT_FALSE(nullptr == nonConstBase); EXPECT_TRUE(nonConstBase != nullptr); EXPECT_TRUE(nullptr != nonConstBase); + + base.free(); } TEST(managed_ptr, reinterpret_pointer_cast) @@ -482,12 +485,13 @@ TEST(managed_ptr, reinterpret_pointer_cast) EXPECT_EQ(derived->getValue(), 3); EXPECT_NE(derived.get(), nullptr); - EXPECT_EQ(derived.use_count(), 2); EXPECT_TRUE(derived); EXPECT_FALSE(derived == nullptr); EXPECT_FALSE(nullptr == derived); EXPECT_TRUE(derived != nullptr); EXPECT_TRUE(nullptr != derived); + + derived.free(); } #ifdef __CUDACC__ @@ -570,7 +574,6 @@ CUDA_TEST(managed_ptr, cuda_gpu_pointer_constructor) chai::managed_ptr derived({chai::GPU}, {gpuPointer}); EXPECT_EQ(derived.get(), nullptr); - EXPECT_EQ(derived.use_count(), 1); EXPECT_FALSE(derived); EXPECT_TRUE(derived == nullptr); EXPECT_TRUE(nullptr == 
derived); @@ -711,7 +714,6 @@ CUDA_TEST(managed_ptr, cuda_cpu_and_gpu_pointer_constructor) EXPECT_EQ(derived->getValue(), 4); EXPECT_NE(derived.get(), nullptr); - EXPECT_EQ(derived.use_count(), 1); EXPECT_TRUE(derived); EXPECT_FALSE(derived == nullptr); EXPECT_FALSE(nullptr == derived); @@ -771,7 +773,6 @@ CUDA_TEST(managed_ptr, cuda_make_managed) EXPECT_EQ(array[0], expectedValue); EXPECT_NE(array2[0], nullptr); - EXPECT_EQ(derived.use_count(), 1); EXPECT_TRUE(array3[0]); EXPECT_FALSE(array3[1]); EXPECT_FALSE(array3[2]); @@ -792,7 +793,6 @@ CUDA_TEST(managed_ptr, make_managed_from_factory_function) EXPECT_EQ((*derived).getValue(), expectedValue); EXPECT_NE(derived.get(), nullptr); - EXPECT_EQ(derived.use_count(), 1); EXPECT_TRUE(derived); EXPECT_FALSE(derived == nullptr); EXPECT_FALSE(nullptr == derived); @@ -813,7 +813,6 @@ CUDA_TEST(managed_ptr, make_managed_from_factory_lambda) EXPECT_EQ((*derived).getValue(), expectedValue); EXPECT_NE(derived.get(), nullptr); - EXPECT_EQ(derived.use_count(), 1); EXPECT_TRUE(derived); EXPECT_FALSE(derived == nullptr); EXPECT_FALSE(nullptr == derived); @@ -834,7 +833,6 @@ CUDA_TEST(managed_ptr, make_managed_from_overloaded_factory_function) EXPECT_EQ((*derived).getValue(), expectedValue); EXPECT_NE(derived.get(), nullptr); - EXPECT_EQ(derived.use_count(), 1); EXPECT_TRUE(derived); EXPECT_FALSE(derived == nullptr); EXPECT_FALSE(nullptr == derived); @@ -855,7 +853,6 @@ CUDA_TEST(managed_ptr, make_managed_from_factory_static_member_function) EXPECT_EQ((*derived).getValue(), expectedValue); EXPECT_NE(derived.get(), nullptr); - EXPECT_EQ(derived.use_count(), 1); EXPECT_TRUE(derived); EXPECT_FALSE(derived == nullptr); EXPECT_FALSE(nullptr == derived); From f08d93759992acf65ee2db48c28a90d49d160435 Mon Sep 17 00:00:00 2001 From: Alan Dayton Date: Tue, 1 Oct 2019 14:13:21 -0700 Subject: [PATCH 26/58] Fixed leaks in managed_ptr integration tests --- tests/integration/managed_ptr_tests.cpp | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/tests/integration/managed_ptr_tests.cpp b/tests/integration/managed_ptr_tests.cpp index f5d22680..33435a1f 100644 --- a/tests/integration/managed_ptr_tests.cpp +++ b/tests/integration/managed_ptr_tests.cpp @@ -201,6 +201,7 @@ TEST(managed_ptr, class_with_raw_array) ASSERT_EQ(rawArrayClass->getValue(0), expectedValue); array.free(); + rawArrayClass.free(); } TEST(managed_ptr, class_with_multiple_raw_arrays) @@ -220,6 +221,10 @@ TEST(managed_ptr, class_with_multiple_raw_arrays) ASSERT_EQ(multipleRawArrayClass->getValue(0, 0), expectedValue1); ASSERT_EQ(multipleRawArrayClass->getValue(1, 0), expectedValue2); + + array1.free(); + array2.free(); + multipleRawArrayClass.free(); } TEST(managed_ptr, class_with_managed_array) @@ -235,6 +240,9 @@ TEST(managed_ptr, class_with_managed_array) auto derived = chai::make_managed(array); ASSERT_EQ(derived->getValue(0), expectedValue); + + array.free(); + derived.free(); } TEST(managed_ptr, class_with_raw_ptr) @@ -250,15 +258,11 @@ TEST(managed_ptr, class_with_raw_ptr) auto rawArrayClass = chai::make_managed(array); auto rawPointerClass = chai::make_managed(rawArrayClass); - // This prevents the pointers contained by rawArrayClass from being deleted - // out from under us. Otherwise, rawArrayClass is the last remaining reference - // and if it is destroyed before rawPointerClass is, then we are in trouble. 
- rawPointerClass.set_callback([=] (chai::Action, chai::ExecutionSpace, void*) { - (void) rawArrayClass; return false; - }); - rawArrayClass = nullptr; - ASSERT_EQ((*rawPointerClass).getValue(0), expectedValue); + + array.free(); + rawArrayClass.free(); + rawPointerClass.free(); } TEST(managed_ptr, class_with_managed_ptr) @@ -269,6 +273,8 @@ TEST(managed_ptr, class_with_managed_ptr) TestContainer container(derived); ASSERT_EQ(container.getValue(), expectedValue); + + derived.free(); } TEST(managed_ptr, nested_managed_ptr) @@ -279,6 +285,9 @@ TEST(managed_ptr, nested_managed_ptr) auto container = chai::make_managed(derived); ASSERT_EQ(container->getValue(), expectedValue); + + derived.free(); + container.free(); } #ifdef __CUDACC__ From 05b0027f582309147e41dcd84ed0ba16dd8add13 Mon Sep 17 00:00:00 2001 From: Alan Dayton Date: Tue, 1 Oct 2019 14:25:37 -0700 Subject: [PATCH 27/58] Fix cuda compile errors --- tests/integration/managed_ptr_tests.cpp | 60 ++++++++++++------------- tests/unit/managed_ptr_unit_tests.cpp | 56 +++++++++++------------ 2 files changed, 58 insertions(+), 58 deletions(-) diff --git a/tests/integration/managed_ptr_tests.cpp b/tests/integration/managed_ptr_tests.cpp index 33435a1f..94982121 100644 --- a/tests/integration/managed_ptr_tests.cpp +++ b/tests/integration/managed_ptr_tests.cpp @@ -42,10 +42,10 @@ // --------------------------------------------------------------------- #include "gtest/gtest.h" -#define CUDA_TEST(X, Y) \ - static void cuda_test_##X##Y(); \ - TEST(X, Y) { cuda_test_##X##Y(); } \ - static void cuda_test_##X##Y() +#define GPU_TEST(X, Y) \ + static void gpu_test_##X##Y(); \ + TEST(X, Y) { gpu_test_##X##Y(); } \ + static void gpu_test_##X##Y() #include "chai/config.hpp" #include "chai/ManagedArray.hpp" @@ -306,7 +306,7 @@ __global__ void passObjectToKernel(chai::ManagedArray arr) { arr[0] = -1; } -CUDA_TEST(managed_ptr, make_on_device) +GPU_TEST(managed_ptr, make_on_device) { int** hostArray = (int**) malloc(sizeof(int*)); hostArray[0] = nullptr; @@ -331,7 +331,7 @@ CUDA_TEST(managed_ptr, make_on_device) cudaFree(deviceArray2); } -CUDA_TEST(managed_ptr, cuda_new_and_delete_on_device) +GPU_TEST(managed_ptr, gpu_new_and_delete_on_device) { // Initialize host side memory to hold a pointer RawArrayClass** cpuPointerHolder = (RawArrayClass**) malloc(sizeof(RawArrayClass*)); @@ -360,7 +360,7 @@ CUDA_TEST(managed_ptr, cuda_new_and_delete_on_device) chai::detail::destroy_on_device<<<1, 1>>>(gpuPointer); } -CUDA_TEST(managed_ptr, cuda_build_managed_ptr) +GPU_TEST(managed_ptr, gpu_build_managed_ptr) { // Initialize host side memory to hold a pointer RawArrayClass** cpuPointerHolder = (RawArrayClass**) malloc(sizeof(RawArrayClass*)); @@ -390,7 +390,7 @@ CUDA_TEST(managed_ptr, cuda_build_managed_ptr) } -CUDA_TEST(managed_ptr, pass_object_to_kernel) +GPU_TEST(managed_ptr, pass_object_to_kernel) { const int expectedValue = rand(); @@ -409,7 +409,7 @@ CUDA_TEST(managed_ptr, pass_object_to_kernel) ASSERT_EQ(array[0], -1); } -CUDA_TEST(managed_ptr, cuda_class_with_raw_array) +GPU_TEST(managed_ptr, gpu_class_with_raw_array) { const int expectedValue = rand(); @@ -422,7 +422,7 @@ CUDA_TEST(managed_ptr, cuda_class_with_raw_array) auto rawArrayClass = chai::make_managed(array); chai::ManagedArray results(1, chai::GPU); - forall(cuda(), 0, 1, [=] __device__ (int i) { + forall(gpu(), 0, 1, [=] __device__ (int i) { results[i] = rawArrayClass->getValue(i); }); @@ -430,7 +430,7 @@ CUDA_TEST(managed_ptr, cuda_class_with_raw_array) ASSERT_EQ(results[0], expectedValue); } 
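The tests above drive their loop bodies on the host or the device through the forall helper included from util/forall.hpp, which is not part of this patch series. The sketch below is a minimal illustration, under CUDA, of what such a helper could look like: the sequential and gpu tag types are the ones the tests construct, but the forall_kernel name, the launch configuration, and the body are illustrative assumptions rather than CHAI's actual implementation. Setting the execution space around the launch mirrors what pass_object_to_kernel earlier in this file does by hand before its raw kernel launch.

#include "chai/ArrayManager.hpp"
#include "chai/ExecutionSpaces.hpp"

// Minimal sketch of a forall helper (assumed names; not the real util/forall.hpp).
struct sequential {};
struct gpu {};

template <typename Body>
void forall(sequential, int begin, int end, Body body)
{
  // Tell CHAI that the body runs on the host.
  chai::ArrayManager::getInstance()->setExecutionSpace(chai::CPU);

  for (int i = begin; i < end; ++i) {
    body(i);
  }

  chai::ArrayManager::getInstance()->setExecutionSpace(chai::NONE);
}

#ifdef __CUDACC__
// One thread per index; invokes the device lambda with the global index.
template <typename Body>
__global__ void forall_kernel(int begin, int length, Body body)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;

  if (i < length) {
    body(begin + i);
  }
}

template <typename Body>
void forall(gpu, int begin, int end, Body body)
{
  // With the execution space set to GPU, the copy of the lambda (and of any
  // captured ManagedArray) made at kernel launch time is what lets CHAI
  // migrate data to the device.
  chai::ArrayManager::getInstance()->setExecutionSpace(chai::GPU);

  int length = end - begin;
  forall_kernel<<<(length + 255) / 256, 256>>>(begin, length, body);
  cudaDeviceSynchronize();

  chai::ArrayManager::getInstance()->setExecutionSpace(chai::NONE);
}
#endif

Keeping the launch and synchronization inside the helper is why the GPU_TEST bodies stay free of raw CUDA calls except where a test deliberately exercises them.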
-CUDA_TEST(managed_ptr, cuda_class_with_raw_array_and_callback) +GPU_TEST(managed_ptr, gpu_class_with_raw_array_and_callback) { const int expectedValue = rand(); @@ -464,7 +464,7 @@ CUDA_TEST(managed_ptr, cuda_class_with_raw_array_and_callback) chai::ManagedArray results(1, chai::GPU); - forall(cuda(), 0, 1, [=] __device__ (int i) { + forall(gpu(), 0, 1, [=] __device__ (int i) { results[i] = managedPointer->getValue(i); }); @@ -472,7 +472,7 @@ CUDA_TEST(managed_ptr, cuda_class_with_raw_array_and_callback) ASSERT_EQ(results[0], expectedValue); } -CUDA_TEST(managed_ptr, cuda_class_with_managed_array) +GPU_TEST(managed_ptr, gpu_class_with_managed_array) { const int expectedValue = rand(); @@ -486,7 +486,7 @@ CUDA_TEST(managed_ptr, cuda_class_with_managed_array) chai::ManagedArray results(1, chai::GPU); - forall(cuda(), 0, 1, [=] __device__ (int i) { + forall(gpu(), 0, 1, [=] __device__ (int i) { results[i] = derived->getValue(i); }); @@ -495,7 +495,7 @@ CUDA_TEST(managed_ptr, cuda_class_with_managed_array) ASSERT_EQ(results[0], expectedValue); } -CUDA_TEST(managed_ptr, cuda_class_with_raw_ptr) +GPU_TEST(managed_ptr, gpu_class_with_raw_ptr) { const int expectedValue = rand(); @@ -510,7 +510,7 @@ CUDA_TEST(managed_ptr, cuda_class_with_raw_ptr) chai::ManagedArray results(1, chai::GPU); - forall(cuda(), 0, 1, [=] __device__ (int i) { + forall(gpu(), 0, 1, [=] __device__ (int i) { results[i] = (*rawPointerClass).getValue(i); }); @@ -518,7 +518,7 @@ CUDA_TEST(managed_ptr, cuda_class_with_raw_ptr) ASSERT_EQ(results[0], expectedValue); } -CUDA_TEST(managed_ptr, cuda_class_with_managed_ptr) +GPU_TEST(managed_ptr, gpu_class_with_managed_ptr) { const int expectedValue = rand(); @@ -527,7 +527,7 @@ CUDA_TEST(managed_ptr, cuda_class_with_managed_ptr) chai::ManagedArray results(1, chai::GPU); - forall(cuda(), 0, 1, [=] __device__ (int i) { + forall(gpu(), 0, 1, [=] __device__ (int i) { results[i] = container.getValue(); }); @@ -535,7 +535,7 @@ CUDA_TEST(managed_ptr, cuda_class_with_managed_ptr) ASSERT_EQ(results[0], expectedValue); } -CUDA_TEST(managed_ptr, cuda_nested_managed_ptr) +GPU_TEST(managed_ptr, gpu_nested_managed_ptr) { const int expectedValue = rand(); @@ -544,7 +544,7 @@ CUDA_TEST(managed_ptr, cuda_nested_managed_ptr) chai::ManagedArray results(1, chai::GPU); - forall(cuda(), 0, 1, [=] __device__ (int i) { + forall(gpu(), 0, 1, [=] __device__ (int i) { results[i] = container->getValue(); }); @@ -552,7 +552,7 @@ CUDA_TEST(managed_ptr, cuda_nested_managed_ptr) ASSERT_EQ(results[0], expectedValue); } -CUDA_TEST(managed_ptr, cuda_multiple_inheritance) +GPU_TEST(managed_ptr, gpu_multiple_inheritance) { auto derived = chai::make_managed(); @@ -561,7 +561,7 @@ CUDA_TEST(managed_ptr, cuda_multiple_inheritance) chai::ManagedArray results(2, chai::GPU); - forall(cuda(), 0, 1, [=] __device__ (int i) { + forall(gpu(), 0, 1, [=] __device__ (int i) { results[i] = base1->isBase1(); results[1] = base2->isBase2(); }); @@ -573,7 +573,7 @@ CUDA_TEST(managed_ptr, cuda_multiple_inheritance) ASSERT_EQ(results[1], true); } -CUDA_TEST(managed_ptr, static_pointer_cast) +GPU_TEST(managed_ptr, static_pointer_cast) { const int expectedValue = rand(); @@ -589,7 +589,7 @@ CUDA_TEST(managed_ptr, static_pointer_cast) chai::ManagedArray results(3, chai::GPU); - forall(cuda(), 0, 1, [=] __device__ (int i) { + forall(gpu(), 0, 1, [=] __device__ (int i) { results[i] = derived->getValue(i); results[1] = base->getValue(i); results[2] = derivedFromBase->getValue(i); @@ -602,7 +602,7 @@ CUDA_TEST(managed_ptr, static_pointer_cast) 
ASSERT_EQ(results[2], expectedValue); } -CUDA_TEST(managed_ptr, dynamic_pointer_cast) +GPU_TEST(managed_ptr, dynamic_pointer_cast) { const int expectedValue = rand(); @@ -618,7 +618,7 @@ CUDA_TEST(managed_ptr, dynamic_pointer_cast) chai::ManagedArray results(3, chai::GPU); - forall(cuda(), 0, 1, [=] __device__ (int i) { + forall(gpu(), 0, 1, [=] __device__ (int i) { results[i] = derived->getValue(i); results[1] = base->getValue(i); results[2] = derivedFromBase->getValue(i); @@ -631,7 +631,7 @@ CUDA_TEST(managed_ptr, dynamic_pointer_cast) ASSERT_EQ(results[2], expectedValue); } -CUDA_TEST(managed_ptr, const_pointer_cast) +GPU_TEST(managed_ptr, const_pointer_cast) { const int expectedValue = rand(); @@ -647,7 +647,7 @@ CUDA_TEST(managed_ptr, const_pointer_cast) chai::ManagedArray results(3, chai::GPU); - forall(cuda(), 0, 1, [=] __device__ (int i) { + forall(gpu(), 0, 1, [=] __device__ (int i) { results[i] = derived->getValue(i); results[1] = constDerived->getValue(i); results[2] = derivedFromConst->getValue(i); @@ -660,7 +660,7 @@ CUDA_TEST(managed_ptr, const_pointer_cast) ASSERT_EQ(results[2], expectedValue); } -CUDA_TEST(managed_ptr, reinterpret_pointer_cast) +GPU_TEST(managed_ptr, reinterpret_pointer_cast) { const int expectedValue = rand(); @@ -676,7 +676,7 @@ CUDA_TEST(managed_ptr, reinterpret_pointer_cast) chai::ManagedArray results(3, chai::GPU); - forall(cuda(), 0, 1, [=] __device__ (int i) { + forall(gpu(), 0, 1, [=] __device__ (int i) { results[i] = derived->getValue(i); results[1] = base->getValue(i); results[2] = derivedFromBase->getValue(i); diff --git a/tests/unit/managed_ptr_unit_tests.cpp b/tests/unit/managed_ptr_unit_tests.cpp index 4bef0cef..f0fef889 100644 --- a/tests/unit/managed_ptr_unit_tests.cpp +++ b/tests/unit/managed_ptr_unit_tests.cpp @@ -42,10 +42,10 @@ // --------------------------------------------------------------------- #include "gtest/gtest.h" -#define CUDA_TEST(X, Y) \ - static void cuda_test_##X_##Y(); \ - TEST(X, Y) { cuda_test_##X_##Y(); } \ - static void cuda_test_##X_##Y() +#define GPU_TEST(X, Y) \ + static void gpu_test_##X_##Y(); \ + TEST(X, Y) { gpu_test_##X_##Y(); } \ + static void gpu_test_##X_##Y() #include "chai/config.hpp" #include "chai/ManagedArray.hpp" @@ -496,7 +496,7 @@ TEST(managed_ptr, reinterpret_pointer_cast) #ifdef __CUDACC__ -CUDA_TEST(managed_ptr, cuda_default_constructor) +GPU_TEST(managed_ptr, gpu_default_constructor) { chai::managed_ptr derived; chai::managed_ptr otherDerived; @@ -504,7 +504,7 @@ CUDA_TEST(managed_ptr, cuda_default_constructor) chai::ManagedArray array(1, chai::GPU); chai::ManagedArray array2(9, chai::GPU); - forall(cuda(), 0, 1, [=] __device__ (int i) { + forall(gpu(), 0, 1, [=] __device__ (int i) { array[i] = derived.get(); array2[0] = (bool) derived; array2[1] = derived == nullptr; @@ -532,7 +532,7 @@ CUDA_TEST(managed_ptr, cuda_default_constructor) EXPECT_FALSE(array2[8]); } -CUDA_TEST(managed_ptr, cuda_nullptr_constructor) +GPU_TEST(managed_ptr, gpu_nullptr_constructor) { chai::managed_ptr derived = nullptr; chai::managed_ptr otherDerived = nullptr; @@ -540,7 +540,7 @@ CUDA_TEST(managed_ptr, cuda_nullptr_constructor) chai::ManagedArray array(1, chai::GPU); chai::ManagedArray array2(9, chai::GPU); - forall(cuda(), 0, 1, [=] __device__ (int i) { + forall(gpu(), 0, 1, [=] __device__ (int i) { array[i] = derived.get(); array2[0] = (bool) derived; array2[1] = derived == nullptr; @@ -568,7 +568,7 @@ CUDA_TEST(managed_ptr, cuda_nullptr_constructor) EXPECT_FALSE(array2[8]); } -CUDA_TEST(managed_ptr, 
cuda_gpu_pointer_constructor) +GPU_TEST(managed_ptr, gpu_gpu_pointer_constructor) { TestDerived* gpuPointer = chai::detail::make_on_device(3); chai::managed_ptr derived({chai::GPU}, {gpuPointer}); @@ -584,7 +584,7 @@ CUDA_TEST(managed_ptr, cuda_gpu_pointer_constructor) chai::ManagedArray array2(1, chai::GPU); chai::ManagedArray array3(5, chai::GPU); - forall(cuda(), 0, 1, [=] __device__ (int i) { + forall(gpu(), 0, 1, [=] __device__ (int i) { array1[i] = derived->getValue(); array2[i] = derived.get(); array3[0] = (bool) derived; @@ -607,7 +607,7 @@ CUDA_TEST(managed_ptr, cuda_gpu_pointer_constructor) EXPECT_TRUE(array3[4]); } -CUDA_TEST(managed_ptr, cuda_new_and_delete_on_device) +GPU_TEST(managed_ptr, gpu_new_and_delete_on_device) { // Initialize host side memory to hold a pointer Simple** cpuPointerHolder = (Simple**) malloc(sizeof(Simple*)); @@ -654,7 +654,7 @@ CUDA_TEST(managed_ptr, cuda_new_and_delete_on_device) cudaFree(gpuPointerHolder2); } -CUDA_TEST(managed_ptr, cuda_new_and_delete_on_device_2) +GPU_TEST(managed_ptr, gpu_new_and_delete_on_device_2) { // Initialize host side memory to hold a pointer Simple** cpuPointerHolder = (Simple**) malloc(sizeof(Simple*)); @@ -683,7 +683,7 @@ CUDA_TEST(managed_ptr, cuda_new_and_delete_on_device_2) chai::managed_ptr test({chai::GPU}, {gpuPointer}); } -CUDA_TEST(managed_ptr, simple_cuda_cpu_and_gpu_pointer_constructor) +GPU_TEST(managed_ptr, simple_cuda_cpu_and_gpu_pointer_constructor) { Simple* gpuPointer = chai::detail::make_on_device(3); Simple* cpuPointer = new Simple(4); @@ -694,7 +694,7 @@ CUDA_TEST(managed_ptr, simple_cuda_cpu_and_gpu_pointer_constructor) chai::ManagedArray array1(1, chai::GPU); - forall(cuda(), 0, 1, [=] __device__ (int i) { + forall(gpu(), 0, 1, [=] __device__ (int i) { array1[i] = simple->getValue(); }); @@ -705,7 +705,7 @@ CUDA_TEST(managed_ptr, simple_cuda_cpu_and_gpu_pointer_constructor) EXPECT_EQ(array1[0], 3); } -CUDA_TEST(managed_ptr, cuda_cpu_and_gpu_pointer_constructor) +GPU_TEST(managed_ptr, gpu_cpu_and_gpu_pointer_constructor) { TestDerived* gpuPointer = chai::detail::make_on_device(3); TestDerived* cpuPointer = new TestDerived(4); @@ -724,7 +724,7 @@ CUDA_TEST(managed_ptr, cuda_cpu_and_gpu_pointer_constructor) chai::ManagedArray array2(1, chai::GPU); chai::ManagedArray array3(5, chai::GPU); - forall(cuda(), 0, 1, [=] __device__ (int i) { + forall(gpu(), 0, 1, [=] __device__ (int i) { array1[i] = derived->getValue(); array2[i] = derived.get(); array3[0] = (bool) derived; @@ -747,7 +747,7 @@ CUDA_TEST(managed_ptr, cuda_cpu_and_gpu_pointer_constructor) EXPECT_TRUE(array3[4]); } -CUDA_TEST(managed_ptr, cuda_make_managed) +GPU_TEST(managed_ptr, gpu_make_managed) { const int expectedValue = rand(); auto derived = chai::make_managed(expectedValue); @@ -756,7 +756,7 @@ CUDA_TEST(managed_ptr, cuda_make_managed) chai::ManagedArray array2(1, chai::GPU); chai::ManagedArray array3(7, chai::GPU); - forall(cuda(), 0, 1, [=] __device__ (int i) { + forall(gpu(), 0, 1, [=] __device__ (int i) { array[i] = derived->getValue(); array2[i] = derived.get(); array3[0] = (bool) derived; @@ -780,7 +780,7 @@ CUDA_TEST(managed_ptr, cuda_make_managed) EXPECT_TRUE(array3[4]); } -CUDA_TEST(managed_ptr, make_managed_from_factory_function) +GPU_TEST(managed_ptr, make_managed_from_factory_function) { const int expectedValue = rand(); @@ -800,7 +800,7 @@ CUDA_TEST(managed_ptr, make_managed_from_factory_function) EXPECT_TRUE(nullptr != derived); } -CUDA_TEST(managed_ptr, make_managed_from_factory_lambda) +GPU_TEST(managed_ptr, 
make_managed_from_factory_lambda) { const int expectedValue = rand(); @@ -820,7 +820,7 @@ CUDA_TEST(managed_ptr, make_managed_from_factory_lambda) EXPECT_TRUE(nullptr != derived); } -CUDA_TEST(managed_ptr, make_managed_from_overloaded_factory_function) +GPU_TEST(managed_ptr, make_managed_from_overloaded_factory_function) { const int expectedValue = rand(); @@ -840,7 +840,7 @@ CUDA_TEST(managed_ptr, make_managed_from_overloaded_factory_function) EXPECT_TRUE(nullptr != derived); } -CUDA_TEST(managed_ptr, make_managed_from_factory_static_member_function) +GPU_TEST(managed_ptr, make_managed_from_factory_static_member_function) { const int expectedValue = rand(); @@ -860,7 +860,7 @@ CUDA_TEST(managed_ptr, make_managed_from_factory_static_member_function) EXPECT_TRUE(nullptr != derived); } -CUDA_TEST(managed_ptr, cuda_copy_constructor) +GPU_TEST(managed_ptr, gpu_copy_constructor) { const int expectedValue = rand(); auto derived = chai::make_managed(expectedValue); @@ -870,7 +870,7 @@ CUDA_TEST(managed_ptr, cuda_copy_constructor) chai::ManagedArray array2(2, chai::GPU); chai::ManagedArray array3(14, chai::GPU); - forall(cuda(), 0, 1, [=] __device__ (int i) { + forall(gpu(), 0, 1, [=] __device__ (int i) { array[i] = derived->getValue(); array2[0] = derived.get(); array3[0] = (bool) derived; @@ -918,7 +918,7 @@ CUDA_TEST(managed_ptr, cuda_copy_constructor) EXPECT_FALSE(array3[13]); } -CUDA_TEST(managed_ptr, cuda_converting_constructor) +GPU_TEST(managed_ptr, gpu_converting_constructor) { const int expectedValue = rand(); auto derived = chai::make_managed(expectedValue); @@ -928,7 +928,7 @@ CUDA_TEST(managed_ptr, cuda_converting_constructor) chai::ManagedArray array2(2, chai::GPU); chai::ManagedArray array3(14, chai::GPU); - forall(cuda(), 0, 1, [=] __device__ (int i) { + forall(gpu(), 0, 1, [=] __device__ (int i) { array[i] = derived->getValue(); array2[0] = derived.get(); array3[0] = (bool) derived; @@ -976,7 +976,7 @@ CUDA_TEST(managed_ptr, cuda_converting_constructor) EXPECT_FALSE(array3[13]); } -CUDA_TEST(managed_ptr, cuda_copy_assignment_operator) +GPU_TEST(managed_ptr, gpu_copy_assignment_operator) { const int expectedValue = rand(); auto derived = chai::make_managed(expectedValue); @@ -987,7 +987,7 @@ CUDA_TEST(managed_ptr, cuda_copy_assignment_operator) chai::ManagedArray array2(2, chai::GPU); chai::ManagedArray array3(14, chai::GPU); - forall(cuda(), 0, 1, [=] __device__ (int i) { + forall(gpu(), 0, 1, [=] __device__ (int i) { array[i] = derived->getValue(); array2[0] = derived.get(); array3[0] = (bool) derived; From 6893e8206a02feeb0756f98a32a651666fd3fa43 Mon Sep 17 00:00:00 2001 From: Alan Dayton Date: Tue, 1 Oct 2019 15:17:22 -0700 Subject: [PATCH 28/58] Fix managed_ptr unit tests in the cuda build --- tests/unit/managed_ptr_unit_tests.cpp | 69 +++++++++++++++++++-------- 1 file changed, 49 insertions(+), 20 deletions(-) diff --git a/tests/unit/managed_ptr_unit_tests.cpp b/tests/unit/managed_ptr_unit_tests.cpp index f0fef889..2dd12b9a 100644 --- a/tests/unit/managed_ptr_unit_tests.cpp +++ b/tests/unit/managed_ptr_unit_tests.cpp @@ -530,6 +530,9 @@ GPU_TEST(managed_ptr, gpu_default_constructor) EXPECT_TRUE(array2[6]); EXPECT_FALSE(array2[7]); EXPECT_FALSE(array2[8]); + + array.free(); + array2.free(); } GPU_TEST(managed_ptr, gpu_nullptr_constructor) @@ -566,9 +569,12 @@ GPU_TEST(managed_ptr, gpu_nullptr_constructor) EXPECT_TRUE(array2[6]); EXPECT_FALSE(array2[7]); EXPECT_FALSE(array2[8]); + + array.free(); + array2.free(); } -GPU_TEST(managed_ptr, gpu_gpu_pointer_constructor) 
+GPU_TEST(managed_ptr, gpu_pointer_constructor) { TestDerived* gpuPointer = chai::detail::make_on_device(3); chai::managed_ptr derived({chai::GPU}, {gpuPointer}); @@ -605,6 +611,10 @@ GPU_TEST(managed_ptr, gpu_gpu_pointer_constructor) EXPECT_FALSE(array3[2]); EXPECT_TRUE(array3[3]); EXPECT_TRUE(array3[4]); + + array1.free(); + array2.free(); + array3.free(); } GPU_TEST(managed_ptr, gpu_new_and_delete_on_device) @@ -633,25 +643,7 @@ GPU_TEST(managed_ptr, gpu_new_and_delete_on_device) // Free host side memory free(cpuPointerHolder); - // Initialize more host side memory - Simple** cpuPointerHolder2 = (Simple**) malloc(sizeof(Simple*)); - cpuPointerHolder2[0] = gpuPointer; - - // Initialize more device side memory - Simple** gpuPointerHolder2 = nullptr; - cudaMalloc(&gpuPointerHolder2, sizeof(Simple*)); - - // Copy pointer back to the device - cudaMemcpy(gpuPointerHolder2, cpuPointerHolder2, sizeof(Simple*), - cudaMemcpyHostToDevice); - - chai::detail::destroy_on_device<<<1, 1>>>(gpuPointerHolder2); - - // Free host memory - free(cpuPointerHolder2); - - // Free device memory - cudaFree(gpuPointerHolder2); + chai::detail::destroy_on_device<<<1, 1>>>(gpuPointer); } GPU_TEST(managed_ptr, gpu_new_and_delete_on_device_2) @@ -681,6 +673,7 @@ GPU_TEST(managed_ptr, gpu_new_and_delete_on_device_2) free(cpuPointerHolder); chai::managed_ptr test({chai::GPU}, {gpuPointer}); + test.free(); } GPU_TEST(managed_ptr, simple_cuda_cpu_and_gpu_pointer_constructor) @@ -703,6 +696,9 @@ GPU_TEST(managed_ptr, simple_cuda_cpu_and_gpu_pointer_constructor) cudaDeviceSynchronize(); EXPECT_EQ(array1[0], 3); + + array1.free(); + simple.free(); } GPU_TEST(managed_ptr, gpu_cpu_and_gpu_pointer_constructor) @@ -745,6 +741,11 @@ GPU_TEST(managed_ptr, gpu_cpu_and_gpu_pointer_constructor) EXPECT_FALSE(array3[2]); EXPECT_TRUE(array3[3]); EXPECT_TRUE(array3[4]); + + array1.free(); + array2.free(); + array3.free(); + derived.free(); } GPU_TEST(managed_ptr, gpu_make_managed) @@ -778,6 +779,11 @@ GPU_TEST(managed_ptr, gpu_make_managed) EXPECT_FALSE(array3[2]); EXPECT_TRUE(array3[3]); EXPECT_TRUE(array3[4]); + + array.free(); + array2.free(); + array3.free(); + derived.free(); } GPU_TEST(managed_ptr, make_managed_from_factory_function) @@ -798,6 +804,8 @@ GPU_TEST(managed_ptr, make_managed_from_factory_function) EXPECT_FALSE(nullptr == derived); EXPECT_TRUE(derived != nullptr); EXPECT_TRUE(nullptr != derived); + + derived.free(); } GPU_TEST(managed_ptr, make_managed_from_factory_lambda) @@ -818,6 +826,8 @@ GPU_TEST(managed_ptr, make_managed_from_factory_lambda) EXPECT_FALSE(nullptr == derived); EXPECT_TRUE(derived != nullptr); EXPECT_TRUE(nullptr != derived); + + derived.free(); } GPU_TEST(managed_ptr, make_managed_from_overloaded_factory_function) @@ -838,6 +848,8 @@ GPU_TEST(managed_ptr, make_managed_from_overloaded_factory_function) EXPECT_FALSE(nullptr == derived); EXPECT_TRUE(derived != nullptr); EXPECT_TRUE(nullptr != derived); + + derived.free(); } GPU_TEST(managed_ptr, make_managed_from_factory_static_member_function) @@ -858,6 +870,8 @@ GPU_TEST(managed_ptr, make_managed_from_factory_static_member_function) EXPECT_FALSE(nullptr == derived); EXPECT_TRUE(derived != nullptr); EXPECT_TRUE(nullptr != derived); + + derived.free(); } GPU_TEST(managed_ptr, gpu_copy_constructor) @@ -916,6 +930,11 @@ GPU_TEST(managed_ptr, gpu_copy_constructor) EXPECT_TRUE(array3[11]); EXPECT_TRUE(array3[12]); EXPECT_FALSE(array3[13]); + + array.free(); + array2.free(); + array3.free(); + otherDerived.free(); } GPU_TEST(managed_ptr, 
gpu_converting_constructor) @@ -974,6 +993,11 @@ GPU_TEST(managed_ptr, gpu_converting_constructor) EXPECT_TRUE(array3[11]); EXPECT_TRUE(array3[12]); EXPECT_FALSE(array3[13]); + + array.free(); + array2.free(); + array3.free(); + derived.free(); } GPU_TEST(managed_ptr, gpu_copy_assignment_operator) @@ -1033,6 +1057,11 @@ GPU_TEST(managed_ptr, gpu_copy_assignment_operator) EXPECT_TRUE(array3[11]); EXPECT_TRUE(array3[12]); EXPECT_FALSE(array3[13]); + + array.free(); + array2.free(); + array3.free(); + otherDerived.free(); } #endif From 1e8c44c11802e4f0a9e1946eaebdf02407926696 Mon Sep 17 00:00:00 2001 From: Alan Dayton Date: Tue, 1 Oct 2019 15:25:21 -0700 Subject: [PATCH 29/58] Fix memory leaks in integration tests --- tests/integration/managed_ptr_tests.cpp | 44 +++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/tests/integration/managed_ptr_tests.cpp b/tests/integration/managed_ptr_tests.cpp index 94982121..b27ea4c0 100644 --- a/tests/integration/managed_ptr_tests.cpp +++ b/tests/integration/managed_ptr_tests.cpp @@ -407,6 +407,8 @@ GPU_TEST(managed_ptr, pass_object_to_kernel) array.move(chai::CPU); cudaDeviceSynchronize(); ASSERT_EQ(array[0], -1); + + array.free(); } GPU_TEST(managed_ptr, gpu_class_with_raw_array) @@ -428,6 +430,10 @@ GPU_TEST(managed_ptr, gpu_class_with_raw_array) results.move(chai::CPU); ASSERT_EQ(results[0], expectedValue); + + array.free(); + rawArrayClass.free(); + results.free(); } GPU_TEST(managed_ptr, gpu_class_with_raw_array_and_callback) @@ -470,6 +476,9 @@ GPU_TEST(managed_ptr, gpu_class_with_raw_array_and_callback) results.move(chai::CPU); ASSERT_EQ(results[0], expectedValue); + + results.free(); + managedPointer.free(); } GPU_TEST(managed_ptr, gpu_class_with_managed_array) @@ -493,6 +502,10 @@ GPU_TEST(managed_ptr, gpu_class_with_managed_array) results.move(chai::CPU); ASSERT_EQ(results[0], expectedValue); + + array.free(); + derived.free(); + results.free(); } GPU_TEST(managed_ptr, gpu_class_with_raw_ptr) @@ -516,6 +529,11 @@ GPU_TEST(managed_ptr, gpu_class_with_raw_ptr) results.move(chai::CPU); ASSERT_EQ(results[0], expectedValue); + + array.free(); + rawArrayClass.free(); + rawPointerClass.free(); + results.free(); } GPU_TEST(managed_ptr, gpu_class_with_managed_ptr) @@ -533,6 +551,9 @@ GPU_TEST(managed_ptr, gpu_class_with_managed_ptr) results.move(chai::CPU); ASSERT_EQ(results[0], expectedValue); + + derived.free(); + results.free(); } GPU_TEST(managed_ptr, gpu_nested_managed_ptr) @@ -550,6 +571,10 @@ GPU_TEST(managed_ptr, gpu_nested_managed_ptr) results.move(chai::CPU); ASSERT_EQ(results[0], expectedValue); + + derived.free(); + container.free(); + results.free(); } GPU_TEST(managed_ptr, gpu_multiple_inheritance) @@ -571,6 +596,9 @@ GPU_TEST(managed_ptr, gpu_multiple_inheritance) ASSERT_EQ(results[0], true); ASSERT_EQ(results[1], true); + + derived.free(); + results.free(); } GPU_TEST(managed_ptr, static_pointer_cast) @@ -600,6 +628,10 @@ GPU_TEST(managed_ptr, static_pointer_cast) ASSERT_EQ(results[0], expectedValue); ASSERT_EQ(results[1], expectedValue); ASSERT_EQ(results[2], expectedValue); + + array.free(); + derived.free(); + results.free(); } GPU_TEST(managed_ptr, dynamic_pointer_cast) @@ -629,6 +661,10 @@ GPU_TEST(managed_ptr, dynamic_pointer_cast) ASSERT_EQ(results[0], expectedValue); ASSERT_EQ(results[1], expectedValue); ASSERT_EQ(results[2], expectedValue); + + array.free(); + base.free(); + results.free(); } GPU_TEST(managed_ptr, const_pointer_cast) @@ -658,6 +694,10 @@ GPU_TEST(managed_ptr, const_pointer_cast) 
ASSERT_EQ(results[0], expectedValue); ASSERT_EQ(results[1], expectedValue); ASSERT_EQ(results[2], expectedValue); + + array.free(); + derivedFromConst.free(); + results.free(); } GPU_TEST(managed_ptr, reinterpret_pointer_cast) @@ -687,6 +727,10 @@ GPU_TEST(managed_ptr, reinterpret_pointer_cast) ASSERT_EQ(results[0], expectedValue); ASSERT_EQ(results[1], expectedValue); ASSERT_EQ(results[2], expectedValue); + + array.free(); + derived.free(); + results.free(); } #endif From 18a88009be4644374f8e8978c9fb732eccc722c1 Mon Sep 17 00:00:00 2001 From: Alan Dayton Date: Tue, 1 Oct 2019 15:27:33 -0700 Subject: [PATCH 30/58] Remove unnecessary synchronizes --- tests/integration/managed_ptr_tests.cpp | 5 ----- tests/unit/managed_ptr_unit_tests.cpp | 2 -- 2 files changed, 7 deletions(-) diff --git a/tests/integration/managed_ptr_tests.cpp b/tests/integration/managed_ptr_tests.cpp index b27ea4c0..36036d96 100644 --- a/tests/integration/managed_ptr_tests.cpp +++ b/tests/integration/managed_ptr_tests.cpp @@ -320,12 +320,10 @@ GPU_TEST(managed_ptr, make_on_device) deviceNew<<<1, 1>>>(deviceArray); cudaMemcpy(hostArray, deviceArray, sizeof(int*), cudaMemcpyDeviceToHost); - cudaDeviceSynchronize(); cudaMemcpy(deviceArray2, hostArray, sizeof(int*), cudaMemcpyHostToDevice); ASSERT_NE(hostArray[0], nullptr); deviceDelete<<<1, 1>>>(deviceArray2); - cudaDeviceSynchronize(); free(hostArray); cudaFree(deviceArray); cudaFree(deviceArray2); @@ -403,9 +401,7 @@ GPU_TEST(managed_ptr, pass_object_to_kernel) chai::ArrayManager* manager = chai::ArrayManager::getInstance(); manager->setExecutionSpace(chai::GPU); passObjectToKernel<<<1, 1>>>(array); - cudaDeviceSynchronize(); array.move(chai::CPU); - cudaDeviceSynchronize(); ASSERT_EQ(array[0], -1); array.free(); @@ -592,7 +588,6 @@ GPU_TEST(managed_ptr, gpu_multiple_inheritance) }); results.move(chai::CPU); - cudaDeviceSynchronize(); ASSERT_EQ(results[0], true); ASSERT_EQ(results[1], true); diff --git a/tests/unit/managed_ptr_unit_tests.cpp b/tests/unit/managed_ptr_unit_tests.cpp index 2dd12b9a..721e9b40 100644 --- a/tests/unit/managed_ptr_unit_tests.cpp +++ b/tests/unit/managed_ptr_unit_tests.cpp @@ -693,8 +693,6 @@ GPU_TEST(managed_ptr, simple_cuda_cpu_and_gpu_pointer_constructor) array1.move(chai::CPU); - cudaDeviceSynchronize(); - EXPECT_EQ(array1[0], 3); array1.free(); From 87676b4f8d0e2ebbe662706c173a77abccd33101 Mon Sep 17 00:00:00 2001 From: Alan Dayton Date: Wed, 9 Oct 2019 17:15:26 -0700 Subject: [PATCH 31/58] Add benchmarks --- benchmarks/CMakeLists.txt | 10 ++ benchmarks/chai_managed_ptr_benchmarks.cpp | 111 +++++++++++++++++++++ 2 files changed, 121 insertions(+) create mode 100644 benchmarks/chai_managed_ptr_benchmarks.cpp diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 6795f860..726cbf77 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -73,3 +73,13 @@ blt_add_executable( blt_add_benchmark( NAME managedarray_benchmarks COMMAND managedarray_benchmarks) + +blt_add_executable( + NAME managed_ptr_benchmarks + SOURCES chai_managed_ptr_benchmarks.cpp + DEPENDS_ON ${chai_benchmark_depends}) + +blt_add_benchmark( + NAME managed_ptr_benchmarks + COMMAND managed_ptr_benchmarks) + diff --git a/benchmarks/chai_managed_ptr_benchmarks.cpp b/benchmarks/chai_managed_ptr_benchmarks.cpp new file mode 100644 index 00000000..e9e603ec --- /dev/null +++ b/benchmarks/chai_managed_ptr_benchmarks.cpp @@ -0,0 +1,111 @@ +// --------------------------------------------------------------------- +// Copyright (c) 2016-2018, Lawrence 
Livermore National Security, LLC. All +// rights reserved. +// +// Produced at the Lawrence Livermore National Laboratory. +// +// This file is part of CHAI. +// +// LLNL-CODE-705877 +// +// For details, see https:://github.com/LLNL/CHAI +// Please also see the NOTICE and LICENSE files. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// +// - Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// - Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the +// distribution. +// +// - Neither the name of the LLNS/LLNL nor the names of its contributors +// may be used to endorse or promote products derived from this +// software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS +// OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED +// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY +// WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. 
+// --------------------------------------------------------------------- +#include + +#include "benchmark/benchmark_api.h" + +#include "chai/config.hpp" +#include "chai/managed_ptr.hpp" + +#include "../src/util/forall.hpp" + +class Base { + public: + CHAI_HOST_DEVICE virtual int getValue() const = 0; +}; + +class Derived : public Base { + public: + CHAI_HOST_DEVICE Derived(int value) : Base(), m_value(value) {} + + CHAI_HOST_DEVICE int getValue() const override { return m_value; } + + private: + int m_value = -1; +}; + +void benchmark_managed_ptr_construction_and_destruction(benchmark::State& state) +{ + while (state.KeepRunning()) { + chai::managed_ptr temp = chai::make_managed(state.range(0)); + temp.free(); + } + + state.SetItemsProcessed(state.iterations()); +} + +BENCHMARK(benchmark_managed_ptr_construction_and_destruction)->Range(1, 1); + +static chai::managed_ptr helper1 = chai::make_managed(1); + +void benchmark_managed_ptr_use_cpu(benchmark::State& state) +{ + while (state.KeepRunning()) { + auto helper = helper1; + forall(sequential(), 0, 1, [=] (int i) { (void) helper->getValue(); }); + } + + state.SetItemsProcessed(state.iterations()); +} + +BENCHMARK(benchmark_managed_ptr_use_cpu)->Range(1, 1); + +#if defined(CHAI_ENABLE_CUDA) || defined(CHAI_ENABLE_HIP) + +static chai::managed_ptr helper2 = chai::make_managed(2); + +void benchmark_managed_ptr_use_gpu(benchmark::State& state) +{ + while (state.KeepRunning()) { + auto helper = helper2; + forall(gpu(), 0, 1, [=] __device__ (int i) { (void) helper->getValue(); }); + } + + state.SetItemsProcessed(state.iterations()); +} + +BENCHMARK(benchmark_managed_ptr_use_gpu)->Range(1, 1); + +#endif + +BENCHMARK_MAIN(); From 7bf07616e91e88e432f322d0599c377db351766d Mon Sep 17 00:00:00 2001 From: Alan Dayton Date: Thu, 10 Oct 2019 09:55:46 -0700 Subject: [PATCH 32/58] Add comparison benchmarks for managed_ptr --- benchmarks/chai_managed_ptr_benchmarks.cpp | 88 ++++++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/benchmarks/chai_managed_ptr_benchmarks.cpp b/benchmarks/chai_managed_ptr_benchmarks.cpp index e9e603ec..a36aa8b8 100644 --- a/benchmarks/chai_managed_ptr_benchmarks.cpp +++ b/benchmarks/chai_managed_ptr_benchmarks.cpp @@ -64,6 +64,34 @@ class Derived : public Base { int m_value = -1; }; +template +class BaseCRTP { + public: + CHAI_HOST_DEVICE int getValue() const { + return static_cast(this)->getValue(); + } +}; + +class DerivedCRTP : public BaseCRTP { + public: + CHAI_HOST_DEVICE DerivedCRTP(int value) : BaseCRTP(), m_value(value) {} + + CHAI_HOST_DEVICE int getValue() const { return m_value; } + + private: + int m_value = -1; +}; + +class NoInheritance { + public: + CHAI_HOST_DEVICE NoInheritance(int value) : m_value(value) {} + + CHAI_HOST_DEVICE int getValue() const { return m_value; } + + private: + int m_value = -1; +}; + void benchmark_managed_ptr_construction_and_destruction(benchmark::State& state) { while (state.KeepRunning()) { @@ -90,6 +118,36 @@ void benchmark_managed_ptr_use_cpu(benchmark::State& state) BENCHMARK(benchmark_managed_ptr_use_cpu)->Range(1, 1); +// Curiously recurring template pattern +static BaseCRTP* derivedCRTP = new DerivedCRTP(3); + +void benchmark_curiously_recurring_template_pattern_cpu(benchmark::State& state) +{ + while (state.KeepRunning()) { + auto helper = derivedCRTP; + forall(sequential(), 0, 1, [=] (int i) { (void) helper->getValue(); }); + } + + state.SetItemsProcessed(state.iterations()); +} + +BENCHMARK(benchmark_curiously_recurring_template_pattern_cpu)->Range(1, 1); + +// 
Class without inheritance +static NoInheritance* noInheritance = new NoInheritance(5); + +void benchmark_no_inheritance_cpu(benchmark::State& state) +{ + while (state.KeepRunning()) { + auto helper = noInheritance; + forall(sequential(), 0, 1, [=] (int i) { (void) helper->getValue(); }); + } + + state.SetItemsProcessed(state.iterations()); +} + +BENCHMARK(benchmark_no_inheritance_cpu)->Range(1, 1); + #if defined(CHAI_ENABLE_CUDA) || defined(CHAI_ENABLE_HIP) static chai::managed_ptr helper2 = chai::make_managed(2); @@ -106,6 +164,36 @@ void benchmark_managed_ptr_use_gpu(benchmark::State& state) BENCHMARK(benchmark_managed_ptr_use_gpu)->Range(1, 1); +// Curiously recurring template pattern +static BaseCRTP* derivedCRTP2 = new DerivedCRTP(4); + +void benchmark_curiously_recurring_template_pattern_gpu(benchmark::State& state) +{ + while (state.KeepRunning()) { + auto helper = *derivedCRTP2; + forall(gpu(), 0, 1, [=] __device__ (int i) { (void) helper.getValue(); }); + } + + state.SetItemsProcessed(state.iterations()); +} + +BENCHMARK(benchmark_curiously_recurring_template_pattern_gpu)->Range(1, 1); + +// Class without inheritance +static NoInheritance* noInheritance2 = new NoInheritance(5); + +void benchmark_no_inheritance_gpu(benchmark::State& state) +{ + while (state.KeepRunning()) { + auto helper = *noInheritance2; + forall(gpu(), 0, 1, [=] __device__ (int i) { (void) helper.getValue(); }); + } + + state.SetItemsProcessed(state.iterations()); +} + +BENCHMARK(benchmark_no_inheritance_gpu)->Range(1, 1); + #endif BENCHMARK_MAIN(); From 795947c53d947f36256640b22813cfc56c68b70c Mon Sep 17 00:00:00 2001 From: Alan Dayton Date: Thu, 10 Oct 2019 14:36:15 -0700 Subject: [PATCH 33/58] Add a lot more benchmarks to compare different approaches --- benchmarks/chai_managed_ptr_benchmarks.cpp | 555 +++++++++++++++++++-- 1 file changed, 517 insertions(+), 38 deletions(-) diff --git a/benchmarks/chai_managed_ptr_benchmarks.cpp b/benchmarks/chai_managed_ptr_benchmarks.cpp index a36aa8b8..3cdaaacf 100644 --- a/benchmarks/chai_managed_ptr_benchmarks.cpp +++ b/benchmarks/chai_managed_ptr_benchmarks.cpp @@ -42,7 +42,7 @@ // --------------------------------------------------------------------- #include -#include "benchmark/benchmark_api.h" +#include "benchmark/benchmark.h" #include "chai/config.hpp" #include "chai/managed_ptr.hpp" @@ -92,107 +92,586 @@ class NoInheritance { int m_value = -1; }; -void benchmark_managed_ptr_construction_and_destruction(benchmark::State& state) +template +class ClassWithSize { + private: + char m_values[N]; +}; + +static void benchmark_managed_ptr_construction_and_destruction(benchmark::State& state) { while (state.KeepRunning()) { - chai::managed_ptr temp = chai::make_managed(state.range(0)); + chai::managed_ptr temp = chai::make_managed(1); temp.free(); } - - state.SetItemsProcessed(state.iterations()); } -BENCHMARK(benchmark_managed_ptr_construction_and_destruction)->Range(1, 1); +BENCHMARK(benchmark_managed_ptr_construction_and_destruction); -static chai::managed_ptr helper1 = chai::make_managed(1); - -void benchmark_managed_ptr_use_cpu(benchmark::State& state) +// managed_ptr +static void benchmark_managed_ptr_use_cpu(benchmark::State& state) { + chai::managed_ptr helper = chai::make_managed(1); + while (state.KeepRunning()) { - auto helper = helper1; forall(sequential(), 0, 1, [=] (int i) { (void) helper->getValue(); }); } - state.SetItemsProcessed(state.iterations()); + helper.free(); } -BENCHMARK(benchmark_managed_ptr_use_cpu)->Range(1, 1); 
+BENCHMARK(benchmark_managed_ptr_use_cpu); // Curiously recurring template pattern -static BaseCRTP* derivedCRTP = new DerivedCRTP(3); - -void benchmark_curiously_recurring_template_pattern_cpu(benchmark::State& state) +static void benchmark_curiously_recurring_template_pattern_cpu(benchmark::State& state) { + BaseCRTP* helper = new DerivedCRTP(3); + while (state.KeepRunning()) { - auto helper = derivedCRTP; forall(sequential(), 0, 1, [=] (int i) { (void) helper->getValue(); }); } - state.SetItemsProcessed(state.iterations()); + delete helper; } -BENCHMARK(benchmark_curiously_recurring_template_pattern_cpu)->Range(1, 1); +BENCHMARK(benchmark_curiously_recurring_template_pattern_cpu); // Class without inheritance -static NoInheritance* noInheritance = new NoInheritance(5); - -void benchmark_no_inheritance_cpu(benchmark::State& state) +static void benchmark_no_inheritance_cpu(benchmark::State& state) { + NoInheritance* helper = new NoInheritance(5); + while (state.KeepRunning()) { - auto helper = noInheritance; forall(sequential(), 0, 1, [=] (int i) { (void) helper->getValue(); }); } - state.SetItemsProcessed(state.iterations()); + delete helper; } -BENCHMARK(benchmark_no_inheritance_cpu)->Range(1, 1); +BENCHMARK(benchmark_no_inheritance_cpu); #if defined(CHAI_ENABLE_CUDA) || defined(CHAI_ENABLE_HIP) -static chai::managed_ptr helper2 = chai::make_managed(2); +template +__global__ void copy_kernel(ClassWithSize) {} + +// Benchmark how long it takes to copy a class to the GPU +void benchmark_pass_copy_to_gpu_8(benchmark::State& state) +{ + ClassWithSize<8> helper; + + while (state.KeepRunning()) { + copy_kernel<<<1, 1>>>(helper); + cudaDeviceSynchronize(); + } +} + +BENCHMARK(benchmark_pass_copy_to_gpu_8); + +void benchmark_pass_copy_to_gpu_64(benchmark::State& state) +{ + ClassWithSize<64> helper; + + while (state.KeepRunning()) { + copy_kernel<<<1, 1>>>(helper); + cudaDeviceSynchronize(); + } +} + +BENCHMARK(benchmark_pass_copy_to_gpu_64); + +void benchmark_pass_copy_to_gpu_512(benchmark::State& state) +{ + ClassWithSize<512> helper; + + while (state.KeepRunning()) { + copy_kernel<<<1, 1>>>(helper); + cudaDeviceSynchronize(); + } +} + +BENCHMARK(benchmark_pass_copy_to_gpu_512); + +void benchmark_pass_copy_to_gpu_4096(benchmark::State& state) +{ + ClassWithSize<4096> helper; + + while (state.KeepRunning()) { + copy_kernel<<<1, 1>>>(helper); + cudaDeviceSynchronize(); + } +} + +BENCHMARK(benchmark_pass_copy_to_gpu_4096); void benchmark_managed_ptr_use_gpu(benchmark::State& state) { + chai::managed_ptr helper = chai::make_managed(2); + while (state.KeepRunning()) { - auto helper = helper2; forall(gpu(), 0, 1, [=] __device__ (int i) { (void) helper->getValue(); }); } - state.SetItemsProcessed(state.iterations()); + helper.free(); } -BENCHMARK(benchmark_managed_ptr_use_gpu)->Range(1, 1); +// Benchmark how long it takes to call placement new on the GPU +template +__global__ void placement_new_kernel(ClassWithSize* address) { + (void) new(address) ClassWithSize(); +} -// Curiously recurring template pattern -static BaseCRTP* derivedCRTP2 = new DerivedCRTP(4); +template +__global__ void placement_delete_kernel(ClassWithSize* address) { + address->~ClassWithSize(); +} + +void benchmark_placement_new_on_gpu_8(benchmark::State& state) +{ + while (state.KeepRunning()) { + ClassWithSize<8>* address; + cudaMalloc(&address, sizeof(ClassWithSize<8>)); + placement_new_kernel<<<1, 1>>>(address); + placement_delete_kernel<<<1, 1>>>(address); + cudaFree(address); + cudaDeviceSynchronize(); + } +} + 
+BENCHMARK(benchmark_placement_new_on_gpu_8); + +void benchmark_placement_new_on_gpu_64(benchmark::State& state) +{ + while (state.KeepRunning()) { + ClassWithSize<64>* address; + cudaMalloc(&address, sizeof(ClassWithSize<64>)); + placement_new_kernel<<<1, 1>>>(address); + placement_delete_kernel<<<1, 1>>>(address); + cudaFree(address); + cudaDeviceSynchronize(); + } +} + +BENCHMARK(benchmark_placement_new_on_gpu_64); + +void benchmark_placement_new_on_gpu_512(benchmark::State& state) +{ + while (state.KeepRunning()) { + ClassWithSize<512>* address; + cudaMalloc(&address, sizeof(ClassWithSize<512>)); + placement_new_kernel<<<1, 1>>>(address); + placement_delete_kernel<<<1, 1>>>(address); + cudaFree(address); + cudaDeviceSynchronize(); + } +} + +BENCHMARK(benchmark_placement_new_on_gpu_512); + +void benchmark_placement_new_on_gpu_4096(benchmark::State& state) +{ + while (state.KeepRunning()) { + ClassWithSize<4096>* address; + cudaMalloc(&address, sizeof(ClassWithSize<4096>)); + placement_new_kernel<<<1, 1>>>(address); + placement_delete_kernel<<<1, 1>>>(address); + cudaFree(address); + cudaDeviceSynchronize(); + } +} + +BENCHMARK(benchmark_placement_new_on_gpu_4096); + +void benchmark_placement_new_on_gpu_32768(benchmark::State& state) +{ + while (state.KeepRunning()) { + ClassWithSize<32768>* address; + cudaMalloc(&address, sizeof(ClassWithSize<32768>)); + placement_new_kernel<<<1, 1>>>(address); + placement_delete_kernel<<<1, 1>>>(address); + cudaFree(address); + cudaDeviceSynchronize(); + } +} + +BENCHMARK(benchmark_placement_new_on_gpu_32768); + +void benchmark_placement_new_on_gpu_262144(benchmark::State& state) +{ + while (state.KeepRunning()) { + ClassWithSize<262144>* address; + cudaMalloc(&address, sizeof(ClassWithSize<262144>)); + placement_new_kernel<<<1, 1>>>(address); + placement_delete_kernel<<<1, 1>>>(address); + cudaFree(address); + cudaDeviceSynchronize(); + } +} + +BENCHMARK(benchmark_placement_new_on_gpu_262144); + +void benchmark_placement_new_on_gpu_2097152(benchmark::State& state) +{ + while (state.KeepRunning()) { + ClassWithSize<2097152>* address; + cudaMalloc(&address, sizeof(ClassWithSize<2097152>)); + placement_new_kernel<<<1, 1>>>(address); + placement_delete_kernel<<<1, 1>>>(address); + cudaFree(address); + cudaDeviceSynchronize(); + } +} + +BENCHMARK(benchmark_placement_new_on_gpu_2097152); + +// Benchmark how long it takes to call new on the GPU +template +__global__ void create_kernel(ClassWithSize** address) { + *address = new ClassWithSize(); +} + +template +__global__ void delete_kernel(ClassWithSize** address) { + delete *address; +} + +void benchmark_new_on_gpu_8(benchmark::State& state) +{ + while (state.KeepRunning()) { + ClassWithSize<8>** buffer; + cudaMalloc(&buffer, sizeof(ClassWithSize<8>*)); + create_kernel<<<1, 1>>>(buffer); + delete_kernel<<<1, 1>>>(buffer); + cudaFree(buffer); + cudaDeviceSynchronize(); + } +} + +BENCHMARK(benchmark_new_on_gpu_8); + +void benchmark_new_on_gpu_64(benchmark::State& state) +{ + while (state.KeepRunning()) { + ClassWithSize<64>** buffer; + cudaMalloc(&buffer, sizeof(ClassWithSize<64>*)); + create_kernel<<<1, 1>>>(buffer); + delete_kernel<<<1, 1>>>(buffer); + cudaFree(buffer); + cudaDeviceSynchronize(); + } +} + +BENCHMARK(benchmark_new_on_gpu_64); + +void benchmark_new_on_gpu_512(benchmark::State& state) +{ + while (state.KeepRunning()) { + ClassWithSize<512>** buffer; + cudaMalloc(&buffer, sizeof(ClassWithSize<512>*)); + create_kernel<<<1, 1>>>(buffer); + delete_kernel<<<1, 1>>>(buffer); + cudaFree(buffer); + 
cudaDeviceSynchronize(); + } +} + +BENCHMARK(benchmark_new_on_gpu_512); + +void benchmark_new_on_gpu_4096(benchmark::State& state) +{ + while (state.KeepRunning()) { + ClassWithSize<4096>** buffer; + cudaMalloc(&buffer, sizeof(ClassWithSize<4096>*)); + create_kernel<<<1, 1>>>(buffer); + delete_kernel<<<1, 1>>>(buffer); + cudaFree(buffer); + cudaDeviceSynchronize(); + } +} + +BENCHMARK(benchmark_new_on_gpu_4096); +void benchmark_new_on_gpu_32768(benchmark::State& state) +{ + while (state.KeepRunning()) { + ClassWithSize<32768>** buffer; + cudaMalloc(&buffer, sizeof(ClassWithSize<32768>*)); + create_kernel<<<1, 1>>>(buffer); + delete_kernel<<<1, 1>>>(buffer); + cudaFree(buffer); + cudaDeviceSynchronize(); + } +} + +BENCHMARK(benchmark_new_on_gpu_32768); + +void benchmark_new_on_gpu_262144(benchmark::State& state) +{ + while (state.KeepRunning()) { + ClassWithSize<262144>** buffer; + cudaMalloc(&buffer, sizeof(ClassWithSize<262144>*)); + create_kernel<<<1, 1>>>(buffer); + delete_kernel<<<1, 1>>>(buffer); + cudaFree(buffer); + cudaDeviceSynchronize(); + } +} + +BENCHMARK(benchmark_new_on_gpu_262144); + +void benchmark_new_on_gpu_2097152(benchmark::State& state) +{ + while (state.KeepRunning()) { + ClassWithSize<2097152>** buffer; + cudaMalloc(&buffer, sizeof(ClassWithSize<2097152>*)); + create_kernel<<<1, 1>>>(buffer); + delete_kernel<<<1, 1>>>(buffer); + cudaFree(buffer); + cudaDeviceSynchronize(); + } +} + +BENCHMARK(benchmark_new_on_gpu_2097152); + +// Benchmark current approach +template +__global__ void delete_kernel_2(ClassWithSize* address) { + delete address; +} + +void benchmark_new_on_gpu_and_copy_to_host_8(benchmark::State& state) +{ + while (state.KeepRunning()) { + ClassWithSize<8>** gpuBuffer; + cudaMalloc(&gpuBuffer, sizeof(ClassWithSize<8>*)); + create_kernel<<<1, 1>>>(gpuBuffer); + ClassWithSize<8>** cpuBuffer = (ClassWithSize<8>**) malloc(sizeof(ClassWithSize<8>*)); + cudaMemcpy(cpuBuffer, gpuBuffer, sizeof(ClassWithSize<8>*), cudaMemcpyDeviceToHost); + cudaFree(gpuBuffer); + ClassWithSize<8>* gpuPointer = cpuBuffer[0]; + delete_kernel_2<<<1, 1>>>(gpuPointer); + free(cpuBuffer); + cudaDeviceSynchronize(); + } +} + +BENCHMARK(benchmark_new_on_gpu_and_copy_to_host_8); + +void benchmark_new_on_gpu_and_copy_to_host_64(benchmark::State& state) +{ + while (state.KeepRunning()) { + ClassWithSize<64>** gpuBuffer; + cudaMalloc(&gpuBuffer, sizeof(ClassWithSize<64>*)); + create_kernel<<<1, 1>>>(gpuBuffer); + ClassWithSize<64>** cpuBuffer = (ClassWithSize<64>**) malloc(sizeof(ClassWithSize<64>*)); + cudaMemcpy(cpuBuffer, gpuBuffer, sizeof(ClassWithSize<64>*), cudaMemcpyDeviceToHost); + cudaFree(gpuBuffer); + ClassWithSize<64>* gpuPointer = cpuBuffer[0]; + delete_kernel_2<<<1, 1>>>(gpuPointer); + free(cpuBuffer); + cudaDeviceSynchronize(); + } +} + +BENCHMARK(benchmark_new_on_gpu_and_copy_to_host_64); + +void benchmark_new_on_gpu_and_copy_to_host_512(benchmark::State& state) +{ + while (state.KeepRunning()) { + ClassWithSize<512>** gpuBuffer; + cudaMalloc(&gpuBuffer, sizeof(ClassWithSize<512>*)); + create_kernel<<<1, 1>>>(gpuBuffer); + ClassWithSize<512>** cpuBuffer = (ClassWithSize<512>**) malloc(sizeof(ClassWithSize<512>*)); + cudaMemcpy(cpuBuffer, gpuBuffer, sizeof(ClassWithSize<512>*), cudaMemcpyDeviceToHost); + cudaFree(gpuBuffer); + ClassWithSize<512>* gpuPointer = cpuBuffer[0]; + delete_kernel_2<<<1, 1>>>(gpuPointer); + free(cpuBuffer); + cudaDeviceSynchronize(); + } +} + +BENCHMARK(benchmark_new_on_gpu_and_copy_to_host_512); + +void 
benchmark_new_on_gpu_and_copy_to_host_4096(benchmark::State& state) +{ + while (state.KeepRunning()) { + ClassWithSize<4096>** gpuBuffer; + cudaMalloc(&gpuBuffer, sizeof(ClassWithSize<4096>*)); + create_kernel<<<1, 1>>>(gpuBuffer); + ClassWithSize<4096>** cpuBuffer = (ClassWithSize<4096>**) malloc(sizeof(ClassWithSize<4096>*)); + cudaMemcpy(cpuBuffer, gpuBuffer, sizeof(ClassWithSize<4096>*), cudaMemcpyDeviceToHost); + cudaFree(gpuBuffer); + ClassWithSize<4096>* gpuPointer = cpuBuffer[0]; + delete_kernel_2<<<1, 1>>>(gpuPointer); + free(cpuBuffer); + cudaDeviceSynchronize(); + } +} + +BENCHMARK(benchmark_new_on_gpu_and_copy_to_host_4096); + +void benchmark_new_on_gpu_and_copy_to_host_32768(benchmark::State& state) +{ + while (state.KeepRunning()) { + ClassWithSize<32768>** gpuBuffer; + cudaMalloc(&gpuBuffer, sizeof(ClassWithSize<32768>*)); + create_kernel<<<1, 1>>>(gpuBuffer); + ClassWithSize<32768>** cpuBuffer = (ClassWithSize<32768>**) malloc(sizeof(ClassWithSize<32768>*)); + cudaMemcpy(cpuBuffer, gpuBuffer, sizeof(ClassWithSize<32768>*), cudaMemcpyDeviceToHost); + cudaFree(gpuBuffer); + ClassWithSize<32768>* gpuPointer = cpuBuffer[0]; + delete_kernel_2<<<1, 1>>>(gpuPointer); + free(cpuBuffer); + cudaDeviceSynchronize(); + } +} + +BENCHMARK(benchmark_new_on_gpu_and_copy_to_host_32768); + +void benchmark_new_on_gpu_and_copy_to_host_262144(benchmark::State& state) +{ + while (state.KeepRunning()) { + ClassWithSize<262144>** gpuBuffer; + cudaMalloc(&gpuBuffer, sizeof(ClassWithSize<262144>*)); + create_kernel<<<1, 1>>>(gpuBuffer); + ClassWithSize<262144>** cpuBuffer = (ClassWithSize<262144>**) malloc(sizeof(ClassWithSize<262144>*)); + cudaMemcpy(cpuBuffer, gpuBuffer, sizeof(ClassWithSize<262144>*), cudaMemcpyDeviceToHost); + cudaFree(gpuBuffer); + ClassWithSize<262144>* gpuPointer = cpuBuffer[0]; + delete_kernel_2<<<1, 1>>>(gpuPointer); + free(cpuBuffer); + cudaDeviceSynchronize(); + } +} + +BENCHMARK(benchmark_new_on_gpu_and_copy_to_host_262144); + +void benchmark_new_on_gpu_and_copy_to_host_2097152(benchmark::State& state) +{ + while (state.KeepRunning()) { + ClassWithSize<2097152>** gpuBuffer; + cudaMalloc(&gpuBuffer, sizeof(ClassWithSize<2097152>*)); + create_kernel<<<1, 1>>>(gpuBuffer); + ClassWithSize<2097152>** cpuBuffer = (ClassWithSize<2097152>**) malloc(sizeof(ClassWithSize<2097152>*)); + cudaMemcpy(cpuBuffer, gpuBuffer, sizeof(ClassWithSize<2097152>*), cudaMemcpyDeviceToHost); + cudaFree(gpuBuffer); + ClassWithSize<2097152>* gpuPointer = cpuBuffer[0]; + delete_kernel_2<<<1, 1>>>(gpuPointer); + free(cpuBuffer); + cudaDeviceSynchronize(); + } +} + +BENCHMARK(benchmark_new_on_gpu_and_copy_to_host_2097152); + +// Benchmark how long it takes to create a stack object on the GPU +template +__global__ void create_on_stack_kernel() { + (void) ClassWithSize(); +} + +void benchmark_create_on_stack_on_gpu_8(benchmark::State& state) +{ + while (state.KeepRunning()) { + create_on_stack_kernel<8><<<1, 1>>>(); + cudaDeviceSynchronize(); + } +} + +BENCHMARK(benchmark_create_on_stack_on_gpu_8); + +void benchmark_create_on_stack_on_gpu_64(benchmark::State& state) +{ + while (state.KeepRunning()) { + create_on_stack_kernel<64><<<1, 1>>>(); + cudaDeviceSynchronize(); + } +} + +BENCHMARK(benchmark_create_on_stack_on_gpu_64); + +void benchmark_create_on_stack_on_gpu_512(benchmark::State& state) +{ + while (state.KeepRunning()) { + create_on_stack_kernel<512><<<1, 1>>>(); + cudaDeviceSynchronize(); + } +} + +BENCHMARK(benchmark_create_on_stack_on_gpu_512); + +void 
benchmark_create_on_stack_on_gpu_4096(benchmark::State& state) +{ + while (state.KeepRunning()) { + create_on_stack_kernel<4096><<<1, 1>>>(); + cudaDeviceSynchronize(); + } +} + +BENCHMARK(benchmark_create_on_stack_on_gpu_4096); + +void benchmark_create_on_stack_on_gpu_32768(benchmark::State& state) +{ + while (state.KeepRunning()) { + create_on_stack_kernel<32768><<<1, 1>>>(); + cudaDeviceSynchronize(); + } +} + +BENCHMARK(benchmark_create_on_stack_on_gpu_32768); + +void benchmark_create_on_stack_on_gpu_262144(benchmark::State& state) +{ + while (state.KeepRunning()) { + create_on_stack_kernel<262144><<<1, 1>>>(); + cudaDeviceSynchronize(); + } +} + +BENCHMARK(benchmark_create_on_stack_on_gpu_262144); + +void benchmark_create_on_stack_on_gpu_2097152(benchmark::State& state) +{ + while (state.KeepRunning()) { + create_on_stack_kernel<2097152><<<1, 1>>>(); + cudaDeviceSynchronize(); + } +} + +BENCHMARK(benchmark_create_on_stack_on_gpu_2097152); + +BENCHMARK(benchmark_managed_ptr_use_gpu); + +// Curiously recurring template pattern void benchmark_curiously_recurring_template_pattern_gpu(benchmark::State& state) { + BaseCRTP* derivedCRTP = new DerivedCRTP(4); + auto helper = *derivedCRTP; + while (state.KeepRunning()) { - auto helper = *derivedCRTP2; forall(gpu(), 0, 1, [=] __device__ (int i) { (void) helper.getValue(); }); } - state.SetItemsProcessed(state.iterations()); + delete derivedCRTP; } -BENCHMARK(benchmark_curiously_recurring_template_pattern_gpu)->Range(1, 1); +BENCHMARK(benchmark_curiously_recurring_template_pattern_gpu); // Class without inheritance -static NoInheritance* noInheritance2 = new NoInheritance(5); - void benchmark_no_inheritance_gpu(benchmark::State& state) { + NoInheritance* noInheritance = new NoInheritance(5); + auto helper = *noInheritance; + while (state.KeepRunning()) { - auto helper = *noInheritance2; forall(gpu(), 0, 1, [=] __device__ (int i) { (void) helper.getValue(); }); } - state.SetItemsProcessed(state.iterations()); + delete noInheritance; } -BENCHMARK(benchmark_no_inheritance_gpu)->Range(1, 1); +BENCHMARK(benchmark_no_inheritance_gpu); #endif From 349fbddc81127d003a2310025d11f5fff8b37fc1 Mon Sep 17 00:00:00 2001 From: Alan Dayton Date: Thu, 10 Oct 2019 15:21:38 -0700 Subject: [PATCH 34/58] Use placement new for better performance and cleaner code --- src/chai/managed_ptr.hpp | 30 ++++--------- tests/integration/managed_ptr_tests.cpp | 57 ++++++++----------------- tests/unit/managed_ptr_unit_tests.cpp | 56 +++++++----------------- 3 files changed, 42 insertions(+), 101 deletions(-) diff --git a/src/chai/managed_ptr.hpp b/src/chai/managed_ptr.hpp index c9c0de66..190df7cf 100644 --- a/src/chai/managed_ptr.hpp +++ b/src/chai/managed_ptr.hpp @@ -818,8 +818,8 @@ namespace chai { template ::value, int>::type = 0> - CHAI_DEVICE void new_on_device(T** gpuPointer, Args&&... args) { - *gpuPointer = new T(args...); + CHAI_DEVICE void new_on_device(T* gpuPointer, Args&&... args) { + new(gpuPointer) T(args...); } /// @@ -836,8 +836,8 @@ namespace chai { template ::value, int>::type = 0> - CHAI_DEVICE void new_on_device(T** gpuPointer, Args&&... args) { - *gpuPointer = new T(getRawPointers(args)...); + CHAI_DEVICE void new_on_device(T* gpuPointer, Args&&... args) { + new(gpuPointer) T(getRawPointers(args)...); } /// @@ -853,7 +853,7 @@ namespace chai { /// template - __global__ void make_on_device(T** gpuPointer, Args... args) + __global__ void make_on_device(T* gpuPointer, Args... 
args) { new_on_device(gpuPointer, args...); } @@ -890,7 +890,7 @@ namespace chai { __global__ void destroy_on_device(T* gpuPointer) { if (gpuPointer) { - delete gpuPointer; + gpuPointer->~T(); } } @@ -917,25 +917,13 @@ namespace chai { #endif // Allocate space on the GPU to hold the pointer to the new object - T** gpuBuffer; - GPU_ERROR_CHECK(cudaMalloc(&gpuBuffer, sizeof(T*))); + T* gpuPointer; + GPU_ERROR_CHECK(cudaMalloc(&gpuPointer, sizeof(T))); // Create the object on the device - make_on_device<<<1, 1>>>(gpuBuffer, args...); + make_on_device<<<1, 1>>>(gpuPointer, args...); debug_cudaDeviceSynchronize(); - // Allocate space on the CPU for the pointer and copy the pointer to the CPU - T** cpuBuffer = (T**) malloc(sizeof(T*)); - GPU_ERROR_CHECK(cudaMemcpy(cpuBuffer, gpuBuffer, sizeof(T*), - cudaMemcpyDeviceToHost)); - - // Get the GPU pointer - T* gpuPointer = cpuBuffer[0]; - - // Free the host and device buffers - free(cpuBuffer); - GPU_ERROR_CHECK(cudaFree(gpuBuffer)); - #ifndef CHAI_DISABLE_RM // Set the execution space back to the previous value arrayManager->setExecutionSpace(currentSpace); diff --git a/tests/integration/managed_ptr_tests.cpp b/tests/integration/managed_ptr_tests.cpp index 36036d96..908074fa 100644 --- a/tests/integration/managed_ptr_tests.cpp +++ b/tests/integration/managed_ptr_tests.cpp @@ -331,60 +331,37 @@ GPU_TEST(managed_ptr, make_on_device) GPU_TEST(managed_ptr, gpu_new_and_delete_on_device) { - // Initialize host side memory to hold a pointer - RawArrayClass** cpuPointerHolder = (RawArrayClass**) malloc(sizeof(RawArrayClass*)); - cpuPointerHolder[0] = nullptr; - - // Initialize device side memory to hold a pointer - RawArrayClass** gpuPointerHolder = nullptr; - cudaMalloc(&gpuPointerHolder, sizeof(RawArrayClass*)); + // Initialize device side memory to hold the new object + RawArrayClass* gpuPointer = nullptr; + cudaMalloc(&gpuPointer, sizeof(RawArrayClass)); // Create on the device - chai::detail::make_on_device<<<1, 1>>>(gpuPointerHolder); - - // Copy to the host side memory - cudaMemcpy(cpuPointerHolder, gpuPointerHolder, sizeof(RawArrayClass*), cudaMemcpyDeviceToHost); - - // Free device side memory - cudaFree(gpuPointerHolder); + chai::detail::make_on_device<<<1, 1>>>(gpuPointer); - // Save the pointer - ASSERT_NE(cpuPointerHolder[0], nullptr); - RawArrayClass* gpuPointer = cpuPointerHolder[0]; - - // Free host side memory - free(cpuPointerHolder); + // Check the pointer + ASSERT_NE(gpuPointer, nullptr); + // Clean up on the device chai::detail::destroy_on_device<<<1, 1>>>(gpuPointer); } GPU_TEST(managed_ptr, gpu_build_managed_ptr) { - // Initialize host side memory to hold a pointer - RawArrayClass** cpuPointerHolder = (RawArrayClass**) malloc(sizeof(RawArrayClass*)); - cpuPointerHolder[0] = nullptr; - - // Initialize device side memory to hold a pointer - RawArrayClass** gpuPointerHolder = nullptr; - cudaMalloc(&gpuPointerHolder, sizeof(RawArrayClass*)); + // Initialize device side memory to hold the new object + RawArrayClass* gpuPointer = nullptr; + cudaMalloc(&gpuPointer, sizeof(RawArrayClass)); // Create on the device - chai::detail::make_on_device<<<1, 1>>>(gpuPointerHolder); + chai::detail::make_on_device<<<1, 1>>>(gpuPointer); - // Copy to the host side memory - cudaMemcpy(cpuPointerHolder, gpuPointerHolder, sizeof(RawArrayClass*), cudaMemcpyDeviceToHost); - - // Free device side memory - cudaFree(gpuPointerHolder); - - // Save the pointer - ASSERT_NE(cpuPointerHolder[0], nullptr); - RawArrayClass* gpuPointer = cpuPointerHolder[0]; - - // Free 
host side memory - free(cpuPointerHolder); + // Check the pointer + ASSERT_NE(gpuPointer, nullptr); + // Make a managed_ptr chai::managed_ptr managedPtr({chai::GPU}, {gpuPointer}); + + // Clean up the memory + managedPtr.free(); } diff --git a/tests/unit/managed_ptr_unit_tests.cpp b/tests/unit/managed_ptr_unit_tests.cpp index 721e9b40..b8ac4ec4 100644 --- a/tests/unit/managed_ptr_unit_tests.cpp +++ b/tests/unit/managed_ptr_unit_tests.cpp @@ -619,60 +619,36 @@ GPU_TEST(managed_ptr, gpu_pointer_constructor) GPU_TEST(managed_ptr, gpu_new_and_delete_on_device) { - // Initialize host side memory to hold a pointer - Simple** cpuPointerHolder = (Simple**) malloc(sizeof(Simple*)); - cpuPointerHolder[0] = nullptr; - - // Initialize device side memory to hold a pointer - Simple** gpuPointerHolder = nullptr; - cudaMalloc(&gpuPointerHolder, sizeof(Simple*)); + // Initialize device side memory to hold the new object + Simple* gpuPointer = nullptr; + cudaMalloc(&gpuPointer, sizeof(Simple)); // Create on the device - chai::detail::make_on_device<<<1, 1>>>(gpuPointerHolder); - - // Copy to the host side memory - cudaMemcpy(cpuPointerHolder, gpuPointerHolder, sizeof(Simple*), cudaMemcpyDeviceToHost); - - // Free device side memory - cudaFree(gpuPointerHolder); + chai::detail::make_on_device<<<1, 1>>>(gpuPointer); - // Save the pointer - ASSERT_NE(cpuPointerHolder[0], nullptr); - Simple* gpuPointer = cpuPointerHolder[0]; - - // Free host side memory - free(cpuPointerHolder); + // Check the pointer + ASSERT_NE(gpuPointer, nullptr); + // Clean up on the device chai::detail::destroy_on_device<<<1, 1>>>(gpuPointer); } GPU_TEST(managed_ptr, gpu_new_and_delete_on_device_2) { - // Initialize host side memory to hold a pointer - Simple** cpuPointerHolder = (Simple**) malloc(sizeof(Simple*)); - cpuPointerHolder[0] = nullptr; - - // Initialize device side memory to hold a pointer - Simple** gpuPointerHolder = nullptr; - cudaMalloc(&gpuPointerHolder, sizeof(Simple*)); + // Initialize device side memory to hold a the new object + Simple* gpuPointer = nullptr; + cudaMalloc(&gpuPointer, sizeof(Simple)); // Create on the device - chai::detail::make_on_device<<<1, 1>>>(gpuPointerHolder); + chai::detail::make_on_device<<<1, 1>>>(gpuPointer); - // Copy to the host side memory - cudaMemcpy(cpuPointerHolder, gpuPointerHolder, sizeof(Simple*), cudaMemcpyDeviceToHost); - - // Free device side memory - cudaFree(gpuPointerHolder); - - // Save the pointer - ASSERT_NE(cpuPointerHolder[0], nullptr); - Simple* gpuPointer = cpuPointerHolder[0]; - - // Free host side memory - free(cpuPointerHolder); + // Check the pointer + ASSERT_NE(gpuPointer, nullptr); + // Create a managed_ptr chai::managed_ptr test({chai::GPU}, {gpuPointer}); + + // Free the memory test.free(); } From ac3bc32ee60f00d9b6956bf78bccc35328f4f0ae Mon Sep 17 00:00:00 2001 From: Alan Dayton Date: Thu, 10 Oct 2019 15:36:50 -0700 Subject: [PATCH 35/58] Fix memory leaks --- src/chai/managed_ptr.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/chai/managed_ptr.hpp b/src/chai/managed_ptr.hpp index 190df7cf..a0e90966 100644 --- a/src/chai/managed_ptr.hpp +++ b/src/chai/managed_ptr.hpp @@ -554,6 +554,7 @@ namespace chai { if (pointer) { detail::destroy_on_device<<<1, 1>>>(temp); debug_cudaDeviceSynchronize(); + GPU_ERROR_CHECK(cudaFree(temp)); } break; @@ -581,6 +582,7 @@ namespace chai { if (pointer) { detail::destroy_on_device<<<1, 1>>>(pointer); debug_cudaDeviceSynchronize(); + GPU_ERROR_CHECK(cudaFree((void*) pointer)); } break; @@ -916,7 +918,7 
@@ namespace chai { arrayManager->setExecutionSpace(GPU); #endif - // Allocate space on the GPU to hold the pointer to the new object + // Allocate space on the GPU to hold the new object T* gpuPointer; GPU_ERROR_CHECK(cudaMalloc(&gpuPointer, sizeof(T))); From 552786c32b9a5a224d6f81a029d2c06f8e67f09a Mon Sep 17 00:00:00 2001 From: Alan Dayton Date: Thu, 10 Oct 2019 16:00:02 -0700 Subject: [PATCH 36/58] Remove make_managed_from_factory --- src/chai/managed_ptr.hpp | 203 ++++---------------------- tests/unit/managed_ptr_unit_tests.cpp | 132 ----------------- 2 files changed, 26 insertions(+), 309 deletions(-) diff --git a/src/chai/managed_ptr.hpp b/src/chai/managed_ptr.hpp index a0e90966..fb3999a8 100644 --- a/src/chai/managed_ptr.hpp +++ b/src/chai/managed_ptr.hpp @@ -122,10 +122,9 @@ namespace chai { /// /// This wrapper stores both host and device pointers so that polymorphism can be /// used in both contexts with a single API. - /// The make_managed and make_managed_from_factory functions call new on both the - /// host and device so that polymorphism is valid in both contexts. Simply copying - /// an object to the device will not copy the vtable, so new must be called on - /// the device. + /// The make_managed function calls new on both the host and device so that + /// polymorphism is valid in both contexts. Simply copying an object to the + /// device will not copy the vtable, so new must be called on the device. /// /// Usage Requirements: /// Methods that can be called on both the host and device must be declared @@ -137,28 +136,26 @@ namespace chai { /// you must explicitly modify the object in both the host context and the /// device context. /// Raw array members of T need to be initialized correctly with a host or - /// device pointer. If a ManagedArray is passed to the make_managed or - /// make_managed_from_factory methods in place of a raw array, it will be - /// cast to the appropriate host or device pointer when passed to T's - /// constructor on the host and on the device. If it is desired that these - /// host and device pointers be kept in sync, define a callback that maintains - /// a copy of the ManagedArray and upon the ACTION_MOVE event calls the copy - /// constructor of that ManagedArray. + /// device pointer. If a ManagedArray is passed to the make_managed function + /// in place of a raw array, it will be cast to the appropriate host or device + /// pointer when passed to T's constructor on the host and on the device. If it + /// is desired that these host and device pointers be kept in sync, define a + /// callback that maintains a copy of the ManagedArray and upon the ACTION_MOVE + /// event calls the copy constructor of that ManagedArray. /// If a raw array is passed to make_managed, accessing that member will be /// valid only in the correct context. To prevent the accidental use of that /// member in the wrong context, any methods that access it should be __host__ /// only or __device__ only. Special care should be taken when passing raw /// arrays as arguments to member functions. /// The same restrictions for raw array members also apply to raw pointer members. - /// A managed_ptr can be passed to the make_managed or make_managed_from_factory - /// methods in place of a raw pointer, and the host constructor of T will - /// be given the extracted host pointer, and likewise the device constructor - /// of T will be given the extracted device pointer. 
It is recommended that - /// a callback is defined that maintains a copy of the managed_ptr so that - /// the raw pointers are not accidentally destroyed prematurely. It is also - /// recommended that the callback calls the copy constructor of the managed_ptr - /// on the ACTION_MOVE event so that the ACTION_MOVE event is triggered also for - /// the inner managed_ptr. + /// A managed_ptr can be passed to the make_managed function in place of a raw + /// pointer, and the host constructor of T will be given the extracted host + /// pointer, and likewise the device constructor of T will be given the + /// extracted device pointer. It is recommended that a callback is defined that + /// maintains a copy of the managed_ptr and frees it on the ACTION_FREE event. + /// It is also recommended that the callback calls the copy constructor of the + /// managed_ptr on the ACTION_MOVE event so that the ACTION_MOVE event is + /// triggered also for the inner managed_ptr. /// Again, if a raw pointer is passed to make_managed, accessing that member will /// only be valid in the correct context. Take care when passing raw pointers /// as arguments to member functions. @@ -572,6 +569,12 @@ namespace chai { ExecutionSpace execSpace = static_cast(space); T* pointer = get(execSpace, false); + using T_non_const = typename std::remove_const::type; + + // We can use const_cast because can managed_ptr can only + // be constructed with non const pointers. + T_non_const* temp = const_cast(pointer); + switch (execSpace) { case CPU: delete pointer; @@ -580,9 +583,9 @@ namespace chai { case GPU: { if (pointer) { - detail::destroy_on_device<<<1, 1>>>(pointer); + detail::destroy_on_device<<<1, 1>>>(temp); debug_cudaDeviceSynchronize(); - GPU_ERROR_CHECK(cudaFree((void*) pointer)); + GPU_ERROR_CHECK(cudaFree(temp)); } break; @@ -767,44 +770,6 @@ namespace chai { return cpuPointer; } - /// - /// @author Alan Dayton - /// - /// Calls a factory method to create a new object on the host. - /// Sets the execution space to the CPU so that ManagedArrays and managed_ptrs - /// are moved to the host as necessary. - /// - /// @param[in] f The factory method - /// @param[in] args The arguments to the factory method - /// - /// @return The host pointer to the new object - /// - template - CHAI_HOST T* make_on_host_from_factory(F f, Args&&... args) { -#ifndef CHAI_DISABLE_RM - // Get the ArrayManager and save the current execution space - chai::ArrayManager* arrayManager = chai::ArrayManager::getInstance(); - ExecutionSpace currentSpace = arrayManager->getExecutionSpace(); - - // Set the execution space so that ManagedArrays and managed_ptrs - // are handled properly - arrayManager->setExecutionSpace(CPU); -#endif - - // Create the object on the device - T* cpuPointer = f(args...); - -#ifndef CHAI_DISABLE_RM - // Set the execution space back to the previous value - arrayManager->setExecutionSpace(currentSpace); -#endif - - // Return the GPU pointer - return cpuPointer; - } - #ifdef __CUDACC__ /// /// @author Alan Dayton @@ -860,27 +825,6 @@ namespace chai { new_on_device(gpuPointer, args...); } - /// - /// @author Alan Dayton - /// - /// Creates a new object on the device by calling the given factory method. 
- /// - /// @param[out] gpuPointer Used to return the device pointer to the new object - /// @param[in] f The factory method (must be a __device__ or __host__ __device__ - /// method - /// @param[in] args The arguments to the factory method - /// - /// @note Cannot capture argument packs in an extended device lambda, - /// so explicit kernel is needed. - /// - template - __global__ void make_on_device_from_factory(T** gpuPointer, F f, Args... args) - { - *gpuPointer = f(args...); - } - /// /// @author Alan Dayton /// @@ -891,9 +835,7 @@ namespace chai { template __global__ void destroy_on_device(T* gpuPointer) { - if (gpuPointer) { - gpuPointer->~T(); - } + gpuPointer->~T(); } /// @@ -926,59 +868,6 @@ namespace chai { make_on_device<<<1, 1>>>(gpuPointer, args...); debug_cudaDeviceSynchronize(); -#ifndef CHAI_DISABLE_RM - // Set the execution space back to the previous value - arrayManager->setExecutionSpace(currentSpace); -#endif - - // Return the GPU pointer - return gpuPointer; - } - - /// - /// @author Alan Dayton - /// - /// Calls a factory method to create a new object on the device. - /// - /// @param[in] f The factory method - /// @param[in] args The arguments to the factory method - /// - /// @return The device pointer to the new object - /// - template - CHAI_HOST T* make_on_device_from_factory(F f, Args&&... args) { -#ifndef CHAI_DISABLE_RM - // Get the ArrayManager and save the current execution space - chai::ArrayManager* arrayManager = chai::ArrayManager::getInstance(); - ExecutionSpace currentSpace = arrayManager->getExecutionSpace(); - - // Set the execution space so that chai::ManagedArrays and - // chai::managed_ptrs are handled properly - arrayManager->setExecutionSpace(GPU); -#endif - - // Allocate space on the GPU to hold the pointer to the new object - T** gpuBuffer; - GPU_ERROR_CHECK(cudaMalloc(&gpuBuffer, sizeof(T*))); - - // Create the object on the device - make_on_device_from_factory<<<1, 1>>>(gpuBuffer, f, args...); - debug_cudaDeviceSynchronize(); - - // Allocate space on the CPU for the pointer and copy the pointer to the CPU - T** cpuBuffer = (T**) malloc(sizeof(T*)); - GPU_ERROR_CHECK(cudaMemcpy(cpuBuffer, gpuBuffer, sizeof(T*), - cudaMemcpyDeviceToHost)); - - // Get the GPU pointer - T* gpuPointer = cpuBuffer[0]; - - // Free the host and device buffers - free(cpuBuffer); - GPU_ERROR_CHECK(cudaFree(gpuBuffer)); - #ifndef CHAI_DISABLE_RM // Set the execution space back to the previous value arrayManager->setExecutionSpace(currentSpace); @@ -1042,46 +931,6 @@ namespace chai { #endif } - /// - /// @author Alan Dayton - /// - /// Makes a managed_ptr. - /// Factory function to create managed_ptrs. - /// - /// @param[in] f The factory function that will create the object - /// @param[in] args The arguments to the factory function - /// - template - CHAI_HOST managed_ptr make_managed_from_factory(F&& f, Args&&... 
args) { - static_assert(detail::is_invocable::value, - "F is not invocable with the given arguments."); - - static_assert(std::is_pointer::type>::value, - "F does not return a pointer."); - - using R = typename std::remove_pointer::type>::type; - - static_assert(std::is_convertible::value, - "F does not return a pointer that is convertible to T*."); - -#ifdef __CUDACC__ - // Construct on the GPU first to take advantage of asynchrony - T* gpuPointer = detail::make_on_device_from_factory(f, args...); -#endif - - // Construct on the CPU - T* cpuPointer = detail::make_on_host_from_factory(f, args...); - - // Construct and return the managed_ptr -#ifdef __CUDACC__ - return managed_ptr({CPU, GPU}, {cpuPointer, gpuPointer}); -#else - return managed_ptr({CPU}, {cpuPointer}); -#endif - } - /// /// @author Alan Dayton /// diff --git a/tests/unit/managed_ptr_unit_tests.cpp b/tests/unit/managed_ptr_unit_tests.cpp index b8ac4ec4..281e88a5 100644 --- a/tests/unit/managed_ptr_unit_tests.cpp +++ b/tests/unit/managed_ptr_unit_tests.cpp @@ -760,94 +760,6 @@ GPU_TEST(managed_ptr, gpu_make_managed) derived.free(); } -GPU_TEST(managed_ptr, make_managed_from_factory_function) -{ - const int expectedValue = rand(); - - auto factory = [] CHAI_HOST_DEVICE (const int value) { - return Factory(value); - }; - - auto derived = chai::make_managed_from_factory(factory, expectedValue); - - EXPECT_EQ((*derived).getValue(), expectedValue); - - EXPECT_NE(derived.get(), nullptr); - EXPECT_TRUE(derived); - EXPECT_FALSE(derived == nullptr); - EXPECT_FALSE(nullptr == derived); - EXPECT_TRUE(derived != nullptr); - EXPECT_TRUE(nullptr != derived); - - derived.free(); -} - -GPU_TEST(managed_ptr, make_managed_from_factory_lambda) -{ - const int expectedValue = rand(); - - auto factory = [] CHAI_HOST_DEVICE (const int value) { - return new TestDerived(value); - }; - - auto derived = chai::make_managed_from_factory(factory, expectedValue); - - EXPECT_EQ((*derived).getValue(), expectedValue); - - EXPECT_NE(derived.get(), nullptr); - EXPECT_TRUE(derived); - EXPECT_FALSE(derived == nullptr); - EXPECT_FALSE(nullptr == derived); - EXPECT_TRUE(derived != nullptr); - EXPECT_TRUE(nullptr != derived); - - derived.free(); -} - -GPU_TEST(managed_ptr, make_managed_from_overloaded_factory_function) -{ - const int expectedValue = rand(); - - auto factory = [] CHAI_HOST_DEVICE (const int value) { - return OverloadedFactory(value); - }; - - auto derived = chai::make_managed_from_factory(factory, expectedValue); - - EXPECT_EQ((*derived).getValue(), expectedValue); - - EXPECT_NE(derived.get(), nullptr); - EXPECT_TRUE(derived); - EXPECT_FALSE(derived == nullptr); - EXPECT_FALSE(nullptr == derived); - EXPECT_TRUE(derived != nullptr); - EXPECT_TRUE(nullptr != derived); - - derived.free(); -} - -GPU_TEST(managed_ptr, make_managed_from_factory_static_member_function) -{ - const int expectedValue = rand(); - - auto factory = [] CHAI_HOST_DEVICE (const int value) { - return TestBase::Factory(value); - }; - - auto derived = chai::make_managed_from_factory(factory, expectedValue); - - EXPECT_EQ((*derived).getValue(), expectedValue); - - EXPECT_NE(derived.get(), nullptr); - EXPECT_TRUE(derived); - EXPECT_FALSE(derived == nullptr); - EXPECT_FALSE(nullptr == derived); - EXPECT_TRUE(derived != nullptr); - EXPECT_TRUE(nullptr != derived); - - derived.free(); -} - GPU_TEST(managed_ptr, gpu_copy_constructor) { const int expectedValue = rand(); @@ -1040,47 +952,3 @@ GPU_TEST(managed_ptr, gpu_copy_assignment_operator) #endif -// Enable the following tests to ensure 
that proper compiler errors are given -// for bad arguments since otherwise it is difficult to make sure the template -// metaprogramming is correct. - -#if 0 - -// Should give something like the following: -// error: static assertion failed: F is not invocable with the given arguments. - -TEST(managed_ptr, bad_function_to_make_managed_from_factory_function) -{ - const int expectedValue = rand(); - - auto factory = [] CHAI_HOST (const int value) { - return new TestDerived(value); - }; - - auto derived = chai::make_managed_from_factory(expectedValue, factory); - - EXPECT_EQ((*derived).getValue(), expectedValue); -} - -#endif - -#if 0 - -// Should give something like the following: -// error: static assertion failed: F is not invocable with the given arguments. - -TEST(managed_ptr, bad_arguments_to_make_managed_from_factory_function) -{ - const int expectedValue = rand(); - - auto factory = [] CHAI_HOST (const int value) { - return new TestDerived(value); - }; - - auto derived = chai::make_managed_from_factory(factory, expectedValue, 3); - - EXPECT_EQ((*derived).getValue(), expectedValue); -} - -#endif - From cc0e571e02fe58843bcc2aa4eca7dd9cf49b6391 Mon Sep 17 00:00:00 2001 From: Alan Dayton Date: Fri, 11 Oct 2019 09:11:42 -0700 Subject: [PATCH 37/58] Eliminate duplicate code --- benchmarks/chai_managed_ptr_benchmarks.cpp | 449 +++------------------ 1 file changed, 63 insertions(+), 386 deletions(-) diff --git a/benchmarks/chai_managed_ptr_benchmarks.cpp b/benchmarks/chai_managed_ptr_benchmarks.cpp index 3cdaaacf..ec0ff71c 100644 --- a/benchmarks/chai_managed_ptr_benchmarks.cpp +++ b/benchmarks/chai_managed_ptr_benchmarks.cpp @@ -156,45 +156,10 @@ template __global__ void copy_kernel(ClassWithSize) {} // Benchmark how long it takes to copy a class to the GPU -void benchmark_pass_copy_to_gpu_8(benchmark::State& state) -{ - ClassWithSize<8> helper; - - while (state.KeepRunning()) { - copy_kernel<<<1, 1>>>(helper); - cudaDeviceSynchronize(); - } -} - -BENCHMARK(benchmark_pass_copy_to_gpu_8); - -void benchmark_pass_copy_to_gpu_64(benchmark::State& state) -{ - ClassWithSize<64> helper; - - while (state.KeepRunning()) { - copy_kernel<<<1, 1>>>(helper); - cudaDeviceSynchronize(); - } -} - -BENCHMARK(benchmark_pass_copy_to_gpu_64); - -void benchmark_pass_copy_to_gpu_512(benchmark::State& state) -{ - ClassWithSize<512> helper; - - while (state.KeepRunning()) { - copy_kernel<<<1, 1>>>(helper); - cudaDeviceSynchronize(); - } -} - -BENCHMARK(benchmark_pass_copy_to_gpu_512); - -void benchmark_pass_copy_to_gpu_4096(benchmark::State& state) +template +static void benchmark_pass_copy_to_gpu(benchmark::State& state) { - ClassWithSize<4096> helper; + ClassWithSize helper; while (state.KeepRunning()) { copy_kernel<<<1, 1>>>(helper); @@ -202,18 +167,10 @@ void benchmark_pass_copy_to_gpu_4096(benchmark::State& state) } } -BENCHMARK(benchmark_pass_copy_to_gpu_4096); - -void benchmark_managed_ptr_use_gpu(benchmark::State& state) -{ - chai::managed_ptr helper = chai::make_managed(2); - - while (state.KeepRunning()) { - forall(gpu(), 0, 1, [=] __device__ (int i) { (void) helper->getValue(); }); - } - - helper.free(); -} +BENCHMARK_TEMPLATE(benchmark_pass_copy_to_gpu, 8); +BENCHMARK_TEMPLATE(benchmark_pass_copy_to_gpu, 64); +BENCHMARK_TEMPLATE(benchmark_pass_copy_to_gpu, 512); +BENCHMARK_TEMPLATE(benchmark_pass_copy_to_gpu, 4096); // Benchmark how long it takes to call placement new on the GPU template @@ -226,103 +183,32 @@ __global__ void placement_delete_kernel(ClassWithSize* address) { address->~ClassWithSize(); 
} -void benchmark_placement_new_on_gpu_8(benchmark::State& state) -{ - while (state.KeepRunning()) { - ClassWithSize<8>* address; - cudaMalloc(&address, sizeof(ClassWithSize<8>)); - placement_new_kernel<<<1, 1>>>(address); - placement_delete_kernel<<<1, 1>>>(address); - cudaFree(address); - cudaDeviceSynchronize(); - } -} - -BENCHMARK(benchmark_placement_new_on_gpu_8); - -void benchmark_placement_new_on_gpu_64(benchmark::State& state) -{ - while (state.KeepRunning()) { - ClassWithSize<64>* address; - cudaMalloc(&address, sizeof(ClassWithSize<64>)); - placement_new_kernel<<<1, 1>>>(address); - placement_delete_kernel<<<1, 1>>>(address); - cudaFree(address); - cudaDeviceSynchronize(); - } -} - -BENCHMARK(benchmark_placement_new_on_gpu_64); - -void benchmark_placement_new_on_gpu_512(benchmark::State& state) -{ - while (state.KeepRunning()) { - ClassWithSize<512>* address; - cudaMalloc(&address, sizeof(ClassWithSize<512>)); - placement_new_kernel<<<1, 1>>>(address); - placement_delete_kernel<<<1, 1>>>(address); - cudaFree(address); - cudaDeviceSynchronize(); - } -} - -BENCHMARK(benchmark_placement_new_on_gpu_512); - -void benchmark_placement_new_on_gpu_4096(benchmark::State& state) -{ - while (state.KeepRunning()) { - ClassWithSize<4096>* address; - cudaMalloc(&address, sizeof(ClassWithSize<4096>)); - placement_new_kernel<<<1, 1>>>(address); - placement_delete_kernel<<<1, 1>>>(address); - cudaFree(address); - cudaDeviceSynchronize(); - } -} - -BENCHMARK(benchmark_placement_new_on_gpu_4096); - -void benchmark_placement_new_on_gpu_32768(benchmark::State& state) +template +static void benchmark_placement_new_on_gpu(benchmark::State& state) { while (state.KeepRunning()) { - ClassWithSize<32768>* address; - cudaMalloc(&address, sizeof(ClassWithSize<32768>)); + ClassWithSize* address; + cudaMalloc(&address, sizeof(ClassWithSize)); placement_new_kernel<<<1, 1>>>(address); - placement_delete_kernel<<<1, 1>>>(address); - cudaFree(address); cudaDeviceSynchronize(); - } -} -BENCHMARK(benchmark_placement_new_on_gpu_32768); + state.PauseTiming(); -void benchmark_placement_new_on_gpu_262144(benchmark::State& state) -{ - while (state.KeepRunning()) { - ClassWithSize<262144>* address; - cudaMalloc(&address, sizeof(ClassWithSize<262144>)); - placement_new_kernel<<<1, 1>>>(address); placement_delete_kernel<<<1, 1>>>(address); cudaFree(address); cudaDeviceSynchronize(); - } -} -BENCHMARK(benchmark_placement_new_on_gpu_262144); - -void benchmark_placement_new_on_gpu_2097152(benchmark::State& state) -{ - while (state.KeepRunning()) { - ClassWithSize<2097152>* address; - cudaMalloc(&address, sizeof(ClassWithSize<2097152>)); - placement_new_kernel<<<1, 1>>>(address); - placement_delete_kernel<<<1, 1>>>(address); - cudaFree(address); - cudaDeviceSynchronize(); + state.ResumeTiming(); } } -BENCHMARK(benchmark_placement_new_on_gpu_2097152); +BENCHMARK_TEMPLATE(benchmark_placement_new_on_gpu, 8); +BENCHMARK_TEMPLATE(benchmark_placement_new_on_gpu, 64); +BENCHMARK_TEMPLATE(benchmark_placement_new_on_gpu, 512); +BENCHMARK_TEMPLATE(benchmark_placement_new_on_gpu, 4096); +BENCHMARK_TEMPLATE(benchmark_placement_new_on_gpu, 32768); +BENCHMARK_TEMPLATE(benchmark_placement_new_on_gpu, 262144); +BENCHMARK_TEMPLATE(benchmark_placement_new_on_gpu, 2097152); // Benchmark how long it takes to call new on the GPU template @@ -335,103 +221,32 @@ __global__ void delete_kernel(ClassWithSize** address) { delete *address; } -void benchmark_new_on_gpu_8(benchmark::State& state) -{ - while (state.KeepRunning()) { - ClassWithSize<8>** 
buffer; - cudaMalloc(&buffer, sizeof(ClassWithSize<8>*)); - create_kernel<<<1, 1>>>(buffer); - delete_kernel<<<1, 1>>>(buffer); - cudaFree(buffer); - cudaDeviceSynchronize(); - } -} - -BENCHMARK(benchmark_new_on_gpu_8); - -void benchmark_new_on_gpu_64(benchmark::State& state) -{ - while (state.KeepRunning()) { - ClassWithSize<64>** buffer; - cudaMalloc(&buffer, sizeof(ClassWithSize<64>*)); - create_kernel<<<1, 1>>>(buffer); - delete_kernel<<<1, 1>>>(buffer); - cudaFree(buffer); - cudaDeviceSynchronize(); - } -} - -BENCHMARK(benchmark_new_on_gpu_64); - -void benchmark_new_on_gpu_512(benchmark::State& state) -{ - while (state.KeepRunning()) { - ClassWithSize<512>** buffer; - cudaMalloc(&buffer, sizeof(ClassWithSize<512>*)); - create_kernel<<<1, 1>>>(buffer); - delete_kernel<<<1, 1>>>(buffer); - cudaFree(buffer); - cudaDeviceSynchronize(); - } -} - -BENCHMARK(benchmark_new_on_gpu_512); - -void benchmark_new_on_gpu_4096(benchmark::State& state) -{ - while (state.KeepRunning()) { - ClassWithSize<4096>** buffer; - cudaMalloc(&buffer, sizeof(ClassWithSize<4096>*)); - create_kernel<<<1, 1>>>(buffer); - delete_kernel<<<1, 1>>>(buffer); - cudaFree(buffer); - cudaDeviceSynchronize(); - } -} - -BENCHMARK(benchmark_new_on_gpu_4096); - -void benchmark_new_on_gpu_32768(benchmark::State& state) +template +static void benchmark_new_on_gpu(benchmark::State& state) { while (state.KeepRunning()) { - ClassWithSize<32768>** buffer; - cudaMalloc(&buffer, sizeof(ClassWithSize<32768>*)); + ClassWithSize** buffer; + cudaMalloc(&buffer, sizeof(ClassWithSize*)); create_kernel<<<1, 1>>>(buffer); - delete_kernel<<<1, 1>>>(buffer); - cudaFree(buffer); cudaDeviceSynchronize(); - } -} -BENCHMARK(benchmark_new_on_gpu_32768); + state.PauseTiming(); -void benchmark_new_on_gpu_262144(benchmark::State& state) -{ - while (state.KeepRunning()) { - ClassWithSize<262144>** buffer; - cudaMalloc(&buffer, sizeof(ClassWithSize<262144>*)); - create_kernel<<<1, 1>>>(buffer); delete_kernel<<<1, 1>>>(buffer); cudaFree(buffer); cudaDeviceSynchronize(); - } -} - -BENCHMARK(benchmark_new_on_gpu_262144); -void benchmark_new_on_gpu_2097152(benchmark::State& state) -{ - while (state.KeepRunning()) { - ClassWithSize<2097152>** buffer; - cudaMalloc(&buffer, sizeof(ClassWithSize<2097152>*)); - create_kernel<<<1, 1>>>(buffer); - delete_kernel<<<1, 1>>>(buffer); - cudaFree(buffer); - cudaDeviceSynchronize(); + state.ResumeTiming(); } } -BENCHMARK(benchmark_new_on_gpu_2097152); +BENCHMARK_TEMPLATE(benchmark_new_on_gpu, 8); +BENCHMARK_TEMPLATE(benchmark_new_on_gpu, 64); +BENCHMARK_TEMPLATE(benchmark_new_on_gpu, 512); +BENCHMARK_TEMPLATE(benchmark_new_on_gpu, 4096); +BENCHMARK_TEMPLATE(benchmark_new_on_gpu, 32768); +BENCHMARK_TEMPLATE(benchmark_new_on_gpu, 262144); +BENCHMARK_TEMPLATE(benchmark_new_on_gpu, 2097152); // Benchmark current approach template @@ -439,131 +254,35 @@ __global__ void delete_kernel_2(ClassWithSize* address) { delete address; } -void benchmark_new_on_gpu_and_copy_to_host_8(benchmark::State& state) -{ - while (state.KeepRunning()) { - ClassWithSize<8>** gpuBuffer; - cudaMalloc(&gpuBuffer, sizeof(ClassWithSize<8>*)); - create_kernel<<<1, 1>>>(gpuBuffer); - ClassWithSize<8>** cpuBuffer = (ClassWithSize<8>**) malloc(sizeof(ClassWithSize<8>*)); - cudaMemcpy(cpuBuffer, gpuBuffer, sizeof(ClassWithSize<8>*), cudaMemcpyDeviceToHost); - cudaFree(gpuBuffer); - ClassWithSize<8>* gpuPointer = cpuBuffer[0]; - delete_kernel_2<<<1, 1>>>(gpuPointer); - free(cpuBuffer); - cudaDeviceSynchronize(); - } -} - 
-BENCHMARK(benchmark_new_on_gpu_and_copy_to_host_8); - -void benchmark_new_on_gpu_and_copy_to_host_64(benchmark::State& state) -{ - while (state.KeepRunning()) { - ClassWithSize<64>** gpuBuffer; - cudaMalloc(&gpuBuffer, sizeof(ClassWithSize<64>*)); - create_kernel<<<1, 1>>>(gpuBuffer); - ClassWithSize<64>** cpuBuffer = (ClassWithSize<64>**) malloc(sizeof(ClassWithSize<64>*)); - cudaMemcpy(cpuBuffer, gpuBuffer, sizeof(ClassWithSize<64>*), cudaMemcpyDeviceToHost); - cudaFree(gpuBuffer); - ClassWithSize<64>* gpuPointer = cpuBuffer[0]; - delete_kernel_2<<<1, 1>>>(gpuPointer); - free(cpuBuffer); - cudaDeviceSynchronize(); - } -} - -BENCHMARK(benchmark_new_on_gpu_and_copy_to_host_64); - -void benchmark_new_on_gpu_and_copy_to_host_512(benchmark::State& state) -{ - while (state.KeepRunning()) { - ClassWithSize<512>** gpuBuffer; - cudaMalloc(&gpuBuffer, sizeof(ClassWithSize<512>*)); - create_kernel<<<1, 1>>>(gpuBuffer); - ClassWithSize<512>** cpuBuffer = (ClassWithSize<512>**) malloc(sizeof(ClassWithSize<512>*)); - cudaMemcpy(cpuBuffer, gpuBuffer, sizeof(ClassWithSize<512>*), cudaMemcpyDeviceToHost); - cudaFree(gpuBuffer); - ClassWithSize<512>* gpuPointer = cpuBuffer[0]; - delete_kernel_2<<<1, 1>>>(gpuPointer); - free(cpuBuffer); - cudaDeviceSynchronize(); - } -} - -BENCHMARK(benchmark_new_on_gpu_and_copy_to_host_512); - -void benchmark_new_on_gpu_and_copy_to_host_4096(benchmark::State& state) -{ - while (state.KeepRunning()) { - ClassWithSize<4096>** gpuBuffer; - cudaMalloc(&gpuBuffer, sizeof(ClassWithSize<4096>*)); - create_kernel<<<1, 1>>>(gpuBuffer); - ClassWithSize<4096>** cpuBuffer = (ClassWithSize<4096>**) malloc(sizeof(ClassWithSize<4096>*)); - cudaMemcpy(cpuBuffer, gpuBuffer, sizeof(ClassWithSize<4096>*), cudaMemcpyDeviceToHost); - cudaFree(gpuBuffer); - ClassWithSize<4096>* gpuPointer = cpuBuffer[0]; - delete_kernel_2<<<1, 1>>>(gpuPointer); - free(cpuBuffer); - cudaDeviceSynchronize(); - } -} - -BENCHMARK(benchmark_new_on_gpu_and_copy_to_host_4096); - -void benchmark_new_on_gpu_and_copy_to_host_32768(benchmark::State& state) +template +static void benchmark_new_on_gpu_and_copy_to_host(benchmark::State& state) { while (state.KeepRunning()) { - ClassWithSize<32768>** gpuBuffer; - cudaMalloc(&gpuBuffer, sizeof(ClassWithSize<32768>*)); + ClassWithSize** gpuBuffer; + cudaMalloc(&gpuBuffer, sizeof(ClassWithSize*)); create_kernel<<<1, 1>>>(gpuBuffer); - ClassWithSize<32768>** cpuBuffer = (ClassWithSize<32768>**) malloc(sizeof(ClassWithSize<32768>*)); - cudaMemcpy(cpuBuffer, gpuBuffer, sizeof(ClassWithSize<32768>*), cudaMemcpyDeviceToHost); + ClassWithSize** cpuBuffer = (ClassWithSize**) malloc(sizeof(ClassWithSize*)); + cudaMemcpy(cpuBuffer, gpuBuffer, sizeof(ClassWithSize*), cudaMemcpyDeviceToHost); cudaFree(gpuBuffer); - ClassWithSize<32768>* gpuPointer = cpuBuffer[0]; - delete_kernel_2<<<1, 1>>>(gpuPointer); + ClassWithSize* gpuPointer = cpuBuffer[0]; free(cpuBuffer); - cudaDeviceSynchronize(); - } -} -BENCHMARK(benchmark_new_on_gpu_and_copy_to_host_32768); + state.PauseTiming(); -void benchmark_new_on_gpu_and_copy_to_host_262144(benchmark::State& state) -{ - while (state.KeepRunning()) { - ClassWithSize<262144>** gpuBuffer; - cudaMalloc(&gpuBuffer, sizeof(ClassWithSize<262144>*)); - create_kernel<<<1, 1>>>(gpuBuffer); - ClassWithSize<262144>** cpuBuffer = (ClassWithSize<262144>**) malloc(sizeof(ClassWithSize<262144>*)); - cudaMemcpy(cpuBuffer, gpuBuffer, sizeof(ClassWithSize<262144>*), cudaMemcpyDeviceToHost); - cudaFree(gpuBuffer); - ClassWithSize<262144>* gpuPointer = cpuBuffer[0]; 
delete_kernel_2<<<1, 1>>>(gpuPointer); - free(cpuBuffer); cudaDeviceSynchronize(); - } -} -BENCHMARK(benchmark_new_on_gpu_and_copy_to_host_262144); - -void benchmark_new_on_gpu_and_copy_to_host_2097152(benchmark::State& state) -{ - while (state.KeepRunning()) { - ClassWithSize<2097152>** gpuBuffer; - cudaMalloc(&gpuBuffer, sizeof(ClassWithSize<2097152>*)); - create_kernel<<<1, 1>>>(gpuBuffer); - ClassWithSize<2097152>** cpuBuffer = (ClassWithSize<2097152>**) malloc(sizeof(ClassWithSize<2097152>*)); - cudaMemcpy(cpuBuffer, gpuBuffer, sizeof(ClassWithSize<2097152>*), cudaMemcpyDeviceToHost); - cudaFree(gpuBuffer); - ClassWithSize<2097152>* gpuPointer = cpuBuffer[0]; - delete_kernel_2<<<1, 1>>>(gpuPointer); - free(cpuBuffer); - cudaDeviceSynchronize(); + state.ResumeTiming(); } } -BENCHMARK(benchmark_new_on_gpu_and_copy_to_host_2097152); +BENCHMARK_TEMPLATE(benchmark_new_on_gpu_and_copy_to_host, 8); +BENCHMARK_TEMPLATE(benchmark_new_on_gpu_and_copy_to_host, 64); +BENCHMARK_TEMPLATE(benchmark_new_on_gpu_and_copy_to_host, 512); +BENCHMARK_TEMPLATE(benchmark_new_on_gpu_and_copy_to_host, 4096); +BENCHMARK_TEMPLATE(benchmark_new_on_gpu_and_copy_to_host, 32768); +BENCHMARK_TEMPLATE(benchmark_new_on_gpu_and_copy_to_host, 262144); +BENCHMARK_TEMPLATE(benchmark_new_on_gpu_and_copy_to_host, 2097152); // Benchmark how long it takes to create a stack object on the GPU template @@ -571,76 +290,34 @@ __global__ void create_on_stack_kernel() { (void) ClassWithSize(); } -void benchmark_create_on_stack_on_gpu_8(benchmark::State& state) -{ - while (state.KeepRunning()) { - create_on_stack_kernel<8><<<1, 1>>>(); - cudaDeviceSynchronize(); - } -} - -BENCHMARK(benchmark_create_on_stack_on_gpu_8); - -void benchmark_create_on_stack_on_gpu_64(benchmark::State& state) -{ - while (state.KeepRunning()) { - create_on_stack_kernel<64><<<1, 1>>>(); - cudaDeviceSynchronize(); - } -} - -BENCHMARK(benchmark_create_on_stack_on_gpu_64); - -void benchmark_create_on_stack_on_gpu_512(benchmark::State& state) -{ - while (state.KeepRunning()) { - create_on_stack_kernel<512><<<1, 1>>>(); - cudaDeviceSynchronize(); - } -} - -BENCHMARK(benchmark_create_on_stack_on_gpu_512); - -void benchmark_create_on_stack_on_gpu_4096(benchmark::State& state) +template +static void benchmark_create_on_stack_on_gpu(benchmark::State& state) { while (state.KeepRunning()) { - create_on_stack_kernel<4096><<<1, 1>>>(); + create_on_stack_kernel<<<1, 1>>>(); cudaDeviceSynchronize(); } } -BENCHMARK(benchmark_create_on_stack_on_gpu_4096); +BENCHMARK_TEMPLATE(benchmark_create_on_stack_on_gpu, 8); +BENCHMARK_TEMPLATE(benchmark_create_on_stack_on_gpu, 64); +BENCHMARK_TEMPLATE(benchmark_create_on_stack_on_gpu, 512); +BENCHMARK_TEMPLATE(benchmark_create_on_stack_on_gpu, 4096); +BENCHMARK_TEMPLATE(benchmark_create_on_stack_on_gpu, 32768); +BENCHMARK_TEMPLATE(benchmark_create_on_stack_on_gpu, 262144); +BENCHMARK_TEMPLATE(benchmark_create_on_stack_on_gpu, 2097152); -void benchmark_create_on_stack_on_gpu_32768(benchmark::State& state) +void benchmark_managed_ptr_use_gpu(benchmark::State& state) { - while (state.KeepRunning()) { - create_on_stack_kernel<32768><<<1, 1>>>(); - cudaDeviceSynchronize(); - } -} - -BENCHMARK(benchmark_create_on_stack_on_gpu_32768); + chai::managed_ptr helper = chai::make_managed(2); -void benchmark_create_on_stack_on_gpu_262144(benchmark::State& state) -{ while (state.KeepRunning()) { - create_on_stack_kernel<262144><<<1, 1>>>(); - cudaDeviceSynchronize(); + forall(gpu(), 0, 1, [=] __device__ (int i) { (void) helper->getValue(); }); } -} - 
-BENCHMARK(benchmark_create_on_stack_on_gpu_262144); -void benchmark_create_on_stack_on_gpu_2097152(benchmark::State& state) -{ - while (state.KeepRunning()) { - create_on_stack_kernel<2097152><<<1, 1>>>(); - cudaDeviceSynchronize(); - } + helper.free(); } -BENCHMARK(benchmark_create_on_stack_on_gpu_2097152); - BENCHMARK(benchmark_managed_ptr_use_gpu); // Curiously recurring template pattern From 701cd76b153ca0d5b63c2e938d5f88fa19620255 Mon Sep 17 00:00:00 2001 From: Alan Dayton Date: Fri, 11 Oct 2019 09:25:27 -0700 Subject: [PATCH 38/58] Add another benchmark --- benchmarks/chai_managed_ptr_benchmarks.cpp | 30 ++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/benchmarks/chai_managed_ptr_benchmarks.cpp b/benchmarks/chai_managed_ptr_benchmarks.cpp index ec0ff71c..ae268f20 100644 --- a/benchmarks/chai_managed_ptr_benchmarks.cpp +++ b/benchmarks/chai_managed_ptr_benchmarks.cpp @@ -172,6 +172,36 @@ BENCHMARK_TEMPLATE(benchmark_pass_copy_to_gpu, 64); BENCHMARK_TEMPLATE(benchmark_pass_copy_to_gpu, 512); BENCHMARK_TEMPLATE(benchmark_pass_copy_to_gpu, 4096); +template +static void benchmark_copy_to_gpu(benchmark::State& state) +{ + ClassWithSize* cpuPointer = new ClassWithSize(); + + while (state.KeepRunning()) { + ClassWithSize* gpuPointer; + cudaMalloc(&gpuPointer, sizeof(ClassWithSize)); + cudaMemcpy(gpuPointer, cpuPointer, sizeof(ClassWithSize), cudaMemcpyHostToDevice); + cudaDeviceSynchronize(); + + state.PauseTiming(); + + cudaFree(gpuPointer); + cudaDeviceSynchronize(); + + state.ResumeTiming(); + } + + delete cpuPointer; +} + +BENCHMARK_TEMPLATE(benchmark_copy_to_gpu, 8); +BENCHMARK_TEMPLATE(benchmark_copy_to_gpu, 64); +BENCHMARK_TEMPLATE(benchmark_copy_to_gpu, 512); +BENCHMARK_TEMPLATE(benchmark_copy_to_gpu, 4096); +BENCHMARK_TEMPLATE(benchmark_copy_to_gpu, 32768); +BENCHMARK_TEMPLATE(benchmark_copy_to_gpu, 262144); +BENCHMARK_TEMPLATE(benchmark_copy_to_gpu, 2097152); + // Benchmark how long it takes to call placement new on the GPU template __global__ void placement_new_kernel(ClassWithSize* address) { From 519dc1f52f24780fab40f3aeabd9fdc71d544939 Mon Sep 17 00:00:00 2001 From: Alan Dayton Date: Fri, 11 Oct 2019 09:53:49 -0700 Subject: [PATCH 39/58] Remove PauseTiming since it completely messes up the benchmarks --- benchmarks/chai_managed_ptr_benchmarks.cpp | 23 ---------------------- 1 file changed, 23 deletions(-) diff --git a/benchmarks/chai_managed_ptr_benchmarks.cpp b/benchmarks/chai_managed_ptr_benchmarks.cpp index ae268f20..c82eb186 100644 --- a/benchmarks/chai_managed_ptr_benchmarks.cpp +++ b/benchmarks/chai_managed_ptr_benchmarks.cpp @@ -181,14 +181,8 @@ static void benchmark_copy_to_gpu(benchmark::State& state) ClassWithSize* gpuPointer; cudaMalloc(&gpuPointer, sizeof(ClassWithSize)); cudaMemcpy(gpuPointer, cpuPointer, sizeof(ClassWithSize), cudaMemcpyHostToDevice); - cudaDeviceSynchronize(); - - state.PauseTiming(); - cudaFree(gpuPointer); cudaDeviceSynchronize(); - - state.ResumeTiming(); } delete cpuPointer; @@ -220,15 +214,9 @@ static void benchmark_placement_new_on_gpu(benchmark::State& state) ClassWithSize* address; cudaMalloc(&address, sizeof(ClassWithSize)); placement_new_kernel<<<1, 1>>>(address); - cudaDeviceSynchronize(); - - state.PauseTiming(); - placement_delete_kernel<<<1, 1>>>(address); cudaFree(address); cudaDeviceSynchronize(); - - state.ResumeTiming(); } } @@ -258,15 +246,9 @@ static void benchmark_new_on_gpu(benchmark::State& state) ClassWithSize** buffer; cudaMalloc(&buffer, sizeof(ClassWithSize*)); create_kernel<<<1, 1>>>(buffer); - 
cudaDeviceSynchronize(); - - state.PauseTiming(); - delete_kernel<<<1, 1>>>(buffer); cudaFree(buffer); cudaDeviceSynchronize(); - - state.ResumeTiming(); } } @@ -296,13 +278,8 @@ static void benchmark_new_on_gpu_and_copy_to_host(benchmark::State& state) cudaFree(gpuBuffer); ClassWithSize* gpuPointer = cpuBuffer[0]; free(cpuBuffer); - - state.PauseTiming(); - delete_kernel_2<<<1, 1>>>(gpuPointer); cudaDeviceSynchronize(); - - state.ResumeTiming(); } } From 4dad7285822e951abc5e3036b8247704f09823ca Mon Sep 17 00:00:00 2001 From: Alan Dayton Date: Fri, 11 Oct 2019 14:48:52 -0700 Subject: [PATCH 40/58] Made benchmark non-trivial --- benchmarks/chai_managed_ptr_benchmarks.cpp | 147 +++++++++++++++++---- 1 file changed, 119 insertions(+), 28 deletions(-) diff --git a/benchmarks/chai_managed_ptr_benchmarks.cpp b/benchmarks/chai_managed_ptr_benchmarks.cpp index c82eb186..b47c2a48 100644 --- a/benchmarks/chai_managed_ptr_benchmarks.cpp +++ b/benchmarks/chai_managed_ptr_benchmarks.cpp @@ -51,14 +51,18 @@ class Base { public: - CHAI_HOST_DEVICE virtual int getValue() const = 0; + CHAI_HOST_DEVICE virtual void scale(size_t numValues, int* values) = 0; }; class Derived : public Base { public: CHAI_HOST_DEVICE Derived(int value) : Base(), m_value(value) {} - CHAI_HOST_DEVICE int getValue() const override { return m_value; } + CHAI_HOST_DEVICE virtual void scale(size_t numValues, int* values) override { + for (size_t i = 0; i < numValues; ++i) { + values[i] *= m_value; + } + } private: int m_value = -1; @@ -67,8 +71,8 @@ class Derived : public Base { template class BaseCRTP { public: - CHAI_HOST_DEVICE int getValue() const { - return static_cast(this)->getValue(); + CHAI_HOST_DEVICE void scale(size_t numValues, int* values) { + return static_cast(this)->scale(numValues, values); } }; @@ -76,7 +80,11 @@ class DerivedCRTP : public BaseCRTP { public: CHAI_HOST_DEVICE DerivedCRTP(int value) : BaseCRTP(), m_value(value) {} - CHAI_HOST_DEVICE int getValue() const { return m_value; } + CHAI_HOST_DEVICE void scale(size_t numValues, int* values) { + for (size_t i = 0; i < numValues; ++i) { + values[i] *= m_value; + } + } private: int m_value = -1; @@ -86,7 +94,11 @@ class NoInheritance { public: CHAI_HOST_DEVICE NoInheritance(int value) : m_value(value) {} - CHAI_HOST_DEVICE int getValue() const { return m_value; } + CHAI_HOST_DEVICE void scale(size_t numValues, int* values) { + for (size_t i = 0; i < numValues; ++i) { + values[i] *= m_value; + } + } private: int m_value = -1; @@ -109,29 +121,49 @@ static void benchmark_managed_ptr_construction_and_destruction(benchmark::State& BENCHMARK(benchmark_managed_ptr_construction_and_destruction); // managed_ptr -static void benchmark_managed_ptr_use_cpu(benchmark::State& state) +static void benchmark_use_managed_ptr_cpu(benchmark::State& state) { - chai::managed_ptr helper = chai::make_managed(1); + chai::managed_ptr object = chai::make_managed(2); + + size_t numValues = 100; + int* values = (int*) malloc(100 * sizeof(int)); + + for (size_t i = 0; i < numValues; ++i) { + values[i] = i * i; + } + +#ifdef __CUDACC__ + cudaDeviceSynchronize(); +#endif while (state.KeepRunning()) { - forall(sequential(), 0, 1, [=] (int i) { (void) helper->getValue(); }); + object->scale(numValues, values); } - helper.free(); + object.free(); + cudaDeviceSynchronize(); } -BENCHMARK(benchmark_managed_ptr_use_cpu); +BENCHMARK(benchmark_use_managed_ptr_cpu); // Curiously recurring template pattern static void benchmark_curiously_recurring_template_pattern_cpu(benchmark::State& state) { - 
BaseCRTP* helper = new DerivedCRTP(3); + BaseCRTP* object = new DerivedCRTP(2); + + size_t numValues = 100; + int* values = (int*) malloc(100 * sizeof(int)); + + for (size_t i = 0; i < numValues; ++i) { + values[i] = i * i; + } while (state.KeepRunning()) { - forall(sequential(), 0, 1, [=] (int i) { (void) helper->getValue(); }); + object->scale(numValues, values); } - delete helper; + free(values); + delete object; } BENCHMARK(benchmark_curiously_recurring_template_pattern_cpu); @@ -139,13 +171,21 @@ BENCHMARK(benchmark_curiously_recurring_template_pattern_cpu); // Class without inheritance static void benchmark_no_inheritance_cpu(benchmark::State& state) { - NoInheritance* helper = new NoInheritance(5); + NoInheritance* object = new NoInheritance(2); + + size_t numValues = 100; + int* values = (int*) malloc(100 * sizeof(int)); + + for (size_t i = 0; i < numValues; ++i) { + values[i] = i * i; + } while (state.KeepRunning()) { - forall(sequential(), 0, 1, [=] (int i) { (void) helper->getValue(); }); + object->scale(numValues, values); } - delete helper; + free(values); + delete object; } BENCHMARK(benchmark_no_inheritance_cpu); @@ -314,45 +354,96 @@ BENCHMARK_TEMPLATE(benchmark_create_on_stack_on_gpu, 32768); BENCHMARK_TEMPLATE(benchmark_create_on_stack_on_gpu, 262144); BENCHMARK_TEMPLATE(benchmark_create_on_stack_on_gpu, 2097152); -void benchmark_managed_ptr_use_gpu(benchmark::State& state) +// Use managed_ptr +__global__ void fill(size_t numValues, int* values) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + + if (i < numValues) { + values[i] = i * i; + } +} + +__global__ void square(chai::managed_ptr object, size_t numValues, int* values) { + object->scale(numValues, values); +} + +void benchmark_use_managed_ptr_gpu(benchmark::State& state) { - chai::managed_ptr helper = chai::make_managed(2); + chai::managed_ptr object = chai::make_managed(2); + + size_t numValues = 100; + int* values; + cudaMalloc(&values, numValues * sizeof(int)); + fill<<<1, 100>>>(numValues, values); + + cudaDeviceSynchronize(); while (state.KeepRunning()) { - forall(gpu(), 0, 1, [=] __device__ (int i) { (void) helper->getValue(); }); + square<<<1, 1>>>(object, numValues, values); + cudaDeviceSynchronize(); } - helper.free(); + cudaFree(values); + object.free(); + cudaDeviceSynchronize(); } -BENCHMARK(benchmark_managed_ptr_use_gpu); +BENCHMARK(benchmark_use_managed_ptr_gpu); // Curiously recurring template pattern +__global__ void square(BaseCRTP object, size_t numValues, int* values) { + object.scale(numValues, values); +} + void benchmark_curiously_recurring_template_pattern_gpu(benchmark::State& state) { - BaseCRTP* derivedCRTP = new DerivedCRTP(4); - auto helper = *derivedCRTP; + BaseCRTP* derivedCRTP = new DerivedCRTP(2); + auto object = *derivedCRTP; + + size_t numValues = 100; + int* values; + cudaMalloc(&values, numValues * sizeof(int)); + fill<<<1, 100>>>(numValues, values); + + cudaDeviceSynchronize(); while (state.KeepRunning()) { - forall(gpu(), 0, 1, [=] __device__ (int i) { (void) helper.getValue(); }); + square<<<1, 1>>>(object, numValues, values); + cudaDeviceSynchronize(); } + cudaFree(values); delete derivedCRTP; + cudaDeviceSynchronize(); } BENCHMARK(benchmark_curiously_recurring_template_pattern_gpu); // Class without inheritance +__global__ void square(NoInheritance object, size_t numValues, int* values) { + object.scale(numValues, values); +} + void benchmark_no_inheritance_gpu(benchmark::State& state) { - NoInheritance* noInheritance = new NoInheritance(5); - auto helper = 
*noInheritance; + NoInheritance* noInheritance = new NoInheritance(2); + auto object = *noInheritance; + + size_t numValues = 100; + int* values; + cudaMalloc(&values, numValues * sizeof(int)); + fill<<<1, 100>>>(numValues, values); + + cudaDeviceSynchronize(); while (state.KeepRunning()) { - forall(gpu(), 0, 1, [=] __device__ (int i) { (void) helper.getValue(); }); + square<<<1, 1>>>(object, numValues, values); + cudaDeviceSynchronize(); } + cudaFree(values); delete noInheritance; + cudaDeviceSynchronize(); } BENCHMARK(benchmark_no_inheritance_gpu); From ea602f1e91a1f5ead66005d874dd17dafc03e7bf Mon Sep 17 00:00:00 2001 From: David Beckingsale Date: Wed, 16 Oct 2019 06:49:05 -0700 Subject: [PATCH 41/58] Switchout to use Umpire's logging macro for better log support --- src/chai/ArrayManager.cpp | 9 ++++----- src/chai/ArrayManager.inl | 2 +- src/chai/ChaiMacros.hpp | 22 ++++++++++++++++++---- src/chai/ManagedArray.inl | 14 +++++++------- src/chai/ManagedArray_thin.inl | 24 ++++++++++++------------ 5 files changed, 42 insertions(+), 29 deletions(-) diff --git a/src/chai/ArrayManager.cpp b/src/chai/ArrayManager.cpp index 54a32176..82dc3795 100644 --- a/src/chai/ArrayManager.cpp +++ b/src/chai/ArrayManager.cpp @@ -85,7 +85,7 @@ void ArrayManager::registerPointer( ExecutionSpace space, bool owned) { - CHAI_LOG("ArrayManager", "Registering " << pointer << " in space " << space); + CHAI_LOG(Debug, "Registering " << pointer << " in space " << space); std::lock_guard lock(m_mutex); @@ -113,7 +113,7 @@ void ArrayManager::deregisterPointer(PointerRecord* record) void ArrayManager::setExecutionSpace(ExecutionSpace space) { - CHAI_LOG("ArrayManager", "Setting execution space to " << space); + CHAI_LOG(Debug, "Setting execution space to " << space); std::lock_guard lock(m_mutex); m_current_execution_space = space; @@ -150,8 +150,7 @@ void ArrayManager::registerTouch(PointerRecord* pointer_record) void ArrayManager::registerTouch(PointerRecord* pointer_record, ExecutionSpace space) { - CHAI_LOG("ArrayManager", - pointer << " touched in space " << space); + CHAI_LOG(Debug, pointer << " touched in space " << space); if (space != NONE) { std::lock_guard lock(m_mutex); @@ -217,7 +216,7 @@ void ArrayManager::allocate( registerPointer(pointer_record, space); - CHAI_LOG("ArrayManager", "Allocated array at: " << ret); + CHAI_LOG(Debug, "Allocated array at: " << ret); } void ArrayManager::free(PointerRecord* pointer_record) diff --git a/src/chai/ArrayManager.inl b/src/chai/ArrayManager.inl index ff6c8cf9..7c160439 100644 --- a/src/chai/ArrayManager.inl +++ b/src/chai/ArrayManager.inl @@ -72,7 +72,7 @@ void* ArrayManager::reallocate(void* pointer, size_t elems, PointerRecord* point for (int space = CPU; space < NUM_EXECUTION_SPACES; ++space) { if(!pointer_record->m_owned[space]) { - CHAI_LOG("ArrayManager", "Cannot reallocate unowned pointer"); + CHAI_LOG(Debug, "Cannot reallocate unowned pointer"); return pointer_record->m_pointers[my_space]; } } diff --git a/src/chai/ChaiMacros.hpp b/src/chai/ChaiMacros.hpp index e05dac6d..ca0fd83d 100644 --- a/src/chai/ChaiMacros.hpp +++ b/src/chai/ChaiMacros.hpp @@ -45,6 +45,8 @@ #include "chai/config.hpp" +#include "umpire/util/Macros.hpp" + #if defined(CHAI_ENABLE_CUDA) && defined(__CUDACC__) #define CHAI_HOST __host__ @@ -71,11 +73,23 @@ #define CHAI_UNUSED_ARG(X) -#ifdef DEBUG -#define CHAI_LOG(file, msg) \ - std::cerr << "[" << file << "] " << msg << std::endl; +#if !defined(CHAI_DISABLE_RM) + +#define CHAI_LOG(level, msg) \ + UMPIRE_LOG(level, msg); + +#else + +#if 
defined(DEBUG) + +#define CHAI_LOG(level, msg) \ + std::cerr << "[" << __FILE__ << "] " << msg << std::endl; + #else -#define CHAI_LOG(file, msg) + +#define CHAI_LOG(level, msg) + +#endif #endif #endif // CHAI_ChaiMacros_HPP diff --git a/src/chai/ManagedArray.inl b/src/chai/ManagedArray.inl index 18f4db7a..04615406 100644 --- a/src/chai/ManagedArray.inl +++ b/src/chai/ManagedArray.inl @@ -197,7 +197,7 @@ CHAI_HOST ManagedArray ManagedArray::slice(size_t offset, size_t elems) { ManagedArray slice(nullptr); slice.m_resource_manager = m_resource_manager; if(offset + elems > size()) { - CHAI_LOG("ManagedArray", "Invalid slice. No active pointer or index out of bounds"); + CHAI_LOG(Debug, "Invalid slice. No active pointer or index out of bounds"); } else { slice.m_pointer_record = m_pointer_record; slice.m_active_base_pointer = m_active_base_pointer; @@ -216,7 +216,7 @@ CHAI_HOST void ManagedArray::allocate( const UserCallback& cback) { if(!m_is_slice) { - CHAI_LOG("ManagedArray", "Allocating array of size " << elems << " in space " << space); + CHAI_LOG(Debug, "Allocating array of size " << elems << " in space " << space); if (space == NONE) { space = m_resource_manager->getDefaultAllocationSpace(); @@ -231,7 +231,7 @@ CHAI_HOST void ManagedArray::allocate( m_active_base_pointer = static_cast(m_pointer_record->m_pointers[space]); m_active_pointer = m_active_base_pointer; // Cannot be a slice - CHAI_LOG("ManagedArray", "m_active_ptr allocated at address: " << m_active_pointer); + CHAI_LOG(Debug, "m_active_ptr allocated at address: " << m_active_pointer); } } @@ -240,7 +240,7 @@ CHAI_INLINE CHAI_HOST void ManagedArray::reallocate(size_t elems) { if(!m_is_slice) { - CHAI_LOG("ManagedArray", "Reallocating array of size " << m_elems << " with new size" << elems); + CHAI_LOG(Debug, "Reallocating array of size " << m_elems << " with new size" << elems); m_elems = elems; m_active_base_pointer = @@ -248,7 +248,7 @@ CHAI_HOST void ManagedArray::reallocate(size_t elems) m_pointer_record)); m_active_pointer = m_active_base_pointer; // Cannot be a slice - CHAI_LOG("ManagedArray", "m_active_ptr reallocated at address: " << m_active_pointer); + CHAI_LOG(Debug, "m_active_ptr reallocated at address: " << m_active_pointer); } } @@ -260,7 +260,7 @@ CHAI_HOST void ManagedArray::free() m_resource_manager->free(m_pointer_record); m_pointer_record = nullptr; } else { - CHAI_LOG("ManagedArray", "Cannot free a slice!"); + CHAI_LOG(Debug, "Cannot free a slice!"); } } @@ -371,7 +371,7 @@ void ManagedArray::move(ExecutionSpace space) m_active_pointer = m_active_base_pointer + m_offset; if (!std::is_const::value) { - CHAI_LOG("ManagedArray", "T is non-const, registering touch of pointer" << m_active_pointer); + CHAI_LOG(Debug, "T is non-const, registering touch of pointer" << m_active_pointer); m_resource_manager->registerTouch(m_pointer_record, space); } diff --git a/src/chai/ManagedArray_thin.inl b/src/chai/ManagedArray_thin.inl index 528751ee..09d09e73 100644 --- a/src/chai/ManagedArray_thin.inl +++ b/src/chai/ManagedArray_thin.inl @@ -121,7 +121,7 @@ CHAI_INLINE CHAI_HOST ManagedArray ManagedArray::slice(size_t offset, size_t elems) { ManagedArray slice; if (offset + elems > size()) { - CHAI_LOG("ManagedArray", "Invalid slice. No active pointer or index out of bounds"); + CHAI_LOG(Debug, "Invalid slice. 
No active pointer or index out of bounds"); } else { slice.m_active_pointer = m_active_pointer + offset; slice.m_elems = elems; @@ -137,9 +137,9 @@ CHAI_HOST void ManagedArray::allocate(size_t elems, UserCallback const &) { if (!m_is_slice) { (void) space; // Quiet compiler warning when CHAI_LOG does nothing - CHAI_LOG("ManagedArray", "Allocating array of size " << elems - << " in space " - << space); + CHAI_LOG(Debug, "Allocating array of size " << elems + << " in space " + << space); m_elems = elems; @@ -149,10 +149,10 @@ CHAI_HOST void ManagedArray::allocate(size_t elems, m_active_pointer = static_cast(malloc(sizeof(T) * elems)); #endif - CHAI_LOG("ManagedArray", "m_active_ptr allocated at address: " << m_active_pointer); + CHAI_LOG(Debug, "m_active_ptr allocated at address: " << m_active_pointer); } else { - CHAI_LOG("ManagedArray", "Attempted to allocate slice!"); + CHAI_LOG(Debug, "Attempted to allocate slice!"); } } @@ -161,9 +161,9 @@ CHAI_INLINE CHAI_HOST void ManagedArray::reallocate(size_t new_elems) { if (!m_is_slice) { - CHAI_LOG("ManagedArray", "Reallocating array of size " << m_elems - << " with new size" - << elems); + CHAI_LOG(Debug, "Reallocating array of size " << m_elems + << " with new size" + << elems); T* new_ptr; @@ -179,10 +179,10 @@ CHAI_HOST void ManagedArray::reallocate(size_t new_elems) m_active_pointer = new_ptr; m_active_base_pointer = m_active_pointer; - CHAI_LOG("ManagedArray", "m_active_ptr reallocated at address: " << m_active_pointer); + CHAI_LOG(Debug, "m_active_ptr reallocated at address: " << m_active_pointer); } else { - CHAI_LOG("ManagedArray", "Attempted to realloc slice!"); + CHAI_LOG(Debug, "Attempted to realloc slice!"); } } @@ -201,7 +201,7 @@ CHAI_HOST void ManagedArray::free() m_active_pointer = nullptr; } else { - CHAI_LOG("ManagedArray", "tried to free slice!"); + CHAI_LOG(Debug, "tried to free slice!"); } } From a051ee6889513933309add228543c701c8b7d7bb Mon Sep 17 00:00:00 2001 From: David Beckingsale Date: Wed, 16 Oct 2019 06:53:20 -0700 Subject: [PATCH 42/58] Update BLT version --- blt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/blt b/blt index fafdccc9..47089360 160000 --- a/blt +++ b/blt @@ -1 +1 @@ -Subproject commit fafdccc9a83dc293db1c0b678f859aa8a067b296 +Subproject commit 4708936054366585478d9c5430449358a0a3eb86 From df4dcb7deb74c3387899442965b42559a2369929 Mon Sep 17 00:00:00 2001 From: David Beckingsale Date: Wed, 16 Oct 2019 06:58:56 -0700 Subject: [PATCH 43/58] Fixup some incorrect log messages --- src/chai/ArrayManager.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/chai/ArrayManager.cpp b/src/chai/ArrayManager.cpp index 82dc3795..e6c55628 100644 --- a/src/chai/ArrayManager.cpp +++ b/src/chai/ArrayManager.cpp @@ -85,12 +85,11 @@ void ArrayManager::registerPointer( ExecutionSpace space, bool owned) { - CHAI_LOG(Debug, "Registering " << pointer << " in space " << space); - std::lock_guard lock(m_mutex); - auto pointer = record->m_pointers[space]; + CHAI_LOG(Debug, "Registering " << pointer << " in space " << space); + m_pointer_map.insert(pointer, record); //record->m_last_space = space; @@ -150,7 +149,7 @@ void ArrayManager::registerTouch(PointerRecord* pointer_record) void ArrayManager::registerTouch(PointerRecord* pointer_record, ExecutionSpace space) { - CHAI_LOG(Debug, pointer << " touched in space " << space); + CHAI_LOG(Debug, pointer_record->m_pointers[space] << " touched in space " << space); if (space != NONE) { std::lock_guard lock(m_mutex); @@ -216,7 +215,7 @@ 
void ArrayManager::allocate( registerPointer(pointer_record, space); - CHAI_LOG(Debug, "Allocated array at: " << ret); + CHAI_LOG(Debug, "Allocated array at: " << pointer_record->m_pointers[space]); } void ArrayManager::free(PointerRecord* pointer_record) From 37f15347c98546b1a38bd5f1ebe909441dcccc92 Mon Sep 17 00:00:00 2001 From: David Beckingsale Date: Wed, 16 Oct 2019 06:59:09 -0700 Subject: [PATCH 44/58] Bump Umpire to v1.1.0 --- src/tpl/umpire | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tpl/umpire b/src/tpl/umpire index 82482fd7..3db26e6a 160000 --- a/src/tpl/umpire +++ b/src/tpl/umpire @@ -1 +1 @@ -Subproject commit 82482fd7450ab378db110f06f7e0302112c22c05 +Subproject commit 3db26e6a2626ee8c0cfa5c9769cfac6e33587122 From a0a897ea2480542d4fec5c25196dc1ba3fc26d2a Mon Sep 17 00:00:00 2001 From: David Beckingsale Date: Wed, 16 Oct 2019 07:25:22 -0700 Subject: [PATCH 45/58] Switch to SPDX license --- COPYRIGHT | 15 +++++++ LICENSE | 4 +- benchmarks/chai_arraymanager_benchmarks.cpp | 46 +++---------------- benchmarks/chai_benchmark_utils.hpp | 46 +++---------------- benchmarks/chai_managedarray_benchmarks.cpp | 46 +++---------------- docs/sphinx/conf.py | 49 +++------------------ examples/chai-umpire-allocators.cpp | 46 +++---------------- examples/ex1.cpp | 46 +++---------------- examples/example.cpp | 6 +++ scripts/apply-license-info.sh | 35 +++++++++++++++ scripts/format-source.sh | 6 +++ scripts/license.txt | 4 ++ scripts/make_release_tarball.sh | 48 +++----------------- scripts/travis/build_and_test.sh | 6 +++ scripts/travis/install_llvm.sh | 6 +++ scripts/update-copyright-year.sh | 49 +++------------------ src/chai/ArrayManager.cpp | 46 +++---------------- src/chai/ArrayManager.hpp | 46 +++---------------- src/chai/ChaiMacros.hpp | 46 +++---------------- src/chai/ExecutionSpaces.hpp | 46 +++---------------- src/chai/ManagedArray.hpp | 46 +++---------------- src/chai/PointerRecord.hpp | 46 +++---------------- src/chai/Types.hpp | 46 +++---------------- src/util/forall.hpp | 46 +++---------------- tests/integration/managed_array_tests.cpp | 48 +++----------------- tests/unit/array_manager_unit_tests.cpp | 46 +++---------------- tests/unit/managed_array_unit_tests.cpp | 46 +++---------------- 27 files changed, 178 insertions(+), 788 deletions(-) create mode 100644 COPYRIGHT create mode 100755 scripts/apply-license-info.sh create mode 100644 scripts/license.txt diff --git a/COPYRIGHT b/COPYRIGHT new file mode 100644 index 00000000..3bc5dea5 --- /dev/null +++ b/COPYRIGHT @@ -0,0 +1,15 @@ +Intellectual Property Notice +------------------------------ + +CHAI is licensed under the BSD 3 Clause license (LICENSE or +https://opensource.org/licenses/BSD-3-Clause). + +Copyrights and patents in the CHAI project are retained by contributors. No +copyright assignment is required to contribute to CHAI. + +SPDX usage +------------ + +Individual files contain SPDX tags instead of the full license text. +This enables machine processing of license information based on the SPDX +License Identifiers that are available here: https://spdx.org/licenses/ diff --git a/LICENSE b/LICENSE index 89442d9a..8f8fd45a 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,4 @@ -BSD 3-Clause License - -Copyright (c) 2018, Lawrence Livermore National Security, LLC +Copyright (c) 2016-2019, Lawrence Livermore National Security, LLC. All rights reserved. 
Redistribution and use in source and binary forms, with or without diff --git a/benchmarks/chai_arraymanager_benchmarks.cpp b/benchmarks/chai_arraymanager_benchmarks.cpp index 78430582..c58a4987 100644 --- a/benchmarks/chai_arraymanager_benchmarks.cpp +++ b/benchmarks/chai_arraymanager_benchmarks.cpp @@ -1,45 +1,9 @@ -// --------------------------------------------------------------------- -// Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All -// rights reserved. +////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +// project contributors. See the COPYRIGHT file for details. // -// Produced at the Lawrence Livermore National Laboratory. -// -// This file is part of CHAI. -// -// LLNL-CODE-705877 -// -// For details, see https:://github.com/LLNL/CHAI -// Please also see the NOTICE and LICENSE files. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// - Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the -// distribution. -// -// - Neither the name of the LLNS/LLNL nor the names of its contributors -// may be used to endorse or promote products derived from this -// software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -// OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -// WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. -// --------------------------------------------------------------------- +// SPDX-License-Identifier: BSD-3-Clause +////////////////////////////////////////////////////////////////////////////// #include #include "benchmark/benchmark_api.h" diff --git a/benchmarks/chai_benchmark_utils.hpp b/benchmarks/chai_benchmark_utils.hpp index 659ff197..977f5b65 100644 --- a/benchmarks/chai_benchmark_utils.hpp +++ b/benchmarks/chai_benchmark_utils.hpp @@ -1,45 +1,9 @@ -// --------------------------------------------------------------------- -// Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All -// rights reserved. +////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +// project contributors. See the COPYRIGHT file for details. // -// Produced at the Lawrence Livermore National Laboratory. -// -// This file is part of CHAI. -// -// LLNL-CODE-705877 -// -// For details, see https:://github.com/LLNL/CHAI -// Please also see the NOTICE and LICENSE files. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// - Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the -// distribution. -// -// - Neither the name of the LLNS/LLNL nor the names of its contributors -// may be used to endorse or promote products derived from this -// software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -// OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -// WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. -// --------------------------------------------------------------------- +// SPDX-License-Identifier: BSD-3-Clause +////////////////////////////////////////////////////////////////////////////// #ifndef CHAI_chai_benchmark_utils_HPP #define CHAI_chai_benchmark_utils_HPP diff --git a/benchmarks/chai_managedarray_benchmarks.cpp b/benchmarks/chai_managedarray_benchmarks.cpp index 74409725..4fcb33bf 100644 --- a/benchmarks/chai_managedarray_benchmarks.cpp +++ b/benchmarks/chai_managedarray_benchmarks.cpp @@ -1,45 +1,9 @@ -// --------------------------------------------------------------------- -// Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All -// rights reserved. +////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +// project contributors. See the COPYRIGHT file for details. // -// Produced at the Lawrence Livermore National Laboratory. -// -// This file is part of CHAI. -// -// LLNL-CODE-705877 -// -// For details, see https:://github.com/LLNL/CHAI -// Please also see the NOTICE and LICENSE files. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// - Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the -// distribution. -// -// - Neither the name of the LLNS/LLNL nor the names of its contributors -// may be used to endorse or promote products derived from this -// software without specific prior written permission. 
-// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -// OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -// WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. -// --------------------------------------------------------------------- +// SPDX-License-Identifier: BSD-3-Clause +////////////////////////////////////////////////////////////////////////////// #include #include "benchmark/benchmark_api.h" diff --git a/docs/sphinx/conf.py b/docs/sphinx/conf.py index e7e4a883..bd3f0ee1 100644 --- a/docs/sphinx/conf.py +++ b/docs/sphinx/conf.py @@ -1,48 +1,11 @@ +############################################################################## +# Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +# project contributors. See the COPYRIGHT file for details. +# +# SPDX-License-Identifier: BSD-3-Clause +############################################################################## # -*- coding: utf-8 -*- -####################################################################### -# Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All -# rights reserved. -# -# Produced at the Lawrence Livermore National Laboratory -# -# This file is part of CHAI. -# -# LLNL-CODE-705877 -# -# For details, see https:://github.com/LLNL/CHAI -# Please also see the NOTICE and LICENSE files. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# - Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# - Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the -# distribution. -# -# - Neither the name of the LLNS/LLNL nor the names of its contributors -# may be used to endorse or promote products derived from this -# software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -# AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. 
-####################################################################### - # # CHAI documentation build configuration file, created by # sphinx-quickstart on Thu Mar 30 12:14:09 2017. diff --git a/examples/chai-umpire-allocators.cpp b/examples/chai-umpire-allocators.cpp index 4486a56c..15fa4fa9 100644 --- a/examples/chai-umpire-allocators.cpp +++ b/examples/chai-umpire-allocators.cpp @@ -1,45 +1,9 @@ -// --------------------------------------------------------------------- -// Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All -// rights reserved. +////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +// project contributors. See the COPYRIGHT file for details. // -// Produced at the Lawrence Livermore National Laboratory. -// -// This file is part of CHAI. -// -// LLNL-CODE-705877 -// -// For details, see https:://github.com/LLNL/CHAI -// Please also see the NOTICE and LICENSE files. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// - Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the -// distribution. -// -// - Neither the name of the LLNS/LLNL nor the names of its contributors -// may be used to endorse or promote products derived from this -// software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -// OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -// WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. -// --------------------------------------------------------------------- +// SPDX-License-Identifier: BSD-3-Clause +////////////////////////////////////////////////////////////////////////////// #include "umpire/ResourceManager.hpp" #include "umpire/strategy/DynamicPool.hpp" diff --git a/examples/ex1.cpp b/examples/ex1.cpp index 11479716..fdec4b3b 100644 --- a/examples/ex1.cpp +++ b/examples/ex1.cpp @@ -1,45 +1,9 @@ -// --------------------------------------------------------------------- -// Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All -// rights reserved. +////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +// project contributors. See the COPYRIGHT file for details. // -// Produced at the Lawrence Livermore National Laboratory. -// -// This file is part of CHAI. 
-// -// LLNL-CODE-705877 -// -// For details, see https:://github.com/LLNL/CHAI -// Please also see the NOTICE and LICENSE files. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// - Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the -// distribution. -// -// - Neither the name of the LLNS/LLNL nor the names of its contributors -// may be used to endorse or promote products derived from this -// software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -// OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -// WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. -// --------------------------------------------------------------------- +// SPDX-License-Identifier: BSD-3-Clause +////////////////////////////////////////////////////////////////////////////// #include "chai/ManagedArray.hpp" #include "chai/util/forall.hpp" diff --git a/examples/example.cpp b/examples/example.cpp index a7a3b2d6..e3405160 100644 --- a/examples/example.cpp +++ b/examples/example.cpp @@ -1,3 +1,9 @@ +////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +// project contributors. See the COPYRIGHT file for details. +// +// SPDX-License-Identifier: BSD-3-Clause +////////////////////////////////////////////////////////////////////////////// // --------------------------------------------------------------------- // Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All // rights reserved. diff --git a/scripts/apply-license-info.sh b/scripts/apply-license-info.sh new file mode 100755 index 00000000..99f7aea3 --- /dev/null +++ b/scripts/apply-license-info.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env zsh +############################################################################## +# Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +# project contributors. See the COPYRIGHT file for details. +# +# SPDX-License-Identifier: BSD-3-Clause +############################################################################## + +setopt extended_glob + +RED="\033[1;31m" +GREEN="\033[1;32m" +NOCOLOR="\033[0m" + +LIC_CMD=$(which lic) +if [ ! $LIC_CMD ]; then + echo "${RED} [!] This script requires the lic command." + exit 255 +fi + +echo "Applying licenses to files" + +files_no_license=$(grep -L 'This file is part of Umpire.' 
\ + benchmarks/**/*(^/) \ + cmake/**/*(^/) \ + docs/**/*~*rst(^/)\ + examples/**/*(^/) \ + scripts/**/*(^/) \ + src/**/*~*tpl*(^/) \ + tests/**/*(^/) \ + CMakeLists.txt) + +echo $files_no_license | xargs $LIC_CMD -f scripts/license.txt + +echo "${GREEN} [Ok] License text applied. ${NOCOLOR}" diff --git a/scripts/format-source.sh b/scripts/format-source.sh index f14ab078..0206cfbe 100755 --- a/scripts/format-source.sh +++ b/scripts/format-source.sh @@ -1,3 +1,9 @@ #!/usr/bin/env bash +############################################################################## +# Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +# project contributors. See the COPYRIGHT file for details. +# +# SPDX-License-Identifier: BSD-3-Clause +############################################################################## find . -type f -iname '*.hpp' -o -iname '*.cpp' | grep -v -e blt -e tpl | xargs clang-format -i diff --git a/scripts/license.txt b/scripts/license.txt new file mode 100644 index 00000000..5b1c9027 --- /dev/null +++ b/scripts/license.txt @@ -0,0 +1,4 @@ +Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +project contributors. See the COPYRIGHT file for details. + +SPDX-License-Identifier: BSD-3-Clause diff --git a/scripts/make_release_tarball.sh b/scripts/make_release_tarball.sh index a8deba89..6a062604 100755 --- a/scripts/make_release_tarball.sh +++ b/scripts/make_release_tarball.sh @@ -1,46 +1,10 @@ #!/bin/bash -####################################################################### -# Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All -# rights reserved. -# -# Produced at the Lawrence Livermore National Laboratory. -# -# This file is part of CHAI. -# -# LLNL-CODE-705877 -# -# For details, see https:://github.com/LLNL/CHAI -# Please also see the NOTICE and LICENSE files. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# - Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# - Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the -# distribution. -# -# - Neither the name of the LLNS/LLNL nor the names of its contributors -# may be used to endorse or promote products derived from this -# software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -# AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. 
-####################################################################### +############################################################################## +# Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +# project contributors. See the COPYRIGHT file for details. +# +# SPDX-License-Identifier: BSD-3-Clause +############################################################################## TAR_CMD=gtar VERSION=1.2.0 diff --git a/scripts/travis/build_and_test.sh b/scripts/travis/build_and_test.sh index 1dd79681..44eb570c 100755 --- a/scripts/travis/build_and_test.sh +++ b/scripts/travis/build_and_test.sh @@ -1,4 +1,10 @@ #!/bin/bash +############################################################################## +# Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +# project contributors. See the COPYRIGHT file for details. +# +# SPDX-License-Identifier: BSD-3-Clause +############################################################################## function or_die () { "$@" diff --git a/scripts/travis/install_llvm.sh b/scripts/travis/install_llvm.sh index fb6a87fb..2b6c69fd 100755 --- a/scripts/travis/install_llvm.sh +++ b/scripts/travis/install_llvm.sh @@ -1,3 +1,9 @@ +############################################################################## +# Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +# project contributors. See the COPYRIGHT file for details. +# +# SPDX-License-Identifier: BSD-3-Clause +############################################################################## # /bin/bash export LLVM_PATH=${HOME}/llvm/clang+llvm-${LLVM_VERSION}-x86_64-linux-gnu-ubuntu-14.04 diff --git a/scripts/update-copyright-year.sh b/scripts/update-copyright-year.sh index 4bf2492e..6b411e07 100755 --- a/scripts/update-copyright-year.sh +++ b/scripts/update-copyright-year.sh @@ -1,47 +1,10 @@ #!/usr/bin/env zsh -####################################################################### -# Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All -# rights reserved. -# -# Produced at the Lawrence Livermore National Laboratory. -# -# This file is part of CHAI. -# -# LLNL-CODE-705877 -# -# For details, see https:://github.com/LLNL/CHAI -# Please also see the NOTICE and LICENSE files. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# - Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# - Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the -# distribution. -# -# - Neither the name of the LLNS/LLNL nor the names of its contributors -# may be used to endorse or promote products derived from this -# software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -# AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. -####################################################################### - +############################################################################## +# Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +# project contributors. See the COPYRIGHT file for details. +# +# SPDX-License-Identifier: BSD-3-Clause +############################################################################## # This is used for the ~*tpl* line to ignore files in bundled tpls setopt extended_glob diff --git a/src/chai/ArrayManager.cpp b/src/chai/ArrayManager.cpp index 54a32176..5effcbc9 100644 --- a/src/chai/ArrayManager.cpp +++ b/src/chai/ArrayManager.cpp @@ -1,45 +1,9 @@ -// --------------------------------------------------------------------- -// Copyright (c) 2017, Lawrence Livermore National Security, LLC. All -// rights reserved. +////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +// project contributors. See the COPYRIGHT file for details. // -// Produced at the Lawrence Livermore National Laboratory. -// -// This file is part of CHAI. -// -// LLNL-CODE-705877 -// -// For details, see https:://github.com/LLNL/CHAI -// Please also see the NOTICE and LICENSE files. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// - Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the -// distribution. -// -// - Neither the name of the LLNS/LLNL nor the names of its contributors -// may be used to endorse or promote products derived from this -// software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -// OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -// WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. 
-// --------------------------------------------------------------------- +// SPDX-License-Identifier: BSD-3-Clause +////////////////////////////////////////////////////////////////////////////// #include "chai/ArrayManager.hpp" #include "chai/config.hpp" diff --git a/src/chai/ArrayManager.hpp b/src/chai/ArrayManager.hpp index 82221d24..14e4c420 100644 --- a/src/chai/ArrayManager.hpp +++ b/src/chai/ArrayManager.hpp @@ -1,45 +1,9 @@ -// --------------------------------------------------------------------- -// Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All -// rights reserved. +////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +// project contributors. See the COPYRIGHT file for details. // -// Produced at the Lawrence Livermore National Laboratory. -// -// This file is part of CHAI. -// -// LLNL-CODE-705877 -// -// For details, see https:://github.com/LLNL/CHAI -// Please also see the NOTICE and LICENSE files. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// - Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the -// distribution. -// -// - Neither the name of the LLNS/LLNL nor the names of its contributors -// may be used to endorse or promote products derived from this -// software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -// OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -// WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. -// --------------------------------------------------------------------- +// SPDX-License-Identifier: BSD-3-Clause +////////////////////////////////////////////////////////////////////////////// #ifndef CHAI_ArrayManager_HPP #define CHAI_ArrayManager_HPP diff --git a/src/chai/ChaiMacros.hpp b/src/chai/ChaiMacros.hpp index e05dac6d..14cbaae7 100644 --- a/src/chai/ChaiMacros.hpp +++ b/src/chai/ChaiMacros.hpp @@ -1,45 +1,9 @@ -// --------------------------------------------------------------------- -// Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All -// rights reserved. +////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +// project contributors. See the COPYRIGHT file for details. // -// Produced at the Lawrence Livermore National Laboratory. -// -// This file is part of CHAI. 
-// -// LLNL-CODE-705877 -// -// For details, see https:://github.com/LLNL/CHAI -// Please also see the NOTICE and LICENSE files. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// - Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the -// distribution. -// -// - Neither the name of the LLNS/LLNL nor the names of its contributors -// may be used to endorse or promote products derived from this -// software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -// OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -// WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. -// --------------------------------------------------------------------- +// SPDX-License-Identifier: BSD-3-Clause +////////////////////////////////////////////////////////////////////////////// #ifndef CHAI_ChaiMacros_HPP #define CHAI_ChaiMacros_HPP diff --git a/src/chai/ExecutionSpaces.hpp b/src/chai/ExecutionSpaces.hpp index f75ec0ec..02f1c889 100644 --- a/src/chai/ExecutionSpaces.hpp +++ b/src/chai/ExecutionSpaces.hpp @@ -1,45 +1,9 @@ -// --------------------------------------------------------------------- -// Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All -// rights reserved. +////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +// project contributors. See the COPYRIGHT file for details. // -// Produced at the Lawrence Livermore National Laboratory. -// -// This file is part of CHAI. -// -// LLNL-CODE-705877 -// -// For details, see https:://github.com/LLNL/CHAI -// Please also see the NOTICE and LICENSE files. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// - Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the -// distribution. -// -// - Neither the name of the LLNS/LLNL nor the names of its contributors -// may be used to endorse or promote products derived from this -// software without specific prior written permission. 
-// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -// OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -// WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. -// --------------------------------------------------------------------- +// SPDX-License-Identifier: BSD-3-Clause +////////////////////////////////////////////////////////////////////////////// #ifndef CHAI_ExecutionSpaces_HPP #define CHAI_ExecutionSpaces_HPP diff --git a/src/chai/ManagedArray.hpp b/src/chai/ManagedArray.hpp index c4593549..37d16e91 100644 --- a/src/chai/ManagedArray.hpp +++ b/src/chai/ManagedArray.hpp @@ -1,45 +1,9 @@ -// --------------------------------------------------------------------- -// Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All -// rights reserved. +////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +// project contributors. See the COPYRIGHT file for details. // -// Produced at the Lawrence Livermore National Laboratory. -// -// This file is part of CHAI. -// -// LLNL-CODE-705877 -// -// For details, see https:://github.com/LLNL/CHAI -// Please also see the NOTICE and LICENSE files. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// - Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the -// distribution. -// -// - Neither the name of the LLNS/LLNL nor the names of its contributors -// may be used to endorse or promote products derived from this -// software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -// OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -// WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. 
-// --------------------------------------------------------------------- +// SPDX-License-Identifier: BSD-3-Clause +////////////////////////////////////////////////////////////////////////////// #ifndef CHAI_ManagedArray_HPP #define CHAI_ManagedArray_HPP diff --git a/src/chai/PointerRecord.hpp b/src/chai/PointerRecord.hpp index da4f344e..9250a8e0 100644 --- a/src/chai/PointerRecord.hpp +++ b/src/chai/PointerRecord.hpp @@ -1,45 +1,9 @@ -// --------------------------------------------------------------------- -// Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All -// rights reserved. +////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +// project contributors. See the COPYRIGHT file for details. // -// Produced at the Lawrence Livermore National Laboratory. -// -// This file is part of CHAI. -// -// LLNL-CODE-705877 -// -// For details, see https:://github.com/LLNL/CHAI -// Please also see the NOTICE and LICENSE files. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// - Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the -// distribution. -// -// - Neither the name of the LLNS/LLNL nor the names of its contributors -// may be used to endorse or promote products derived from this -// software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -// OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -// WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. -// --------------------------------------------------------------------- +// SPDX-License-Identifier: BSD-3-Clause +////////////////////////////////////////////////////////////////////////////// #ifndef CHAI_PointerRecord_HPP #define CHAI_PointerRecord_HPP diff --git a/src/chai/Types.hpp b/src/chai/Types.hpp index 0937e473..37a57b86 100644 --- a/src/chai/Types.hpp +++ b/src/chai/Types.hpp @@ -1,45 +1,9 @@ -// --------------------------------------------------------------------- -// Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All -// rights reserved. +////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +// project contributors. See the COPYRIGHT file for details. // -// Produced at the Lawrence Livermore National Laboratory. -// -// This file is part of CHAI. 
-// -// LLNL-CODE-705877 -// -// For details, see https:://github.com/LLNL/CHAI -// Please also see the NOTICE and LICENSE files. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// - Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the -// distribution. -// -// - Neither the name of the LLNS/LLNL nor the names of its contributors -// may be used to endorse or promote products derived from this -// software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -// OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -// WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. -// --------------------------------------------------------------------- +// SPDX-License-Identifier: BSD-3-Clause +////////////////////////////////////////////////////////////////////////////// #ifndef CHAI_Types_HPP #define CHAI_Types_HPP diff --git a/src/util/forall.hpp b/src/util/forall.hpp index 4cb537f0..f1ae1835 100644 --- a/src/util/forall.hpp +++ b/src/util/forall.hpp @@ -1,45 +1,9 @@ -// --------------------------------------------------------------------- -// Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All -// rights reserved. +////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +// project contributors. See the COPYRIGHT file for details. // -// Produced at the Lawrence Livermore National Laboratory. -// -// This file is part of CHAI. -// -// LLNL-CODE-705877 -// -// For details, see https:://github.com/LLNL/CHAI -// Please also see the NOTICE and LICENSE files. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// - Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the -// distribution. -// -// - Neither the name of the LLNS/LLNL nor the names of its contributors -// may be used to endorse or promote products derived from this -// software without specific prior written permission. 
-// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -// OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -// WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. -// --------------------------------------------------------------------- +// SPDX-License-Identifier: BSD-3-Clause +////////////////////////////////////////////////////////////////////////////// #ifndef CHAI_forall_HPP #define CHAI_forall_HPP diff --git a/tests/integration/managed_array_tests.cpp b/tests/integration/managed_array_tests.cpp index 842293e7..506b7c70 100644 --- a/tests/integration/managed_array_tests.cpp +++ b/tests/integration/managed_array_tests.cpp @@ -1,45 +1,9 @@ -// --------------------------------------------------------------------- -// Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All -// rights reserved. +////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +// project contributors. See the COPYRIGHT file for details. // -// Produced at the Lawrence Livermore National Laboratory. -// -// This file is part of CHAI. -// -// LLNL-CODE-705877 -// -// For details, see https:://github.com/LLNL/CHAI -// Please also see the NOTICE and LICENSE files. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// - Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the -// distribution. -// -// - Neither the name of the LLNS/LLNL nor the names of its contributors -// may be used to endorse or promote products derived from this -// software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -// OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -// WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. 
-// --------------------------------------------------------------------- +// SPDX-License-Identifier: BSD-3-Clause +////////////////////////////////////////////////////////////////////////////// #include "gtest/gtest.h" #define GPU_TEST(X, Y) \ @@ -1394,4 +1358,4 @@ GPU_TEST(ManagedArray, CopyZero) array.free(); } -#endif \ No newline at end of file +#endif diff --git a/tests/unit/array_manager_unit_tests.cpp b/tests/unit/array_manager_unit_tests.cpp index 66edf586..b50d0cce 100644 --- a/tests/unit/array_manager_unit_tests.cpp +++ b/tests/unit/array_manager_unit_tests.cpp @@ -1,45 +1,9 @@ -// --------------------------------------------------------------------- -// Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All -// rights reserved. +////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +// project contributors. See the COPYRIGHT file for details. // -// Produced at the Lawrence Livermore National Laboratory. -// -// This file is part of CHAI. -// -// LLNL-CODE-705877 -// -// For details, see https:://github.com/LLNL/CHAI -// Please also see the NOTICE and LICENSE files. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// - Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the -// distribution. -// -// - Neither the name of the LLNS/LLNL nor the names of its contributors -// may be used to endorse or promote products derived from this -// software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -// OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -// WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. -// --------------------------------------------------------------------- +// SPDX-License-Identifier: BSD-3-Clause +////////////////////////////////////////////////////////////////////////////// #include "gtest/gtest.h" #include "chai/ArrayManager.hpp" diff --git a/tests/unit/managed_array_unit_tests.cpp b/tests/unit/managed_array_unit_tests.cpp index f893027d..50fb1696 100644 --- a/tests/unit/managed_array_unit_tests.cpp +++ b/tests/unit/managed_array_unit_tests.cpp @@ -1,45 +1,9 @@ -// --------------------------------------------------------------------- -// Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All -// rights reserved. 
+////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +// project contributors. See the COPYRIGHT file for details. // -// Produced at the Lawrence Livermore National Laboratory. -// -// This file is part of CHAI. -// -// LLNL-CODE-705877 -// -// For details, see https:://github.com/LLNL/CHAI -// Please also see the NOTICE and LICENSE files. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// - Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the -// distribution. -// -// - Neither the name of the LLNS/LLNL nor the names of its contributors -// may be used to endorse or promote products derived from this -// software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -// OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -// WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. 
-// --------------------------------------------------------------------- +// SPDX-License-Identifier: BSD-3-Clause +////////////////////////////////////////////////////////////////////////////// #include "gtest/gtest.h" #define GPU_TEST(X, Y) \ From 2e974f9d9eb1fbd85788a46d0c0874cc1580adde Mon Sep 17 00:00:00 2001 From: David Beckingsale Date: Wed, 16 Oct 2019 07:31:56 -0700 Subject: [PATCH 46/58] Finalize license style switch --- CMakeLists.txt | 49 +++------------------ benchmarks/CMakeLists.txt | 49 +++------------------ cmake/ChaiBasics.cmake | 49 +++------------------ cmake/thirdparty/SetupChaiThirdparty.cmake | 49 +++------------------ docs/CMakeLists.txt | 49 +++------------------ docs/doxygen/CMakeLists.txt | 49 +++------------------ docs/doxygen/Doxyfile.in | 49 +++------------------ docs/sphinx/CMakeLists.txt | 6 +++ docs/sphinx/conf.py.in | 50 +++------------------- examples/CMakeLists.txt | 49 +++------------------ src/CMakeLists.txt | 49 +++------------------ src/chai/ArrayManager.inl | 46 +++----------------- src/chai/CMakeLists.txt | 49 +++------------------ src/chai/ManagedArray.inl | 48 +++------------------ src/chai/ManagedArray_thin.inl | 48 +++------------------ src/chai/chai-config.cmake.in | 48 +++------------------ src/chai/config.hpp.in | 48 +++------------------ tests/CMakeLists.txt | 49 +++------------------ tests/integration/CMakeLists.txt | 6 +++ tests/unit/CMakeLists.txt | 49 +++------------------ 20 files changed, 119 insertions(+), 769 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fefd839f..a99a649c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,46 +1,9 @@ -####################################################################### -# Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All -# rights reserved. -# -# Produced at the Lawrence Livermore National Laboratory. -# -# This file is part of CHAI. -# -# LLNL-CODE-705877 -# -# For details, see https:://github.com/LLNL/CHAI -# Please also see the NOTICE and LICENSE files. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# - Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# - Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the -# distribution. -# -# - Neither the name of the LLNS/LLNL nor the names of its contributors -# may be used to endorse or promote products derived from this -# software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -# AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. 
-####################################################################### - +############################################################################## +# Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +# project contributors. See the COPYRIGHT file for details. +# +# SPDX-License-Identifier: BSD-3-Clause +############################################################################## cmake_policy(SET CMP0057 NEW) project(Chai LANGUAGES CXX) diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 6795f860..77dbe5fc 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -1,46 +1,9 @@ -####################################################################### -# Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All -# rights reserved. -# -# Produced at the Lawrence Livermore National Laboratory. -# -# This file is part of CHAI. -# -# LLNL-CODE-705877 -# -# For details, see https:://github.com/LLNL/CHAI -# Please also see the NOTICE and LICENSE files. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# - Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# - Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the -# distribution. -# -# - Neither the name of the LLNS/LLNL nor the names of its contributors -# may be used to endorse or promote products derived from this -# software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -# AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. -####################################################################### - +############################################################################## +# Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +# project contributors. See the COPYRIGHT file for details. +# +# SPDX-License-Identifier: BSD-3-Clause +############################################################################## set (chai_benchmark_depends chai gbenchmark) diff --git a/cmake/ChaiBasics.cmake b/cmake/ChaiBasics.cmake index 7cae65a6..93db482b 100644 --- a/cmake/ChaiBasics.cmake +++ b/cmake/ChaiBasics.cmake @@ -1,46 +1,9 @@ -###################################################################### -# Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All -# rights reserved. -# -# Produced at the Lawrence Livermore National Laboratory -# -# This file is part of CHAI. 
-# -# LLNL-CODE-705877 -# -# For details, see https:://github.com/LLNL/CHAI -# Please also see the NOTICE and LICENSE files. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# - Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# - Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the -# distribution. -# -# - Neither the name of the LLNS/LLNL nor the names of its contributors -# may be used to endorse or promote products derived from this -# software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -# AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. -####################################################################### - +############################################################################## +# Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +# project contributors. See the COPYRIGHT file for details. +# +# SPDX-License-Identifier: BSD-3-Clause +############################################################################## set (CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda") if (ENABLE_HIP) diff --git a/cmake/thirdparty/SetupChaiThirdparty.cmake b/cmake/thirdparty/SetupChaiThirdparty.cmake index ee8ffbb4..966221b9 100644 --- a/cmake/thirdparty/SetupChaiThirdparty.cmake +++ b/cmake/thirdparty/SetupChaiThirdparty.cmake @@ -1,46 +1,9 @@ -####################################################################### -# Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All -# rights reserved. -# -# Produced at the Lawrence Livermore National Laboratory -# -# This file is part of CHAI. -# -# LLNL-CODE-705877 -# -# For details, see https:://github.com/LLNL/CHAI -# Please also see the NOTICE and LICENSE files. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# - Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# - Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the -# distribution. -# -# - Neither the name of the LLNS/LLNL nor the names of its contributors -# may be used to endorse or promote products derived from this -# software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -# AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. -####################################################################### - +############################################################################## +# Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +# project contributors. See the COPYRIGHT file for details. +# +# SPDX-License-Identifier: BSD-3-Clause +############################################################################## set(ENABLE_FORTRAN Off CACHE BOOL "Enable Fortran in Umpire") if (NOT TARGET umpire) diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt index de9200c0..c50020b5 100644 --- a/docs/CMakeLists.txt +++ b/docs/CMakeLists.txt @@ -1,46 +1,9 @@ -####################################################################### -# Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All -# rights reserved. -# -# Produced at the Lawrence Livermore National Laboratory. -# -# This file is part of CHAI. -# -# LLNL-CODE-705877 -# -# For details, see https:://github.com/LLNL/CHAI -# Please also see the NOTICE and LICENSE files. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# - Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# - Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the -# distribution. -# -# - Neither the name of the LLNS/LLNL nor the names of its contributors -# may be used to endorse or promote products derived from this -# software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -# AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. 
-####################################################################### - +############################################################################## +# Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +# project contributors. See the COPYRIGHT file for details. +# +# SPDX-License-Identifier: BSD-3-Clause +############################################################################## if (DOXYGEN_FOUND) add_subdirectory(doxygen) endif () diff --git a/docs/doxygen/CMakeLists.txt b/docs/doxygen/CMakeLists.txt index 1b2725a6..ac0fc239 100644 --- a/docs/doxygen/CMakeLists.txt +++ b/docs/doxygen/CMakeLists.txt @@ -1,44 +1,7 @@ -####################################################################### -# Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All -# rights reserved. -# -# Produced at the Lawrence Livermore National Laboratory -# -# This file is part of CHAI. -# -# LLNL-CODE-705877 -# -# For details, see https:://github.com/LLNL/CHAI -# Please also see the NOTICE and LICENSE files. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# - Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# - Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the -# distribution. -# -# - Neither the name of the LLNS/LLNL nor the names of its contributors -# may be used to endorse or promote products derived from this -# software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -# AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. -####################################################################### - +############################################################################## +# Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +# project contributors. See the COPYRIGHT file for details. +# +# SPDX-License-Identifier: BSD-3-Clause +############################################################################## blt_add_doxygen_target(chai_doxgen) diff --git a/docs/doxygen/Doxyfile.in b/docs/doxygen/Doxyfile.in index 5e13e82f..1bb28a90 100644 --- a/docs/doxygen/Doxyfile.in +++ b/docs/doxygen/Doxyfile.in @@ -1,46 +1,9 @@ -####################################################################### -# Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All -# rights reserved. -# -# Produced at the Lawrence Livermore National Laboratory -# -# This file is part of CHAI. 
-# -# LLNL-CODE-705877 -# -# For details, see https:://github.com/LLNL/CHAI -# Please also see the NOTICE and LICENSE files. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# - Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# - Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the -# distribution. -# -# - Neither the name of the LLNS/LLNL nor the names of its contributors -# may be used to endorse or promote products derived from this -# software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -# AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. -####################################################################### - +############################################################################## +# Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +# project contributors. See the COPYRIGHT file for details. +# +# SPDX-License-Identifier: BSD-3-Clause +############################################################################## #--------------------------------------------------------------------------- # Project related configuration options #--------------------------------------------------------------------------- diff --git a/docs/sphinx/CMakeLists.txt b/docs/sphinx/CMakeLists.txt index 1b14348d..d9f4aed8 100644 --- a/docs/sphinx/CMakeLists.txt +++ b/docs/sphinx/CMakeLists.txt @@ -1,3 +1,9 @@ +############################################################################## +# Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +# project contributors. See the COPYRIGHT file for details. +# +# SPDX-License-Identifier: BSD-3-Clause +############################################################################## ####################################################################### # Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All # rights reserved. diff --git a/docs/sphinx/conf.py.in b/docs/sphinx/conf.py.in index 0fd73ff2..1e870de5 100644 --- a/docs/sphinx/conf.py.in +++ b/docs/sphinx/conf.py.in @@ -1,48 +1,10 @@ +############################################################################## +# Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +# project contributors. See the COPYRIGHT file for details. 
+# +# SPDX-License-Identifier: BSD-3-Clause +############################################################################## # -*- coding: utf-8 -*- - -####################################################################### -# Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All -# rights reserved. -# -# Produced at the Lawrence Livermore National Laboratory -# -# This file is part of CHAI. -# -# LLNL-CODE-705877 -# -# For details, see https:://github.com/LLNL/CHAI -# Please also see the NOTICE and LICENSE files. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# - Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# - Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the -# distribution. -# -# - Neither the name of the LLNS/LLNL nor the names of its contributors -# may be used to endorse or promote products derived from this -# software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -# AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. -####################################################################### - # # CHAI documentation build configuration file, created by # sphinx-quickstart on Thu Mar 30 12:14:09 2017. diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 5abdc2e3..3902adae 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -1,46 +1,9 @@ -####################################################################### -# Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All -# rights reserved. -# -# Produced at the Lawrence Livermore National Laboratory. -# -# This file is part of CHAI. -# -# LLNL-CODE-705877 -# -# For details, see https:://github.com/LLNL/CHAI -# Please also see the NOTICE and LICENSE files. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# - Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# - Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the -# distribution. -# -# - Neither the name of the LLNS/LLNL nor the names of its contributors -# may be used to endorse or promote products derived from this -# software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -# AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. -####################################################################### - +############################################################################## +# Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +# project contributors. See the COPYRIGHT file for details. +# +# SPDX-License-Identifier: BSD-3-Clause +############################################################################## set (chai_umpire_example_depends chai) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b38cf609..2b18fdec 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,44 +1,7 @@ -####################################################################### -# Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All -# rights reserved. -# -# Produced at the Lawrence Livermore National Laboratory. -# -# This file is part of CHAI. -# -# LLNL-CODE-705877 -# -# For details, see https:://github.com/LLNL/CHAI -# Please also see the NOTICE and LICENSE files. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# - Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# - Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the -# distribution. -# -# - Neither the name of the LLNS/LLNL nor the names of its contributors -# may be used to endorse or promote products derived from this -# software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -# AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. -####################################################################### - +############################################################################## +# Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +# project contributors. 
See the COPYRIGHT file for details. +# +# SPDX-License-Identifier: BSD-3-Clause +############################################################################## add_subdirectory(chai) diff --git a/src/chai/ArrayManager.inl b/src/chai/ArrayManager.inl index ff6c8cf9..c4505329 100644 --- a/src/chai/ArrayManager.inl +++ b/src/chai/ArrayManager.inl @@ -1,45 +1,9 @@ -// --------------------------------------------------------------------- -// Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All -// rights reserved. +////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +// project contributors. See the COPYRIGHT file for details. // -// Produced at the Lawrence Livermore National Laboratory. -// -// This file is part of CHAI. -// -// LLNL-CODE-705877 -// -// For details, see https:://github.com/LLNL/CHAI -// Please also see the NOTICE and LICENSE files. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// - Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the -// distribution. -// -// - Neither the name of the LLNS/LLNL nor the names of its contributors -// may be used to endorse or promote products derived from this -// software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -// OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -// WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. -// --------------------------------------------------------------------- +// SPDX-License-Identifier: BSD-3-Clause +////////////////////////////////////////////////////////////////////////////// #ifndef CHAI_ArrayManager_INL #define CHAI_ArrayManager_INL diff --git a/src/chai/CMakeLists.txt b/src/chai/CMakeLists.txt index 7821fa3f..365da02e 100644 --- a/src/chai/CMakeLists.txt +++ b/src/chai/CMakeLists.txt @@ -1,46 +1,9 @@ -####################################################################### -# Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All -# rights reserved. -# -# Produced at the Lawrence Livermore National Laboratory. -# -# This file is part of CHAI. -# -# LLNL-CODE-705877 -# -# For details, see https:://github.com/LLNL/CHAI -# Please also see the NOTICE and LICENSE files. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# - Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# - Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the -# distribution. -# -# - Neither the name of the LLNS/LLNL nor the names of its contributors -# may be used to endorse or promote products derived from this -# software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -# AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. -####################################################################### - +############################################################################## +# Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +# project contributors. See the COPYRIGHT file for details. +# +# SPDX-License-Identifier: BSD-3-Clause +############################################################################## set(CHAI_ENABLE_PICK ${ENABLE_PICK}) set(CHAI_ENABLE_CUDA ${ENABLE_CUDA}) set(CHAI_ENABLE_HIP ${ENABLE_HIP}) diff --git a/src/chai/ManagedArray.inl b/src/chai/ManagedArray.inl index 18f4db7a..deb55418 100644 --- a/src/chai/ManagedArray.inl +++ b/src/chai/ManagedArray.inl @@ -1,45 +1,9 @@ -// --------------------------------------------------------------------- -// Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All -// rights reserved. -// -// Produced at the Lawrence Livermore National Laboratory. -// -// This file is part of CHAI. -// -// LLNL-CODE-705877 -// -// For details, see https:://github.com/LLNL/CHAI -// Please also see the NOTICE and LICENSE files. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// - Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the -// distribution. -// -// - Neither the name of the LLNS/LLNL nor the names of its contributors -// may be used to endorse or promote products derived from this -// software without specific prior written permission. 
-// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -// OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -// WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. -// --------------------------------------------------------------------- +////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +// project contributors. See the COPYRIGHT file for details. +// +// SPDX-License-Identifier: BSD-3-Clause +////////////////////////////////////////////////////////////////////////////// #ifndef CHAI_ManagedArray_INL #define CHAI_ManagedArray_INL diff --git a/src/chai/ManagedArray_thin.inl b/src/chai/ManagedArray_thin.inl index 528751ee..3c2e4232 100644 --- a/src/chai/ManagedArray_thin.inl +++ b/src/chai/ManagedArray_thin.inl @@ -1,45 +1,9 @@ -// --------------------------------------------------------------------- -// Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All -// rights reserved. -// -// Produced at the Lawrence Livermore National Laboratory. -// -// This file is part of CHAI. -// -// LLNL-CODE-705877 -// -// For details, see https:://github.com/LLNL/CHAI -// Please also see the NOTICE and LICENSE files. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// - Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the -// distribution. -// -// - Neither the name of the LLNS/LLNL nor the names of its contributors -// may be used to endorse or promote products derived from this -// software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -// OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -// WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. 
-// --------------------------------------------------------------------- +////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +// project contributors. See the COPYRIGHT file for details. +// +// SPDX-License-Identifier: BSD-3-Clause +////////////////////////////////////////////////////////////////////////////// #ifndef CHAI_ManagedArray_thin_INL #define CHAI_ManagedArray_thin_INL diff --git a/src/chai/chai-config.cmake.in b/src/chai/chai-config.cmake.in index 9413ddb7..1cc6198f 100644 --- a/src/chai/chai-config.cmake.in +++ b/src/chai/chai-config.cmake.in @@ -1,45 +1,9 @@ -####################################################################### -# Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All -# rights reserved. -# -# Produced at the Lawrence Livermore National Laboratory. -# -# This file is part of CHAI. -# -# LLNL-CODE-705877 -# -# For details, see https:://github.com/LLNL/CHAI -# Please also see the NOTICE and LICENSE files. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# - Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# - Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the -# distribution. -# -# - Neither the name of the LLNS/LLNL nor the names of its contributors -# may be used to endorse or promote products derived from this -# software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -# AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. -####################################################################### +############################################################################## +# Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +# project contributors. See the COPYRIGHT file for details. +# +# SPDX-License-Identifier: BSD-3-Clause +############################################################################## set (CHAI_INSTALL_PREFIX @CMAKE_INSTALL_PREFIX@) set (CHAI_INCLUDE_DIRS @CMAKE_INSTALL_PREFIX@/include) diff --git a/src/chai/config.hpp.in b/src/chai/config.hpp.in index a7a443fe..4c35a724 100644 --- a/src/chai/config.hpp.in +++ b/src/chai/config.hpp.in @@ -1,45 +1,9 @@ -// --------------------------------------------------------------------- -// Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All -// rights reserved. -// -// Produced at the Lawrence Livermore National Laboratory. -// -// This file is part of CHAI. 
-// -// LLNL-CODE-705877 -// -// For details, see https:://github.com/LLNL/CHAI -// Please also see the NOTICE and LICENSE files. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions -// are met: -// -// - Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// - Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the -// distribution. -// -// - Neither the name of the LLNS/LLNL nor the names of its contributors -// may be used to endorse or promote products derived from this -// software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -// OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -// WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. -// --------------------------------------------------------------------- +////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +// project contributors. See the COPYRIGHT file for details. +// +// SPDX-License-Identifier: BSD-3-Clause +////////////////////////////////////////////////////////////////////////////// #ifndef CHAI_config_HPP #define CHAI_config_HPP diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 1e638b06..cc1c8705 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,46 +1,9 @@ -####################################################################### -# Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All -# rights reserved. -# -# Produced at the Lawrence Livermore National Laboratory. -# -# This file is part of CHAI. -# -# LLNL-CODE-705877 -# -# For details, see https:://github.com/LLNL/CHAI -# Please also see the NOTICE and LICENSE files. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# - Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# - Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the -# distribution. -# -# - Neither the name of the LLNS/LLNL nor the names of its contributors -# may be used to endorse or promote products derived from this -# software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -# AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. -####################################################################### - +############################################################################## +# Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +# project contributors. See the COPYRIGHT file for details. +# +# SPDX-License-Identifier: BSD-3-Clause +############################################################################## add_subdirectory(unit) add_subdirectory(integration) diff --git a/tests/integration/CMakeLists.txt b/tests/integration/CMakeLists.txt index 22cbdd04..bd12651b 100644 --- a/tests/integration/CMakeLists.txt +++ b/tests/integration/CMakeLists.txt @@ -1,3 +1,9 @@ +############################################################################## +# Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +# project contributors. See the COPYRIGHT file for details. +# +# SPDX-License-Identifier: BSD-3-Clause +############################################################################## set (managed_array_test_depends chai umpire gtest) diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index fea76c55..16594bb4 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -1,46 +1,9 @@ -####################################################################### -# Copyright (c) 2016-2018, Lawrence Livermore National Security, LLC. All -# rights reserved. -# -# Produced at the Lawrence Livermore National Laboratory. -# -# This file is part of CHAI. -# -# LLNL-CODE-705877 -# -# For details, see https:://github.com/LLNL/CHAI -# Please also see the NOTICE and LICENSE files. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# - Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# - Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the -# distribution. -# -# - Neither the name of the LLNS/LLNL nor the names of its contributors -# may be used to endorse or promote products derived from this -# software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, -# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS -# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED -# AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY -# WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -# POSSIBILITY OF SUCH DAMAGE. -####################################################################### - +############################################################################## +# Copyright (c) 2016-19, Lawrence Livermore National Security, LLC and CHAI +# project contributors. See the COPYRIGHT file for details. +# +# SPDX-License-Identifier: BSD-3-Clause +############################################################################## set (managed_array_test_depends chai umpire gtest) From 95fe093888a66353984efca82efd9f7d46426108 Mon Sep 17 00:00:00 2001 From: David Beckingsale Date: Wed, 23 Oct 2019 19:15:11 -0700 Subject: [PATCH 47/58] RAJA no longer requires explicitly finding camp dependency --- cmake/thirdparty/SetupChaiThirdparty.cmake | 1 - 1 file changed, 1 deletion(-) diff --git a/cmake/thirdparty/SetupChaiThirdparty.cmake b/cmake/thirdparty/SetupChaiThirdparty.cmake index a2cd708e..68526bf5 100644 --- a/cmake/thirdparty/SetupChaiThirdparty.cmake +++ b/cmake/thirdparty/SetupChaiThirdparty.cmake @@ -25,7 +25,6 @@ blt_register_library( LIBRARIES umpire) if (ENABLE_RAJA_PLUGIN) - find_package(camp REQUIRED) find_package(RAJA REQUIRED) blt_register_library( From ca638167b953831966f49832bbcf585b82174fcf Mon Sep 17 00:00:00 2001 From: David Beckingsale Date: Thu, 24 Oct 2019 12:01:39 -0700 Subject: [PATCH 48/58] Disable RAJA plugin by default --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 06a19dfe..c519e53a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,7 +16,7 @@ option(ENABLE_IMPLICIT_CONVERSIONS "Enable implicit conversions to-from raw poin option(DISABLE_RM "Make ManagedArray a thin wrapper" Off) mark_as_advanced(DISABLE_RM) option(ENABLE_UM "Use CUDA unified (managed) memory" Off) -option(ENABLE_RAJA_PLUGIN "Build plugin to set RAJA execution spaces" On) +option(ENABLE_RAJA_PLUGIN "Build plugin to set RAJA execution spaces" Off) set(ENABLE_TESTS On CACHE BOOL "") set(ENABLE_EXAMPLES On CACHE BOOL "") From d4dcc1a90813ae69f9a8c76c831955be4fed8804 Mon Sep 17 00:00:00 2001 From: David Beckingsale Date: Thu, 7 Nov 2019 09:35:50 -0800 Subject: [PATCH 49/58] Updates to bring in RAJA as a submodule --- .gitmodules | 3 +++ CMakeLists.txt | 2 +- blt | 2 +- cmake/thirdparty/SetupChaiThirdparty.cmake | 29 +++++++++++----------- src/chai/ArrayManager.hpp | 3 +++ src/chai/CMakeLists.txt | 4 +-- src/tpl/raja | 1 + tests/integration/CMakeLists.txt | 2 +- 8 files changed, 26 insertions(+), 20 deletions(-) create mode 160000 src/tpl/raja diff --git a/.gitmodules b/.gitmodules index 8564a4ae..d77cfd4e 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,6 @@ [submodule "src/tpl/umpire"] path = src/tpl/umpire url = https://github.com/LLNL/Umpire.git +[submodule "src/tpl/raja"] + path = src/tpl/raja + url = https://github.com/LLNL/RAJA.git diff --git a/CMakeLists.txt b/CMakeLists.txt index c519e53a..fa0cb77b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,7 +20,7 @@ option(ENABLE_RAJA_PLUGIN "Build 
plugin to set RAJA execution spaces" Off) set(ENABLE_TESTS On CACHE BOOL "") set(ENABLE_EXAMPLES On CACHE BOOL "") -set(ENABLE_DOCUMENTATION On CACHE BOOL "") +set(ENABLE_DOCS Off CACHE BOOL "") # options for Umpire as TPL set(ENABLE_GMOCK On CACHE BOOL "") diff --git a/blt b/blt index 47089360..30ccea5a 160000 --- a/blt +++ b/blt @@ -1 +1 @@ -Subproject commit 4708936054366585478d9c5430449358a0a3eb86 +Subproject commit 30ccea5ad9853bd6397d8c67deed88b55916d2be diff --git a/cmake/thirdparty/SetupChaiThirdparty.cmake b/cmake/thirdparty/SetupChaiThirdparty.cmake index 68526bf5..c3f44c3d 100644 --- a/cmake/thirdparty/SetupChaiThirdparty.cmake +++ b/cmake/thirdparty/SetupChaiThirdparty.cmake @@ -4,8 +4,6 @@ # # SPDX-License-Identifier: BSD-3-Clause ############################################################################## -set(ENABLE_FORTRAN Off CACHE BOOL "Enable Fortran in Umpire") - if (NOT TARGET umpire) if (DEFINED umpire_DIR) find_package(umpire REQUIRED) @@ -15,22 +13,23 @@ if (NOT TARGET umpire) INCLUDES ${UMPIRE_INCLUDE_DIRS} LIBRARIES umpire) else () + set(OLD_ENABLE_FORTRAN ${ENABLE_FORTRAN}) + set(ENABLE_FORTRAN Off CACHE BOOL "Enable Fortran in Umpire") add_subdirectory(${PROJECT_SOURCE_DIR}/src/tpl/umpire) + set(ENABLE_FORTRAN ${OLD_ENABLE_FORTRAN}) endif() endif() -blt_register_library( - NAME umpire - INCLUDES ${UMPIRE_INCLUDE_DIRS} - LIBRARIES umpire) - if (ENABLE_RAJA_PLUGIN) - find_package(RAJA REQUIRED) - - blt_register_library( - NAME raja - INCLUDES ${RAJA_INCLUDE_DIR} - LIBRARIES RAJA) - - message(STATUS "RAJA: ${RAJA_INCLUDE_DIR}") + if (NOT TARGET RAJA) + if (DEFINED raja_DIR) + message(STATUS "CHAI: using external RAJA via find_package") + find_package(RAJA REQUIRED) + else() + message(STATUS "CHAI: using builtin RAJA submodule") + add_subdirectory(${PROJECT_SOURCE_DIR}/src/tpl/raja) + endif() + else() + message(STATUS "CHAI: using existing RAJA target") + endif() endif () diff --git a/src/chai/ArrayManager.hpp b/src/chai/ArrayManager.hpp index 10702429..beb38205 100644 --- a/src/chai/ArrayManager.hpp +++ b/src/chai/ArrayManager.hpp @@ -7,12 +7,15 @@ #ifndef CHAI_ArrayManager_HPP #define CHAI_ArrayManager_HPP +#include "chai/config.hpp" #include "chai/ChaiMacros.hpp" #include "chai/ExecutionSpaces.hpp" #include "chai/PointerRecord.hpp" #include "chai/Types.hpp" +#if defined(CHAI_ENABLE_RAJA_PLUGIN) #include "chai/pluginLinker.hpp" +#endif #include diff --git a/src/chai/CMakeLists.txt b/src/chai/CMakeLists.txt index bf5caf4c..8a4966d7 100644 --- a/src/chai/CMakeLists.txt +++ b/src/chai/CMakeLists.txt @@ -20,7 +20,6 @@ set (chai_headers ArrayManager.hpp ArrayManager.inl ChaiMacros.hpp - pluginLinker.hpp ExecutionSpaces.hpp ManagedArray.hpp ManagedArray.inl @@ -53,6 +52,7 @@ endif () if (ENABLE_RAJA_PLUGIN) set (chai_headers ${chai_headers} + pluginLinker.hpp ManagedArrayView.hpp RajaExecutionSpacePlugin.hpp) @@ -62,7 +62,7 @@ if (ENABLE_RAJA_PLUGIN) set (chai_depends ${chai_depends} - raja) + RAJA) endif () blt_add_library( diff --git a/src/tpl/raja b/src/tpl/raja new file mode 160000 index 00000000..53cb89cf --- /dev/null +++ b/src/tpl/raja @@ -0,0 +1 @@ +Subproject commit 53cb89cf788d28bc4ed2b4e6f75483fdd26024aa diff --git a/tests/integration/CMakeLists.txt b/tests/integration/CMakeLists.txt index fc9da2df..0f1bc586 100644 --- a/tests/integration/CMakeLists.txt +++ b/tests/integration/CMakeLists.txt @@ -41,7 +41,7 @@ blt_add_test( if (ENABLE_RAJA_PLUGIN) set(raja_test_depends ${managed_array_test_depends} - raja) + RAJA) blt_add_executable( NAME raja-chai-tests 
From 4de461caebc0db66411a2768f48a7372d42c8937 Mon Sep 17 00:00:00 2001 From: David Beckingsale Date: Thu, 7 Nov 2019 09:49:27 -0800 Subject: [PATCH 50/58] Tweak for external RAJA --- cmake/thirdparty/SetupChaiThirdparty.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/thirdparty/SetupChaiThirdparty.cmake b/cmake/thirdparty/SetupChaiThirdparty.cmake index c3f44c3d..21a63f9a 100644 --- a/cmake/thirdparty/SetupChaiThirdparty.cmake +++ b/cmake/thirdparty/SetupChaiThirdparty.cmake @@ -22,7 +22,7 @@ endif() if (ENABLE_RAJA_PLUGIN) if (NOT TARGET RAJA) - if (DEFINED raja_DIR) + if (DEFINED RAJA_DIR) message(STATUS "CHAI: using external RAJA via find_package") find_package(RAJA REQUIRED) else() From 66baa9f5c335a0ada819208161840e173002e616 Mon Sep 17 00:00:00 2001 From: David Beckingsale Date: Wed, 13 Nov 2019 15:29:19 -0800 Subject: [PATCH 51/58] Bump version number --- CMakeLists.txt | 2 +- README.md | 2 +- docs/sphinx/conf.py | 4 ++-- docs/sphinx/conf.py.in | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fa0cb77b..7e01caa4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ ############################################################################## cmake_policy(SET CMP0057 NEW) -project(Chai LANGUAGES CXX) +project(Chai LANGUAGES CXX VERSION 2.0.0) set(ENABLE_CUDA Off CACHE BOOL "Enable CUDA") set(ENABLE_HIP Off CACHE BOOL "Enable HIP") diff --git a/README.md b/README.md index b45a98c6..93a62f23 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# CHAI v1.2 +# CHAI v2.0 [![Azure Build Status](https://dev.azure.com/davidbeckingsale/CHAI/_apis/build/status/LLNL.CHAI?branchName=develop)](https://dev.azure.com/davidbeckingsale/CHAI/_build/latest?definitionId=2&branchName=develop) [![Build Status](https://travis-ci.org/LLNL/CHAI.svg?branch=develop)](https://travis-ci.org/LLNL/CHAI) diff --git a/docs/sphinx/conf.py b/docs/sphinx/conf.py index bd3f0ee1..cdf48d8b 100644 --- a/docs/sphinx/conf.py +++ b/docs/sphinx/conf.py @@ -61,9 +61,9 @@ # built documents. # # The short X.Y version. -version = u'1.2' +version = u'2.0' # The full version, including alpha/beta/rc tags. -release = u'1.2.0' +release = u'2.0.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/sphinx/conf.py.in b/docs/sphinx/conf.py.in index 1e870de5..8e1585b0 100644 --- a/docs/sphinx/conf.py.in +++ b/docs/sphinx/conf.py.in @@ -60,9 +60,9 @@ author = u'' # built documents. # # The short X.Y version. -version = u'1.2' +version = u'2.0' # The full version, including alpha/beta/rc tags. -release = u'1.2.0' +release = u'2.0.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
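Taken together, patches 47-50 above change how the RAJA plugin is wired up: ENABLE_RAJA_PLUGIN is now Off by default, an external RAJA is picked up through RAJA_DIR via find_package, and otherwise the bundled src/tpl/raja submodule is built. A minimal host-config sketch for a build that opts in is shown below; it is illustrative only, and the install path is hypothetical, not part of these patches.

# Hypothetical CMake cache entries for configuring CHAI with the RAJA plugin.
# ENABLE_RAJA_PLUGIN and RAJA_DIR are the variables consulted by
# cmake/thirdparty/SetupChaiThirdparty.cmake in the patches above.
set(ENABLE_RAJA_PLUGIN On CACHE BOOL "Build plugin to set RAJA execution spaces")
# Point at an external RAJA install (path is made up). Omit this entry to
# build the src/tpl/raja submodule instead.
set(RAJA_DIR "/path/to/raja/install/lib/cmake/raja" CACHE PATH "External RAJA")

When RAJA_DIR is not defined, SetupChaiThirdparty.cmake falls back to add_subdirectory on the bundled submodule, so the submodules must be checked out (git submodule update --init --recursive) before configuring.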
From 9b0ee0ecc571b2ae93b57d2a3e4493e0d33343b7 Mon Sep 17 00:00:00 2001 From: David Beckingsale Date: Wed, 13 Nov 2019 15:38:54 -0800 Subject: [PATCH 52/58] Set CMP0048 to NEW to enable VERSION in project command (cmake) --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7e01caa4..a9e5a6c4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,6 +5,7 @@ # SPDX-License-Identifier: BSD-3-Clause ############################################################################## cmake_policy(SET CMP0057 NEW) +cmake_policy(SET CMP0048 NEW) project(Chai LANGUAGES CXX VERSION 2.0.0) From 56f58fe9138372f54175a00aa852751d8c5196b8 Mon Sep 17 00:00:00 2001 From: Alan Dayton Date: Thu, 5 Dec 2019 12:45:28 -0800 Subject: [PATCH 53/58] More stable implementation --- CMakeLists.txt | 1 + src/chai/managed_ptr.hpp | 348 ++++++++++++++++++------ tests/integration/managed_ptr_tests.cpp | 176 +++++++++--- tests/unit/managed_ptr_unit_tests.cpp | 258 +++++++++++++++--- 4 files changed, 618 insertions(+), 165 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fefd839f..b7f27440 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -53,6 +53,7 @@ option(ENABLE_IMPLICIT_CONVERSIONS "Enable implicit conversions to-from raw poin option(DISABLE_RM "Make ManagedArray a thin wrapper" Off) mark_as_advanced(DISABLE_RM) option(ENABLE_UM "Use CUDA unified (managed) memory" Off) +option(CHAI_ENABLE_GPU_ERROR_CHECKING "Enable GPU error checking" On) set(ENABLE_TESTS On CACHE BOOL "") set(ENABLE_EXAMPLES On CACHE BOOL "") diff --git a/src/chai/managed_ptr.hpp b/src/chai/managed_ptr.hpp index fb3999a8..49ffa0e8 100644 --- a/src/chai/managed_ptr.hpp +++ b/src/chai/managed_ptr.hpp @@ -61,6 +61,8 @@ #ifdef __CUDACC__ +#ifdef CHAI_ENABLE_GPU_ERROR_CHECKING + inline void gpuErrorCheck(cudaError_t code, const char *file, int line, bool abort=true) { if (code != cudaSuccess) { @@ -71,17 +73,10 @@ inline void gpuErrorCheck(cudaError_t code, const char *file, int line, bool abo } } -#if DEBUG #define GPU_ERROR_CHECK(code) { gpuErrorCheck((code), __FILE__, __LINE__); } -#else +#else // CHAI_ENABLE_GPU_ERROR_CHECKING #define GPU_ERROR_CHECK(code) code -#endif - -inline void debug_cudaDeviceSynchronize() { -#if DEBUG - GPU_ERROR_CHECK(cudaDeviceSynchronize()); -#endif -} +#endif // CHAI_ENABLE_GPU_ERROR_CHECKING #endif // __CUDACC__ @@ -94,10 +89,7 @@ namespace chai { } struct managed_ptr_record { - managed_ptr_record() : - m_callback() - { - } + managed_ptr_record() = default; managed_ptr_record(std::function callback) : m_callback(callback) @@ -122,13 +114,14 @@ namespace chai { /// /// This wrapper stores both host and device pointers so that polymorphism can be /// used in both contexts with a single API. - /// The make_managed function calls new on both the host and device so that - /// polymorphism is valid in both contexts. Simply copying an object to the - /// device will not copy the vtable, so new must be called on the device. + /// The make_managed and make_managed_from_factory functions call new on both the + /// host and device so that polymorphism is valid in both contexts. Simply copying + /// an object to the device will not copy the vtable, so new must be called on + /// the device. /// /// Usage Requirements: - /// Methods that can be called on both the host and device must be declared - /// with the __host__ __device__ specifiers. 
This includes constructors + /// Methods that can be called on the host and/or device must be declared + /// with the __host__ and/or __device__ specifiers. This includes constructors /// and destructors. Furthermore, destructors of base and child classes /// must all be declared virtual. /// This wrapper does NOT automatically sync the device object if the host object @@ -136,37 +129,39 @@ namespace chai { /// you must explicitly modify the object in both the host context and the /// device context. /// Raw array members of T need to be initialized correctly with a host or - /// device pointer. If a ManagedArray is passed to the make_managed function - /// in place of a raw array, it will be cast to the appropriate host or device - /// pointer when passed to T's constructor on the host and on the device. If it - /// is desired that these host and device pointers be kept in sync, define a - /// callback that maintains a copy of the ManagedArray and upon the ACTION_MOVE - /// event calls the copy constructor of that ManagedArray. + /// device array. If a ManagedArray is passed to the make_managed or + /// make_managed_from_factory methods in place of a raw array, it will be + /// cast to the appropriate host or device pointer when passed to T's + /// constructor on the host and on the device. If it is desired that these + /// host and device pointers be kept in sync, define a callback that maintains + /// a copy of the ManagedArray and upon the ACTION_MOVE event calls the copy + /// constructor of that ManagedArray. /// If a raw array is passed to make_managed, accessing that member will be /// valid only in the correct context. To prevent the accidental use of that /// member in the wrong context, any methods that access it should be __host__ /// only or __device__ only. Special care should be taken when passing raw /// arrays as arguments to member functions. /// The same restrictions for raw array members also apply to raw pointer members. - /// A managed_ptr can be passed to the make_managed function in place of a raw - /// pointer, and the host constructor of T will be given the extracted host - /// pointer, and likewise the device constructor of T will be given the - /// extracted device pointer. It is recommended that a callback is defined that - /// maintains a copy of the managed_ptr and frees it on the ACTION_FREE event. - /// It is also recommended that the callback calls the copy constructor of the - /// managed_ptr on the ACTION_MOVE event so that the ACTION_MOVE event is - /// triggered also for the inner managed_ptr. + /// A managed_ptr can be passed to the make_managed or make_managed_from_factory + /// methods in place of a raw pointer, and the host constructor of T will + /// be given the extracted host pointer, and likewise the device constructor + /// of T will be given the extracted device pointer. If it is desired that these + /// host and device pointers be kept in sync, define a callback that maintains + /// a copy of the managed_ptr and upon the ACTION_MOVE event calls the copy + /// constructor of that managed_ptr. /// Again, if a raw pointer is passed to make_managed, accessing that member will /// only be valid in the correct context. Take care when passing raw pointers /// as arguments to member functions. - /// Be aware that only the debug version of CHAI will check for GPU errors. So - /// if you are seeing strange behavior and/or your code crashes in the - /// constructor/destructor of T, then build CHAI as debug to see what is - /// going on. 
For example, the constructor of T might run out of per-thread - /// stack space on the GPU. If that happens, you can increase the device - /// limit of per-thread stack space. Alternatively, you could add a call - /// to cudaDeviceSynchronize after calling make_managed and check the return - /// code of cudaDeviceSynchronize. + /// Be aware that CHAI checks every CUDA API call for GPU errors by default. To + /// turn off GPU error checking, pass -DCHAI_ENABLE_GPU_ERROR_CHECKING=OFF as + /// an argument to cmake when building CHAI. To turn on synchronization after + /// every kernel, call ArrayManager::getInstance()->enableDeviceSynchronize(). + /// Alternatively, call cudaDeviceSynchronize() after any call to make_managed, + /// make_managed_from_factory, or managed_ptr::free, and check the return code + /// for errors. If your code crashes in the constructor/destructor of T, then it + /// is recommended to turn on this synchronization. For example, the constructor + /// of T might run out of per-thread stack space on the GPU. If that happens, + /// you can increase the device limit of per-thread stack space. /// template class managed_ptr { @@ -178,7 +173,7 @@ namespace chai { /// /// Default constructor. /// - CHAI_HOST_DEVICE constexpr managed_ptr() noexcept {} + CHAI_HOST_DEVICE constexpr managed_ptr() noexcept = default; /// /// @author Alan Dayton @@ -286,8 +281,9 @@ namespace chai { /// /// Copy constructor. /// Constructs a copy of the given managed_ptr and if the execution space is - /// different, calls the user defined callback with ACTION_MOVE for each - /// of the execution spaces. + /// different from the last space the given managed_ptr was used in, calls + /// the user defined callback with ACTION_MOVE for each of the execution + /// spaces. /// /// @param[in] other The managed_ptr to copy /// @@ -306,8 +302,9 @@ namespace chai { /// /// Converting constructor. /// Constructs a copy of the given managed_ptr and if the execution space is - /// different, calls the user defined callback with ACTION_MOVE for each - /// of the execution spaces. U* must be convertible to T*. + /// different from the last space the given managed_ptr was used in, calls + /// the user defined callback with ACTION_MOVE for each of the execution + /// spaces. U* must be convertible to T*. /// /// @param[in] other The managed_ptr to copy /// @@ -373,14 +370,10 @@ namespace chai { /// /// @author Alan Dayton /// - /// Destructor - /// - CHAI_HOST_DEVICE ~managed_ptr() {} - - /// - /// @author Alan Dayton - /// - /// Copy assignment operator. Does a shallow copy. + /// Copy assignment operator. + /// Copies the given managed_ptr and if the execution space is different from + /// the last space the given managed_ptr was used in, calls the user defined + /// callback with ACTION_MOVE for each of the execution spaces. /// /// @param[in] other The managed_ptr to copy /// @@ -402,8 +395,10 @@ namespace chai { /// @author Alan Dayton /// /// Conversion copy assignment operator. - /// Copies the given managed_ptr. Does a shallow copy. U* must be convertible - /// to T*. + /// Copies the given managed_ptr and if the execution space is different from + /// the last space the given managed_ptr was used in, calls the user defined + /// callback with ACTION_MOVE for each of the execution spaces. U* must be + /// convertible to T*. 
/// /// @param[in] other The managed_ptr to copy /// @@ -501,10 +496,17 @@ namespace chai { /// @author Alan Dayton /// /// Sets the callback, which can be used to handle specific actions. - /// ACTION_MOVE can be used to call the copy constructor for ManagedArrays. - /// ACTION_FREE can be used to provide a custom deleter operation. Use - /// ExecutionSpace::NONE if freeing anything other than the actual object - /// pointers. + /// The copy constructors and copy assignment operators call the callback with + /// ACTION_MOVE if the execution space has changed since the managed_ptr was + /// last used. A common use case for this is to call the copy constructor + /// of class members that are ManagedArrays to trigger data movement. The + /// free method calls the user provided callback with ACTION_FREE in each of + /// the execution spaces with the pointers from each space. This can be used + /// to provide a custom deleter operation. If freeing anything other than the + /// actual object pointers, do that when the ExecutionSpace is NONE. The + /// callback should return true if the event has been handled (i.e. if a + /// callback is provided that only cleans up the device pointer, it should + /// return true in that case and false in every other case). /// /// @param[in] callback The callback to call when certain actions occur /// @@ -520,8 +522,10 @@ namespace chai { /// /// @author Alan Dayton /// - /// If a user callback is provided, calls the callback with the ACTION_FREE - /// event. Otherwise calls delete on the CPU and GPU pointers. + /// If a user defined callback has been provided, calls it with the ACTION_FREE + /// event in each execution space. If the callback does not handle an event + /// or a callback is not provided, this method calls delete on the host + /// and device pointers. /// CHAI_HOST void free() { if (m_pointer_record) { @@ -550,8 +554,12 @@ namespace chai { { if (pointer) { detail::destroy_on_device<<<1, 1>>>(temp); - debug_cudaDeviceSynchronize(); - GPU_ERROR_CHECK(cudaFree(temp)); + +#ifndef CHAI_DISABLE_RM + if (ArrayManager::getInstance()->deviceSynchronize()) { + GPU_ERROR_CHECK(cudaDeviceSynchronize()); + } +#endif } break; @@ -569,12 +577,6 @@ namespace chai { ExecutionSpace execSpace = static_cast(space); T* pointer = get(execSpace, false); - using T_non_const = typename std::remove_const::type; - - // We can use const_cast because can managed_ptr can only - // be constructed with non const pointers. - T_non_const* temp = const_cast(pointer); - switch (execSpace) { case CPU: delete pointer; @@ -583,9 +585,13 @@ namespace chai { case GPU: { if (pointer) { - detail::destroy_on_device<<<1, 1>>>(temp); - debug_cudaDeviceSynchronize(); - GPU_ERROR_CHECK(cudaFree(temp)); + detail::destroy_on_device<<<1, 1>>>(pointer); + +#ifndef CHAI_DISABLE_RM + if (ArrayManager::getInstance()->deviceSynchronize()) { + GPU_ERROR_CHECK(cudaDeviceSynchronize()); + } +#endif } break; @@ -770,6 +776,44 @@ namespace chai { return cpuPointer; } + /// + /// @author Alan Dayton + /// + /// Calls a factory method to create a new object on the host. + /// Sets the execution space to the CPU so that ManagedArrays and managed_ptrs + /// are moved to the host as necessary. + /// + /// @param[in] f The factory method + /// @param[in] args The arguments to the factory method + /// + /// @return The host pointer to the new object + /// + template + CHAI_HOST T* make_on_host_from_factory(F f, Args&&... 
args) { +#ifndef CHAI_DISABLE_RM + // Get the ArrayManager and save the current execution space + chai::ArrayManager* arrayManager = chai::ArrayManager::getInstance(); + ExecutionSpace currentSpace = arrayManager->getExecutionSpace(); + + // Set the execution space so that ManagedArrays and managed_ptrs + // are handled properly + arrayManager->setExecutionSpace(CPU); +#endif + + // Create the object on the device + T* cpuPointer = f(args...); + +#ifndef CHAI_DISABLE_RM + // Set the execution space back to the previous value + arrayManager->setExecutionSpace(currentSpace); +#endif + + // Return the GPU pointer + return cpuPointer; + } + #ifdef __CUDACC__ /// /// @author Alan Dayton @@ -785,8 +829,8 @@ namespace chai { template ::value, int>::type = 0> - CHAI_DEVICE void new_on_device(T* gpuPointer, Args&&... args) { - new(gpuPointer) T(args...); + CHAI_DEVICE void new_on_device(T** gpuPointer, Args&&... args) { + *gpuPointer = new T(args...); } /// @@ -803,8 +847,8 @@ namespace chai { template ::value, int>::type = 0> - CHAI_DEVICE void new_on_device(T* gpuPointer, Args&&... args) { - new(gpuPointer) T(getRawPointers(args)...); + CHAI_DEVICE void new_on_device(T** gpuPointer, Args&&... args) { + *gpuPointer = new T(getRawPointers(args)...); } /// @@ -820,11 +864,32 @@ namespace chai { /// template - __global__ void make_on_device(T* gpuPointer, Args... args) + __global__ void make_on_device(T** gpuPointer, Args... args) { new_on_device(gpuPointer, args...); } + /// + /// @author Alan Dayton + /// + /// Creates a new object on the device by calling the given factory method. + /// + /// @param[out] gpuPointer Used to return the device pointer to the new object + /// @param[in] f The factory method (must be a __device__ or __host__ __device__ + /// method + /// @param[in] args The arguments to the factory method + /// + /// @note Cannot capture argument packs in an extended device lambda, + /// so explicit kernel is needed. + /// + template + __global__ void make_on_device_from_factory(T** gpuPointer, F f, Args... args) + { + *gpuPointer = f(args...); + } + /// /// @author Alan Dayton /// @@ -835,7 +900,9 @@ namespace chai { template __global__ void destroy_on_device(T* gpuPointer) { - gpuPointer->~T(); + if (gpuPointer) { + delete gpuPointer; + } } /// @@ -860,13 +927,88 @@ namespace chai { arrayManager->setExecutionSpace(GPU); #endif - // Allocate space on the GPU to hold the new object - T* gpuPointer; - GPU_ERROR_CHECK(cudaMalloc(&gpuPointer, sizeof(T))); + // Allocate space on the GPU to hold the pointer to the new object + T** gpuBuffer; + GPU_ERROR_CHECK(cudaMalloc(&gpuBuffer, sizeof(T*))); + + // Create the object on the device + make_on_device<<<1, 1>>>(gpuBuffer, args...); + +#ifndef CHAI_DISABLE_RM + if (ArrayManager::getInstance()->deviceSynchronize()) { + GPU_ERROR_CHECK(cudaDeviceSynchronize()); + } +#endif + + // Allocate space on the CPU for the pointer and copy the pointer to the CPU + T** cpuBuffer = (T**) malloc(sizeof(T*)); + GPU_ERROR_CHECK(cudaMemcpy(cpuBuffer, gpuBuffer, sizeof(T*), + cudaMemcpyDeviceToHost)); + + // Get the GPU pointer + T* gpuPointer = cpuBuffer[0]; + + // Free the host and device buffers + free(cpuBuffer); + GPU_ERROR_CHECK(cudaFree(gpuBuffer)); + +#ifndef CHAI_DISABLE_RM + // Set the execution space back to the previous value + arrayManager->setExecutionSpace(currentSpace); +#endif + + // Return the GPU pointer + return gpuPointer; + } + + /// + /// @author Alan Dayton + /// + /// Calls a factory method to create a new object on the device. 
+ /// + /// @param[in] f The factory method + /// @param[in] args The arguments to the factory method + /// + /// @return The device pointer to the new object + /// + template + CHAI_HOST T* make_on_device_from_factory(F f, Args&&... args) { +#ifndef CHAI_DISABLE_RM + // Get the ArrayManager and save the current execution space + chai::ArrayManager* arrayManager = chai::ArrayManager::getInstance(); + ExecutionSpace currentSpace = arrayManager->getExecutionSpace(); + + // Set the execution space so that chai::ManagedArrays and + // chai::managed_ptrs are handled properly + arrayManager->setExecutionSpace(GPU); +#endif + + // Allocate space on the GPU to hold the pointer to the new object + T** gpuBuffer; + GPU_ERROR_CHECK(cudaMalloc(&gpuBuffer, sizeof(T*))); // Create the object on the device - make_on_device<<<1, 1>>>(gpuPointer, args...); - debug_cudaDeviceSynchronize(); + make_on_device_from_factory<<<1, 1>>>(gpuBuffer, f, args...); + +#ifndef CHAI_DISABLE_RM + if (ArrayManager::getInstance()->deviceSynchronize()) { + GPU_ERROR_CHECK(cudaDeviceSynchronize()); + } +#endif + + // Allocate space on the CPU for the pointer and copy the pointer to the CPU + T** cpuBuffer = (T**) malloc(sizeof(T*)); + GPU_ERROR_CHECK(cudaMemcpy(cpuBuffer, gpuBuffer, sizeof(T*), + cudaMemcpyDeviceToHost)); + + // Get the GPU pointer + T* gpuPointer = cpuBuffer[0]; + + // Free the host and device buffers + free(cpuBuffer); + GPU_ERROR_CHECK(cudaFree(gpuBuffer)); #ifndef CHAI_DISABLE_RM // Set the execution space back to the previous value @@ -931,6 +1073,46 @@ namespace chai { #endif } + /// + /// @author Alan Dayton + /// + /// Makes a managed_ptr. + /// Factory function to create managed_ptrs. + /// + /// @param[in] f The factory function that will create the object + /// @param[in] args The arguments to the factory function + /// + template + CHAI_HOST managed_ptr make_managed_from_factory(F&& f, Args&&... 
args) { + static_assert(detail::is_invocable::value, + "F is not invocable with the given arguments."); + + static_assert(std::is_pointer::type>::value, + "F does not return a pointer."); + + using R = typename std::remove_pointer::type>::type; + + static_assert(std::is_convertible::value, + "F does not return a pointer that is convertible to T*."); + +#ifdef __CUDACC__ + // Construct on the GPU first to take advantage of asynchrony + T* gpuPointer = detail::make_on_device_from_factory(f, args...); +#endif + + // Construct on the CPU + T* cpuPointer = detail::make_on_host_from_factory(f, args...); + + // Construct and return the managed_ptr +#ifdef __CUDACC__ + return managed_ptr({CPU, GPU}, {cpuPointer, gpuPointer}); +#else + return managed_ptr({CPU}, {cpuPointer}); +#endif + } + /// /// @author Alan Dayton /// diff --git a/tests/integration/managed_ptr_tests.cpp b/tests/integration/managed_ptr_tests.cpp index 908074fa..105135f6 100644 --- a/tests/integration/managed_ptr_tests.cpp +++ b/tests/integration/managed_ptr_tests.cpp @@ -200,8 +200,8 @@ TEST(managed_ptr, class_with_raw_array) ASSERT_EQ(rawArrayClass->getValue(0), expectedValue); - array.free(); rawArrayClass.free(); + array.free(); } TEST(managed_ptr, class_with_multiple_raw_arrays) @@ -222,9 +222,9 @@ TEST(managed_ptr, class_with_multiple_raw_arrays) ASSERT_EQ(multipleRawArrayClass->getValue(0, 0), expectedValue1); ASSERT_EQ(multipleRawArrayClass->getValue(1, 0), expectedValue2); - array1.free(); - array2.free(); multipleRawArrayClass.free(); + array2.free(); + array1.free(); } TEST(managed_ptr, class_with_managed_array) @@ -241,8 +241,8 @@ TEST(managed_ptr, class_with_managed_array) ASSERT_EQ(derived->getValue(0), expectedValue); - array.free(); derived.free(); + array.free(); } TEST(managed_ptr, class_with_raw_ptr) @@ -260,9 +260,9 @@ TEST(managed_ptr, class_with_raw_ptr) ASSERT_EQ((*rawPointerClass).getValue(0), expectedValue); - array.free(); - rawArrayClass.free(); rawPointerClass.free(); + rawArrayClass.free(); + array.free(); } TEST(managed_ptr, class_with_managed_ptr) @@ -286,8 +286,77 @@ TEST(managed_ptr, nested_managed_ptr) ASSERT_EQ(container->getValue(), expectedValue); - derived.free(); container.free(); + derived.free(); +} + +TEST(managed_ptr, array_of_managed_ptr) +{ + int numManagedPointers = 10; + + int* expectedValues = new int[numManagedPointers]; + + chai::managed_ptr* managedPointers = new chai::managed_ptr[numManagedPointers]; + + for (int i = 0; i < numManagedPointers; ++i) { + const int expectedValue = rand(); + expectedValues[i] = expectedValue; + managedPointers[i] = chai::make_managed(expectedValue); + } + + for (int i = 0; i < numManagedPointers; ++i) { + ASSERT_EQ(managedPointers[i]->getValue(), expectedValues[i]); + managedPointers[i].free(); + } + + delete[] managedPointers; + delete[] expectedValues; +} + +TEST(managed_ptr, c_array_of_managed_ptr) +{ + int numManagedPointers = 10; + + int* expectedValues = new int[numManagedPointers]; + + chai::managed_ptr* managedPointers = (chai::managed_ptr*) malloc(numManagedPointers*sizeof(chai::managed_ptr)); + + for (int i = 0; i < numManagedPointers; ++i) { + const int expectedValue = rand(); + expectedValues[i] = expectedValue; + managedPointers[i] = chai::make_managed(expectedValue); + } + + for (int i = 0; i < numManagedPointers; ++i) { + ASSERT_EQ(managedPointers[i]->getValue(), expectedValues[i]); + managedPointers[i].free(); + } + + free(managedPointers); + delete[] expectedValues; +} + +TEST(managed_ptr, managed_array_of_managed_ptr) +{ + int 
numManagedPointers = 10; + + int* expectedValues = new int[numManagedPointers]; + + chai::ManagedArray> managedPointers(numManagedPointers, chai::CPU); + + forall(sequential(), 0, numManagedPointers, [=] (int i) { + const int expectedValue = rand(); + expectedValues[i] = expectedValue; + managedPointers[i] = chai::make_managed(expectedValue); + }); + + forall(sequential(), 0, numManagedPointers, [=] (int i) { + ASSERT_EQ(managedPointers[i]->getValue(), expectedValues[i]); + managedPointers[i].free(); + }); + + managedPointers.free(); + delete[] expectedValues; } #ifdef __CUDACC__ @@ -331,36 +400,61 @@ GPU_TEST(managed_ptr, make_on_device) GPU_TEST(managed_ptr, gpu_new_and_delete_on_device) { - // Initialize device side memory to hold the new object - RawArrayClass* gpuPointer = nullptr; - cudaMalloc(&gpuPointer, sizeof(RawArrayClass)); + // Initialize host side memory to hold a pointer + RawArrayClass** cpuPointerHolder = (RawArrayClass**) malloc(sizeof(RawArrayClass*)); + cpuPointerHolder[0] = nullptr; + + // Initialize device side memory to hold a pointer + RawArrayClass** gpuPointerHolder = nullptr; + cudaMalloc(&gpuPointerHolder, sizeof(RawArrayClass*)); // Create on the device - chai::detail::make_on_device<<<1, 1>>>(gpuPointer); + chai::detail::make_on_device<<<1, 1>>>(gpuPointerHolder); + + // Copy to the host side memory + cudaMemcpy(cpuPointerHolder, gpuPointerHolder, sizeof(RawArrayClass*), cudaMemcpyDeviceToHost); + + // Free device side memory + cudaFree(gpuPointerHolder); - // Check the pointer - ASSERT_NE(gpuPointer, nullptr); + // Save the pointer + ASSERT_NE(cpuPointerHolder[0], nullptr); + RawArrayClass* gpuPointer = cpuPointerHolder[0]; + + // Free host side memory + free(cpuPointerHolder); - // Clean up on the device chai::detail::destroy_on_device<<<1, 1>>>(gpuPointer); } GPU_TEST(managed_ptr, gpu_build_managed_ptr) { - // Initialize device side memory to hold the new object - RawArrayClass* gpuPointer = nullptr; - cudaMalloc(&gpuPointer, sizeof(RawArrayClass)); + // Initialize host side memory to hold a pointer + RawArrayClass** cpuPointerHolder = (RawArrayClass**) malloc(sizeof(RawArrayClass*)); + cpuPointerHolder[0] = nullptr; + + // Initialize device side memory to hold a pointer + RawArrayClass** gpuPointerHolder = nullptr; + cudaMalloc(&gpuPointerHolder, sizeof(RawArrayClass*)); // Create on the device - chai::detail::make_on_device<<<1, 1>>>(gpuPointer); + chai::detail::make_on_device<<<1, 1>>>(gpuPointerHolder); - // Check the pointer - ASSERT_NE(gpuPointer, nullptr); + // Copy to the host side memory + cudaMemcpy(cpuPointerHolder, gpuPointerHolder, sizeof(RawArrayClass*), cudaMemcpyDeviceToHost); + + // Free device side memory + cudaFree(gpuPointerHolder); + + // Save the pointer + ASSERT_NE(cpuPointerHolder[0], nullptr); + RawArrayClass* gpuPointer = cpuPointerHolder[0]; + + // Free host side memory + free(cpuPointerHolder); - // Make a managed_ptr chai::managed_ptr managedPtr({chai::GPU}, {gpuPointer}); - // Clean up the memory managedPtr.free(); } @@ -404,9 +498,9 @@ GPU_TEST(managed_ptr, gpu_class_with_raw_array) results.move(chai::CPU); ASSERT_EQ(results[0], expectedValue); - array.free(); - rawArrayClass.free(); results.free(); + rawArrayClass.free(); + array.free(); } GPU_TEST(managed_ptr, gpu_class_with_raw_array_and_callback) @@ -476,9 +570,9 @@ GPU_TEST(managed_ptr, gpu_class_with_managed_array) ASSERT_EQ(results[0], expectedValue); - array.free(); - derived.free(); results.free(); + derived.free(); + array.free(); } GPU_TEST(managed_ptr, 
gpu_class_with_raw_ptr) @@ -503,10 +597,10 @@ GPU_TEST(managed_ptr, gpu_class_with_raw_ptr) results.move(chai::CPU); ASSERT_EQ(results[0], expectedValue); - array.free(); - rawArrayClass.free(); - rawPointerClass.free(); results.free(); + rawPointerClass.free(); + rawArrayClass.free(); + array.free(); } GPU_TEST(managed_ptr, gpu_class_with_managed_ptr) @@ -525,8 +619,8 @@ GPU_TEST(managed_ptr, gpu_class_with_managed_ptr) results.move(chai::CPU); ASSERT_EQ(results[0], expectedValue); - derived.free(); results.free(); + derived.free(); } GPU_TEST(managed_ptr, gpu_nested_managed_ptr) @@ -545,9 +639,9 @@ GPU_TEST(managed_ptr, gpu_nested_managed_ptr) results.move(chai::CPU); ASSERT_EQ(results[0], expectedValue); - derived.free(); - container.free(); results.free(); + container.free(); + derived.free(); } GPU_TEST(managed_ptr, gpu_multiple_inheritance) @@ -569,8 +663,8 @@ GPU_TEST(managed_ptr, gpu_multiple_inheritance) ASSERT_EQ(results[0], true); ASSERT_EQ(results[1], true); - derived.free(); results.free(); + base2.free(); } GPU_TEST(managed_ptr, static_pointer_cast) @@ -601,9 +695,9 @@ GPU_TEST(managed_ptr, static_pointer_cast) ASSERT_EQ(results[1], expectedValue); ASSERT_EQ(results[2], expectedValue); - array.free(); - derived.free(); results.free(); + derivedFromBase.free(); + array.free(); } GPU_TEST(managed_ptr, dynamic_pointer_cast) @@ -634,9 +728,9 @@ GPU_TEST(managed_ptr, dynamic_pointer_cast) ASSERT_EQ(results[1], expectedValue); ASSERT_EQ(results[2], expectedValue); - array.free(); - base.free(); results.free(); + derivedFromBase.free(); + array.free(); } GPU_TEST(managed_ptr, const_pointer_cast) @@ -667,9 +761,9 @@ GPU_TEST(managed_ptr, const_pointer_cast) ASSERT_EQ(results[1], expectedValue); ASSERT_EQ(results[2], expectedValue); - array.free(); - derivedFromConst.free(); results.free(); + constDerived.free(); + array.free(); } GPU_TEST(managed_ptr, reinterpret_pointer_cast) @@ -700,9 +794,9 @@ GPU_TEST(managed_ptr, reinterpret_pointer_cast) ASSERT_EQ(results[1], expectedValue); ASSERT_EQ(results[2], expectedValue); - array.free(); - derived.free(); results.free(); + derivedFromBase.free(); + array.free(); } #endif diff --git a/tests/unit/managed_ptr_unit_tests.cpp b/tests/unit/managed_ptr_unit_tests.cpp index 281e88a5..5e7bb17b 100644 --- a/tests/unit/managed_ptr_unit_tests.cpp +++ b/tests/unit/managed_ptr_unit_tests.cpp @@ -141,7 +141,6 @@ TEST(managed_ptr, default_constructor) EXPECT_FALSE(derived != otherDerived); EXPECT_FALSE(otherDerived != derived); - // Make sure free is a no-op derived.free(); otherDerived.free(); } @@ -162,9 +161,8 @@ TEST(managed_ptr, nullptr_constructor) EXPECT_FALSE(derived != otherDerived); EXPECT_FALSE(otherDerived != derived); - // Make sure free is a no-op - derived.free(); otherDerived.free(); + derived.free(); } TEST(managed_ptr, cpu_pointer_constructor) @@ -289,7 +287,7 @@ TEST(managed_ptr, copy_assignment_operator) EXPECT_TRUE(otherDerived == derived); EXPECT_FALSE(otherDerived != derived); - derived.free(); + otherDerived.free(); } TEST(managed_ptr, copy_constructor_from_default_constructed) @@ -306,6 +304,8 @@ TEST(managed_ptr, copy_constructor_from_default_constructed) EXPECT_EQ(bool(otherDerived), false); EXPECT_EQ(otherDerived, nullptr); EXPECT_EQ(nullptr, otherDerived); + + derived.free(); } TEST(managed_ptr, copy_assignment_operator_from_default_constructed) @@ -323,6 +323,8 @@ TEST(managed_ptr, copy_assignment_operator_from_default_constructed) EXPECT_EQ(bool(otherDerived), false); EXPECT_EQ(otherDerived, nullptr); EXPECT_EQ(nullptr, 
otherDerived); + + otherDerived.free(); } TEST(managed_ptr, conversion_copy_constructor_from_default_constructed) @@ -339,6 +341,8 @@ TEST(managed_ptr, conversion_copy_constructor_from_default_constructed) EXPECT_EQ(bool(otherDerived), false); EXPECT_EQ(otherDerived, nullptr); EXPECT_EQ(nullptr, otherDerived); + + otherDerived.free(); } TEST(managed_ptr, conversion_copy_assignment_operator_from_default_constructed) @@ -356,6 +360,8 @@ TEST(managed_ptr, conversion_copy_assignment_operator_from_default_constructed) EXPECT_EQ(bool(otherDerived), false); EXPECT_EQ(otherDerived, nullptr); EXPECT_EQ(nullptr, otherDerived); + + derived.free(); } TEST(managed_ptr, copy_assignment_operator_from_host_ptr_constructed) @@ -384,8 +390,8 @@ TEST(managed_ptr, copy_assignment_operator_from_host_ptr_constructed) EXPECT_NE(thirdDerived, nullptr); EXPECT_NE(nullptr, thirdDerived); + derived.free(); otherDerived.free(); - thirdDerived.free(); } TEST(managed_ptr, conversion_copy_assignment_operator_from_host_ptr_constructed) @@ -434,7 +440,7 @@ TEST(managed_ptr, static_pointer_cast) EXPECT_TRUE(base != nullptr); EXPECT_TRUE(nullptr != base); - derived.free(); + base.free(); } TEST(managed_ptr, dynamic_pointer_cast) @@ -531,8 +537,11 @@ GPU_TEST(managed_ptr, gpu_default_constructor) EXPECT_FALSE(array2[7]); EXPECT_FALSE(array2[8]); - array.free(); array2.free(); + array.free(); + + derived.free(); + otherDerived.free(); } GPU_TEST(managed_ptr, gpu_nullptr_constructor) @@ -570,11 +579,14 @@ GPU_TEST(managed_ptr, gpu_nullptr_constructor) EXPECT_FALSE(array2[7]); EXPECT_FALSE(array2[8]); - array.free(); array2.free(); + array.free(); + + derived.free(); + otherDerived.free(); } -GPU_TEST(managed_ptr, gpu_pointer_constructor) +GPU_TEST(managed_ptr, gpu_gpu_pointer_constructor) { TestDerived* gpuPointer = chai::detail::make_on_device(3); chai::managed_ptr derived({chai::GPU}, {gpuPointer}); @@ -612,47 +624,73 @@ GPU_TEST(managed_ptr, gpu_pointer_constructor) EXPECT_TRUE(array3[3]); EXPECT_TRUE(array3[4]); - array1.free(); - array2.free(); array3.free(); + array2.free(); + array1.free(); + + derived.free(); } GPU_TEST(managed_ptr, gpu_new_and_delete_on_device) { - // Initialize device side memory to hold the new object - Simple* gpuPointer = nullptr; - cudaMalloc(&gpuPointer, sizeof(Simple)); + // Initialize host side memory to hold a pointer + Simple** cpuPointerHolder = (Simple**) malloc(sizeof(Simple*)); + cpuPointerHolder[0] = nullptr; + + // Initialize device side memory to hold a pointer + Simple** gpuPointerHolder = nullptr; + cudaMalloc(&gpuPointerHolder, sizeof(Simple*)); // Create on the device - chai::detail::make_on_device<<<1, 1>>>(gpuPointer); + chai::detail::make_on_device<<<1, 1>>>(gpuPointerHolder); + + // Copy to the host side memory + cudaMemcpy(cpuPointerHolder, gpuPointerHolder, sizeof(Simple*), cudaMemcpyDeviceToHost); + + // Free device side memory + cudaFree(gpuPointerHolder); - // Check the pointer - ASSERT_NE(gpuPointer, nullptr); + // Save the pointer + ASSERT_NE(cpuPointerHolder[0], nullptr); + Simple* gpuPointer = cpuPointerHolder[0]; + + // Free host side memory + free(cpuPointerHolder); - // Clean up on the device chai::detail::destroy_on_device<<<1, 1>>>(gpuPointer); } GPU_TEST(managed_ptr, gpu_new_and_delete_on_device_2) { - // Initialize device side memory to hold a the new object - Simple* gpuPointer = nullptr; - cudaMalloc(&gpuPointer, sizeof(Simple)); + // Initialize host side memory to hold a pointer + Simple** cpuPointerHolder = (Simple**) malloc(sizeof(Simple*)); + 
cpuPointerHolder[0] = nullptr; + + // Initialize device side memory to hold a pointer + Simple** gpuPointerHolder = nullptr; + cudaMalloc(&gpuPointerHolder, sizeof(Simple*)); // Create on the device - chai::detail::make_on_device<<<1, 1>>>(gpuPointer); + chai::detail::make_on_device<<<1, 1>>>(gpuPointerHolder); - // Check the pointer - ASSERT_NE(gpuPointer, nullptr); + // Copy to the host side memory + cudaMemcpy(cpuPointerHolder, gpuPointerHolder, sizeof(Simple*), cudaMemcpyDeviceToHost); - // Create a managed_ptr - chai::managed_ptr test({chai::GPU}, {gpuPointer}); + // Free device side memory + cudaFree(gpuPointerHolder); + + // Save the pointer + ASSERT_NE(cpuPointerHolder[0], nullptr); + Simple* gpuPointer = cpuPointerHolder[0]; + + // Free host side memory + free(cpuPointerHolder); - // Free the memory + chai::managed_ptr test({chai::GPU}, {gpuPointer}); test.free(); } -GPU_TEST(managed_ptr, simple_cuda_cpu_and_gpu_pointer_constructor) +GPU_TEST(managed_ptr, simple_gpu_cpu_and_gpu_pointer_constructor) { Simple* gpuPointer = chai::detail::make_on_device(3); Simple* cpuPointer = new Simple(4); @@ -672,6 +710,7 @@ GPU_TEST(managed_ptr, simple_cuda_cpu_and_gpu_pointer_constructor) EXPECT_EQ(array1[0], 3); array1.free(); + simple.free(); } @@ -716,9 +755,10 @@ GPU_TEST(managed_ptr, gpu_cpu_and_gpu_pointer_constructor) EXPECT_TRUE(array3[3]); EXPECT_TRUE(array3[4]); - array1.free(); - array2.free(); array3.free(); + array2.free(); + array1.free(); + derived.free(); } @@ -754,9 +794,98 @@ GPU_TEST(managed_ptr, gpu_make_managed) EXPECT_TRUE(array3[3]); EXPECT_TRUE(array3[4]); - array.free(); - array2.free(); array3.free(); + array2.free(); + array.free(); + + derived.free(); +} + +GPU_TEST(managed_ptr, make_managed_from_factory_function) +{ + const int expectedValue = rand(); + + auto factory = [] CHAI_HOST_DEVICE (const int value) { + return Factory(value); + }; + + auto derived = chai::make_managed_from_factory(factory, expectedValue); + + EXPECT_EQ((*derived).getValue(), expectedValue); + + EXPECT_NE(derived.get(), nullptr); + EXPECT_TRUE(derived); + EXPECT_FALSE(derived == nullptr); + EXPECT_FALSE(nullptr == derived); + EXPECT_TRUE(derived != nullptr); + EXPECT_TRUE(nullptr != derived); + + derived.free(); +} + +GPU_TEST(managed_ptr, make_managed_from_factory_lambda) +{ + const int expectedValue = rand(); + + auto factory = [] CHAI_HOST_DEVICE (const int value) { + return new TestDerived(value); + }; + + auto derived = chai::make_managed_from_factory(factory, expectedValue); + + EXPECT_EQ((*derived).getValue(), expectedValue); + + EXPECT_NE(derived.get(), nullptr); + EXPECT_TRUE(derived); + EXPECT_FALSE(derived == nullptr); + EXPECT_FALSE(nullptr == derived); + EXPECT_TRUE(derived != nullptr); + EXPECT_TRUE(nullptr != derived); + + derived.free(); +} + +GPU_TEST(managed_ptr, make_managed_from_overloaded_factory_function) +{ + const int expectedValue = rand(); + + auto factory = [] CHAI_HOST_DEVICE (const int value) { + return OverloadedFactory(value); + }; + + auto derived = chai::make_managed_from_factory(factory, expectedValue); + + EXPECT_EQ((*derived).getValue(), expectedValue); + + EXPECT_NE(derived.get(), nullptr); + EXPECT_TRUE(derived); + EXPECT_FALSE(derived == nullptr); + EXPECT_FALSE(nullptr == derived); + EXPECT_TRUE(derived != nullptr); + EXPECT_TRUE(nullptr != derived); + + derived.free(); +} + +GPU_TEST(managed_ptr, make_managed_from_factory_static_member_function) +{ + const int expectedValue = rand(); + + auto factory = [] CHAI_HOST_DEVICE (const int value) { + return 
TestBase::Factory(value); + }; + + auto derived = chai::make_managed_from_factory(factory, expectedValue); + + EXPECT_EQ((*derived).getValue(), expectedValue); + + EXPECT_NE(derived.get(), nullptr); + EXPECT_TRUE(derived); + EXPECT_FALSE(derived == nullptr); + EXPECT_FALSE(nullptr == derived); + EXPECT_TRUE(derived != nullptr); + EXPECT_TRUE(nullptr != derived); + derived.free(); } @@ -817,10 +946,11 @@ GPU_TEST(managed_ptr, gpu_copy_constructor) EXPECT_TRUE(array3[12]); EXPECT_FALSE(array3[13]); - array.free(); - array2.free(); array3.free(); - otherDerived.free(); + array2.free(); + array.free(); + + derived.free(); } GPU_TEST(managed_ptr, gpu_converting_constructor) @@ -880,10 +1010,11 @@ GPU_TEST(managed_ptr, gpu_converting_constructor) EXPECT_TRUE(array3[12]); EXPECT_FALSE(array3[13]); - array.free(); - array2.free(); array3.free(); - derived.free(); + array2.free(); + array.free(); + + base.free(); } GPU_TEST(managed_ptr, gpu_copy_assignment_operator) @@ -944,10 +1075,55 @@ GPU_TEST(managed_ptr, gpu_copy_assignment_operator) EXPECT_TRUE(array3[12]); EXPECT_FALSE(array3[13]); - array.free(); - array2.free(); array3.free(); - otherDerived.free(); + array2.free(); + array.free(); + + derived.free(); +} + +#endif + +// Enable the following tests to ensure that proper compiler errors are given +// for bad arguments since otherwise it is difficult to make sure the template +// metaprogramming is correct. + +#if 0 + +// Should give something like the following: +// error: static assertion failed: F is not invocable with the given arguments. + +TEST(managed_ptr, bad_function_to_make_managed_from_factory_function) +{ + const int expectedValue = rand(); + + auto factory = [] CHAI_HOST (const int value) { + return new TestDerived(value); + }; + + auto derived = chai::make_managed_from_factory(expectedValue, factory); + + EXPECT_EQ((*derived).getValue(), expectedValue); +} + +#endif + +#if 0 + +// Should give something like the following: +// error: static assertion failed: F is not invocable with the given arguments. 
+ +TEST(managed_ptr, bad_arguments_to_make_managed_from_factory_function) +{ + const int expectedValue = rand(); + + auto factory = [] CHAI_HOST (const int value) { + return new TestDerived(value); + }; + + auto derived = chai::make_managed_from_factory(factory, expectedValue, 3); + + EXPECT_EQ((*derived).getValue(), expectedValue); } #endif From d6db9218c3f1616b97f55187a48a9891b6173766 Mon Sep 17 00:00:00 2001 From: Alan Dayton Date: Thu, 5 Dec 2019 12:49:59 -0800 Subject: [PATCH 54/58] Reduce duplication in benchmarks --- benchmarks/chai_managed_ptr_benchmarks.cpp | 347 +++++++++++++++++++++ 1 file changed, 347 insertions(+) diff --git a/benchmarks/chai_managed_ptr_benchmarks.cpp b/benchmarks/chai_managed_ptr_benchmarks.cpp index b47c2a48..fc2b82b2 100644 --- a/benchmarks/chai_managed_ptr_benchmarks.cpp +++ b/benchmarks/chai_managed_ptr_benchmarks.cpp @@ -52,6 +52,8 @@ class Base { public: CHAI_HOST_DEVICE virtual void scale(size_t numValues, int* values) = 0; + + CHAI_HOST_DEVICE virtual void sumAndScale(size_t numValues, int* values, int& value) = 0; }; class Derived : public Base { @@ -64,6 +66,16 @@ class Derived : public Base { } } + CHAI_HOST_DEVICE virtual void sumAndScale(size_t numValues, int* values, int& value) override { + int result = 0; + + for (size_t i = 0; i < numValues; ++i) { + result += values[i]; + } + + value *= m_value * result; + } + private: int m_value = -1; }; @@ -74,6 +86,10 @@ class BaseCRTP { CHAI_HOST_DEVICE void scale(size_t numValues, int* values) { return static_cast(this)->scale(numValues, values); } + + CHAI_HOST_DEVICE void sumAndScale(size_t numValues, int* values, int& value) { + return static_cast(this)->sumAndScale(numValues, values, value); + } }; class DerivedCRTP : public BaseCRTP { @@ -86,6 +102,16 @@ class DerivedCRTP : public BaseCRTP { } } + CHAI_HOST_DEVICE void sumAndScale(size_t numValues, int* values, int& value) { + int result = 0; + + for (size_t i = 0; i < numValues; ++i) { + result += values[i]; + } + + value *= m_value * result; + } + private: int m_value = -1; }; @@ -100,6 +126,16 @@ class NoInheritance { } } + CHAI_HOST_DEVICE void sumAndScale(size_t numValues, int* values, int& value) { + int result = 0; + + for (size_t i = 0; i < numValues; ++i) { + result += values[i]; + } + + value *= m_value * result; + } + private: int m_value = -1; }; @@ -390,6 +426,7 @@ void benchmark_use_managed_ptr_gpu(benchmark::State& state) BENCHMARK(benchmark_use_managed_ptr_gpu); + // Curiously recurring template pattern __global__ void square(BaseCRTP object, size_t numValues, int* values) { object.scale(numValues, values); @@ -448,6 +485,316 @@ void benchmark_no_inheritance_gpu(benchmark::State& state) BENCHMARK(benchmark_no_inheritance_gpu); +__global__ void square(size_t numValues, int* values, chai::managed_ptr object) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + + if (i < numValues) { + int temp[4] = {i, i+1, i+2, i+3}; + object->sumAndScale(4, temp, values[i]); + } +} + +// managed_ptr (bulk) +template +void benchmark_bulk_use_managed_ptr_gpu(benchmark::State& state) +{ + chai::managed_ptr object = chai::make_managed(2); + + int* values; + cudaMalloc(&values, N * sizeof(int)); + fill<<<(N+255)/256, 256>>>(N, values); + + cudaDeviceSynchronize(); + + while (state.KeepRunning()) { + square<<<(N+255)/256, 256>>>(N, values, object); + cudaDeviceSynchronize(); + } + + cudaFree(values); + object.free(); + cudaDeviceSynchronize(); +} + +BENCHMARK_TEMPLATE(benchmark_bulk_use_managed_ptr_gpu, 1); 
+BENCHMARK_TEMPLATE(benchmark_bulk_use_managed_ptr_gpu, 256); +BENCHMARK_TEMPLATE(benchmark_bulk_use_managed_ptr_gpu, 512); +BENCHMARK_TEMPLATE(benchmark_bulk_use_managed_ptr_gpu, 1024); +BENCHMARK_TEMPLATE(benchmark_bulk_use_managed_ptr_gpu, 2048); +BENCHMARK_TEMPLATE(benchmark_bulk_use_managed_ptr_gpu, 4096); +BENCHMARK_TEMPLATE(benchmark_bulk_use_managed_ptr_gpu, 8192); +BENCHMARK_TEMPLATE(benchmark_bulk_use_managed_ptr_gpu, 16384); +BENCHMARK_TEMPLATE(benchmark_bulk_use_managed_ptr_gpu, 32768); +BENCHMARK_TEMPLATE(benchmark_bulk_use_managed_ptr_gpu, 65536); +BENCHMARK_TEMPLATE(benchmark_bulk_use_managed_ptr_gpu, 131072); +BENCHMARK_TEMPLATE(benchmark_bulk_use_managed_ptr_gpu, 262144); +BENCHMARK_TEMPLATE(benchmark_bulk_use_managed_ptr_gpu, 524288); +BENCHMARK_TEMPLATE(benchmark_bulk_use_managed_ptr_gpu, 1048576); +BENCHMARK_TEMPLATE(benchmark_bulk_use_managed_ptr_gpu, 2097152); + +// Curiously recurring template pattern +__global__ void square(size_t numValues, int* values, BaseCRTP object) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + + if (i < numValues) { + int temp[4] = {i, i+1, i+2, i+3}; + object.sumAndScale(4, temp, values[i]); + } +} + +template +void benchmark_bulk_curiously_recurring_template_pattern_gpu(benchmark::State& state) +{ + BaseCRTP* derivedCRTP = new DerivedCRTP(2); + auto object = *derivedCRTP; + + int* values; + cudaMalloc(&values, N * sizeof(int)); + fill<<<(N+255)/256, 256>>>(N, values); + + cudaDeviceSynchronize(); + + while (state.KeepRunning()) { + square<<<(N+255)/256, 256>>>(N, values, object); + cudaDeviceSynchronize(); + } + + cudaFree(values); + delete derivedCRTP; + cudaDeviceSynchronize(); +} + +BENCHMARK_TEMPLATE(benchmark_bulk_curiously_recurring_template_pattern_gpu, 1); +BENCHMARK_TEMPLATE(benchmark_bulk_curiously_recurring_template_pattern_gpu, 256); +BENCHMARK_TEMPLATE(benchmark_bulk_curiously_recurring_template_pattern_gpu, 512); +BENCHMARK_TEMPLATE(benchmark_bulk_curiously_recurring_template_pattern_gpu, 1024); +BENCHMARK_TEMPLATE(benchmark_bulk_curiously_recurring_template_pattern_gpu, 2048); +BENCHMARK_TEMPLATE(benchmark_bulk_curiously_recurring_template_pattern_gpu, 4096); +BENCHMARK_TEMPLATE(benchmark_bulk_curiously_recurring_template_pattern_gpu, 8192); +BENCHMARK_TEMPLATE(benchmark_bulk_curiously_recurring_template_pattern_gpu, 16384); +BENCHMARK_TEMPLATE(benchmark_bulk_curiously_recurring_template_pattern_gpu, 32768); +BENCHMARK_TEMPLATE(benchmark_bulk_curiously_recurring_template_pattern_gpu, 65536); +BENCHMARK_TEMPLATE(benchmark_bulk_curiously_recurring_template_pattern_gpu, 131072); +BENCHMARK_TEMPLATE(benchmark_bulk_curiously_recurring_template_pattern_gpu, 262144); +BENCHMARK_TEMPLATE(benchmark_bulk_curiously_recurring_template_pattern_gpu, 524288); +BENCHMARK_TEMPLATE(benchmark_bulk_curiously_recurring_template_pattern_gpu, 1048576); +BENCHMARK_TEMPLATE(benchmark_bulk_curiously_recurring_template_pattern_gpu, 2097152); + +// Class without inheritance +__global__ void square(size_t numValues, int* values, NoInheritance object) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + + if (i < numValues) { + int temp[4] = {i, i+1, i+2, i+3}; + object.sumAndScale(4, temp, values[i]); + } +} + +template +void benchmark_bulk_no_inheritance_gpu(benchmark::State& state) +{ + NoInheritance* noInheritance = new NoInheritance(2); + auto object = *noInheritance; + + int* values; + cudaMalloc(&values, N * sizeof(int)); + fill<<<(N+255)/256, 256>>>(N, values); + + cudaDeviceSynchronize(); + + while (state.KeepRunning()) { + 
square<<<(N+255)/256, 256>>>(N, values, object); + cudaDeviceSynchronize(); + } + + cudaFree(values); + delete noInheritance; + cudaDeviceSynchronize(); +} + +BENCHMARK_TEMPLATE(benchmark_bulk_no_inheritance_gpu, 1); +BENCHMARK_TEMPLATE(benchmark_bulk_no_inheritance_gpu, 256); +BENCHMARK_TEMPLATE(benchmark_bulk_no_inheritance_gpu, 512); +BENCHMARK_TEMPLATE(benchmark_bulk_no_inheritance_gpu, 1024); +BENCHMARK_TEMPLATE(benchmark_bulk_no_inheritance_gpu, 2048); +BENCHMARK_TEMPLATE(benchmark_bulk_no_inheritance_gpu, 4096); +BENCHMARK_TEMPLATE(benchmark_bulk_no_inheritance_gpu, 8192); +BENCHMARK_TEMPLATE(benchmark_bulk_no_inheritance_gpu, 16384); +BENCHMARK_TEMPLATE(benchmark_bulk_no_inheritance_gpu, 32768); +BENCHMARK_TEMPLATE(benchmark_bulk_no_inheritance_gpu, 65536); +BENCHMARK_TEMPLATE(benchmark_bulk_no_inheritance_gpu, 131072); +BENCHMARK_TEMPLATE(benchmark_bulk_no_inheritance_gpu, 262144); +BENCHMARK_TEMPLATE(benchmark_bulk_no_inheritance_gpu, 524288); +BENCHMARK_TEMPLATE(benchmark_bulk_no_inheritance_gpu, 1048576); +BENCHMARK_TEMPLATE(benchmark_bulk_no_inheritance_gpu, 2097152); + +#endif + +// managed_ptr +template +static void benchmark_bulk_polymorphism_cpu(benchmark::State& state) +{ + Base* object = new Derived(2); + + int* values = (int*) malloc(N * sizeof(int)); + + for (size_t i = 0; i < N; ++i) { + values[i] = i * i; + } + +#ifdef __CUDACC__ + cudaDeviceSynchronize(); +#endif + + while (state.KeepRunning()) { + for (int i = 0; i < N; ++i) { + int temp[4] = {i, i+1, i+2, i+3}; + object->sumAndScale(4, temp, values[i]); + } + } + + delete object; +#ifdef __CUDACC__ + cudaDeviceSynchronize(); +#endif +} + +BENCHMARK_TEMPLATE(benchmark_bulk_polymorphism_cpu, 1); +BENCHMARK_TEMPLATE(benchmark_bulk_polymorphism_cpu, 256); +BENCHMARK_TEMPLATE(benchmark_bulk_polymorphism_cpu, 512); +BENCHMARK_TEMPLATE(benchmark_bulk_polymorphism_cpu, 1024); +BENCHMARK_TEMPLATE(benchmark_bulk_polymorphism_cpu, 2048); +BENCHMARK_TEMPLATE(benchmark_bulk_polymorphism_cpu, 4096); +BENCHMARK_TEMPLATE(benchmark_bulk_polymorphism_cpu, 8192); +BENCHMARK_TEMPLATE(benchmark_bulk_polymorphism_cpu, 16384); +BENCHMARK_TEMPLATE(benchmark_bulk_polymorphism_cpu, 32768); +BENCHMARK_TEMPLATE(benchmark_bulk_polymorphism_cpu, 65536); +BENCHMARK_TEMPLATE(benchmark_bulk_polymorphism_cpu, 131072); +BENCHMARK_TEMPLATE(benchmark_bulk_polymorphism_cpu, 262144); +BENCHMARK_TEMPLATE(benchmark_bulk_polymorphism_cpu, 524288); +BENCHMARK_TEMPLATE(benchmark_bulk_polymorphism_cpu, 1048576); +BENCHMARK_TEMPLATE(benchmark_bulk_polymorphism_cpu, 2097152); + +// managed_ptr +template +static void benchmark_bulk_use_managed_ptr_cpu(benchmark::State& state) +{ + chai::managed_ptr object = chai::make_managed(2); + + int* values = (int*) malloc(N * sizeof(int)); + + for (size_t i = 0; i < N; ++i) { + values[i] = i * i; + } + +#ifdef __CUDACC__ + cudaDeviceSynchronize(); #endif + while (state.KeepRunning()) { + for (int i = 0; i < N; ++i) { + int temp[4] = {i, i+1, i+2, i+3}; + object->sumAndScale(4, temp, values[i]); + } + } + + object.free(); + cudaDeviceSynchronize(); +} + +BENCHMARK_TEMPLATE(benchmark_bulk_use_managed_ptr_cpu, 1); +BENCHMARK_TEMPLATE(benchmark_bulk_use_managed_ptr_cpu, 256); +BENCHMARK_TEMPLATE(benchmark_bulk_use_managed_ptr_cpu, 512); +BENCHMARK_TEMPLATE(benchmark_bulk_use_managed_ptr_cpu, 1024); +BENCHMARK_TEMPLATE(benchmark_bulk_use_managed_ptr_cpu, 2048); +BENCHMARK_TEMPLATE(benchmark_bulk_use_managed_ptr_cpu, 4096); +BENCHMARK_TEMPLATE(benchmark_bulk_use_managed_ptr_cpu, 8192); 
+BENCHMARK_TEMPLATE(benchmark_bulk_use_managed_ptr_cpu, 16384); +BENCHMARK_TEMPLATE(benchmark_bulk_use_managed_ptr_cpu, 32768); +BENCHMARK_TEMPLATE(benchmark_bulk_use_managed_ptr_cpu, 65536); +BENCHMARK_TEMPLATE(benchmark_bulk_use_managed_ptr_cpu, 131072); +BENCHMARK_TEMPLATE(benchmark_bulk_use_managed_ptr_cpu, 262144); +BENCHMARK_TEMPLATE(benchmark_bulk_use_managed_ptr_cpu, 524288); +BENCHMARK_TEMPLATE(benchmark_bulk_use_managed_ptr_cpu, 1048576); +BENCHMARK_TEMPLATE(benchmark_bulk_use_managed_ptr_cpu, 2097152); + +// Curiously recurring template pattern +template +static void benchmark_bulk_curiously_recurring_template_pattern_cpu(benchmark::State& state) +{ + BaseCRTP* object = new DerivedCRTP(2); + + int* values = (int*) malloc(N * sizeof(int)); + + for (size_t i = 0; i < N; ++i) { + values[i] = i * i; + } + + while (state.KeepRunning()) { + for (int i = 0; i < N; ++i) { + int temp[4] = {i, i+1, i+2, i+3}; + object->sumAndScale(4, temp, values[i]); + } + } + + free(values); + delete object; +} + +BENCHMARK_TEMPLATE(benchmark_bulk_curiously_recurring_template_pattern_cpu, 1); +BENCHMARK_TEMPLATE(benchmark_bulk_curiously_recurring_template_pattern_cpu, 256); +BENCHMARK_TEMPLATE(benchmark_bulk_curiously_recurring_template_pattern_cpu, 512); +BENCHMARK_TEMPLATE(benchmark_bulk_curiously_recurring_template_pattern_cpu, 1024); +BENCHMARK_TEMPLATE(benchmark_bulk_curiously_recurring_template_pattern_cpu, 2048); +BENCHMARK_TEMPLATE(benchmark_bulk_curiously_recurring_template_pattern_cpu, 4096); +BENCHMARK_TEMPLATE(benchmark_bulk_curiously_recurring_template_pattern_cpu, 8192); +BENCHMARK_TEMPLATE(benchmark_bulk_curiously_recurring_template_pattern_cpu, 16384); +BENCHMARK_TEMPLATE(benchmark_bulk_curiously_recurring_template_pattern_cpu, 32768); +BENCHMARK_TEMPLATE(benchmark_bulk_curiously_recurring_template_pattern_cpu, 65536); +BENCHMARK_TEMPLATE(benchmark_bulk_curiously_recurring_template_pattern_cpu, 131072); +BENCHMARK_TEMPLATE(benchmark_bulk_curiously_recurring_template_pattern_cpu, 262144); +BENCHMARK_TEMPLATE(benchmark_bulk_curiously_recurring_template_pattern_cpu, 524288); +BENCHMARK_TEMPLATE(benchmark_bulk_curiously_recurring_template_pattern_cpu, 1048576); +BENCHMARK_TEMPLATE(benchmark_bulk_curiously_recurring_template_pattern_cpu, 2097152); + +// Class without inheritance +template +static void benchmark_bulk_no_inheritance_cpu(benchmark::State& state) +{ + NoInheritance* object = new NoInheritance(2); + + int* values = (int*) malloc(N * sizeof(int)); + + for (size_t i = 0; i < N; ++i) { + values[i] = i * i; + } + + while (state.KeepRunning()) { + for (int i = 0; i < N; ++i) { + int temp[4] = {i, i+1, i+2, i+3}; + object->sumAndScale(4, temp, values[i]); + } + } + + free(values); + delete object; +} + +BENCHMARK_TEMPLATE(benchmark_bulk_no_inheritance_cpu, 1); +BENCHMARK_TEMPLATE(benchmark_bulk_no_inheritance_cpu, 256); +BENCHMARK_TEMPLATE(benchmark_bulk_no_inheritance_cpu, 512); +BENCHMARK_TEMPLATE(benchmark_bulk_no_inheritance_cpu, 1024); +BENCHMARK_TEMPLATE(benchmark_bulk_no_inheritance_cpu, 2048); +BENCHMARK_TEMPLATE(benchmark_bulk_no_inheritance_cpu, 4096); +BENCHMARK_TEMPLATE(benchmark_bulk_no_inheritance_cpu, 8192); +BENCHMARK_TEMPLATE(benchmark_bulk_no_inheritance_cpu, 16384); +BENCHMARK_TEMPLATE(benchmark_bulk_no_inheritance_cpu, 32768); +BENCHMARK_TEMPLATE(benchmark_bulk_no_inheritance_cpu, 65536); +BENCHMARK_TEMPLATE(benchmark_bulk_no_inheritance_cpu, 131072); +BENCHMARK_TEMPLATE(benchmark_bulk_no_inheritance_cpu, 262144); 
+BENCHMARK_TEMPLATE(benchmark_bulk_no_inheritance_cpu, 524288); +BENCHMARK_TEMPLATE(benchmark_bulk_no_inheritance_cpu, 1048576); +BENCHMARK_TEMPLATE(benchmark_bulk_no_inheritance_cpu, 2097152); + BENCHMARK_MAIN(); + From 403f9ba97e973e7a98da60d11740252cfbc6a34b Mon Sep 17 00:00:00 2001 From: Alan Dayton Date: Thu, 5 Dec 2019 15:49:23 -0800 Subject: [PATCH 55/58] Fix benchmarks --- benchmarks/chai_managed_ptr_benchmarks.cpp | 122 +++++++++++---------- 1 file changed, 64 insertions(+), 58 deletions(-) diff --git a/benchmarks/chai_managed_ptr_benchmarks.cpp b/benchmarks/chai_managed_ptr_benchmarks.cpp index fc2b82b2..f23611e3 100644 --- a/benchmarks/chai_managed_ptr_benchmarks.cpp +++ b/benchmarks/chai_managed_ptr_benchmarks.cpp @@ -51,25 +51,27 @@ class Base { public: - CHAI_HOST_DEVICE virtual void scale(size_t numValues, int* values) = 0; + CHAI_HOST_DEVICE virtual void scale(int numValues, int* values) = 0; - CHAI_HOST_DEVICE virtual void sumAndScale(size_t numValues, int* values, int& value) = 0; + CHAI_HOST_DEVICE virtual void sumAndScale(int numValues, int* values, int& value) = 0; + + CHAI_HOST_DEVICE virtual ~Base() = default; }; class Derived : public Base { public: CHAI_HOST_DEVICE Derived(int value) : Base(), m_value(value) {} - CHAI_HOST_DEVICE virtual void scale(size_t numValues, int* values) override { - for (size_t i = 0; i < numValues; ++i) { + CHAI_HOST_DEVICE virtual void scale(int numValues, int* values) override { + for (int i = 0; i < numValues; ++i) { values[i] *= m_value; } } - CHAI_HOST_DEVICE virtual void sumAndScale(size_t numValues, int* values, int& value) override { + CHAI_HOST_DEVICE virtual void sumAndScale(int numValues, int* values, int& value) override { int result = 0; - for (size_t i = 0; i < numValues; ++i) { + for (int i = 0; i < numValues; ++i) { result += values[i]; } @@ -83,11 +85,11 @@ class Derived : public Base { template class BaseCRTP { public: - CHAI_HOST_DEVICE void scale(size_t numValues, int* values) { + CHAI_HOST_DEVICE void scale(int numValues, int* values) { return static_cast(this)->scale(numValues, values); } - CHAI_HOST_DEVICE void sumAndScale(size_t numValues, int* values, int& value) { + CHAI_HOST_DEVICE void sumAndScale(int numValues, int* values, int& value) { return static_cast(this)->sumAndScale(numValues, values, value); } }; @@ -96,16 +98,16 @@ class DerivedCRTP : public BaseCRTP { public: CHAI_HOST_DEVICE DerivedCRTP(int value) : BaseCRTP(), m_value(value) {} - CHAI_HOST_DEVICE void scale(size_t numValues, int* values) { - for (size_t i = 0; i < numValues; ++i) { + CHAI_HOST_DEVICE void scale(int numValues, int* values) { + for (int i = 0; i < numValues; ++i) { values[i] *= m_value; } } - CHAI_HOST_DEVICE void sumAndScale(size_t numValues, int* values, int& value) { + CHAI_HOST_DEVICE void sumAndScale(int numValues, int* values, int& value) { int result = 0; - for (size_t i = 0; i < numValues; ++i) { + for (int i = 0; i < numValues; ++i) { result += values[i]; } @@ -120,16 +122,16 @@ class NoInheritance { public: CHAI_HOST_DEVICE NoInheritance(int value) : m_value(value) {} - CHAI_HOST_DEVICE void scale(size_t numValues, int* values) { - for (size_t i = 0; i < numValues; ++i) { + CHAI_HOST_DEVICE void scale(int numValues, int* values) { + for (int i = 0; i < numValues; ++i) { values[i] *= m_value; } } - CHAI_HOST_DEVICE void sumAndScale(size_t numValues, int* values, int& value) { + CHAI_HOST_DEVICE void sumAndScale(int numValues, int* values, int& value) { int result = 0; - for (size_t i = 0; i < numValues; ++i) { + for (int i 
= 0; i < numValues; ++i) { result += values[i]; } @@ -140,7 +142,7 @@ class NoInheritance { int m_value = -1; }; -template +template class ClassWithSize { private: char m_values[N]; @@ -161,10 +163,10 @@ static void benchmark_use_managed_ptr_cpu(benchmark::State& state) { chai::managed_ptr object = chai::make_managed(2); - size_t numValues = 100; + int numValues = 100; int* values = (int*) malloc(100 * sizeof(int)); - for (size_t i = 0; i < numValues; ++i) { + for (int i = 0; i < numValues; ++i) { values[i] = i * i; } @@ -177,7 +179,9 @@ static void benchmark_use_managed_ptr_cpu(benchmark::State& state) } object.free(); +#ifdef __CUDACC__ cudaDeviceSynchronize(); +#endif } BENCHMARK(benchmark_use_managed_ptr_cpu); @@ -187,10 +191,10 @@ static void benchmark_curiously_recurring_template_pattern_cpu(benchmark::State& { BaseCRTP* object = new DerivedCRTP(2); - size_t numValues = 100; + int numValues = 100; int* values = (int*) malloc(100 * sizeof(int)); - for (size_t i = 0; i < numValues; ++i) { + for (int i = 0; i < numValues; ++i) { values[i] = i * i; } @@ -209,10 +213,10 @@ static void benchmark_no_inheritance_cpu(benchmark::State& state) { NoInheritance* object = new NoInheritance(2); - size_t numValues = 100; + int numValues = 100; int* values = (int*) malloc(100 * sizeof(int)); - for (size_t i = 0; i < numValues; ++i) { + for (int i = 0; i < numValues; ++i) { values[i] = i * i; } @@ -228,11 +232,11 @@ BENCHMARK(benchmark_no_inheritance_cpu); #if defined(CHAI_ENABLE_CUDA) || defined(CHAI_ENABLE_HIP) -template +template __global__ void copy_kernel(ClassWithSize) {} // Benchmark how long it takes to copy a class to the GPU -template +template static void benchmark_pass_copy_to_gpu(benchmark::State& state) { ClassWithSize helper; @@ -248,7 +252,7 @@ BENCHMARK_TEMPLATE(benchmark_pass_copy_to_gpu, 64); BENCHMARK_TEMPLATE(benchmark_pass_copy_to_gpu, 512); BENCHMARK_TEMPLATE(benchmark_pass_copy_to_gpu, 4096); -template +template static void benchmark_copy_to_gpu(benchmark::State& state) { ClassWithSize* cpuPointer = new ClassWithSize(); @@ -273,17 +277,17 @@ BENCHMARK_TEMPLATE(benchmark_copy_to_gpu, 262144); BENCHMARK_TEMPLATE(benchmark_copy_to_gpu, 2097152); // Benchmark how long it takes to call placement new on the GPU -template +template __global__ void placement_new_kernel(ClassWithSize* address) { (void) new(address) ClassWithSize(); } -template +template __global__ void placement_delete_kernel(ClassWithSize* address) { address->~ClassWithSize(); } -template +template static void benchmark_placement_new_on_gpu(benchmark::State& state) { while (state.KeepRunning()) { @@ -305,17 +309,17 @@ BENCHMARK_TEMPLATE(benchmark_placement_new_on_gpu, 262144); BENCHMARK_TEMPLATE(benchmark_placement_new_on_gpu, 2097152); // Benchmark how long it takes to call new on the GPU -template +template __global__ void create_kernel(ClassWithSize** address) { *address = new ClassWithSize(); } -template +template __global__ void delete_kernel(ClassWithSize** address) { delete *address; } -template +template static void benchmark_new_on_gpu(benchmark::State& state) { while (state.KeepRunning()) { @@ -337,12 +341,12 @@ BENCHMARK_TEMPLATE(benchmark_new_on_gpu, 262144); BENCHMARK_TEMPLATE(benchmark_new_on_gpu, 2097152); // Benchmark current approach -template +template __global__ void delete_kernel_2(ClassWithSize* address) { delete address; } -template +template static void benchmark_new_on_gpu_and_copy_to_host(benchmark::State& state) { while (state.KeepRunning()) { @@ -368,12 +372,12 @@ 
BENCHMARK_TEMPLATE(benchmark_new_on_gpu_and_copy_to_host, 262144); BENCHMARK_TEMPLATE(benchmark_new_on_gpu_and_copy_to_host, 2097152); // Benchmark how long it takes to create a stack object on the GPU -template +template __global__ void create_on_stack_kernel() { (void) ClassWithSize(); } -template +template static void benchmark_create_on_stack_on_gpu(benchmark::State& state) { while (state.KeepRunning()) { @@ -391,15 +395,15 @@ BENCHMARK_TEMPLATE(benchmark_create_on_stack_on_gpu, 262144); BENCHMARK_TEMPLATE(benchmark_create_on_stack_on_gpu, 2097152); // Use managed_ptr -__global__ void fill(size_t numValues, int* values) { - size_t i = blockIdx.x * blockDim.x + threadIdx.x; +__global__ void fill(int numValues, int* values) { + int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < numValues) { values[i] = i * i; } } -__global__ void square(chai::managed_ptr object, size_t numValues, int* values) { +__global__ void square(chai::managed_ptr object, int numValues, int* values) { object->scale(numValues, values); } @@ -407,7 +411,7 @@ void benchmark_use_managed_ptr_gpu(benchmark::State& state) { chai::managed_ptr object = chai::make_managed(2); - size_t numValues = 100; + int numValues = 100; int* values; cudaMalloc(&values, numValues * sizeof(int)); fill<<<1, 100>>>(numValues, values); @@ -428,7 +432,7 @@ BENCHMARK(benchmark_use_managed_ptr_gpu); // Curiously recurring template pattern -__global__ void square(BaseCRTP object, size_t numValues, int* values) { +__global__ void square(BaseCRTP object, int numValues, int* values) { object.scale(numValues, values); } @@ -437,7 +441,7 @@ void benchmark_curiously_recurring_template_pattern_gpu(benchmark::State& state) BaseCRTP* derivedCRTP = new DerivedCRTP(2); auto object = *derivedCRTP; - size_t numValues = 100; + int numValues = 100; int* values; cudaMalloc(&values, numValues * sizeof(int)); fill<<<1, 100>>>(numValues, values); @@ -457,7 +461,7 @@ void benchmark_curiously_recurring_template_pattern_gpu(benchmark::State& state) BENCHMARK(benchmark_curiously_recurring_template_pattern_gpu); // Class without inheritance -__global__ void square(NoInheritance object, size_t numValues, int* values) { +__global__ void square(NoInheritance object, int numValues, int* values) { object.scale(numValues, values); } @@ -466,7 +470,7 @@ void benchmark_no_inheritance_gpu(benchmark::State& state) NoInheritance* noInheritance = new NoInheritance(2); auto object = *noInheritance; - size_t numValues = 100; + int numValues = 100; int* values; cudaMalloc(&values, numValues * sizeof(int)); fill<<<1, 100>>>(numValues, values); @@ -485,7 +489,7 @@ void benchmark_no_inheritance_gpu(benchmark::State& state) BENCHMARK(benchmark_no_inheritance_gpu); -__global__ void square(size_t numValues, int* values, chai::managed_ptr object) { +__global__ void square(int numValues, int* values, chai::managed_ptr object) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < numValues) { @@ -495,7 +499,7 @@ __global__ void square(size_t numValues, int* values, chai::managed_ptr ob } // managed_ptr (bulk) -template +template void benchmark_bulk_use_managed_ptr_gpu(benchmark::State& state) { chai::managed_ptr object = chai::make_managed(2); @@ -533,7 +537,7 @@ BENCHMARK_TEMPLATE(benchmark_bulk_use_managed_ptr_gpu, 1048576); BENCHMARK_TEMPLATE(benchmark_bulk_use_managed_ptr_gpu, 2097152); // Curiously recurring template pattern -__global__ void square(size_t numValues, int* values, BaseCRTP object) { +__global__ void square(int numValues, int* values, BaseCRTP object) { int i = 
blockIdx.x * blockDim.x + threadIdx.x; if (i < numValues) { @@ -542,7 +546,7 @@ __global__ void square(size_t numValues, int* values, BaseCRTP obje } } -template +template void benchmark_bulk_curiously_recurring_template_pattern_gpu(benchmark::State& state) { BaseCRTP* derivedCRTP = new DerivedCRTP(2); @@ -581,7 +585,7 @@ BENCHMARK_TEMPLATE(benchmark_bulk_curiously_recurring_template_pattern_gpu, 1048 BENCHMARK_TEMPLATE(benchmark_bulk_curiously_recurring_template_pattern_gpu, 2097152); // Class without inheritance -__global__ void square(size_t numValues, int* values, NoInheritance object) { +__global__ void square(int numValues, int* values, NoInheritance object) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < numValues) { @@ -590,7 +594,7 @@ __global__ void square(size_t numValues, int* values, NoInheritance object) { } } -template +template void benchmark_bulk_no_inheritance_gpu(benchmark::State& state) { NoInheritance* noInheritance = new NoInheritance(2); @@ -631,14 +635,14 @@ BENCHMARK_TEMPLATE(benchmark_bulk_no_inheritance_gpu, 2097152); #endif // managed_ptr -template +template static void benchmark_bulk_polymorphism_cpu(benchmark::State& state) { Base* object = new Derived(2); int* values = (int*) malloc(N * sizeof(int)); - for (size_t i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) { values[i] = i * i; } @@ -676,14 +680,14 @@ BENCHMARK_TEMPLATE(benchmark_bulk_polymorphism_cpu, 1048576); BENCHMARK_TEMPLATE(benchmark_bulk_polymorphism_cpu, 2097152); // managed_ptr -template +template static void benchmark_bulk_use_managed_ptr_cpu(benchmark::State& state) { chai::managed_ptr object = chai::make_managed(2); int* values = (int*) malloc(N * sizeof(int)); - for (size_t i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) { values[i] = i * i; } @@ -699,7 +703,9 @@ static void benchmark_bulk_use_managed_ptr_cpu(benchmark::State& state) } object.free(); +#ifdef __CUDACC__ cudaDeviceSynchronize(); +#endif } BENCHMARK_TEMPLATE(benchmark_bulk_use_managed_ptr_cpu, 1); @@ -719,14 +725,14 @@ BENCHMARK_TEMPLATE(benchmark_bulk_use_managed_ptr_cpu, 1048576); BENCHMARK_TEMPLATE(benchmark_bulk_use_managed_ptr_cpu, 2097152); // Curiously recurring template pattern -template +template static void benchmark_bulk_curiously_recurring_template_pattern_cpu(benchmark::State& state) { BaseCRTP* object = new DerivedCRTP(2); int* values = (int*) malloc(N * sizeof(int)); - for (size_t i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) { values[i] = i * i; } @@ -758,14 +764,14 @@ BENCHMARK_TEMPLATE(benchmark_bulk_curiously_recurring_template_pattern_cpu, 1048 BENCHMARK_TEMPLATE(benchmark_bulk_curiously_recurring_template_pattern_cpu, 2097152); // Class without inheritance -template +template static void benchmark_bulk_no_inheritance_cpu(benchmark::State& state) { NoInheritance* object = new NoInheritance(2); int* values = (int*) malloc(N * sizeof(int)); - for (size_t i = 0; i < N; ++i) { + for (int i = 0; i < N; ++i) { values[i] = i * i; } From b77b805f4848dc1d1830d1fa1768ae862b0215c7 Mon Sep 17 00:00:00 2001 From: Alan Dayton Date: Thu, 5 Dec 2019 16:19:31 -0800 Subject: [PATCH 56/58] Fix memory leaks in benchmarks --- benchmarks/chai_managed_ptr_benchmarks.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/benchmarks/chai_managed_ptr_benchmarks.cpp b/benchmarks/chai_managed_ptr_benchmarks.cpp index f23611e3..57f95db1 100644 --- a/benchmarks/chai_managed_ptr_benchmarks.cpp +++ b/benchmarks/chai_managed_ptr_benchmarks.cpp @@ -178,7 +178,9 @@ static void 
benchmark_use_managed_ptr_cpu(benchmark::State& state) object->scale(numValues, values); } + free(values); object.free(); + #ifdef __CUDACC__ cudaDeviceSynchronize(); #endif @@ -657,7 +659,9 @@ static void benchmark_bulk_polymorphism_cpu(benchmark::State& state) } } + free(values); delete object; + #ifdef __CUDACC__ cudaDeviceSynchronize(); #endif @@ -702,7 +706,9 @@ static void benchmark_bulk_use_managed_ptr_cpu(benchmark::State& state) } } + free(values); object.free(); + #ifdef __CUDACC__ cudaDeviceSynchronize(); #endif From 804ae035295e061b32dee4099ee7953d41223774 Mon Sep 17 00:00:00 2001 From: Alan Dayton Date: Thu, 5 Dec 2019 16:33:45 -0800 Subject: [PATCH 57/58] Add capability to turn on/off cuda synchronizes --- src/chai/ArrayManager.hpp | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/chai/ArrayManager.hpp b/src/chai/ArrayManager.hpp index beb38205..ddca8220 100644 --- a/src/chai/ArrayManager.hpp +++ b/src/chai/ArrayManager.hpp @@ -237,6 +237,21 @@ class ArrayManager */ void disableCallbacks() { m_callbacks_active = false; } + /*! + * \brief Turn on device synchronization after every kernel. + */ + void enableDeviceSynchronize() { m_device_synchronize = true; } + + /*! + * \brief Turn off device synchronization after every kernel. + */ + void disableDeviceSynchronize() { m_device_synchronize = false; } + + /*! + * \brief Turn on device synchronization after every kernel. + */ + bool deviceSynchronize() { return m_device_synchronize; } + protected: /*! * \brief Construct a new ArrayManager. @@ -318,6 +333,11 @@ class ArrayManager * \brief Controls whether or not callbacks are called. */ bool m_callbacks_active; + + /*! + * Whether or not to synchronize on device after every CHAI kernel. + */ + bool m_device_synchronize = false; }; } // end of namespace chai From a1a46e546bdeadeb7b40ed32d96a2423bbfea689 Mon Sep 17 00:00:00 2001 From: Alan Dayton Date: Thu, 5 Dec 2019 16:45:34 -0800 Subject: [PATCH 58/58] Fix nvcc build warnings --- benchmarks/chai_managed_ptr_benchmarks.cpp | 2 +- src/chai/managed_ptr.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/chai_managed_ptr_benchmarks.cpp b/benchmarks/chai_managed_ptr_benchmarks.cpp index 57f95db1..47bd5a88 100644 --- a/benchmarks/chai_managed_ptr_benchmarks.cpp +++ b/benchmarks/chai_managed_ptr_benchmarks.cpp @@ -55,7 +55,7 @@ class Base { CHAI_HOST_DEVICE virtual void sumAndScale(int numValues, int* values, int& value) = 0; - CHAI_HOST_DEVICE virtual ~Base() = default; + virtual ~Base() = default; }; class Derived : public Base { diff --git a/src/chai/managed_ptr.hpp b/src/chai/managed_ptr.hpp index 49ffa0e8..39dca4b8 100644 --- a/src/chai/managed_ptr.hpp +++ b/src/chai/managed_ptr.hpp @@ -173,7 +173,7 @@ namespace chai { /// /// Default constructor. /// - CHAI_HOST_DEVICE constexpr managed_ptr() noexcept = default; + constexpr managed_ptr() noexcept = default; /// /// @author Alan Dayton