diff --git a/.clang-format b/.clang-format
index 1d2ad9a77f..d01fb458ca 100644
--- a/.clang-format
+++ b/.clang-format
@@ -1,27 +1,82 @@
-BasedOnStyle : google
+BasedOnStyle : LLVM
+# Indent formatting
 IndentWidth : 2
-BreakBeforeBraces : Linux
+Language: Cpp
+UseTab: Never
 KeepEmptyLinesAtTheStartOfBlocks : true
 MaxEmptyLinesToKeep : 2
 AccessModifierOffset : -2
-UseTab: Never
+# This must be off so that include order in RAJA is preserved
+SortIncludes: false
+
+# Alignment of consecutive declarations, assignments etc
+AlignConsecutiveAssignments : true
+AlignConsecutiveDeclarations : false
+AlignConsecutiveMacros : true
+AlignTrailingComments : true
+
+# Control curly brace placement
+BreakBeforeBraces : Custom
+BraceWrapping:
+  AfterCaseLabel:  true
+  AfterClass:      true
+  AfterControlStatement: true
+  AfterEnum:       true
+  AfterFunction:   true
+  AfterNamespace:  true
+  AfterObjCDeclaration: false
+  AfterStruct:     true
+  AfterUnion:      true
+  AfterExternBlock: false
+  BeforeCatch:     true
+  BeforeElse:      true
+  BeforeLambdaBody: false
+  IndentBraces:    false
+  SplitEmptyFunction: false
+  SplitEmptyRecord: false
+  SplitEmptyNamespace: false
+
+# Pointer alignment
+DerivePointerAlignment: false
+PointerAlignment: Left
+
+# Single line config
 AllowShortIfStatementsOnASingleLine : true
-ConstructorInitializerAllOnOneLineOrOnePerLine : true
-AllowShortFunctionsOnASingleLine : true
+AllowShortFunctionsOnASingleLine : false
 AllowShortLoopsOnASingleLine : false
-BinPackParameters : false
+AllowShortLambdasOnASingleLine : None
+AllowAllArgumentsOnNextLine : true
 AllowAllParametersOfDeclarationOnNextLine : false
-AlignTrailingComments : true
+BinPackArguments : true
+BinPackParameters : false
+ConstructorInitializerAllOnOneLineOrOnePerLine : true
 ColumnLimit : 80
-PenaltyBreakBeforeFirstCallParameter : 100
-PenaltyReturnTypeOnItsOwnLine : 65000
-PenaltyBreakString : 10
+PenaltyExcessCharacter : 10
 
-# These improve formatting results but require clang 3.6/7 or higher
-BreakBeforeBinaryOperators : None
-AlignAfterOpenBracket: true
-BinPackArguments : false
+AlignAfterOpenBracket: Align
 AlignOperands : true
 AlwaysBreakTemplateDeclarations : true
-Cpp11BracedListStyle : true
+AlwaysBreakAfterDefinitionReturnType : None
+PenaltyReturnTypeOnItsOwnLine : 10000
+BreakBeforeBinaryOperators : None
+
+# Indents
+IndentCaseLabels: true
+
+# Lambda body
+LambdaBodyIndentation : Signature
+
+SeparateDefinitionBlocks : Always
 
+SpaceBeforeCpp11BracedList: true
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyBlock: false
+SpacesBeforeTrailingComments: 2
+SpacesInAngles: false
+SpacesInCStyleCastParentheses: false
+SpacesInContainerLiterals: false
+SpacesInConditionalStatement: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9b31cbe124..dbe5b3f113 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,7 +41,7 @@ project(RAJA LANGUAGES CXX C
   VERSION ${RAJA_LOADED})
 
 set(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake/thirdparty" ${CMAKE_MODULE_PATH})
-
+set(BLT_REQUIRED_CLANGFORMAT_VERSION  "14" CACHE STRING "")
 include(cmake/SetupRajaOptions.cmake)
 
 cmake_minimum_required(VERSION 3.23)
@@ -136,6 +136,9 @@ include(cmake/SetupCompilers.cmake)
 # Macros for building executables and libraries
 include (cmake/RAJAMacros.cmake)
 
+# Configure `style` target for enforcing code style
+raja_add_code_checks()
+
 set (raja_sources
   src/AlignedRangeIndexSetBuilders.cpp
   src/DepGraphNode.cpp
diff --git a/cmake/RAJAMacros.cmake b/cmake/RAJAMacros.cmake
index c412593db7..11c4661cc1 100644
--- a/cmake/RAJAMacros.cmake
+++ b/cmake/RAJAMacros.cmake
@@ -204,3 +204,62 @@ macro(raja_add_benchmark)
     NUM_OMP_THREADS ${arg_NUM_OMP_THREADS}
     COMMAND ${TEST_DRIVER} ${arg_NAME})
 endmacro(raja_add_benchmark)
+
+##------------------------------------------------------------------------------
+## raja_add_code_checks()
+##
+## Adds code checks for all source files recursively in the RAJA repository.
+## Does nothing unless RAJA is the top-level project.
+## This creates the following parent build targets:
+##  check - Runs a non file changing style check and CppCheck
+##  style - In-place code formatting
+##
+## Creates various child build targets that follow this pattern:
+##  RAJA_<check|style>
+##  RAJA_<cppcheck|clangformat>_<check|style>
+##------------------------------------------------------------------------------
+macro(raja_add_code_checks)
+
+  set(options)
+  set(singleValueArgs)
+  set(multiValueArgs)
+
+  # Parse the arguments to the macro (no keywords are currently accepted)
+  cmake_parse_arguments(arg
+       "${options}" "${singleValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  # Only do code checks if building raja by itself and not included in
+  # another project
+  if ("${PROJECT_SOURCE_DIR}" STREQUAL "${CMAKE_SOURCE_DIR}")
+      # Create file globbing expressions that only include directories that contain source
+      # TODO(bowen) Add examples and exercises to the list below
+      set(_base_dirs "RAJA" "benchmark" "include" "src" "test")
+      set(_ext_expressions "*.cpp" "*.hpp" "*.inl"
+                           "*.cxx" "*.hxx" "*.cc" "*.c" "*.h" "*.hh")
+
+      set(_glob_expressions)
+      foreach(_exp ${_ext_expressions})
+          foreach(_base_dir ${_base_dirs})
+              list(APPEND _glob_expressions "${PROJECT_SOURCE_DIR}/${_base_dir}/${_exp}")
+          endforeach()
+      endforeach()
+
+      # Glob for list of files to run code checks on
+      set(_sources)
+      file(GLOB_RECURSE _sources ${_glob_expressions})
+
+      blt_add_code_checks(PREFIX          RAJA
+                          SOURCES         ${_sources}
+                          CLANGFORMAT_CFG_FILE ${PROJECT_SOURCE_DIR}/.clang-format
+                          CPPCHECK_FLAGS  --enable=all --inconclusive)
+
+      # Set FOLDER property for code check targets; the RAJA_ prefix must match PREFIX above
+      foreach(_suffix clangformat_check clangformat_style clang_tidy_check clang_tidy_style)
+          set(_tgt RAJA_${_suffix})
+          if(TARGET ${_tgt})
+              set_target_properties(${_tgt} PROPERTIES FOLDER "RAJA/code_checks")
+          endif()
+      endforeach()
+  endif()
+
+endmacro(raja_add_code_checks)
diff --git a/cmake/SetupRajaOptions.cmake b/cmake/SetupRajaOptions.cmake
index 9c5fb043e4..09276e18db 100644
--- a/cmake/SetupRajaOptions.cmake
+++ b/cmake/SetupRajaOptions.cmake
@@ -28,7 +28,7 @@ option(RAJA_ENABLE_FORCEINLINE_RECURSIVE "Enable Forceinline recursive (only sup
 option(RAJA_DEPRECATED_TESTS "Test deprecated features" Off)
 option(RAJA_ENABLE_BOUNDS_CHECK "Enable bounds checking in RAJA::Views/Layouts" Off)
 option(RAJA_TEST_EXHAUSTIVE "Build RAJA exhaustive tests" Off)
-option(RAJA_TEST_OPENMP_TARGET_SUBSET "Build subset of RAJA OpenMP target tests" On)
+option(RAJA_TEST_OPENMP_TARGET_SUBSET "Build subset of RAJA OpenMP target tests when it is enabled" On)
 option(RAJA_ENABLE_RUNTIME_PLUGINS "Enable support for loading plugins at runtime" Off)
 option(RAJA_ALLOW_INCONSISTENT_OPTIONS "Enable inconsistent values for ENABLE_X and RAJA_ENABLE_X options" Off)
 
diff --git a/include/RAJA/RAJA.hpp b/include/RAJA/RAJA.hpp
index 59cca4bf22..aea813237b 100644
--- a/include/RAJA/RAJA.hpp
+++ b/include/RAJA/RAJA.hpp
@@ -88,7 +88,7 @@
 #endif
 
 #if defined(RAJA_ENABLE_DESUL_ATOMICS)
-    #include "RAJA/policy/desul.hpp"
+#include "RAJA/policy/desul.hpp"
 #endif
 
 #include "RAJA/index/IndexSet.hpp"
@@ -197,11 +197,14 @@
 
 #include "RAJA/pattern/sort.hpp"
 
-namespace RAJA {
-namespace expt{}
+namespace RAJA
+{
+namespace expt
+{}
+
 //  // provide a RAJA::expt namespace for experimental work, but bring alias
 //  // it into RAJA so it doesn't affect user code
 //  using namespace expt;
-}
+}  // namespace RAJA
 
 #endif  // closing endif for header file include guard
diff --git a/include/RAJA/index/IndexSet.hpp b/include/RAJA/index/IndexSet.hpp
index 1a467c8341..b12501c255 100644
--- a/include/RAJA/index/IndexSet.hpp
+++ b/include/RAJA/index/IndexSet.hpp
@@ -34,10 +34,19 @@
 namespace RAJA
 {
 
-enum PushEnd { PUSH_FRONT, PUSH_BACK };
-enum PushCopy { PUSH_COPY, PUSH_NOCOPY };
+enum PushEnd
+{
+  PUSH_FRONT,
+  PUSH_BACK
+};
 
-template <typename... TALL>
+enum PushCopy
+{
+  PUSH_COPY,
+  PUSH_NOCOPY
+};
+
+template<typename... TALL>
 class TypedIndexSet;
 
 namespace policy
@@ -52,11 +61,12 @@ namespace indexset
 /// over segments.  The second describes the policy for executing
 /// each segment.
 ///
-template <typename SEG_ITER_POLICY_T, typename SEG_EXEC_POLICY_T = void>
+template<typename SEG_ITER_POLICY_T, typename SEG_EXEC_POLICY_T = void>
 struct ExecPolicy
     : public RAJA::make_policy_pattern_t<SEG_EXEC_POLICY_T::policy,
-                                         RAJA::Pattern::forall> {
-  using seg_it = SEG_ITER_POLICY_T;
+                                         RAJA::Pattern::forall>
+{
+  using seg_it   = SEG_ITER_POLICY_T;
   using seg_exec = SEG_EXEC_POLICY_T;
 };
 
@@ -65,7 +75,6 @@ struct ExecPolicy
 
 using policy::indexset::ExecPolicy;
 
-
 /*!
  ******************************************************************************
  *
@@ -74,10 +83,10 @@ using policy::indexset::ExecPolicy;
  *
  ******************************************************************************
  */
-template <typename T0, typename... TREST>
+template<typename T0, typename... TREST>
 class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
 {
-  using PARENT = TypedIndexSet<TREST...>;
+  using PARENT               = TypedIndexSet<TREST...>;
   static const int T0_TypeId = sizeof...(TREST);
 
 public:
@@ -91,7 +100,7 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
 
   //! Construct empty index set
 #if _MSC_VER < 1910
-   // this one instance of constexpr does not work on VS2012 or VS2015
+  // this one instance of constexpr does not work on VS2012 or VS2015
   RAJA_INLINE TypedIndexSet() : PARENT() {}
 #else
   RAJA_INLINE constexpr TypedIndexSet() : PARENT() {}
@@ -99,12 +108,12 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
 
   //! Copy-constructor for index set
   RAJA_INLINE
-  TypedIndexSet(TypedIndexSet<T0, TREST...> const &c)
-      : PARENT((PARENT const &)c)
+  TypedIndexSet(TypedIndexSet<T0, TREST...> const& c) : PARENT((PARENT const&)c)
   {
     size_t num = c.data.size();
     data.resize(num);
-    for (size_t i = 0; i < num; ++i) {
+    for (size_t i = 0; i < num; ++i)
+    {
       data[i] = c.data[i];
     }
     // mark all as not owned by us
@@ -112,9 +121,10 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
   }
 
   //! Copy-assignment operator for index set
-  TypedIndexSet<T0, TREST...> &operator=(const TypedIndexSet<T0, TREST...> &rhs)
+  TypedIndexSet<T0, TREST...>& operator=(const TypedIndexSet<T0, TREST...>& rhs)
   {
-    if (&rhs != this) {
+    if (&rhs != this)
+    {
       TypedIndexSet<T0, TREST...> copy(rhs);
       this->swap(copy);
     }
@@ -125,19 +135,21 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
   RAJA_INLINE ~TypedIndexSet()
   {
     size_t num_seg = data.size();
-    for (size_t i = 0; i < num_seg; ++i) {
+    for (size_t i = 0; i < num_seg; ++i)
+    {
       // Only free segment of we allocated it
-      if (owner[i]) {
+      if (owner[i])
+      {
         delete data[i];
       }
     }
   }
 
   //! Swap function for copy-and-swap idiom.
-  void swap(TypedIndexSet<T0, TREST...> &other)
+  void swap(TypedIndexSet<T0, TREST...>& other)
   {
     // Swap parents data
-    PARENT::swap((PARENT &)other);
+    PARENT::swap((PARENT&)other);
     // Swap our data
     using std::swap;
     swap(data, other.data);
@@ -149,19 +161,21 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
   ///
   /// This is used to implement the == and != operators
   ///
-  template <typename P0, typename... PREST>
+  template<typename P0, typename... PREST>
   RAJA_INLINE bool compareSegmentById(
       size_t segid,
-      const TypedIndexSet<P0, PREST...> &other) const
+      const TypedIndexSet<P0, PREST...>& other) const
   {
     // drill down our types until we have the right type
-    if (getSegmentTypes()[segid] != T0_TypeId) {
+    if (getSegmentTypes()[segid] != T0_TypeId)
+    {
       // peel off T0
       return PARENT::compareSegmentById(segid, other);
     }
 
     // Check that other's segid is of type T0
-    if (!other.template checkSegmentType<T0>(segid)) {
+    if (!other.template checkSegmentType<T0>(segid))
+    {
       return false;
     }
 
@@ -170,35 +184,36 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
     return *data[offset] == other.template getSegment<T0>(segid);
   }
 
-
-  template <typename P0>
+  template<typename P0>
   RAJA_INLINE bool checkSegmentType(size_t segid) const
   {
-    if (getSegmentTypes()[segid] == T0_TypeId) {
+    if (getSegmentTypes()[segid] == T0_TypeId)
+    {
       return std::is_same<T0, P0>::value;
     }
     return PARENT::template checkSegmentType<P0>(segid);
   }
 
-
   //! get specified segment by ID
-  template <typename P0>
-  RAJA_INLINE P0 &getSegment(size_t segid)
+  template<typename P0>
+  RAJA_INLINE P0& getSegment(size_t segid)
   {
-    if (getSegmentTypes()[segid] == T0_TypeId) {
+    if (getSegmentTypes()[segid] == T0_TypeId)
+    {
       Index_type offset = getSegmentOffsets()[segid];
-      return *reinterpret_cast<P0 const *>(data[offset]);
+      return *reinterpret_cast<P0 const*>(data[offset]);
     }
     return PARENT::template getSegment<P0>(segid);
   }
 
   //! get specified segment by ID
-  template <typename P0>
-  RAJA_INLINE P0 const &getSegment(size_t segid) const
+  template<typename P0>
+  RAJA_INLINE P0 const& getSegment(size_t segid) const
   {
-    if (getSegmentTypes()[segid] == T0_TypeId) {
+    if (getSegmentTypes()[segid] == T0_TypeId)
+    {
       Index_type offset = getSegmentOffsets()[segid];
-      return *reinterpret_cast<P0 const *>(data[offset]);
+      return *reinterpret_cast<P0 const*>(data[offset]);
     }
     return PARENT::template getSegment<P0>(segid);
   }
@@ -230,43 +245,49 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
   ///
 
 private:
-  template <typename... CALL>
-  RAJA_INLINE void push_into(TypedIndexSet<CALL...> &c,
-                             PushEnd pend = PUSH_BACK,
+  template<typename... CALL>
+  RAJA_INLINE void push_into(TypedIndexSet<CALL...>& c,
+                             PushEnd pend   = PUSH_BACK,
                              PushCopy pcopy = PUSH_COPY)
   {
     Index_type num = getNumSegments();
 
-    if (pend == PUSH_BACK) {
-      for (Index_type i = 0; i < num; ++i) {
+    if (pend == PUSH_BACK)
+    {
+      for (Index_type i = 0; i < num; ++i)
+      {
         segment_push_into(i, c, pend, pcopy);
-      } 
-    } else {
-      for (Index_type i = num-1; i > -1; --i) {
+      }
+    }
+    else
+    {
+      for (Index_type i = num - 1; i > -1; --i)
+      {
         segment_push_into(i, c, pend, pcopy);
-      } 
+      }
     }
   }
 
-
   static constexpr int value_for(PushEnd end, PushCopy copy)
   {
     return (end == PUSH_BACK) << 1 | (copy == PUSH_COPY);
   }
 
 public:
-  template <typename... CALL>
+  template<typename... CALL>
   RAJA_INLINE void segment_push_into(size_t segid,
-                                     TypedIndexSet<CALL...> &c,
-                                     PushEnd pend = PUSH_BACK,
+                                     TypedIndexSet<CALL...>& c,
+                                     PushEnd pend   = PUSH_BACK,
                                      PushCopy pcopy = PUSH_COPY)
   {
-    if (getSegmentTypes()[segid] != T0_TypeId) {
+    if (getSegmentTypes()[segid] != T0_TypeId)
+    {
       PARENT::segment_push_into(segid, c, pend, pcopy);
       return;
     }
     Index_type offset = getSegmentOffsets()[segid];
-    switch (value_for(pend, pcopy)) {
+    switch (value_for(pend, pcopy))
+    {
       case value_for(PUSH_BACK, PUSH_COPY):
         c.push_back(*data[offset]);
         break;
@@ -282,41 +303,43 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
     }
   }
 
-
   //! Add segment to back end of index set without making a copy.
-  template <typename Tnew>
-  RAJA_INLINE void push_back_nocopy(Tnew *val)
+  template<typename Tnew>
+  RAJA_INLINE void push_back_nocopy(Tnew* val)
   {
     push_internal(val, PUSH_BACK, PUSH_NOCOPY);
   }
 
   //! Add segment to front end of index set without making a copy.
-  template <typename Tnew>
-  RAJA_INLINE void push_front_nocopy(Tnew *val)
+  template<typename Tnew>
+  RAJA_INLINE void push_front_nocopy(Tnew* val)
   {
     push_internal(val, PUSH_FRONT, PUSH_NOCOPY);
   }
 
   //! Add copy of segment to back end of index set.
-  template <typename Tnew>
-  RAJA_INLINE void push_back(Tnew &&val)
+  template<typename Tnew>
+  RAJA_INLINE void push_back(Tnew&& val)
   {
-    push_internal(new typename std::decay<Tnew>::type(std::forward<Tnew>(val)), PUSH_BACK, PUSH_COPY);
+    push_internal(new typename std::decay<Tnew>::type(std::forward<Tnew>(val)),
+                  PUSH_BACK, PUSH_COPY);
   }
 
   //! Add copy of segment to front end of index set.
-  template <typename Tnew>
-  RAJA_INLINE void push_front(Tnew &&val)
+  template<typename Tnew>
+  RAJA_INLINE void push_front(Tnew&& val)
   {
-    push_internal(new typename std::decay<Tnew>::type(std::forward<Tnew>(val)), PUSH_FRONT, PUSH_COPY);
+    push_internal(new typename std::decay<Tnew>::type(std::forward<Tnew>(val)),
+                  PUSH_FRONT, PUSH_COPY);
   }
 
   //! Return total length -- sum of lengths of all segments
   RAJA_INLINE size_t getLength() const
   {
     size_t total = PARENT::getLength();
-    size_t num = data.size();
-    for (size_t i = 0; i < num; ++i) {
+    size_t num   = data.size();
+    for (size_t i = 0; i < num; ++i)
+    {
       total += data[i]->size();
     }
     return total;
@@ -328,7 +351,6 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
     return data.size() + PARENT::getNumSegments();
   }
 
-
   ///
   /// Calls the operator "body" with the segment stored at segid.
   ///
@@ -338,14 +360,14 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
   /// The "args..." are passed-thru to the body as arguments AFTER the segment.
   ///
   RAJA_SUPPRESS_HD_WARN
-  template <typename BODY, typename... ARGS>
+  template<typename BODY, typename... ARGS>
   RAJA_HOST_DEVICE void segmentCall(size_t segid,
-                                    BODY &&body,
-                                    ARGS &&... args) const
+                                    BODY&& body,
+                                    ARGS&&... args) const
   {
-    if (getSegmentTypes()[segid] != T0_TypeId) {
-      PARENT::segmentCall(segid,
-                          std::forward<BODY>(body),
+    if (getSegmentTypes()[segid] != T0_TypeId)
+    {
+      PARENT::segmentCall(segid, std::forward<BODY>(body),
                           std::forward<ARGS>(args)...);
       return;
     }
@@ -355,9 +377,9 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
 
 protected:
   //! Internal logic to add a new segment -- catch invalid type insertion
-  template <typename Tnew>
-  RAJA_INLINE void push_internal(Tnew *val,
-                                 PushEnd pend = PUSH_BACK,
+  template<typename Tnew>
+  RAJA_INLINE void push_internal(Tnew* val,
+                                 PushEnd pend   = PUSH_BACK,
                                  PushCopy pcopy = PUSH_COPY)
   {
     static_assert(sizeof...(TREST) > 0, "Invalid type for this TypedIndexSet");
@@ -365,15 +387,16 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
   }
 
   //! Internal logic to add a new segment
-  RAJA_INLINE void push_internal(T0 *val,
-                                 PushEnd pend = PUSH_BACK,
+  RAJA_INLINE void push_internal(T0* val,
+                                 PushEnd pend   = PUSH_BACK,
                                  PushCopy pcopy = PUSH_COPY)
   {
     data.push_back(val);
     owner.push_back(pcopy == PUSH_COPY);
 
     // Determine if we push at the front or back of the segment list
-    if (pend == PUSH_BACK) {
+    if (pend == PUSH_BACK)
+    {
       // Store the segment type
       getSegmentTypes().push_back(T0_TypeId);
 
@@ -384,7 +407,9 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
       size_t icount = val->size();
       getSegmentIcounts().push_back(getTotalLength());
       increaseTotalLength(icount);
-    } else {
+    }
+    else
+    {
       // Store the segment type
       getSegmentTypes().push_front(T0_TypeId);
 
@@ -394,7 +419,8 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
       // Store the segment icount
       getSegmentIcounts().push_front(0);
       size_t icount = val->size();
-      for (size_t i = 1; i < getSegmentIcounts().size(); ++i) {
+      for (size_t i = 1; i < getSegmentIcounts().size(); ++i)
+      {
         getSegmentIcounts()[i] += icount;
       }
       increaseTotalLength(icount);
@@ -402,7 +428,7 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
   }
 
   //! Returns the number of indices (the total icount of segments
-  RAJA_INLINE Index_type &getTotalLength() { return PARENT::getTotalLength(); }
+  RAJA_INLINE Index_type& getTotalLength() { return PARENT::getTotalLength(); }
 
   //! set total length of the indexset
   RAJA_INLINE void setTotalLength(int n) { return PARENT::setTotalLength(n); }
@@ -437,9 +463,10 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
   {
     TypedIndexSet<T0, TREST...> retVal;
 
-    int minSeg = RAJA::operators::maximum<int>{}(0, begin);
-    int maxSeg = RAJA::operators::minimum<int>{}(end, getNumSegments());
-    for (int i = minSeg; i < maxSeg; ++i) {
+    int minSeg = RAJA::operators::maximum<int> {}(0, begin);
+    int maxSeg = RAJA::operators::minimum<int> {}(end, getNumSegments());
+    for (int i = minSeg; i < maxSeg; ++i)
+    {
       segment_push_into(i, retVal, PUSH_BACK, PUSH_NOCOPY);
     }
     return retVal;
@@ -452,13 +479,15 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
   /// This TypedIndexSet will not change and the created "slice" into it
   /// will not own any of its segments.
   ///
-  TypedIndexSet<T0, TREST...> createSlice(const int *segIds, int len)
+  TypedIndexSet<T0, TREST...> createSlice(const int* segIds, int len)
   {
     TypedIndexSet<T0, TREST...> retVal;
 
     int numSeg = getNumSegments();
-    for (int i = 0; i < len; ++i) {
-      if (segIds[i] >= 0 && segIds[i] < numSeg) {
+    for (int i = 0; i < len; ++i)
+    {
+      if (segIds[i] >= 0 && segIds[i] < numSeg)
+      {
         segment_push_into(segIds[i], retVal, PUSH_BACK, PUSH_NOCOPY);
       }
     }
@@ -475,13 +504,15 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
   /// The object must provide methods begin(), end(), and its
   /// iterator type must de-reference to an integral value.
   ///
-  template <typename T>
-  TypedIndexSet<T0, TREST...> createSlice(const T &segIds)
+  template<typename T>
+  TypedIndexSet<T0, TREST...> createSlice(const T& segIds)
   {
     TypedIndexSet<T0, TREST...> retVal;
     int numSeg = getNumSegments();
-    for (auto &seg : segIds) {
-      if (seg >= 0 && seg < numSeg) {
+    for (auto& seg : segIds)
+    {
+      if (seg >= 0 && seg < numSeg)
+      {
         segment_push_into(seg, retVal, PUSH_BACK, PUSH_NOCOPY);
       }
     }
@@ -492,7 +523,7 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
   void setSegmentInterval(size_t interval_id, int begin, int end)
   {
     m_seg_interval_begin[interval_id] = begin;
-    m_seg_interval_end[interval_id] = end;
+    m_seg_interval_end[interval_id]   = end;
   }
 
   //! get lower bound of segment identified with interval_id
@@ -509,37 +540,37 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
 
 protected:
   //! Returns the mapping of  segment_index -> segment_type
-  RAJA_INLINE RAJA::RAJAVec<Index_type> &getSegmentTypes()
+  RAJA_INLINE RAJA::RAJAVec<Index_type>& getSegmentTypes()
   {
     return PARENT::getSegmentTypes();
   }
 
   //! Returns the mapping of  segment_index -> segment_type
-  RAJA_INLINE RAJA::RAJAVec<Index_type> const &getSegmentTypes() const
+  RAJA_INLINE RAJA::RAJAVec<Index_type> const& getSegmentTypes() const
   {
     return PARENT::getSegmentTypes();
   }
 
   //! Returns the mapping of  segment_index -> segment_offset
-  RAJA_INLINE RAJA::RAJAVec<Index_type> &getSegmentOffsets()
+  RAJA_INLINE RAJA::RAJAVec<Index_type>& getSegmentOffsets()
   {
     return PARENT::getSegmentOffsets();
   }
 
   //! Returns the mapping of  segment_index -> segment_offset
-  RAJA_INLINE RAJA::RAJAVec<Index_type> const &getSegmentOffsets() const
+  RAJA_INLINE RAJA::RAJAVec<Index_type> const& getSegmentOffsets() const
   {
     return PARENT::getSegmentOffsets();
   }
 
   //! Returns the icount of segments
-  RAJA_INLINE RAJA::RAJAVec<Index_type> &getSegmentIcounts()
+  RAJA_INLINE RAJA::RAJAVec<Index_type>& getSegmentIcounts()
   {
     return PARENT::getSegmentIcounts();
   }
 
   //! Returns the icount of segments
-  RAJA_INLINE RAJA::RAJAVec<Index_type> const &getSegmentIcounts() const
+  RAJA_INLINE RAJA::RAJAVec<Index_type> const& getSegmentIcounts() const
   {
     return PARENT::getSegmentIcounts();
   }
@@ -551,14 +582,16 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
   /// Note: method does not check equality of anything other than segment
   ///       types and indices; e.g., dependency info not checked.
   ///
-  template <typename P0, typename... PREST>
-  RAJA_INLINE bool operator==(const TypedIndexSet<P0, PREST...> &other) const
+  template<typename P0, typename... PREST>
+  RAJA_INLINE bool operator==(const TypedIndexSet<P0, PREST...>& other) const
   {
     size_t num_seg = getNumSegments();
     if (num_seg != other.getNumSegments()) return false;
 
-    for (size_t segid = 0; segid < num_seg; ++segid) {
-      if (!compareSegmentById(segid, other)) {
+    for (size_t segid = 0; segid < num_seg; ++segid)
+    {
+      if (!compareSegmentById(segid, other))
+      {
         return false;
       }
     }
@@ -566,15 +599,15 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
   }
 
   //! Inequality operator returns true if any segment is not equal, else false.
-  template <typename P0, typename... PREST>
-  RAJA_INLINE bool operator!=(const TypedIndexSet<P0, PREST...> &other) const
+  template<typename P0, typename... PREST>
+  RAJA_INLINE bool operator!=(const TypedIndexSet<P0, PREST...>& other) const
   {
     return (!(*this == other));
   }
 
 private:
   //! vector of TypedIndexSet data objects of type T0
-  RAJA::RAJAVec<T0 *> data;
+  RAJA::RAJAVec<T0*> data;
 
   //! vector indicating which segments are owned by the TypedIndexSet
   RAJA::RAJAVec<Index_type> owner;
@@ -586,8 +619,7 @@ class TypedIndexSet<T0, TREST...> : public TypedIndexSet<TREST...>
   RAJA::RAJAVec<Index_type> m_seg_interval_end;
 };
 
-
-template <>
+template<>
 class TypedIndexSet<>
 {
 public:
@@ -603,16 +635,16 @@ class TypedIndexSet<>
 
   //! Copy-constructor.
   RAJA_INLINE
-  TypedIndexSet(TypedIndexSet const &c)
+  TypedIndexSet(TypedIndexSet const& c)
   {
-    segment_types = c.segment_types;
+    segment_types   = c.segment_types;
     segment_offsets = c.segment_offsets;
     segment_icounts = c.segment_icounts;
-    m_len = c.m_len;
+    m_len           = c.m_len;
   }
 
   //! Swap function for copy-and-swap idiom (deep copy).
-  void swap(TypedIndexSet &other)
+  void swap(TypedIndexSet& other)
   {
     using std::swap;
     swap(segment_types, other.segment_types);
@@ -624,8 +656,8 @@ class TypedIndexSet<>
 protected:
   RAJA_INLINE static size_t getNumTypes() { return 0; }
 
-  template <typename T>
-  RAJA_INLINE constexpr bool isValidSegmentType(T const &) const
+  template<typename T>
+  RAJA_INLINE constexpr bool isValidSegmentType(T const&) const
   {
     // Segment type wasn't found
     return false;
@@ -635,89 +667,85 @@ class TypedIndexSet<>
 
   RAJA_INLINE static size_t getLength() { return 0; }
 
-  template <typename BODY, typename... ARGS>
+  template<typename BODY, typename... ARGS>
   RAJA_INLINE void segmentCall(size_t, BODY, ARGS...) const
-  {
-  }
+  {}
 
-  RAJA_INLINE RAJA::RAJAVec<Index_type> &getSegmentTypes()
+  RAJA_INLINE RAJA::RAJAVec<Index_type>& getSegmentTypes()
   {
     return segment_types;
   }
 
-  RAJA_INLINE RAJA::RAJAVec<Index_type> const &getSegmentTypes() const
+  RAJA_INLINE RAJA::RAJAVec<Index_type> const& getSegmentTypes() const
   {
     return segment_types;
   }
 
-  RAJA_INLINE RAJA::RAJAVec<Index_type> &getSegmentOffsets()
+  RAJA_INLINE RAJA::RAJAVec<Index_type>& getSegmentOffsets()
   {
     return segment_offsets;
   }
 
-  RAJA_INLINE RAJA::RAJAVec<Index_type> const &getSegmentOffsets() const
+  RAJA_INLINE RAJA::RAJAVec<Index_type> const& getSegmentOffsets() const
   {
     return segment_offsets;
   }
 
-  RAJA_INLINE RAJA::RAJAVec<Index_type> &getSegmentIcounts()
+  RAJA_INLINE RAJA::RAJAVec<Index_type>& getSegmentIcounts()
   {
     return segment_icounts;
   }
 
-  RAJA_INLINE RAJA::RAJAVec<Index_type> const &getSegmentIcounts() const
+  RAJA_INLINE RAJA::RAJAVec<Index_type> const& getSegmentIcounts() const
   {
     return segment_icounts;
   }
 
-  RAJA_INLINE Index_type &getTotalLength() { return m_len; }
+  RAJA_INLINE Index_type& getTotalLength() { return m_len; }
 
   RAJA_INLINE void setTotalLength(int n) { m_len = n; }
 
   RAJA_INLINE void increaseTotalLength(int n) { m_len += n; }
 
-  template <typename P0, typename... PREST>
+  template<typename P0, typename... PREST>
   RAJA_INLINE bool compareSegmentById(size_t,
-                                      const TypedIndexSet<P0, PREST...> &) const
+                                      const TypedIndexSet<P0, PREST...>&) const
   {
     return false;
   }
 
-  template <typename P0>
+  template<typename P0>
   RAJA_INLINE bool checkSegmentType(size_t) const
   {
     return false;
   }
 
-  template <typename P0>
-  RAJA_INLINE P0 &getSegment(size_t)
+  template<typename P0>
+  RAJA_INLINE P0& getSegment(size_t)
   {
-    return *((P0 *)(this - this));
+    return *((P0*)(this - this));
   }
 
-  template <typename P0>
-  RAJA_INLINE P0 const &getSegment(size_t) const
+  template<typename P0>
+  RAJA_INLINE P0 const& getSegment(size_t) const
   {
-    return *((P0 *)(this - this));
+    return *((P0*)(this - this));
   }
 
-  template <typename... CALL>
-  RAJA_INLINE void push_into(TypedIndexSet<CALL...> &, PushEnd, PushCopy) const
-  {
-  }
+  template<typename... CALL>
+  RAJA_INLINE void push_into(TypedIndexSet<CALL...>&, PushEnd, PushCopy) const
+  {}
 
-  template <typename... CALL>
+  template<typename... CALL>
   RAJA_INLINE void segment_push_into(size_t,
-                                     TypedIndexSet<CALL...> &,
+                                     TypedIndexSet<CALL...>&,
                                      PushEnd,
                                      PushCopy) const
-  {
-  }
+  {}
 
-  template <typename Tnew>
-  RAJA_INLINE void push(Tnew const &, PushEnd, PushCopy)
-  {
-  }
+  template<typename Tnew>
+  RAJA_INLINE void push(Tnew const&, PushEnd, PushCopy)
+  {}
 
 public:
   using iterator = Iterators::numeric_iterator<Index_type>;
@@ -756,19 +784,20 @@ class TypedIndexSet<>
   Index_type m_len;
 };
 
-
 namespace type_traits
 {
 
-template <typename T>
+template<typename T>
 struct is_index_set
-    : ::RAJA::type_traits::SpecializationOf<RAJA::TypedIndexSet, typename std::decay<T>::type> {
-};
+    : ::RAJA::type_traits::SpecializationOf<RAJA::TypedIndexSet,
+                                            typename std::decay<T>::type>
+{};
 
-template <typename T>
+template<typename T>
 struct is_indexset_policy
-    : ::RAJA::type_traits::SpecializationOf<RAJA::ExecPolicy, typename std::decay<T>::type> {
-};
+    : ::RAJA::type_traits::SpecializationOf<RAJA::ExecPolicy,
+                                            typename std::decay<T>::type>
+{};
 }  // namespace type_traits
 
 }  // namespace RAJA
diff --git a/include/RAJA/index/IndexSetBuilders.hpp b/include/RAJA/index/IndexSetBuilders.hpp
index 543524be01..075aecd1d1 100644
--- a/include/RAJA/index/IndexSetBuilders.hpp
+++ b/include/RAJA/index/IndexSetBuilders.hpp
@@ -37,13 +37,13 @@ namespace RAJA
  * \brief Generate an index set with aligned Range segments and List segments,
  *        as needed, from given array of indices.
  *
- *        Routine does no error-checking on argements and assumes 
+ *        Routine does no error-checking on arguments and assumes
  *        RAJA::Index_type array contains valid indices.
  *
- *  \param iset reference to index set generated with aligned range segments 
+ *  \param iset reference to index set generated with aligned range segments
  *         and list segments. Method assumes index set is empty (no segments).
- *  \param work_res camp resource object that identifies the memory space in 
- *         which list segment index data will live (passed to list segment 
+ *  \param work_res camp resource object that identifies the memory space in
+ *         which list segment index data will live (passed to list segment
  *         ctor).
  *  \param indices_in pointer to start of input array of indices.
  *  \param length size of input index array.
@@ -79,37 +79,36 @@ void RAJASHAREDDLL_API buildIndexSetAligned(
  ******************************************************************************
  *
  * \brief Generate a lock-free "block" index set (planar division) containing
- *        range segments. 
+ *        range segments.
  *
- *        The method chunks a fastDim x midDim x slowDim mesh into blocks that 
+ *        The method chunks a fastDim x midDim x slowDim mesh into blocks that
  *        can be dependency-scheduled, removing need for lock constructs.
  *
  *  \param iset reference to index set generated with range segments.
- *         Method assumes index set is empty (no segments). 
+ *         Method assumes index set is empty (no segments).
  *  \param fastDim "fast" block dimension (see above).
  *  \param midDim  "mid" block dimension (see above).
  *  \param slowDim "slow" block dimension (see above).
  *
  ******************************************************************************
  */
-void buildLockFreeBlockIndexset(
-    RAJA::TypedIndexSet<RAJA::RangeSegment>& iset,
-    int fastDim,
-    int midDim,
-    int slowDim);
+void buildLockFreeBlockIndexset(RAJA::TypedIndexSet<RAJA::RangeSegment>& iset,
+                                int fastDim,
+                                int midDim,
+                                int slowDim);
 
 /*!
  ******************************************************************************
  *
  * \brief Generate a lock-free "color" index set containing range and list
  *        segments.
- * 
- *        TThe domain-set is colored based on connectivity to the range-set. 
- *        All elements in each segment are independent, and no two segments 
+ *
+ *        The domain-set is colored based on connectivity to the range-set.
+ *        All elements in each segment are independent, and no two segments
  *        can be executed in parallel.
  *
- * \param iset reference to index set generated. Method assumes index set 
- *        is empty (no segments). 
+ * \param iset reference to index set generated. Method assumes index set
+ *        is empty (no segments).
  * \param work_res camp resource object that identifies the memory space in
  *         which list segment index data will live (passed to list segment
  *         ctor).
@@ -123,7 +122,7 @@ void buildLockFreeColorIndexset(
     int numEntity,
     int numRangePerDomain,
     int numEntityRange,
-    RAJA::Index_type* elemPermutation = nullptr,
+    RAJA::Index_type* elemPermutation  = nullptr,
     RAJA::Index_type* ielemPermutation = nullptr);
 
 }  // namespace RAJA
diff --git a/include/RAJA/index/IndexSetUtils.hpp b/include/RAJA/index/IndexSetUtils.hpp
index 4baea450fc..a6733e6d68 100644
--- a/include/RAJA/index/IndexSetUtils.hpp
+++ b/include/RAJA/index/IndexSetUtils.hpp
@@ -31,10 +31,10 @@ namespace RAJA
 //@{
 //!   @name Methods to gather indices of segment or index set into a container.
 //!
-//!   For each method, the given container must be templated on a data type, 
-//!   have default and copy ctors, push_back method, and value_type. Is is 
-//!   assumed that the container data type and segment or index set data type 
-//!   are compatible in the sense that the index set type can be converted to 
+//!   For each method, the given container must be templated on a data type,
+//!   have default and copy ctors, push_back method, and value_type. It is
+//!   assumed that the container data type and segment or index set data type
+//!   are compatible in the sense that the index set type can be converted to
 //!   the container data type.
 
 /*!
@@ -44,16 +44,15 @@ namespace RAJA
  *
  ******************************************************************************
  */
-template <typename CONTAINER_T, typename... SEG_TYPES>
+template<typename CONTAINER_T, typename... SEG_TYPES>
 RAJA_INLINE void getIndices(CONTAINER_T& con,
                             const TypedIndexSet<SEG_TYPES...>& iset)
 {
   CONTAINER_T tcon;
-  forall<ExecPolicy<seq_segit, seq_exec> >(iset,
-    [&](typename CONTAINER_T::value_type idx) {
-      tcon.push_back(idx);
-    }
-  );
+  forall<ExecPolicy<seq_segit, seq_exec>>(
+      iset, [&](typename CONTAINER_T::value_type idx) {
+        tcon.push_back(idx);
+      });
   con = tcon;
 }
 
@@ -64,15 +63,13 @@ RAJA_INLINE void getIndices(CONTAINER_T& con,
  *
  ******************************************************************************
  */
-template <typename CONTAINER_T, typename SEGMENT_T>
+template<typename CONTAINER_T, typename SEGMENT_T>
 RAJA_INLINE void getIndices(CONTAINER_T& con, const SEGMENT_T& seg)
 {
   CONTAINER_T tcon;
-  forall<seq_exec>(seg,
-    [&](typename CONTAINER_T::value_type idx) {
-      tcon.push_back(idx);
-    }
-  );
+  forall<seq_exec>(seg, [&](typename CONTAINER_T::value_type idx) {
+    tcon.push_back(idx);
+  });
   con = tcon;
 }
 
@@ -84,17 +81,16 @@ RAJA_INLINE void getIndices(CONTAINER_T& con, const SEGMENT_T& seg)
  *
  ******************************************************************************
  */
-template <typename CONTAINER_T, typename... SEG_TYPES, typename CONDITIONAL>
+template<typename CONTAINER_T, typename... SEG_TYPES, typename CONDITIONAL>
 RAJA_INLINE void getIndicesConditional(CONTAINER_T& con,
                                        const TypedIndexSet<SEG_TYPES...>& iset,
                                        CONDITIONAL conditional)
 {
   CONTAINER_T tcon;
-  forall<ExecPolicy<seq_segit, seq_exec> >(iset,
-    [&](typename CONTAINER_T::value_type idx) {
-      if (conditional(idx)) tcon.push_back(idx);
-    }
-  );
+  forall<ExecPolicy<seq_segit, seq_exec>>(
+      iset, [&](typename CONTAINER_T::value_type idx) {
+        if (conditional(idx)) tcon.push_back(idx);
+      });
   con = tcon;
 }
 
@@ -106,17 +102,15 @@ RAJA_INLINE void getIndicesConditional(CONTAINER_T& con,
  *
  ******************************************************************************
  */
-template <typename CONTAINER_T, typename SEGMENT_T, typename CONDITIONAL>
+template<typename CONTAINER_T, typename SEGMENT_T, typename CONDITIONAL>
 RAJA_INLINE void getIndicesConditional(CONTAINER_T& con,
                                        const SEGMENT_T& seg,
                                        CONDITIONAL conditional)
 {
   CONTAINER_T tcon;
-  forall<seq_exec>(seg,
-    [&](typename CONTAINER_T::value_type idx) {
-      if (conditional(idx)) tcon.push_back(idx);
-    }
-  );
+  forall<seq_exec>(seg, [&](typename CONTAINER_T::value_type idx) {
+    if (conditional(idx)) tcon.push_back(idx);
+  });
   con = tcon;
 }
 
diff --git a/include/RAJA/index/IndexValue.hpp b/include/RAJA/index/IndexValue.hpp
index 44fa143445..15e64c07a2 100644
--- a/include/RAJA/index/IndexValue.hpp
+++ b/include/RAJA/index/IndexValue.hpp
@@ -28,8 +28,8 @@
 namespace RAJA
 {
 
-struct IndexValueBase {
-};
+struct IndexValueBase
+{};
 
 /*!
  * \brief Strongly typed "integer" class.
@@ -43,17 +43,18 @@ struct IndexValueBase {
  *
  * Yes, this uses the curiously-recurring template pattern.
  */
-template <typename TYPE, typename VALUE = RAJA::Index_type>
-struct IndexValue : public IndexValueBase {
+template<typename TYPE, typename VALUE = RAJA::Index_type>
+struct IndexValue : public IndexValueBase
+{
 
   using value_type = VALUE;
 
   //! Default constructor initializes value to 0.
-  RAJA_INLINE constexpr IndexValue() = default;
-  constexpr RAJA_INLINE IndexValue(IndexValue const &) = default;
-  constexpr RAJA_INLINE IndexValue(IndexValue &&) = default;
-  RAJA_INLINE IndexValue &operator=(IndexValue const &) = default;
-  RAJA_INLINE IndexValue &operator=(IndexValue &&) = default;
+  RAJA_INLINE constexpr IndexValue()                   = default;
+  constexpr RAJA_INLINE IndexValue(IndexValue const&)  = default;
+  constexpr RAJA_INLINE IndexValue(IndexValue&&)       = default;
+  RAJA_INLINE IndexValue& operator=(IndexValue const&) = default;
+  RAJA_INLINE IndexValue& operator=(IndexValue&&)      = default;
 
   /*!
    * \brief Explicit constructor.
@@ -61,14 +62,13 @@ struct IndexValue : public IndexValueBase {
    */
   RAJA_HOST_DEVICE RAJA_INLINE constexpr explicit IndexValue(value_type v)
       : value(v)
-  {
-  }
+  {}
 
   //! Dereference provides cast-to-integer.
-  RAJA_HOST_DEVICE RAJA_INLINE value_type &operator*() { return value; }
+  RAJA_HOST_DEVICE RAJA_INLINE value_type& operator*() { return value; }
 
   //! Dereference provides cast-to-integer.
-  RAJA_HOST_DEVICE RAJA_INLINE const value_type &operator*() const
+  RAJA_HOST_DEVICE RAJA_INLINE const value_type& operator*() const
   {
     return value;
   }
@@ -82,10 +82,10 @@ struct IndexValue : public IndexValueBase {
   }
 
   //! preincrement stored index
-  RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator++()
+  RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator++()
   {
     value++;
-    return static_cast<TYPE &>(*this);
+    return static_cast<TYPE&>(*this);
   }
 
   //! postdecrement -- returns a copy
@@ -97,10 +97,10 @@ struct IndexValue : public IndexValueBase {
   }
 
   //! preincrement stored index
-  RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator--()
+  RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator--()
   {
     value--;
-    return static_cast<TYPE &>(*this);
+    return static_cast<TYPE&>(*this);
   }
 
   //! addition to underlying index from an Index_type
@@ -163,52 +163,52 @@ struct IndexValue : public IndexValueBase {
     return TYPE(value % a.value);
   }
 
-  RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator+=(value_type x)
+  RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator+=(value_type x)
   {
     value += x;
-    return static_cast<TYPE &>(*this);
+    return static_cast<TYPE&>(*this);
   }
 
-  RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator+=(TYPE x)
+  RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator+=(TYPE x)
   {
     value += x.value;
-    return static_cast<TYPE &>(*this);
+    return static_cast<TYPE&>(*this);
   }
 
-  RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator-=(value_type x)
+  RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator-=(value_type x)
   {
     value -= x;
-    return static_cast<TYPE &>(*this);
+    return static_cast<TYPE&>(*this);
   }
 
-  RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator-=(TYPE x)
+  RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator-=(TYPE x)
   {
     value -= x.value;
-    return static_cast<TYPE &>(*this);
+    return static_cast<TYPE&>(*this);
   }
 
-  RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator*=(value_type x)
+  RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator*=(value_type x)
   {
     value *= x;
-    return static_cast<TYPE &>(*this);
+    return static_cast<TYPE&>(*this);
   }
 
-  RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator*=(TYPE x)
+  RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator*=(TYPE x)
   {
     value *= x.value;
-    return static_cast<TYPE &>(*this);
+    return static_cast<TYPE&>(*this);
   }
 
-  RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator/=(value_type x)
+  RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator/=(value_type x)
   {
     value /= x;
-    return static_cast<TYPE &>(*this);
+    return static_cast<TYPE&>(*this);
   }
 
-  RAJA_HOST_DEVICE RAJA_INLINE TYPE &operator/=(TYPE x)
+  RAJA_HOST_DEVICE RAJA_INLINE TYPE& operator/=(TYPE x)
   {
     value /= x.value;
-    return static_cast<TYPE &>(*this);
+    return static_cast<TYPE&>(*this);
   }
 
   RAJA_HOST_DEVICE RAJA_INLINE bool operator<(value_type x) const
@@ -282,12 +282,13 @@ struct IndexValue : public IndexValueBase {
 namespace internal
 {
 
-template <typename TO, typename FROM>
+template<typename TO, typename FROM>
 constexpr RAJA_HOST_DEVICE RAJA_INLINE TO convertIndex_helper(FROM const val)
 {
   return TO(val);
 }
-template <typename TO, typename FROM>
+
+template<typename TO, typename FROM>
 constexpr RAJA_HOST_DEVICE RAJA_INLINE TO
 convertIndex_helper(typename FROM::IndexValueType const val)
 {
@@ -302,19 +303,18 @@ convertIndex_helper(typename FROM::IndexValueType const val)
  * convert it to another type, possibly another Index or an int.
  *
  */
-template <typename TO, typename FROM>
+template<typename TO, typename FROM>
 constexpr RAJA_HOST_DEVICE RAJA_INLINE TO convertIndex(FROM const val)
 {
   return internal::convertIndex_helper<TO, FROM>(val);
 }
 
-
 /*!
  * \brief Function that strips the strongly typed Index<> and returns its
  * underlying value_type value.
  */
 // This version is enabled if FROM is a strongly typed class
-template <typename FROM>
+template<typename FROM>
 constexpr RAJA_HOST_DEVICE RAJA_INLINE
     typename std::enable_if<std::is_base_of<IndexValueBase, FROM>::value,
                             typename FROM::value_type>::type
@@ -322,10 +322,11 @@ constexpr RAJA_HOST_DEVICE RAJA_INLINE
 {
   return *val;
 }
+
 /*
  * enabled if FROM is not a strongly typed class
  */
-template <typename FROM>
+template<typename FROM>
 constexpr RAJA_HOST_DEVICE RAJA_INLINE
     typename std::enable_if<!std::is_base_of<IndexValueBase, FROM>::value,
                             FROM>::type
@@ -334,18 +335,22 @@ constexpr RAJA_HOST_DEVICE RAJA_INLINE
   return val;
 }
 
-namespace internal{
+namespace internal
+{
 template<typename FROM, typename Enable = void>
-struct StripIndexTypeT {
-    using type = FROM;
+struct StripIndexTypeT
+{
+  using type = FROM;
 };
 
 template<typename FROM>
-struct StripIndexTypeT<FROM, typename std::enable_if<std::is_base_of<IndexValueBase, FROM>::value>::type>
+struct StripIndexTypeT<
+    FROM,
+    typename std::enable_if<std::is_base_of<IndexValueBase, FROM>::value>::type>
 {
-    using type = typename FROM::value_type;
+  using type = typename FROM::value_type;
 };
-} // namespace internal
+}  // namespace internal
 
 /*!
  * \brief Strips a strongly typed index to its underlying type
@@ -363,11 +368,10 @@ using strip_index_type_t = typename internal::StripIndexTypeT<FROM>::type;
  * \param FROM the original type
  */
 template<typename FROM>
-using make_signed_t = typename std::conditional < 
-                                  std::is_floating_point<FROM>::value,
-                                    std::common_type<FROM>,
-                                    std::make_signed<FROM>
-                               >::type::type;
+using make_signed_t =
+    typename std::conditional<std::is_floating_point<FROM>::value,
+                              std::common_type<FROM>,
+                              std::make_signed<FROM>>::type::type;
 
 }  // namespace RAJA
 
@@ -376,19 +380,18 @@ using make_signed_t = typename std::conditional <
  * \param TYPE the name of the type
  * \param NAME a string literal to identify this index type
  */
-#define RAJA_INDEX_VALUE(TYPE, NAME)                                 \
-  class TYPE : public ::RAJA::IndexValue<TYPE>                       \
-  {                                                                  \
-    using parent = ::RAJA::IndexValue<TYPE>;                         \
-                                                                     \
-  public:                                                            \
-    using IndexValueType = TYPE;                                     \
-    RAJA_HOST_DEVICE RAJA_INLINE TYPE() : parent::IndexValue() {}    \
-    RAJA_HOST_DEVICE RAJA_INLINE explicit TYPE(::RAJA::Index_type v) \
-        : parent::IndexValue(v)                                      \
-    {                                                                \
-    }                                                                \
-    static inline std::string getName() { return NAME; }             \
+#define RAJA_INDEX_VALUE(TYPE, NAME)                                           \
+  class TYPE : public ::RAJA::IndexValue<TYPE>                                 \
+  {                                                                            \
+    using parent = ::RAJA::IndexValue<TYPE>;                                   \
+                                                                               \
+  public:                                                                      \
+    using IndexValueType = TYPE;                                               \
+    RAJA_HOST_DEVICE RAJA_INLINE TYPE() : parent::IndexValue() {}              \
+    RAJA_HOST_DEVICE RAJA_INLINE explicit TYPE(::RAJA::Index_type v)           \
+        : parent::IndexValue(v)                                                \
+    {}                                                                         \
+    static inline std::string getName() { return NAME; }                       \
   };
 
 /*!
@@ -397,17 +400,17 @@ using make_signed_t = typename std::conditional <
  * \param IDXT the index types value type
  * \param NAME a string literal to identify this index type
  */
-#define RAJA_INDEX_VALUE_T(TYPE, IDXT, NAME)                         \
-  class TYPE : public ::RAJA::IndexValue<TYPE, IDXT>                 \
-  {                                                                  \
-  public:                                                            \
-    RAJA_HOST_DEVICE RAJA_INLINE TYPE()                              \
-        : RAJA::IndexValue<TYPE,IDXT>::IndexValue() {}               \
-    RAJA_HOST_DEVICE RAJA_INLINE explicit TYPE(IDXT v)               \
-        : RAJA::IndexValue<TYPE,IDXT>::IndexValue(v)                 \
-    {                                                                \
-    }                                                                \
-    static inline std::string getName() { return NAME; }             \
+#define RAJA_INDEX_VALUE_T(TYPE, IDXT, NAME)                                   \
+  class TYPE : public ::RAJA::IndexValue<TYPE, IDXT>                           \
+  {                                                                            \
+  public:                                                                      \
+    RAJA_HOST_DEVICE RAJA_INLINE TYPE()                                        \
+        : RAJA::IndexValue<TYPE, IDXT>::IndexValue()                           \
+    {}                                                                         \
+    RAJA_HOST_DEVICE RAJA_INLINE explicit TYPE(IDXT v)                         \
+        : RAJA::IndexValue<TYPE, IDXT>::IndexValue(v)                          \
+    {}                                                                         \
+    static inline std::string getName() { return NAME; }                       \
   };
 
 #endif
diff --git a/include/RAJA/index/ListSegment.hpp b/include/RAJA/index/ListSegment.hpp
index adee46053c..54b2d7e1c4 100644
--- a/include/RAJA/index/ListSegment.hpp
+++ b/include/RAJA/index/ListSegment.hpp
@@ -81,11 +81,10 @@ namespace RAJA
  *
  ******************************************************************************
  */
-template <typename StorageT>
+template<typename StorageT>
 class TypedListSegment
 {
 public:
-
   //@{
   //!   @name Types used in implementation based on template parameter.
 
@@ -111,7 +110,7 @@ class TypedListSegment
    * \param values array of indices defining iteration space of segment
    * \param length number of indices
    * \param resource camp resource defining memory space where index data live
-   * \param owned optional enum value indicating whether segment owns indices 
+   * \param owned optional enum value indicating whether segment owns indices
    * (Owned or Unowned). Default is Owned.
    *
    * If 'Unowned' is passed as last argument, the segment will not own its
@@ -121,7 +120,10 @@ class TypedListSegment
                    Index_type length,
                    camp::resources::Resource resource,
                    IndexOwnership owned = Owned)
-    : m_resource(nullptr), m_owned(Unowned), m_data(nullptr), m_size(0)
+      : m_resource(nullptr),
+        m_owned(Unowned),
+        m_data(nullptr),
+        m_size(0)
   {
     initIndexData(values, length, resource, owned);
   }
@@ -138,33 +140,37 @@ class TypedListSegment
    *
    * Constructor assumes container data lives in host memory space.
    */
-  template <typename Container>
+  template<typename Container>
   TypedListSegment(const Container& container,
                    camp::resources::Resource resource)
-    : m_resource(nullptr), m_owned(Unowned), m_data(nullptr), m_size(container.size())
+      : m_resource(nullptr),
+        m_owned(Unowned),
+        m_data(nullptr),
+        m_size(container.size())
   {
-    if (m_size > 0) {
+    if (m_size > 0)
+    {
 
-      camp::resources::Resource host_res{camp::resources::Host()};
+      camp::resources::Resource host_res {camp::resources::Host()};
 
       value_type* tmp = host_res.allocate<value_type>(m_size);
 
-      auto dest = tmp;
-      auto src = container.begin();
+      auto dest      = tmp;
+      auto src       = container.begin();
       auto const end = container.end();
-      while (src != end) {
+      while (src != end)
+      {
         *dest = *src;
         ++dest;
         ++src;
       }
 
       m_resource = new camp::resources::Resource(resource);
-      m_data = m_resource->allocate<value_type>(m_size);
+      m_data     = m_resource->allocate<value_type>(m_size);
       m_resource->memcpy(m_data, tmp, sizeof(value_type) * m_size);
       m_owned = Owned;
 
       host_res.deallocate(tmp);
-
     }
   }
 
@@ -175,10 +181,11 @@ class TypedListSegment
   //  As this may be called from a lambda in a
   //  RAJA method we perform a shallow copy
   RAJA_HOST_DEVICE TypedListSegment(const TypedListSegment& other)
-    : m_resource(nullptr),
-      m_owned(Unowned), m_data(other.m_data), m_size(other.m_size)
-  {
-  }
+      : m_resource(nullptr),
+        m_owned(Unowned),
+        m_data(other.m_data),
+        m_size(other.m_size)
+  {}
 
   //! Copy assignment for list segment
   //  As this may be called from a lambda in a
@@ -187,59 +194,62 @@ class TypedListSegment
   {
     clear();
     m_resource = nullptr;
-    m_owned = Unowned;
-    m_data = other.m_data;
-    m_size = other.m_size;
+    m_owned    = Unowned;
+    m_data     = other.m_data;
+    m_size     = other.m_size;
   }
 
-    //! move assignment for list segment
+  //! Move assignment for list segment
   //  As this may be called from a lambda in a
   //  RAJA method we perform a shallow copy
   RAJA_HOST_DEVICE TypedListSegment& operator=(TypedListSegment&& rhs)
   {
     clear();
     m_resource = rhs.m_resource;
-    m_owned = rhs.m_owned;
-    m_data = rhs.m_data;
-    m_size = rhs.m_size;
+    m_owned    = rhs.m_owned;
+    m_data     = rhs.m_data;
+    m_size     = rhs.m_size;
 
     rhs.m_resource = nullptr;
-    rhs.m_owned = Unowned;
-    rhs.m_data = nullptr;
-    rhs.m_size = 0;
+    rhs.m_owned    = Unowned;
+    rhs.m_data     = nullptr;
+    rhs.m_size     = 0;
+
+    // Declared to return a reference: hand back the assigned-to segment.
+    return *this;
   }
 
   //! Move constructor for list segment
   RAJA_HOST_DEVICE TypedListSegment(TypedListSegment&& rhs)
-    : m_resource(rhs.m_resource),
-      m_owned(rhs.m_owned), m_data(rhs.m_data), m_size(rhs.m_size)
+      : m_resource(rhs.m_resource),
+        m_owned(rhs.m_owned),
+        m_data(rhs.m_data),
+        m_size(rhs.m_size)
   {
-    rhs.m_owned = Unowned;
+    rhs.m_owned    = Unowned;
     rhs.m_resource = nullptr;
-    rhs.m_size = 0;
-    rhs.m_data = nullptr;
+    rhs.m_size     = 0;
+    rhs.m_data     = nullptr;
   }
 
   //! List segment destructor
-  RAJA_HOST_DEVICE ~TypedListSegment()
-  {
-    clear();
-  }
+  RAJA_HOST_DEVICE ~TypedListSegment() { clear(); }
 
   //! Clear method to be called
   RAJA_HOST_DEVICE void clear()
   {
 
 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
-    if (m_data != nullptr && m_owned == Owned) {
+    if (m_data != nullptr && m_owned == Owned)
+    {
       m_resource->deallocate(m_data);
       delete m_resource;
     }
 #endif
-    m_data = nullptr;
+    m_data     = nullptr;
     m_resource = nullptr;
-    m_owned = Unowned;
-    m_size = 0;
+    m_owned    = Unowned;
+    m_size     = 0;
   }
 
   //@}
@@ -345,32 +352,35 @@ class TypedListSegment
   {
 
     // empty list segment
-    if (len <= 0 || container == nullptr) {
-      m_data = nullptr;
-      m_size = 0;
+    if (len <= 0 || container == nullptr)
+    {
+      m_data  = nullptr;
+      m_size  = 0;
       m_owned = Unowned;
       return;
     }
 
     // some non-zero size -- initialize accordingly
-    m_size = len;
+    m_size  = len;
     m_owned = container_own;
-    if (m_owned == Owned) {
+    if (m_owned == Owned)
+    {
 
-        m_resource = new camp::resources::Resource(resource_);
+      m_resource = new camp::resources::Resource(resource_);
 
-        camp::resources::Resource host_res{camp::resources::Host()};
+      camp::resources::Resource host_res {camp::resources::Host()};
 
-        value_type* tmp = host_res.allocate<value_type>(m_size);
+      value_type* tmp = host_res.allocate<value_type>(m_size);
 
-        for (Index_type i = 0; i < m_size; ++i) {
-          tmp[i] = container[i];
-        }
+      for (Index_type i = 0; i < m_size; ++i)
+      {
+        tmp[i] = container[i];
+      }
 
-        m_data = m_resource->allocate<value_type>(m_size);
-        m_resource->memcpy(m_data, tmp, sizeof(value_type) * m_size);
+      m_data = m_resource->allocate<value_type>(m_size);
+      m_resource->memcpy(m_data, tmp, sizeof(value_type) * m_size);
 
-        host_res.deallocate(tmp);
+      host_res.deallocate(tmp);
 
       return;
     }
@@ -380,9 +390,8 @@ class TypedListSegment
     m_data = const_cast<value_type*>(container);
   }
 
-
   // Copy of camp resource passed to ctor
-  camp::resources::Resource *m_resource;
+  camp::resources::Resource* m_resource;
 
   // Ownership flag to guide data copying/management
   IndexOwnership m_owned;
@@ -403,7 +412,7 @@ namespace std
 {
 
 //! Specialization of std::swap for TypedListSegment
-template <typename StorageT>
+template<typename StorageT>
 RAJA_INLINE void swap(RAJA::TypedListSegment<StorageT>& a,
                       RAJA::TypedListSegment<StorageT>& b)
 {
diff --git a/include/RAJA/index/RangeSegment.hpp b/include/RAJA/index/RangeSegment.hpp
index a41959c583..35a708766c 100644
--- a/include/RAJA/index/RangeSegment.hpp
+++ b/include/RAJA/index/RangeSegment.hpp
@@ -50,10 +50,10 @@ namespace RAJA
  *
  * NOTE: TypedRangeSegment::iterator is a RandomAccessIterator
  *
- * NOTE: TypedRangeSegment supports negative indices; e.g., an interval of 
+ * NOTE: TypedRangeSegment supports negative indices; e.g., an interval of
  *       indices [-5, 3).
  *
- * NOTE: Proper handling of indices strides requires that StorageT is a 
+ * NOTE: Proper handling of indices strides requires that StorageT is a
  *       signed type.
  *
  * Usage:
@@ -92,15 +92,19 @@ namespace RAJA
  *
  ******************************************************************************
  */
-template <typename StorageT, typename DiffT = make_signed_t<strip_index_type_t<StorageT>>>
-struct TypedRangeSegment {
+template<typename StorageT,
+         typename DiffT = make_signed_t<strip_index_type_t<StorageT>>>
+struct TypedRangeSegment
+{
 
-  // 
+  //
   // Static asserts to provide some useful error messages during compilation
   // for incorrect usage.
-  // 
-  static_assert(std::is_signed<DiffT>::value, "TypedRangeSegment DiffT requires signed type.");
-  static_assert(!std::is_floating_point<StorageT>::value, "TypedRangeSegment Type must be non floating point.");
+  //
+  static_assert(std::is_signed<DiffT>::value,
+                "TypedRangeSegment DiffT requires signed type.");
+  static_assert(!std::is_floating_point<StorageT>::value,
+                "TypedRangeSegment Type must be non floating point.");
 
   //@{
   //!   @name Types used in implementation based on template parameters.
@@ -117,20 +121,21 @@ struct TypedRangeSegment {
   //@}
 
   //@{
-  //!   @name Constructors, destructor, and copy assignment. 
+  //!   @name Constructors, destructor, and copy assignment.
 
   /*!
    * \brief Construct a range segment repreenting the interval [begin, end)
-   * 
+   *
    * \param begin start value (inclusive) for the range
    * \param end end value (exclusive) for the range
    */
   using StripStorageT = strip_index_type_t<StorageT>;
-  RAJA_HOST_DEVICE constexpr TypedRangeSegment(StripStorageT begin, StripStorageT end)
-      : m_begin(iterator(begin)), 
+
+  RAJA_HOST_DEVICE constexpr TypedRangeSegment(StripStorageT begin,
+                                               StripStorageT end)
+      : m_begin(iterator(begin)),
         m_end(begin > end ? m_begin : iterator(end))
-  {
-  }
+  {}
 
   //! Disable compiler generated constructor
   RAJA_HOST_DEVICE TypedRangeSegment() = delete;
@@ -187,7 +192,7 @@ struct TypedRangeSegment {
    * \brief Compare this segment to another for inequality
    *
    * \return true if begin or end does not match, else false
-   */ 
+   */
   RAJA_HOST_DEVICE RAJA_INLINE bool operator!=(TypedRangeSegment const& o) const
   {
     return !(operator==(o));
@@ -198,9 +203,9 @@ struct TypedRangeSegment {
   /*!
    * \brief Get a new TypedRangeSegment instance representing a slice of
    *        existing segment
-   * 
-   * \param begin start iterate of new range 
-   * \param length maximum length of new range 
+   *
+   * \param begin start iterate of new range
+   * \param length maximum length of new range
    * \return TypedRangeSegment representing the interval
    *         [ *begin() + begin, min( *begin() + begin + length, *end() ) )
    *
@@ -213,7 +218,7 @@ struct TypedRangeSegment {
    *     auto r = RAJA::TypedRangeSegment<int>(-4, 4);
    *
    *     // s repreents the subinterval  [-3, 2)
-   *     auto s = r.slice(1, 5); 
+   *     auto s = r.slice(1, 5);
    *
    *   \endverbatim
    */
@@ -221,9 +226,9 @@ struct TypedRangeSegment {
                                                        DiffT length) const
   {
     StorageT start = m_begin[0] + begin;
-    StorageT end = start + length > m_end[0] ? m_end[0] : start + length;
+    StorageT end   = start + length > m_end[0] ? m_end[0] : start + length;
 
-    return TypedRangeSegment{stripIndexType(start), stripIndexType(end)};
+    return TypedRangeSegment {stripIndexType(start), stripIndexType(end)};
   }
 
   /*!
@@ -243,12 +248,11 @@ struct TypedRangeSegment {
   iterator m_end;
 };
 
-
 /*!
  ******************************************************************************
  *
- * \class TypedRangeStrideSegment 
- * 
+ * \class TypedRangeStrideSegment
+ *
  * \brief  Segment class representing a strided range of typed indices
  *
  * \tparam StorageT underlying data type for the segment indices (required)
@@ -264,9 +268,9 @@ struct TypedRangeSegment {
  *
  * NOTE: TypedRangeStrideSegment::iterator is a RandomAccessIterator
  *
- * NOTE: TypedRangeStrideSegment allows for positive or negative strides and 
- *       indices. This allows for forward (stride > 0) or backward (stride < 0) 
- *       traversal of the iteration space. A stride of zero is undefined and 
+ * NOTE: TypedRangeStrideSegment allows for positive or negative strides and
+ *       indices. This allows for forward (stride > 0) or backward (stride < 0)
+ *       traversal of the iteration space. A stride of zero is undefined and
  *       will cause divide-by-zero errors.
  *
  * As with RangeSegment, the iteration space is inclusive of begin() and
@@ -275,7 +279,7 @@ struct TypedRangeSegment {
  * For positive strides, begin() > end() implies size()==0
  * For negative strides, begin() < end() implies size()==0
  *
- * NOTE: Proper handling of negative strides and indices requires that 
+ * NOTE: Proper handling of negative strides and indices requires that
  *       StorageT is a signed type.
  *
  * Usage:
@@ -321,15 +325,19 @@ struct TypedRangeSegment {
  *
  ******************************************************************************
  */
-template <typename StorageT, typename DiffT = make_signed_t<strip_index_type_t<StorageT>>>
-struct TypedRangeStrideSegment {
+template<typename StorageT,
+         typename DiffT = make_signed_t<strip_index_type_t<StorageT>>>
+struct TypedRangeStrideSegment
+{
 
   //
   // Static asserts to provide some useful error messages during compilation
   // for incorrect usage.
   //
-  static_assert(std::is_signed<DiffT>::value, "TypedRangeStrideSegment DiffT requires signed type.");
-  static_assert(!std::is_floating_point<StorageT>::value, "TypedRangeStrideSegment Type must be non floating point.");
+  static_assert(std::is_signed<DiffT>::value,
+                "TypedRangeStrideSegment DiffT requires signed type.");
+  static_assert(!std::is_floating_point<StorageT>::value,
+                "TypedRangeStrideSegment Type must be non floating point.");
 
   //@{
   //!   @name Types used in implementation based on template parameters.
@@ -349,7 +357,7 @@ struct TypedRangeStrideSegment {
   //!   @name Constructors, destructor, and copy assignment.
 
   /*!
-   * \brief Construct a range segment for the interval [begin, end) with 
+   * \brief Construct a range segment for the interval [begin, end) with
    *        given stride
    *
    * \param begin start value (inclusive) for the range
@@ -357,6 +365,7 @@ struct TypedRangeStrideSegment {
    * \param stride stride value when iterating over the range
    */
   using StripStorageT = strip_index_type_t<StorageT>;
+
   RAJA_HOST_DEVICE TypedRangeStrideSegment(StripStorageT begin,
                                            StripStorageT end,
                                            DiffT stride)
@@ -367,13 +376,16 @@ struct TypedRangeStrideSegment {
         m_size((end - begin + stride - (stride > 0 ? 1 : -1)) / stride)
   {
     // clamp range when end is unreachable from begin without wrapping
-    if (stride < 0 && end > begin) {
+    if (stride < 0 && end > begin)
+    {
       m_end = m_begin;
-    } else if (stride > 0 && end < begin) {
+    }
+    else if (stride > 0 && end < begin)
+    {
       m_end = m_begin;
     }
     // m_size initialized as negative indicates a zero iteration space
-    m_size = m_size < DiffT{0} ? DiffT{0} : m_size;
+    m_size = m_size < DiffT {0} ? DiffT {0} : m_size;
   }
 
   //! Disable compiler generated constructor
@@ -408,8 +420,8 @@ struct TypedRangeStrideSegment {
 
   /*!
    * \brief Get size of this segment
-   * 
-   * The size is the number of iterates in the 
+   *
+   * The size is the number of iterates in the
    * interval [begin, end) when striding over it
    */
   RAJA_HOST_DEVICE DiffT size() const { return m_size; }
@@ -435,7 +447,8 @@ struct TypedRangeStrideSegment {
    *
    * \return true if begin, end, or size does not match, else false
    */
-  RAJA_HOST_DEVICE RAJA_INLINE bool operator!=(TypedRangeStrideSegment const& o) const
+  RAJA_HOST_DEVICE RAJA_INLINE bool operator!=(
+      TypedRangeStrideSegment const& o) const
   {
     return !(operator==(o));
   }
@@ -450,7 +463,7 @@ struct TypedRangeStrideSegment {
    * \param length maximum length of new range
    *
    * \return TypedRangeStrideSegment representing the interval
-   *         [ *begin() + begin * stride, 
+   *         [ *begin() + begin * stride,
    *           min( *begin() + (begin + length) * stride, *end() )
    *
    * Here's an example of a slice operation on a range segment with a negative
@@ -466,24 +479,26 @@ struct TypedRangeStrideSegment {
    *     //       5 indices in r starting at the 6th entry
    *     auto s = r.slice(6, 6);
    *
-   *   \endverbatim 
+   *   \endverbatim
    */
   RAJA_HOST_DEVICE TypedRangeStrideSegment slice(StorageT begin,
                                                  DiffT length) const
   {
     StorageT stride = m_begin.get_stride();
-    StorageT start = m_begin[0] + begin * stride;
-    StorageT end = start + stride * length;
+    StorageT start  = m_begin[0] + begin * stride;
+    StorageT end    = start + stride * length;
 
-    if (stride > 0) {
+    if (stride > 0)
+    {
       end = end > m_end[0] ? m_end[0] : end;
-    } else {
+    }
+    else
+    {
       end = end < m_end[0] ? m_end[0] : end;
     }
 
-    return TypedRangeStrideSegment{stripIndexType(start),
-                                   stripIndexType(end),
-                                   m_begin.get_stride()};
+    return TypedRangeStrideSegment {stripIndexType(start), stripIndexType(end),
+                                    m_begin.get_stride()};
   }
 
   /*!
@@ -516,17 +531,18 @@ using RangeStrideSegment = TypedRangeStrideSegment<Index_type>;
 namespace detail
 {
 
-template <typename T, typename... Rest>
+template<typename T, typename... Rest>
 struct common_type
-    : std::common_type<T, typename std::common_type<Rest...>::type> {
-};
+    : std::common_type<T, typename std::common_type<Rest...>::type>
+{};
 
-template <typename T>
-struct common_type<T> {
+template<typename T>
+struct common_type<T>
+{
   using type = T;
 };
 
-template <typename... Ts>
+template<typename... Ts>
 using common_type_t = typename common_type<Ts...>::type;
 
 }  // namespace detail
@@ -539,9 +555,9 @@ using common_type_t = typename common_type<Ts...>::type;
  *          @begin and @end. If there is no common type, then
  *          a compiler error will be produced.
  */
-template <typename BeginT,
-          typename EndT,
-          typename Common = detail::common_type_t<BeginT, EndT>>
+template<typename BeginT,
+         typename EndT,
+         typename Common = detail::common_type_t<BeginT, EndT>>
 RAJA_HOST_DEVICE TypedRangeSegment<Common> make_range(BeginT&& begin,
                                                       EndT&& end)
 {
@@ -549,7 +565,7 @@ RAJA_HOST_DEVICE TypedRangeSegment<Common> make_range(BeginT&& begin,
 }
 
 /*!
- * \brief Function to make a TypedRangeStride Segment for the interval 
+ * \brief Function to make a TypedRangeStrideSegment for the interval
  *        [begin, end) with given stride
  *
  *  \return a newly constructed TypedRangeStrideSegment where
@@ -557,32 +573,35 @@ RAJA_HOST_DEVICE TypedRangeSegment<Common> make_range(BeginT&& begin,
  *          @begin, @end, and @stride. If there is no common
  *          type, then a compiler error will be produced.
  */
-template <typename BeginT,
-          typename EndT,
-          typename StrideT,
-          typename Common = detail::common_type_t<BeginT, EndT>>
+template<typename BeginT,
+         typename EndT,
+         typename StrideT,
+         typename Common = detail::common_type_t<BeginT, EndT>>
 RAJA_HOST_DEVICE TypedRangeStrideSegment<Common> make_strided_range(
     BeginT&& begin,
     EndT&& end,
     StrideT&& stride)
 {
-  static_assert(std::is_signed<StrideT>::value, "make_strided_segment : stride must be signed.");
-  static_assert(std::is_same<make_signed_t<EndT>, StrideT>::value, "make_stride_segment : stride and end must be of similar types.");
+  static_assert(std::is_signed<StrideT>::value,
+                "make_strided_segment : stride must be signed.");
+  static_assert(
+      std::is_same<make_signed_t<EndT>, StrideT>::value,
+      "make_stride_segment : stride and end must be of similar types.");
   return {begin, end, stride};
 }
 
 namespace concepts
 {
 
-template <typename T, typename U>
+template<typename T, typename U>
 struct RangeConstructible
-    : DefineConcept(camp::val<RAJA::detail::common_type_t<T, U>>()) {
-};
+    : DefineConcept(camp::val<RAJA::detail::common_type_t<T, U>>())
+{};
 
-template <typename T, typename U, typename V>
+template<typename T, typename U, typename V>
 struct RangeStrideConstructible
-    : DefineConcept(camp::val<RAJA::detail::common_type_t<T, U, V>>()) {
-};
+    : DefineConcept(camp::val<RAJA::detail::common_type_t<T, U, V>>())
+{};
 
 }  // namespace concepts
 
@@ -603,7 +622,7 @@ namespace std
 {
 
 //! Specialization of std::swap for TypedRangeSegment
-template <typename T>
+template<typename T>
 RAJA_HOST_DEVICE RAJA_INLINE void swap(RAJA::TypedRangeSegment<T>& a,
                                        RAJA::TypedRangeSegment<T>& b)
 {
@@ -611,7 +630,7 @@ RAJA_HOST_DEVICE RAJA_INLINE void swap(RAJA::TypedRangeSegment<T>& a,
 }
 
 //! Specialization of std::swap for TypedRangeStrideSegment
-template <typename T>
+template<typename T>
 RAJA_HOST_DEVICE RAJA_INLINE void swap(RAJA::TypedRangeStrideSegment<T>& a,
                                        RAJA::TypedRangeStrideSegment<T>& b)
 {
diff --git a/include/RAJA/internal/DepGraphNode.hpp b/include/RAJA/internal/DepGraphNode.hpp
index 8feceae22f..3c6a3a0d91 100644
--- a/include/RAJA/internal/DepGraphNode.hpp
+++ b/include/RAJA/internal/DepGraphNode.hpp
@@ -56,9 +56,10 @@ class RAJA_ALIGNED_ATTR(256) DepGraphNode
   /// Default ctor initializes node to default state.
   ///
   DepGraphNode()
-      : m_num_dep_tasks(0), m_semaphore_reload_value(0), m_semaphore_value(0)
-  {
-  }
+      : m_num_dep_tasks(0),
+        m_semaphore_reload_value(0),
+        m_semaphore_value(0)
+  {}
 
   ///
   /// Get/set semaphore value; i.e., the current number of (unsatisfied)
@@ -82,7 +83,8 @@ class RAJA_ALIGNED_ATTR(256) DepGraphNode
   ///
   void satisfyOne()
   {
-    if (m_semaphore_value > 0) {
+    if (m_semaphore_value > 0)
+    {
       --m_semaphore_value;
     }
   }
@@ -92,7 +94,8 @@ class RAJA_ALIGNED_ATTR(256) DepGraphNode
   ///
   void wait()
   {
-    while (m_semaphore_value > 0) {
+    while (m_semaphore_value > 0)
+    {
       // TODO: an efficient wait would be better here, but the standard
       // promise/future is not good enough
       std::this_thread::yield();
diff --git a/include/RAJA/internal/Iterators.hpp b/include/RAJA/internal/Iterators.hpp
index 6f32a56e6d..fd838943c2 100644
--- a/include/RAJA/internal/Iterators.hpp
+++ b/include/RAJA/internal/Iterators.hpp
@@ -38,7 +38,7 @@ namespace Iterators
 // Containers
 
 #if defined(RAJA_ENABLE_ITERATOR_OVERFLOW_DEBUG)
-template <typename LType, typename RType>
+template<typename LType, typename RType>
 std::string overflow_msg(LType lhs, RType rhs)
 {
   return "Iterator Overflow detected between operation of :\n\ttype : " +
@@ -47,10 +47,11 @@ std::string overflow_msg(LType lhs, RType rhs)
          "\n";
 }
 
-template <typename Type, typename DifferenceType>
+template<typename Type, typename DifferenceType>
 RAJA_HOST_DEVICE bool is_addition_overflow(Type lhs, DifferenceType rhs)
 {
-  if (std::is_unsigned<Type>::value) {
+  if (std::is_unsigned<Type>::value)
+  {
     if ((rhs > 0) && (lhs > std::numeric_limits<Type>::max() - rhs))
       return true;
     if ((rhs < 0) && (lhs < std::numeric_limits<Type>::min() - rhs))
@@ -59,23 +60,27 @@ RAJA_HOST_DEVICE bool is_addition_overflow(Type lhs, DifferenceType rhs)
   return false;
 }
 
-template <typename Type, typename DifferenceType>
+template<typename Type, typename DifferenceType>
 RAJA_HOST_DEVICE bool is_subtraction_overflow(Type lhs,
                                               DifferenceType rhs,
                                               bool iterator_on_left = true)
 {
-  if (iterator_on_left) {
+  if (iterator_on_left)
+  {
 
-    if (std::is_unsigned<Type>::value) {
+    if (std::is_unsigned<Type>::value)
+    {
       if ((rhs > 0) && (lhs < std::numeric_limits<Type>::min() + rhs))
         return true;
       if ((rhs < 0) && (lhs > std::numeric_limits<Type>::max() + rhs))
         return true;
     }
+  }
+  else
+  {  // Special case where operation is : value(lhs) - iterator(rhs).
 
-  } else {  // Special case where operation is : value(lhs) - iterator(rhs).
-
-    if (std::is_unsigned<DifferenceType>::value) {
+    if (std::is_unsigned<DifferenceType>::value)
+    {
       if ((lhs > 0) && (rhs < std::numeric_limits<DifferenceType>::min() + lhs))
         return true;
       if ((lhs < 0)) return true;
@@ -84,14 +89,14 @@ RAJA_HOST_DEVICE bool is_subtraction_overflow(Type lhs,
   return false;
 }
 
-template <typename Type, typename DifferenceType>
+template<typename Type, typename DifferenceType>
 RAJA_HOST_DEVICE void check_is_addition_overflow(Type lhs, DifferenceType rhs)
 {
   if (is_addition_overflow(lhs, rhs))
     throw std::runtime_error(overflow_msg(lhs, rhs));
 }
 
-template <typename Type, typename DifferenceType>
+template<typename Type, typename DifferenceType>
 RAJA_HOST_DEVICE void check_is_subtraction_overflow(Type lhs,
                                                     DifferenceType rhs)
 {
@@ -100,29 +105,28 @@ RAJA_HOST_DEVICE void check_is_subtraction_overflow(Type lhs,
 }
 #endif
 
-template <typename Type = Index_type,
-          typename DifferenceType = Type,
-          typename PointerType = Type*>
+template<typename Type           = Index_type,
+         typename DifferenceType = Type,
+         typename PointerType    = Type*>
 class numeric_iterator
 {
 public:
-  using value_type = Type;
+  using value_type          = Type;
   using stripped_value_type = strip_index_type_t<Type>;
-  using difference_type = DifferenceType;
-  using pointer = PointerType;
-  using reference = value_type&;
-  using iterator_category = std::random_access_iterator_tag;
-
-  constexpr numeric_iterator() noexcept = default;
-  constexpr numeric_iterator(const numeric_iterator&) noexcept = default;
-  constexpr numeric_iterator(numeric_iterator&&) noexcept = default;
+  using difference_type     = DifferenceType;
+  using pointer             = PointerType;
+  using reference           = value_type&;
+  using iterator_category   = std::random_access_iterator_tag;
+
+  constexpr numeric_iterator() noexcept                         = default;
+  constexpr numeric_iterator(const numeric_iterator&) noexcept  = default;
+  constexpr numeric_iterator(numeric_iterator&&) noexcept       = default;
   numeric_iterator& operator=(const numeric_iterator&) noexcept = default;
-  numeric_iterator& operator=(numeric_iterator&&) noexcept = default;
+  numeric_iterator& operator=(numeric_iterator&&) noexcept      = default;
 
   RAJA_HOST_DEVICE constexpr numeric_iterator(const stripped_value_type& rhs)
       : val(rhs)
-  {
-  }
+  {}
 
   RAJA_HOST_DEVICE inline DifferenceType get_stride() const { return 1; }
 
@@ -130,22 +134,27 @@ class numeric_iterator
   {
     return val == rhs.val;
   }
+
   RAJA_HOST_DEVICE inline bool operator!=(const numeric_iterator& rhs) const
   {
     return val != rhs.val;
   }
+
   RAJA_HOST_DEVICE inline bool operator>(const numeric_iterator& rhs) const
   {
     return val > rhs.val;
   }
+
   RAJA_HOST_DEVICE inline bool operator<(const numeric_iterator& rhs) const
   {
     return val < rhs.val;
   }
+
   RAJA_HOST_DEVICE inline bool operator>=(const numeric_iterator& rhs) const
   {
     return val >= rhs.val;
   }
+
   RAJA_HOST_DEVICE inline bool operator<=(const numeric_iterator& rhs) const
   {
     return val <= rhs.val;
@@ -156,17 +165,20 @@ class numeric_iterator
     ++val;
     return *this;
   }
+
   RAJA_HOST_DEVICE inline numeric_iterator& operator--()
   {
     --val;
     return *this;
   }
+
   RAJA_HOST_DEVICE inline numeric_iterator operator++(int)
   {
     numeric_iterator tmp(*this);
     ++val;
     return tmp;
   }
+
   RAJA_HOST_DEVICE inline numeric_iterator operator--(int)
   {
     numeric_iterator tmp(*this);
@@ -183,6 +195,7 @@ class numeric_iterator
     val += rhs;
     return *this;
   }
+
   RAJA_HOST_DEVICE inline numeric_iterator& operator-=(
       const difference_type& rhs)
   {
@@ -192,12 +205,14 @@ class numeric_iterator
     val -= rhs;
     return *this;
   }
+
   RAJA_HOST_DEVICE inline numeric_iterator& operator+=(
       const numeric_iterator& rhs)
   {
     val += rhs.val;
     return *this;
   }
+
   RAJA_HOST_DEVICE inline numeric_iterator& operator-=(
       const numeric_iterator& rhs)
   {
@@ -210,11 +225,13 @@ class numeric_iterator
   {
     return val + rhs.val;
   }
+
   RAJA_HOST_DEVICE inline stripped_value_type operator-(
       const numeric_iterator& rhs) const
   {
     return val - rhs.val;
   }
+
   RAJA_HOST_DEVICE inline numeric_iterator operator+(
       const difference_type& rhs) const
   {
@@ -223,6 +240,7 @@ class numeric_iterator
 #endif
     return numeric_iterator(val + rhs);
   }
+
   RAJA_HOST_DEVICE inline numeric_iterator operator-(
       const difference_type& rhs) const
   {
@@ -231,6 +249,7 @@ class numeric_iterator
 #endif
     return numeric_iterator(val - rhs);
   }
+
   RAJA_HOST_DEVICE friend constexpr numeric_iterator operator+(
       difference_type lhs,
       const numeric_iterator& rhs)
@@ -243,6 +262,7 @@ class numeric_iterator
     return numeric_iterator(lhs + rhs.val);
 #endif
   }
+
   RAJA_HOST_DEVICE friend constexpr numeric_iterator operator-(
       difference_type lhs,
       const numeric_iterator& rhs)
@@ -260,10 +280,12 @@ class numeric_iterator
   {
     return value_type(val);
   }
+
   RAJA_HOST_DEVICE inline value_type operator->() const
   {
     return value_type(val);
   }
+
   RAJA_HOST_DEVICE constexpr value_type operator[](difference_type rhs) const
   {
     return value_type(val + rhs);
@@ -273,31 +295,35 @@ class numeric_iterator
   stripped_value_type val = 0;
 };
 
-template <typename Type = Index_type,
-          typename DifferenceType = Type,
-          typename PointerType = Type*>
+template<typename Type           = Index_type,
+         typename DifferenceType = Type,
+         typename PointerType    = Type*>
 class strided_numeric_iterator
 {
 public:
-  using value_type = Type;
+  using value_type          = Type;
   using stripped_value_type = strip_index_type_t<Type>;
-  using difference_type = DifferenceType;
-  using pointer = DifferenceType*;
-  using reference = DifferenceType&;
-  using iterator_category = std::random_access_iterator_tag;
+  using difference_type     = DifferenceType;
+  using pointer             = DifferenceType*;
+  using reference           = DifferenceType&;
+  using iterator_category   = std::random_access_iterator_tag;
 
   constexpr strided_numeric_iterator() noexcept = default;
-  constexpr strided_numeric_iterator(const strided_numeric_iterator&) noexcept = default;
-  constexpr strided_numeric_iterator(strided_numeric_iterator&&) noexcept = default;
-  strided_numeric_iterator& operator=(const strided_numeric_iterator&) noexcept = default;
-  strided_numeric_iterator& operator=(strided_numeric_iterator&&) noexcept = default;
+  constexpr strided_numeric_iterator(const strided_numeric_iterator&) noexcept =
+      default;
+  constexpr strided_numeric_iterator(strided_numeric_iterator&&) noexcept =
+      default;
+  strided_numeric_iterator& operator=(
+      const strided_numeric_iterator&) noexcept = default;
+  strided_numeric_iterator& operator=(strided_numeric_iterator&&) noexcept =
+      default;
 
   RAJA_HOST_DEVICE constexpr strided_numeric_iterator(
       stripped_value_type rhs,
       DifferenceType stride_ = DifferenceType(1))
-      : val(rhs), stride(stride_)
-  {
-  }
+      : val(rhs),
+        stride(stride_)
+  {}
 
   RAJA_HOST_DEVICE inline DifferenceType get_stride() const { return stride; }
 
@@ -306,6 +332,7 @@ class strided_numeric_iterator
     val += stride;
     return *this;
   }
+
   RAJA_HOST_DEVICE inline strided_numeric_iterator& operator--()
   {
     val -= stride;
@@ -321,6 +348,7 @@ class strided_numeric_iterator
     val += rhs * stride;
     return *this;
   }
+
   RAJA_HOST_DEVICE inline strided_numeric_iterator& operator-=(
       const difference_type& rhs)
   {
@@ -338,16 +366,18 @@ class strided_numeric_iterator
             (static_cast<difference_type>(rhs.val))) /
            stride;
   }
+
   RAJA_HOST_DEVICE inline difference_type operator-(
       const strided_numeric_iterator& rhs) const
   {
     difference_type diff = (static_cast<difference_type>(val) -
                             (static_cast<difference_type>(rhs.val)));
 
-    return (diff % stride != difference_type{0})
-               ? (difference_type{1} + diff / stride)
+    return (diff % stride != difference_type {0})
+               ? (difference_type {1} + diff / stride)
                : diff / stride;
   }
+
   RAJA_HOST_DEVICE inline strided_numeric_iterator operator+(
       const difference_type& rhs) const
   {
@@ -356,6 +386,7 @@ class strided_numeric_iterator
 #endif
     return strided_numeric_iterator(val + rhs * stride, stride);
   }
+
   RAJA_HOST_DEVICE inline strided_numeric_iterator operator-(
       const difference_type& rhs) const
   {
@@ -372,6 +403,7 @@ class strided_numeric_iterator
   {
     return (val - rhs.val) / stride;
   }
+
   RAJA_HOST_DEVICE inline bool operator==(
       const strided_numeric_iterator& rhs) const
   {
@@ -383,31 +415,35 @@ class strided_numeric_iterator
   {
     return val * stride > rhs.val * stride;
   }
+
   RAJA_HOST_DEVICE inline bool operator<(
       const strided_numeric_iterator& rhs) const
   {
     return val * stride < rhs.val * stride;
   }
+
   RAJA_HOST_DEVICE inline bool operator>=(
       const strided_numeric_iterator& rhs) const
   {
     return val * stride >= rhs.val * stride;
   }
+
   RAJA_HOST_DEVICE inline bool operator<=(
       const strided_numeric_iterator& rhs) const
   {
     return val * stride <= rhs.val * stride;
   }
 
-
   RAJA_HOST_DEVICE inline value_type operator*() const
   {
     return value_type(val);
   }
+
   RAJA_HOST_DEVICE inline value_type operator->() const
   {
     return value_type(val);
   }
+
   RAJA_HOST_DEVICE constexpr value_type operator[](difference_type rhs) const
   {
     return value_type(val + rhs * stride);
@@ -415,7 +451,7 @@ class strided_numeric_iterator
 
 private:
   stripped_value_type val = 0;
-  DifferenceType stride = 1;
+  DifferenceType stride   = 1;
 };
 
 
diff --git a/include/RAJA/internal/MemUtils_CPU.hpp b/include/RAJA/internal/MemUtils_CPU.hpp
index 55015f9ab7..1281c36277 100644
--- a/include/RAJA/internal/MemUtils_CPU.hpp
+++ b/include/RAJA/internal/MemUtils_CPU.hpp
@@ -27,7 +27,7 @@
 
 #include "RAJA/util/types.hpp"
 
-#if defined(_WIN32) || defined(WIN32) || defined(__CYGWIN__) || \
+#if defined(_WIN32) || defined(WIN32) || defined(__CYGWIN__) ||                \
     defined(__MINGW32__) || defined(__BORLANDC__)
 #define RAJA_PLATFORM_WINDOWS
 #include <malloc.h>
@@ -44,7 +44,7 @@ inline void* allocate_aligned(size_t alignment, size_t size)
 #if defined(RAJA_HAVE_POSIX_MEMALIGN)
   // posix_memalign available
   void* ret = nullptr;
-  int err = posix_memalign(&ret, alignment, size);
+  int err   = posix_memalign(&ret, alignment, size);
   return err ? nullptr : ret;
 #elif defined(RAJA_HAVE_ALIGNED_ALLOC)
   return std::aligned_alloc(alignment, size);
@@ -53,27 +53,25 @@ inline void* allocate_aligned(size_t alignment, size_t size)
 #elif defined(RAJA_PLATFORM_WINDOWS)
   return _aligned_malloc(size, alignment);
 #else
-  char *mem = (char *)malloc(size + alignment + sizeof(void *));
+  char* mem = (char*)malloc(size + alignment + sizeof(void*));
   if (nullptr == mem) return nullptr;
-  void **ptr = (void **)((std::uintptr_t)(mem + alignment + sizeof(void *)) &
-                         ~(alignment - 1));
+  void** ptr = (void**)((std::uintptr_t)(mem + alignment + sizeof(void*)) &
+                        ~(alignment - 1));
   // Store the original address one position behind what we give the user.
   ptr[-1] = mem;
   return ptr;
 #endif
 }
 
-
 ///
 /// Portable aligned memory allocation
 ///
-template <typename T>
+template<typename T>
 inline T* allocate_aligned_type(size_t alignment, size_t size)
 {
   return reinterpret_cast<T*>(allocate_aligned(alignment, size));
 }
 
-
 ///
 /// Portable aligned memory free - required for Windows
 ///
@@ -97,25 +95,23 @@ inline void free_aligned(void* ptr)
 ///
 struct FreeAligned
 {
-  void operator()(void* ptr)
-  {
-    free_aligned(ptr);
-  }
+  void operator()(void* ptr) { free_aligned(ptr); }
 };
 
 ///
 /// Deleter function object for memory allocated with allocate_aligned_type
 /// that calls the destructor for the fist size objects in the storage.
 ///
-template < typename T, typename index_type >
+template<typename T, typename index_type>
 struct FreeAlignedType : FreeAligned
 {
   index_type size = 0;
 
   void operator()(T* ptr)
   {
-    for ( index_type i = size; i > 0; --i ) {
-      ptr[i-1].~T();
+    for (index_type i = size; i > 0; --i)
+    {
+      ptr[i - 1].~T();
     }
     FreeAligned::operator()(ptr);
   }
diff --git a/include/RAJA/internal/RAJAVec.hpp b/include/RAJA/internal/RAJAVec.hpp
index 1d0ec0cbeb..b4366f41a5 100644
--- a/include/RAJA/internal/RAJAVec.hpp
+++ b/include/RAJA/internal/RAJAVec.hpp
@@ -49,7 +49,7 @@ namespace RAJA
  *
  ******************************************************************************
  */
-template <typename T, typename Allocator = std::allocator<T> >
+template<typename T, typename Allocator = std::allocator<T>>
 class RAJAVec
 {
   using allocator_traits_type = std::allocator_traits<Allocator>;
@@ -57,26 +57,30 @@ class RAJAVec
       typename allocator_traits_type::propagate_on_container_copy_assignment;
   using propagate_on_container_move_assignment =
       typename allocator_traits_type::propagate_on_container_move_assignment;
-  using propagate_on_container_swap            =
+  using propagate_on_container_swap =
       typename allocator_traits_type::propagate_on_container_swap;
+
 public:
-  using value_type = T;
-  using allocator_type = Allocator;
-  using size_type = std::size_t;
+  using value_type      = T;
+  using allocator_type  = Allocator;
+  using size_type       = std::size_t;
   using difference_type = std::ptrdiff_t;
-  using reference = value_type&;
+  using reference       = value_type&;
   using const_reference = const value_type&;
-  using pointer = typename allocator_traits_type::pointer;
-  using const_pointer = typename allocator_traits_type::const_pointer;
-  using iterator = value_type*;
-  using const_iterator = const value_type*;
+  using pointer         = typename allocator_traits_type::pointer;
+  using const_pointer   = typename allocator_traits_type::const_pointer;
+  using iterator        = value_type*;
+  using const_iterator  = const value_type*;
 
   ///
   /// Construct empty vector with given capacity.
   ///
-  explicit RAJAVec(size_type init_cap = 0,
+  explicit RAJAVec(size_type init_cap      = 0,
                    const allocator_type& a = allocator_type())
-      : m_data(nullptr), m_allocator(a), m_capacity(0), m_size(0)
+      : m_data(nullptr),
+        m_allocator(a),
+        m_capacity(0),
+        m_size(0)
   {
     reserve(init_cap);
   }
@@ -86,7 +90,9 @@ class RAJAVec
   ///
   RAJAVec(const RAJAVec& other)
       : m_data(nullptr),
-        m_allocator(allocator_traits_type::select_on_container_copy_construction(other.m_allocator)),
+        m_allocator(
+            allocator_traits_type::select_on_container_copy_construction(
+                other.m_allocator)),
         m_capacity(0),
         m_size(0)
   {
@@ -103,9 +109,9 @@ class RAJAVec
         m_capacity(other.m_capacity),
         m_size(other.m_size)
   {
-    other.m_data = nullptr;
+    other.m_data     = nullptr;
     other.m_capacity = 0;
-    other.m_size = 0;
+    other.m_size     = 0;
   }
 
   ///
@@ -113,8 +119,9 @@ class RAJAVec
   ///
   RAJAVec& operator=(const RAJAVec& rhs)
   {
-    if (&rhs != this) {
-      copy_assign_private(rhs, propagate_on_container_copy_assignment{});
+    if (&rhs != this)
+    {
+      copy_assign_private(rhs, propagate_on_container_copy_assignment {});
     }
     return *this;
   }
@@ -124,8 +131,10 @@ class RAJAVec
   ///
   RAJAVec& operator=(RAJAVec&& rhs)
   {
-    if (&rhs != this) {
-      move_assign_private(std::move(rhs), propagate_on_container_move_assignment{});
+    if (&rhs != this)
+    {
+      move_assign_private(std::move(rhs),
+                          propagate_on_container_move_assignment {});
     }
     return *this;
   }
@@ -144,31 +153,36 @@ class RAJAVec
   ///
   void swap(RAJAVec& other)
   {
-    swap_private(other, propagate_on_container_swap{});
+    swap_private(other, propagate_on_container_swap {});
   }
 
   ///
   /// Get a pointer to the beginning of the contiguous vector
   ///
-        pointer data()       { return m_data; }
+  pointer data() { return m_data; }
+
   ///
   const_pointer data() const { return m_data; }
 
   ///
   /// Get an iterator to the end.
   ///
-        iterator  end()       { return m_data + m_size; }
+  iterator end() { return m_data + m_size; }
+
   ///
-  const_iterator  end() const { return m_data + m_size; }
+  const_iterator end() const { return m_data + m_size; }
+
   ///
   const_iterator cend() const { return m_data + m_size; }
 
   ///
   /// Get an iterator to the beginning.
   ///
-        iterator  begin()       { return m_data; }
+  iterator begin() { return m_data; }
+
   ///
-  const_iterator  begin() const { return m_data; }
+  const_iterator begin() const { return m_data; }
+
   ///
   const_iterator cbegin() const { return m_data; }
 
@@ -200,18 +214,12 @@ class RAJAVec
   ///
   /// Shrink the capacity of the vector to the current size.
   ///
-  void shrink_to_fit()
-  {
-    shrink_cap(m_size);
-  }
+  void shrink_to_fit() { shrink_cap(m_size); }
 
   ///
   /// Empty vector of all data.
   ///
-  void clear()
-  {
-    destroy_items_after(0);
-  }
+  void clear() { destroy_items_after(0); }
 
   ///
   /// Change the size of the vector,
@@ -221,10 +229,13 @@ class RAJAVec
   RAJA_INLINE
   void resize(size_type new_size)
   {
-    if (new_size >= size()) {
+    if (new_size >= size())
+    {
       reserve(new_size);
       construct_items_back(new_size);
-    } else {
+    }
+    else
+    {
       destroy_items_after(new_size);
     }
   }
@@ -237,10 +248,13 @@ class RAJAVec
   RAJA_INLINE
   void resize(size_type new_size, const_reference new_value)
   {
-    if (new_size >= size()) {
+    if (new_size >= size())
+    {
       reserve(new_size);
       construct_items_back(new_size, new_value);
-    } else {
+    }
+    else
+    {
       destroy_items_after(new_size);
     }
   }
@@ -248,52 +262,62 @@ class RAJAVec
   ///
   /// Bracket operator accessor.
   ///
-        reference operator[](difference_type i)       { return m_data[i]; }
+  reference operator[](difference_type i) { return m_data[i]; }
+
   ///
   const_reference operator[](difference_type i) const { return m_data[i]; }
 
   ///
   /// Access the last item of the vector.
   ///
-        reference front()       { return m_data[0]; }
+  reference front() { return m_data[0]; }
+
   ///
   const_reference front() const { return m_data[0]; }
 
   ///
   /// Access the last item of the vector.
   ///
-        reference back()       { return m_data[m_size-1]; }
+  reference back() { return m_data[m_size - 1]; }
+
   ///
-  const_reference back() const { return m_data[m_size-1]; }
+  const_reference back() const { return m_data[m_size - 1]; }
 
   ///
   /// Add item to front end of vector. Note that this operation is unique to
   /// this class; it is not part of the C++ standard library vector interface.
   ///
   void push_front(const_reference item) { emplace_front_private(item); }
+
   ///
-  void push_front(   value_type&& item) { emplace_front_private(std::move(item)); }
+  void push_front(value_type&& item) { emplace_front_private(std::move(item)); }
+
   ///
-  template < typename ... Os >
-  void emplace_front(Os&&... os) { emplace_front_private(std::forward<Os>(os)...); }
+  template<typename... Os>
+  void emplace_front(Os&&... os)
+  {
+    emplace_front_private(std::forward<Os>(os)...);
+  }
 
   ///
   /// Add item to back end of vector.
   ///
   void push_back(const_reference item) { emplace_back_private(item); }
+
   ///
-  void push_back(   value_type&& item) { emplace_back_private(std::move(item)); }
+  void push_back(value_type&& item) { emplace_back_private(std::move(item)); }
+
   ///
-  template < typename ... Os >
-  void emplace_back(Os&&... os) { emplace_back_private(std::forward<Os>(os)...); }
+  template<typename... Os>
+  void emplace_back(Os&&... os)
+  {
+    emplace_back_private(std::forward<Os>(os)...);
+  }
 
   ///
   /// Remove the last item of the vector.
   ///
-  void pop_back()
-  {
-    destroy_items_after(m_size-1);
-  }
+  void pop_back() { destroy_items_after(m_size - 1); }
 
 private:
   pointer m_data;
@@ -307,13 +331,14 @@ class RAJAVec
   ///
   void copy_assign_private(RAJAVec const& rhs, std::true_type)
   {
-    if (m_allocator != rhs.m_allocator) {
+    if (m_allocator != rhs.m_allocator)
+    {
       clear();
       shrink_to_fit();
       m_allocator = rhs.m_allocator;
     }
 
-    copy_assign_private(rhs, std::false_type{});
+    copy_assign_private(rhs, std::false_type {});
   }
 
   ///
@@ -323,10 +348,13 @@ class RAJAVec
   void copy_assign_private(RAJAVec const& rhs, std::false_type)
   {
     reserve(rhs.size());
-    if (size() < rhs.size()) {
+    if (size() < rhs.size())
+    {
       copy_assign_items(0, size(), rhs.data());
       copy_construct_items_back(rhs.size(), rhs.data());
-    } else {
+    }
+    else
+    {
       copy_assign_items(0, rhs.size(), rhs.data());
       destroy_items_after(size());
     }
@@ -341,14 +369,14 @@ class RAJAVec
     clear();
     shrink_to_fit();
 
-    m_data = rhs.m_data;
+    m_data      = rhs.m_data;
     m_allocator = std::move(rhs.m_allocator);
-    m_capacity = rhs.m_capacity;
-    m_size = rhs.m_size;
+    m_capacity  = rhs.m_capacity;
+    m_size      = rhs.m_size;
 
-    rhs.m_data = nullptr;
+    rhs.m_data     = nullptr;
     rhs.m_capacity = 0;
-    rhs.m_size = 0;
+    rhs.m_size     = 0;
   }
 
   ///
@@ -357,23 +385,29 @@ class RAJAVec
   ///
   void move_assign_private(RAJAVec&& rhs, std::false_type)
   {
-    if (m_allocator == rhs.m_allocator) {
+    if (m_allocator == rhs.m_allocator)
+    {
       clear();
       shrink_to_fit();
 
-      m_data = rhs.m_data;
+      m_data     = rhs.m_data;
       m_capacity = rhs.m_capacity;
-      m_size = rhs.m_size;
+      m_size     = rhs.m_size;
 
-      rhs.m_data = nullptr;
+      rhs.m_data     = nullptr;
       rhs.m_capacity = 0;
-      rhs.m_size = 0;
-    } else {
+      rhs.m_size     = 0;
+    }
+    else
+    {
       reserve(rhs.size());
-      if (size() < rhs.size()) {
+      if (size() < rhs.size())
+      {
         move_assign_items(0, size(), rhs.data());
         move_construct_items_back(rhs.size(), rhs.data());
-      } else {
+      }
+      else
+      {
         move_assign_items(0, rhs.size(), rhs.data());
         destroy_items_after(size());
       }
@@ -386,10 +420,10 @@ class RAJAVec
   void swap_private(RAJAVec& other, std::true_type)
   {
     using std::swap;
-    swap(m_data,      other.m_data);
+    swap(m_data, other.m_data);
     swap(m_allocator, other.m_allocator);
-    swap(m_capacity,  other.m_capacity);
-    swap(m_size,      other.m_size);
+    swap(m_capacity, other.m_capacity);
+    swap(m_size, other.m_size);
   }
 
   ///
@@ -398,9 +432,9 @@ class RAJAVec
   void swap_private(RAJAVec& other, std::false_type)
   {
     using std::swap;
-    swap(m_data,      other.m_data);
-    swap(m_capacity,  other.m_capacity);
-    swap(m_size,      other.m_size);
+    swap(m_data, other.m_data);
+    swap(m_capacity, other.m_capacity);
+    swap(m_size, other.m_size);
   }
 
   //
@@ -408,7 +442,8 @@ class RAJAVec
   //
   void copy_assign_items(size_type first, size_type last, const_pointer o_data)
   {
-    for (size_type i = first; i < last; ++i) {
+    for (size_type i = first; i < last; ++i)
+    {
       m_data[i] = o_data[i];
     }
   }
@@ -418,7 +453,8 @@ class RAJAVec
   //
   void move_assign_items(size_type first, size_type last, pointer o_data)
   {
-    for (size_type i = first; i < last; ++i) {
+    for (size_type i = first; i < last; ++i)
+    {
       m_data[i] = std::move(o_data[i]);
     }
   }
@@ -426,11 +462,13 @@ class RAJAVec
   //
   // Construct items [m_size, new_size) from args.
   //
-  template < typename ... Os >
+  template<typename... Os>
   void construct_items_back(size_type new_size, Os&&... os)
   {
-    for (; m_size < new_size; ++m_size) {
-      allocator_traits_type::construct(m_allocator, m_data+m_size, std::forward<Os>(os)...);
+    for (; m_size < new_size; ++m_size)
+    {
+      allocator_traits_type::construct(m_allocator, m_data + m_size,
+                                       std::forward<Os>(os)...);
     }
   }
 
@@ -439,8 +477,10 @@ class RAJAVec
   //
   void copy_construct_items_back(size_type new_size, const_pointer o_data)
   {
-    for (; m_size < new_size; ++m_size) {
-      allocator_traits_type::construct(m_allocator, m_data+m_size, o_data[m_size]);
+    for (; m_size < new_size; ++m_size)
+    {
+      allocator_traits_type::construct(m_allocator, m_data + m_size,
+                                       o_data[m_size]);
     }
   }
 
@@ -449,8 +489,10 @@ class RAJAVec
   //
   void move_construct_items_back(size_type new_size, pointer o_data)
   {
-    for (; m_size < new_size; ++m_size) {
-      allocator_traits_type::construct(m_allocator, m_data+m_size, std::move(o_data[m_size]));
+    for (; m_size < new_size; ++m_size)
+    {
+      allocator_traits_type::construct(m_allocator, m_data + m_size,
+                                       std::move(o_data[m_size]));
     }
   }
 
@@ -459,39 +501,45 @@ class RAJAVec
   //
   void destroy_items_after(size_type new_end)
   {
-    for (; m_size > new_end; --m_size) {
-      allocator_traits_type::destroy(m_allocator, m_data+m_size-1);
+    for (; m_size > new_end; --m_size)
+    {
+      allocator_traits_type::destroy(m_allocator, m_data + m_size - 1);
     }
   }
 
   //
   // Add an item to the front, shifting all existing items back one.
   //
-  template < typename ... Os >
+  template<typename... Os>
   void emplace_front_private(Os&&... os)
   {
     reserve(m_size + 1);
 
-    if (m_size > 0) {
+    if (m_size > 0)
+    {
       size_type i = m_size;
-      allocator_traits_type::construct(m_allocator, m_data+i, std::move(m_data[i - 1]));
-      for (--i; i > 0; --i) {
+      allocator_traits_type::construct(m_allocator, m_data + i,
+                                       std::move(m_data[i - 1]));
+      for (--i; i > 0; --i)
+      {
         m_data[i] = std::move(m_data[i - 1]);
       }
       allocator_traits_type::destroy(m_allocator, m_data);
     }
-    allocator_traits_type::construct(m_allocator, m_data, std::forward<Os>(os)...);
+    allocator_traits_type::construct(m_allocator, m_data,
+                                     std::forward<Os>(os)...);
     m_size++;
   }
 
   //
   // Add an item to the back.
   //
-  template < typename ... Os >
+  template<typename... Os>
   void emplace_back_private(Os&&... os)
   {
     reserve(m_size + 1);
-    allocator_traits_type::construct(m_allocator, m_data+m_size, std::forward<Os>(os)...);
+    allocator_traits_type::construct(m_allocator, m_data + m_size,
+                                     std::forward<Os>(os)...);
     m_size++;
   }
 
@@ -501,7 +549,7 @@ class RAJAVec
   // relying on STL directly.
   //
   static constexpr const size_type s_init_cap = 8;
-  static constexpr const double s_grow_fac = 1.5;
+  static constexpr const double s_grow_fac    = 1.5;
 
   //
   // Get the next value for capacity given a target and minimum.
@@ -509,7 +557,8 @@ class RAJAVec
   size_type get_next_cap(size_type target_size)
   {
     size_type next_cap = s_init_cap;
-    if (m_capacity != 0) {
+    if (m_capacity != 0)
+    {
       next_cap = static_cast<size_type>(m_capacity * s_grow_fac);
     }
     return std::max(target_size, next_cap);
@@ -520,7 +569,8 @@ class RAJAVec
   //
   void grow_cap(size_type target_size)
   {
-    if (m_capacity < target_size) {
+    if (m_capacity < target_size)
+    {
       change_cap(get_next_cap(target_size));
     }
   }
@@ -530,7 +580,8 @@ class RAJAVec
   //
   void shrink_cap(size_type target_size)
   {
-    if (m_capacity > target_size) {
+    if (m_capacity > target_size)
+    {
       change_cap(std::max(m_size, target_size));
     }
   }
@@ -542,19 +593,23 @@ class RAJAVec
   void change_cap(size_type next_cap)
   {
     pointer tdata = nullptr;
-    if (next_cap != 0) {
+    if (next_cap != 0)
+    {
       tdata = allocator_traits_type::allocate(m_allocator, next_cap);
     }
 
-    if (m_data) {
-      for (size_type i = 0; i < m_size; ++i) {
-        allocator_traits_type::construct(m_allocator, tdata+i, std::move(m_data[i]));
-        allocator_traits_type::destroy(m_allocator, m_data+i);
+    if (m_data)
+    {
+      for (size_type i = 0; i < m_size; ++i)
+      {
+        allocator_traits_type::construct(m_allocator, tdata + i,
+                                         std::move(m_data[i]));
+        allocator_traits_type::destroy(m_allocator, m_data + i);
       }
       allocator_traits_type::deallocate(m_allocator, m_data, m_capacity);
     }
 
-    m_data = tdata;
+    m_data     = tdata;
     m_capacity = next_cap;
   }
 };
diff --git a/include/RAJA/internal/fault_tolerance.hpp b/include/RAJA/internal/fault_tolerance.hpp
index cf3a86cede..66d03ca6cd 100644
--- a/include/RAJA/internal/fault_tolerance.hpp
+++ b/include/RAJA/internal/fault_tolerance.hpp
@@ -37,60 +37,72 @@
 #include <stdio.h>
 #include "cycle.h"
 
-#define RAJA_FT_BEGIN                          \
-  extern volatile int fault_type;              \
-  bool repeat;                                 \
-  bool do_time = false;                        \
-  ticks start = 0, stop = 0;                   \
-  if (fault_type != 0) {                       \
-    printf("Uncaught fault %d\n", fault_type); \
-    fault_type = 0;                            \
-  }                                            \
-  do {                                         \
-    repeat = false;                            \
-    if (do_time) {                             \
-      start = getticks();                      \
+#define RAJA_FT_BEGIN                                                          \
+  extern volatile int fault_type;                                              \
+  bool repeat;                                                                 \
+  bool do_time = false;                                                        \
+  ticks start = 0, stop = 0;                                                   \
+  if (fault_type != 0)                                                         \
+  {                                                                            \
+    printf("Uncaught fault %d\n", fault_type);                                 \
+    fault_type = 0;                                                            \
+  }                                                                            \
+  do                                                                           \
+  {                                                                            \
+    repeat = false;                                                            \
+    if (do_time)                                                               \
+    {                                                                          \
+      start = getticks();                                                      \
     }
 
-#define RAJA_FT_END                                                          \
-  if (do_time) {                                                             \
-    stop = getticks();                                                       \
-    printf("recoverable fault clock cycles = %16f\n", elapsed(stop, start)); \
-    do_time = false;                                                         \
-    fault_type = 0;                                                          \
-  }                                                                          \
-  if (fault_type < 0) {                                                      \
-    printf("Unrecoverable fault (restart penalty)\n");                       \
-    fault_type = 0;                                                          \
-  }                                                                          \
-  if (fault_type > 0) {                                                      \
-    /* invalidate cache */                                                   \
-    repeat = true;                                                           \
-    do_time = true;                                                          \
-  }                                                                          \
-  }                                                                          \
-  while (repeat == true)                                                     \
+#define RAJA_FT_END                                                            \
+  if (do_time)                                                                 \
+  {                                                                            \
+    stop = getticks();                                                         \
+    printf("recoverable fault clock cycles = %16f\n", elapsed(stop, start));   \
+    do_time    = false;                                                        \
+    fault_type = 0;                                                            \
+  }                                                                            \
+  if (fault_type < 0)                                                          \
+  {                                                                            \
+    printf("Unrecoverable fault (restart penalty)\n");                         \
+    fault_type = 0;                                                            \
+  }                                                                            \
+  if (fault_type > 0)                                                          \
+  {                                                                            \
+    /* invalidate cache */                                                     \
+    repeat  = true;                                                            \
+    do_time = true;                                                            \
+  }                                                                            \
+  }                                                                            \
+  while (repeat == true)                                                       \
     ;
 
 #else
-#define RAJA_FT_BEGIN             \
-  extern volatile int fault_type; \
-  bool repeat;                    \
-  if (fault_type == 0) {          \
-    do {                          \
+#define RAJA_FT_BEGIN                                                          \
+  extern volatile int fault_type;                                              \
+  bool repeat;                                                                 \
+  if (fault_type == 0)                                                         \
+  {                                                                            \
+    do                                                                         \
+    {                                                                          \
       repeat = false;
 
-#define RAJA_FT_END        \
-  if (fault_type > 0) {    \
-    /* invalidate cache */ \
-    repeat = true;         \
-    fault_type = 0;        \
-  }                        \
-  }                        \
-  while (repeat == true)   \
-    ;                      \
-  }                        \
-  else { fault_type = 0; /* ignore for the simulation */ }
+#define RAJA_FT_END                                                            \
+  if (fault_type > 0)                                                          \
+  {                                                                            \
+    /* invalidate cache */                                                     \
+    repeat     = true;                                                         \
+    fault_type = 0;                                                            \
+  }                                                                            \
+  }                                                                            \
+  while (repeat == true)                                                       \
+    ;                                                                          \
+  }                                                                            \
+  else                                                                         \
+  {                                                                            \
+    fault_type = 0; /* ignore for the simulation */                            \
+  }
 
 #endif  // RAJA_REPORT_FT
 
diff --git a/include/RAJA/internal/foldl.hpp b/include/RAJA/internal/foldl.hpp
index af65c05392..844159f82b 100644
--- a/include/RAJA/internal/foldl.hpp
+++ b/include/RAJA/internal/foldl.hpp
@@ -30,7 +30,6 @@
 
 #include "RAJA/util/macros.hpp"
 
-
 namespace RAJA
 {
 
@@ -40,47 +39,54 @@ namespace RAJA
 namespace detail
 {
 // FoldL
-template <typename Op, typename... Rest>
+template<typename Op, typename... Rest>
 struct foldl_impl;
 
-template <typename Op, typename Arg1>
-struct foldl_impl<Op, Arg1> {
+template<typename Op, typename Arg1>
+struct foldl_impl<Op, Arg1>
+{
   using Ret = Arg1;
 };
 
 #if RAJA_HAS_CXX17_IS_INVOCABLE
 
-template <typename Op, typename Arg1, typename Arg2>
-struct foldl_impl<Op, Arg1, Arg2> {
+template<typename Op, typename Arg1, typename Arg2>
+struct foldl_impl<Op, Arg1, Arg2>
+{
   using Ret = typename std::invoke_result<Op, Arg1, Arg2>::type;
 };
 
-template <typename Op,
-          typename Arg1,
-          typename Arg2,
-          typename Arg3,
-          typename... Rest>
-struct foldl_impl<Op, Arg1, Arg2, Arg3, Rest...> {
-  using Ret = typename foldl_impl<
-      Op,
-      typename std::invoke_result<Op, typename std::invoke_result<Op, Arg1, Arg2>::type,
-                                      Arg3>::type,
-      Rest...>::Ret;
+template<typename Op,
+         typename Arg1,
+         typename Arg2,
+         typename Arg3,
+         typename... Rest>
+struct foldl_impl<Op, Arg1, Arg2, Arg3, Rest...>
+{
+  using Ret =
+      typename foldl_impl<Op,
+                          typename std::invoke_result<
+                              Op,
+                              typename std::invoke_result<Op, Arg1, Arg2>::type,
+                              Arg3>::type,
+                          Rest...>::Ret;
 };
 
 #else
 
-template <typename Op, typename Arg1, typename Arg2>
-struct foldl_impl<Op, Arg1, Arg2> {
+template<typename Op, typename Arg1, typename Arg2>
+struct foldl_impl<Op, Arg1, Arg2>
+{
   using Ret = typename std::result_of<Op(Arg1, Arg2)>::type;
 };
 
-template <typename Op,
-          typename Arg1,
-          typename Arg2,
-          typename Arg3,
-          typename... Rest>
-struct foldl_impl<Op, Arg1, Arg2, Arg3, Rest...> {
+template<typename Op,
+         typename Arg1,
+         typename Arg2,
+         typename Arg3,
+         typename... Rest>
+struct foldl_impl<Op, Arg1, Arg2, Arg3, Rest...>
+{
   using Ret = typename foldl_impl<
       Op,
       typename std::result_of<Op(typename std::result_of<Op(Arg1, Arg2)>::type,
@@ -90,9 +96,9 @@ struct foldl_impl<Op, Arg1, Arg2, Arg3, Rest...> {
 
 #endif
 
-} // namespace detail
+}  // namespace detail
 
-template <typename Op, typename Arg1>
+template<typename Op, typename Arg1>
 RAJA_HOST_DEVICE RAJA_INLINE constexpr auto foldl(
     Op&& RAJA_UNUSED_ARG(operation),
     Arg1&& arg) -> typename detail::foldl_impl<Op, Arg1>::Ret
@@ -100,7 +106,7 @@ RAJA_HOST_DEVICE RAJA_INLINE constexpr auto foldl(
   return camp::forward<Arg1>(arg);
 }
 
-template <typename Op, typename Arg1, typename Arg2>
+template<typename Op, typename Arg1, typename Arg2>
 RAJA_HOST_DEVICE RAJA_INLINE constexpr auto foldl(Op&& operation,
                                                   Arg1&& arg1,
                                                   Arg2&& arg2) ->
@@ -110,11 +116,11 @@ RAJA_HOST_DEVICE RAJA_INLINE constexpr auto foldl(Op&& operation,
                                       camp::forward<Arg2>(arg2));
 }
 
-template <typename Op,
-          typename Arg1,
-          typename Arg2,
-          typename Arg3,
-          typename... Rest>
+template<typename Op,
+         typename Arg1,
+         typename Arg2,
+         typename Arg3,
+         typename... Rest>
 RAJA_HOST_DEVICE RAJA_INLINE constexpr auto foldl(Op&& operation,
                                                   Arg1&& arg1,
                                                   Arg2&& arg2,
@@ -130,27 +136,26 @@ RAJA_HOST_DEVICE RAJA_INLINE constexpr auto foldl(Op&& operation,
                camp::forward<Rest>(rest)...);
 }
 
-
 // Convenience folds
-template <typename Result, typename... Args>
+template<typename Result, typename... Args>
 RAJA_HOST_DEVICE RAJA_INLINE constexpr Result sum(Args... args)
 {
   return foldl(RAJA::operators::plus<Result>(), args...);
 }
 
-template <typename Result, typename... Args>
+template<typename Result, typename... Args>
 RAJA_HOST_DEVICE RAJA_INLINE constexpr Result product(Args... args)
 {
   return foldl(RAJA::operators::multiplies<Result>(), args...);
 }
 
-template <typename Result, typename... Args>
+template<typename Result, typename... Args>
 RAJA_HOST_DEVICE RAJA_INLINE constexpr Result max(Args... args)
 {
   return foldl(RAJA::operators::maximum<Result>(), args...);
 }
 
-template <typename Result, typename... Args>
+template<typename Result, typename... Args>
 RAJA_HOST_DEVICE RAJA_INLINE constexpr Result min(Args... args)
 {
   return foldl(RAJA::operators::minimum<Result>(), args...);
diff --git a/include/RAJA/internal/get_platform.hpp b/include/RAJA/internal/get_platform.hpp
index 0354d04bfd..ca568cf221 100644
--- a/include/RAJA/internal/get_platform.hpp
+++ b/include/RAJA/internal/get_platform.hpp
@@ -8,19 +8,23 @@
 namespace RAJA
 {
 
-namespace policy {
-namespace multi {
-template <typename Selector, typename... Policies>
+namespace policy
+{
+namespace multi
+{
+template<typename Selector, typename... Policies>
 class MultiPolicy;
 
 }
-}
+}  // namespace policy
 
-namespace detail 
+namespace detail
 {
 
-struct max_platform {
+struct max_platform
+{
   RAJA_HOST_DEVICE
+
   RAJA_INLINE
   constexpr RAJA::Platform operator()(const RAJA::Platform& l,
                                       const RAJA::Platform& r) const
@@ -33,19 +37,20 @@ struct max_platform {
  * Returns the platform for the specified execution policy.
  * This is a catch-all, so anything undefined gets Platform::undefined
  */
-template <typename T, typename = void>
-struct get_platform {
+template<typename T, typename = void>
+struct get_platform
+{
   // catch-all: undefined platform
   static constexpr Platform value = Platform::undefined;
 };
 
-
 /*!
  * Takes a list of policies, extracts their platforms, and provides the
  * reduction of them all.
  */
-template <typename... Policies>
-struct get_platform_from_list {
+template<typename... Policies>
+struct get_platform_from_list
+{
   static constexpr Platform value =
       foldl(max_platform(), get_platform<Policies>::value...);
 };
@@ -53,42 +58,41 @@ struct get_platform_from_list {
 /*!
  * Define an empty list as Platform::undefined;
  */
-template <>
-struct get_platform_from_list<> {
+template<>
+struct get_platform_from_list<>
+{
   static constexpr Platform value = Platform::undefined;
 };
 
-
 /*!
  * Specialization to define the platform for anything derived from PolicyBase,
  * which should catch all standard policies.
  *
  * (not for MultiPolicy or nested::Policy)
  */
-template <typename T>
+template<typename T>
 struct get_platform<T,
-                    typename std::
-                        enable_if<std::is_base_of<RAJA::PolicyBase, T>::value
-                                  && !RAJA::type_traits::is_indexset_policy<T>::
-                                         value>::type> {
+                    typename std::enable_if<
+                        std::is_base_of<RAJA::PolicyBase, T>::value &&
+                        !RAJA::type_traits::is_indexset_policy<T>::value>::type>
+{
 
   static constexpr Platform value = T::platform;
 };
 
-
 /*!
  * Specialization to define the platform for an IndexSet execution policy.
  *
  * Examines both segment iteration and segment execution policies.
  */
-template <typename SEG, typename EXEC>
+template<typename SEG, typename EXEC>
 struct get_platform<RAJA::ExecPolicy<SEG, EXEC>>
-    : public get_platform_from_list<SEG, EXEC> {
-};
+    : public get_platform_from_list<SEG, EXEC>
+{};
 
-
-template <typename T>
-struct get_statement_platform {
+template<typename T>
+struct get_statement_platform
+{
   static constexpr Platform value =
       get_platform_from_list<typename T::execution_policy_t,
                              typename T::enclosed_statements_t>::value;
@@ -101,8 +105,9 @@ struct get_statement_platform {
  * This collects the Platform from each of it's statements, recursing into
  * each of them.
  */
-template <typename... Stmts>
-struct get_platform<RAJA::internal::StatementList<Stmts...>> {
+template<typename... Stmts>
+struct get_platform<RAJA::internal::StatementList<Stmts...>>
+{
   static constexpr Platform value =
       foldl(max_platform(), get_statement_platform<Stmts>::value...);
 };
@@ -110,21 +115,22 @@ struct get_platform<RAJA::internal::StatementList<Stmts...>> {
 /*!
  * Specialize for an empty statement list to be undefined
  */
-template <>
-struct get_platform<RAJA::internal::StatementList<>> {
+template<>
+struct get_platform<RAJA::internal::StatementList<>>
+{
   static constexpr Platform value = Platform::undefined;
 };
 
-
 // Top level MultiPolicy shouldn't select a platform
 // Once a specific policy is selected, that policy will select the correct
 // platform... see policy_invoker in MultiPolicy.hpp
-template <typename SELECTOR, typename... POLICIES>
-struct get_platform<RAJA::policy::multi::MultiPolicy<SELECTOR, POLICIES...>> {
+template<typename SELECTOR, typename... POLICIES>
+struct get_platform<RAJA::policy::multi::MultiPolicy<SELECTOR, POLICIES...>>
+{
   static constexpr Platform value = Platform::undefined;
 };
 
-} // closing brace for detail namespace
-} // closing brace for RAJA namespace
+}  // namespace detail
+}  // namespace RAJA
 
-#endif // RAJA_get_platform_HPP
+#endif  // RAJA_get_platform_HPP
diff --git a/include/RAJA/pattern/WorkGroup.hpp b/include/RAJA/pattern/WorkGroup.hpp
index 767821b8d8..740e2e64e9 100644
--- a/include/RAJA/pattern/WorkGroup.hpp
+++ b/include/RAJA/pattern/WorkGroup.hpp
@@ -38,39 +38,44 @@ namespace RAJA
  *
  * \verbatim
 
-   WorkPool<WorkGroup_policy, Index_type, xargs<int*, int>, Allocator> pool(allocator);
+   WorkPool<WorkGroup_policy, Index_type, xargs<int*, int>, Allocator>
+       pool(allocator);
 
    pool.enqueue(..., [=] (Index_type i, int* xarg0, int xarg1) {
       xarg0[i] = xarg1;
    });
 
-   WorkGroup<WorkGroup_policy, Index_type, xargs<int*, int>, Allocator> group = pool.instantiate();
+   WorkGroup<WorkGroup_policy, Index_type, xargs<int*, int>, Allocator> group =
+       pool.instantiate();
 
    int* xarg0 = ...;
    int xarg1 = ...;
-   WorkSite<WorkGroup_policy, Index_type, xargs<int*, int>, Allocator> site = group.run(xarg0, xarg1);
+   WorkSite<WorkGroup_policy, Index_type, xargs<int*, int>, Allocator> site =
+       group.run(xarg0, xarg1);
 
  * \endverbatim
  *
  ******************************************************************************
  */
-template < typename ... Args >
+template<typename... Args>
 using xargs = camp::list<Args...>;
 
-namespace detail {
+namespace detail
+{
 
-template < typename T >
-struct is_xargs {
+template<typename T>
+struct is_xargs
+{
   static constexpr bool value = false;
 };
 
-template < typename ... Args >
-struct is_xargs<xargs<Args...>> {
+template<typename... Args>
+struct is_xargs<xargs<Args...>>
+{
   static constexpr bool value = true;
 };
 
-}
-
+}  // namespace detail
 
 //
 // Forward declarations for WorkPool and WorkGroup templates.
@@ -102,21 +107,24 @@ struct is_xargs<xargs<Args...>> {
       data[i] = 1;
    });
 
-   WorkGroup<WorkGroup_policy, Index_type, xargs<>, Allocator> group = pool.instantiate();
+   WorkGroup<WorkGroup_policy, Index_type, xargs<>, Allocator> group =
+       pool.instantiate();
 
  * \endverbatim
  *
  ******************************************************************************
  */
-template <typename WORKGROUP_POLICY_T,
-          typename INDEX_T,
-          typename EXTRA_ARGS_T,
-          typename ALLOCATOR_T>
-struct WorkPool {
-  static_assert(RAJA::pattern_is<WORKGROUP_POLICY_T, RAJA::Pattern::workgroup>::value,
+template<typename WORKGROUP_POLICY_T,
+         typename INDEX_T,
+         typename EXTRA_ARGS_T,
+         typename ALLOCATOR_T>
+struct WorkPool
+{
+  static_assert(
+      RAJA::pattern_is<WORKGROUP_POLICY_T, RAJA::Pattern::workgroup>::value,
       "WorkPool: WORKGROUP_POLICY_T must be a workgroup policy");
   static_assert(detail::is_xargs<EXTRA_ARGS_T>::value,
-      "WorkPool: EXTRA_ARGS_T must be a RAJA::xargs<...> type");
+                "WorkPool: EXTRA_ARGS_T must be a RAJA::xargs<...> type");
 };
 
 /*!
@@ -135,23 +143,27 @@ struct WorkPool {
  *
  * \verbatim
 
-   WorkGroup<WorkGroup_policy, Index_type, xargs<>, Allocator> group = pool.instantiate();
+   WorkGroup<WorkGroup_policy, Index_type, xargs<>, Allocator> group =
+       pool.instantiate();
 
-   WorkSite<WorkGroup_policy, Index_type, xargs<>, Allocator> site = group.run();
+   WorkSite<WorkGroup_policy, Index_type, xargs<>, Allocator> site =
+       group.run();
 
  * \endverbatim
  *
  ******************************************************************************
  */
-template <typename WORKGROUP_POLICY_T,
-          typename INDEX_T,
-          typename EXTRA_ARGS_T,
-          typename ALLOCATOR_T>
-struct WorkGroup {
-  static_assert(RAJA::pattern_is<WORKGROUP_POLICY_T, RAJA::Pattern::workgroup>::value,
+template<typename WORKGROUP_POLICY_T,
+         typename INDEX_T,
+         typename EXTRA_ARGS_T,
+         typename ALLOCATOR_T>
+struct WorkGroup
+{
+  static_assert(
+      RAJA::pattern_is<WORKGROUP_POLICY_T, RAJA::Pattern::workgroup>::value,
       "WorkGroup: WORKGROUP_POLICY_T must be a workgroup policy");
   static_assert(detail::is_xargs<EXTRA_ARGS_T>::value,
-      "WorkGroup: EXTRA_ARGS_T must be a RAJA::xargs<...> type");
+                "WorkGroup: EXTRA_ARGS_T must be a RAJA::xargs<...> type");
 };
 
 /*!
@@ -170,7 +182,8 @@ struct WorkGroup {
  *
  * \verbatim
 
-   WorkSite<WorkGroup_policy, Index_type, xargs<>, Allocator> site = group.run();
+   WorkSite<WorkGroup_policy, Index_type, xargs<>, Allocator> site =
+       group.run();
 
    site.synchronize();
 
@@ -178,25 +191,26 @@ struct WorkGroup {
  *
  ******************************************************************************
  */
-template <typename WORKGROUP_POLICY_T,
-          typename INDEX_T,
-          typename EXTRA_ARGS_T,
-          typename ALLOCATOR_T>
-struct WorkSite {
-  static_assert(RAJA::pattern_is<WORKGROUP_POLICY_T, RAJA::Pattern::workgroup>::value,
+template<typename WORKGROUP_POLICY_T,
+         typename INDEX_T,
+         typename EXTRA_ARGS_T,
+         typename ALLOCATOR_T>
+struct WorkSite
+{
+  static_assert(
+      RAJA::pattern_is<WORKGROUP_POLICY_T, RAJA::Pattern::workgroup>::value,
       "WorkSite: WORKGROUP_POLICY_T must be a workgroup policy");
   static_assert(detail::is_xargs<EXTRA_ARGS_T>::value,
-      "WorkSite: EXTRA_ARGS_T must be a RAJA::xargs<...> type");
+                "WorkSite: EXTRA_ARGS_T must be a RAJA::xargs<...> type");
 };
 
-
-template <typename EXEC_POLICY_T,
-          typename ORDER_POLICY_T,
-          typename STORAGE_POLICY_T,
-          typename DISPATCH_POLICY_T,
-          typename INDEX_T,
-          typename ... Args,
-          typename ALLOCATOR_T>
+template<typename EXEC_POLICY_T,
+         typename ORDER_POLICY_T,
+         typename STORAGE_POLICY_T,
+         typename DISPATCH_POLICY_T,
+         typename INDEX_T,
+         typename... Args,
+         typename ALLOCATOR_T>
 struct WorkPool<WorkGroupPolicy<EXEC_POLICY_T,
                                 ORDER_POLICY_T,
                                 STORAGE_POLICY_T,
@@ -205,23 +219,32 @@ struct WorkPool<WorkGroupPolicy<EXEC_POLICY_T,
                 xargs<Args...>,
                 ALLOCATOR_T>
 {
-  using exec_policy = EXEC_POLICY_T;
-  using order_policy = ORDER_POLICY_T;
-  using storage_policy = STORAGE_POLICY_T;
+  using exec_policy     = EXEC_POLICY_T;
+  using order_policy    = ORDER_POLICY_T;
+  using storage_policy  = STORAGE_POLICY_T;
   using dispatch_policy = DISPATCH_POLICY_T;
-  using policy = WorkGroupPolicy<exec_policy, order_policy, storage_policy, dispatch_policy>;
-  using index_type = INDEX_T;
-  using xarg_type = xargs<Args...>;
-  using Allocator = ALLOCATOR_T;
+  using policy          = WorkGroupPolicy<exec_policy,
+                                 order_policy,
+                                 storage_policy,
+                                 dispatch_policy>;
+  using index_type      = INDEX_T;
+  using xarg_type       = xargs<Args...>;
+  using Allocator       = ALLOCATOR_T;
 
   using workgroup_type = WorkGroup<policy, index_type, xarg_type, Allocator>;
-  using worksite_type = WorkSite<policy, index_type, xarg_type, Allocator>;
+  using worksite_type  = WorkSite<policy, index_type, xarg_type, Allocator>;
 
 private:
-  using workrunner_type = detail::WorkRunner<
-      exec_policy, order_policy, dispatch_policy, Allocator, index_type, Args...>;
-  using storage_type = detail::WorkStorage<
-      storage_policy, Allocator, typename workrunner_type::dispatcher_type>;
+  using workrunner_type = detail::WorkRunner<exec_policy,
+                                             order_policy,
+                                             dispatch_policy,
+                                             Allocator,
+                                             index_type,
+                                             Args...>;
+  using storage_type =
+      detail::WorkStorage<storage_policy,
+                          Allocator,
+                          typename workrunner_type::dispatcher_type>;
 
   friend workgroup_type;
   friend worksite_type;
@@ -229,52 +252,45 @@ struct WorkPool<WorkGroupPolicy<EXEC_POLICY_T,
 public:
   using resource_type = typename workrunner_type::resource_type;
 
-  explicit WorkPool(Allocator const& aloc)
-    : m_storage(aloc)
-  { }
+  explicit WorkPool(Allocator const& aloc) : m_storage(aloc) {}
 
-  WorkPool(WorkPool const&) = delete;
+  WorkPool(WorkPool const&)            = delete;
   WorkPool& operator=(WorkPool const&) = delete;
 
-  WorkPool(WorkPool&&) = default;
+  WorkPool(WorkPool&&)            = default;
   WorkPool& operator=(WorkPool&&) = default;
 
-  size_t num_loops() const
-  {
-    return m_storage.size();
-  }
+  size_t num_loops() const { return m_storage.size(); }
 
-  size_t storage_bytes() const
-  {
-    return m_storage.storage_size();
-  }
+  size_t storage_bytes() const { return m_storage.storage_size(); }
 
   void reserve(size_t num_loops, size_t storage_bytes)
   {
     m_storage.reserve(num_loops, storage_bytes);
   }
 
-  template < typename segment_T, typename loop_T >
+  template<typename segment_T, typename loop_T>
   inline void enqueue(segment_T&& seg, loop_T&& loop_body)
   {
     {
       // ignore zero length loops
-      using std::begin; using std::end;
+      using std::begin;
+      using std::end;
       if (begin(seg) == end(seg)) return;
     }
-    if (m_storage.begin() == m_storage.end()) {
+    if (m_storage.begin() == m_storage.end())
+    {
       // perform auto-reserve on reuse
       reserve(m_max_num_loops, m_max_storage_bytes);
     }
 
-    util::PluginContext context{util::make_context<exec_policy>()};
+    util::PluginContext context {util::make_context<exec_policy>()};
     util::callPreCapturePlugins(context);
 
     using RAJA::util::trigger_updates_before;
     auto body = trigger_updates_before(loop_body);
 
-    m_runner.enqueue(
-        m_storage, std::forward<segment_T>(seg), std::move(body));
+    m_runner.enqueue(m_storage, std::forward<segment_T>(seg), std::move(body));
 
     util::callPostCapturePlugins(context);
   }
@@ -289,26 +305,23 @@ struct WorkPool<WorkGroupPolicy<EXEC_POLICY_T,
     m_runner.clear();
   }
 
-  ~WorkPool()
-  {
-    clear();
-  }
+  ~WorkPool() { clear(); }
 
 private:
   storage_type m_storage;
-  size_t m_max_num_loops = 0;
+  size_t m_max_num_loops     = 0;
   size_t m_max_storage_bytes = 0;
 
   workrunner_type m_runner;
 };
 
-template <typename EXEC_POLICY_T,
-          typename ORDER_POLICY_T,
-          typename STORAGE_POLICY_T,
-          typename DISPATCH_POLICY_T,
-          typename INDEX_T,
-          typename ... Args,
-          typename ALLOCATOR_T>
+template<typename EXEC_POLICY_T,
+         typename ORDER_POLICY_T,
+         typename STORAGE_POLICY_T,
+         typename DISPATCH_POLICY_T,
+         typename INDEX_T,
+         typename... Args,
+         typename ALLOCATOR_T>
 struct WorkGroup<WorkGroupPolicy<EXEC_POLICY_T,
                                  ORDER_POLICY_T,
                                  STORAGE_POLICY_T,
@@ -317,20 +330,23 @@ struct WorkGroup<WorkGroupPolicy<EXEC_POLICY_T,
                  xargs<Args...>,
                  ALLOCATOR_T>
 {
-  using exec_policy = EXEC_POLICY_T;
-  using order_policy = ORDER_POLICY_T;
-  using storage_policy = STORAGE_POLICY_T;
+  using exec_policy     = EXEC_POLICY_T;
+  using order_policy    = ORDER_POLICY_T;
+  using storage_policy  = STORAGE_POLICY_T;
   using dispatch_policy = DISPATCH_POLICY_T;
-  using policy = WorkGroupPolicy<exec_policy, order_policy, storage_policy, dispatch_policy>;
-  using index_type = INDEX_T;
-  using xarg_type = xargs<Args...>;
-  using Allocator = ALLOCATOR_T;
+  using policy          = WorkGroupPolicy<exec_policy,
+                                 order_policy,
+                                 storage_policy,
+                                 dispatch_policy>;
+  using index_type      = INDEX_T;
+  using xarg_type       = xargs<Args...>;
+  using Allocator       = ALLOCATOR_T;
 
   using workpool_type = WorkPool<policy, index_type, xarg_type, Allocator>;
   using worksite_type = WorkSite<policy, index_type, xarg_type, Allocator>;
 
 private:
-  using storage_type = typename workpool_type::storage_type;
+  using storage_type    = typename workpool_type::storage_type;
   using workrunner_type = typename workpool_type::workrunner_type;
 
   friend workpool_type;
@@ -339,15 +355,16 @@ struct WorkGroup<WorkGroupPolicy<EXEC_POLICY_T,
 public:
   using resource_type = typename workpool_type::resource_type;
 
-  WorkGroup(WorkGroup const&) = delete;
+  WorkGroup(WorkGroup const&)            = delete;
   WorkGroup& operator=(WorkGroup const&) = delete;
 
-  WorkGroup(WorkGroup&&) = default;
+  WorkGroup(WorkGroup&&)            = default;
   WorkGroup& operator=(WorkGroup&&) = default;
 
   inline worksite_type run(resource_type r, Args...);
 
-  worksite_type run(Args... args) {
+  worksite_type run(Args... args)
+  {
     auto r = resource_type::get_default();
     return run(r, std::move(args)...);
   }
@@ -360,28 +377,25 @@ struct WorkGroup<WorkGroupPolicy<EXEC_POLICY_T,
     m_runner.clear();
   }
 
-  ~WorkGroup()
-  {
-    clear();
-  }
+  ~WorkGroup() { clear(); }
 
 private:
   storage_type m_storage;
   workrunner_type m_runner;
 
   WorkGroup(storage_type&& storage, workrunner_type&& runner)
-    : m_storage(std::move(storage))
-    , m_runner(std::move(runner))
-  { }
+      : m_storage(std::move(storage)),
+        m_runner(std::move(runner))
+  {}
 };
 
-template <typename EXEC_POLICY_T,
-          typename ORDER_POLICY_T,
-          typename STORAGE_POLICY_T,
-          typename DISPATCH_POLICY_T,
-          typename INDEX_T,
-          typename ... Args,
-          typename ALLOCATOR_T>
+template<typename EXEC_POLICY_T,
+         typename ORDER_POLICY_T,
+         typename STORAGE_POLICY_T,
+         typename DISPATCH_POLICY_T,
+         typename INDEX_T,
+         typename... Args,
+         typename ALLOCATOR_T>
 struct WorkSite<WorkGroupPolicy<EXEC_POLICY_T,
                                 ORDER_POLICY_T,
                                 STORAGE_POLICY_T,
@@ -390,16 +404,19 @@ struct WorkSite<WorkGroupPolicy<EXEC_POLICY_T,
                 xargs<Args...>,
                 ALLOCATOR_T>
 {
-  using exec_policy = EXEC_POLICY_T;
-  using order_policy = ORDER_POLICY_T;
-  using storage_policy = STORAGE_POLICY_T;
+  using exec_policy     = EXEC_POLICY_T;
+  using order_policy    = ORDER_POLICY_T;
+  using storage_policy  = STORAGE_POLICY_T;
   using dispatch_policy = DISPATCH_POLICY_T;
-  using policy = WorkGroupPolicy<exec_policy, order_policy, storage_policy, dispatch_policy>;
-  using index_type = INDEX_T;
-  using xarg_type = xargs<Args...>;
-  using Allocator = ALLOCATOR_T;
-
-  using workpool_type = WorkPool<policy, index_type, xarg_type, Allocator>;
+  using policy          = WorkGroupPolicy<exec_policy,
+                                 order_policy,
+                                 storage_policy,
+                                 dispatch_policy>;
+  using index_type      = INDEX_T;
+  using xarg_type       = xargs<Args...>;
+  using Allocator       = ALLOCATOR_T;
+
+  using workpool_type  = WorkPool<policy, index_type, xarg_type, Allocator>;
   using workgroup_type = WorkGroup<policy, index_type, xarg_type, Allocator>;
 
 private:
@@ -412,16 +429,13 @@ struct WorkSite<WorkGroupPolicy<EXEC_POLICY_T,
 public:
   using resource_type = typename workpool_type::resource_type;
 
-  WorkSite(WorkSite const&) = delete;
+  WorkSite(WorkSite const&)            = delete;
   WorkSite& operator=(WorkSite const&) = delete;
 
-  WorkSite(WorkSite&&) = default;
+  WorkSite(WorkSite&&)            = default;
   WorkSite& operator=(WorkSite&&) = default;
 
-  resource_type get_resource() const
-  {
-    return m_resource;
-  }
+  resource_type get_resource() const { return m_resource; }
 
   void clear()
   {
@@ -429,78 +443,84 @@ struct WorkSite<WorkGroupPolicy<EXEC_POLICY_T,
     // TODO: synchronize
   }
 
-  ~WorkSite()
-  {
-    clear();
-  }
+  ~WorkSite() { clear(); }
 
 private:
   per_run_storage m_run_storage;
   resource_type m_resource;
 
   explicit WorkSite(resource_type r, per_run_storage&& run_storage)
-    : m_run_storage(std::move(run_storage))
-    , m_resource(r)
-  { }
+      : m_run_storage(std::move(run_storage)),
+        m_resource(r)
+  {}
 };
 
-
-template <typename EXEC_POLICY_T,
-          typename ORDER_POLICY_T,
-          typename STORAGE_POLICY_T,
-          typename DISPATCH_POLICY_T,
-          typename INDEX_T,
-          typename ... Args,
-          typename ALLOCATOR_T>
-inline
-typename WorkPool<
-    WorkGroupPolicy<EXEC_POLICY_T, ORDER_POLICY_T, STORAGE_POLICY_T, DISPATCH_POLICY_T>,
-    INDEX_T,
-    xargs<Args...>,
-    ALLOCATOR_T>::workgroup_type
-WorkPool<
-    WorkGroupPolicy<EXEC_POLICY_T, ORDER_POLICY_T, STORAGE_POLICY_T, DISPATCH_POLICY_T>,
-    INDEX_T,
-    xargs<Args...>,
-    ALLOCATOR_T>::instantiate()
+template<typename EXEC_POLICY_T,
+         typename ORDER_POLICY_T,
+         typename STORAGE_POLICY_T,
+         typename DISPATCH_POLICY_T,
+         typename INDEX_T,
+         typename... Args,
+         typename ALLOCATOR_T>
+inline typename WorkPool<WorkGroupPolicy<EXEC_POLICY_T,
+                                         ORDER_POLICY_T,
+                                         STORAGE_POLICY_T,
+                                         DISPATCH_POLICY_T>,
+                         INDEX_T,
+                         xargs<Args...>,
+                         ALLOCATOR_T>::workgroup_type
+WorkPool<WorkGroupPolicy<EXEC_POLICY_T,
+                         ORDER_POLICY_T,
+                         STORAGE_POLICY_T,
+                         DISPATCH_POLICY_T>,
+         INDEX_T,
+         xargs<Args...>,
+         ALLOCATOR_T>::instantiate()
 {
   // update max sizes to auto-reserve on reuse
-  m_max_num_loops = std::max(m_storage.size(), m_max_num_loops);
+  m_max_num_loops     = std::max(m_storage.size(), m_max_num_loops);
   m_max_storage_bytes = std::max(m_storage.storage_size(), m_max_storage_bytes);
 
   // move storage into workgroup
-  return workgroup_type{std::move(m_storage), std::move(m_runner)};
+  return workgroup_type {std::move(m_storage), std::move(m_runner)};
 }
 
-template <typename EXEC_POLICY_T,
-          typename ORDER_POLICY_T,
-          typename STORAGE_POLICY_T,
-          typename DISPATCH_POLICY_T,
-          typename INDEX_T,
-          typename ... Args,
-          typename ALLOCATOR_T>
-inline
-typename WorkGroup<
-    WorkGroupPolicy<EXEC_POLICY_T, ORDER_POLICY_T, STORAGE_POLICY_T, DISPATCH_POLICY_T>,
-    INDEX_T,
-    xargs<Args...>,
-    ALLOCATOR_T>::worksite_type
+template<typename EXEC_POLICY_T,
+         typename ORDER_POLICY_T,
+         typename STORAGE_POLICY_T,
+         typename DISPATCH_POLICY_T,
+         typename INDEX_T,
+         typename... Args,
+         typename ALLOCATOR_T>
+inline typename WorkGroup<WorkGroupPolicy<EXEC_POLICY_T,
+                                          ORDER_POLICY_T,
+                                          STORAGE_POLICY_T,
+                                          DISPATCH_POLICY_T>,
+                          INDEX_T,
+                          xargs<Args...>,
+                          ALLOCATOR_T>::worksite_type
 WorkGroup<
-    WorkGroupPolicy<EXEC_POLICY_T, ORDER_POLICY_T, STORAGE_POLICY_T, DISPATCH_POLICY_T>,
+    WorkGroupPolicy<EXEC_POLICY_T,
+                    ORDER_POLICY_T,
+                    STORAGE_POLICY_T,
+                    DISPATCH_POLICY_T>,
     INDEX_T,
     xargs<Args...>,
-    ALLOCATOR_T>::run(typename WorkGroup<
-                          WorkGroupPolicy<EXEC_POLICY_T, ORDER_POLICY_T, STORAGE_POLICY_T, DISPATCH_POLICY_T>,
-                          INDEX_T,
-                          xargs<Args...>,
-                          ALLOCATOR_T>::resource_type r,
+    ALLOCATOR_T>::run(typename WorkGroup<WorkGroupPolicy<EXEC_POLICY_T,
+                                                         ORDER_POLICY_T,
+                                                         STORAGE_POLICY_T,
+                                                         DISPATCH_POLICY_T>,
+                                         INDEX_T,
+                                         xargs<Args...>,
+                                         ALLOCATOR_T>::resource_type r,
                       Args... args)
 {
-  util::PluginContext context{util::make_context<EXEC_POLICY_T>()};
+  util::PluginContext context {util::make_context<EXEC_POLICY_T>()};
   util::callPreLaunchPlugins(context);
 
   // move any per run storage into worksite
-  worksite_type site(r, m_runner.run(m_storage, r, std::forward<Args>(args)...));
+  worksite_type site(r,
+                     m_runner.run(m_storage, r, std::forward<Args>(args)...));
 
   util::callPostLaunchPlugins(context);
 
diff --git a/include/RAJA/pattern/WorkGroup/Dispatcher.hpp b/include/RAJA/pattern/WorkGroup/Dispatcher.hpp
index 1eac283f4b..c74d433cf0 100644
--- a/include/RAJA/pattern/WorkGroup/Dispatcher.hpp
+++ b/include/RAJA/pattern/WorkGroup/Dispatcher.hpp
@@ -29,42 +29,43 @@
 
 #include <utility>
 
-
 namespace RAJA
 {
 
 namespace detail
 {
 
-template < typename >
+template<typename>
 struct DispatcherVoidPtrWrapper
 {
   void* ptr;
   DispatcherVoidPtrWrapper() = default;
+
   // implicit constructor from void*
-  RAJA_HOST_DEVICE DispatcherVoidPtrWrapper(void* p) : ptr(p) { }
+  RAJA_HOST_DEVICE DispatcherVoidPtrWrapper(void* p) : ptr(p) {}
 };
 
-template < typename >
+template<typename>
 struct DispatcherVoidConstPtrWrapper
 {
   const void* ptr;
   DispatcherVoidConstPtrWrapper() = default;
+
   // implicit constructor from const void*
-  RAJA_HOST_DEVICE DispatcherVoidConstPtrWrapper(const void* p) : ptr(p) { }
+  RAJA_HOST_DEVICE DispatcherVoidConstPtrWrapper(const void* p) : ptr(p) {}
 };
 
-
-constexpr bool dispatcher_use_host_invoke(Platform platform) {
+constexpr bool dispatcher_use_host_invoke(Platform platform)
+{
   return !(platform == Platform::cuda || platform == Platform::hip);
 }
 
 // Transforms one dispatch policy into another by creating a dispatch policy
 // of holder_type objects. See usage in WorkRunner for more explanation.
-template < typename dispatch_policy, typename holder_type >
+template<typename dispatch_policy, typename holder_type>
 struct dispatcher_transform_types;
 ///
-template < typename dispatch_policy, typename holder_type >
+template<typename dispatch_policy, typename holder_type>
 using dispatcher_transform_types_t =
     typename dispatcher_transform_types<dispatch_policy, holder_type>::type;
 
@@ -75,12 +76,16 @@ using dispatcher_transform_types_t =
  * DispatcherID is used to differentiate function pointers based on their
  * function signature.
  */
-template < Platform platform, typename dispatch_policy, typename DispatcherID, typename ... CallArgs >
+template<Platform platform,
+         typename dispatch_policy,
+         typename DispatcherID,
+         typename... CallArgs>
 struct Dispatcher;
 
-
-template < typename holder_type >
-struct dispatcher_transform_types<::RAJA::indirect_function_call_dispatch, holder_type> {
+template<typename holder_type>
+struct dispatcher_transform_types<::RAJA::indirect_function_call_dispatch,
+                                  holder_type>
+{
   using type = ::RAJA::indirect_function_call_dispatch;
 };
 
@@ -93,38 +98,45 @@ struct dispatcher_transform_types<::RAJA::indirect_function_call_dispatch, holde
  * during device linking when functions with high register counts may cause
  * device linking to fail.
  */
-template < Platform platform, typename DispatcherID, typename ... CallArgs >
-struct Dispatcher<platform, ::RAJA::indirect_function_call_dispatch, DispatcherID, CallArgs...> {
+template<Platform platform, typename DispatcherID, typename... CallArgs>
+struct Dispatcher<platform,
+                  ::RAJA::indirect_function_call_dispatch,
+                  DispatcherID,
+                  CallArgs...>
+{
   static constexpr bool use_host_invoke = dispatcher_use_host_invoke(platform);
-  using dispatch_policy = ::RAJA::indirect_function_call_dispatch;
-  using void_ptr_wrapper = DispatcherVoidPtrWrapper<DispatcherID>;
+  using dispatch_policy   = ::RAJA::indirect_function_call_dispatch;
+  using void_ptr_wrapper  = DispatcherVoidPtrWrapper<DispatcherID>;
   using void_cptr_wrapper = DispatcherVoidConstPtrWrapper<DispatcherID>;
 
   ///
   /// move construct an object of type T in dest as a copy of a T from src and
   /// destroy the T obj in src
   ///
-  template < typename T >
-  static void s_move_construct_destroy(void_ptr_wrapper dest, void_ptr_wrapper src)
+  template<typename T>
+  static void s_move_construct_destroy(void_ptr_wrapper dest,
+                                       void_ptr_wrapper src)
   {
     T* dest_as_T = static_cast<T*>(dest.ptr);
-    T* src_as_T = static_cast<T*>(src.ptr);
-    new(dest_as_T) T(std::move(*src_as_T));
+    T* src_as_T  = static_cast<T*>(src.ptr);
+    new (dest_as_T) T(std::move(*src_as_T));
     (*src_as_T).~T();
   }
 
   ///
   /// invoke the call operator of the object of type T in obj with args
   ///
-  template < typename T >
+  template<typename T>
   static void s_host_invoke(void_cptr_wrapper obj, CallArgs... args)
   {
     const T* obj_as_T = static_cast<const T*>(obj.ptr);
     (*obj_as_T)(std::forward<CallArgs>(args)...);
   }
+
   ///
-  template < typename T >
-  static RAJA_DEVICE void s_device_invoke(void_cptr_wrapper obj, CallArgs... args)
+  template<typename T>
+  static RAJA_DEVICE void s_device_invoke(void_cptr_wrapper obj,
+                                          CallArgs... args)
   {
     const T* obj_as_T = static_cast<const T*>(obj.ptr);
     (*obj_as_T)(std::forward<CallArgs>(args)...);
@@ -133,22 +145,27 @@ struct Dispatcher<platform, ::RAJA::indirect_function_call_dispatch, DispatcherI
   ///
   /// destroy the object of type T in obj
   ///
-  template < typename T >
+  template<typename T>
   static void s_destroy(void_ptr_wrapper obj)
   {
     T* obj_as_T = static_cast<T*>(obj.ptr);
     (*obj_as_T).~T();
   }
 
-  using mover_type = void(*)(void_ptr_wrapper /*dest*/, void_ptr_wrapper /*src*/);
-  using invoker_type = void(*)(void_cptr_wrapper /*obj*/, CallArgs... /*args*/);
-  using destroyer_type = void(*)(void_ptr_wrapper /*obj*/);
+  using mover_type     = void (*)(void_ptr_wrapper /*dest*/,
+                              void_ptr_wrapper /*src*/);
+  using invoker_type   = void (*)(void_cptr_wrapper /*obj*/,
+                                CallArgs... /*args*/);
+  using destroyer_type = void (*)(void_ptr_wrapper /*obj*/);
 
   // This can't be a cuda device lambda due to compiler limitations
-  template < typename T >
-  struct DeviceInvokerFactory {
+  template<typename T>
+  struct DeviceInvokerFactory
+  {
     using value_type = invoker_type;
-    RAJA_DEVICE value_type operator()() {
+
+    RAJA_DEVICE value_type operator()()
+    {
 #if defined(RAJA_ENABLE_HIP) && !defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL)
       return nullptr;
 #else
@@ -160,15 +177,16 @@ struct Dispatcher<platform, ::RAJA::indirect_function_call_dispatch, DispatcherI
   ///
   /// create a Dispatcher that can be used on the host for objects of type T
   ///
-  template< typename T,
-            bool uhi = use_host_invoke, std::enable_if_t<uhi>* = nullptr >
-  static inline Dispatcher makeDispatcher() {
-    return { mover_type{&s_move_construct_destroy<T>},
-             invoker_type{&s_host_invoke<T>},
-             destroyer_type{&s_destroy<T>},
-             sizeof(T)
-           };
+  template<typename T,
+           bool uhi               = use_host_invoke,
+           std::enable_if_t<uhi>* = nullptr>
+  static inline Dispatcher makeDispatcher()
+  {
+    return {mover_type {&s_move_construct_destroy<T>},
+            invoker_type {&s_host_invoke<T>}, destroyer_type {&s_destroy<T>},
+            sizeof(T)};
   }
+
   ///
   /// create a Dispatcher that can be used on the device for objects of type T
   ///
@@ -179,14 +197,16 @@ struct Dispatcher<platform, ::RAJA::indirect_function_call_dispatch, DispatcherI
   /// to create the invoker object. This allows for a separation between
   /// object creation and the device context (cuda, hip, etc) and copying.
   ///
-  template< typename T, typename CreateOnDevice,
-            bool uhi = use_host_invoke, std::enable_if_t<!uhi>* = nullptr >
-  static inline Dispatcher makeDispatcher(CreateOnDevice&& createOnDevice) {
-    return { mover_type{&s_move_construct_destroy<T>},
-             invoker_type{std::forward<CreateOnDevice>(createOnDevice)(DeviceInvokerFactory<T>{})},
-             destroyer_type{&s_destroy<T>},
-             sizeof(T)
-           };
+  template<typename T,
+           typename CreateOnDevice,
+           bool uhi                = use_host_invoke,
+           std::enable_if_t<!uhi>* = nullptr>
+  static inline Dispatcher makeDispatcher(CreateOnDevice&& createOnDevice)
+  {
+    return {mover_type {&s_move_construct_destroy<T>},
+            invoker_type {std::forward<CreateOnDevice>(createOnDevice)(
+                DeviceInvokerFactory<T> {})},
+            destroyer_type {&s_destroy<T>}, sizeof(T)};
   }
 
   mover_type move_construct_destroy;
@@ -195,9 +215,10 @@ struct Dispatcher<platform, ::RAJA::indirect_function_call_dispatch, DispatcherI
   size_t size;
 };
 
-
-template < typename holder_type >
-struct dispatcher_transform_types<::RAJA::indirect_virtual_function_dispatch, holder_type> {
+template<typename holder_type>
+struct dispatcher_transform_types<::RAJA::indirect_virtual_function_dispatch,
+                                  holder_type>
+{
   using type = ::RAJA::indirect_virtual_function_dispatch;
 };
 
@@ -210,38 +231,48 @@ struct dispatcher_transform_types<::RAJA::indirect_virtual_function_dispatch, ho
  * during device linking when functions with high register counts may cause
  * device linking to fail.
  */
-template < Platform platform, typename DispatcherID, typename ... CallArgs >
-struct Dispatcher<platform, ::RAJA::indirect_virtual_function_dispatch, DispatcherID, CallArgs...> {
+template<Platform platform, typename DispatcherID, typename... CallArgs>
+struct Dispatcher<platform,
+                  ::RAJA::indirect_virtual_function_dispatch,
+                  DispatcherID,
+                  CallArgs...>
+{
   static constexpr bool use_host_invoke = dispatcher_use_host_invoke(platform);
-  using dispatch_policy = ::RAJA::indirect_virtual_function_dispatch;
-  using void_ptr_wrapper = DispatcherVoidPtrWrapper<DispatcherID>;
+  using dispatch_policy   = ::RAJA::indirect_virtual_function_dispatch;
+  using void_ptr_wrapper  = DispatcherVoidPtrWrapper<DispatcherID>;
   using void_cptr_wrapper = DispatcherVoidConstPtrWrapper<DispatcherID>;
 
-  struct impl_base {
-    virtual void move_destroy(void_ptr_wrapper dest, void_ptr_wrapper src) const = 0;
-    virtual void destroy(void_ptr_wrapper obj) const = 0;
+  struct impl_base
+  {
+    virtual void move_destroy(void_ptr_wrapper dest,
+                              void_ptr_wrapper src) const = 0;
+    virtual void destroy(void_ptr_wrapper obj) const      = 0;
   };
 
-  struct host_impl_base {
+  struct host_impl_base
+  {
     virtual void invoke(void_cptr_wrapper obj, CallArgs... args) const = 0;
   };
 
-  struct device_impl_base {
-    virtual RAJA_DEVICE void invoke(void_cptr_wrapper obj, CallArgs... args) const = 0;
+  struct device_impl_base
+  {
+    virtual RAJA_DEVICE void invoke(void_cptr_wrapper obj,
+                                    CallArgs... args) const = 0;
   };
 
-  template < typename T >
+  template<typename T>
   struct base_impl_type : impl_base
   {
     ///
     /// move construct an object of type T in dest as a copy of a T from src and
     /// destroy the T obj in src
     ///
-    virtual void move_destroy(void_ptr_wrapper dest, void_ptr_wrapper src) const override
+    virtual void move_destroy(void_ptr_wrapper dest,
+                              void_ptr_wrapper src) const override
     {
       T* dest_as_T = static_cast<T*>(dest.ptr);
-      T* src_as_T = static_cast<T*>(src.ptr);
-      new(dest_as_T) T(std::move(*src_as_T));
+      T* src_as_T  = static_cast<T*>(src.ptr);
+      new (dest_as_T) T(std::move(*src_as_T));
       (*src_as_T).~T();
     }
 
@@ -255,7 +286,7 @@ struct Dispatcher<platform, ::RAJA::indirect_virtual_function_dispatch, Dispatch
     }
   };
 
-  template < typename T >
+  template<typename T>
   struct host_impl_type : host_impl_base
   {
     ///
@@ -268,59 +299,69 @@ struct Dispatcher<platform, ::RAJA::indirect_virtual_function_dispatch, Dispatch
     }
   };
 
-  template < typename T >
+  template<typename T>
   struct device_impl_type : device_impl_base
   {
     ///
     /// invoke the call operator of the object of type T in obj with args
     ///
-    virtual RAJA_DEVICE void invoke(void_cptr_wrapper obj, CallArgs... args) const override
+    virtual RAJA_DEVICE void invoke(void_cptr_wrapper obj,
+                                    CallArgs... args) const override
     {
       const T* obj_as_T = static_cast<const T*>(obj.ptr);
       (*obj_as_T)(std::forward<CallArgs>(args)...);
     }
   };
 
-  struct mover_type {
+  struct mover_type
+  {
     impl_base* m_impl;
+
     void operator()(void_ptr_wrapper dest, void_ptr_wrapper src) const
     {
       m_impl->move_destroy(dest, src);
     }
   };
 
-  struct host_invoker_type {
+  struct host_invoker_type
+  {
     host_impl_base* m_impl;
+
     void operator()(void_cptr_wrapper obj, CallArgs... args) const
     {
       m_impl->invoke(obj, std::forward<CallArgs>(args)...);
     }
   };
+
   ///
-  struct device_invoker_type {
+  struct device_invoker_type
+  {
     device_impl_base* m_impl;
+
     RAJA_DEVICE void operator()(void_cptr_wrapper obj, CallArgs... args) const
     {
       m_impl->invoke(obj, std::forward<CallArgs>(args)...);
     }
   };
-  using invoker_type = std::conditional_t<use_host_invoke,
-                                          host_invoker_type,
-                                          device_invoker_type>;
 
-  struct destroyer_type {
+  using invoker_type = std::
+      conditional_t<use_host_invoke, host_invoker_type, device_invoker_type>;
+
+  struct destroyer_type
+  {
     impl_base* m_impl;
-    void operator()(void_ptr_wrapper obj) const
-    {
-      m_impl->destroy(obj);
-    }
+
+    void operator()(void_ptr_wrapper obj) const { m_impl->destroy(obj); }
   };
 
   // This can't be a cuda device lambda due to compiler limitations
-  template < typename T >
-  struct DeviceImplTypeFactory {
+  template<typename T>
+  struct DeviceImplTypeFactory
+  {
     using value_type = device_impl_type<T>*;
-    RAJA_DEVICE value_type operator()() {
+
+    RAJA_DEVICE value_type operator()()
+    {
 #if defined(RAJA_ENABLE_HIP) && !defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL)
       return nullptr;
 #else
@@ -333,17 +374,17 @@ struct Dispatcher<platform, ::RAJA::indirect_virtual_function_dispatch, Dispatch
   ///
   /// create a Dispatcher that can be used on the host for objects of type T
   ///
-  template< typename T,
-            bool uhi = use_host_invoke, std::enable_if_t<uhi>* = nullptr >
-  static inline Dispatcher makeDispatcher() {
+  template<typename T,
+           bool uhi               = use_host_invoke,
+           std::enable_if_t<uhi>* = nullptr>
+  static inline Dispatcher makeDispatcher()
+  {
     static base_impl_type<T> s_base_impl;
     static host_impl_type<T> s_host_impl;
-    return { mover_type{&s_base_impl},
-             host_invoker_type{&s_host_impl},
-             destroyer_type{&s_base_impl},
-             sizeof(T)
-           };
+    return {mover_type {&s_base_impl}, host_invoker_type {&s_host_impl},
+            destroyer_type {&s_base_impl}, sizeof(T)};
   }
+
   ///
   /// create a Dispatcher that can be used on the device for objects of type T
   ///
@@ -354,17 +395,17 @@ struct Dispatcher<platform, ::RAJA::indirect_virtual_function_dispatch, Dispatch
   /// to create the invoker object. This allows for a separation between
   /// object creation and the device context (cuda, hip, etc) and copying.
   ///
-  template< typename T, typename CreateOnDevice,
-            bool uhi = use_host_invoke, std::enable_if_t<!uhi>* = nullptr>
-  static inline Dispatcher makeDispatcher(CreateOnDevice&& createOnDevice) {
+  template<typename T,
+           typename CreateOnDevice,
+           bool uhi                = use_host_invoke,
+           std::enable_if_t<!uhi>* = nullptr>
+  static inline Dispatcher makeDispatcher(CreateOnDevice&& createOnDevice)
+  {
     static base_impl_type<T> s_base_impl;
-    static device_impl_type<T>* s_device_impl_ptr{
-        std::forward<CreateOnDevice>(createOnDevice)(DeviceImplTypeFactory<T>{}) };
-    return { mover_type{&s_base_impl},
-             device_invoker_type{s_device_impl_ptr},
-             destroyer_type{&s_base_impl},
-             sizeof(T)
-           };
+    static device_impl_type<T>* s_device_impl_ptr {std::forward<CreateOnDevice>(
+        createOnDevice)(DeviceImplTypeFactory<T> {})};
+    return {mover_type {&s_base_impl}, device_invoker_type {s_device_impl_ptr},
+            destroyer_type {&s_base_impl}, sizeof(T)};
   }
 
   mover_type move_construct_destroy;
@@ -373,74 +414,87 @@ struct Dispatcher<platform, ::RAJA::indirect_virtual_function_dispatch, Dispatch
   size_t size;
 };
 
-
 // direct_dispatch expects a list of types
-template < typename ... Ts, typename holder_type >
-struct dispatcher_transform_types<::RAJA::direct_dispatch<Ts...>, holder_type> {
-  using type = ::RAJA::direct_dispatch<typename holder_type::template type<Ts>...>;
+template<typename... Ts, typename holder_type>
+struct dispatcher_transform_types<::RAJA::direct_dispatch<Ts...>, holder_type>
+{
+  using type =
+      ::RAJA::direct_dispatch<typename holder_type::template type<Ts>...>;
 };
 
 /*!
  * Version of Dispatcher that does direct dispatch to zero callable types.
  * It implements the interface with callable objects.
  */
-template < Platform platform, typename DispatcherID, typename ... CallArgs >
-struct Dispatcher<platform, ::RAJA::direct_dispatch<>, DispatcherID, CallArgs...> {
+template<Platform platform, typename DispatcherID, typename... CallArgs>
+struct Dispatcher<platform,
+                  ::RAJA::direct_dispatch<>,
+                  DispatcherID,
+                  CallArgs...>
+{
   static constexpr bool use_host_invoke = dispatcher_use_host_invoke(platform);
-  using dispatch_policy = ::RAJA::direct_dispatch<>;
-  using void_ptr_wrapper = DispatcherVoidPtrWrapper<DispatcherID>;
+  using dispatch_policy                 = ::RAJA::direct_dispatch<>;
+  using void_ptr_wrapper  = DispatcherVoidPtrWrapper<DispatcherID>;
   using void_cptr_wrapper = DispatcherVoidConstPtrWrapper<DispatcherID>;
 
   ///
   /// move construct an object of type T in dest as a copy of a T from src and
   /// destroy the T obj in src
   ///
-  struct mover_type {
-    void operator()(void_ptr_wrapper, void_ptr_wrapper) const
-    { }
+  struct mover_type
+  {
+    void operator()(void_ptr_wrapper, void_ptr_wrapper) const {}
   };
 
   ///
   /// invoke the call operator of the object of type T in obj with args
   ///
-  struct host_invoker_type {
-    void operator()(void_cptr_wrapper, CallArgs...) const
-    { }
+  struct host_invoker_type
+  {
+    void operator()(void_cptr_wrapper, CallArgs...) const {}
   };
-  struct device_invoker_type {
-    RAJA_DEVICE void operator()(void_cptr_wrapper, CallArgs...) const
-    { }
+
+  struct device_invoker_type
+  {
+    RAJA_DEVICE void operator()(void_cptr_wrapper, CallArgs...) const {}
   };
-  using invoker_type = std::conditional_t<use_host_invoke,
-                                          host_invoker_type,
-                                          device_invoker_type>;
+
+  using invoker_type = std::
+      conditional_t<use_host_invoke, host_invoker_type, device_invoker_type>;
 
   ///
   /// destroy the object of type T in obj
   ///
-  struct destroyer_type {
-    void operator()(void_ptr_wrapper) const
-    { }
+  struct destroyer_type
+  {
+    void operator()(void_ptr_wrapper) const {}
   };
 
   ///
   /// create a Dispatcher that can be used on the host for objects of type T
   ///
-  template< typename T,
-            bool uhi = use_host_invoke, std::enable_if_t<uhi>* = nullptr >
-  static inline Dispatcher makeDispatcher() {
-    return {mover_type{}, host_invoker_type{}, destroyer_type{}, sizeof(T)};
+  template<typename T,
+           bool uhi               = use_host_invoke,
+           std::enable_if_t<uhi>* = nullptr>
+  static inline Dispatcher makeDispatcher()
+  {
+    return {mover_type {}, host_invoker_type {}, destroyer_type {}, sizeof(T)};
   }
+
   ///
   /// create a Dispatcher that can be used on the device for objects of type T
   ///
   /// Ignore the CreateOnDevice object as the same invoker object can be used
   /// on the host and device.
   ///
-  template< typename T, typename CreateOnDevice,
-            bool uhi = use_host_invoke, std::enable_if_t<!uhi>* = nullptr >
-  static inline Dispatcher makeDispatcher(CreateOnDevice&&) {
-    return {mover_type{}, device_invoker_type{}, destroyer_type{}, sizeof(T)};
+  template<typename T,
+           typename CreateOnDevice,
+           bool uhi                = use_host_invoke,
+           std::enable_if_t<!uhi>* = nullptr>
+  static inline Dispatcher makeDispatcher(CreateOnDevice&&)
+  {
+    return {mover_type {}, device_invoker_type {}, destroyer_type {},
+            sizeof(T)};
   }
 
   mover_type move_construct_destroy;
@@ -453,23 +507,31 @@ struct Dispatcher<platform, ::RAJA::direct_dispatch<>, DispatcherID, CallArgs...
  * Version of Dispatcher that does direct dispatch to a single callable type.
  * It implements the interface with callable objects.
  */
-template < Platform platform, typename T, typename DispatcherID, typename ... CallArgs >
-struct Dispatcher<platform, ::RAJA::direct_dispatch<T>, DispatcherID, CallArgs...> {
+template<Platform platform,
+         typename T,
+         typename DispatcherID,
+         typename... CallArgs>
+struct Dispatcher<platform,
+                  ::RAJA::direct_dispatch<T>,
+                  DispatcherID,
+                  CallArgs...>
+{
   static constexpr bool use_host_invoke = dispatcher_use_host_invoke(platform);
-  using dispatch_policy = ::RAJA::direct_dispatch<T>;
-  using void_ptr_wrapper = DispatcherVoidPtrWrapper<DispatcherID>;
+  using dispatch_policy                 = ::RAJA::direct_dispatch<T>;
+  using void_ptr_wrapper  = DispatcherVoidPtrWrapper<DispatcherID>;
   using void_cptr_wrapper = DispatcherVoidConstPtrWrapper<DispatcherID>;
 
   ///
   /// move construct an object of type T in dest as a copy of a T from src and
   /// destroy the T obj in src
   ///
-  struct mover_type {
+  struct mover_type
+  {
     void operator()(void_ptr_wrapper dest, void_ptr_wrapper src) const
     {
       T* dest_as_T = static_cast<T*>(dest.ptr);
-      T* src_as_T = static_cast<T*>(src.ptr);
-      new(dest_as_T) T(std::move(*src_as_T));
+      T* src_as_T  = static_cast<T*>(src.ptr);
+      new (dest_as_T) T(std::move(*src_as_T));
       (*src_as_T).~T();
     }
   };
@@ -477,28 +539,32 @@ struct Dispatcher<platform, ::RAJA::direct_dispatch<T>, DispatcherID, CallArgs..
   ///
   /// invoke the call operator of the object of type T in obj with args
   ///
-  struct host_invoker_type {
+  struct host_invoker_type
+  {
     void operator()(void_cptr_wrapper obj, CallArgs... args) const
     {
       const T* obj_as_T = static_cast<const T*>(obj.ptr);
       (*obj_as_T)(std::forward<CallArgs>(args)...);
     }
   };
-  struct device_invoker_type {
+
+  struct device_invoker_type
+  {
     RAJA_DEVICE void operator()(void_cptr_wrapper obj, CallArgs... args) const
     {
       const T* obj_as_T = static_cast<const T*>(obj.ptr);
       (*obj_as_T)(std::forward<CallArgs>(args)...);
     }
   };
-  using invoker_type = std::conditional_t<use_host_invoke,
-                                          host_invoker_type,
-                                          device_invoker_type>;
+
+  using invoker_type = std::
+      conditional_t<use_host_invoke, host_invoker_type, device_invoker_type>;
 
   ///
   /// destroy the object of type T in obj
   ///
-  struct destroyer_type {
+  struct destroyer_type
+  {
     void operator()(void_ptr_wrapper obj) const
     {
       T* obj_as_T = static_cast<T*>(obj.ptr);
@@ -509,23 +575,32 @@ struct Dispatcher<platform, ::RAJA::direct_dispatch<T>, DispatcherID, CallArgs..
   ///
   /// create a Dispatcher that can be used on the host for objects of type T
   ///
-  template< typename U,
-            bool uhi = use_host_invoke, std::enable_if_t<uhi>* = nullptr >
-  static inline Dispatcher makeDispatcher() {
-    static_assert(std::is_same<T, U>::value, "U must be in direct_dispatch types");
-    return {mover_type{}, host_invoker_type{}, destroyer_type{}, sizeof(T)};
+  template<typename U,
+           bool uhi               = use_host_invoke,
+           std::enable_if_t<uhi>* = nullptr>
+  static inline Dispatcher makeDispatcher()
+  {
+    static_assert(std::is_same<T, U>::value,
+                  "U must be in direct_dispatch types");
+    return {mover_type {}, host_invoker_type {}, destroyer_type {}, sizeof(T)};
   }
+
   ///
   /// create a Dispatcher that can be used on the device for objects of type T
   ///
   /// Ignore the CreateOnDevice object as the same invoker object can be used
   /// on the host and device.
   ///
-  template< typename U, typename CreateOnDevice,
-            bool uhi = use_host_invoke, std::enable_if_t<!uhi>* = nullptr >
-  static inline Dispatcher makeDispatcher(CreateOnDevice&&) {
-    static_assert(std::is_same<T, U>::value, "U must be in direct_dispatch types");
-    return {mover_type{}, device_invoker_type{}, destroyer_type{}, sizeof(T)};
+  template<typename U,
+           typename CreateOnDevice,
+           bool uhi                = use_host_invoke,
+           std::enable_if_t<!uhi>* = nullptr>
+  static inline Dispatcher makeDispatcher(CreateOnDevice&&)
+  {
+    static_assert(std::is_same<T, U>::value,
+                  "U must be in direct_dispatch types");
+    return {mover_type {}, device_invoker_type {}, destroyer_type {},
+            sizeof(T)};
   }
 
   mover_type move_construct_destroy;
@@ -538,46 +613,55 @@ struct Dispatcher<platform, ::RAJA::direct_dispatch<T>, DispatcherID, CallArgs..
  * Version of Dispatcher that does direct dispatch to multiple callable types.
  * It implements the interface with callable objects.
  */
-template < typename T0, typename T1, typename ... TNs,
-           Platform platform, typename DispatcherID, typename ... CallArgs >
-struct Dispatcher<platform, ::RAJA::direct_dispatch<T0, T1, TNs...>,
-                  DispatcherID, CallArgs...> {
+template<typename T0,
+         typename T1,
+         typename... TNs,
+         Platform platform,
+         typename DispatcherID,
+         typename... CallArgs>
+struct Dispatcher<platform,
+                  ::RAJA::direct_dispatch<T0, T1, TNs...>,
+                  DispatcherID,
+                  CallArgs...>
+{
   static constexpr bool use_host_invoke = dispatcher_use_host_invoke(platform);
-  using dispatch_policy = ::RAJA::direct_dispatch<T0, T1, TNs...>;
-  using void_ptr_wrapper = DispatcherVoidPtrWrapper<DispatcherID>;
+  using dispatch_policy   = ::RAJA::direct_dispatch<T0, T1, TNs...>;
+  using void_ptr_wrapper  = DispatcherVoidPtrWrapper<DispatcherID>;
   using void_cptr_wrapper = DispatcherVoidConstPtrWrapper<DispatcherID>;
 
-  using id_type = int;
-  using callable_indices = camp::make_int_seq_t<id_type, 2+sizeof...(TNs)>;
-  using callable_types = camp::list<T0, T1, TNs...>;
+  using id_type          = int;
+  using callable_indices = camp::make_int_seq_t<id_type, 2 + sizeof...(TNs)>;
+  using callable_types   = camp::list<T0, T1, TNs...>;
 
   ///
   /// move construct an object of type T in dest as a copy of a T from src and
   /// destroy the T obj in src
   ///
-  struct mover_type {
+  struct mover_type
+  {
     id_type id;
 
     void operator()(void_ptr_wrapper dest, void_ptr_wrapper src) const
     {
-      impl_helper(callable_indices{}, callable_types{},
-                  dest, src);
+      impl_helper(callable_indices {}, callable_types {}, dest, src);
     }
 
   private:
-    template < int ... id_types, typename ... Ts >
-    void impl_helper(camp::int_seq<int, id_types...>, camp::list<Ts...>,
-              void_ptr_wrapper dest, void_ptr_wrapper src) const
+    template<int... id_types, typename... Ts>
+    void impl_helper(camp::int_seq<int, id_types...>,
+                     camp::list<Ts...>,
+                     void_ptr_wrapper dest,
+                     void_ptr_wrapper src) const
     {
       camp::sink(((id_types == id) ? (impl<Ts>(dest, src), 0) : 0)...);
     }
 
-    template < typename T >
+    template<typename T>
     void impl(void_ptr_wrapper dest, void_ptr_wrapper src) const
     {
       T* dest_as_T = static_cast<T*>(dest.ptr);
-      T* src_as_T = static_cast<T*>(src.ptr);
-      new(dest_as_T) T(std::move(*src_as_T));
+      T* src_as_T  = static_cast<T*>(src.ptr);
+      new (dest_as_T) T(std::move(*src_as_T));
       (*src_as_T).~T();
     }
   };
@@ -585,79 +669,91 @@ struct Dispatcher<platform, ::RAJA::direct_dispatch<T0, T1, TNs...>,
   ///
   /// invoke the call operator of the object of type T in obj with args
   ///
-  struct host_invoker_type {
+  struct host_invoker_type
+  {
     id_type id;
 
     void operator()(void_cptr_wrapper obj, CallArgs... args) const
     {
-      impl_helper(callable_indices{}, callable_types{},
-                  obj, std::forward<CallArgs>(args)...);
+      impl_helper(callable_indices {}, callable_types {}, obj,
+                  std::forward<CallArgs>(args)...);
     }
 
   private:
-    template < int ... id_types, typename ... Ts >
-    void impl_helper(camp::int_seq<int, id_types...>, camp::list<Ts...>,
-              void_cptr_wrapper obj, CallArgs... args) const
+    template<int... id_types, typename... Ts>
+    void impl_helper(camp::int_seq<int, id_types...>,
+                     camp::list<Ts...>,
+                     void_cptr_wrapper obj,
+                     CallArgs... args) const
     {
-      camp::sink(((id_types == id) ? (impl<Ts>(obj, std::forward<CallArgs>(args)...), 0) : 0)...);
+      camp::sink(((id_types == id)
+                      ? (impl<Ts>(obj, std::forward<CallArgs>(args)...), 0)
+                      : 0)...);
     }
 
-    template < typename T >
+    template<typename T>
     void impl(void_cptr_wrapper obj, CallArgs... args) const
     {
       const T* obj_as_T = static_cast<const T*>(obj.ptr);
       (*obj_as_T)(std::forward<CallArgs>(args)...);
     }
   };
-  struct device_invoker_type {
+
+  struct device_invoker_type
+  {
     id_type id;
 
     RAJA_DEVICE void operator()(void_cptr_wrapper obj, CallArgs... args) const
     {
-      impl_helper(callable_indices{}, callable_types{},
-                  obj, std::forward<CallArgs>(args)...);
+      impl_helper(callable_indices {}, callable_types {}, obj,
+                  std::forward<CallArgs>(args)...);
     }
 
   private:
-    template < int ... id_types, typename ... Ts >
-    RAJA_DEVICE void impl_helper(camp::int_seq<int, id_types...>, camp::list<Ts...>,
-              void_cptr_wrapper obj, CallArgs... args) const
+    template<int... id_types, typename... Ts>
+    RAJA_DEVICE void impl_helper(camp::int_seq<int, id_types...>,
+                                 camp::list<Ts...>,
+                                 void_cptr_wrapper obj,
+                                 CallArgs... args) const
     {
-      camp::sink(((id_types == id) ? (impl<Ts>(obj, std::forward<CallArgs>(args)...), 0) : 0)...);
+      camp::sink(((id_types == id)
+                      ? (impl<Ts>(obj, std::forward<CallArgs>(args)...), 0)
+                      : 0)...);
     }
 
-    template < typename T >
+    template<typename T>
     RAJA_DEVICE void impl(void_cptr_wrapper obj, CallArgs... args) const
     {
       const T* obj_as_T = static_cast<const T*>(obj.ptr);
       (*obj_as_T)(std::forward<CallArgs>(args)...);
     }
   };
-  using invoker_type = std::conditional_t<use_host_invoke,
-                                          host_invoker_type,
-                                          device_invoker_type>;
+
+  using invoker_type = std::
+      conditional_t<use_host_invoke, host_invoker_type, device_invoker_type>;
 
   ///
   /// destroy the object of type T in obj
   ///
-  struct destroyer_type {
+  struct destroyer_type
+  {
     id_type id;
 
     void operator()(void_ptr_wrapper obj) const
     {
-      impl_helper(callable_indices{}, callable_types{},
-                  obj);
+      impl_helper(callable_indices {}, callable_types {}, obj);
     }
 
   private:
-    template < int ... id_types, typename ... Ts >
-    void impl_helper(camp::int_seq<int, id_types...>, camp::list<Ts...>,
-              void_ptr_wrapper obj) const
+    template<int... id_types, typename... Ts>
+    void impl_helper(camp::int_seq<int, id_types...>,
+                     camp::list<Ts...>,
+                     void_ptr_wrapper obj) const
     {
       camp::sink(((id_types == id) ? (impl<Ts>(obj), 0) : 0)...);
     }
 
-    template < typename T >
+    template<typename T>
     void impl(void_ptr_wrapper obj) const
     {
       T* obj_as_T = static_cast<T*>(obj.ptr);
@@ -671,38 +767,50 @@ struct Dispatcher<platform, ::RAJA::direct_dispatch<T0, T1, TNs...>,
   /// The id is just the index of T in the list of callable_types.
   /// If T is not in Ts return -1.
   ///
-  template < typename T, int ... id_types, typename ... Ts >
-  static constexpr id_type get_id(camp::int_seq<int, id_types...>, camp::list<Ts...>)
+  template<typename T, int... id_types, typename... Ts>
+  static constexpr id_type get_id(camp::int_seq<int, id_types...>,
+                                  camp::list<Ts...>)
   {
-    id_type id{-1};
+    id_type id {-1};
     // quiet UB warning by sequencing assignment to id with list initialization
-    int unused[] {0, (std::is_same<T, Ts>::value ? ((id = id_types), 0) : 0)...};
-    camp::sink(unused); // quiet unused var warning
+    int unused[] {0,
+                  (std::is_same<T, Ts>::value ? ((id = id_types), 0) : 0)...};
+    camp::sink(unused);  // quiet unused var warning
     return id;
   }
 
   ///
   /// create a Dispatcher that can be used on the host for objects of type T
   ///
-  template< typename T,
-            bool uhi = use_host_invoke, std::enable_if_t<uhi>* = nullptr >
-  static inline Dispatcher makeDispatcher() {
-    static constexpr id_type id = get_id<T>(callable_indices{}, callable_types{});
+  template<typename T,
+           bool uhi               = use_host_invoke,
+           std::enable_if_t<uhi>* = nullptr>
+  static inline Dispatcher makeDispatcher()
+  {
+    static constexpr id_type id =
+        get_id<T>(callable_indices {}, callable_types {});
     static_assert(id != id_type(-1), "T must be in direct_dispatch types");
-    return {mover_type{id}, host_invoker_type{id}, destroyer_type{id}, sizeof(T)};
+    return {mover_type {id}, host_invoker_type {id}, destroyer_type {id},
+            sizeof(T)};
   }
+
   ///
   /// create a Dispatcher that can be used on the device for objects of type T
   ///
   /// Ignore the CreateOnDevice object as the same invoker object can be used
   /// on the host and device.
   ///
-  template< typename T, typename CreateOnDevice,
-            bool uhi = use_host_invoke, std::enable_if_t<!uhi>* = nullptr >
-  static inline Dispatcher makeDispatcher(CreateOnDevice&&) {
-    static constexpr id_type id = get_id<T>(callable_indices{}, callable_types{});
+  template<typename T,
+           typename CreateOnDevice,
+           bool uhi                = use_host_invoke,
+           std::enable_if_t<!uhi>* = nullptr>
+  static inline Dispatcher makeDispatcher(CreateOnDevice&&)
+  {
+    static constexpr id_type id =
+        get_id<T>(callable_indices {}, callable_types {});
     static_assert(id != id_type(-1), "T must be in direct_dispatch types");
-    return {mover_type{id}, device_invoker_type{id}, destroyer_type{id}, sizeof(T)};
+    return {mover_type {id}, device_invoker_type {id}, destroyer_type {id},
+            sizeof(T)};
   }
 
   mover_type move_construct_destroy;
diff --git a/include/RAJA/pattern/WorkGroup/WorkRunner.hpp b/include/RAJA/pattern/WorkGroup/WorkRunner.hpp
index 9645f73050..112ad14d38 100644
--- a/include/RAJA/pattern/WorkGroup/WorkRunner.hpp
+++ b/include/RAJA/pattern/WorkGroup/WorkRunner.hpp
@@ -30,7 +30,6 @@
 #include "RAJA/pattern/WorkGroup/Dispatcher.hpp"
 #include "RAJA/policy/WorkGroup.hpp"
 
-
 namespace RAJA
 {
 
@@ -40,18 +39,18 @@ namespace detail
 /*!
  * A body and args holder for storing loops that are being executed in foralls
  */
-template <typename LoopBody, typename ... Args>
+template<typename LoopBody, typename... Args>
 struct HoldBodyArgs_base
 {
   // NOTE: This constructor is disabled when body_in is not LoopBody
   // to avoid it conflicting with the copy and move constructors
-  template < typename body_in,
-      typename = typename std::enable_if<
-        std::is_same<LoopBody, camp::decay<body_in>>::value>::type >
+  template<typename body_in,
+           typename = typename std::enable_if<
+               std::is_same<LoopBody, camp::decay<body_in>>::value>::type>
   HoldBodyArgs_base(body_in&& body, Args... args)
-    : m_body(std::forward<body_in>(body))
-    , m_arg_tuple(std::forward<Args>(args)...)
-  { }
+      : m_body(std::forward<body_in>(body)),
+        m_arg_tuple(std::forward<Args>(args)...)
+  {}
 
 protected:
   LoopBody m_body;
@@ -62,7 +61,7 @@ struct HoldBodyArgs_base
  * A body and args holder for storing loops that are being executed in foralls
  * that run on the host
  */
-template <typename LoopBody, typename index_type, typename ... Args>
+template<typename LoopBody, typename index_type, typename... Args>
 struct HoldBodyArgs_host : HoldBodyArgs_base<LoopBody, Args...>
 {
   using base = HoldBodyArgs_base<LoopBody, Args...>;
@@ -70,10 +69,10 @@ struct HoldBodyArgs_host : HoldBodyArgs_base<LoopBody, Args...>
 
   RAJA_INLINE void operator()(index_type i) const
   {
-    invoke(i, camp::make_idx_seq_t<sizeof...(Args)>{});
+    invoke(i, camp::make_idx_seq_t<sizeof...(Args)> {});
   }
 
-  template < camp::idx_t ... Is >
+  template<camp::idx_t... Is>
   RAJA_INLINE void invoke(index_type i, camp::idx_seq<Is...>) const
   {
     this->m_body(i, get<Is>(this->m_arg_tuple)...);
@@ -84,7 +83,7 @@ struct HoldBodyArgs_host : HoldBodyArgs_base<LoopBody, Args...>
  * A body and args holder for storing loops that are being executed in foralls
  * that run on the device
  */
-template <typename LoopBody, typename index_type, typename ... Args>
+template<typename LoopBody, typename index_type, typename... Args>
 struct HoldBodyArgs_device : HoldBodyArgs_base<LoopBody, Args...>
 {
   using base = HoldBodyArgs_base<LoopBody, Args...>;
@@ -92,10 +91,10 @@ struct HoldBodyArgs_device : HoldBodyArgs_base<LoopBody, Args...>
 
   RAJA_DEVICE RAJA_INLINE void operator()(index_type i) const
   {
-    invoke(i, camp::make_idx_seq_t<sizeof...(Args)>{});
+    invoke(i, camp::make_idx_seq_t<sizeof...(Args)> {});
   }
 
-  template < camp::idx_t ... Is >
+  template<camp::idx_t... Is>
   RAJA_DEVICE RAJA_INLINE void invoke(index_type i, camp::idx_seq<Is...>) const
   {
     this->m_body(i, get<Is>(this->m_arg_tuple)...);
@@ -105,28 +104,29 @@ struct HoldBodyArgs_device : HoldBodyArgs_base<LoopBody, Args...>
 /*!
  * A body and segment holder for storing loops that will be executed as foralls
  */
-template <typename ExecutionPolicy, typename Segment_type, typename LoopBody,
-          typename index_type, typename ... Args>
+template<typename ExecutionPolicy,
+         typename Segment_type,
+         typename LoopBody,
+         typename index_type,
+         typename... Args>
 struct HoldForall
 {
   using resource_type = typename resources::get_resource<ExecutionPolicy>::type;
-  using HoldBodyArgs = typename std::conditional<
+  using HoldBodyArgs  = typename std::conditional<
       !type_traits::is_device_exec_policy<ExecutionPolicy>::value,
       HoldBodyArgs_host<LoopBody, index_type, Args...>,
-      HoldBodyArgs_device<LoopBody, index_type, Args...> >::type;
+      HoldBodyArgs_device<LoopBody, index_type, Args...>>::type;
 
-  template < typename segment_in, typename body_in >
+  template<typename segment_in, typename body_in>
   HoldForall(segment_in&& segment, body_in&& body)
-    : m_segment(std::forward<segment_in>(segment))
-    , m_body(std::forward<body_in>(body))
-  { }
+      : m_segment(std::forward<segment_in>(segment)),
+        m_body(std::forward<body_in>(body))
+  {}
 
   RAJA_INLINE void operator()(resource_type r, Args... args) const
   {
-    wrap::forall(r,
-                 ExecutionPolicy(),
-                 m_segment,
-                 HoldBodyArgs{m_body, std::forward<Args>(args)...});
+    wrap::forall(r, ExecutionPolicy(), m_segment,
+                 HoldBodyArgs {m_body, std::forward<Args>(args)...});
   }
 
 private:
@@ -138,46 +138,50 @@ struct HoldForall
 /*!
  * A class that handles running work in a work container
  */
-template <typename EXEC_POLICY_T,
-          typename ORDER_POLICY_T,
-          typename DISPATCH_POLICY_T,
-          typename ALLOCATOR_T,
-          typename INDEX_T,
-          typename ... Args>
+template<typename EXEC_POLICY_T,
+         typename ORDER_POLICY_T,
+         typename DISPATCH_POLICY_T,
+         typename ALLOCATOR_T,
+         typename INDEX_T,
+         typename... Args>
 struct WorkRunner;
 
-
 /*!
  * Base class describing storage for ordered runners using forall
  */
-template <typename FORALL_EXEC_POLICY,
-          typename EXEC_POLICY_T,
-          typename ORDER_POLICY_T,
-          typename DISPATCH_POLICY_T,
-          typename ALLOCATOR_T,
-          typename INDEX_T,
-          typename ... Args>
+template<typename FORALL_EXEC_POLICY,
+         typename EXEC_POLICY_T,
+         typename ORDER_POLICY_T,
+         typename DISPATCH_POLICY_T,
+         typename ALLOCATOR_T,
+         typename INDEX_T,
+         typename... Args>
 struct WorkRunnerForallOrdered_base
 {
-  using exec_policy = EXEC_POLICY_T;
-  using order_policy = ORDER_POLICY_T;
+  using exec_policy     = EXEC_POLICY_T;
+  using order_policy    = ORDER_POLICY_T;
   using dispatch_policy = DISPATCH_POLICY_T;
-  using Allocator = ALLOCATOR_T;
-  using index_type = INDEX_T;
-  using resource_type = typename resources::get_resource<FORALL_EXEC_POLICY>::type;
+  using Allocator       = ALLOCATOR_T;
+  using index_type      = INDEX_T;
+  using resource_type =
+      typename resources::get_resource<FORALL_EXEC_POLICY>::type;
 
   using forall_exec_policy = FORALL_EXEC_POLICY;
 
   // The type that will hold the segment and loop body in work storage
-  struct holder_type {
-    template < typename T >
-    using type = HoldForall<forall_exec_policy,
-                            typename camp::at<T, camp::num<0>>::type, // segment_type
-                            typename camp::at<T, camp::num<1>>::type, // loop_type
-                            index_type, Args...>;
+  struct holder_type
+  {
+    template<typename T>
+    using type =
+        HoldForall<forall_exec_policy,
+                   typename camp::at<T, camp::num<0>>::type,  // segment_type
+                   typename camp::at<T, camp::num<1>>::type,  // loop_type
+                   index_type,
+                   Args...>;
   };
+
   ///
-  template < typename T >
+  template<typename T>
   using holder_type_t = typename holder_type::template type<T>;
 
   // The policy indicating where the call function is invoked
@@ -186,33 +190,40 @@ struct WorkRunnerForallOrdered_base
 
   // The Dispatcher policy with holder_types used internally to handle the
   // ranges and callables passed in by the user.
-  using dispatcher_holder_policy = dispatcher_transform_types_t<dispatch_policy, holder_type>;
+  using dispatcher_holder_policy =
+      dispatcher_transform_types_t<dispatch_policy, holder_type>;
 
-  using dispatcher_type = Dispatcher<Platform::host, dispatcher_holder_policy, void, resource_type, Args...>;
+  using dispatcher_type = Dispatcher<Platform::host,
+                                     dispatcher_holder_policy,
+                                     void,
+                                     resource_type,
+                                     Args...>;
 
   WorkRunnerForallOrdered_base() = default;
 
   WorkRunnerForallOrdered_base(WorkRunnerForallOrdered_base const&) = delete;
-  WorkRunnerForallOrdered_base& operator=(WorkRunnerForallOrdered_base const&) = delete;
+  WorkRunnerForallOrdered_base& operator=(WorkRunnerForallOrdered_base const&) =
+      delete;
 
-  WorkRunnerForallOrdered_base(WorkRunnerForallOrdered_base &&) = default;
-  WorkRunnerForallOrdered_base& operator=(WorkRunnerForallOrdered_base &&) = default;
+  WorkRunnerForallOrdered_base(WorkRunnerForallOrdered_base&&) = default;
+  WorkRunnerForallOrdered_base& operator=(WorkRunnerForallOrdered_base&&) =
+      default;
 
   // runner interfaces with storage to enqueue so the runner can get
   // information from the segment and loop at enqueue time
-  template < typename WorkContainer, typename segment_T, typename loop_T >
+  template<typename WorkContainer, typename segment_T, typename loop_T>
   inline void enqueue(WorkContainer& storage, segment_T&& seg, loop_T&& loop)
   {
-    using holder = holder_type_t<camp::list<camp::decay<segment_T>, camp::decay<loop_T>>>;
+    using holder =
+        holder_type_t<camp::list<camp::decay<segment_T>, camp::decay<loop_T>>>;
 
     storage.template emplace<holder>(
-        get_Dispatcher<holder, dispatcher_type>(dispatcher_exec_policy{}),
+        get_Dispatcher<holder, dispatcher_type>(dispatcher_exec_policy {}),
         std::forward<segment_T>(seg), std::forward<loop_T>(loop));
   }
 
   // clear any state so ready to be destroyed or reused
-  void clear()
-  { }
+  void clear() {}
 
   // no extra storage required here
   using per_run_storage = int;
@@ -221,45 +232,44 @@ struct WorkRunnerForallOrdered_base
 /*!
  * Runs work in a storage container in order using forall
  */
-template <typename FORALL_EXEC_POLICY,
-          typename EXEC_POLICY_T,
-          typename ORDER_POLICY_T,
-          typename DISPATCH_POLICY_T,
-          typename ALLOCATOR_T,
-          typename INDEX_T,
-          typename ... Args>
+template<typename FORALL_EXEC_POLICY,
+         typename EXEC_POLICY_T,
+         typename ORDER_POLICY_T,
+         typename DISPATCH_POLICY_T,
+         typename ALLOCATOR_T,
+         typename INDEX_T,
+         typename... Args>
 struct WorkRunnerForallOrdered
-    : WorkRunnerForallOrdered_base<
-      FORALL_EXEC_POLICY,
-      EXEC_POLICY_T,
-      ORDER_POLICY_T,
-      DISPATCH_POLICY_T,
-      ALLOCATOR_T,
-      INDEX_T,
-      Args...>
+    : WorkRunnerForallOrdered_base<FORALL_EXEC_POLICY,
+                                   EXEC_POLICY_T,
+                                   ORDER_POLICY_T,
+                                   DISPATCH_POLICY_T,
+                                   ALLOCATOR_T,
+                                   INDEX_T,
+                                   Args...>
 {
-  using base = WorkRunnerForallOrdered_base<
-      FORALL_EXEC_POLICY,
-      EXEC_POLICY_T,
-      ORDER_POLICY_T,
-      DISPATCH_POLICY_T,
-      ALLOCATOR_T,
-      INDEX_T,
-      Args...>;
+  using base = WorkRunnerForallOrdered_base<FORALL_EXEC_POLICY,
+                                            EXEC_POLICY_T,
+                                            ORDER_POLICY_T,
+                                            DISPATCH_POLICY_T,
+                                            ALLOCATOR_T,
+                                            INDEX_T,
+                                            Args...>;
   using base::base;
 
   // run the loops using forall in the order that they were enqueued
-  template < typename WorkContainer >
+  template<typename WorkContainer>
   typename base::per_run_storage run(WorkContainer const& storage,
                                      typename base::resource_type r,
                                      Args... args) const
   {
     using value_type = typename WorkContainer::value_type;
 
-    typename base::per_run_storage run_storage{};
+    typename base::per_run_storage run_storage {};
 
     auto end = storage.end();
-    for (auto iter = storage.begin(); iter != end; ++iter) {
+    for (auto iter = storage.begin(); iter != end; ++iter)
+    {
       value_type::host_call(&*iter, r, args...);
     }
 
@@ -270,46 +280,46 @@ struct WorkRunnerForallOrdered
 /*!
  * Runs work in a storage container in reverse order using forall
  */
-template <typename FORALL_EXEC_POLICY,
-          typename EXEC_POLICY_T,
-          typename ORDER_POLICY_T,
-          typename DISPATCH_POLICY_T,
-          typename ALLOCATOR_T,
-          typename INDEX_T,
-          typename ... Args>
+template<typename FORALL_EXEC_POLICY,
+         typename EXEC_POLICY_T,
+         typename ORDER_POLICY_T,
+         typename DISPATCH_POLICY_T,
+         typename ALLOCATOR_T,
+         typename INDEX_T,
+         typename... Args>
 struct WorkRunnerForallReverse
-    : WorkRunnerForallOrdered_base<
-      FORALL_EXEC_POLICY,
-      EXEC_POLICY_T,
-      ORDER_POLICY_T,
-      DISPATCH_POLICY_T,
-      ALLOCATOR_T,
-      INDEX_T,
-      Args...>
+    : WorkRunnerForallOrdered_base<FORALL_EXEC_POLICY,
+                                   EXEC_POLICY_T,
+                                   ORDER_POLICY_T,
+                                   DISPATCH_POLICY_T,
+                                   ALLOCATOR_T,
+                                   INDEX_T,
+                                   Args...>
 {
-  using base = WorkRunnerForallOrdered_base<
-      FORALL_EXEC_POLICY,
-      EXEC_POLICY_T,
-      ORDER_POLICY_T,
-      DISPATCH_POLICY_T,
-      ALLOCATOR_T,
-      INDEX_T,
-      Args...>;
+  using base = WorkRunnerForallOrdered_base<FORALL_EXEC_POLICY,
+                                            EXEC_POLICY_T,
+                                            ORDER_POLICY_T,
+                                            DISPATCH_POLICY_T,
+                                            ALLOCATOR_T,
+                                            INDEX_T,
+                                            Args...>;
   using base::base;
 
-  // run the loops using forall in the reverse order to the order they were enqueued
-  template < typename WorkContainer >
+  // run the loops using forall in the reverse order to the order they were
+  // enqueued
+  template<typename WorkContainer>
   typename base::per_run_storage run(WorkContainer const& storage,
                                      typename base::resource_type r,
                                      Args... args) const
   {
     using value_type = typename WorkContainer::value_type;
 
-    typename base::per_run_storage run_storage{};
+    typename base::per_run_storage run_storage {};
 
     auto begin = storage.begin();
-    for (auto iter = storage.end(); iter != begin; --iter) {
-      value_type::host_call(&*(iter-1), r, args...);
+    for (auto iter = storage.end(); iter != begin; --iter)
+    {
+      value_type::host_call(&*(iter - 1), r, args...);
     }
 
     return run_storage;
diff --git a/include/RAJA/pattern/WorkGroup/WorkStorage.hpp b/include/RAJA/pattern/WorkGroup/WorkStorage.hpp
index 52631d108f..20c756e8b3 100644
--- a/include/RAJA/pattern/WorkGroup/WorkStorage.hpp
+++ b/include/RAJA/pattern/WorkGroup/WorkStorage.hpp
@@ -32,7 +32,6 @@
 
 #include "RAJA/pattern/WorkGroup/WorkStruct.hpp"
 
-
 namespace RAJA
 {
 
@@ -46,34 +45,30 @@ namespace detail
 //   operator -  ( iterator_base const& )
 //   operator == ( iterator_base const& )
 //   operator <  ( iterator_base const& )
-template < typename iterator_base >
+template<typename iterator_base>
 struct random_access_iterator : iterator_base
 {
-  using base = iterator_base;
-  using value_type = const typename base::value_type;
-  using pointer = typename base::pointer;
-  using reference = typename base::reference;
-  using difference_type = typename base::difference_type;
+  using base              = iterator_base;
+  using value_type        = const typename base::value_type;
+  using pointer           = typename base::pointer;
+  using reference         = typename base::reference;
+  using difference_type   = typename base::difference_type;
   using iterator_category = std::random_access_iterator_tag;
 
   using base::base;
 
   random_access_iterator(random_access_iterator const&) = default;
-  random_access_iterator(random_access_iterator &&) = default;
+  random_access_iterator(random_access_iterator&&)      = default;
 
   random_access_iterator& operator=(random_access_iterator const&) = default;
-  random_access_iterator& operator=(random_access_iterator &&) = default;
-
+  random_access_iterator& operator=(random_access_iterator&&)      = default;
 
   RAJA_HOST_DEVICE reference operator*() const
   {
     return *static_cast<base const&>(*this);
   }
 
-  RAJA_HOST_DEVICE pointer operator->() const
-  {
-    return &(*(*this));
-  }
+  RAJA_HOST_DEVICE pointer operator->() const { return &(*(*this)); }
 
   RAJA_HOST_DEVICE reference operator[](difference_type i) const
   {
@@ -121,7 +116,8 @@ struct random_access_iterator : iterator_base
   }
 
   RAJA_HOST_DEVICE friend inline random_access_iterator operator+(
-      random_access_iterator const& lhs, difference_type rhs)
+      random_access_iterator const& lhs,
+      difference_type rhs)
   {
     random_access_iterator copy = lhs;
     copy += rhs;
@@ -129,7 +125,8 @@ struct random_access_iterator : iterator_base
   }
 
   RAJA_HOST_DEVICE friend inline random_access_iterator operator+(
-      difference_type lhs, random_access_iterator const& rhs)
+      difference_type lhs,
+      random_access_iterator const& rhs)
   {
     random_access_iterator copy = rhs;
     copy += lhs;
@@ -137,7 +134,8 @@ struct random_access_iterator : iterator_base
   }
 
   RAJA_HOST_DEVICE friend inline random_access_iterator operator-(
-      random_access_iterator const& lhs, difference_type rhs)
+      random_access_iterator const& lhs,
+      difference_type rhs)
   {
     random_access_iterator copy = lhs;
     copy -= rhs;
@@ -145,43 +143,50 @@ struct random_access_iterator : iterator_base
   }
 
   RAJA_HOST_DEVICE friend inline difference_type operator-(
-      random_access_iterator const& lhs, random_access_iterator const& rhs)
+      random_access_iterator const& lhs,
+      random_access_iterator const& rhs)
   {
     return static_cast<base const&>(lhs) - static_cast<base const&>(rhs);
   }
 
   RAJA_HOST_DEVICE friend inline bool operator==(
-      random_access_iterator const& lhs, random_access_iterator const& rhs)
+      random_access_iterator const& lhs,
+      random_access_iterator const& rhs)
   {
     return static_cast<base const&>(lhs) == static_cast<base const&>(rhs);
   }
 
   RAJA_HOST_DEVICE friend inline bool operator!=(
-      random_access_iterator const& lhs, random_access_iterator const& rhs)
+      random_access_iterator const& lhs,
+      random_access_iterator const& rhs)
   {
     return !(lhs == rhs);
   }
 
   RAJA_HOST_DEVICE friend inline bool operator<(
-      random_access_iterator const& lhs, random_access_iterator const& rhs)
+      random_access_iterator const& lhs,
+      random_access_iterator const& rhs)
   {
     return static_cast<base const&>(lhs) < static_cast<base const&>(rhs);
   }
 
   RAJA_HOST_DEVICE friend inline bool operator<=(
-      random_access_iterator const& lhs, random_access_iterator const& rhs)
+      random_access_iterator const& lhs,
+      random_access_iterator const& rhs)
   {
     return !(rhs < lhs);
   }
 
   RAJA_HOST_DEVICE friend inline bool operator>(
-      random_access_iterator const& lhs, random_access_iterator const& rhs)
+      random_access_iterator const& lhs,
+      random_access_iterator const& rhs)
   {
     return rhs < lhs;
   }
 
   RAJA_HOST_DEVICE friend inline bool operator>=(
-      random_access_iterator const& lhs, random_access_iterator const& rhs)
+      random_access_iterator const& lhs,
+      random_access_iterator const& rhs)
   {
     return !(lhs < rhs);
   }
@@ -191,10 +196,10 @@ struct random_access_iterator : iterator_base
 /*!
  * A storage container for work groups
  */
-template < typename STORAGE_POLICY_T, typename ALLOCATOR_T, typename Dispatcher_T >
+template<typename STORAGE_POLICY_T, typename ALLOCATOR_T, typename Dispatcher_T>
 class WorkStorage;
 
-template < typename ALLOCATOR_T, typename Dispatcher_T >
+template<typename ALLOCATOR_T, typename Dispatcher_T>
 class WorkStorage<RAJA::array_of_pointers, ALLOCATOR_T, Dispatcher_T>
 {
   using allocator_traits_type = std::allocator_traits<ALLOCATOR_T>;
@@ -202,25 +207,27 @@ class WorkStorage<RAJA::array_of_pointers, ALLOCATOR_T, Dispatcher_T>
       typename allocator_traits_type::propagate_on_container_copy_assignment;
   using propagate_on_container_move_assignment =
       typename allocator_traits_type::propagate_on_container_move_assignment;
-  using propagate_on_container_swap            =
+  using propagate_on_container_swap =
       typename allocator_traits_type::propagate_on_container_swap;
-  static_assert(std::is_same<typename allocator_traits_type::value_type, char>::value,
+  static_assert(
+      std::is_same<typename allocator_traits_type::value_type, char>::value,
       "WorkStorage expects an allocator for 'char's.");
+
 public:
-  using storage_policy = RAJA::array_of_pointers;
+  using storage_policy  = RAJA::array_of_pointers;
   using dispatcher_type = Dispatcher_T;
 
-  template < typename holder >
+  template<typename holder>
   using true_value_type = WorkStruct<sizeof(holder), dispatcher_type>;
 
-  using value_type = GenericWorkStruct<dispatcher_type>;
-  using allocator_type = ALLOCATOR_T;
-  using size_type = std::size_t;
+  using value_type      = GenericWorkStruct<dispatcher_type>;
+  using allocator_type  = ALLOCATOR_T;
+  using size_type       = std::size_t;
   using difference_type = std::ptrdiff_t;
-  using reference = value_type&;
+  using reference       = value_type&;
   using const_reference = const value_type&;
-  using pointer = value_type*;
-  using const_pointer = const value_type*;
+  using pointer         = value_type*;
+  using const_pointer   = const value_type*;
 
 private:
   // struct used in storage vector to retain pointer and allocation size
@@ -231,24 +238,19 @@ class WorkStorage<RAJA::array_of_pointers, ALLOCATOR_T, Dispatcher_T>
   };
 
 public:
-
-  // iterator base class for accessing stored WorkStructs outside of the container
+  // iterator base class for accessing stored WorkStructs outside of the
+  // container
   struct const_iterator_base
   {
-    using value_type = const typename WorkStorage::value_type;
-    using pointer = typename WorkStorage::const_pointer;
-    using reference = typename WorkStorage::const_reference;
-    using difference_type = typename WorkStorage::difference_type;
+    using value_type        = const typename WorkStorage::value_type;
+    using pointer           = typename WorkStorage::const_pointer;
+    using reference         = typename WorkStorage::const_reference;
+    using difference_type   = typename WorkStorage::difference_type;
     using iterator_category = std::random_access_iterator_tag;
 
-    const_iterator_base(const pointer_and_size* ptrptr)
-      : m_ptrptr(ptrptr)
-    { }
+    const_iterator_base(const pointer_and_size* ptrptr) : m_ptrptr(ptrptr) {}
 
-    RAJA_HOST_DEVICE reference operator*() const
-    {
-      return *(m_ptrptr->ptr);
-    }
+    RAJA_HOST_DEVICE reference operator*() const { return *(m_ptrptr->ptr); }
 
     RAJA_HOST_DEVICE const_iterator_base& operator+=(difference_type n)
     {
@@ -257,19 +259,22 @@ class WorkStorage<RAJA::array_of_pointers, ALLOCATOR_T, Dispatcher_T>
     }
 
     RAJA_HOST_DEVICE friend inline difference_type operator-(
-        const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter)
+        const_iterator_base const& lhs_iter,
+        const_iterator_base const& rhs_iter)
     {
       return lhs_iter.m_ptrptr - rhs_iter.m_ptrptr;
     }
 
     RAJA_HOST_DEVICE friend inline bool operator==(
-        const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter)
+        const_iterator_base const& lhs_iter,
+        const_iterator_base const& rhs_iter)
     {
       return lhs_iter.m_ptrptr == rhs_iter.m_ptrptr;
     }
 
     RAJA_HOST_DEVICE friend inline bool operator<(
-        const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter)
+        const_iterator_base const& lhs_iter,
+        const_iterator_base const& rhs_iter)
     {
       return lhs_iter.m_ptrptr < rhs_iter.m_ptrptr;
     }
@@ -280,24 +285,25 @@ class WorkStorage<RAJA::array_of_pointers, ALLOCATOR_T, Dispatcher_T>
 
   using const_iterator = random_access_iterator<const_iterator_base>;
 
-
   explicit WorkStorage(allocator_type const& aloc)
-    : m_vec(0, aloc)
-    , m_aloc(aloc)
-  { }
+      : m_vec(0, aloc),
+        m_aloc(aloc)
+  {}
 
-  WorkStorage(WorkStorage const&) = delete;
+  WorkStorage(WorkStorage const&)            = delete;
   WorkStorage& operator=(WorkStorage const&) = delete;
 
   WorkStorage(WorkStorage&& rhs)
-    : m_vec(std::move(rhs.m_vec))
-    , m_aloc(std::move(rhs.m_aloc))
-  { }
+      : m_vec(std::move(rhs.m_vec)),
+        m_aloc(std::move(rhs.m_aloc))
+  {}
 
   WorkStorage& operator=(WorkStorage&& rhs)
   {
-    if (this != &rhs) {
-      move_assign_private(std::move(rhs), propagate_on_container_move_assignment{});
+    if (this != &rhs)
+    {
+      move_assign_private(std::move(rhs),
+                          propagate_on_container_move_assignment {});
     }
     return *this;
   }
@@ -312,33 +318,26 @@ class WorkStorage<RAJA::array_of_pointers, ALLOCATOR_T, Dispatcher_T>
   }
 
   // number of loops stored
-  size_type size() const
-  {
-    return m_vec.size();
-  }
+  size_type size() const { return m_vec.size(); }
 
-  const_iterator begin() const
-  {
-    return const_iterator(m_vec.begin());
-  }
+  const_iterator begin() const { return const_iterator(m_vec.begin()); }
 
-  const_iterator end() const
-  {
-    return const_iterator(m_vec.end());
-  }
+  const_iterator end() const { return const_iterator(m_vec.end()); }
 
   // number of bytes used for storage of loops
   size_type storage_size() const
   {
     size_type storage_size_nbytes = 0;
-    for (size_t i = 0; i < m_vec.size(); ++i) {
+    for (size_t i = 0; i < m_vec.size(); ++i)
+    {
       storage_size_nbytes += m_vec[i].size;
     }
     return storage_size_nbytes;
   }
 
-  template < typename holder, typename ... holder_ctor_args >
-  void emplace(const dispatcher_type* dispatcher, holder_ctor_args&&... ctor_args)
+  template<typename holder, typename... holder_ctor_args>
+  void emplace(const dispatcher_type* dispatcher,
+               holder_ctor_args&&... ctor_args)
   {
     m_vec.emplace_back(create_value<holder>(
         dispatcher, std::forward<holder_ctor_args>(ctor_args)...));
@@ -347,27 +346,28 @@ class WorkStorage<RAJA::array_of_pointers, ALLOCATOR_T, Dispatcher_T>
   // destroy all stored loops, deallocates all storage
   void clear()
   {
-    while (!m_vec.empty()) {
+    while (!m_vec.empty())
+    {
       destroy_value(m_vec.back());
       m_vec.pop_back();
     }
     m_vec.shrink_to_fit();
   }
 
-  ~WorkStorage()
-  {
-    clear();
-  }
+  ~WorkStorage() { clear(); }
 
 private:
-  RAJAVec<pointer_and_size, typename allocator_traits_type::template rebind_alloc<pointer_and_size>> m_vec;
+  RAJAVec<
+      pointer_and_size,
+      typename allocator_traits_type::template rebind_alloc<pointer_and_size>>
+      m_vec;
   allocator_type m_aloc;
 
   // move assignment if allocator propagates on move assignment
   void move_assign_private(WorkStorage&& rhs, std::true_type)
   {
     clear();
-    m_vec = std::move(rhs.m_vec);
+    m_vec  = std::move(rhs.m_vec);
     m_aloc = std::move(rhs.m_aloc);
   }
 
@@ -375,12 +375,16 @@ class WorkStorage<RAJA::array_of_pointers, ALLOCATOR_T, Dispatcher_T>
   void move_assign_private(WorkStorage&& rhs, std::false_type)
   {
     clear();
-    if (m_aloc == rhs.m_aloc) {
+    if (m_aloc == rhs.m_aloc)
+    {
       // take storage if allocators compare equal
       m_vec = std::move(rhs.m_vec);
-    } else {
+    }
+    else
+    {
       // allocate new storage if allocators do not compare equal
-      for (size_type i = 0; i < rhs.m_vec.size(); ++i) {
+      for (size_type i = 0; i < rhs.m_vec.size(); ++i)
+      {
         m_vec.emplace_back(move_destroy_value(std::move(rhs), rhs.m_vec[i]));
       }
       rhs.m_vec.clear();
@@ -389,7 +393,7 @@ class WorkStorage<RAJA::array_of_pointers, ALLOCATOR_T, Dispatcher_T>
   }
 
   // allocate and construct value in storage
-  template < typename holder, typename ... holder_ctor_args >
+  template<typename holder, typename... holder_ctor_args>
   pointer_and_size create_value(const dispatcher_type* dispatcher,
                                 holder_ctor_args&&... ctor_args)
   {
@@ -401,7 +405,7 @@ class WorkStorage<RAJA::array_of_pointers, ALLOCATOR_T, Dispatcher_T>
     value_type::template construct<holder>(
         value_ptr, dispatcher, std::forward<holder_ctor_args>(ctor_args)...);
 
-    return pointer_and_size{value_ptr, value_size};
+    return pointer_and_size {value_ptr, value_size};
   }
 
   // allocate and move construct object as copy of other value and
@@ -414,22 +418,24 @@ class WorkStorage<RAJA::array_of_pointers, ALLOCATOR_T, Dispatcher_T>
 
     value_type::move_destroy(value_ptr, other_value_and_size.ptr);
 
-    allocator_traits_type::deallocate(rhs.m_aloc,
-        reinterpret_cast<char*>(other_value_and_size.ptr), other_value_and_size.size);
+    allocator_traits_type::deallocate(
+        rhs.m_aloc, reinterpret_cast<char*>(other_value_and_size.ptr),
+        other_value_and_size.size);
 
-    return pointer_and_size{value_ptr, other_value_and_size.size};
+    return pointer_and_size {value_ptr, other_value_and_size.size};
   }
 
   // destroy and deallocate value
   void destroy_value(pointer_and_size value_and_size_ptr)
   {
     value_type::destroy(value_and_size_ptr.ptr);
-    allocator_traits_type::deallocate(m_aloc,
-        reinterpret_cast<char*>(value_and_size_ptr.ptr), value_and_size_ptr.size);
+    allocator_traits_type::deallocate(
+        m_aloc, reinterpret_cast<char*>(value_and_size_ptr.ptr),
+        value_and_size_ptr.size);
   }
 };
 
-template < typename ALLOCATOR_T, typename Dispatcher_T >
+template<typename ALLOCATOR_T, typename Dispatcher_T>
 class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
 {
   using allocator_traits_type = std::allocator_traits<ALLOCATOR_T>;
@@ -437,44 +443,46 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
       typename allocator_traits_type::propagate_on_container_copy_assignment;
   using propagate_on_container_move_assignment =
       typename allocator_traits_type::propagate_on_container_move_assignment;
-  using propagate_on_container_swap            =
+  using propagate_on_container_swap =
       typename allocator_traits_type::propagate_on_container_swap;
-  static_assert(std::is_same<typename allocator_traits_type::value_type, char>::value,
+  static_assert(
+      std::is_same<typename allocator_traits_type::value_type, char>::value,
       "WorkStorage expects an allocator for 'char's.");
+
 public:
-  using storage_policy = RAJA::ragged_array_of_objects;
+  using storage_policy  = RAJA::ragged_array_of_objects;
   using dispatcher_type = Dispatcher_T;
 
-  template < typename holder >
+  template<typename holder>
   using true_value_type = WorkStruct<sizeof(holder), dispatcher_type>;
 
-  using value_type = GenericWorkStruct<dispatcher_type>;
-  using allocator_type = ALLOCATOR_T;
-  using size_type = std::size_t;
+  using value_type      = GenericWorkStruct<dispatcher_type>;
+  using allocator_type  = ALLOCATOR_T;
+  using size_type       = std::size_t;
   using difference_type = std::ptrdiff_t;
-  using reference = value_type&;
+  using reference       = value_type&;
   using const_reference = const value_type&;
-  using pointer = value_type*;
-  using const_pointer = const value_type*;
+  using pointer         = value_type*;
+  using const_pointer   = const value_type*;
 
-  // iterator base class for accessing stored WorkStructs outside of the container
+  // iterator base class for accessing stored WorkStructs outside of the
+  // container
   struct const_iterator_base
   {
-    using value_type = const typename WorkStorage::value_type;
-    using pointer = typename WorkStorage::const_pointer;
-    using reference = typename WorkStorage::const_reference;
-    using difference_type = typename WorkStorage::difference_type;
+    using value_type        = const typename WorkStorage::value_type;
+    using pointer           = typename WorkStorage::const_pointer;
+    using reference         = typename WorkStorage::const_reference;
+    using difference_type   = typename WorkStorage::difference_type;
     using iterator_category = std::random_access_iterator_tag;
 
     const_iterator_base(const char* array_begin, const size_type* offset_iter)
-      : m_array_begin(array_begin)
-      , m_offset_iter(offset_iter)
-    { }
+        : m_array_begin(array_begin),
+          m_offset_iter(offset_iter)
+    {}
 
     RAJA_HOST_DEVICE reference operator*() const
     {
-      return *reinterpret_cast<pointer>(
-          m_array_begin + *m_offset_iter);
+      return *reinterpret_cast<pointer>(m_array_begin + *m_offset_iter);
     }
 
     RAJA_HOST_DEVICE const_iterator_base& operator+=(difference_type n)
@@ -484,19 +492,22 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
     }
 
     RAJA_HOST_DEVICE friend inline difference_type operator-(
-        const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter)
+        const_iterator_base const& lhs_iter,
+        const_iterator_base const& rhs_iter)
     {
       return lhs_iter.m_offset_iter - rhs_iter.m_offset_iter;
     }
 
     RAJA_HOST_DEVICE friend inline bool operator==(
-        const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter)
+        const_iterator_base const& lhs_iter,
+        const_iterator_base const& rhs_iter)
     {
       return lhs_iter.m_offset_iter == rhs_iter.m_offset_iter;
     }
 
     RAJA_HOST_DEVICE friend inline bool operator<(
-        const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter)
+        const_iterator_base const& lhs_iter,
+        const_iterator_base const& rhs_iter)
     {
       return lhs_iter.m_offset_iter < rhs_iter.m_offset_iter;
     }
@@ -508,31 +519,32 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
 
   using const_iterator = random_access_iterator<const_iterator_base>;
 
-
   explicit WorkStorage(allocator_type const& aloc)
-    : m_offsets(0, aloc)
-    , m_aloc(aloc)
-  { }
+      : m_offsets(0, aloc),
+        m_aloc(aloc)
+  {}
 
-  WorkStorage(WorkStorage const&) = delete;
+  WorkStorage(WorkStorage const&)            = delete;
   WorkStorage& operator=(WorkStorage const&) = delete;
 
   WorkStorage(WorkStorage&& rhs)
-    : m_offsets(std::move(rhs.m_offsets))
-    , m_array_begin(rhs.m_array_begin)
-    , m_array_end(rhs.m_array_end)
-    , m_array_cap(rhs.m_array_cap)
-    , m_aloc(std::move(rhs.m_aloc))
+      : m_offsets(std::move(rhs.m_offsets)),
+        m_array_begin(rhs.m_array_begin),
+        m_array_end(rhs.m_array_end),
+        m_array_cap(rhs.m_array_cap),
+        m_aloc(std::move(rhs.m_aloc))
   {
     rhs.m_array_begin = nullptr;
-    rhs.m_array_end = nullptr;
-    rhs.m_array_cap = nullptr;
+    rhs.m_array_end   = nullptr;
+    rhs.m_array_cap   = nullptr;
   }
 
   WorkStorage& operator=(WorkStorage&& rhs)
   {
-    if (this != &rhs) {
-      move_assign_private(std::move(rhs), propagate_on_container_move_assignment{});
+    if (this != &rhs)
+    {
+      move_assign_private(std::move(rhs),
+                          propagate_on_container_move_assignment {});
     }
     return *this;
   }
@@ -546,10 +558,7 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
   }
 
   // number of loops stored
-  size_type size() const
-  {
-    return m_offsets.size();
-  }
+  size_type size() const { return m_offsets.size(); }
 
   const_iterator begin() const
   {
@@ -562,17 +571,15 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
   }
 
   // number of bytes used for storage of loops
-  size_type storage_size() const
-  {
-    return m_array_end - m_array_begin;
-  }
+  size_type storage_size() const { return m_array_end - m_array_begin; }
 
-  template < typename holder, typename ... holder_ctor_args >
-  void emplace(const dispatcher_type* dispatcher, holder_ctor_args&&... ctor_args)
+  template<typename holder, typename... holder_ctor_args>
+  void emplace(const dispatcher_type* dispatcher,
+               holder_ctor_args&&... ctor_args)
   {
     size_type value_offset = storage_size();
-    size_type value_size   = create_value<holder>(value_offset,
-        dispatcher, std::forward<holder_ctor_args>(ctor_args)...);
+    size_type value_size   = create_value<holder>(
+        value_offset, dispatcher, std::forward<holder_ctor_args>(ctor_args)...);
     m_offsets.emplace_back(value_offset);
     m_array_end += value_size;
   }
@@ -581,21 +588,22 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
   void clear()
   {
     array_clear();
-    if (m_array_begin != nullptr) {
-      allocator_traits_type::deallocate(m_aloc, m_array_begin, storage_capacity());
+    if (m_array_begin != nullptr)
+    {
+      allocator_traits_type::deallocate(m_aloc, m_array_begin,
+                                        storage_capacity());
       m_array_begin = nullptr;
       m_array_end   = nullptr;
       m_array_cap   = nullptr;
     }
   }
 
-  ~WorkStorage()
-  {
-    clear();
-  }
+  ~WorkStorage() { clear(); }
 
 private:
-  RAJAVec<size_type, typename allocator_traits_type::template rebind_alloc<size_type>> m_offsets;
+  RAJAVec<size_type,
+          typename allocator_traits_type::template rebind_alloc<size_type>>
+      m_offsets;
   char* m_array_begin = nullptr;
   char* m_array_end   = nullptr;
   char* m_array_cap   = nullptr;
@@ -608,8 +616,8 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
 
     m_offsets     = std::move(rhs.m_offsets);
     m_array_begin = rhs.m_array_begin;
-    m_array_end   = rhs.m_array_end  ;
-    m_array_cap   = rhs.m_array_cap  ;
+    m_array_end   = rhs.m_array_end;
+    m_array_cap   = rhs.m_array_cap;
     m_aloc        = std::move(rhs.m_aloc);
 
     rhs.m_array_begin = nullptr;
@@ -621,25 +629,29 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
   void move_assign_private(WorkStorage&& rhs, std::false_type)
   {
     clear();
-    if (m_aloc == rhs.m_aloc) {
+    if (m_aloc == rhs.m_aloc)
+    {
 
       m_offsets     = std::move(rhs.m_offsets);
       m_array_begin = rhs.m_array_begin;
-      m_array_end   = rhs.m_array_end  ;
-      m_array_cap   = rhs.m_array_cap  ;
+      m_array_end   = rhs.m_array_end;
+      m_array_cap   = rhs.m_array_cap;
 
       rhs.m_array_begin = nullptr;
       rhs.m_array_end   = nullptr;
       rhs.m_array_cap   = nullptr;
-    } else {
+    }
+    else
+    {
       array_reserve(rhs.storage_size());
 
-      for (size_type i = 0; i < rhs.size(); ++i) {
+      for (size_type i = 0; i < rhs.size(); ++i)
+      {
         m_array_end = m_array_begin + rhs.m_offsets[i];
         move_destroy_value(m_array_end, rhs.m_array_begin + rhs.m_offsets[i]);
         m_offsets.emplace_back(rhs.m_offsets[i]);
       }
-      m_array_end = m_array_begin + rhs.storage_size();
+      m_array_end     = m_array_begin + rhs.storage_size();
       rhs.m_array_end = rhs.m_array_begin;
       rhs.m_offsets.clear();
       rhs.clear();
@@ -647,46 +659,45 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
   }
 
   // get loop storage capacity, used and unused in bytes
-  size_type storage_capacity() const
-  {
-    return m_array_cap - m_array_begin;
-  }
+  size_type storage_capacity() const { return m_array_cap - m_array_begin; }
 
   // get unused loop storage capacity in bytes
-  size_type storage_unused() const
-  {
-    return m_array_cap - m_array_end;
-  }
+  size_type storage_unused() const { return m_array_cap - m_array_end; }
 
   // reserve space for loop_storage_size bytes of loop storage
   void array_reserve(size_type loop_storage_size)
   {
-    if (loop_storage_size > storage_capacity()) {
+    if (loop_storage_size > storage_capacity())
+    {
 
       char* new_array_begin =
           allocator_traits_type::allocate(m_aloc, loop_storage_size);
-      char* new_array_end   = new_array_begin + storage_size();
-      char* new_array_cap   = new_array_begin + loop_storage_size;
+      char* new_array_end = new_array_begin + storage_size();
+      char* new_array_cap = new_array_begin + loop_storage_size;
 
-      for (size_type i = 0; i < size(); ++i) {
+      for (size_type i = 0; i < size(); ++i)
+      {
         move_destroy_value(new_array_begin + m_offsets[i],
-                             m_array_begin + m_offsets[i]);
+                           m_array_begin + m_offsets[i]);
       }
 
-      if (m_array_begin != nullptr) {
-        allocator_traits_type::deallocate(m_aloc, m_array_begin, storage_capacity());
+      if (m_array_begin != nullptr)
+      {
+        allocator_traits_type::deallocate(m_aloc, m_array_begin,
+                                          storage_capacity());
       }
 
       m_array_begin = new_array_begin;
-      m_array_end   = new_array_end  ;
-      m_array_cap   = new_array_cap  ;
+      m_array_end   = new_array_end;
+      m_array_cap   = new_array_cap;
     }
   }
 
   // destroy loop objects (does not deallocate array storage)
   void array_clear()
   {
-    while (!m_offsets.empty()) {
+    while (!m_offsets.empty())
+    {
       destroy_value(m_offsets.back());
       m_array_end = m_array_begin + m_offsets.back();
       m_offsets.pop_back();
@@ -696,15 +707,17 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
 
   // ensure there is enough storage to hold the next loop body at value offset
   // and store the loop body
-  template < typename holder, typename ... holder_ctor_args >
+  template<typename holder, typename... holder_ctor_args>
   size_type create_value(size_type value_offset,
                          const dispatcher_type* dispatcher,
                          holder_ctor_args&&... ctor_args)
   {
     const size_type value_size = sizeof(true_value_type<holder>);
 
-    if (value_size > storage_unused()) {
-      array_reserve(std::max(storage_size() + value_size, 2*storage_capacity()));
+    if (value_size > storage_unused())
+    {
+      array_reserve(
+          std::max(storage_size() + value_size, 2 * storage_capacity()));
     }
 
     pointer value_ptr = reinterpret_cast<pointer>(m_array_begin + value_offset);
@@ -726,13 +739,12 @@ class WorkStorage<RAJA::ragged_array_of_objects, ALLOCATOR_T, Dispatcher_T>
   // destroy the loop body at value offset
   void destroy_value(size_type value_offset)
   {
-    pointer value_ptr =
-        reinterpret_cast<pointer>(m_array_begin + value_offset);
+    pointer value_ptr = reinterpret_cast<pointer>(m_array_begin + value_offset);
     value_type::destroy(value_ptr);
   }
 };
 
-template < typename ALLOCATOR_T, typename Dispatcher_T >
+template<typename ALLOCATOR_T, typename Dispatcher_T>
 class WorkStorage<RAJA::constant_stride_array_of_objects,
                   ALLOCATOR_T,
                   Dispatcher_T>
@@ -742,39 +754,42 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
       typename allocator_traits_type::propagate_on_container_copy_assignment;
   using propagate_on_container_move_assignment =
       typename allocator_traits_type::propagate_on_container_move_assignment;
-  using propagate_on_container_swap            =
+  using propagate_on_container_swap =
       typename allocator_traits_type::propagate_on_container_swap;
-  static_assert(std::is_same<typename allocator_traits_type::value_type, char>::value,
+  static_assert(
+      std::is_same<typename allocator_traits_type::value_type, char>::value,
       "WorkStorage expects an allocator for 'char's.");
+
 public:
-  using storage_policy = RAJA::constant_stride_array_of_objects;
+  using storage_policy  = RAJA::constant_stride_array_of_objects;
   using dispatcher_type = Dispatcher_T;
 
-  template < typename holder >
+  template<typename holder>
   using true_value_type = WorkStruct<sizeof(holder), dispatcher_type>;
 
-  using value_type = GenericWorkStruct<dispatcher_type>;
-  using allocator_type = ALLOCATOR_T;
-  using size_type = std::size_t;
+  using value_type      = GenericWorkStruct<dispatcher_type>;
+  using allocator_type  = ALLOCATOR_T;
+  using size_type       = std::size_t;
   using difference_type = std::ptrdiff_t;
-  using reference = value_type&;
+  using reference       = value_type&;
   using const_reference = const value_type&;
-  using pointer = value_type*;
-  using const_pointer = const value_type*;
+  using pointer         = value_type*;
+  using const_pointer   = const value_type*;
 
-  // iterator base class for accessing stored WorkStructs outside of the container
+  // iterator base class for accessing stored WorkStructs outside of the
+  // container
   struct const_iterator_base
   {
-    using value_type = const typename WorkStorage::value_type;
-    using pointer = typename WorkStorage::const_pointer;
-    using reference = typename WorkStorage::const_reference;
-    using difference_type = typename WorkStorage::difference_type;
+    using value_type        = const typename WorkStorage::value_type;
+    using pointer           = typename WorkStorage::const_pointer;
+    using reference         = typename WorkStorage::const_reference;
+    using difference_type   = typename WorkStorage::difference_type;
     using iterator_category = std::random_access_iterator_tag;
 
     const_iterator_base(const char* array_pos, size_type stride)
-      : m_array_pos(array_pos)
-      , m_stride(stride)
-    { }
+        : m_array_pos(array_pos),
+          m_stride(stride)
+    {}
 
     RAJA_HOST_DEVICE reference operator*() const
     {
@@ -788,19 +803,22 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
     }
 
     RAJA_HOST_DEVICE friend inline difference_type operator-(
-        const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter)
+        const_iterator_base const& lhs_iter,
+        const_iterator_base const& rhs_iter)
     {
       return (lhs_iter.m_array_pos - rhs_iter.m_array_pos) / lhs_iter.m_stride;
     }
 
     RAJA_HOST_DEVICE friend inline bool operator==(
-        const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter)
+        const_iterator_base const& lhs_iter,
+        const_iterator_base const& rhs_iter)
     {
       return lhs_iter.m_array_pos == rhs_iter.m_array_pos;
     }
 
     RAJA_HOST_DEVICE friend inline bool operator<(
-        const_iterator_base const& lhs_iter, const_iterator_base const& rhs_iter)
+        const_iterator_base const& lhs_iter,
+        const_iterator_base const& rhs_iter)
     {
       return lhs_iter.m_array_pos < rhs_iter.m_array_pos;
     }
@@ -812,20 +830,17 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
 
   using const_iterator = random_access_iterator<const_iterator_base>;
 
+  explicit WorkStorage(allocator_type const& aloc) : m_aloc(aloc) {}
 
-  explicit WorkStorage(allocator_type const& aloc)
-    : m_aloc(aloc)
-  { }
-
-  WorkStorage(WorkStorage const&) = delete;
+  WorkStorage(WorkStorage const&)            = delete;
   WorkStorage& operator=(WorkStorage const&) = delete;
 
   WorkStorage(WorkStorage&& rhs)
-    : m_aloc(std::move(rhs.m_aloc))
-    , m_stride(rhs.m_stride)
-    , m_array_begin(rhs.m_array_begin)
-    , m_array_end(rhs.m_array_end)
-    , m_array_cap(rhs.m_array_cap)
+      : m_aloc(std::move(rhs.m_aloc)),
+        m_stride(rhs.m_stride),
+        m_array_begin(rhs.m_array_begin),
+        m_array_end(rhs.m_array_end),
+        m_array_cap(rhs.m_array_cap)
   {
     // do not reset stride, leave it for reuse
     rhs.m_array_begin = nullptr;
@@ -835,8 +850,10 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
 
   WorkStorage& operator=(WorkStorage&& rhs)
   {
-    if (this != &rhs) {
-      move_assign_private(std::move(rhs), propagate_on_container_move_assignment{});
+    if (this != &rhs)
+    {
+      move_assign_private(std::move(rhs),
+                          propagate_on_container_move_assignment {});
     }
     return *this;
   }
@@ -847,35 +864,28 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
   {
     size_type num_storage_loops =
         std::max(num_loops, (loop_storage_size + m_stride - 1) / m_stride);
-    array_reserve(num_storage_loops*m_stride, m_stride);
+    array_reserve(num_storage_loops * m_stride, m_stride);
   }
 
   // number of loops stored
-  size_type size() const
-  {
-    return storage_size() / m_stride;
-  }
+  size_type size() const { return storage_size() / m_stride; }
 
   const_iterator begin() const
   {
     return const_iterator(m_array_begin, m_stride);
   }
 
-  const_iterator end() const
-  {
-    return const_iterator(m_array_end, m_stride);
-  }
+  const_iterator end() const { return const_iterator(m_array_end, m_stride); }
 
   // amount of storage in bytes used to store loops
-  size_type storage_size() const
-  {
-    return m_array_end - m_array_begin;
-  }
+  size_type storage_size() const { return m_array_end - m_array_begin; }
 
-  template < typename holder, typename ... holder_ctor_args >
-  void emplace(const dispatcher_type* dispatcher, holder_ctor_args&&... ctor_args)
+  template<typename holder, typename... holder_ctor_args>
+  void emplace(const dispatcher_type* dispatcher,
+               holder_ctor_args&&... ctor_args)
   {
-    create_value<holder>(dispatcher, std::forward<holder_ctor_args>(ctor_args)...);
+    create_value<holder>(dispatcher,
+                         std::forward<holder_ctor_args>(ctor_args)...);
     m_array_end += m_stride;
   }
 
@@ -883,22 +893,21 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
   void clear()
   {
     array_clear();
-    if (m_array_begin != nullptr) {
-      allocator_traits_type::deallocate(m_aloc, m_array_begin, storage_capacity());
+    if (m_array_begin != nullptr)
+    {
+      allocator_traits_type::deallocate(m_aloc, m_array_begin,
+                                        storage_capacity());
       m_array_begin = nullptr;
       m_array_end   = nullptr;
       m_array_cap   = nullptr;
     }
   }
 
-  ~WorkStorage()
-  {
-    clear();
-  }
+  ~WorkStorage() { clear(); }
 
 private:
   allocator_type m_aloc;
-  size_type m_stride     = 1; // can't be 0 because size divides stride
+  size_type m_stride  = 1;  // can't be 0 because size divides stride
   char* m_array_begin = nullptr;
   char* m_array_end   = nullptr;
   char* m_array_cap   = nullptr;
@@ -909,10 +918,10 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
     clear();
 
     m_aloc        = std::move(rhs.m_aloc);
-    m_stride      = rhs.m_stride     ;
+    m_stride      = rhs.m_stride;
     m_array_begin = rhs.m_array_begin;
-    m_array_end   = rhs.m_array_end  ;
-    m_array_cap   = rhs.m_array_cap  ;
+    m_array_end   = rhs.m_array_end;
+    m_array_cap   = rhs.m_array_cap;
 
     // do not reset stride, leave it for reuse
     rhs.m_array_begin = nullptr;
@@ -924,23 +933,27 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
   void move_assign_private(WorkStorage&& rhs, std::false_type)
   {
     clear();
-    if (m_aloc == rhs.m_aloc) {
+    if (m_aloc == rhs.m_aloc)
+    {
 
-      m_stride      = rhs.m_stride     ;
+      m_stride      = rhs.m_stride;
       m_array_begin = rhs.m_array_begin;
-      m_array_end   = rhs.m_array_end  ;
-      m_array_cap   = rhs.m_array_cap  ;
+      m_array_end   = rhs.m_array_end;
+      m_array_cap   = rhs.m_array_cap;
 
       // do not reset stride, leave it for reuse
       rhs.m_array_begin = nullptr;
       rhs.m_array_end   = nullptr;
       rhs.m_array_cap   = nullptr;
-    } else {
+    }
+    else
+    {
 
       m_stride = rhs.m_stride;
       array_reserve(rhs.storage_size(), rhs.m_stride);
 
-      for (size_type i = 0; i < rhs.size(); ++i) {
+      for (size_type i = 0; i < rhs.size(); ++i)
+      {
         move_destroy_value(m_array_end, rhs.m_array_begin + i * rhs.m_stride);
         m_array_end += m_stride;
       }
@@ -950,16 +963,10 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
   }
 
   // storage capacity, used and unused, in bytes
-  size_type storage_capacity() const
-  {
-    return m_array_cap - m_array_begin;
-  }
+  size_type storage_capacity() const { return m_array_cap - m_array_begin; }
 
   // unused storage capacity in bytes
-  size_type storage_unused() const
-  {
-    return m_array_cap - m_array_end;
-  }
+  size_type storage_unused() const { return m_array_cap - m_array_end; }
 
   // allocate enough storage for loop_storage_size bytes with
   // each loop body separated by new_stride bytes
@@ -968,33 +975,39 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
   // Note that loop_storage_size must be a multiple of new_stride
   void array_reserve(size_type loop_storage_size, size_type new_stride)
   {
-    if (loop_storage_size > storage_capacity() || new_stride > m_stride) {
+    if (loop_storage_size > storage_capacity() || new_stride > m_stride)
+    {
 
       char* new_array_begin =
           allocator_traits_type::allocate(m_aloc, loop_storage_size);
-      char* new_array_end   = new_array_begin + size() * new_stride;
-      char* new_array_cap   = new_array_begin + loop_storage_size;
+      char* new_array_end = new_array_begin + size() * new_stride;
+      char* new_array_cap = new_array_begin + loop_storage_size;
 
-      for (size_type i = 0; i < size(); ++i) {
+      for (size_type i = 0; i < size(); ++i)
+      {
         move_destroy_value(new_array_begin + i * new_stride,
-                             m_array_begin + i *   m_stride);
+                           m_array_begin + i * m_stride);
       }
 
-      if (m_array_begin != nullptr) {
-        allocator_traits_type::deallocate(m_aloc, m_array_begin, storage_capacity());
+      if (m_array_begin != nullptr)
+      {
+        allocator_traits_type::deallocate(m_aloc, m_array_begin,
+                                          storage_capacity());
       }
 
-      m_stride      = new_stride     ;
+      m_stride      = new_stride;
       m_array_begin = new_array_begin;
-      m_array_end   = new_array_end  ;
-      m_array_cap   = new_array_cap  ;
+      m_array_end   = new_array_end;
+      m_array_cap   = new_array_cap;
     }
   }
 
   // destroy the loops in storage (does not deallocate loop storage)
   void array_clear()
   {
-    for (size_type value_offset = storage_size(); value_offset > 0; value_offset -= m_stride) {
+    for (size_type value_offset = storage_size(); value_offset > 0;
+         value_offset -= m_stride)
+    {
       destroy_value(value_offset - m_stride);
       m_array_end -= m_stride;
     }
@@ -1002,18 +1015,20 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
 
   // ensure there is enough storage to store the loop body
   // and construct the body in storage.
-  template < typename holder, typename ... holder_ctor_args >
+  template<typename holder, typename... holder_ctor_args>
   void create_value(const dispatcher_type* dispatcher,
                     holder_ctor_args&&... ctor_args)
   {
     const size_type value_size = sizeof(true_value_type<holder>);
 
-    if (value_size > storage_unused() && value_size <= m_stride) {
-      array_reserve(std::max(storage_size() + m_stride, 2*storage_capacity()),
+    if (value_size > storage_unused() && value_size <= m_stride)
+    {
+      array_reserve(std::max(storage_size() + m_stride, 2 * storage_capacity()),
                     m_stride);
-    } else if (value_size > m_stride) {
-      array_reserve((size()+1)*value_size,
-                    value_size);
+    }
+    else if (value_size > m_stride)
+    {
+      array_reserve((size() + 1) * value_size, value_size);
     }
 
     size_type value_offset = storage_size();
@@ -1025,8 +1040,7 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
 
   // move construct the loop body in value from other and
   // destroy the loop body in other
-  void move_destroy_value(char* value_ptr,
-                          char* other_value_ptr)
+  void move_destroy_value(char* value_ptr, char* other_value_ptr)
   {
     value_type::move_destroy(reinterpret_cast<pointer>(value_ptr),
                              reinterpret_cast<pointer>(other_value_ptr));
@@ -1035,8 +1049,7 @@ class WorkStorage<RAJA::constant_stride_array_of_objects,
   // destroy the loop body at value offset
   void destroy_value(size_type value_offset)
   {
-    pointer value_ptr =
-        reinterpret_cast<pointer>(m_array_begin + value_offset);
+    pointer value_ptr = reinterpret_cast<pointer>(m_array_begin + value_offset);
     value_type::destroy(value_ptr);
   }
 };
diff --git a/include/RAJA/pattern/WorkGroup/WorkStruct.hpp b/include/RAJA/pattern/WorkGroup/WorkStruct.hpp
index 72e1540c54..0c799efd18 100644
--- a/include/RAJA/pattern/WorkGroup/WorkStruct.hpp
+++ b/include/RAJA/pattern/WorkGroup/WorkStruct.hpp
@@ -25,7 +25,6 @@
 
 #include "RAJA/pattern/WorkGroup/Dispatcher.hpp"
 
-
 namespace RAJA
 {
 
@@ -35,7 +34,7 @@ namespace detail
 /*!
  * A struct that gives a generic way to layout memory for different loops
  */
-template < size_t size, typename Dispatcher_T >
+template<size_t size, typename Dispatcher_T>
 struct WorkStruct;
 
 /*!
@@ -44,67 +43,76 @@ struct WorkStruct;
  *   offsetof(GenericWorkStruct<>, obj) == offsetof(WorkStruct<size>, obj)
  *   sizeof(GenericWorkStruct) <= sizeof(WorkStruct<size>)
  */
-template < typename Dispatcher_T >
+template<typename Dispatcher_T>
 using GenericWorkStruct = WorkStruct<RAJA_MAX_ALIGN, Dispatcher_T>;
 
-template < size_t size, Platform platform, typename dispatch_policy, typename DispatcherID, typename ... CallArgs >
-struct WorkStruct<size, Dispatcher<platform, dispatch_policy, DispatcherID, CallArgs...>>
+template<size_t size,
+         Platform platform,
+         typename dispatch_policy,
+         typename DispatcherID,
+         typename... CallArgs>
+struct WorkStruct<
+    size,
+    Dispatcher<platform, dispatch_policy, DispatcherID, CallArgs...>>
 {
-  using dispatcher_type = Dispatcher<platform, dispatch_policy, DispatcherID, CallArgs...>;
+  using dispatcher_type =
+      Dispatcher<platform, dispatch_policy, DispatcherID, CallArgs...>;
 
   // construct a WorkStruct with a value of type holder from the args and
   // check a variety of constraints at compile time
-  template < typename holder, typename ... holder_ctor_args >
-  static RAJA_INLINE
-  void construct(void* ptr, const dispatcher_type* dispatcher, holder_ctor_args&&... ctor_args)
+  template<typename holder, typename... holder_ctor_args>
+  static RAJA_INLINE void construct(void* ptr,
+                                    const dispatcher_type* dispatcher,
+                                    holder_ctor_args&&... ctor_args)
   {
     using true_value_type = WorkStruct<sizeof(holder), dispatcher_type>;
-    using value_type = GenericWorkStruct<dispatcher_type>;
+    using value_type      = GenericWorkStruct<dispatcher_type>;
 
     static_assert(sizeof(holder) <= sizeof(true_value_type::obj),
-        "holder must fit in WorkStruct::obj");
+                  "holder must fit in WorkStruct::obj");
     static_assert(std::is_standard_layout<true_value_type>::value,
-        "WorkStruct must be a standard layout type");
+                  "WorkStruct must be a standard layout type");
     static_assert(std::is_standard_layout<value_type>::value,
-        "GenericWorkStruct must be a standard layout type");
-    static_assert(offsetof(value_type, obj) == offsetof(true_value_type, obj),
+                  "GenericWorkStruct must be a standard layout type");
+    static_assert(
+        offsetof(value_type, obj) == offsetof(true_value_type, obj),
         "WorkStruct and GenericWorkStruct must have obj at the same offset");
     static_assert(sizeof(value_type) <= sizeof(true_value_type),
-        "WorkStruct must not be smaller than GenericWorkStruct");
+                  "WorkStruct must not be smaller than GenericWorkStruct");
     true_value_type* value_ptr = static_cast<true_value_type*>(ptr);
 
     value_ptr->dispatcher = dispatcher;
-    value_ptr->invoke = dispatcher->invoke;
-    new(&value_ptr->obj) holder(std::forward<holder_ctor_args>(ctor_args)...);
+    value_ptr->invoke     = dispatcher->invoke;
+    new (&value_ptr->obj) holder(std::forward<holder_ctor_args>(ctor_args)...);
   }
 
   // move construct in dst from the value in src and destroy the value in src
-  static RAJA_INLINE
-  void move_destroy(WorkStruct* value_dst,
-                    WorkStruct* value_src)
+  static RAJA_INLINE void move_destroy(WorkStruct* value_dst,
+                                       WorkStruct* value_src)
   {
     value_dst->dispatcher = value_src->dispatcher;
-    value_dst->invoke = value_src->invoke;
-    value_dst->dispatcher->move_construct_destroy(&value_dst->obj, &value_src->obj);
+    value_dst->invoke     = value_src->invoke;
+    value_dst->dispatcher->move_construct_destroy(&value_dst->obj,
+                                                  &value_src->obj);
   }
 
   // destroy the value ptr
-  static RAJA_INLINE
-  void destroy(WorkStruct* value_ptr)
+  static RAJA_INLINE void destroy(WorkStruct* value_ptr)
   {
     value_ptr->dispatcher->destroy(&value_ptr->obj);
   }
 
   // invoke the call operator of the value ptr with args
-  static RAJA_INLINE
-  void host_call(const WorkStruct* value_ptr, CallArgs... args)
+  static RAJA_INLINE void host_call(const WorkStruct* value_ptr,
+                                    CallArgs... args)
   {
     value_ptr->invoke(&value_ptr->obj, std::forward<CallArgs>(args)...);
   }
+
   ///
   // invoke the call operator of the value ptr with args
-  static RAJA_DEVICE RAJA_INLINE
-  void device_call(const WorkStruct* value_ptr, CallArgs... args)
+  static RAJA_DEVICE RAJA_INLINE void device_call(const WorkStruct* value_ptr,
+                                                  CallArgs... args)
   {
     value_ptr->invoke(&value_ptr->obj, std::forward<CallArgs>(args)...);
   }
diff --git a/include/RAJA/pattern/atomic.hpp b/include/RAJA/pattern/atomic.hpp
index d5905f7928..846a45fc94 100644
--- a/include/RAJA/pattern/atomic.hpp
+++ b/include/RAJA/pattern/atomic.hpp
@@ -86,26 +86,24 @@ namespace RAJA
  * @return Value at acc
  */
 RAJA_SUPPRESS_HD_WARN
-template <typename Policy, typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicLoad(T *acc)
+template<typename Policy, typename T>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicLoad(T* acc)
 {
-  return RAJA::atomicLoad(Policy{}, acc);
+  return RAJA::atomicLoad(Policy {}, acc);
 }
 
-
 /*!
  * @brief Atomic store
  * @param acc Pointer to location of value
  * @param value Value to store at *acc
  */
 RAJA_SUPPRESS_HD_WARN
-template <typename Policy, typename T>
-RAJA_INLINE RAJA_HOST_DEVICE void atomicStore(T *acc, T value)
+template<typename Policy, typename T>
+RAJA_INLINE RAJA_HOST_DEVICE void atomicStore(T* acc, T value)
 {
-  RAJA::atomicStore(Policy{}, acc, value);
+  RAJA::atomicStore(Policy {}, acc, value);
 }
 
-
 /*!
  * @brief Atomic add
  * @param acc Pointer to location of result value
@@ -113,13 +111,12 @@ RAJA_INLINE RAJA_HOST_DEVICE void atomicStore(T *acc, T value)
  * @return Returns value at acc immediately before this operation completed
  */
 RAJA_SUPPRESS_HD_WARN
-template <typename Policy, typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(T *acc, T value)
+template<typename Policy, typename T>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(T* acc, T value)
 {
-  return RAJA::atomicAdd(Policy{}, acc, value);
+  return RAJA::atomicAdd(Policy {}, acc, value);
 }
 
-
 /*!
  * @brief Atomic subtract
  * @param acc Pointer to location of result value
@@ -127,13 +124,12 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(T *acc, T value)
  * @return Returns value at acc immediately before this operation completed
  */
 RAJA_SUPPRESS_HD_WARN
-template <typename Policy, typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicSub(T *acc, T value)
+template<typename Policy, typename T>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicSub(T* acc, T value)
 {
-  return RAJA::atomicSub(Policy{}, acc, value);
+  return RAJA::atomicSub(Policy {}, acc, value);
 }
 
-
 /*!
  * @brief Atomic minimum equivalent to (*acc) = std::min(*acc, value)
  * @param acc Pointer to location of result value
@@ -141,13 +137,12 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicSub(T *acc, T value)
  * @return Returns value at acc immediately before this operation completed
  */
 RAJA_SUPPRESS_HD_WARN
-template <typename Policy, typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicMin(T *acc, T value)
+template<typename Policy, typename T>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicMin(T* acc, T value)
 {
-  return RAJA::atomicMin(Policy{}, acc, value);
+  return RAJA::atomicMin(Policy {}, acc, value);
 }
 
-
 /*!
  * @brief Atomic maximum equivalent to (*acc) = std::max(*acc, value)
  * @param acc Pointer to location of result value
@@ -155,26 +150,24 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicMin(T *acc, T value)
  * @return Returns value at acc immediately before this operation completed
  */
 RAJA_SUPPRESS_HD_WARN
-template <typename Policy, typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicMax(T *acc, T value)
+template<typename Policy, typename T>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicMax(T* acc, T value)
 {
-  return RAJA::atomicMax(Policy{}, acc, value);
+  return RAJA::atomicMax(Policy {}, acc, value);
 }
 
-
 /*!
  * @brief Atomic increment
  * @param acc Pointer to location of value to increment
  * @return Returns value at acc immediately before this operation completed
  */
 RAJA_SUPPRESS_HD_WARN
-template <typename Policy, typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T *acc)
+template<typename Policy, typename T>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T* acc)
 {
-  return RAJA::atomicInc(Policy{}, acc);
+  return RAJA::atomicInc(Policy {}, acc);
 }
 
-
 /*!
  * @brief Atomic increment with bound
  * Equivalent to *acc = ((*acc >= compare) ? 0 : ((*acc)+1))
@@ -184,26 +177,24 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T *acc)
  * @return Returns value at acc immediately before this operation completed
  */
 RAJA_SUPPRESS_HD_WARN
-template <typename Policy, typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T *acc, T compare)
+template<typename Policy, typename T>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(T* acc, T compare)
 {
-  return RAJA::atomicInc(Policy{}, acc, compare);
+  return RAJA::atomicInc(Policy {}, acc, compare);
 }
 
-
 /*!
  * @brief Atomic decrement
  * @param acc Pointer to location of value to decrement
  * @return Returns value at acc immediately before this operation completed
  */
 RAJA_SUPPRESS_HD_WARN
-template <typename Policy, typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(T *acc)
+template<typename Policy, typename T>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(T* acc)
 {
-  return RAJA::atomicDec(Policy{}, acc);
+  return RAJA::atomicDec(Policy {}, acc);
 }
 
-
 /*!
  * @brief Atomic decrement with bound
  * Equivalent to *acc = (((*acc==0)|(*acc>compare))?compare:((*acc)-1))
@@ -213,13 +204,12 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(T *acc)
  * @return Returns value at acc immediately before this operation completed
  */
 RAJA_SUPPRESS_HD_WARN
-template <typename Policy, typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(T *acc, T compare)
+template<typename Policy, typename T>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(T* acc, T compare)
 {
-  return RAJA::atomicDec(Policy{}, acc, compare);
+  return RAJA::atomicDec(Policy {}, acc, compare);
 }
 
-
 /*!
  * @brief Atomic bitwise AND equivalent to (*acc) = (*acc) & value
  * This only works with integral data types
@@ -228,15 +218,14 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(T *acc, T compare)
  * @return Returns value at acc immediately before this operation completed
  */
 RAJA_SUPPRESS_HD_WARN
-template <typename Policy, typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicAnd(T *acc, T value)
+template<typename Policy, typename T>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicAnd(T* acc, T value)
 {
   static_assert(std::is_integral<T>::value,
                 "atomicAnd can only be used on integral types");
-  return RAJA::atomicAnd(Policy{}, acc, value);
+  return RAJA::atomicAnd(Policy {}, acc, value);
 }
 
-
 /*!
  * @brief Atomic bitwise OR equivalent to (*acc) = (*acc) | value
  * This only works with integral data types
@@ -245,15 +234,14 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicAnd(T *acc, T value)
  * @return Returns value at acc immediately before this operation completed
  */
 RAJA_SUPPRESS_HD_WARN
-template <typename Policy, typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicOr(T *acc, T value)
+template<typename Policy, typename T>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicOr(T* acc, T value)
 {
   static_assert(std::is_integral<T>::value,
                 "atomicOr can only be used on integral types");
-  return RAJA::atomicOr(Policy{}, acc, value);
+  return RAJA::atomicOr(Policy {}, acc, value);
 }
 
-
 /*!
  * @brief Atomic bitwise XOR equivalent to (*acc) = (*acc) ^ value
  * This only works with integral data types
@@ -262,15 +250,14 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicOr(T *acc, T value)
  * @return Returns value at acc immediately before this operation completed
  */
 RAJA_SUPPRESS_HD_WARN
-template <typename Policy, typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicXor(T *acc, T value)
+template<typename Policy, typename T>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicXor(T* acc, T value)
 {
   static_assert(std::is_integral<T>::value,
                 "atomicXor can only be used on integral types");
-  return RAJA::atomicXor(Policy{}, acc, value);
+  return RAJA::atomicXor(Policy {}, acc, value);
 }
 
-
 /*!
  * @brief Atomic value exchange
  * @param acc Pointer to location to store value
@@ -278,13 +265,12 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicXor(T *acc, T value)
  * @return Returns value at *acc immediately before this operation completed
  */
 RAJA_SUPPRESS_HD_WARN
-template <typename Policy, typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicExchange(T *acc, T value)
+template<typename Policy, typename T>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicExchange(T* acc, T value)
 {
-  return RAJA::atomicExchange(Policy{}, acc, value);
+  return RAJA::atomicExchange(Policy {}, acc, value);
 }
 
-
 /*!
  * @brief Atomic compare and swap
  * @param acc Pointer to location to store value
@@ -294,10 +280,10 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicExchange(T *acc, T value)
  */
 
 RAJA_SUPPRESS_HD_WARN
-template <typename Policy, typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicCAS(T *acc, T compare, T value)
+template<typename Policy, typename T>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicCAS(T* acc, T compare, T value)
 {
-  return RAJA::atomicCAS(Policy{}, acc, compare, value);
+  return RAJA::atomicCAS(Policy {}, acc, compare, value);
 }
 
 /*!
@@ -309,32 +295,32 @@ RAJA_INLINE RAJA_HOST_DEVICE T atomicCAS(T *acc, T compare, T value)
  * This object provides an OO interface to the global function calls provided
  * as RAJA::atomicXXX
  */
-template <typename T, typename Policy = auto_atomic>
+template<typename T, typename Policy = auto_atomic>
 class AtomicRef
 {
 public:
   using value_type = T;
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
-  constexpr explicit AtomicRef(value_type *value_ptr)
-      : m_value_ptr(value_ptr) {}
+  constexpr explicit AtomicRef(value_type* value_ptr) : m_value_ptr(value_ptr)
+  {}
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
-  constexpr AtomicRef(AtomicRef const &c)
-      : m_value_ptr(c.m_value_ptr) {}
+  constexpr AtomicRef(AtomicRef const& c) : m_value_ptr(c.m_value_ptr) {}
 
   AtomicRef& operator=(AtomicRef const&) = delete;
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
-  value_type * getPointer() const
-  {
-    return m_value_ptr;
-  }
+  value_type* getPointer() const { return m_value_ptr; }
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
   void store(value_type rhs) const
   {
@@ -342,6 +328,7 @@ class AtomicRef
   }
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
   value_type operator=(value_type rhs) const
   {
@@ -350,20 +337,17 @@ class AtomicRef
   }
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
-  value_type load() const
-  {
-    return RAJA::atomicLoad<Policy>(m_value_ptr);
-  }
+  value_type load() const { return RAJA::atomicLoad<Policy>(m_value_ptr); }
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
-  operator value_type() const
-  {
-    return RAJA::atomicLoad<Policy>(m_value_ptr);
-  }
+  operator value_type() const { return RAJA::atomicLoad<Policy>(m_value_ptr); }
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
   value_type exchange(value_type rhs) const
   {
@@ -371,6 +355,7 @@ class AtomicRef
   }
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
   value_type CAS(value_type compare, value_type rhs) const
   {
@@ -378,20 +363,25 @@ class AtomicRef
   }
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
   bool compare_exchange_strong(value_type& expect, value_type rhs) const
   {
     value_type compare = expect;
-    value_type old = RAJA::atomicCAS<Policy>(m_value_ptr, compare, rhs);
-    if (compare == old) {
+    value_type old     = RAJA::atomicCAS<Policy>(m_value_ptr, compare, rhs);
+    if (compare == old)
+    {
       return true;
-    } else {
+    }
+    else
+    {
       expect = old;
       return false;
     }
   }
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
   bool compare_exchange_weak(value_type& expect, value_type rhs) const
   {
@@ -399,6 +389,7 @@ class AtomicRef
   }
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
   value_type operator++() const
   {
@@ -406,6 +397,7 @@ class AtomicRef
   }
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
   value_type operator++(int) const
   {
@@ -413,6 +405,7 @@ class AtomicRef
   }
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
   value_type operator--() const
   {
@@ -420,6 +413,7 @@ class AtomicRef
   }
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
   value_type operator--(int) const
   {
@@ -427,6 +421,7 @@ class AtomicRef
   }
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
   value_type fetch_add(value_type rhs) const
   {
@@ -434,6 +429,7 @@ class AtomicRef
   }
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
   value_type operator+=(value_type rhs) const
   {
@@ -441,6 +437,7 @@ class AtomicRef
   }
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
   value_type fetch_sub(value_type rhs) const
   {
@@ -448,6 +445,7 @@ class AtomicRef
   }
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
   value_type operator-=(value_type rhs) const
   {
@@ -455,6 +453,7 @@ class AtomicRef
   }
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
   value_type fetch_min(value_type rhs) const
   {
@@ -462,6 +461,7 @@ class AtomicRef
   }
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
   value_type min(value_type rhs) const
   {
@@ -470,6 +470,7 @@ class AtomicRef
   }
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
   value_type fetch_max(value_type rhs) const
   {
@@ -477,6 +478,7 @@ class AtomicRef
   }
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
   value_type max(value_type rhs) const
   {
@@ -485,6 +487,7 @@ class AtomicRef
   }
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
   value_type fetch_and(value_type rhs) const
   {
@@ -492,6 +495,7 @@ class AtomicRef
   }
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
   value_type operator&=(value_type rhs) const
   {
@@ -499,6 +503,7 @@ class AtomicRef
   }
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
   value_type fetch_or(value_type rhs) const
   {
@@ -506,6 +511,7 @@ class AtomicRef
   }
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
   value_type operator|=(value_type rhs) const
   {
@@ -513,6 +519,7 @@ class AtomicRef
   }
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
   value_type fetch_xor(value_type rhs) const
   {
@@ -520,6 +527,7 @@ class AtomicRef
   }
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
   value_type operator^=(value_type rhs) const
   {
@@ -527,7 +535,7 @@ class AtomicRef
   }
 
 private:
-  value_type *m_value_ptr;
+  value_type* m_value_ptr;
 };
 
 
diff --git a/include/RAJA/pattern/detail/algorithm.hpp b/include/RAJA/pattern/detail/algorithm.hpp
index 21d266bd21..634cf70ce4 100644
--- a/include/RAJA/pattern/detail/algorithm.hpp
+++ b/include/RAJA/pattern/detail/algorithm.hpp
@@ -32,47 +32,45 @@ namespace RAJA
 namespace detail
 {
 
-template <typename Iter>
+template<typename Iter>
 using IterVal = typename ::std::iterator_traits<Iter>::value_type;
 
-template <typename Iter>
+template<typename Iter>
 using IterRef = typename ::std::iterator_traits<Iter>::reference;
 
-template <typename Iter>
+template<typename Iter>
 using IterDiff = typename ::std::iterator_traits<Iter>::difference_type;
 
-template <typename Container>
+template<typename Container>
 using ContainerIter = camp::iterator_from<Container>;
 
-template <typename Container>
+template<typename Container>
 using ContainerVal =
     camp::decay<decltype(*camp::val<camp::iterator_from<Container>>())>;
 
-template <typename Container>
-using ContainerRef =
-    decltype(*camp::val<camp::iterator_from<Container>>());
+template<typename Container>
+using ContainerRef = decltype(*camp::val<camp::iterator_from<Container>>());
 
-template <typename Container>
+template<typename Container>
 using ContainerDiff =
-    camp::decay<decltype(camp::val<camp::iterator_from<Container>>()-camp::val<camp::iterator_from<Container>>())>;
+    camp::decay<decltype(camp::val<camp::iterator_from<Container>>() -
+                         camp::val<camp::iterator_from<Container>>())>;
 
-template <typename DiffType, typename CountType>
-RAJA_INLINE
-DiffType firstIndex(DiffType n, CountType num_threads, CountType thread_id)
+template<typename DiffType, typename CountType>
+RAJA_INLINE DiffType firstIndex(DiffType n,
+                                CountType num_threads,
+                                CountType thread_id)
 {
   return (static_cast<size_t>(n) * thread_id) / num_threads;
 }
 
 }  // end namespace detail
 
-
 /*!
     \brief swap values at iterators lhs and rhs
 */
-template <typename Iter>
-RAJA_HOST_DEVICE RAJA_INLINE
-void
-safe_iter_swap(Iter lhs, Iter rhs)
+template<typename Iter>
+RAJA_HOST_DEVICE RAJA_INLINE void safe_iter_swap(Iter lhs, Iter rhs)
 {
 #ifdef RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE
   using camp::safe_swap;
@@ -86,10 +84,8 @@ safe_iter_swap(Iter lhs, Iter rhs)
 /*!
     \brief returns iterator to next item
 */
-template <typename Iter>
-RAJA_HOST_DEVICE RAJA_INLINE
-Iter
-next(Iter it)
+template<typename Iter>
+RAJA_HOST_DEVICE RAJA_INLINE Iter next(Iter it)
 {
   ++it;
   return it;
@@ -98,10 +94,8 @@ next(Iter it)
 /*!
     \brief returns iterator to next item
 */
-template <typename Iter>
-RAJA_HOST_DEVICE RAJA_INLINE
-Iter
-prev(Iter it)
+template<typename Iter>
+RAJA_HOST_DEVICE RAJA_INLINE Iter prev(Iter it)
 {
   --it;
   return it;
diff --git a/include/RAJA/pattern/detail/forall.hpp b/include/RAJA/pattern/detail/forall.hpp
index 3bd5d7ecaf..aa9a3ac888 100644
--- a/include/RAJA/pattern/detail/forall.hpp
+++ b/include/RAJA/pattern/detail/forall.hpp
@@ -19,12 +19,12 @@
 #ifndef RAJA_PATTERN_DETAIL_FORALL_HPP
 #define RAJA_PATTERN_DETAIL_FORALL_HPP
 
-#define RAJA_EXTRACT_BED_SUFFIXED(CONTAINER, SUFFIX) \
-  using std::begin;                                  \
-  using std::end;                                    \
-  using std::distance;                               \
-  auto begin##SUFFIX = begin(CONTAINER);             \
-  auto end##SUFFIX = end(CONTAINER);                 \
+#define RAJA_EXTRACT_BED_SUFFIXED(CONTAINER, SUFFIX)                           \
+  using std::begin;                                                            \
+  using std::end;                                                              \
+  using std::distance;                                                         \
+  auto begin##SUFFIX    = begin(CONTAINER);                                    \
+  auto end##SUFFIX      = end(CONTAINER);                                      \
   auto distance##SUFFIX = distance(begin##SUFFIX, end##SUFFIX)
 
 #define RAJA_EXTRACT_BED_IT(CONTAINER) RAJA_EXTRACT_BED_SUFFIXED(CONTAINER, _it)
diff --git a/include/RAJA/pattern/detail/multi_reduce.hpp b/include/RAJA/pattern/detail/multi_reduce.hpp
index 884b9aa989..7e2cdabf7a 100644
--- a/include/RAJA/pattern/detail/multi_reduce.hpp
+++ b/include/RAJA/pattern/detail/multi_reduce.hpp
@@ -26,32 +26,29 @@
 #include "RAJA/util/RepeatView.hpp"
 
 
-#define RAJA_DECLARE_MULTI_REDUCER(OP_NAME, OP, POL, DATA)    \
-  template <typename tuning, typename T>                      \
-  struct MultiReduce##OP_NAME<POL<tuning>, T>                 \
-      : reduce::detail::BaseMultiReduce##OP_NAME<             \
-            DATA<T, RAJA::reduce::OP<T>, tuning>>             \
-  {                                                           \
-    using policy = POL<tuning>;                               \
-    using Base = reduce::detail::BaseMultiReduce##OP_NAME<    \
-        DATA<T, RAJA::reduce::OP<T>, tuning>>;                \
-    using Base::Base;                                         \
-    using typename Base::value_type;                          \
-    using typename Base::reference;                           \
-                                                              \
-    RAJA_SUPPRESS_HD_WARN                                     \
-    RAJA_HOST_DEVICE                                          \
-    reference operator[](size_t bin) const                    \
-    {                                                         \
-      return reference(*this, bin);                           \
-    }                                                         \
+#define RAJA_DECLARE_MULTI_REDUCER(OP_NAME, OP, POL, DATA)                     \
+  template<typename tuning, typename T>                                        \
+  struct MultiReduce##OP_NAME<POL<tuning>, T>                                  \
+      : reduce::detail::BaseMultiReduce##OP_NAME<                              \
+            DATA<T, RAJA::reduce::OP<T>, tuning>>                              \
+  {                                                                            \
+    using policy = POL<tuning>;                                                \
+    using Base   = reduce::detail::BaseMultiReduce##OP_NAME<                   \
+        DATA<T, RAJA::reduce::OP<T>, tuning>>;                               \
+    using Base::Base;                                                          \
+    using typename Base::value_type;                                           \
+    using typename Base::reference;                                            \
+                                                                               \
+    RAJA_SUPPRESS_HD_WARN                                                      \
+    RAJA_HOST_DEVICE                                                           \
+    reference operator[](size_t bin) const { return reference(*this, bin); }   \
   };
 
-#define RAJA_DECLARE_ALL_MULTI_REDUCERS(POL, DATA)            \
-  RAJA_DECLARE_MULTI_REDUCER(Sum, sum, POL, DATA)             \
-  RAJA_DECLARE_MULTI_REDUCER(Min, min, POL, DATA)             \
-  RAJA_DECLARE_MULTI_REDUCER(Max, max, POL, DATA)             \
-  RAJA_DECLARE_MULTI_REDUCER(BitOr, or_bit, POL, DATA)        \
+#define RAJA_DECLARE_ALL_MULTI_REDUCERS(POL, DATA)                             \
+  RAJA_DECLARE_MULTI_REDUCER(Sum, sum, POL, DATA)                              \
+  RAJA_DECLARE_MULTI_REDUCER(Min, min, POL, DATA)                              \
+  RAJA_DECLARE_MULTI_REDUCER(Max, max, POL, DATA)                              \
+  RAJA_DECLARE_MULTI_REDUCER(BitOr, or_bit, POL, DATA)                         \
   RAJA_DECLARE_MULTI_REDUCER(BitAnd, and_bit, POL, DATA)
 
 namespace RAJA
@@ -63,36 +60,40 @@ namespace reduce
 namespace detail
 {
 
-template <typename t_MultiReduceData>
+template<typename t_MultiReduceData>
 struct BaseMultiReduce
 {
   using MultiReduceData = t_MultiReduceData;
-  using MultiReduceOp = typename t_MultiReduceData::MultiReduceOp;
-  using value_type = typename t_MultiReduceData::value_type;
+  using MultiReduceOp   = typename t_MultiReduceData::MultiReduceOp;
+  using value_type      = typename t_MultiReduceData::value_type;
 
-  BaseMultiReduce() : BaseMultiReduce{RepeatView<value_type>(MultiReduceOp::identity(), 0)} {}
+  BaseMultiReduce()
+      : BaseMultiReduce {RepeatView<value_type>(MultiReduceOp::identity(), 0)}
+  {}
 
   explicit BaseMultiReduce(size_t num_bins,
                            value_type init_val = MultiReduceOp::identity(),
                            value_type identity = MultiReduceOp::identity())
-      : BaseMultiReduce{RepeatView<value_type>(init_val, num_bins), identity}
-  { }
-
-  template < typename Container,
-             concepts::enable_if_t<type_traits::is_range<Container>,
-                                   concepts::negate<std::is_convertible<Container, size_t>>,
-                                   concepts::negate<std::is_base_of<BaseMultiReduce, Container>>>* = nullptr >
+      : BaseMultiReduce {RepeatView<value_type>(init_val, num_bins), identity}
+  {}
+
+  template<typename Container,
+           concepts::enable_if_t<
+               type_traits::is_range<Container>,
+               concepts::negate<std::is_convertible<Container, size_t>>,
+               concepts::negate<std::is_base_of<BaseMultiReduce, Container>>>* =
+               nullptr>
   explicit BaseMultiReduce(Container const& container,
                            value_type identity = MultiReduceOp::identity())
-      : data{container, identity}
-  { }
+      : data {container, identity}
+  {}
 
   RAJA_SUPPRESS_HD_WARN
   BaseMultiReduce(BaseMultiReduce const&) = default;
   RAJA_SUPPRESS_HD_WARN
-  BaseMultiReduce(BaseMultiReduce &&) = default;
-  BaseMultiReduce &operator=(BaseMultiReduce const&) = delete;
-  BaseMultiReduce &operator=(BaseMultiReduce &&) = delete;
+  BaseMultiReduce(BaseMultiReduce&&)                 = default;
+  BaseMultiReduce& operator=(BaseMultiReduce const&) = delete;
+  BaseMultiReduce& operator=(BaseMultiReduce&&)      = delete;
   RAJA_SUPPRESS_HD_WARN
   ~BaseMultiReduce() = default;
 
@@ -108,24 +109,27 @@ struct BaseMultiReduce
     reset(RepeatView<value_type>(init_val, num_bins), identity);
   }
 
-  template < typename Container,
-             concepts::enable_if_t<type_traits::is_range<Container>>* = nullptr >
+  template<typename Container,
+           concepts::enable_if_t<type_traits::is_range<Container>>* = nullptr>
   void reset(Container const& container,
              value_type identity = MultiReduceOp::identity())
   {
-    for (size_t bin = 0; bin < data.num_bins(); ++bin) {
-      RAJA_UNUSED_VAR(get(bin)); // automatic get() before reset
+    for (size_t bin = 0; bin < data.num_bins(); ++bin)
+    {
+      RAJA_UNUSED_VAR(get(bin));  // automatic get() before reset
     }
     data.reset(container, identity);
   }
 
   RAJA_SUPPRESS_HD_WARN
+
   RAJA_HOST_DEVICE
   size_t size() const { return data.num_bins(); }
 
   RAJA_SUPPRESS_HD_WARN
+
   RAJA_HOST_DEVICE
-  BaseMultiReduce const& combine(size_t bin, value_type const &other) const
+  BaseMultiReduce const& combine(size_t bin, value_type const& other) const
   {
     data.combine(bin, other);
     return *this;
@@ -135,16 +139,19 @@ struct BaseMultiReduce
   value_type get(size_t bin) const { return data.get(bin); }
 
   //! Get the calculated reduced value for each bin and store it in container
-  template < typename Container,
-             concepts::enable_if_t<type_traits::is_range<Container>>* = nullptr >
+  template<typename Container,
+           concepts::enable_if_t<type_traits::is_range<Container>>* = nullptr>
   void get_all(Container& container) const
   {
     RAJA_EXTRACT_BED_IT(container);
-    if (size_t(distance_it) != data.num_bins()) {
-      RAJA_ABORT_OR_THROW("MultiReduce::get_all container has different size than multi reducer");
+    if (size_t(distance_it) != data.num_bins())
+    {
+      RAJA_ABORT_OR_THROW("MultiReduce::get_all container has different size "
+                          "than multi reducer");
     }
     size_t bin = 0;
-    for (auto& val : container) {
+    for (auto& val : container)
+    {
       val = data.get(bin);
       ++bin;
     }
@@ -154,7 +161,6 @@ struct BaseMultiReduce
   MultiReduceData mutable data;
 };
 
-
 /*!
  ******************************************************************************
  *
@@ -162,22 +168,22 @@ struct BaseMultiReduce
  *
  ******************************************************************************
  */
-template <typename MultiReduceData>
+template<typename MultiReduceData>
 class BaseMultiReduceMin : public BaseMultiReduce<MultiReduceData>
 {
 public:
   using Base = BaseMultiReduce<MultiReduceData>;
-  using typename Base::value_type;
   using Base::Base;
+  using typename Base::value_type;
 
   RAJA_SUPPRESS_HD_WARN
   BaseMultiReduceMin(BaseMultiReduceMin const&) = default;
   RAJA_SUPPRESS_HD_WARN
-  BaseMultiReduceMin(BaseMultiReduceMin &&) = default;
+  BaseMultiReduceMin(BaseMultiReduceMin&&) = default;
   RAJA_SUPPRESS_HD_WARN
-  BaseMultiReduceMin &operator=(BaseMultiReduceMin const&) = delete;
+  BaseMultiReduceMin& operator=(BaseMultiReduceMin const&) = delete;
   RAJA_SUPPRESS_HD_WARN
-  BaseMultiReduceMin &operator=(BaseMultiReduceMin &&) = delete;
+  BaseMultiReduceMin& operator=(BaseMultiReduceMin&&) = delete;
   RAJA_SUPPRESS_HD_WARN
   ~BaseMultiReduceMin() = default;
 
@@ -185,8 +191,9 @@ class BaseMultiReduceMin : public BaseMultiReduce<MultiReduceData>
   {
     RAJA_HOST_DEVICE
     reference(BaseMultiReduceMin const& base, size_t bin)
-      : m_base(base), m_bin(bin)
-    { }
+        : m_base(base),
+          m_bin(bin)
+    {}
 
     //! reducer function; updates the current instance's state
     RAJA_HOST_DEVICE
@@ -196,10 +203,7 @@ class BaseMultiReduceMin : public BaseMultiReduce<MultiReduceData>
       return *this;
     }
 
-    value_type get() const
-    {
-      return m_base.get(m_bin);
-    }
+    value_type get() const { return m_base.get(m_bin); }
 
   private:
     BaseMultiReduceMin const& m_base;
@@ -214,7 +218,7 @@ class BaseMultiReduceMin : public BaseMultiReduce<MultiReduceData>
  *
  **************************************************************************
  */
-template <typename MultiReduceData>
+template<typename MultiReduceData>
 class BaseMultiReduceMax : public BaseMultiReduce<MultiReduceData>
 {
 public:
@@ -226,9 +230,9 @@ class BaseMultiReduceMax : public BaseMultiReduce<MultiReduceData>
   RAJA_SUPPRESS_HD_WARN
   BaseMultiReduceMax(BaseMultiReduceMax const&) = default;
   RAJA_SUPPRESS_HD_WARN
-  BaseMultiReduceMax(BaseMultiReduceMax &&) = default;
-  BaseMultiReduceMax &operator=(BaseMultiReduceMax const&) = delete;
-  BaseMultiReduceMax &operator=(BaseMultiReduceMax &&) = delete;
+  BaseMultiReduceMax(BaseMultiReduceMax&&)                 = default;
+  BaseMultiReduceMax& operator=(BaseMultiReduceMax const&) = delete;
+  BaseMultiReduceMax& operator=(BaseMultiReduceMax&&)      = delete;
   RAJA_SUPPRESS_HD_WARN
   ~BaseMultiReduceMax() = default;
 
@@ -236,8 +240,9 @@ class BaseMultiReduceMax : public BaseMultiReduce<MultiReduceData>
   {
     RAJA_HOST_DEVICE
     reference(BaseMultiReduceMax const& base, size_t bin)
-      : m_base(base), m_bin(bin)
-    { }
+        : m_base(base),
+          m_bin(bin)
+    {}
 
     //! reducer function; updates the current instance's state
     RAJA_HOST_DEVICE
@@ -247,10 +252,7 @@ class BaseMultiReduceMax : public BaseMultiReduce<MultiReduceData>
       return *this;
     }
 
-    value_type get() const
-    {
-      return m_base.get(m_bin);
-    }
+    value_type get() const { return m_base.get(m_bin); }
 
   private:
     BaseMultiReduceMax const& m_base;
@@ -265,7 +267,7 @@ class BaseMultiReduceMax : public BaseMultiReduce<MultiReduceData>
  *
  **************************************************************************
  */
-template <typename MultiReduceData>
+template<typename MultiReduceData>
 class BaseMultiReduceSum : public BaseMultiReduce<MultiReduceData>
 {
 public:
@@ -277,9 +279,9 @@ class BaseMultiReduceSum : public BaseMultiReduce<MultiReduceData>
   RAJA_SUPPRESS_HD_WARN
   BaseMultiReduceSum(BaseMultiReduceSum const&) = default;
   RAJA_SUPPRESS_HD_WARN
-  BaseMultiReduceSum(BaseMultiReduceSum &&) = default;
-  BaseMultiReduceSum &operator=(BaseMultiReduceSum const&) = delete;
-  BaseMultiReduceSum &operator=(BaseMultiReduceSum &&) = delete;
+  BaseMultiReduceSum(BaseMultiReduceSum&&)                 = default;
+  BaseMultiReduceSum& operator=(BaseMultiReduceSum const&) = delete;
+  BaseMultiReduceSum& operator=(BaseMultiReduceSum&&)      = delete;
   RAJA_SUPPRESS_HD_WARN
   ~BaseMultiReduceSum() = default;
 
@@ -287,8 +289,9 @@ class BaseMultiReduceSum : public BaseMultiReduce<MultiReduceData>
   {
     RAJA_HOST_DEVICE
     reference(BaseMultiReduceSum const& base, size_t bin)
-      : m_base(base), m_bin(bin)
-    { }
+        : m_base(base),
+          m_bin(bin)
+    {}
 
     //! reducer function; updates the current instance's state
     RAJA_HOST_DEVICE
@@ -298,10 +301,7 @@ class BaseMultiReduceSum : public BaseMultiReduce<MultiReduceData>
       return *this;
     }
 
-    value_type get() const
-    {
-      return m_base.get(m_bin);
-    }
+    value_type get() const { return m_base.get(m_bin); }
 
   private:
     BaseMultiReduceSum const& m_base;
@@ -316,7 +316,7 @@ class BaseMultiReduceSum : public BaseMultiReduce<MultiReduceData>
  *
  **************************************************************************
  */
-template <typename MultiReduceData>
+template<typename MultiReduceData>
 class BaseMultiReduceBitOr : public BaseMultiReduce<MultiReduceData>
 {
 public:
@@ -328,9 +328,9 @@ class BaseMultiReduceBitOr : public BaseMultiReduce<MultiReduceData>
   RAJA_SUPPRESS_HD_WARN
   BaseMultiReduceBitOr(BaseMultiReduceBitOr const&) = default;
   RAJA_SUPPRESS_HD_WARN
-  BaseMultiReduceBitOr(BaseMultiReduceBitOr &&) = default;
-  BaseMultiReduceBitOr &operator=(BaseMultiReduceBitOr const&) = delete;
-  BaseMultiReduceBitOr &operator=(BaseMultiReduceBitOr &&) = delete;
+  BaseMultiReduceBitOr(BaseMultiReduceBitOr&&)                 = default;
+  BaseMultiReduceBitOr& operator=(BaseMultiReduceBitOr const&) = delete;
+  BaseMultiReduceBitOr& operator=(BaseMultiReduceBitOr&&)      = delete;
   RAJA_SUPPRESS_HD_WARN
   ~BaseMultiReduceBitOr() = default;
 
@@ -338,8 +338,9 @@ class BaseMultiReduceBitOr : public BaseMultiReduce<MultiReduceData>
   {
     RAJA_HOST_DEVICE
     reference(BaseMultiReduceBitOr const& base, size_t bin)
-      : m_base(base), m_bin(bin)
-    { }
+        : m_base(base),
+          m_bin(bin)
+    {}
 
     //! reducer function; updates the current instance's state
     RAJA_HOST_DEVICE
@@ -349,10 +350,7 @@ class BaseMultiReduceBitOr : public BaseMultiReduce<MultiReduceData>
       return *this;
     }
 
-    value_type get() const
-    {
-      return m_base.get(m_bin);
-    }
+    value_type get() const { return m_base.get(m_bin); }
 
   private:
     BaseMultiReduceBitOr const& m_base;
@@ -367,7 +365,7 @@ class BaseMultiReduceBitOr : public BaseMultiReduce<MultiReduceData>
  *
  **************************************************************************
  */
-template <typename MultiReduceData>
+template<typename MultiReduceData>
 class BaseMultiReduceBitAnd : public BaseMultiReduce<MultiReduceData>
 {
 public:
@@ -379,9 +377,9 @@ class BaseMultiReduceBitAnd : public BaseMultiReduce<MultiReduceData>
   RAJA_SUPPRESS_HD_WARN
   BaseMultiReduceBitAnd(BaseMultiReduceBitAnd const&) = default;
   RAJA_SUPPRESS_HD_WARN
-  BaseMultiReduceBitAnd(BaseMultiReduceBitAnd &&) = default;
-  BaseMultiReduceBitAnd &operator=(BaseMultiReduceBitAnd const&) = delete;
-  BaseMultiReduceBitAnd &operator=(BaseMultiReduceBitAnd &&) = delete;
+  BaseMultiReduceBitAnd(BaseMultiReduceBitAnd&&)                 = default;
+  BaseMultiReduceBitAnd& operator=(BaseMultiReduceBitAnd const&) = delete;
+  BaseMultiReduceBitAnd& operator=(BaseMultiReduceBitAnd&&)      = delete;
   RAJA_SUPPRESS_HD_WARN
   ~BaseMultiReduceBitAnd() = default;
 
@@ -389,8 +387,9 @@ class BaseMultiReduceBitAnd : public BaseMultiReduce<MultiReduceData>
   {
     RAJA_HOST_DEVICE
     reference(BaseMultiReduceBitAnd const& base, size_t bin)
-      : m_base(base), m_bin(bin)
-    { }
+        : m_base(base),
+          m_bin(bin)
+    {}
 
     //! reducer function; updates the current instance's state
     RAJA_HOST_DEVICE
@@ -400,10 +399,7 @@ class BaseMultiReduceBitAnd : public BaseMultiReduce<MultiReduceData>
       return *this;
     }
 
-    value_type get() const
-    {
-      return m_base.get(m_bin);
-    }
+    value_type get() const { return m_base.get(m_bin); }
 
   private:
     BaseMultiReduceBitAnd const& m_base;
diff --git a/include/RAJA/pattern/detail/privatizer.hpp b/include/RAJA/pattern/detail/privatizer.hpp
index 3579027cd3..4d828e07f9 100644
--- a/include/RAJA/pattern/detail/privatizer.hpp
+++ b/include/RAJA/pattern/detail/privatizer.hpp
@@ -24,30 +24,30 @@ namespace internal
 // };
 // DefineTypeTraitFromConcept(has_privatizer, HasPrivatizer);
 
-template <typename T>
+template<typename T>
 class has_privatizer
 {
 private:
-  template <typename C>
+  template<typename C>
   static auto Test(void*)
-      -> decltype(camp::val<typename C::privatizer>(), camp::true_type{});
+      -> decltype(camp::val<typename C::privatizer>(), camp::true_type {});
 
-  template <typename>
+  template<typename>
   static camp::false_type Test(...);
 
 public:
   static bool const value = decltype(Test<T>(0))::value;
 };
 
-
 static_assert(!has_privatizer<int>::value, "if this fires, abandon all hope");
 
-struct GenericWrapperBase {
-};
+struct GenericWrapperBase
+{};
 
-template <typename T>
-struct Privatizer {
-  using value_type = camp::decay<T>;
+template<typename T>
+struct Privatizer
+{
+  using value_type     = camp::decay<T>;
   using reference_type = value_type&;
   value_type priv;
   static_assert(!has_privatizer<T>::value,
@@ -58,7 +58,7 @@ struct Privatizer {
                 "a bug");
 
   RAJA_SUPPRESS_HD_WARN
-  RAJA_HOST_DEVICE Privatizer(const T& o) : priv{o} {}
+  RAJA_HOST_DEVICE Privatizer(const T& o) : priv {o} {}
 
   RAJA_SUPPRESS_HD_WARN
   RAJA_HOST_DEVICE reference_type get_priv() { return priv; }
@@ -81,19 +81,19 @@ struct Privatizer {
  * that does not belong here.
  *
  */
-template <typename T,
-          typename std::enable_if<!has_privatizer<T>::value>::type* = nullptr>
+template<typename T,
+         typename std::enable_if<!has_privatizer<T>::value>::type* = nullptr>
 RAJA_HOST_DEVICE auto thread_privatize(const T& item) -> Privatizer<T>
 {
-  return Privatizer<T>{item};
+  return Privatizer<T> {item};
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T,
-          typename std::enable_if<has_privatizer<T>::value>::type* = nullptr>
+template<typename T,
+         typename std::enable_if<has_privatizer<T>::value>::type* = nullptr>
 RAJA_HOST_DEVICE auto thread_privatize(const T& item) -> typename T::privatizer
 {
-  return typename T::privatizer{item};
+  return typename T::privatizer {item};
 }
 
 }  // namespace internal
diff --git a/include/RAJA/pattern/detail/reduce.hpp b/include/RAJA/pattern/detail/reduce.hpp
index 788f3c698d..fb49658c2a 100644
--- a/include/RAJA/pattern/detail/reduce.hpp
+++ b/include/RAJA/pattern/detail/reduce.hpp
@@ -21,33 +21,33 @@
 #include "RAJA/util/Operators.hpp"
 #include "RAJA/util/types.hpp"
 
-#define RAJA_DECLARE_REDUCER(OP, POL, COMBINER)               \
-  template <typename T>                                       \
-  class Reduce##OP<POL, T>                                    \
-      : public reduce::detail::BaseReduce##OP<T, COMBINER>    \
-  {                                                           \
-  public:                                                     \
-    using Base = reduce::detail::BaseReduce##OP<T, COMBINER>; \
-    using Base::Base;                                         \
+#define RAJA_DECLARE_REDUCER(OP, POL, COMBINER)                                \
+  template<typename T>                                                         \
+  class Reduce##OP<POL, T>                                                     \
+      : public reduce::detail::BaseReduce##OP<T, COMBINER>                     \
+  {                                                                            \
+  public:                                                                      \
+    using Base = reduce::detail::BaseReduce##OP<T, COMBINER>;                  \
+    using Base::Base;                                                          \
   };
 
-#define RAJA_DECLARE_INDEX_REDUCER(OP, POL, COMBINER)                    \
-  template <typename T, typename IndexType>                              \
-  class Reduce##OP<POL, T, IndexType>                                    \
-      : public reduce::detail::BaseReduce##OP<T, IndexType, COMBINER>    \
-  {                                                                      \
-  public:                                                                \
-    using Base = reduce::detail::BaseReduce##OP<T, IndexType, COMBINER>; \
-    using Base::Base;                                                    \
+#define RAJA_DECLARE_INDEX_REDUCER(OP, POL, COMBINER)                          \
+  template<typename T, typename IndexType>                                     \
+  class Reduce##OP<POL, T, IndexType>                                          \
+      : public reduce::detail::BaseReduce##OP<T, IndexType, COMBINER>          \
+  {                                                                            \
+  public:                                                                      \
+    using Base = reduce::detail::BaseReduce##OP<T, IndexType, COMBINER>;       \
+    using Base::Base;                                                          \
   };
 
-#define RAJA_DECLARE_ALL_REDUCERS(POL, COMBINER)       \
-  RAJA_DECLARE_REDUCER(Sum, POL, COMBINER)             \
-  RAJA_DECLARE_REDUCER(Min, POL, COMBINER)             \
-  RAJA_DECLARE_REDUCER(Max, POL, COMBINER)             \
-  RAJA_DECLARE_INDEX_REDUCER(MinLoc, POL, COMBINER)    \
-  RAJA_DECLARE_INDEX_REDUCER(MaxLoc, POL, COMBINER)    \
-  RAJA_DECLARE_REDUCER(BitOr, POL, COMBINER)           \
+#define RAJA_DECLARE_ALL_REDUCERS(POL, COMBINER)                               \
+  RAJA_DECLARE_REDUCER(Sum, POL, COMBINER)                                     \
+  RAJA_DECLARE_REDUCER(Min, POL, COMBINER)                                     \
+  RAJA_DECLARE_REDUCER(Max, POL, COMBINER)                                     \
+  RAJA_DECLARE_INDEX_REDUCER(MinLoc, POL, COMBINER)                            \
+  RAJA_DECLARE_INDEX_REDUCER(MaxLoc, POL, COMBINER)                            \
+  RAJA_DECLARE_REDUCER(BitOr, POL, COMBINER)                                   \
   RAJA_DECLARE_REDUCER(BitAnd, POL, COMBINER)
 
 namespace RAJA
@@ -63,40 +63,42 @@ namespace reduce
 namespace detail
 {
 
-template <typename T, template <typename...> class Op>
-struct op_adapter : private Op<T, T, T> {
+template<typename T, template<typename...> class Op>
+struct op_adapter : private Op<T, T, T>
+{
   using operator_type = Op<T, T, T>;
+
   RAJA_HOST_DEVICE static constexpr T identity()
   {
     return operator_type::identity();
   }
 
-  RAJA_HOST_DEVICE RAJA_INLINE void operator()(T &val, const T v) const
+  RAJA_HOST_DEVICE RAJA_INLINE void operator()(T& val, const T v) const
   {
     val = operator_type::operator()(val, v);
   }
 };
 }  // namespace detail
 
-template <typename T>
-struct sum : detail::op_adapter<T, RAJA::operators::plus> {
-};
+template<typename T>
+struct sum : detail::op_adapter<T, RAJA::operators::plus>
+{};
 
-template <typename T>
-struct min : detail::op_adapter<T, RAJA::operators::minimum> {
-};
+template<typename T>
+struct min : detail::op_adapter<T, RAJA::operators::minimum>
+{};
 
-template <typename T>
-struct max : detail::op_adapter<T, RAJA::operators::maximum> {
-};
+template<typename T>
+struct max : detail::op_adapter<T, RAJA::operators::maximum>
+{};
 
-template <typename T>
-struct or_bit : detail::op_adapter<T, RAJA::operators::bit_or> {
-};
+template<typename T>
+struct or_bit : detail::op_adapter<T, RAJA::operators::bit_or>
+{};
 
-template <typename T>
-struct and_bit : detail::op_adapter<T, RAJA::operators::bit_and> {
-};
+template<typename T>
+struct and_bit : detail::op_adapter<T, RAJA::operators::bit_and>
+{};
 
 
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
@@ -106,52 +108,71 @@ struct and_bit : detail::op_adapter<T, RAJA::operators::bit_and> {
 namespace detail
 {
 
-template <typename T, bool = std::is_integral<T>::value>
-struct DefaultLoc {};
+template<typename T, bool = std::is_integral<T>::value>
+struct DefaultLoc
+{};
 
-template <typename T>
+template<typename T>
 struct DefaultLoc<T, false>  // any non-integral type
 {
   RAJA_HOST_DEVICE constexpr T value() const { return T(); }
 };
 
-template <typename T>
+template<typename T>
 struct DefaultLoc<T, true>
 {
   RAJA_HOST_DEVICE constexpr T value() const { return -1; }
 };
 
-template <typename T, typename IndexType, bool doing_min = true>
+template<typename T, typename IndexType, bool doing_min = true>
 class ValueLoc
 {
 public:
   T val = doing_min ? operators::limits<T>::max() : operators::limits<T>::min();
   IndexType loc = DefaultLoc<IndexType>().value();
 
-#if __NVCC__ && defined(CUDART_VERSION) && CUDART_VERSION < 9020 || defined(__HIPCC__)
+#if __NVCC__ && defined(CUDART_VERSION) && CUDART_VERSION < 9020 ||            \
+    defined(__HIPCC__)
   RAJA_HOST_DEVICE constexpr ValueLoc() {}
-  RAJA_HOST_DEVICE constexpr ValueLoc(ValueLoc const &other) : val{other.val}, loc{other.loc} {}
+
+  RAJA_HOST_DEVICE constexpr ValueLoc(ValueLoc const& other)
+      : val {other.val},
+        loc {other.loc}
+  {}
+
   RAJA_HOST_DEVICE
-  ValueLoc &operator=(ValueLoc const &other) { val = other.val; loc = other.loc; return *this;}
+  ValueLoc& operator=(ValueLoc const& other)
+  {
+    val = other.val;
+    loc = other.loc;
+    return *this;
+  }
 #else
-  constexpr ValueLoc() = default;
-  constexpr ValueLoc(ValueLoc const &) = default;
-  ValueLoc &operator=(ValueLoc const &) = default;
+  constexpr ValueLoc()                 = default;
+  constexpr ValueLoc(ValueLoc const&)  = default;
+  ValueLoc& operator=(ValueLoc const&) = default;
 #endif
 
-  RAJA_HOST_DEVICE constexpr ValueLoc(T const &val_) : val{val_}, loc{DefaultLoc<IndexType>().value()} {}
-  RAJA_HOST_DEVICE constexpr ValueLoc(T const &val_, IndexType const &loc_)
-      : val{val_}, loc{loc_}
-  {
-  }
+  RAJA_HOST_DEVICE constexpr ValueLoc(T const& val_)
+      : val {val_},
+        loc {DefaultLoc<IndexType>().value()}
+  {}
+
+  RAJA_HOST_DEVICE constexpr ValueLoc(T const& val_, IndexType const& loc_)
+      : val {val_},
+        loc {loc_}
+  {}
 
   RAJA_HOST_DEVICE operator T() const { return val; }
+
   RAJA_HOST_DEVICE IndexType getLoc() { return loc; }
-  RAJA_HOST_DEVICE bool operator<(ValueLoc const &rhs) const
+
+  RAJA_HOST_DEVICE bool operator<(ValueLoc const& rhs) const
   {
     return val < rhs.val;
   }
-  RAJA_HOST_DEVICE bool operator>(ValueLoc const &rhs) const
+
+  RAJA_HOST_DEVICE bool operator>(ValueLoc const& rhs) const
   {
     return val > rhs.val;
   }
@@ -163,15 +184,19 @@ class ValueLoc
 
 namespace operators
 {
-template <typename T, typename IndexType, bool B>
-struct limits<::RAJA::reduce::detail::ValueLoc<T, IndexType, B>> {
-  RAJA_INLINE RAJA_HOST_DEVICE static constexpr
-  ::RAJA::reduce::detail::ValueLoc<T, IndexType, B> min()
+template<typename T, typename IndexType, bool B>
+struct limits<::RAJA::reduce::detail::ValueLoc<T, IndexType, B>>
+{
+  RAJA_INLINE RAJA_HOST_DEVICE static constexpr ::RAJA::reduce::detail::
+      ValueLoc<T, IndexType, B>
+      min()
   {
     return ::RAJA::reduce::detail::ValueLoc<T, IndexType, B>(limits<T>::min());
   }
-  RAJA_INLINE RAJA_HOST_DEVICE static constexpr
-  ::RAJA::reduce::detail::ValueLoc<T, IndexType, B> max()
+
+  RAJA_INLINE RAJA_HOST_DEVICE static constexpr ::RAJA::reduce::detail::
+      ValueLoc<T, IndexType, B>
+      max()
   {
     return ::RAJA::reduce::detail::ValueLoc<T, IndexType, B>(limits<T>::max());
   }
@@ -184,11 +209,11 @@ namespace reduce
 namespace detail
 {
 
-template <typename T,
-          template <typename>
-          class Reduce_,
-          template <typename, typename>
-          class Combiner_>
+template<typename T,
+         template<typename>
+         class Reduce_,
+         template<typename, typename>
+         class Combiner_>
 class BaseReduce
 {
   using Reduce = Reduce_<T>;
@@ -197,50 +222,55 @@ class BaseReduce
   Combiner_t mutable c;
 
 public:
-  using value_type = T;
+  using value_type  = T;
   using reduce_type = Reduce;
 
   RAJA_SUPPRESS_HD_WARN
+
   RAJA_HOST_DEVICE
-  BaseReduce() : c{T(), Reduce::identity()} {}
+  BaseReduce() : c {T(), Reduce::identity()} {}
 
   RAJA_SUPPRESS_HD_WARN
+
   RAJA_HOST_DEVICE
   BaseReduce(T init_val, T identity_ = Reduce::identity())
-      : c{init_val, identity_}
-  {
-  }
+      : c {init_val, identity_}
+  {}
 
   RAJA_SUPPRESS_HD_WARN
+
   RAJA_HOST_DEVICE
   void reset(T val, T identity_ = Reduce::identity())
   {
-    operator T(); // automatic get() before reset
+    operator T();  // automatic get() before reset
     c.reset(val, identity_);
   }
 
   //! prohibit compiler-generated copy assignment
-  BaseReduce &operator=(const BaseReduce &) = delete;
+  BaseReduce& operator=(const BaseReduce&) = delete;
 
   //! compiler-generated copy constructor
   RAJA_SUPPRESS_HD_WARN
+
   RAJA_HOST_DEVICE
-  BaseReduce(const BaseReduce &copy) : c(copy.c) {}
+  BaseReduce(const BaseReduce& copy) : c(copy.c) {}
 
   //! compiler-generated move constructor
   RAJA_SUPPRESS_HD_WARN
   RAJA_HOST_DEVICE
+
   RAJA_INLINE
-  BaseReduce(BaseReduce &&copy) : c(std::move(copy.c)) {}
+  BaseReduce(BaseReduce&& copy) : c(std::move(copy.c)) {}
 
   //! compiler-generated move assignment
-  BaseReduce &operator=(BaseReduce &&) = default;
+  BaseReduce& operator=(BaseReduce&&) = default;
 
   RAJA_SUPPRESS_HD_WARN
+
   RAJA_HOST_DEVICE
-  void combine(T const &other) const { c.combine(other); }
+  void combine(T const& other) const { c.combine(other); }
 
-  T &local() const { return c.local(); }
+  T& local() const { return c.local(); }
 
   //! Get the calculated reduced value
   operator T() const { return c.get(); }
@@ -249,55 +279,61 @@ class BaseReduce
   T get() const { return c.get(); }
 };
 
-template <typename T, typename Reduce, typename Derived>
+template<typename T, typename Reduce, typename Derived>
 class BaseCombinable
 {
 protected:
-  BaseCombinable const *parent = nullptr;
+  BaseCombinable const* parent = nullptr;
   T identity;
   T mutable my_data;
 
 public:
   RAJA_SUPPRESS_HD_WARN
+
   RAJA_HOST_DEVICE
-  constexpr BaseCombinable() : identity{T()}, my_data{T()} {}
+  constexpr BaseCombinable() : identity {T()}, my_data {T()} {}
 
   RAJA_SUPPRESS_HD_WARN
+
   RAJA_HOST_DEVICE
   constexpr BaseCombinable(T init_val, T identity_ = T())
-      : identity{identity_}, my_data{init_val}
-  {
-  }
+      : identity {identity_},
+        my_data {init_val}
+  {}
 
   RAJA_SUPPRESS_HD_WARN
+
   RAJA_HOST_DEVICE
   void reset(T init_val, T identity_)
   {
-    my_data = init_val;
+    my_data  = init_val;
     identity = identity_;
   }
 
   RAJA_SUPPRESS_HD_WARN
+
   RAJA_HOST_DEVICE
-  constexpr BaseCombinable(BaseCombinable const &other)
-      : parent{other.parent ? other.parent : &other},
-        identity{other.identity},
-        my_data{identity}
-  {
-  }
+  constexpr BaseCombinable(BaseCombinable const& other)
+      : parent {other.parent ? other.parent : &other},
+        identity {other.identity},
+        my_data {identity}
+  {}
 
   RAJA_SUPPRESS_HD_WARN
+
   RAJA_HOST_DEVICE
   ~BaseCombinable()
   {
-    if (parent && my_data != identity) {
+    if (parent && my_data != identity)
+    {
       Reduce()(parent->my_data, my_data);
     }
   }
 
   RAJA_SUPPRESS_HD_WARN
+
   RAJA_HOST_DEVICE
-  void combine(T const &other) { Reduce{}(my_data, other); }
+  void combine(T const& other) { Reduce {}(my_data, other); }
 
   /*!
    *  \return the calculated reduced value
@@ -307,17 +343,18 @@ class BaseCombinable
   /*!
    *  \return reference to the local value
    */
-  T &local() const { return my_data; }
+  T& local() const { return my_data; }
 
   T get_combined() const { return my_data; }
 
 private:
   // Convenience method for CRTP
-  const Derived &derived() const
+  const Derived& derived() const
   {
-    return *(static_cast<const Derived *>(this));
+    return *(static_cast<const Derived*>(this));
   }
-  Derived &derived() { return *(static_cast<Derived *>(this)); }
+
+  Derived& derived() { return *(static_cast<Derived*>(this)); }
 };
 
 /*!
@@ -327,7 +364,7 @@ class BaseCombinable
  *
  ******************************************************************************
  */
-template <typename T, template <typename, typename> class Combiner>
+template<typename T, template<typename, typename> class Combiner>
 class BaseReduceMin : public BaseReduce<T, RAJA::reduce::min, Combiner>
 {
 public:
@@ -336,7 +373,7 @@ class BaseReduceMin : public BaseReduce<T, RAJA::reduce::min, Combiner>
 
   //! reducer function; updates the current instance's state
   RAJA_HOST_DEVICE
-  const BaseReduceMin &min(T rhs) const
+  const BaseReduceMin& min(T rhs) const
   {
     this->combine(rhs);
     return *this;
@@ -350,36 +387,43 @@ class BaseReduceMin : public BaseReduce<T, RAJA::reduce::min, Combiner>
  *
  **************************************************************************
  */
-template <typename T, typename IndexType, template <typename, typename> class Combiner>
+template<typename T,
+         typename IndexType,
+         template<typename, typename>
+         class Combiner>
 class BaseReduceMinLoc
     : public BaseReduce<ValueLoc<T, IndexType>, RAJA::reduce::min, Combiner>
 {
 public:
   using Base = BaseReduce<ValueLoc<T, IndexType>, RAJA::reduce::min, Combiner>;
-  using value_type = typename Base::value_type;
+  using value_type  = typename Base::value_type;
   using reduce_type = typename Base::reduce_type;
   using Base::Base;
 
   constexpr BaseReduceMinLoc() : Base(value_type(T(), IndexType())) {}
 
-  constexpr BaseReduceMinLoc(T init_val, IndexType init_idx,
-                             T identity_val_ = reduce_type::identity(),
-                             IndexType identity_loc_ = DefaultLoc<IndexType>().value())
-    : Base(value_type(init_val, init_idx), value_type(identity_val_, identity_loc_))
-  {
-  }
-
-  void reset(T init_val, IndexType init_idx,
-             T identity_val_ = reduce_type::identity(),
+  constexpr BaseReduceMinLoc(
+      T init_val,
+      IndexType init_idx,
+      T identity_val_         = reduce_type::identity(),
+      IndexType identity_loc_ = DefaultLoc<IndexType>().value())
+      : Base(value_type(init_val, init_idx),
+             value_type(identity_val_, identity_loc_))
+  {}
+
+  void reset(T init_val,
+             IndexType init_idx,
+             T identity_val_         = reduce_type::identity(),
              IndexType identity_loc_ = DefaultLoc<IndexType>().value())
   {
-    operator T(); // automatic get() before reset
-    Base::reset(value_type(init_val, init_idx), value_type(identity_val_, identity_loc_));
+    operator T();  // automatic get() before reset
+    Base::reset(value_type(init_val, init_idx),
+                value_type(identity_val_, identity_loc_));
   }
 
   /// \brief reducer function; updates the current instance's state
   RAJA_HOST_DEVICE
-  const BaseReduceMinLoc &minloc(T rhs, IndexType loc) const
+  const BaseReduceMinLoc& minloc(T rhs, IndexType loc) const
   {
     this->combine(value_type(rhs, loc));
     return *this;
@@ -399,7 +443,7 @@ class BaseReduceMinLoc
  *
  **************************************************************************
  */
-template <typename T, template <typename, typename> class Combiner>
+template<typename T, template<typename, typename> class Combiner>
 class BaseReduceMax : public BaseReduce<T, RAJA::reduce::max, Combiner>
 {
 public:
@@ -408,7 +452,7 @@ class BaseReduceMax : public BaseReduce<T, RAJA::reduce::max, Combiner>
 
   //! reducer function; updates the current instance's state
   RAJA_HOST_DEVICE
-  const BaseReduceMax &max(T rhs) const
+  const BaseReduceMax& max(T rhs) const
   {
     this->combine(rhs);
     return *this;
@@ -422,7 +466,7 @@ class BaseReduceMax : public BaseReduce<T, RAJA::reduce::max, Combiner>
  *
  **************************************************************************
  */
-template <typename T, template <typename, typename> class Combiner>
+template<typename T, template<typename, typename> class Combiner>
 class BaseReduceSum : public BaseReduce<T, RAJA::reduce::sum, Combiner>
 {
 public:
@@ -431,8 +475,9 @@ class BaseReduceSum : public BaseReduce<T, RAJA::reduce::sum, Combiner>
 
   //! reducer function; updates the current instance's state
   RAJA_SUPPRESS_HD_WARN
+
   RAJA_HOST_DEVICE
-  const BaseReduceSum &operator+=(T rhs) const
+  const BaseReduceSum& operator+=(T rhs) const
   {
     this->combine(rhs);
     return *this;
@@ -446,7 +491,7 @@ class BaseReduceSum : public BaseReduce<T, RAJA::reduce::sum, Combiner>
  *
  **************************************************************************
  */
-template <typename T, template <typename, typename> class Combiner>
+template<typename T, template<typename, typename> class Combiner>
 class BaseReduceBitOr : public BaseReduce<T, RAJA::reduce::or_bit, Combiner>
 {
 public:
@@ -455,8 +500,9 @@ class BaseReduceBitOr : public BaseReduce<T, RAJA::reduce::or_bit, Combiner>
 
   //! reducer function; updates the current instance's state
   RAJA_SUPPRESS_HD_WARN
+
   RAJA_HOST_DEVICE
-  const BaseReduceBitOr &operator|=(T rhs) const
+  const BaseReduceBitOr& operator|=(T rhs) const
   {
     this->combine(rhs);
     return *this;
@@ -470,7 +516,7 @@ class BaseReduceBitOr : public BaseReduce<T, RAJA::reduce::or_bit, Combiner>
  *
  **************************************************************************
  */
-template <typename T, template <typename, typename> class Combiner>
+template<typename T, template<typename, typename> class Combiner>
 class BaseReduceBitAnd : public BaseReduce<T, RAJA::reduce::and_bit, Combiner>
 {
 public:
@@ -479,15 +525,15 @@ class BaseReduceBitAnd : public BaseReduce<T, RAJA::reduce::and_bit, Combiner>
 
   //! reducer function; updates the current instance's state
   RAJA_SUPPRESS_HD_WARN
+
   RAJA_HOST_DEVICE
-  const BaseReduceBitAnd &operator&=(T rhs) const
+  const BaseReduceBitAnd& operator&=(T rhs) const
   {
     this->combine(rhs);
     return *this;
   }
 };
 
-
 /*!
  **************************************************************************
  *
@@ -495,36 +541,45 @@ class BaseReduceBitAnd : public BaseReduce<T, RAJA::reduce::and_bit, Combiner>
  *
  **************************************************************************
  */
-template <typename T, typename IndexType, template <typename, typename> class Combiner>
-class BaseReduceMaxLoc
-    : public BaseReduce<ValueLoc<T, IndexType, false>, RAJA::reduce::max, Combiner>
+template<typename T,
+         typename IndexType,
+         template<typename, typename>
+         class Combiner>
+class BaseReduceMaxLoc : public BaseReduce<ValueLoc<T, IndexType, false>,
+                                           RAJA::reduce::max,
+                                           Combiner>
 {
 public:
-  using Base = BaseReduce<ValueLoc<T, IndexType, false>, RAJA::reduce::max, Combiner>;
-  using value_type = typename Base::value_type;
+  using Base =
+      BaseReduce<ValueLoc<T, IndexType, false>, RAJA::reduce::max, Combiner>;
+  using value_type  = typename Base::value_type;
   using reduce_type = typename Base::reduce_type;
   using Base::Base;
 
   constexpr BaseReduceMaxLoc() : Base(value_type(T(), IndexType())) {}
 
-  constexpr BaseReduceMaxLoc(T init_val, IndexType init_idx,
-                             T identity_val_ = reduce_type::identity(),
-                             IndexType identity_loc_ = DefaultLoc<IndexType>().value())
-    : Base(value_type(init_val, init_idx), value_type(identity_val_, identity_loc_))
-  {
-  }
-
-  void reset(T init_val, IndexType init_idx,
-             T identity_val_ = reduce_type::identity(),
+  constexpr BaseReduceMaxLoc(
+      T init_val,
+      IndexType init_idx,
+      T identity_val_         = reduce_type::identity(),
+      IndexType identity_loc_ = DefaultLoc<IndexType>().value())
+      : Base(value_type(init_val, init_idx),
+             value_type(identity_val_, identity_loc_))
+  {}
+
+  void reset(T init_val,
+             IndexType init_idx,
+             T identity_val_         = reduce_type::identity(),
              IndexType identity_loc_ = DefaultLoc<IndexType>().value())
   {
-    operator T(); // automatic get() before reset
-    Base::reset(value_type(init_val, init_idx), value_type(identity_val_, identity_loc_));
+    operator T();  // automatic get() before reset
+    Base::reset(value_type(init_val, init_idx),
+                value_type(identity_val_, identity_loc_));
   }
 
   //! reducer function; updates the current instance's state
   RAJA_HOST_DEVICE
-  const BaseReduceMaxLoc &maxloc(T rhs, IndexType loc) const
+  const BaseReduceMaxLoc& maxloc(T rhs, IndexType loc) const
   {
     this->combine(value_type(rhs, loc));
     return *this;
diff --git a/include/RAJA/pattern/forall.hpp b/include/RAJA/pattern/forall.hpp
index 686f0e8c6b..9ca1046f93 100644
--- a/include/RAJA/pattern/forall.hpp
+++ b/include/RAJA/pattern/forall.hpp
@@ -97,38 +97,59 @@ namespace RAJA
 namespace detail
 {
 /// Adapter to replace specific implementations for the icount variants
-template <typename Range, typename Body, typename IndexT>
-struct icount_adapter {
+template<typename Range, typename Body, typename IndexT>
+struct icount_adapter
+{
   using index_type = typename std::decay<IndexT>::type;
   typename std::decay<Body>::type body;
   using container_type = typename std::decay<Range>::type;
   typename container_type::iterator begin_it;
   Index_type icount;
+
   icount_adapter(Range const& r, Body const& b, IndexT icount_)
-      : body{b}, icount{icount_}
+      : body {b},
+        icount {icount_}
   {
     using std::begin;
     begin_it = begin(r);
   }
 
   RAJA_SUPPRESS_HD_WARN
-  template <typename T>
+  template<typename T>
   RAJA_HOST_DEVICE void operator()(T const& i) const
   {
     body(static_cast<index_type>(i + icount), begin_it[i]);
   }
 };
 
-struct CallForall {
-  template <typename T, typename ExecPol, typename Body, typename Res, typename ForallParams>
-  RAJA_INLINE camp::resources::EventProxy<Res> operator()(T const&, ExecPol, Body, Res, ForallParams) const;
+struct CallForall
+{
+  template<typename T,
+           typename ExecPol,
+           typename Body,
+           typename Res,
+           typename ForallParams>
+  RAJA_INLINE camp::resources::EventProxy<Res> operator()(T const&,
+                                                          ExecPol,
+                                                          Body,
+                                                          Res,
+                                                          ForallParams) const;
 };
 
-struct CallForallIcount {
+struct CallForallIcount
+{
   constexpr CallForallIcount(int s);
 
-  template <typename T, typename ExecPol, typename Body, typename Res, typename ForallParams>
-  RAJA_INLINE camp::resources::EventProxy<Res> operator()(T const&, ExecPol, Body, Res, ForallParams) const;
+  template<typename T,
+           typename ExecPol,
+           typename Body,
+           typename Res,
+           typename ForallParams>
+  RAJA_INLINE camp::resources::EventProxy<Res> operator()(T const&,
+                                                          ExecPol,
+                                                          Body,
+                                                          Res,
+                                                          ForallParams) const;
 
   const int start;
 };
@@ -152,22 +173,31 @@ namespace wrap
  *
  ******************************************************************************
  */
-template <typename Res, typename ExecutionPolicy, typename Container, typename LoopBody, typename ForallParams>
+template<typename Res,
+         typename ExecutionPolicy,
+         typename Container,
+         typename LoopBody,
+         typename ForallParams>
 RAJA_INLINE concepts::enable_if_t<
     RAJA::resources::EventProxy<Res>,
     concepts::negate<type_traits::is_indexset_policy<ExecutionPolicy>>,
     type_traits::is_range<Container>>
-forall(Res r, ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body, ForallParams&& f_params)
+forall(Res r,
+       ExecutionPolicy&& p,
+       Container&& c,
+       LoopBody&& loop_body,
+       ForallParams&& f_params)
 {
   RAJA_FORCEINLINE_RECURSIVE
-  return forall_impl(r,
-                     std::forward<ExecutionPolicy>(p),
-                     std::forward<Container>(c),
-                     std::forward<LoopBody>(loop_body),
-                     std::forward<ForallParams>(f_params));
+  return forall_impl(
+      r, std::forward<ExecutionPolicy>(p), std::forward<Container>(c),
+      std::forward<LoopBody>(loop_body), std::forward<ForallParams>(f_params));
 }
 
-template <typename Res, typename ExecutionPolicy, typename Container, typename LoopBody>
+template<typename Res,
+         typename ExecutionPolicy,
+         typename Container,
+         typename LoopBody>
 RAJA_INLINE concepts::enable_if_t<
     RAJA::resources::EventProxy<Res>,
     concepts::negate<type_traits::is_indexset_policy<ExecutionPolicy>>,
@@ -175,14 +205,11 @@ RAJA_INLINE concepts::enable_if_t<
 forall(Res r, ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body)
 {
   RAJA_FORCEINLINE_RECURSIVE
-  return forall_impl(r,
-                     std::forward<ExecutionPolicy>(p),
-                     std::forward<Container>(c),
-                     std::forward<LoopBody>(loop_body),
-                     expt::get_empty_forall_param_pack());
+  return forall_impl(
+      r, std::forward<ExecutionPolicy>(p), std::forward<Container>(c),
+      std::forward<LoopBody>(loop_body), expt::get_empty_forall_param_pack());
 }
 
-
 /*!
  ******************************************************************************
  *
@@ -190,29 +217,29 @@ forall(Res r, ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body)
  *
  ******************************************************************************
  */
-template <typename Res,
-          typename ExecutionPolicy,
-          typename Container,
-          typename IndexType,
-          typename LoopBody,
-          typename ForallParams>
+template<typename Res,
+         typename ExecutionPolicy,
+         typename Container,
+         typename IndexType,
+         typename LoopBody,
+         typename ForallParams>
 RAJA_INLINE resources::EventProxy<Res> forall_Icount(Res r,
-                                                      ExecutionPolicy&& p,
-                                                      Container&& c,
-                                                      IndexType&& icount,
-                                                      LoopBody&& loop_body,
-                                                      ForallParams&& f_params)
+                                                     ExecutionPolicy&& p,
+                                                     Container&& c,
+                                                     IndexType&& icount,
+                                                     LoopBody&& loop_body,
+                                                     ForallParams&& f_params)
 {
   using std::begin;
   using std::distance;
   using std::end;
   auto range = RangeSegment(0, distance(begin(c), end(c)));
-  detail::icount_adapter<Container, LoopBody, IndexType> adapted(c,
-                                                                 loop_body,
+  detail::icount_adapter<Container, LoopBody, IndexType> adapted(c, loop_body,
                                                                  icount);
   using policy::sequential::forall_impl;
   RAJA_FORCEINLINE_RECURSIVE
-  return forall_impl(r, std::forward<ExecutionPolicy>(p), range, adapted, std::forward<ForallParams>(f_params));
+  return forall_impl(r, std::forward<ExecutionPolicy>(p), range, adapted,
+                     std::forward<ForallParams>(f_params));
 }
 
 /*!
@@ -224,62 +251,60 @@ RAJA_INLINE resources::EventProxy<Res> forall_Icount(Res r,
 *
 ******************************************************************************
 */
-template <typename Res,
-          typename SegmentIterPolicy,
-          typename SegmentExecPolicy,
-          typename... SegmentTypes,
-          typename LoopBody,
-          typename ForallParams>
-RAJA_INLINE resources::EventProxy<Res> forall_Icount(Res r,
-                                                ExecPolicy<SegmentIterPolicy,
-                                                SegmentExecPolicy>,
-                                                const TypedIndexSet<SegmentTypes...>& iset,
-                                                LoopBody loop_body,
-                                                ForallParams f_params)
+template<typename Res,
+         typename SegmentIterPolicy,
+         typename SegmentExecPolicy,
+         typename... SegmentTypes,
+         typename LoopBody,
+         typename ForallParams>
+RAJA_INLINE resources::EventProxy<Res> forall_Icount(
+    Res r,
+    ExecPolicy<SegmentIterPolicy, SegmentExecPolicy>,
+    const TypedIndexSet<SegmentTypes...>& iset,
+    LoopBody loop_body,
+    ForallParams f_params)
 {
   // no need for icount variant here
-  auto segIterRes = resources::get_resource<SegmentIterPolicy>::type::get_default();
+  auto segIterRes =
+      resources::get_resource<SegmentIterPolicy>::type::get_default();
   wrap::forall(segIterRes, SegmentIterPolicy(), iset, [=, &r](int segID) {
     iset.segmentCall(segID,
                      detail::CallForallIcount(iset.getStartingIcount(segID)),
-                     SegmentExecPolicy(),
-                     loop_body,
-                     r,
-                     f_params);
+                     SegmentExecPolicy(), loop_body, r, f_params);
   });
   return RAJA::resources::EventProxy<Res>(r);
 }
 
-template <typename Res,
-          typename SegmentIterPolicy,
-          typename SegmentExecPolicy,
-          typename LoopBody,
-          typename... SegmentTypes,
-          typename ForallParams>
-RAJA_INLINE resources::EventProxy<Res> forall(Res r,
-                                         ExecPolicy<SegmentIterPolicy,
-                                         SegmentExecPolicy>,
-                                         const TypedIndexSet<SegmentTypes...>& iset,
-                                         LoopBody loop_body,
-                                         ForallParams f_params)
-{
-  auto segIterRes = resources::get_resource<SegmentIterPolicy>::type::get_default();
+template<typename Res,
+         typename SegmentIterPolicy,
+         typename SegmentExecPolicy,
+         typename LoopBody,
+         typename... SegmentTypes,
+         typename ForallParams>
+RAJA_INLINE resources::EventProxy<Res> forall(
+    Res r,
+    ExecPolicy<SegmentIterPolicy, SegmentExecPolicy>,
+    const TypedIndexSet<SegmentTypes...>& iset,
+    LoopBody loop_body,
+    ForallParams f_params)
+{
+  auto segIterRes =
+      resources::get_resource<SegmentIterPolicy>::type::get_default();
   wrap::forall(segIterRes, SegmentIterPolicy(), iset, [=, &r](int segID) {
-    iset.segmentCall(segID, detail::CallForall{}, SegmentExecPolicy(), loop_body, r, f_params);
+    iset.segmentCall(segID, detail::CallForall {}, SegmentExecPolicy(),
+                     loop_body, r, f_params);
   });
   return RAJA::resources::EventProxy<Res>(r);
 }
 
 }  // end namespace wrap
 
-
-
 /*!
  ******************************************************************************
  *
- * \brief The RAJA::policy_by_value_interface forall functions provide an interface with
- *        value-based policies. It also enforces the interface and performs
- *        static checks as well as triggering plugins and loop body updates.
+ * \brief The RAJA::policy_by_value_interface forall functions provide an
+ * interface with value-based policies. It also enforces the interface and
+ * performs static checks as well as triggering plugins and loop body updates.
  *
  ******************************************************************************
  */
@@ -294,7 +319,10 @@ inline namespace policy_by_value_interface
  *
  ******************************************************************************
  */
-template <typename ExecutionPolicy, typename Res, typename IdxSet, typename... Params>
+template<typename ExecutionPolicy,
+         typename Res,
+         typename IdxSet,
+         typename... Params>
 RAJA_INLINE resources::EventProxy<Res> forall_Icount(ExecutionPolicy&& p,
                                                      Res r,
                                                      IdxSet&& c,
@@ -306,9 +334,10 @@ RAJA_INLINE resources::EventProxy<Res> forall_Icount(ExecutionPolicy&& p,
 
   auto f_params = expt::make_forall_param_pack(std::forward<Params>(params)...);
   auto&& loop_body = expt::get_lambda(std::forward<Params>(params)...);
-  //expt::check_forall_optional_args(loop_body, f_params);
+  // expt::check_forall_optional_args(loop_body, f_params);
 
-  util::PluginContext context{util::make_context<camp::decay<ExecutionPolicy>>()};
+  util::PluginContext context {
+      util::make_context<camp::decay<ExecutionPolicy>>()};
   util::callPreCapturePlugins(context);
 
   using RAJA::util::trigger_updates_before;
@@ -318,27 +347,25 @@ RAJA_INLINE resources::EventProxy<Res> forall_Icount(ExecutionPolicy&& p,
 
   util::callPreLaunchPlugins(context);
 
-  RAJA::resources::EventProxy<Res> e = wrap::forall_Icount(
-      r,
-      std::forward<ExecutionPolicy>(p),
-      std::forward<IdxSet>(c),
-      std::move(body),
-      f_params);
+  RAJA::resources::EventProxy<Res> e =
+      wrap::forall_Icount(r, std::forward<ExecutionPolicy>(p),
+                          std::forward<IdxSet>(c), std::move(body), f_params);
 
   util::callPostLaunchPlugins(context);
   return e;
 }
-template <typename ExecutionPolicy, typename IdxSet, typename LoopBody,
-          typename Res = typename resources::get_resource<ExecutionPolicy>::type >
+
+template<typename ExecutionPolicy,
+         typename IdxSet,
+         typename LoopBody,
+         typename Res = typename resources::get_resource<ExecutionPolicy>::type>
 RAJA_INLINE resources::EventProxy<Res> forall_Icount(ExecutionPolicy&& p,
                                                      IdxSet&& c,
                                                      LoopBody&& loop_body)
 {
   auto r = Res::get_default();
   return ::RAJA::policy_by_value_interface::forall_Icount(
-      std::forward<ExecutionPolicy>(p),
-      r,
-      std::forward<IdxSet>(c),
+      std::forward<ExecutionPolicy>(p), r, std::forward<IdxSet>(c),
       std::forward<LoopBody>(loop_body));
 }
 
@@ -349,7 +376,10 @@ RAJA_INLINE resources::EventProxy<Res> forall_Icount(ExecutionPolicy&& p,
  *
  ******************************************************************************
  */
-template <typename ExecutionPolicy, typename Res, typename IdxSet, typename... Params>
+template<typename ExecutionPolicy,
+         typename Res,
+         typename IdxSet,
+         typename... Params>
 RAJA_INLINE concepts::enable_if_t<
     resources::EventProxy<Res>,
     type_traits::is_indexset_policy<ExecutionPolicy>>
@@ -363,7 +393,8 @@ forall(ExecutionPolicy&& p, Res r, IdxSet&& c, Params&&... params)
   auto&& loop_body = expt::get_lambda(std::forward<Params>(params)...);
   expt::check_forall_optional_args(loop_body, f_params);
 
-  util::PluginContext context{util::make_context<camp::decay<ExecutionPolicy>>()};
+  util::PluginContext context {
+      util::make_context<camp::decay<ExecutionPolicy>>()};
   util::callPreCapturePlugins(context);
 
   using RAJA::util::trigger_updates_before;
@@ -373,18 +404,18 @@ forall(ExecutionPolicy&& p, Res r, IdxSet&& c, Params&&... params)
 
   util::callPreLaunchPlugins(context);
 
-  resources::EventProxy<Res> e = wrap::forall(
-      r,
-      std::forward<ExecutionPolicy>(p),
-      std::forward<IdxSet>(c),
-      std::move(body),
-      f_params);
+  resources::EventProxy<Res> e =
+      wrap::forall(r, std::forward<ExecutionPolicy>(p), std::forward<IdxSet>(c),
+                   std::move(body), f_params);
 
   util::callPostLaunchPlugins(context);
   return e;
 }
-template <typename ExecutionPolicy, typename IdxSet, typename LoopBody,
-          typename Res = typename resources::get_resource<ExecutionPolicy>::type >
+
+template<typename ExecutionPolicy,
+         typename IdxSet,
+         typename LoopBody,
+         typename Res = typename resources::get_resource<ExecutionPolicy>::type>
 RAJA_INLINE concepts::enable_if_t<
     resources::EventProxy<Res>,
     type_traits::is_indexset_policy<ExecutionPolicy>>
@@ -392,9 +423,7 @@ forall(ExecutionPolicy&& p, IdxSet&& c, LoopBody&& loop_body)
 {
   auto r = Res::get_default();
   return ::RAJA::policy_by_value_interface::forall(
-      std::forward<ExecutionPolicy>(p),
-      r,
-      std::forward<IdxSet>(c),
+      std::forward<ExecutionPolicy>(p), r, std::forward<IdxSet>(c),
       std::forward<LoopBody>(loop_body));
 }
 
@@ -405,12 +434,13 @@ forall(ExecutionPolicy&& p, IdxSet&& c, LoopBody&& loop_body)
  *
  ******************************************************************************
  */
-template <typename ExecutionPolicy, typename Container, typename LoopBody,
-          typename Res = typename resources::get_resource<ExecutionPolicy>::type >
-RAJA_INLINE concepts::enable_if_t<
-    resources::EventProxy<Res>,
-    type_traits::is_multi_policy<ExecutionPolicy>,
-    type_traits::is_range<Container>>
+template<typename ExecutionPolicy,
+         typename Container,
+         typename LoopBody,
+         typename Res = typename resources::get_resource<ExecutionPolicy>::type>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>,
+                                  type_traits::is_multi_policy<ExecutionPolicy>,
+                                  type_traits::is_range<Container>>
 forall(ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body)
 {
   static_assert(type_traits::is_random_access_range<Container>::value,
@@ -419,10 +449,9 @@ forall(ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body)
   auto r = Res::get_default();
 
   // plugins handled in multipolicy policy_invoker
-  return forall_impl(r,
-              std::forward<ExecutionPolicy>(p),
-              std::forward<Container>(c),
-              std::forward<LoopBody>(loop_body));
+  return forall_impl(r, std::forward<ExecutionPolicy>(p),
+                     std::forward<Container>(c),
+                     std::forward<LoopBody>(loop_body));
 }
 
 /*!
@@ -432,16 +461,15 @@ forall(ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body)
  *
  ******************************************************************************
  */
-template <typename ExecutionPolicy,
-          typename Res,
-          typename Container,
-          typename IndexType,
-          typename FirstParam,
-          typename... Params>
-RAJA_INLINE concepts::enable_if_t<
-    resources::EventProxy<Res>,
-    type_traits::is_range<Container>,
-    type_traits::is_integral<IndexType>>
+template<typename ExecutionPolicy,
+         typename Res,
+         typename Container,
+         typename IndexType,
+         typename FirstParam,
+         typename... Params>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>,
+                                  type_traits::is_range<Container>,
+                                  type_traits::is_integral<IndexType>>
 forall_Icount(ExecutionPolicy&& p,
               Res r,
               Container&& c,
@@ -452,11 +480,14 @@ forall_Icount(ExecutionPolicy&& p,
   static_assert(type_traits::is_random_access_range<Container>::value,
                 "Container does not model RandomAccessIterator");
 
-  auto f_params = expt::make_forall_param_pack(std::forward<FirstParam>(first), std::forward<Params>(params)...);
-  auto&& loop_body = expt::get_lambda(std::forward<FirstParam>(first), std::forward<Params>(params)...);
-  //expt::check_forall_optional_args(loop_body, f_params);
+  auto f_params = expt::make_forall_param_pack(std::forward<FirstParam>(first),
+                                               std::forward<Params>(params)...);
+  auto&& loop_body = expt::get_lambda(std::forward<FirstParam>(first),
+                                      std::forward<Params>(params)...);
+  // expt::check_forall_optional_args(loop_body, f_params);
 
-  util::PluginContext context{util::make_context<camp::decay<ExecutionPolicy>>()};
+  util::PluginContext context {
+      util::make_context<camp::decay<ExecutionPolicy>>()};
   util::callPreCapturePlugins(context);
 
   using RAJA::util::trigger_updates_before;
@@ -467,21 +498,18 @@ forall_Icount(ExecutionPolicy&& p,
   util::callPreLaunchPlugins(context);
 
   resources::EventProxy<Res> e = wrap::forall_Icount(
-      r,
-      std::forward<ExecutionPolicy>(p),
-      std::forward<Container>(c),
-      icount,
-      std::move(body),
-      f_params);
+      r, std::forward<ExecutionPolicy>(p), std::forward<Container>(c), icount,
+      std::move(body), f_params);
 
   util::callPostLaunchPlugins(context);
   return e;
 }
-template <typename ExecutionPolicy,
-          typename Container,
-          typename IndexType,
-          typename LoopBody,
-          typename Res = typename resources::get_resource<ExecutionPolicy>::type >
+
+template<typename ExecutionPolicy,
+         typename Container,
+         typename IndexType,
+         typename LoopBody,
+         typename Res = typename resources::get_resource<ExecutionPolicy>::type>
 RAJA_INLINE concepts::enable_if_t<
     resources::EventProxy<Res>,
     type_traits::is_range<Container>,
@@ -494,10 +522,7 @@ forall_Icount(ExecutionPolicy&& p,
 {
   auto r = Res::get_default();
   return ::RAJA::policy_by_value_interface::forall_Icount(
-      std::forward<ExecutionPolicy>(p),
-      r,
-      std::forward<Container>(c),
-      icount,
+      std::forward<ExecutionPolicy>(p), r, std::forward<Container>(c), icount,
       std::forward<LoopBody>(loop_body));
 }
 
@@ -509,7 +534,10 @@ forall_Icount(ExecutionPolicy&& p,
  ******************************************************************************
  */
 
-template <typename ExecutionPolicy, typename Res, typename Container, typename... Params>
+template<typename ExecutionPolicy,
+         typename Res,
+         typename Container,
+         typename... Params>
 RAJA_INLINE concepts::enable_if_t<
     resources::EventProxy<Res>,
     concepts::negate<type_traits::is_indexset_policy<ExecutionPolicy>>,
@@ -524,7 +552,8 @@ forall(ExecutionPolicy&& p, Res r, Container&& c, Params&&... params)
   auto&& loop_body = expt::get_lambda(std::forward<Params>(params)...);
   expt::check_forall_optional_args(loop_body, f_params);
 
-  util::PluginContext context{util::make_context<camp::decay<ExecutionPolicy>>()};
+  util::PluginContext context {
+      util::make_context<camp::decay<ExecutionPolicy>>()};
   util::callPreCapturePlugins(context);
 
   using RAJA::util::trigger_updates_before;
@@ -534,19 +563,18 @@ forall(ExecutionPolicy&& p, Res r, Container&& c, Params&&... params)
 
   util::callPreLaunchPlugins(context);
 
-  resources::EventProxy<Res> e =  wrap::forall(
-      r,
-      std::forward<ExecutionPolicy>(p),
-      std::forward<Container>(c),
-      std::move(body),
-      f_params);
+  resources::EventProxy<Res> e =
+      wrap::forall(r, std::forward<ExecutionPolicy>(p),
+                   std::forward<Container>(c), std::move(body), f_params);
 
   util::callPostLaunchPlugins(context);
   return e;
 }
 
-template <typename ExecutionPolicy, typename Container, typename LoopBody,
-          typename Res = typename resources::get_resource<ExecutionPolicy>::type >
+template<typename ExecutionPolicy,
+         typename Container,
+         typename LoopBody,
+         typename Res = typename resources::get_resource<ExecutionPolicy>::type>
 RAJA_INLINE concepts::enable_if_t<
     resources::EventProxy<Res>,
     concepts::negate<type_traits::is_indexset_policy<ExecutionPolicy>>,
@@ -556,34 +584,34 @@ forall(ExecutionPolicy&& p, Container&& c, LoopBody&& loop_body)
 {
   auto r = Res::get_default();
   return ::RAJA::policy_by_value_interface::forall(
-      std::forward<ExecutionPolicy>(p),
-      r,
-      std::forward<Container>(c),
+      std::forward<ExecutionPolicy>(p), r, std::forward<Container>(c),
       std::forward<LoopBody>(loop_body));
 }
 
-}  // end inline namespace policy_by_value_interface
-
+}  // namespace policy_by_value_interface
 
 /*!
  * \brief Conversion from template-based policy to value-based policy for forall
  *
  * this reduces implementation overhead and perfectly forwards all arguments
  */
-template <typename ExecutionPolicy, typename... Args,
-          typename Res = typename resources::get_resource<ExecutionPolicy>::type >
+template<typename ExecutionPolicy,
+         typename... Args,
+         typename Res = typename resources::get_resource<ExecutionPolicy>::type>
 RAJA_INLINE resources::EventProxy<Res> forall(Args&&... args)
 {
   Res r = Res::get_default();
-  return ::RAJA::policy_by_value_interface::forall(
-      ExecutionPolicy(), r, std::forward<Args>(args)...);
+  return ::RAJA::policy_by_value_interface::forall(ExecutionPolicy(), r,
+                                                   std::forward<Args>(args)...);
 }
-template <typename ExecutionPolicy, typename Res, typename... Args>
-RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>, type_traits::is_resource<Res>>
+
+template<typename ExecutionPolicy, typename Res, typename... Args>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>,
+                                  type_traits::is_resource<Res>>
 forall(Res r, Args&&... args)
 {
-  return ::RAJA::policy_by_value_interface::forall(
-      ExecutionPolicy(), r, std::forward<Args>(args)...);
+  return ::RAJA::policy_by_value_interface::forall(ExecutionPolicy(), r,
+                                                   std::forward<Args>(args)...);
 }
 
 /*!
@@ -592,16 +620,19 @@ forall(Res r, Args&&... args)
  *
  * this reduces implementation overhead and perfectly forwards all arguments
  */
-template <typename ExecutionPolicy, typename... Args,
-          typename Res = typename resources::get_resource<ExecutionPolicy>::type >
+template<typename ExecutionPolicy,
+         typename... Args,
+         typename Res = typename resources::get_resource<ExecutionPolicy>::type>
 RAJA_INLINE resources::EventProxy<Res> forall_Icount(Args&&... args)
 {
   Res r = Res::get_default();
   return ::RAJA::policy_by_value_interface::forall_Icount(
       ExecutionPolicy(), r, std::forward<Args>(args)...);
 }
-template <typename ExecutionPolicy, typename Res, typename... Args>
-RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>, type_traits::is_resource<Res>>
+
+template<typename ExecutionPolicy, typename Res, typename... Args>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>,
+                                  type_traits::is_resource<Res>>
 forall_Icount(Res r, Args&&... args)
 {
   return ::RAJA::policy_by_value_interface::forall_Icount(
@@ -611,12 +642,17 @@ forall_Icount(Res r, Args&&... args)
 namespace detail
 {
 
-template <typename T, typename ExecutionPolicy, typename LoopBody, typename Res, typename ForallParams>
-RAJA_INLINE camp::resources::EventProxy<Res> CallForall::operator()(T const& segment,
-                                                               ExecutionPolicy,
-                                                               LoopBody body,
-                                                               Res r,
-                                                               ForallParams f_params) const
+template<typename T,
+         typename ExecutionPolicy,
+         typename LoopBody,
+         typename Res,
+         typename ForallParams>
+RAJA_INLINE camp::resources::EventProxy<Res> CallForall::operator()(
+    T const& segment,
+    ExecutionPolicy,
+    LoopBody body,
+    Res r,
+    ForallParams f_params) const
 {
   // this is only called inside a region, use impl
   using policy::sequential::forall_impl;
@@ -626,15 +662,21 @@ RAJA_INLINE camp::resources::EventProxy<Res> CallForall::operator()(T const& seg
 
 constexpr CallForallIcount::CallForallIcount(int s) : start(s) {}
 
-template <typename T, typename ExecutionPolicy, typename LoopBody, typename Res, typename ForallParams>
-RAJA_INLINE camp::resources::EventProxy<Res> CallForallIcount::operator()(T const& segment,
-                                                                     ExecutionPolicy,
-                                                                     LoopBody body,
-                                                                     Res r,
-                                                                     ForallParams f_params) const
+template<typename T,
+         typename ExecutionPolicy,
+         typename LoopBody,
+         typename Res,
+         typename ForallParams>
+RAJA_INLINE camp::resources::EventProxy<Res> CallForallIcount::operator()(
+    T const& segment,
+    ExecutionPolicy,
+    LoopBody body,
+    Res r,
+    ForallParams f_params) const
 {
   // go through wrap to unwrap icount
-  return wrap::forall_Icount(r, ExecutionPolicy(), segment, start, body, f_params);
+  return wrap::forall_Icount(r, ExecutionPolicy(), segment, start, body,
+                             f_params);
 }
 
 }  // namespace detail
@@ -650,100 +692,112 @@ RAJA_INLINE camp::resources::EventProxy<Res> CallForallIcount::operator()(T cons
 namespace expt
 {
 
-  template<camp::idx_t IDX, typename POLICY_LIST>
-  struct dynamic_helper
+template<camp::idx_t IDX, typename POLICY_LIST>
+struct dynamic_helper
+{
+  template<typename SEGMENT, typename BODY>
+  static void invoke_forall(const int pol, SEGMENT const& seg, BODY const& body)
   {
-    template<typename SEGMENT, typename BODY>
-    static void invoke_forall(const int pol, SEGMENT const &seg, BODY const &body)
+    if (IDX == pol)
     {
-      if(IDX==pol){
-        using t_pol = typename camp::at<POLICY_LIST,camp::num<IDX>>::type;
-        RAJA::forall<t_pol>(seg, body);
-        return;
-      }
-      dynamic_helper<IDX-1, POLICY_LIST>::invoke_forall(pol, seg, body);
+      using t_pol = typename camp::at<POLICY_LIST, camp::num<IDX>>::type;
+      RAJA::forall<t_pol>(seg, body);
+      return;
     }
+    dynamic_helper<IDX - 1, POLICY_LIST>::invoke_forall(pol, seg, body);
+  }
 
-    template<typename SEGMENT, typename BODY>
-    static resources::EventProxy<resources::Resource>
-    invoke_forall(RAJA::resources::Resource r, const int pol, SEGMENT const &seg, BODY const &body)
-    {
-
-      using t_pol = typename camp::at<POLICY_LIST,camp::num<IDX>>::type;
-      using resource_type = typename resources::get_resource<t_pol>::type;
+  template<typename SEGMENT, typename BODY>
+  static resources::EventProxy<resources::Resource> invoke_forall(
+      RAJA::resources::Resource r,
+      const int pol,
+      SEGMENT const& seg,
+      BODY const& body)
+  {
 
-      if(IDX==pol){
-        RAJA::forall<t_pol>(r.get<resource_type>(), seg, body);
+    using t_pol         = typename camp::at<POLICY_LIST, camp::num<IDX>>::type;
+    using resource_type = typename resources::get_resource<t_pol>::type;
 
-        //Return a generic event proxy from r,
-        //because forall returns a typed event proxy
-        return {r};
-      }
+    if (IDX == pol)
+    {
+      RAJA::forall<t_pol>(r.get<resource_type>(), seg, body);
 
-      return dynamic_helper<IDX-1, POLICY_LIST>::invoke_forall(r, pol, seg, body);
+      // Return a generic event proxy from r,
+      // because forall returns a typed event proxy
+      return {r};
     }
 
-  };
+    return dynamic_helper<IDX - 1, POLICY_LIST>::invoke_forall(r, pol, seg,
+                                                               body);
+  }
+};
 
-  template<typename POLICY_LIST>
-  struct dynamic_helper<0, POLICY_LIST>
+template<typename POLICY_LIST>
+struct dynamic_helper<0, POLICY_LIST>
+{
+  template<typename SEGMENT, typename BODY>
+  static void invoke_forall(const int pol, SEGMENT const& seg, BODY const& body)
   {
-    template<typename SEGMENT, typename BODY>
-    static void
-    invoke_forall(const int pol, SEGMENT const &seg, BODY const &body)
+    if (0 == pol)
     {
-      if(0==pol){
-        using t_pol = typename camp::at<POLICY_LIST,camp::num<0>>::type;
-        RAJA::forall<t_pol>(seg, body);
-        return;
-      }
-      RAJA_ABORT_OR_THROW("Policy enum not supported ");
+      using t_pol = typename camp::at<POLICY_LIST, camp::num<0>>::type;
+      RAJA::forall<t_pol>(seg, body);
+      return;
     }
+    RAJA_ABORT_OR_THROW("Policy enum not supported ");
+  }
 
-    template<typename SEGMENT, typename BODY>
-    static resources::EventProxy<resources::Resource>
-    invoke_forall(RAJA::resources::Resource r, const int pol, SEGMENT const &seg, BODY const &body)
-    {
-      if(pol != 0) RAJA_ABORT_OR_THROW("Policy value out of range ");
+  template<typename SEGMENT, typename BODY>
+  static resources::EventProxy<resources::Resource> invoke_forall(
+      RAJA::resources::Resource r,
+      const int pol,
+      SEGMENT const& seg,
+      BODY const& body)
+  {
+    if (pol != 0) RAJA_ABORT_OR_THROW("Policy value out of range ");
 
-      using t_pol = typename camp::at<POLICY_LIST,camp::num<0>>::type;
-      using resource_type = typename resources::get_resource<t_pol>::type;
+    using t_pol         = typename camp::at<POLICY_LIST, camp::num<0>>::type;
+    using resource_type = typename resources::get_resource<t_pol>::type;
 
-      RAJA::forall<t_pol>(r.get<resource_type>(), seg, body);
+    RAJA::forall<t_pol>(r.get<resource_type>(), seg, body);
 
-      //Return a generic event proxy from r,
-      //because forall returns a typed event proxy
-      return {r};
-    }
+    // Return a generic event proxy from r,
+    // because forall returns a typed event proxy
+    return {r};
+  }
+};
 
-  };
+template<typename POLICY_LIST, typename SEGMENT, typename BODY>
+void dynamic_forall(const int pol, SEGMENT const& seg, BODY const& body)
+{
+  constexpr int N = camp::size<POLICY_LIST>::value;
+  static_assert(N > 0, "RAJA policy list must not be empty");
 
-  template<typename POLICY_LIST, typename SEGMENT, typename BODY>
-  void dynamic_forall(const int pol, SEGMENT const &seg, BODY const &body)
+  if (pol > N - 1)
   {
-    constexpr int N = camp::size<POLICY_LIST>::value;
-    static_assert(N > 0, "RAJA policy list must not be empty");
-
-    if(pol > N-1)  {
-      RAJA_ABORT_OR_THROW("Policy enum not supported");
-    }
-    dynamic_helper<N-1, POLICY_LIST>::invoke_forall(pol, seg, body);
+    RAJA_ABORT_OR_THROW("Policy enum not supported");
   }
+  dynamic_helper<N - 1, POLICY_LIST>::invoke_forall(pol, seg, body);
+}
 
-  template<typename POLICY_LIST, typename SEGMENT, typename BODY>
-  resources::EventProxy<resources::Resource>
-  dynamic_forall(RAJA::resources::Resource r, const int pol, SEGMENT const &seg, BODY const &body)
-  {
-    constexpr int N = camp::size<POLICY_LIST>::value;
-    static_assert(N > 0, "RAJA policy list must not be empty");
-
-    if(pol > N-1)  {
-      RAJA_ABORT_OR_THROW("Policy value out of range");
-    }
+template<typename POLICY_LIST, typename SEGMENT, typename BODY>
+resources::EventProxy<resources::Resource> dynamic_forall(
+    RAJA::resources::Resource r,
+    const int pol,
+    SEGMENT const& seg,
+    BODY const& body)
+{
+  constexpr int N = camp::size<POLICY_LIST>::value;
+  static_assert(N > 0, "RAJA policy list must not be empty");
 
-    return dynamic_helper<N-1, POLICY_LIST>::invoke_forall(r, pol, seg, body);
+  if (pol > N - 1)
+  {
+    RAJA_ABORT_OR_THROW("Policy value out of range");
   }
 
+  return dynamic_helper<N - 1, POLICY_LIST>::invoke_forall(r, pol, seg, body);
+}
+
 }  // namespace expt
 
 
diff --git a/include/RAJA/pattern/kernel.hpp b/include/RAJA/pattern/kernel.hpp
index 1875fe27d9..06387eb0e5 100644
--- a/include/RAJA/pattern/kernel.hpp
+++ b/include/RAJA/pattern/kernel.hpp
@@ -40,33 +40,32 @@ namespace RAJA
  *
  * This is just a list of RAJA::kernel statements.
  */
-template <typename... Stmts>
+template<typename... Stmts>
 using KernelPolicy = internal::StatementList<Stmts...>;
 
 
 ///
 /// Template list of argument indices
 ///
-template <camp::idx_t... ArgumentId>
+template<camp::idx_t... ArgumentId>
 using ArgList = camp::idx_seq<ArgumentId...>;
 
 
-template <typename T>
+template<typename T>
 struct IterableWrapperTuple;
 
-template <typename... Ts>
-struct IterableWrapperTuple<camp::tuple<Ts...>> {
+template<typename... Ts>
+struct IterableWrapperTuple<camp::tuple<Ts...>>
+{
 
-  using type =
-      camp::tuple<RAJA::Span<typename camp::decay<Ts>::iterator,
-                             typename camp::decay<Ts>::IndexType>...>;
+  using type = camp::tuple<RAJA::Span<typename camp::decay<Ts>::iterator,
+                                      typename camp::decay<Ts>::IndexType>...>;
 };
 
-
 namespace internal
 {
-template <class Tuple, camp::idx_t... I>
-RAJA_INLINE constexpr auto make_wrapped_tuple_impl(Tuple &&t,
+template<class Tuple, camp::idx_t... I>
+RAJA_INLINE constexpr auto make_wrapped_tuple_impl(Tuple&& t,
                                                    camp::idx_seq<I...>)
     -> camp::tuple<RAJA::Span<
         typename camp::decay<
@@ -75,38 +74,38 @@ RAJA_INLINE constexpr auto make_wrapped_tuple_impl(Tuple &&t,
             camp::tuple_element_t<I, camp::decay<Tuple>>>::IndexType>...>
 {
   return camp::make_tuple(
-      RAJA::Span<
-          typename camp::decay<
-              camp::tuple_element_t<I, camp::decay<Tuple>>>::iterator,
-          typename camp::decay<camp::tuple_element_t<I, camp::decay<Tuple>>>::
-              IndexType>{camp::get<I>(std::forward<Tuple>(t)).begin(),
-                         camp::get<I>(std::forward<Tuple>(t)).end()}...);
+      RAJA::Span<typename camp::decay<
+                     camp::tuple_element_t<I, camp::decay<Tuple>>>::iterator,
+                 typename camp::decay<
+                     camp::tuple_element_t<I, camp::decay<Tuple>>>::IndexType> {
+          camp::get<I>(std::forward<Tuple>(t)).begin(),
+          camp::get<I>(std::forward<Tuple>(t)).end()}...);
 }
 }  // namespace internal
 
-template <class Tuple>
-RAJA_INLINE constexpr auto make_wrapped_tuple(Tuple &&t)
+template<class Tuple>
+RAJA_INLINE constexpr auto make_wrapped_tuple(Tuple&& t)
     -> decltype(internal::make_wrapped_tuple_impl(
         std::forward<Tuple>(t),
-        camp::make_idx_seq_t<camp::tuple_size<camp::decay<Tuple>>::value>{}))
+        camp::make_idx_seq_t<camp::tuple_size<camp::decay<Tuple>>::value> {}))
 {
   return internal::make_wrapped_tuple_impl(
       std::forward<Tuple>(t),
-      camp::make_idx_seq_t<camp::tuple_size<camp::decay<Tuple>>::value>{});
+      camp::make_idx_seq_t<camp::tuple_size<camp::decay<Tuple>>::value> {});
 }
 
-
-template <typename PolicyType,
-          typename SegmentTuple,
-          typename ParamTuple,
-          typename Resource,
-          typename... Bodies>
-RAJA_INLINE resources::EventProxy<Resource> kernel_param_resource(SegmentTuple &&segments,
-                                                                  ParamTuple &&params,
-                                                                  Resource resource,
-                                                                  Bodies &&... bodies)
+template<typename PolicyType,
+         typename SegmentTuple,
+         typename ParamTuple,
+         typename Resource,
+         typename... Bodies>
+RAJA_INLINE resources::EventProxy<Resource> kernel_param_resource(
+    SegmentTuple&& segments,
+    ParamTuple&& params,
+    Resource resource,
+    Bodies&&... bodies)
 {
-  util::PluginContext context{util::make_context<PolicyType>()};
+  util::PluginContext context {util::make_context<PolicyType>()};
 
   // TODO: test that all policy members model the Executor policy concept
   // TODO: add a static_assert for functors which cannot be invoked with
@@ -119,10 +118,8 @@ RAJA_INLINE resources::EventProxy<Resource> kernel_param_resource(SegmentTuple &
 
   using param_tuple_t = camp::decay<ParamTuple>;
 
-  using loop_data_t = internal::LoopData<segment_tuple_t,
-                                         param_tuple_t,
-                                         Resource,
-                                         camp::decay<Bodies>...>;
+  using loop_data_t = internal::LoopData<segment_tuple_t, param_tuple_t,
+                                         Resource, camp::decay<Bodies>...>;
 
 
   util::callPreCapturePlugins(context);
@@ -131,11 +128,10 @@ RAJA_INLINE resources::EventProxy<Resource> kernel_param_resource(SegmentTuple &
   // our segments, loop bodies, and the tuple of loop indices
   // it is passed through all of the kernel mechanics by-referenece,
   // and only copied to provide thread-private instances.
-  loop_data_t loop_data(make_wrapped_tuple(
-                            std::forward<SegmentTuple>(segments)),
-                            std::forward<ParamTuple>(params),
-                            resource,
-                            std::forward<Bodies>(bodies)...);
+  loop_data_t loop_data(
+      make_wrapped_tuple(std::forward<SegmentTuple>(segments)),
+      std::forward<ParamTuple>(params), resource,
+      std::forward<Bodies>(bodies)...);
 
   util::callPostCapturePlugins(context);
 
@@ -152,50 +148,46 @@ RAJA_INLINE resources::EventProxy<Resource> kernel_param_resource(SegmentTuple &
   return resources::EventProxy<Resource>(resource);
 }
 
-template <typename PolicyType,
-          typename SegmentTuple,
-          typename Resource,
-          typename... Bodies>
-RAJA_INLINE resources::EventProxy<Resource> kernel_resource(SegmentTuple &&segments,
-                                                            Resource resource,
-                                                            Bodies &&... bodies)
+template<typename PolicyType,
+         typename SegmentTuple,
+         typename Resource,
+         typename... Bodies>
+RAJA_INLINE resources::EventProxy<Resource> kernel_resource(
+    SegmentTuple&& segments,
+    Resource resource,
+    Bodies&&... bodies)
 {
-  return RAJA::kernel_param_resource<PolicyType>(std::forward<SegmentTuple>(segments),
-                                                 RAJA::make_tuple(),
-                                                 resource,
-                                                 std::forward<Bodies>(bodies)...);
+  return RAJA::kernel_param_resource<PolicyType>(
+      std::forward<SegmentTuple>(segments), RAJA::make_tuple(), resource,
+      std::forward<Bodies>(bodies)...);
 }
 
-template <typename PolicyType,
-          typename SegmentTuple,
-          typename ParamTuple,
-          typename... Bodies>
-RAJA_INLINE resources::EventProxy<resources::resource_from_pol_t<PolicyType>> kernel_param(SegmentTuple &&segments,
-                                                                                           ParamTuple &&params,
-                                                                                           Bodies &&... bodies)
+template<typename PolicyType,
+         typename SegmentTuple,
+         typename ParamTuple,
+         typename... Bodies>
+RAJA_INLINE resources::EventProxy<resources::resource_from_pol_t<PolicyType>>
+kernel_param(SegmentTuple&& segments, ParamTuple&& params, Bodies&&... bodies)
 {
   auto res = resources::get_default_resource<PolicyType>();
-  return RAJA::kernel_param_resource<PolicyType>(std::forward<SegmentTuple>(segments),
-                                                 std::forward<ParamTuple>(params),
-                                                 res,
-                                                 std::forward<Bodies>(bodies)...);
+  return RAJA::kernel_param_resource<PolicyType>(
+      std::forward<SegmentTuple>(segments), std::forward<ParamTuple>(params),
+      res, std::forward<Bodies>(bodies)...);
 }
 
-template <typename PolicyType, typename SegmentTuple, typename... Bodies>
-RAJA_INLINE resources::EventProxy<resources::resource_from_pol_t<PolicyType>> kernel(SegmentTuple &&segments,
-                                                                                     Bodies &&... bodies)
+template<typename PolicyType, typename SegmentTuple, typename... Bodies>
+RAJA_INLINE resources::EventProxy<resources::resource_from_pol_t<PolicyType>>
+kernel(SegmentTuple&& segments, Bodies&&... bodies)
 {
   auto res = resources::get_default_resource<PolicyType>();
-  return RAJA::kernel_param_resource<PolicyType>(std::forward<SegmentTuple>(segments),
-                                                 RAJA::make_tuple(),
-                                                 res,
-                                                 std::forward<Bodies>(bodies)...);
+  return RAJA::kernel_param_resource<PolicyType>(
+      std::forward<SegmentTuple>(segments), RAJA::make_tuple(), res,
+      std::forward<Bodies>(bodies)...);
 }
 
 
 }  // end namespace RAJA
 
-
 #include "RAJA/pattern/kernel/Collapse.hpp"
 #include "RAJA/pattern/kernel/Conditional.hpp"
 #include "RAJA/pattern/kernel/For.hpp"
diff --git a/include/RAJA/pattern/kernel/Collapse.hpp b/include/RAJA/pattern/kernel/Collapse.hpp
index 8efb126397..5d4a0c2308 100644
--- a/include/RAJA/pattern/kernel/Collapse.hpp
+++ b/include/RAJA/pattern/kernel/Collapse.hpp
@@ -26,11 +26,11 @@ namespace statement
 {
 
 
-template <typename ExecPolicy, typename ForList, typename... EnclosedStmts>
+template<typename ExecPolicy, typename ForList, typename... EnclosedStmts>
 struct Collapse : public internal::ForList,
                   public internal::CollapseBase,
-                  public internal::Statement<ExecPolicy, EnclosedStmts...> {
-};
+                  public internal::Statement<ExecPolicy, EnclosedStmts...>
+{};
 
 
 }  // namespace statement
diff --git a/include/RAJA/pattern/kernel/Conditional.hpp b/include/RAJA/pattern/kernel/Conditional.hpp
index 6b7875c4c2..32188c80a7 100644
--- a/include/RAJA/pattern/kernel/Conditional.hpp
+++ b/include/RAJA/pattern/kernel/Conditional.hpp
@@ -36,20 +36,20 @@ namespace statement
  * A RAJA::kernel statement that implements conditional control logic
  *
  */
-template <typename Condition, typename... EnclosedStmts>
-struct If : public internal::Statement<camp::nil, EnclosedStmts...> {
-};
-
+template<typename Condition, typename... EnclosedStmts>
+struct If : public internal::Statement<camp::nil, EnclosedStmts...>
+{};
 
 /*!
  * An expression that returns a compile time literal value.
  *
  */
-template <long value>
-struct Value {
+template<long value>
+struct Value
+{
 
-  template <typename Data>
-  RAJA_HOST_DEVICE RAJA_INLINE static long eval(Data const &)
+  template<typename Data>
+  RAJA_HOST_DEVICE RAJA_INLINE static long eval(Data const&)
   {
     return value;
   }
@@ -59,11 +59,12 @@ struct Value {
  * An equality expression
  *
  */
-template <typename L, typename R>
-struct Equals {
+template<typename L, typename R>
+struct Equals
+{
 
-  template <typename Data>
-  RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const &data)
+  template<typename Data>
+  RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const& data)
   {
     return L::eval(data) == R::eval(data);
   }
@@ -73,116 +74,117 @@ struct Equals {
  * A negated equality expression
  *
  */
-template <typename L, typename R>
-struct NotEquals {
+template<typename L, typename R>
+struct NotEquals
+{
 
-  template <typename Data>
-  RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const &data)
+  template<typename Data>
+  RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const& data)
   {
     return L::eval(data) != R::eval(data);
   }
 };
 
-
 /*!
  * A logical OR expression
  *
  */
-template <typename L, typename R>
-struct Or {
+template<typename L, typename R>
+struct Or
+{
 
-  template <typename Data>
-  RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const &data)
+  template<typename Data>
+  RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const& data)
   {
     return L::eval(data) || R::eval(data);
   }
 };
 
-
 /*!
  * A logical AND expression
  *
  */
-template <typename L, typename R>
-struct And {
+template<typename L, typename R>
+struct And
+{
 
-  template <typename Data>
-  RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const &data)
+  template<typename Data>
+  RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const& data)
   {
     return L::eval(data) && R::eval(data);
   }
 };
 
-
 /*!
  * A less than expression
  *
  */
-template <typename L, typename R>
-struct LessThan {
+template<typename L, typename R>
+struct LessThan
+{
 
-  template <typename Data>
-  RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const &data)
+  template<typename Data>
+  RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const& data)
   {
     return L::eval(data) < R::eval(data);
   }
 };
 
-
 /*!
  * A less or equals than expression
  *
  */
-template <typename L, typename R>
-struct LessThanEq {
+template<typename L, typename R>
+struct LessThanEq
+{
 
-  template <typename Data>
-  RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const &data)
+  template<typename Data>
+  RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const& data)
   {
     return L::eval(data) <= R::eval(data);
   }
 };
 
-
 /*!
  * A greater than expression
  *
  */
-template <typename L, typename R>
-struct GreaterThan {
+template<typename L, typename R>
+struct GreaterThan
+{
 
-  template <typename Data>
-  RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const &data)
+  template<typename Data>
+  RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const& data)
   {
     return L::eval(data) > R::eval(data);
   }
 };
 
-
 /*!
  * A greater or equals than expression
  *
  */
-template <typename L, typename R>
-struct GreaterThanEq {
+template<typename L, typename R>
+struct GreaterThanEq
+{
 
-  template <typename Data>
-  RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const &data)
+  template<typename Data>
+  RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const& data)
   {
     return L::eval(data) >= R::eval(data);
   }
 };
 
-
 /*!
  * A negation expression
  *
  */
-template <typename L>
-struct Not {
+template<typename L>
+struct Not
+{
 
-  template <typename Data>
-  RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const &data)
+  template<typename Data>
+  RAJA_HOST_DEVICE RAJA_INLINE static bool eval(Data const& data)
   {
     return !(L::eval(data));
   }
@@ -195,15 +197,17 @@ namespace internal
 {
 
 
-template <typename Condition, typename... EnclosedStmts, typename Types>
-struct StatementExecutor<statement::If<Condition, EnclosedStmts...>, Types> {
+template<typename Condition, typename... EnclosedStmts, typename Types>
+struct StatementExecutor<statement::If<Condition, EnclosedStmts...>, Types>
+{
 
 
-  template <typename Data>
-  static RAJA_INLINE void exec(Data &&data)
+  template<typename Data>
+  static RAJA_INLINE void exec(Data&& data)
   {
 
-    if (Condition::eval(data)) {
+    if (Condition::eval(data))
+    {
       execute_statement_list<camp::list<EnclosedStmts...>, Types>(
           std::forward<Data>(data));
     }
diff --git a/include/RAJA/pattern/kernel/For.hpp b/include/RAJA/pattern/kernel/For.hpp
index 539c451673..a4684236f4 100644
--- a/include/RAJA/pattern/kernel/For.hpp
+++ b/include/RAJA/pattern/kernel/For.hpp
@@ -37,12 +37,13 @@ namespace statement
  * Assigns the loop iterate to argument ArgumentId
  *
  */
-template <camp::idx_t ArgumentId,
-          typename ExecPolicy = camp::nil,
-          typename... EnclosedStmts>
+template<camp::idx_t ArgumentId,
+         typename ExecPolicy = camp::nil,
+         typename... EnclosedStmts>
 struct For : public internal::ForList,
              public internal::ForTraitBase<ArgumentId, ExecPolicy>,
-             public internal::Statement<ExecPolicy, EnclosedStmts...> {
+             public internal::Statement<ExecPolicy, EnclosedStmts...>
+{
 
   // TODO: add static_assert for valid policy in Pol
   using execution_policy_t = ExecPolicy;
@@ -59,14 +60,18 @@ namespace internal
  * Assigns the loop index to offset ArgumentId
  *
  */
-template <camp::idx_t ArgumentId, typename Data, typename Types, typename... EnclosedStmts>
-struct ForWrapper : public GenericWrapper<Data, Types, EnclosedStmts...> {
+template<camp::idx_t ArgumentId,
+         typename Data,
+         typename Types,
+         typename... EnclosedStmts>
+struct ForWrapper : public GenericWrapper<Data, Types, EnclosedStmts...>
+{
 
   using Base = GenericWrapper<Data, Types, EnclosedStmts...>;
   using Base::Base;
   using privatizer = NestedPrivatizer<ForWrapper>;
 
-  template <typename InIndexType>
+  template<typename InIndexType>
   RAJA_INLINE void operator()(InIndexType i)
   {
     Base::data.template assign_offset<ArgumentId>(i);
@@ -74,22 +79,23 @@ struct ForWrapper : public GenericWrapper<Data, Types, EnclosedStmts...> {
   }
 };
 
-
 /*!
  * A generic RAJA::kernel forall_impl executor for statement::For
  *
  *
  */
-template <camp::idx_t ArgumentId,
-          typename ExecPolicy,
-          typename... EnclosedStmts,
-          typename Types>
+template<camp::idx_t ArgumentId,
+         typename ExecPolicy,
+         typename... EnclosedStmts,
+         typename Types>
 struct StatementExecutor<
-    statement::For<ArgumentId, ExecPolicy, EnclosedStmts...>, Types> {
+    statement::For<ArgumentId, ExecPolicy, EnclosedStmts...>,
+    Types>
+{
 
 
-  template <typename Data>
-  static RAJA_INLINE void exec(Data &&data)
+  template<typename Data>
+  static RAJA_INLINE void exec(Data&& data)
   {
 
     // Set the argument type for this loop
@@ -98,12 +104,13 @@ struct StatementExecutor<
     // Create a wrapper, just in case forall_impl needs to thread_privatize
     ForWrapper<ArgumentId, Data, NewTypes, EnclosedStmts...> for_wrapper(data);
 
-    auto len = segment_length<ArgumentId>(data);
+    auto len    = segment_length<ArgumentId>(data);
     using len_t = decltype(len);
 
     auto r = data.res;
 
-    forall_impl(r, ExecPolicy{}, TypedRangeSegment<len_t>(0, len), for_wrapper, RAJA::expt::get_empty_forall_param_pack());
+    forall_impl(r, ExecPolicy {}, TypedRangeSegment<len_t>(0, len), for_wrapper,
+                RAJA::expt::get_empty_forall_param_pack());
   }
 };
 
@@ -112,15 +119,14 @@ struct StatementExecutor<
  *
  *
  */
-template <camp::idx_t ArgumentId,
-          typename... EnclosedStmts,
-          typename Types>
-struct StatementExecutor<
-    statement::For<ArgumentId, seq_exec, EnclosedStmts...>, Types> {
+template<camp::idx_t ArgumentId, typename... EnclosedStmts, typename Types>
+struct StatementExecutor<statement::For<ArgumentId, seq_exec, EnclosedStmts...>,
+                         Types>
+{
 
 
-  template <typename Data>
-  static RAJA_INLINE void exec(Data &&data)
+  template<typename Data>
+  static RAJA_INLINE void exec(Data&& data)
   {
 
     // Set the argument type for this loop
@@ -129,12 +135,13 @@ struct StatementExecutor<
     // Create a wrapper, just in case forall_impl needs to thread_privatize
     ForWrapper<ArgumentId, Data, NewTypes, EnclosedStmts...> for_wrapper(data);
 
-    auto len = segment_length<ArgumentId>(data);
+    auto len    = segment_length<ArgumentId>(data);
     using len_t = decltype(len);
 
     RAJA_EXTRACT_BED_IT(TypedRangeSegment<len_t>(0, len));
 
-    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
+    for (decltype(distance_it) i = 0; i < distance_it; ++i)
+    {
       for_wrapper(*(begin_it + i));
     }
   }
diff --git a/include/RAJA/pattern/kernel/ForICount.hpp b/include/RAJA/pattern/kernel/ForICount.hpp
index 18515c7f59..3276c6e0e8 100644
--- a/include/RAJA/pattern/kernel/ForICount.hpp
+++ b/include/RAJA/pattern/kernel/ForICount.hpp
@@ -39,13 +39,14 @@ namespace statement
  * Assigns the loop index to param ParamId
  *
  */
-template <camp::idx_t ArgumentId,
-          typename ParamId,
-          typename ExecPolicy = camp::nil,
-          typename... EnclosedStmts>
+template<camp::idx_t ArgumentId,
+         typename ParamId,
+         typename ExecPolicy = camp::nil,
+         typename... EnclosedStmts>
 struct ForICount : public internal::ForList,
-             public internal::ForTraitBase<ArgumentId, ExecPolicy>,
-             public internal::Statement<ExecPolicy, EnclosedStmts...> {
+                   public internal::ForTraitBase<ArgumentId, ExecPolicy>,
+                   public internal::Statement<ExecPolicy, EnclosedStmts...>
+{
 
   static_assert(std::is_base_of<internal::ParamBase, ParamId>::value,
                 "Inappropriate ParamId, ParamId must be of type "
@@ -64,15 +65,19 @@ namespace internal
  * Assigns the loop index to offset ArgumentId
  * Assigns the loop index to param ParamId
  */
-template <camp::idx_t ArgumentId, typename ParamId, typename Data, typename Types,
-          typename... EnclosedStmts>
-struct ForICountWrapper : public GenericWrapper<Data, Types, EnclosedStmts...> {
+template<camp::idx_t ArgumentId,
+         typename ParamId,
+         typename Data,
+         typename Types,
+         typename... EnclosedStmts>
+struct ForICountWrapper : public GenericWrapper<Data, Types, EnclosedStmts...>
+{
 
   using Base = GenericWrapper<Data, Types, EnclosedStmts...>;
   using Base::Base;
   using privatizer = NestedPrivatizer<ForICountWrapper>;
 
-  template <typename InIndexType>
+  template<typename InIndexType>
   RAJA_INLINE void operator()(InIndexType i)
   {
     Base::data.template assign_offset<ArgumentId>(i);
@@ -81,38 +86,40 @@ struct ForICountWrapper : public GenericWrapper<Data, Types, EnclosedStmts...> {
   }
 };
 
-
 /*!
  * A generic RAJA::kernel forall_impl executor for statement::ForICount
  *
  *
  */
-template <camp::idx_t ArgumentId,
-          typename ParamId,
-          typename ExecPolicy,
-          typename... EnclosedStmts,
-          typename Types>
+template<camp::idx_t ArgumentId,
+         typename ParamId,
+         typename ExecPolicy,
+         typename... EnclosedStmts,
+         typename Types>
 struct StatementExecutor<
-    statement::ForICount<ArgumentId, ParamId, ExecPolicy, EnclosedStmts...>, Types> {
+    statement::ForICount<ArgumentId, ParamId, ExecPolicy, EnclosedStmts...>,
+    Types>
+{
 
 
-  template <typename Data>
-  static RAJA_INLINE void exec(Data &&data)
+  template<typename Data>
+  static RAJA_INLINE void exec(Data&& data)
   {
 
     // Set the argument type for this loop
     using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
     // Create a wrapper, just in case forall_impl needs to thread_privatize
-    ForICountWrapper<ArgumentId, ParamId, Data, NewTypes,
-                     EnclosedStmts...> for_wrapper(data);
+    ForICountWrapper<ArgumentId, ParamId, Data, NewTypes, EnclosedStmts...>
+        for_wrapper(data);
 
-    auto len = segment_length<ArgumentId>(data);
+    auto len    = segment_length<ArgumentId>(data);
     using len_t = decltype(len);
 
     auto r = resources::get_resource<ExecPolicy>::type::get_default();
 
-    forall_impl(r, ExecPolicy{}, TypedRangeSegment<len_t>(0, len), for_wrapper, RAJA::expt::get_empty_forall_param_pack());
+    forall_impl(r, ExecPolicy {}, TypedRangeSegment<len_t>(0, len), for_wrapper,
+                RAJA::expt::get_empty_forall_param_pack());
   }
 };
 
diff --git a/include/RAJA/pattern/kernel/Hyperplane.hpp b/include/RAJA/pattern/kernel/Hyperplane.hpp
index 955afcecc0..41e43333a9 100644
--- a/include/RAJA/pattern/kernel/Hyperplane.hpp
+++ b/include/RAJA/pattern/kernel/Hyperplane.hpp
@@ -76,15 +76,13 @@ namespace statement
  *  });
  *
  */
-template <camp::idx_t HpArgumentId,
-          typename HpExecPolicy,
-          typename ArgList,
-          typename ExecPolicy,
-          typename... EnclosedStmts>
-struct Hyperplane
-    : public internal::Statement<ExecPolicy,
-                                 EnclosedStmts...> {
-};
+template<camp::idx_t HpArgumentId,
+         typename HpExecPolicy,
+         typename ArgList,
+         typename ExecPolicy,
+         typename... EnclosedStmts>
+struct Hyperplane : public internal::Statement<ExecPolicy, EnclosedStmts...>
+{};
 
 }  // end namespace statement
 
@@ -92,27 +90,27 @@ namespace internal
 {
 
 
-template <camp::idx_t HpArgumentId, typename ArgList, typename... EnclosedStmts>
-struct HyperplaneInner
-    : public internal::Statement<camp::nil, EnclosedStmts...> {
-};
-
+template<camp::idx_t HpArgumentId, typename ArgList, typename... EnclosedStmts>
+struct HyperplaneInner : public internal::Statement<camp::nil, EnclosedStmts...>
+{};
 
-template <camp::idx_t HpArgumentId,
-          typename HpExecPolicy,
-          camp::idx_t... Args,
-          typename ExecPolicy,
-          typename... EnclosedStmts,
-          typename Types>
+template<camp::idx_t HpArgumentId,
+         typename HpExecPolicy,
+         camp::idx_t... Args,
+         typename ExecPolicy,
+         typename... EnclosedStmts,
+         typename Types>
 struct StatementExecutor<statement::Hyperplane<HpArgumentId,
                                                HpExecPolicy,
                                                ArgList<Args...>,
                                                ExecPolicy,
-                                               EnclosedStmts...>, Types> {
+                                               EnclosedStmts...>,
+                         Types>
+{
 
 
-  template <typename Data>
-  static RAJA_INLINE void exec(Data &data)
+  template<typename Data>
+  static RAJA_INLINE void exec(Data& data)
   {
 
     // get type of Hp arguments index
@@ -126,8 +124,7 @@ struct StatementExecutor<statement::Hyperplane<HpArgumentId,
     // Add a Collapse policy around our enclosed statements that will handle
     // the inner hyperplane loop's execution
     using kernel_policy = statement::Collapse<
-        ExecPolicy,
-        ArgList<Args...>,
+        ExecPolicy, ArgList<Args...>,
         HyperplaneInner<HpArgumentId, ArgList<Args...>, EnclosedStmts...>>;
 
     // Create a For-loop wrapper for the outer loop
@@ -135,9 +132,9 @@ struct StatementExecutor<statement::Hyperplane<HpArgumentId,
 
     // compute manhattan distance of iteration space to determine
     // as:  hp_len = l0 + l1 + l2 + ...
-    idx_t hp_len = segment_length<HpArgumentId>(data) +
-                   foldl(RAJA::operators::plus<idx_t>(),
-                                 segment_length<Args>(data)...);
+    idx_t hp_len =
+        segment_length<HpArgumentId>(data) +
+        foldl(RAJA::operators::plus<idx_t>(), segment_length<Args>(data)...);
 
     /* Execute the outer loop over hyperplanes
      *
@@ -146,40 +143,40 @@ struct StatementExecutor<statement::Hyperplane<HpArgumentId,
      * arguments actual value (and restrict to valid hyperplane indices)
      */
     auto r = resources::get_resource<HpExecPolicy>::type::get_default();
-    forall_impl(r, HpExecPolicy{},
-                TypedRangeSegment<idx_t>(0, hp_len),
-                outer_wrapper,
-                RAJA::expt::get_empty_forall_param_pack());
+    forall_impl(r, HpExecPolicy {}, TypedRangeSegment<idx_t>(0, hp_len),
+                outer_wrapper, RAJA::expt::get_empty_forall_param_pack());
   }
 };
 
-
-template <camp::idx_t HpArgumentId,
-          camp::idx_t... Args,
-          typename... EnclosedStmts,
-          typename Types>
+template<camp::idx_t HpArgumentId,
+         camp::idx_t... Args,
+         typename... EnclosedStmts,
+         typename Types>
 struct StatementExecutor<
-    HyperplaneInner<HpArgumentId, ArgList<Args...>, EnclosedStmts...>, Types> {
+    HyperplaneInner<HpArgumentId, ArgList<Args...>, EnclosedStmts...>,
+    Types>
+{
 
 
-  template <typename Data>
-  static RAJA_INLINE void exec(Data &data)
+  template<typename Data>
+  static RAJA_INLINE void exec(Data& data)
   {
 
     // get h value
-    auto h = camp::get<HpArgumentId>(data.offset_tuple);
+    auto h      = camp::get<HpArgumentId>(data.offset_tuple);
     using idx_t = decltype(h);
 
     // compute actual iterate for HpArgumentId
     // as:  i0 = h - (i1 + i2 + i3 + ...)
     idx_t i = h - foldl(RAJA::operators::plus<idx_t>(),
-                                camp::get<Args>(data.offset_tuple)...);
+                        camp::get<Args>(data.offset_tuple)...);
 
     // get length of Hp indexed argument
     auto len = segment_length<HpArgumentId>(data);
 
     // check bounds
-    if (i >= 0 && i < len) {
+    if (i >= 0 && i < len)
+    {
 
       // store in tuple
       data.template assign_offset<HpArgumentId>(i);
diff --git a/include/RAJA/pattern/kernel/InitLocalMem.hpp b/include/RAJA/pattern/kernel/InitLocalMem.hpp
index 21d9e3cd2a..92ca93579c 100644
--- a/include/RAJA/pattern/kernel/InitLocalMem.hpp
+++ b/include/RAJA/pattern/kernel/InitLocalMem.hpp
@@ -26,10 +26,9 @@
 namespace RAJA
 {
 
-//Policies for RAJA local arrays
+// Policies for RAJA local arrays
 struct cpu_tile_mem;
 
-
 namespace statement
 {
 
@@ -44,13 +43,15 @@ namespace statement
  * Will intialize the 0th array in the param tuple
  */
 template<typename Pol, typename Indices, typename... EnclosedStmts>
-struct InitLocalMem : public internal::Statement<camp::nil> {
-};
+struct InitLocalMem : public internal::Statement<camp::nil>
+{};
 
-//Policy Specialization
+// Policy Specialization
 template<camp::idx_t... Indices, typename... EnclosedStmts>
-struct InitLocalMem<RAJA::cpu_tile_mem, camp::idx_seq<Indices...>, EnclosedStmts...> : public internal::Statement<camp::nil> {
-};
+struct InitLocalMem<RAJA::cpu_tile_mem,
+                    camp::idx_seq<Indices...>,
+                    EnclosedStmts...> : public internal::Statement<camp::nil>
+{};
 
 
 }  // end namespace statement
@@ -58,28 +59,33 @@ struct InitLocalMem<RAJA::cpu_tile_mem, camp::idx_seq<Indices...>, EnclosedStmts
 namespace internal
 {
 
-//Statement executor to initalize RAJA local array
+// Statement executor to initialize RAJA local array
 template<camp::idx_t... Indices, typename... EnclosedStmts, typename Types>
-struct StatementExecutor<statement::InitLocalMem<RAJA::cpu_tile_mem,camp::idx_seq<Indices...>, EnclosedStmts...>, Types>{
-  
-  //Execute statement list
+struct StatementExecutor<statement::InitLocalMem<RAJA::cpu_tile_mem,
+                                                 camp::idx_seq<Indices...>,
+                                                 EnclosedStmts...>,
+                         Types>
+{
+
+  // Execute statement list
   template<class Data>
-  static void RAJA_INLINE exec_expanded(Data && data)
+  static void RAJA_INLINE exec_expanded(Data&& data)
   {
     execute_statement_list<camp::list<EnclosedStmts...>, Types>(data);
   }
-  
-  //Intialize local array
-  //Identifies type + number of elements needed
+
+  // Initialize local array
+  // Identifies type + number of elements needed
   template<camp::idx_t Pos, camp::idx_t... others, class Data>
-  static void RAJA_INLINE exec_expanded(Data && data)
+  static void RAJA_INLINE exec_expanded(Data&& data)
   {
-    using varType = typename camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
+    using varType = typename camp::tuple_element_t<
+        Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
 
     // Initialize memory
 #ifdef RAJA_COMPILER_MSVC
     // MSVC doesn't like taking a pointer to stack allocated data?!?!
-    varType *ptr = new varType[camp::get<Pos>(data.param_tuple).size()];
+    varType* ptr = new varType[camp::get<Pos>(data.param_tuple).size()];
     camp::get<Pos>(data.param_tuple).set_data(ptr);
 #else
     varType Array[camp::get<Pos>(data.param_tuple).size()];
@@ -95,16 +101,13 @@ struct StatementExecutor<statement::InitLocalMem<RAJA::cpu_tile_mem,camp::idx_se
     delete[] ptr;
 #endif
   }
-  
 
-  
   template<typename Data>
-  static RAJA_INLINE void exec(Data &&data)
+  static RAJA_INLINE void exec(Data&& data)
   {
-    //Initalize local arrays + execute statements + cleanup
+    // Initialize local arrays + execute statements + cleanup
     exec_expanded<Indices...>(data);
   }
-  
 };
 
 
diff --git a/include/RAJA/pattern/kernel/Lambda.hpp b/include/RAJA/pattern/kernel/Lambda.hpp
index 29d41b431e..4cbf67f72a 100644
--- a/include/RAJA/pattern/kernel/Lambda.hpp
+++ b/include/RAJA/pattern/kernel/Lambda.hpp
@@ -49,25 +49,24 @@ struct lambda_arg_offset_t
 template<typename T>
 struct lambda_arg_value_t
 {
-    using type = T;
+  using type = T;
 };
 
 template<typename T, camp::idx_t V>
 struct LambdaArg
 {
-    static constexpr camp::idx_t value = V;
+  static constexpr camp::idx_t value = V;
 };
 
-}
-
-
+}  // namespace internal
 
 /*!
  * Used in RAJA::statement::Lambda to specify that one or more segment values
  * should be passed into the lambda as an argument
  */
-template<camp::idx_t ... args>
-using Segs = camp::list<internal::LambdaArg<internal::lambda_arg_seg_t, args>...>;
+template<camp::idx_t... args>
+using Segs =
+    camp::list<internal::LambdaArg<internal::lambda_arg_seg_t, args>...>;
 
 /*!
  * Used in RAJA::statement::Lambda to specify that one or more segment offsets
@@ -79,16 +78,18 @@ using Segs = camp::list<internal::LambdaArg<internal::lambda_arg_seg_t, args>...
  * In the case of tiling (with Tile) the offset is w.r.t. the beginning of the
  * current tile.
  */
-template<camp::idx_t ... args>
-using Offsets = camp::list<internal::LambdaArg<internal::lambda_arg_offset_t, args>...>;
+template<camp::idx_t... args>
+using Offsets =
+    camp::list<internal::LambdaArg<internal::lambda_arg_offset_t, args>...>;
 
 
 /*!
  * Used in RAJA::statement::Lambda to specify that one or more parameters that
  * should be passed into the lambda as an argument.
  */
-template<camp::idx_t ... args>
-using Params = camp::list<internal::LambdaArg<internal::lambda_arg_param_t, args>...>;
+template<camp::idx_t... args>
+using Params =
+    camp::list<internal::LambdaArg<internal::lambda_arg_param_t, args>...>;
 
 /*!
  * Used in RAJA::statement::Lambda to specify that one or more constant values
@@ -103,9 +104,9 @@ using Params = camp::list<internal::LambdaArg<internal::lambda_arg_param_t, args
  * writing:   Lambda<0, ValuesT<double, 3, 4>>
  * invokes:   lambda0( (double)3, (double) 4 )
  */
-template<typename T, camp::idx_t ... values>
-using ValuesT = camp::list<internal::LambdaArg<internal::lambda_arg_value_t<T>, values>...>;
-
+template<typename T, camp::idx_t... values>
+using ValuesT =
+    camp::list<internal::LambdaArg<internal::lambda_arg_value_t<T>, values>...>;
 
 namespace statement
 {
@@ -119,8 +120,9 @@ namespace statement
  * RAJA::kernel<exec_pol>(make_tuple{s0, s1, s2}, lambda0, lambda1);
  *
  */
-template <camp::idx_t BodyIdx, typename... Args >
-struct Lambda : internal::Statement<camp::nil> {
+template<camp::idx_t BodyIdx, typename... Args>
+struct Lambda : internal::Statement<camp::nil>
+{
   static const camp::idx_t loop_body_index = BodyIdx;
 };
 
@@ -130,13 +132,6 @@ namespace internal
 {
 
 
-
-
-
-
-
-
-
 /*
  * Helper that extracts a segment value for a lambda argument
  *
@@ -150,22 +145,18 @@ template<typename SegmentType, camp::idx_t id>
 struct LambdaSegExtractor
 {
 
-  static_assert(!std::is_same<SegmentType, void>::value,
+  static_assert(
+      !std::is_same<SegmentType, void>::value,
       "Segment not assigned, but used in Lambda with Segs<> argument");
 
   template<typename Data>
-  RAJA_HOST_DEVICE
-  RAJA_INLINE
-  constexpr
-  static SegmentType extract(Data &&data)
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr static SegmentType extract(Data&& data)
   {
-    return SegmentType(camp::get<id>(data.segment_tuple).begin()[camp::get<id>(data.offset_tuple)]);
+    return SegmentType(camp::get<id>(data.segment_tuple)
+                           .begin()[camp::get<id>(data.offset_tuple)]);
   }
-
 };
 
-
-
 /*
  * Helper that extracts a segment value for a lambda argument
  *
@@ -179,22 +170,18 @@ template<typename OffsetType, camp::idx_t id>
 struct LambdaOffsetExtractor
 {
 
-  static_assert(!std::is_same<OffsetType, void>::value,
+  static_assert(
+      !std::is_same<OffsetType, void>::value,
       "Segment not assigned, but used in Lambda with Offsets<> argument");
 
   template<typename Data>
-  RAJA_HOST_DEVICE
-  RAJA_INLINE
-  constexpr
-  static OffsetType extract(Data &&data)
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr static OffsetType extract(Data&& data)
   {
     return OffsetType(camp::get<id>(data.offset_tuple));
   }
-
 };
 
 
-
 /*
  * Helper that provides first level of argument extraction
  * This acts as a switchboard between Segs, Offsets, and Params
@@ -205,25 +192,22 @@ struct LambdaOffsetExtractor
 template<typename Types, typename T>
 struct LambdaArgSwitchboard;
 
-
 template<typename Types, camp::idx_t id>
 struct LambdaArgSwitchboard<Types, LambdaArg<lambda_arg_offset_t, id>>
 {
 
   using OffsetType = camp::at_v<typename Types::offset_types_t, id>;
 
-  static_assert(!std::is_same<OffsetType, void>::value,
+  static_assert(
+      !std::is_same<OffsetType, void>::value,
       "Offset not assigned, but used in Lambda with Offsets<> argument");
 
   template<typename Data>
-  RAJA_HOST_DEVICE
-  RAJA_INLINE
-  constexpr
-  static OffsetType extract(Data &&data)
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr static OffsetType extract(Data&& data)
   {
-    return LambdaOffsetExtractor<OffsetType, id>::extract(std::forward<Data>(data));
+    return LambdaOffsetExtractor<OffsetType, id>::extract(
+        std::forward<Data>(data));
   }
-
 };
 
 template<typename Types, camp::idx_t id>
@@ -232,110 +216,108 @@ struct LambdaArgSwitchboard<Types, LambdaArg<lambda_arg_seg_t, id>>
 
   using SegmentType = camp::at_v<typename Types::segment_types_t, id>;
 
-  static_assert(!std::is_same<SegmentType, void>::value,
+  static_assert(
+      !std::is_same<SegmentType, void>::value,
       "Segment not assigned, but used in Lambda with Segs<> argument");
 
   template<typename Data>
-  RAJA_HOST_DEVICE
-  RAJA_INLINE
-  constexpr
-  static SegmentType extract(Data &&data)
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr static SegmentType extract(Data&& data)
   {
-    return LambdaSegExtractor<SegmentType, id>::extract(std::forward<Data>(data));
+    return LambdaSegExtractor<SegmentType, id>::extract(
+        std::forward<Data>(data));
   }
-
 };
 
 template<typename Types, camp::idx_t id>
 struct LambdaArgSwitchboard<Types, LambdaArg<lambda_arg_param_t, id>>
 {
   template<typename Data>
-  RAJA_HOST_DEVICE
-  RAJA_INLINE
-  constexpr
-  static auto extract(Data &&data)->
-    typename std::add_lvalue_reference<camp::tuple_element_t<id,typename camp::decay<Data>::param_tuple_t>>::type
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr static auto extract(Data&& data) ->
+      typename std::add_lvalue_reference<camp::tuple_element_t<
+          id,
+          typename camp::decay<Data>::param_tuple_t>>::type
   {
     return camp::get<id>(data.param_tuple);
   }
 };
 
-
 template<typename Types, typename T, camp::idx_t value>
 struct LambdaArgSwitchboard<Types, LambdaArg<lambda_arg_value_t<T>, value>>
 {
   template<typename Data>
-  RAJA_HOST_DEVICE
-  RAJA_INLINE
-  constexpr
-  static T extract(Data &&)
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr static T extract(Data&&)
   {
     return T(value);
   }
 };
 
-
-
 RAJA_SUPPRESS_HD_WARN
-template<camp::idx_t LoopIndex, typename Types, typename Data, typename... targLists>
-RAJA_INLINE RAJA_HOST_DEVICE void invoke_lambda_with_args(Data &&data,
-                                                       camp::list<targLists...> const &)
+template<camp::idx_t LoopIndex,
+         typename Types,
+         typename Data,
+         typename... targLists>
+RAJA_INLINE RAJA_HOST_DEVICE void invoke_lambda_with_args(
+    Data&& data,
+    camp::list<targLists...> const&)
 {
   camp::get<LoopIndex>(data.bodies)(
       LambdaArgSwitchboard<Types, targLists>::extract(data)...);
 }
 
-
-
-
 /*!
  * A RAJA::kernel statement that invokes a lambda function
  * with user specified arguments.
  */
-template <camp::idx_t LambdaIndex,typename... Args, typename Types>
-struct StatementExecutor<statement::Lambda<LambdaIndex, Args...>, Types> {
+template<camp::idx_t LambdaIndex, typename... Args, typename Types>
+struct StatementExecutor<statement::Lambda<LambdaIndex, Args...>, Types>
+{
 
-  template <typename Data>
-  static RAJA_INLINE RAJA_HOST_DEVICE void exec(Data &&data)
+  template<typename Data>
+  static RAJA_INLINE RAJA_HOST_DEVICE void exec(Data&& data)
   {
 
-    //Convert SegList, ParamList into Seg, Param types, and store in a list
+    // Convert SegList, ParamList into Seg, Param types, and store in a list
     using targList = typename camp::flatten<camp::list<Args...>>::type;
 
-    invoke_lambda_with_args<LambdaIndex, Types>(std::forward<Data>(data), targList{});
+    invoke_lambda_with_args<LambdaIndex, Types>(std::forward<Data>(data),
+                                                targList {});
   }
 };
 
-
-
-template <camp::idx_t LambdaIndex, typename Types, typename Data, camp::idx_t ... SEGS, camp::idx_t ... PARAMS>
-RAJA_INLINE RAJA_HOST_DEVICE void invoke_lambda(Data &&data, camp::idx_seq<SEGS...> const &, camp::idx_seq<PARAMS...> const &)
+template<camp::idx_t LambdaIndex,
+         typename Types,
+         typename Data,
+         camp::idx_t... SEGS,
+         camp::idx_t... PARAMS>
+RAJA_INLINE RAJA_HOST_DEVICE void invoke_lambda(Data&& data,
+                                                camp::idx_seq<SEGS...> const&,
+                                                camp::idx_seq<PARAMS...> const&)
 {
 
-  using AllSegs = Segs<SEGS...>;
+  using AllSegs   = Segs<SEGS...>;
   using AllParams = Params<PARAMS...>;
 
   // invoke the expanded Lambda executor, passing in all segments and params
-  StatementExecutor<statement::Lambda<LambdaIndex, AllSegs, AllParams>, Types>::exec(std::forward<Data>(data));
+  StatementExecutor<statement::Lambda<LambdaIndex, AllSegs, AllParams>,
+                    Types>::exec(std::forward<Data>(data));
 }
 
+template<camp::idx_t LambdaIndex, typename Types>
+struct StatementExecutor<statement::Lambda<LambdaIndex>, Types>
+{
 
-template <camp::idx_t LambdaIndex, typename Types>
-struct StatementExecutor<statement::Lambda<LambdaIndex>, Types> {
-
-  template <typename Data>
-  static RAJA_INLINE RAJA_HOST_DEVICE void exec(Data &&data)
+  template<typename Data>
+  static RAJA_INLINE RAJA_HOST_DEVICE void exec(Data&& data)
   {
 
-    using Data_t = camp::decay<Data>;
+    using Data_t         = camp::decay<Data>;
     using offset_tuple_t = typename Data_t::offset_tuple_t;
-    using param_tuple_t = typename Data_t::param_tuple_t;
+    using param_tuple_t  = typename Data_t::param_tuple_t;
 
     invoke_lambda<LambdaIndex, Types>(
         std::forward<Data>(data),
-        camp::make_idx_seq_t<camp::tuple_size<offset_tuple_t>::value>{},
-        camp::make_idx_seq_t<camp::tuple_size<param_tuple_t>::value>{});
-
+        camp::make_idx_seq_t<camp::tuple_size<offset_tuple_t>::value> {},
+        camp::make_idx_seq_t<camp::tuple_size<param_tuple_t>::value> {});
   }
 };
 
diff --git a/include/RAJA/pattern/kernel/Param.hpp b/include/RAJA/pattern/kernel/Param.hpp
index 8e870ebe15..49e8de5710 100644
--- a/include/RAJA/pattern/kernel/Param.hpp
+++ b/include/RAJA/pattern/kernel/Param.hpp
@@ -31,10 +31,10 @@ namespace RAJA
 namespace internal
 {
 
-struct ParamBase {
-};
+struct ParamBase
+{};
 
-}// end namespace internal
+}  // end namespace internal
 
 namespace statement
 {
@@ -46,13 +46,14 @@ namespace statement
  * This allows run-time values to affect the control logic within
  * RAJA::kernel execution policies.
  */
-template <camp::idx_t ParamId>
-struct Param : public internal::ParamBase {
+template<camp::idx_t ParamId>
+struct Param : public internal::ParamBase
+{
 
   constexpr static camp::idx_t param_idx = ParamId;
 
-  template <typename Data>
-  RAJA_HOST_DEVICE RAJA_INLINE static auto eval(Data const &data)
+  template<typename Data>
+  RAJA_HOST_DEVICE RAJA_INLINE static auto eval(Data const& data)
       -> decltype(camp::get<ParamId>(data.param_tuple))
   {
     return camp::get<ParamId>(data.param_tuple);
diff --git a/include/RAJA/pattern/kernel/Reduce.hpp b/include/RAJA/pattern/kernel/Reduce.hpp
index 4de4922ea3..e2ee22cccb 100644
--- a/include/RAJA/pattern/kernel/Reduce.hpp
+++ b/include/RAJA/pattern/kernel/Reduce.hpp
@@ -38,11 +38,13 @@ namespace statement
  * the enclosed statements on the thread which contains the reduced value.
  *
  */
-template <typename ReducePolicy,
-          template <typename...> class ReduceOperator,
-          typename ParamId,
-          typename... EnclosedStmts>
-struct Reduce : public internal::Statement<camp::nil, EnclosedStmts...> {
+template<typename ReducePolicy,
+         template<typename...>
+         class ReduceOperator,
+         typename ParamId,
+         typename... EnclosedStmts>
+struct Reduce : public internal::Statement<camp::nil, EnclosedStmts...>
+{
 
   static_assert(std::is_base_of<internal::ParamBase, ParamId>::value,
                 "Inappropriate ParamId, ParamId must be of type "
diff --git a/include/RAJA/pattern/kernel/Region.hpp b/include/RAJA/pattern/kernel/Region.hpp
index 82b79ae775..9f15e17963 100644
--- a/include/RAJA/pattern/kernel/Region.hpp
+++ b/include/RAJA/pattern/kernel/Region.hpp
@@ -31,8 +31,8 @@ namespace statement
 {
 
 template<typename RegionPolicy, typename... EnclosedStmts>
-struct Region : public internal::Statement<camp::nil> {
-};
+struct Region : public internal::Statement<camp::nil>
+{};
 
 
 }  // end namespace statement
@@ -40,23 +40,24 @@ struct Region : public internal::Statement<camp::nil> {
 namespace internal
 {
 
-//Statement executor to create a region within kernel
+// Statement executor to create a region within kernel
 
-//Note: RAJA region's lambda must capture by reference otherwise
-//internal function calls are undefined.
+// Note: RAJA region's lambda must capture by reference otherwise
+// internal function calls are undefined.
 template<typename RegionPolicy, typename... EnclosedStmts, typename Types>
-struct StatementExecutor<statement::Region<RegionPolicy, EnclosedStmts...>, Types> {
-
-template<typename Data>
-static RAJA_INLINE void exec(Data &&data)
+struct StatementExecutor<statement::Region<RegionPolicy, EnclosedStmts...>,
+                         Types>
 {
 
-  RAJA::region<RegionPolicy>([&]() {
+  template<typename Data>
+  static RAJA_INLINE void exec(Data&& data)
+  {
+
+    RAJA::region<RegionPolicy>([&]() {
       using data_t = camp::decay<Data>;
       execute_statement_list<camp::list<EnclosedStmts...>, Types>(data_t(data));
     });
-}
-
+  }
 };
 
 
diff --git a/include/RAJA/pattern/kernel/Tile.hpp b/include/RAJA/pattern/kernel/Tile.hpp
index 43f72e0545..a65d6b326d 100644
--- a/include/RAJA/pattern/kernel/Tile.hpp
+++ b/include/RAJA/pattern/kernel/Tile.hpp
@@ -34,14 +34,14 @@
 namespace RAJA
 {
 
-struct TileSize {
+struct TileSize
+{
   const camp::idx_t size;
 
   RAJA_HOST_DEVICE
+
   RAJA_INLINE
-  constexpr TileSize(camp::idx_t size_) : size{size_}
-  {
-  }
+  constexpr TileSize(camp::idx_t size_) : size {size_} {}
 };
 
 namespace statement
@@ -52,11 +52,12 @@ namespace statement
  * A RAJA::kernel statement that implements a tiling (or blocking) loop.
  *
  */
-template <camp::idx_t ArgumentId,
-          typename TilePolicy,
-          typename ExecPolicy,
-          typename... EnclosedStmts>
-struct Tile : public internal::Statement<ExecPolicy, EnclosedStmts...> {
+template<camp::idx_t ArgumentId,
+         typename TilePolicy,
+         typename ExecPolicy,
+         typename... EnclosedStmts>
+struct Tile : public internal::Statement<ExecPolicy, EnclosedStmts...>
+{
   using tile_policy_t = TilePolicy;
   using exec_policy_t = ExecPolicy;
 };
@@ -64,18 +65,18 @@ struct Tile : public internal::Statement<ExecPolicy, EnclosedStmts...> {
 }  // end namespace statement
 
 ///! tag for a tiling loop
-template <camp::idx_t chunk_size_>
-struct tile_fixed {
+template<camp::idx_t chunk_size_>
+struct tile_fixed
+{
   static constexpr camp::idx_t chunk_size = chunk_size_;
 };
 
-template <camp::idx_t ArgumentId>
-struct tile_dynamic {
+template<camp::idx_t ArgumentId>
+struct tile_dynamic
+{
   static constexpr camp::idx_t id = ArgumentId;
 };
 
-
-
 namespace internal
 {
 
@@ -84,14 +85,18 @@ namespace internal
  * Assigns the tile segment to segment ArgumentId
  *
  */
-template <camp::idx_t ArgumentId, typename Data, typename Types, typename... EnclosedStmts>
-struct TileWrapper : public GenericWrapper<Data, Types, EnclosedStmts...> {
+template<camp::idx_t ArgumentId,
+         typename Data,
+         typename Types,
+         typename... EnclosedStmts>
+struct TileWrapper : public GenericWrapper<Data, Types, EnclosedStmts...>
+{
 
   using Base = GenericWrapper<Data, Types, EnclosedStmts...>;
   using Base::Base;
   using privatizer = NestedPrivatizer<TileWrapper>;
 
-  template <typename InSegmentIndexType>
+  template<typename InSegmentIndexType>
   RAJA_INLINE void operator()(InSegmentIndexType si)
   {
     // Assign the tile's segment to the tuple
@@ -102,9 +107,9 @@ struct TileWrapper : public GenericWrapper<Data, Types, EnclosedStmts...> {
   }
 };
 
-
-template <typename Iterable>
-struct IterableTiler {
+template<typename Iterable>
+struct IterableTiler
+{
   using value_type = camp::decay<Iterable>;
 
   struct iterate
@@ -120,46 +125,48 @@ struct IterableTiler {
     const Index_type block_id;
 
   public:
-    using value_type = iterate;
-    using difference_type = camp::idx_t;
-    using pointer = value_type *;
-    using reference = value_type &;
+    using value_type        = iterate;
+    using difference_type   = camp::idx_t;
+    using pointer           = value_type*;
+    using reference         = value_type&;
     using iterator_category = std::random_access_iterator_tag;
 
     RAJA_HOST_DEVICE
+
     RAJA_INLINE
-    constexpr iterator(IterableTiler const &itiler_, Index_type block_id_)
-        : itiler{itiler_}, block_id{block_id_}
-    {
-    }
+    constexpr iterator(IterableTiler const& itiler_, Index_type block_id_)
+        : itiler {itiler_},
+          block_id {block_id_}
+    {}
 
     RAJA_HOST_DEVICE
+
     RAJA_INLINE
     value_type operator*()
     {
       auto start = block_id * itiler.block_size;
-      return iterate{itiler.it.slice(start, itiler.block_size), block_id};
+      return iterate {itiler.it.slice(start, itiler.block_size), block_id};
     }
 
     RAJA_HOST_DEVICE
-    RAJA_INLINE difference_type operator-(const iterator &rhs) const
+    RAJA_INLINE difference_type operator-(const iterator& rhs) const
     {
       return static_cast<difference_type>(block_id) -
              static_cast<difference_type>(rhs.block_id);
     }
 
     RAJA_HOST_DEVICE
-    RAJA_INLINE iterator operator-(const difference_type &rhs) const
+    RAJA_INLINE iterator operator-(const difference_type& rhs) const
     {
       return iterator(itiler, block_id - rhs);
     }
 
     RAJA_HOST_DEVICE
-    RAJA_INLINE iterator operator+(const difference_type &rhs) const
+    RAJA_INLINE iterator operator+(const difference_type& rhs) const
     {
-      return iterator(itiler,
-                      block_id + rhs >= itiler.num_blocks ? itiler.num_blocks
-                                                          : block_id + rhs);
+      return iterator(itiler, block_id + rhs >= itiler.num_blocks
+                                  ? itiler.num_blocks
+                                  : block_id + rhs);
     }
 
     RAJA_HOST_DEVICE
@@ -169,39 +176,44 @@ struct IterableTiler {
     }
 
     RAJA_HOST_DEVICE
-    RAJA_INLINE bool operator!=(const iterator &rhs) const
+    RAJA_INLINE bool operator!=(const iterator& rhs) const
     {
       return block_id != rhs.block_id;
     }
 
     RAJA_HOST_DEVICE
-    RAJA_INLINE bool operator<(const iterator &rhs) const
+    RAJA_INLINE bool operator<(const iterator& rhs) const
     {
       return block_id < rhs.block_id;
     }
   };
 
   RAJA_HOST_DEVICE
+
   RAJA_INLINE
-  IterableTiler(const Iterable &it_, camp::idx_t block_size_)
-      : it{it_}, block_size{block_size_}
+  IterableTiler(const Iterable& it_, camp::idx_t block_size_)
+      : it {it_},
+        block_size {block_size_}
   {
     using std::begin;
     using std::distance;
     using std::end;
-    dist = it.end() - it.begin();  // distance(begin(it), end(it));
+    dist       = it.end() - it.begin();  // distance(begin(it), end(it));
     num_blocks = dist / block_size;
     // if (dist % block_size) num_blocks += 1;
-    if (dist - num_blocks * block_size > 0) {
+    if (dist - num_blocks * block_size > 0)
+    {
       num_blocks += 1;
     }
   }
 
   RAJA_HOST_DEVICE
+
   RAJA_INLINE
   iterator begin() const { return iterator(*this, 0); }
 
   RAJA_HOST_DEVICE
+
   RAJA_INLINE
   iterator end() const { return iterator(*this, num_blocks); }
 
@@ -216,19 +228,21 @@ struct IterableTiler {
  *
  *
  */
-template <camp::idx_t ArgumentId,
-          camp::idx_t ChunkSize,
-          typename EPol,
-          typename... EnclosedStmts,
-          typename Types>
+template<camp::idx_t ArgumentId,
+         camp::idx_t ChunkSize,
+         typename EPol,
+         typename... EnclosedStmts,
+         typename Types>
 struct StatementExecutor<
-    statement::Tile<ArgumentId, tile_fixed<ChunkSize>, EPol, EnclosedStmts...>, Types> {
+    statement::Tile<ArgumentId, tile_fixed<ChunkSize>, EPol, EnclosedStmts...>,
+    Types>
+{
 
-  template <typename Data>
-  static RAJA_INLINE void exec(Data &data)
+  template<typename Data>
+  static RAJA_INLINE void exec(Data& data)
   {
     // Get the segment we are going to tile
-    auto const &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto const& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Get the tiling policies chunk size
     auto chunk_size = tile_fixed<ChunkSize>::chunk_size;
@@ -238,12 +252,12 @@ struct StatementExecutor<
     IterableTiler<decltype(segment)> tiled_iterable(segment, chunk_size);
 
     // Wrap in case forall_impl needs to thread_privatize
-    TileWrapper<ArgumentId, Data, Types,
-                EnclosedStmts...> tile_wrapper(data);
+    TileWrapper<ArgumentId, Data, Types, EnclosedStmts...> tile_wrapper(data);
 
     // Loop over tiles, executing enclosed statement list
     auto r = resources::get_resource<EPol>::type::get_default();
-    forall_impl(r, EPol{}, tiled_iterable, tile_wrapper, RAJA::expt::get_empty_forall_param_pack());
+    forall_impl(r, EPol {}, tiled_iterable, tile_wrapper,
+                RAJA::expt::get_empty_forall_param_pack());
 
     // Set range back to original values
     camp::get<ArgumentId>(data.segment_tuple) = tiled_iterable.it;
@@ -251,34 +265,38 @@ struct StatementExecutor<
 };
 
 template<camp::idx_t ArgumentId,
-  typename EPol,
-  typename... EnclosedStmts,
-  typename Types>
+         typename EPol,
+         typename... EnclosedStmts,
+         typename Types>
 struct StatementExecutor<
-    statement::Tile<ArgumentId, tile_dynamic<ArgumentId>, EPol, EnclosedStmts...>, Types> {
+    statement::
+        Tile<ArgumentId, tile_dynamic<ArgumentId>, EPol, EnclosedStmts...>,
+    Types>
+{
 
-  template <typename Data>
-  static RAJA_INLINE void exec(Data &data)
+  template<typename Data>
+  static RAJA_INLINE void exec(Data& data)
   {
     // Get the segment we are going to tile
-    auto const &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto const& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Get the tiling policies chunk size
     auto chunk_size = camp::get<ArgumentId>(data.param_tuple);
-    static_assert(camp::concepts::metalib::is_same<TileSize, decltype(chunk_size)>::value,
-                  "Extracted parameter must be of type TileSize.");
+    static_assert(
+        camp::concepts::metalib::is_same<TileSize, decltype(chunk_size)>::value,
+        "Extracted parameter must be of type TileSize.");
 
     // Create a tile iterator
     IterableTiler<decltype(segment)> tiled_iterable(segment, chunk_size.size);
 
     // Wrap in case forall_impl needs to thread_privatize
-    TileWrapper<ArgumentId, Data, Types,
-                EnclosedStmts...> tile_wrapper(data);
+    TileWrapper<ArgumentId, Data, Types, EnclosedStmts...> tile_wrapper(data);
 
     // Loop over tiles, executing enclosed statement list
     auto r = resources::get_resource<EPol>::type::get_default();
-    forall_impl(r, EPol{}, tiled_iterable, tile_wrapper, RAJA::expt::get_empty_forall_param_pack());
-    
+    forall_impl(r, EPol {}, tiled_iterable, tile_wrapper,
+                RAJA::expt::get_empty_forall_param_pack());
+
     // Set range back to original values
     camp::get<ArgumentId>(data.segment_tuple) = tiled_iterable.it;
   }
diff --git a/include/RAJA/pattern/kernel/TileTCount.hpp b/include/RAJA/pattern/kernel/TileTCount.hpp
index 2653e992c7..293d1730a2 100644
--- a/include/RAJA/pattern/kernel/TileTCount.hpp
+++ b/include/RAJA/pattern/kernel/TileTCount.hpp
@@ -42,12 +42,13 @@ namespace statement
  * Assigns the tile index to param ParamId
  *
  */
-template <camp::idx_t ArgumentId,
-          typename ParamId,
-          typename TilePolicy,
-          typename ExecPolicy,
-          typename... EnclosedStmts>
-struct TileTCount : public internal::Statement<ExecPolicy, EnclosedStmts...> {
+template<camp::idx_t ArgumentId,
+         typename ParamId,
+         typename TilePolicy,
+         typename ExecPolicy,
+         typename... EnclosedStmts>
+struct TileTCount : public internal::Statement<ExecPolicy, EnclosedStmts...>
+{
   static_assert(std::is_base_of<internal::ParamBase, ParamId>::value,
                 "Inappropriate ParamId, ParamId must be of type "
                 "RAJA::Statement::Param< # >");
@@ -66,50 +67,54 @@ namespace internal
  * Assigns the tile segment to segment ArgumentId
  * Assigns the tile index to param ParamId
  */
-template <camp::idx_t ArgumentId, typename ParamId, typename Data, typename Types,
-          typename... EnclosedStmts>
-struct TileTCountWrapper : public GenericWrapper<Data, Types, EnclosedStmts...> {
+template<camp::idx_t ArgumentId,
+         typename ParamId,
+         typename Data,
+         typename Types,
+         typename... EnclosedStmts>
+struct TileTCountWrapper : public GenericWrapper<Data, Types, EnclosedStmts...>
+{
 
   using Base = GenericWrapper<Data, Types, EnclosedStmts...>;
   using Base::Base;
   using privatizer = NestedPrivatizer<TileTCountWrapper>;
 
-  template <typename InSegmentIndexType>
+  template<typename InSegmentIndexType>
   RAJA_INLINE void operator()(InSegmentIndexType si)
   {
     // Assign the tile's segment to the tuple
     camp::get<ArgumentId>(Base::data.segment_tuple) = si.s;
-    
+
     // Assign the tile's index
     Base::data.template assign_param<ParamId>(si.i);
-    
+
     // Execute enclosed statements
     Base::exec();
   }
 };
 
-
-
 /*!
  * A generic RAJA::kernel forall_impl executor for statement::TileTCount
  *
  *
  */
-template <camp::idx_t ArgumentId,
-          typename ParamId,
-          typename TPol,
-          typename EPol,
-          typename... EnclosedStmts,
-          typename Types>
+template<camp::idx_t ArgumentId,
+         typename ParamId,
+         typename TPol,
+         typename EPol,
+         typename... EnclosedStmts,
+         typename Types>
 struct StatementExecutor<
-    statement::TileTCount<ArgumentId, ParamId, TPol, EPol, EnclosedStmts...>, Types> {
+    statement::TileTCount<ArgumentId, ParamId, TPol, EPol, EnclosedStmts...>,
+    Types>
+{
 
 
-  template <typename Data>
-  static RAJA_INLINE void exec(Data &data)
+  template<typename Data>
+  static RAJA_INLINE void exec(Data& data)
   {
     // Get the segment we are going to tile
-    auto const &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto const& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Get the tiling policies chunk size
     auto chunk_size = TPol::chunk_size;
@@ -119,12 +124,13 @@ struct StatementExecutor<
     IterableTiler<decltype(segment)> tiled_iterable(segment, chunk_size);
 
     // Wrap in case forall_impl needs to thread_privatize
-    TileTCountWrapper<ArgumentId, ParamId, Data, Types,
-                      EnclosedStmts...> tile_wrapper(data);
+    TileTCountWrapper<ArgumentId, ParamId, Data, Types, EnclosedStmts...>
+        tile_wrapper(data);
 
     // Loop over tiles, executing enclosed statement list
     auto r = resources::get_resource<EPol>::type::get_default();
-    forall_impl(r, EPol{}, tiled_iterable, tile_wrapper, RAJA::expt::get_empty_forall_param_pack());
+    forall_impl(r, EPol {}, tiled_iterable, tile_wrapper,
+                RAJA::expt::get_empty_forall_param_pack());
 
     // Set range back to original values
     camp::get<ArgumentId>(data.segment_tuple) = tiled_iterable.it;
diff --git a/include/RAJA/pattern/kernel/internal/LoopData.hpp b/include/RAJA/pattern/kernel/internal/LoopData.hpp
index 9667a55538..5b57a01070 100644
--- a/include/RAJA/pattern/kernel/internal/LoopData.hpp
+++ b/include/RAJA/pattern/kernel/internal/LoopData.hpp
@@ -40,73 +40,71 @@ namespace internal
 {
 
 
+// Universal base of all For wrappers for type traits
+struct ForList
+{};
 
+struct ForBase
+{};
 
-  // Universal base of all For wrappers for type traits
-  struct ForList {
-  };
-  struct ForBase {
-  };
-  struct CollapseBase {
-  };
-  template <camp::idx_t ArgumentId, typename Policy>
-  struct ForTraitBase : public ForBase {
-    constexpr static camp::idx_t index_val = ArgumentId;
-    using index = camp::num<ArgumentId>;
-    using index_type = camp::nil;  // default to invalid type
-    using policy_type = Policy;
-    using type = ForTraitBase;  // make camp::value compatible
-  };
-
-
+struct CollapseBase
+{};
 
+template<camp::idx_t ArgumentId, typename Policy>
+struct ForTraitBase : public ForBase
+{
+  constexpr static camp::idx_t index_val = ArgumentId;
+  using index                            = camp::num<ArgumentId>;
+  using index_type  = camp::nil;  // default to invalid type
+  using policy_type = Policy;
+  using type        = ForTraitBase;  // make camp::value compatible
+};
 
-template <typename Iterator>
-struct iterable_difftype_getter {
+template<typename Iterator>
+struct iterable_difftype_getter
+{
   using type = typename std::iterator_traits<
       typename Iterator::iterator>::difference_type;
 };
 
-template <typename Segments>
+template<typename Segments>
 using difftype_list_from_segments =
     typename camp::transform<iterable_difftype_getter, Segments>::type;
 
 
-template <typename Segments>
+template<typename Segments>
 using difftype_tuple_from_segments =
     typename camp::apply_l<camp::lambda<camp::tuple>,
                            difftype_list_from_segments<Segments>>::type;
 
-
-template <typename Iterator>
-struct iterable_value_type_getter {
+template<typename Iterator>
+struct iterable_value_type_getter
+{
   using type =
       typename std::iterator_traits<typename Iterator::iterator>::value_type;
 };
 
-template <typename Segments>
+template<typename Segments>
 using value_type_list_from_segments =
     typename camp::transform<iterable_value_type_getter, Segments>::type;
 
 
-template <typename Segments>
+template<typename Segments>
 using index_tuple_from_segments =
     typename camp::apply_l<camp::lambda<camp::tuple>,
                            value_type_list_from_segments<Segments>>::type;
 
-template <typename Segments>
+template<typename Segments>
 using index_types_from_segments =
     typename camp::apply_l<camp::lambda<camp::list>,
                            value_type_list_from_segments<Segments>>::type;
 
-
-
-
-template <typename SegmentTuple,
-          typename ParamTuple,
-          typename Resource,
-          typename... Bodies>
-struct LoopData {
+template<typename SegmentTuple,
+         typename ParamTuple,
+         typename Resource,
+         typename... Bodies>
+struct LoopData
+{
 
   using Self = LoopData<SegmentTuple, ParamTuple, Resource, Bodies...>;
 
@@ -138,105 +136,97 @@ struct LoopData {
   using vector_sizes_t = tuple_of_n<int, camp::tuple_size<SegmentTuple>::value>;
   vector_sizes_t vector_sizes;
 
-  RAJA_INLINE RAJA_HOST_DEVICE constexpr
-  LoopData(SegmentTuple const &s, ParamTuple const &p, Resource r, Bodies const &... b)
-      : segment_tuple(s), param_tuple(p), res(r), bodies(b...)
-  {
-  }
-  constexpr LoopData(LoopData const &) = default;
-  constexpr LoopData(LoopData &&) = default;
-
-  template <camp::idx_t Idx, typename IndexT>
-  RAJA_HOST_DEVICE RAJA_INLINE void assign_offset(IndexT const &i)
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr LoopData(SegmentTuple const& s,
+                                                  ParamTuple const& p,
+                                                  Resource r,
+                                                  Bodies const&... b)
+      : segment_tuple(s),
+        param_tuple(p),
+        res(r),
+        bodies(b...)
+  {}
+
+  constexpr LoopData(LoopData const&) = default;
+  constexpr LoopData(LoopData&&)      = default;
+
+  template<camp::idx_t Idx, typename IndexT>
+  RAJA_HOST_DEVICE RAJA_INLINE void assign_offset(IndexT const& i)
   {
     camp::get<Idx>(offset_tuple) = i;
   }
 
-  template <typename ParamId, typename IndexT>
-  RAJA_HOST_DEVICE RAJA_INLINE void assign_param(IndexT const &i)
+  template<typename ParamId, typename IndexT>
+  RAJA_HOST_DEVICE RAJA_INLINE void assign_param(IndexT const& i)
   {
-    using param_t = camp::at_v<typename param_tuple_t::TList, ParamId::param_idx>;
+    using param_t =
+        camp::at_v<typename param_tuple_t::TList, ParamId::param_idx>;
     camp::get<ParamId::param_idx>(param_tuple) = param_t(i);
   }
 
-  template <typename ParamId>
-  RAJA_HOST_DEVICE RAJA_INLINE
-  auto get_param() ->
-    camp::at_v<typename param_tuple_t::TList, ParamId::param_idx>
+  template<typename ParamId>
+  RAJA_HOST_DEVICE RAJA_INLINE auto get_param()
+      -> camp::at_v<typename param_tuple_t::TList, ParamId::param_idx>
   {
     return camp::get<ParamId::param_idx>(param_tuple);
   }
 
-  RAJA_HOST_DEVICE RAJA_INLINE
-  Resource get_resource()
-  {
-    return res;
-  }
-
-
+  RAJA_HOST_DEVICE RAJA_INLINE Resource get_resource() { return res; }
 };
 
+template<camp::idx_t ArgumentId, typename Data>
+using segment_diff_type = typename std::iterator_traits<
+    typename camp::at_v<typename Data::segment_tuple_t::TList,
+                        ArgumentId>::iterator>::difference_type;
 
-
-
-template <camp::idx_t ArgumentId, typename Data>
-using segment_diff_type =
-    typename std::iterator_traits<
-        typename camp::at_v<typename Data::segment_tuple_t::TList,
-                            ArgumentId>::iterator>::difference_type;
-
-
-
-
-template <camp::idx_t ArgumentId, typename Data>
-RAJA_INLINE RAJA_HOST_DEVICE auto segment_length(Data const &data) ->
-  segment_diff_type<ArgumentId, Data>
+template<camp::idx_t ArgumentId, typename Data>
+RAJA_INLINE RAJA_HOST_DEVICE auto segment_length(Data const& data)
+    -> segment_diff_type<ArgumentId, Data>
 {
   return camp::get<ArgumentId>(data.segment_tuple).end() -
          camp::get<ArgumentId>(data.segment_tuple).begin();
 }
 
-
-
-
-template <typename Data, typename Types, typename... EnclosedStmts>
-struct GenericWrapper : GenericWrapperBase {
+template<typename Data, typename Types, typename... EnclosedStmts>
+struct GenericWrapper : GenericWrapperBase
+{
   using data_t = camp::decay<Data>;
 
-  data_t &data;
+  data_t& data;
 
   RAJA_INLINE
-  constexpr explicit GenericWrapper(data_t &d) : data{d} {}
+  constexpr explicit GenericWrapper(data_t& d) : data {d} {}
 
   RAJA_INLINE
-  void exec() { execute_statement_list<camp::list<EnclosedStmts...>, Types>(data); }
+  void exec()
+  {
+    execute_statement_list<camp::list<EnclosedStmts...>, Types>(data);
+  }
 };
 
-
 /*!
  * Convenience object used to create a thread-private LoopData object.
  */
-template <typename T>
-struct NestedPrivatizer {
-  using data_t = typename T::data_t;
-  using value_type = camp::decay<T>;
-  using reference_type = value_type &;
+template<typename T>
+struct NestedPrivatizer
+{
+  using data_t         = typename T::data_t;
+  using value_type     = camp::decay<T>;
+  using reference_type = value_type&;
 
   data_t privatized_data;
   value_type privatized_wrapper;
 
   RAJA_INLINE
-  constexpr NestedPrivatizer(const T &o)
-      : privatized_data{o.data}, privatized_wrapper(privatized_data)
-  {
-  }
+  constexpr NestedPrivatizer(const T& o)
+      : privatized_data {o.data},
+        privatized_wrapper(privatized_data)
+  {}
 
   RAJA_INLINE
   reference_type get_priv() { return privatized_wrapper; }
 };
 
 
-
 }  // end namespace internal
 }  // end namespace RAJA
 
diff --git a/include/RAJA/pattern/kernel/internal/LoopTypes.hpp b/include/RAJA/pattern/kernel/internal/LoopTypes.hpp
index 7f77df4214..3312e8a76a 100644
--- a/include/RAJA/pattern/kernel/internal/LoopTypes.hpp
+++ b/include/RAJA/pattern/kernel/internal/LoopTypes.hpp
@@ -22,70 +22,75 @@
 #include "RAJA/pattern/kernel/internal/Template.hpp"
 #include "camp/camp.hpp"
 
-
 namespace RAJA
 {
 namespace internal
 {
 
 
-template <typename SegmentTypes,
-          typename OffsetTypes>
+template<typename SegmentTypes, typename OffsetTypes>
 struct LoopTypes;
 
-template <typename ... SegmentTypes,
-          typename ... OffsetTypes>
-struct LoopTypes<camp::list<SegmentTypes...>, camp::list<OffsetTypes...>> {
+template<typename... SegmentTypes, typename... OffsetTypes>
+struct LoopTypes<camp::list<SegmentTypes...>, camp::list<OffsetTypes...>>
+{
 
-  using Self = LoopTypes<camp::list<SegmentTypes...>, camp::list<OffsetTypes...>>;
+  using Self =
+      LoopTypes<camp::list<SegmentTypes...>, camp::list<OffsetTypes...>>;
 
   static constexpr size_t s_num_segments = sizeof...(SegmentTypes);
 
   // This ensures that you don't double-loop over a segment within the same
   // loop nesting
   static_assert(s_num_segments == sizeof...(OffsetTypes),
-      "Number of segments and offsets must match");
+                "Number of segments and offsets must match");
 
   using segment_types_t = camp::list<SegmentTypes...>;
-  using offset_types_t = camp::list<OffsetTypes...>;
+  using offset_types_t  = camp::list<OffsetTypes...>;
 };
 
-
 template<typename Data>
-using makeInitialLoopTypes =
-    LoopTypes<list_of_n<void, camp::tuple_size<typename Data::segment_tuple_t>::value>,
-              list_of_n<void, camp::tuple_size<typename Data::segment_tuple_t>::value>>;
+using makeInitialLoopTypes = LoopTypes<
+    list_of_n<void, camp::tuple_size<typename Data::segment_tuple_t>::value>,
+    list_of_n<void, camp::tuple_size<typename Data::segment_tuple_t>::value>>;
 
 
 template<typename Types, camp::idx_t Segment, typename T, typename Seq>
 struct SetSegmentTypeHelper;
 
-template<typename Types,
-         camp::idx_t Segment,
-         typename T,
-         camp::idx_t ... SEQ>
+template<typename Types, camp::idx_t Segment, typename T, camp::idx_t... SEQ>
 struct SetSegmentTypeHelper<Types, Segment, T, camp::idx_seq<SEQ...>>
 {
-    using segment_list = typename Types::segment_types_t;
-    using offset_list = typename Types::offset_types_t;
-
-    static_assert(std::is_same<camp::at_v<segment_list, Segment>, void>::value,
-        "Segment was already assigned: Probably looping over same segment in loop nest");
-
-    using type = LoopTypes<
-        camp::list<typename std::conditional<SEQ == Segment, T, camp::at_v<segment_list, SEQ>>::type...>,
-        camp::list<typename std::conditional<SEQ == Segment, T, camp::at_v<segment_list, SEQ>>::type...>>;
-
+  using segment_list = typename Types::segment_types_t;
+  using offset_list  = typename Types::offset_types_t;
+
+  static_assert(std::is_same<camp::at_v<segment_list, Segment>, void>::value,
+                "Segment was already assigned: Probably looping over same "
+                "segment in loop nest");
+
+  using type = LoopTypes<
+      camp::list<
+          typename std::conditional<SEQ == Segment,
+                                    T,
+                                    camp::at_v<segment_list, SEQ>>::type...>,
+      camp::list<
+          typename std::conditional<SEQ == Segment,
+                                    T,
+                                    camp::at_v<segment_list, SEQ>>::type...>>;
 };
 
-
 template<typename Types, camp::idx_t Segment, typename T>
-using setSegmentType =
-    typename SetSegmentTypeHelper<Types, Segment, T, camp::make_idx_seq_t<Types::s_num_segments>>::type;
+using setSegmentType = typename SetSegmentTypeHelper<
+    Types,
+    Segment,
+    T,
+    camp::make_idx_seq_t<Types::s_num_segments>>::type;
 
 template<typename Types, camp::idx_t Segment, typename Data>
-using setSegmentTypeFromData =
-    setSegmentType<Types, Segment, camp::at_v<typename camp::decay<Data>::index_types_t, Segment>>;
+using setSegmentTypeFromData = setSegmentType<
+    Types,
+    Segment,
+    camp::at_v<typename camp::decay<Data>::index_types_t, Segment>>;
 
 
 }  // end namespace internal
diff --git a/include/RAJA/pattern/kernel/internal/Statement.hpp b/include/RAJA/pattern/kernel/internal/Statement.hpp
index 48ca828a68..6748948ce0 100644
--- a/include/RAJA/pattern/kernel/internal/Statement.hpp
+++ b/include/RAJA/pattern/kernel/internal/Statement.hpp
@@ -28,25 +28,24 @@ namespace internal
 {
 
 
-
-template <typename ExecPolicy, typename... EnclosedStmts>
-struct Statement {
-  static_assert(std::is_same<ExecPolicy, camp::nil>::value || sizeof...(EnclosedStmts) > 0,
-      "Executable statement with no enclosed statements, this is almost certainly a bug");
+template<typename ExecPolicy, typename... EnclosedStmts>
+struct Statement
+{
+  static_assert(std::is_same<ExecPolicy, camp::nil>::value ||
+                    sizeof...(EnclosedStmts) > 0,
+                "Executable statement with no enclosed statements, this is "
+                "almost certainly a bug");
   Statement() = delete;
 
   using enclosed_statements_t = StatementList<EnclosedStmts...>;
-  using execution_policy_t = ExecPolicy;
+  using execution_policy_t    = ExecPolicy;
 };
 
 
-
-
-template <typename Policy, typename Types>
+template<typename Policy, typename Types>
 struct StatementExecutor;
 
 
-
 }  // end namespace internal
 }  // end namespace RAJA
 
diff --git a/include/RAJA/pattern/kernel/internal/StatementList.hpp b/include/RAJA/pattern/kernel/internal/StatementList.hpp
index 5c0d71afb4..c9e005ca1e 100644
--- a/include/RAJA/pattern/kernel/internal/StatementList.hpp
+++ b/include/RAJA/pattern/kernel/internal/StatementList.hpp
@@ -31,27 +31,26 @@ namespace internal
 
 
 // forward decl
-template <typename Policy, typename Types>
+template<typename Policy, typename Types>
 struct StatementExecutor;
 
 
-
-
-template <typename... Stmts>
+template<typename... Stmts>
 using StatementList = camp::list<Stmts...>;
 
 
-template <camp::idx_t idx, camp::idx_t N, typename StmtList, typename Types>
+template<camp::idx_t idx, camp::idx_t N, typename StmtList, typename Types>
 struct StatementListExecutor;
 
+template<camp::idx_t statement_index,
+         camp::idx_t num_statements,
+         typename StmtList,
+         typename Types>
+struct StatementListExecutor
+{
 
-template <camp::idx_t statement_index,
-          camp::idx_t num_statements,
-          typename StmtList, typename Types>
-struct StatementListExecutor {
-
-  template <typename Data>
-  static RAJA_INLINE void exec(Data &&data)
+  template<typename Data>
+  static RAJA_INLINE void exec(Data&& data)
   {
 
     // Get the statement we're going to execute
@@ -61,35 +60,32 @@ struct StatementListExecutor {
     StatementExecutor<statement, Types>::exec(std::forward<Data>(data));
 
     // call our next statement
-    StatementListExecutor<statement_index + 1, num_statements, StmtList, Types>::exec(
-        std::forward<Data>(data));
+    StatementListExecutor<statement_index + 1, num_statements, StmtList,
+                          Types>::exec(std::forward<Data>(data));
   }
 };
 
-
 /*
  * termination case, a NOP.
  */
 
-template <camp::idx_t num_statements, typename StmtList, typename Types>
-struct StatementListExecutor<num_statements, num_statements, StmtList, Types> {
+template<camp::idx_t num_statements, typename StmtList, typename Types>
+struct StatementListExecutor<num_statements, num_statements, StmtList, Types>
+{
 
-  template <typename Data>
-  static RAJA_INLINE void exec(Data &&)
-  {
-  }
+  template<typename Data>
+  static RAJA_INLINE void exec(Data&&)
+  {}
 };
 
-
-template <typename StmtList, typename Types, typename Data>
-RAJA_INLINE void execute_statement_list(Data &&data)
+template<typename StmtList, typename Types, typename Data>
+RAJA_INLINE void execute_statement_list(Data&& data)
 {
   StatementListExecutor<0, camp::size<StmtList>::value, StmtList, Types>::exec(
       std::forward<Data>(data));
 }
 
 
-
 }  // end namespace internal
 }  // end namespace RAJA
 
diff --git a/include/RAJA/pattern/kernel/internal/Template.hpp b/include/RAJA/pattern/kernel/internal/Template.hpp
index c750b95986..59ba6b8fbf 100644
--- a/include/RAJA/pattern/kernel/internal/Template.hpp
+++ b/include/RAJA/pattern/kernel/internal/Template.hpp
@@ -20,7 +20,6 @@
 
 #include "camp/camp.hpp"
 
-
 namespace RAJA
 {
 namespace internal
@@ -30,32 +29,32 @@ namespace detail
 {
 // Helper class to convert a camp::idx_t into some type T
 // used in template expansion in ListOfNHelper
-template <typename T, camp::idx_t>
+template<typename T, camp::idx_t>
 struct SeqToType
 {
   using type = T;
 };
 
-template <typename T, typename SEQ>
+template<typename T, typename SEQ>
 struct ListOfNHelper;
 
-template <typename T, camp::idx_t ... SEQ>
-struct ListOfNHelper<T, camp::idx_seq<SEQ...> >
+template<typename T, camp::idx_t... SEQ>
+struct ListOfNHelper<T, camp::idx_seq<SEQ...>>
 {
   using type = camp::list<typename SeqToType<T, SEQ>::type...>;
 };
 
 
-template <typename T, typename SEQ>
+template<typename T, typename SEQ>
 struct TupleOfNHelper;
 
-template <typename T, camp::idx_t ... SEQ>
-struct TupleOfNHelper<T, camp::idx_seq<SEQ...> >
+template<typename T, camp::idx_t... SEQ>
+struct TupleOfNHelper<T, camp::idx_seq<SEQ...>>
 {
   using type = camp::tuple<typename SeqToType<T, SEQ>::type...>;
 };
 
-} // namespace detail
+}  // namespace detail
 
 /*
  *  This creates a camp::list with N types, each one being T.
@@ -63,8 +62,9 @@ struct TupleOfNHelper<T, camp::idx_seq<SEQ...> >
  *  That is, list_of_n<T, 4>  ==  camp::list<T, T, T, T>
  *
  */
-template <typename T, camp::idx_t N>
-using list_of_n = typename detail::ListOfNHelper<T, camp::make_idx_seq_t<N>>::type;
+template<typename T, camp::idx_t N>
+using list_of_n =
+    typename detail::ListOfNHelper<T, camp::make_idx_seq_t<N>>::type;
 
 
 /*
@@ -73,9 +73,9 @@ using list_of_n = typename detail::ListOfNHelper<T, camp::make_idx_seq_t<N>>::ty
  *  That is, tuple_of_n<T, 4>  ==  camp::tuple<T, T, T, T>
  *
  */
-template <typename T, camp::idx_t N>
-using tuple_of_n = typename detail::TupleOfNHelper<T, camp::make_idx_seq_t<N>>::type;
-
+template<typename T, camp::idx_t N>
+using tuple_of_n =
+    typename detail::TupleOfNHelper<T, camp::make_idx_seq_t<N>>::type;
 
 
 }  // end namespace internal
diff --git a/include/RAJA/pattern/launch/launch_core.hpp b/include/RAJA/pattern/launch/launch_core.hpp
index f1d70aeacb..a05629dcd0 100644
--- a/include/RAJA/pattern/launch/launch_core.hpp
+++ b/include/RAJA/pattern/launch/launch_core.hpp
@@ -28,8 +28,8 @@
 #include "camp/concepts.hpp"
 #include "camp/tuple.hpp"
 
-//Odd dependecy with atomics is breaking CI builds
-//#include "RAJA/util/View.hpp"
+// Odd dependency with atomics is breaking CI builds
+// #include "RAJA/util/View.hpp"
 
 #if defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE) && !defined(RAJA_ENABLE_SYCL)
 #define RAJA_TEAM_SHARED __shared__
@@ -41,95 +41,114 @@ namespace RAJA
 {
 
 // GPU or CPU threads available
-//strongly type the ExecPlace (guards agaist errors)
-enum struct ExecPlace : int { HOST, DEVICE, NUM_PLACES };
-
-struct null_launch_t {
+// strongly type the ExecPlace (guards against errors)
+enum struct ExecPlace : int
+{
+  HOST,
+  DEVICE,
+  NUM_PLACES
 };
 
+struct null_launch_t
+{};
+
 // Support for host, and device
-template <typename HOST_POLICY
+template<typename HOST_POLICY
 #if defined(RAJA_GPU_ACTIVE)
-          ,
-          typename DEVICE_POLICY = HOST_POLICY
+         ,
+         typename DEVICE_POLICY = HOST_POLICY
 #endif
-          >
+         >
 
-struct LoopPolicy {
+struct LoopPolicy
+{
   using host_policy_t = HOST_POLICY;
 #if defined(RAJA_GPU_ACTIVE)
   using device_policy_t = DEVICE_POLICY;
 #endif
 };
 
-template <typename HOST_POLICY
+template<typename HOST_POLICY
 #if defined(RAJA_GPU_ACTIVE)
-          ,
-          typename DEVICE_POLICY = HOST_POLICY
+         ,
+         typename DEVICE_POLICY = HOST_POLICY
 #endif
-          >
-struct LaunchPolicy {
+         >
+struct LaunchPolicy
+{
   using host_policy_t = HOST_POLICY;
 #if defined(RAJA_GPU_ACTIVE)
   using device_policy_t = DEVICE_POLICY;
 #endif
 };
 
-
-struct Teams {
+struct Teams
+{
   int value[3];
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
-  constexpr Teams() : value{1, 1, 1} {}
+  constexpr Teams() : value {1, 1, 1} {}
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
-  constexpr Teams(int i) : value{i, 1, 1} {}
+  constexpr Teams(int i) : value {i, 1, 1} {}
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
-  constexpr Teams(int i, int j) : value{i, j, 1} {}
+  constexpr Teams(int i, int j) : value {i, j, 1} {}
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
-  constexpr Teams(int i, int j, int k) : value{i, j, k} {}
+  constexpr Teams(int i, int j, int k) : value {i, j, k} {}
 };
 
-struct Threads {
+struct Threads
+{
   int value[3];
 
   RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr Threads() : value{1, 1, 1} {}
 
+  RAJA_HOST_DEVICE
+  constexpr Threads() : value {1, 1, 1} {}
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
-  constexpr Threads(int i) : value{i, 1, 1} {}
+  constexpr Threads(int i) : value {i, 1, 1} {}
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
-  constexpr Threads(int i, int j) : value{i, j, 1} {}
+  constexpr Threads(int i, int j) : value {i, j, 1} {}
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
-  constexpr Threads(int i, int j, int k) : value{i, j, k} {}
+  constexpr Threads(int i, int j, int k) : value {i, j, k} {}
 };
 
-struct Lanes {
+struct Lanes
+{
   int value;
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
   constexpr Lanes() : value(0) {}
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
   constexpr Lanes(int i) : value(i) {}
 };
 
-struct LaunchParams {
+struct LaunchParams
+{
 public:
   Teams teams;
   Threads threads;
@@ -138,67 +157,74 @@ struct LaunchParams {
   RAJA_INLINE
   LaunchParams() = default;
 
-  LaunchParams(Teams in_teams, Threads in_threads, size_t in_shared_mem_size = 0)
-    : teams(in_teams), threads(in_threads), shared_mem_size(in_shared_mem_size) {};
+  LaunchParams(Teams in_teams,
+               Threads in_threads,
+               size_t in_shared_mem_size = 0)
+      : teams(in_teams),
+        threads(in_threads),
+        shared_mem_size(in_shared_mem_size) {};
 
 private:
   RAJA_HOST_DEVICE
+
   RAJA_INLINE
-  Teams apply(Teams const &a) { return (teams = a); }
+  Teams apply(Teams const& a) { return (teams = a); }
 
   RAJA_HOST_DEVICE
+
   RAJA_INLINE
-  Threads apply(Threads const &a) { return (threads = a); }
+  Threads apply(Threads const& a) { return (threads = a); }
 };
 
 class LaunchContext
 {
 public:
-
-  //Bump style allocator used to
-  //get memory from the pool
+  // Bump style allocator used to
+  // get memory from the pool
   size_t shared_mem_offset;
 
-  void *shared_mem_ptr;
+  void* shared_mem_ptr;
 
 #if defined(RAJA_ENABLE_SYCL)
-  mutable cl::sycl::nd_item<3> *itm;
+  mutable cl::sycl::nd_item<3>* itm;
 #endif
 
   RAJA_HOST_DEVICE LaunchContext()
-    : shared_mem_offset(0), shared_mem_ptr(nullptr)
-  {
-  }
+      : shared_mem_offset(0),
+        shared_mem_ptr(nullptr)
+  {}
 
-  //TODO handle alignment
+  // TODO handle alignment
   template<typename T>
   RAJA_HOST_DEVICE T* getSharedMemory(size_t bytes)
   {
 
-    //Calculate offset in bytes with a char pointer
-    void* mem_ptr = static_cast<char *>(shared_mem_ptr) + shared_mem_offset;
+    // Calculate offset in bytes with a char pointer
+    void* mem_ptr = static_cast<char*>(shared_mem_ptr) + shared_mem_offset;
 
-    shared_mem_offset += bytes*sizeof(T);
+    shared_mem_offset += bytes * sizeof(T);
 
-    //convert to desired type
+    // convert to desired type
     return static_cast<T*>(mem_ptr);
   }
 
   /*
   //Odd dependecy with atomics is breaking CI builds
-  template<typename T, size_t DIM, typename IDX_T=RAJA::Index_type, ptrdiff_t z_stride=DIM-1, typename arg, typename... args>
-  RAJA_HOST_DEVICE auto getSharedMemoryView(size_t bytes, arg idx, args... idxs)
+  template<typename T, size_t DIM, typename IDX_T=RAJA::Index_type, ptrdiff_t
+  z_stride=DIM-1, typename arg, typename... args> RAJA_HOST_DEVICE auto
+  getSharedMemoryView(size_t bytes, arg idx, args... idxs)
   {
     T * mem_ptr = &((T*) shared_mem_ptr)[shared_mem_offset];
 
     shared_mem_offset += bytes*sizeof(T);
-    return RAJA::View<T, RAJA::Layout<DIM, IDX_T, z_stride>>(mem_ptr, idx, idxs...);
+    return RAJA::View<T, RAJA::Layout<DIM, IDX_T, z_stride>>(mem_ptr, idx,
+  idxs...);
   }
   */
 
   RAJA_HOST_DEVICE void releaseSharedMemory()
   {
-    //On the cpu/gpu we want to restart the count
+    // On the cpu/gpu we want to restart the count
     shared_mem_offset = 0;
   }
 
@@ -215,22 +241,27 @@ class LaunchContext
   }
 };
 
-template <typename LAUNCH_POLICY>
+template<typename LAUNCH_POLICY>
 struct LaunchExecute;
 
-//Policy based launch with support to new reducers...
-template <typename LAUNCH_POLICY, typename ... ReduceParams>
-void launch(LaunchParams const &launch_params, const char *kernel_name, ReduceParams&&... rest_of_launch_args)
+// Policy based launch with support to new reducers...
+template<typename LAUNCH_POLICY, typename... ReduceParams>
+void launch(LaunchParams const& launch_params,
+            const char* kernel_name,
+            ReduceParams&&... rest_of_launch_args)
 {
 
-  //Get reducers
-  auto reducers = expt::make_forall_param_pack(std::forward<ReduceParams>(rest_of_launch_args)...);
+  // Get reducers
+  auto reducers = expt::make_forall_param_pack(
+      std::forward<ReduceParams>(rest_of_launch_args)...);
 
-  auto&& launch_body = expt::get_lambda(std::forward<ReduceParams>(rest_of_launch_args)...);
+  auto&& launch_body =
+      expt::get_lambda(std::forward<ReduceParams>(rest_of_launch_args)...);
 
-  //Take the first policy as we assume the second policy is not user defined.
-  //We rely on the user to pair launch and loop policies correctly.
-  util::PluginContext context{util::make_context<typename LAUNCH_POLICY::host_policy_t>()};
+  // Take the first policy as we assume the second policy is not user defined.
+  // We rely on the user to pair launch and loop policies correctly.
+  util::PluginContext context {
+      util::make_context<typename LAUNCH_POLICY::host_policy_t>()};
   util::callPreCapturePlugins(context);
 
   using RAJA::util::trigger_updates_before;
@@ -242,29 +273,35 @@ void launch(LaunchParams const &launch_params, const char *kernel_name, ReducePa
 
   using launch_t = LaunchExecute<typename LAUNCH_POLICY::host_policy_t>;
 
-  using Res = typename resources::get_resource<typename LAUNCH_POLICY::host_policy_t>::type;
+  using Res = typename resources::get_resource<
+      typename LAUNCH_POLICY::host_policy_t>::type;
 
-  launch_t::exec(Res::get_default(), launch_params, kernel_name, p_body, reducers);
+  launch_t::exec(Res::get_default(), launch_params, kernel_name, p_body,
+                 reducers);
 
   util::callPostLaunchPlugins(context);
 }
 
-
-//Duplicate of code above on account that we need to support the case in which a kernel_name is not given
-template <typename LAUNCH_POLICY, typename ... ReduceParams>
-void launch(LaunchParams const &launch_params, ReduceParams&&... rest_of_launch_args)
+// Duplicate of code above on account that we need to support the case in which
+// a kernel_name is not given
+template<typename LAUNCH_POLICY, typename... ReduceParams>
+void launch(LaunchParams const& launch_params,
+            ReduceParams&&... rest_of_launch_args)
 {
 
-  const char *kernel_name = nullptr;
+  const char* kernel_name = nullptr;
 
-  //Get reducers
-  auto reducers = expt::make_forall_param_pack(std::forward<ReduceParams>(rest_of_launch_args)...);
+  // Get reducers
+  auto reducers = expt::make_forall_param_pack(
+      std::forward<ReduceParams>(rest_of_launch_args)...);
 
-  auto&& launch_body = expt::get_lambda(std::forward<ReduceParams>(rest_of_launch_args)...);
+  auto&& launch_body =
+      expt::get_lambda(std::forward<ReduceParams>(rest_of_launch_args)...);
 
-  //Take the first policy as we assume the second policy is not user defined.
-  //We rely on the user to pair launch and loop policies correctly.
-  util::PluginContext context{util::make_context<typename LAUNCH_POLICY::host_policy_t>()};
+  // Take the first policy as we assume the second policy is not user defined.
+  // We rely on the user to pair launch and loop policies correctly.
+  util::PluginContext context {
+      util::make_context<typename LAUNCH_POLICY::host_policy_t>()};
   util::callPreCapturePlugins(context);
 
   using RAJA::util::trigger_updates_before;
@@ -276,148 +313,201 @@ void launch(LaunchParams const &launch_params, ReduceParams&&... rest_of_launch_
 
   using launch_t = LaunchExecute<typename LAUNCH_POLICY::host_policy_t>;
 
-  using Res = typename resources::get_resource<typename LAUNCH_POLICY::host_policy_t>::type;
+  using Res = typename resources::get_resource<
+      typename LAUNCH_POLICY::host_policy_t>::type;
 
-  launch_t::exec(Res::get_default(), launch_params, kernel_name, p_body, reducers);
+  launch_t::exec(Res::get_default(), launch_params, kernel_name, p_body,
+                 reducers);
 
   util::callPostLaunchPlugins(context);
 }
 
 //=================================================
-//Run time based policy launch
+// Run time based policy launch
 //=================================================
-template <typename POLICY_LIST, typename BODY>
-void launch(ExecPlace place, LaunchParams const &params, BODY const &body)
+template<typename POLICY_LIST, typename BODY>
+void launch(ExecPlace place, LaunchParams const& params, BODY const& body)
 {
   launch<POLICY_LIST>(place, params, nullptr, body);
 }
 
-template <typename POLICY_LIST, typename BODY>
-void launch(ExecPlace place, const LaunchParams &params, const char *kernel_name, BODY const &body)
+template<typename POLICY_LIST, typename BODY>
+void launch(ExecPlace place,
+            const LaunchParams& params,
+            const char* kernel_name,
+            BODY const& body)
 {
 
-  //Forward to single policy launch API - simplifies testing of plugins
-  switch (place) {
-    case ExecPlace::HOST: {
-      using Res = typename resources::get_resource<typename POLICY_LIST::host_policy_t>::type;
-      launch<LaunchPolicy<typename POLICY_LIST::host_policy_t>>(Res::get_default(), params, kernel_name, body);
+  // Forward to single policy launch API - simplifies testing of plugins
+  switch (place)
+  {
+    case ExecPlace::HOST:
+    {
+      using Res = typename resources::get_resource<
+          typename POLICY_LIST::host_policy_t>::type;
+      launch<LaunchPolicy<typename POLICY_LIST::host_policy_t>>(
+          Res::get_default(), params, kernel_name, body);
       break;
     }
 #if defined(RAJA_GPU_ACTIVE)
-  case ExecPlace::DEVICE: {
-      using Res = typename resources::get_resource<typename POLICY_LIST::device_policy_t>::type;
-      launch<LaunchPolicy<typename POLICY_LIST::device_policy_t>>(Res::get_default(), params, kernel_name, body);
+    case ExecPlace::DEVICE:
+    {
+      using Res = typename resources::get_resource<
+          typename POLICY_LIST::device_policy_t>::type;
+      launch<LaunchPolicy<typename POLICY_LIST::device_policy_t>>(
+          Res::get_default(), params, kernel_name, body);
       break;
     }
 #endif
     default:
       RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled");
   }
-
 }
 
-//Run-time API for new reducer interface
-template <typename POLICY_LIST, typename... ReduceParams>
-void launch(ExecPlace place, const LaunchParams &launch_params, const char *kernel_name, ReduceParams&&... rest_of_launch_args)
+// Run-time API for new reducer interface
+template<typename POLICY_LIST, typename... ReduceParams>
+void launch(ExecPlace place,
+            const LaunchParams& launch_params,
+            const char* kernel_name,
+            ReduceParams&&... rest_of_launch_args)
 {
 
-  //Forward to single policy launch API - simplifies testing of plugins
-  switch (place) {
-    case ExecPlace::HOST: {
-      using Res = typename resources::get_resource<typename POLICY_LIST::host_policy_t>::type;
-      launch<LaunchPolicy<typename POLICY_LIST::host_policy_t>>
-        (Res::get_default(), launch_params, kernel_name, std::forward<ReduceParams>(rest_of_launch_args)...);
+  // Forward to single policy launch API - simplifies testing of plugins
+  switch (place)
+  {
+    case ExecPlace::HOST:
+    {
+      using Res = typename resources::get_resource<
+          typename POLICY_LIST::host_policy_t>::type;
+      launch<LaunchPolicy<typename POLICY_LIST::host_policy_t>>(
+          Res::get_default(), launch_params, kernel_name,
+          std::forward<ReduceParams>(rest_of_launch_args)...);
       break;
     }
 #if defined(RAJA_GPU_ACTIVE)
-  case ExecPlace::DEVICE: {
-      using Res = typename resources::get_resource<typename POLICY_LIST::device_policy_t>::type;
-      launch<LaunchPolicy<typename POLICY_LIST::device_policy_t>>
-        (Res::get_default(), launch_params, kernel_name, std::forward<ReduceParams>(rest_of_launch_args)...);
+    case ExecPlace::DEVICE:
+    {
+      using Res = typename resources::get_resource<
+          typename POLICY_LIST::device_policy_t>::type;
+      launch<LaunchPolicy<typename POLICY_LIST::device_policy_t>>(
+          Res::get_default(), launch_params, kernel_name,
+          std::forward<ReduceParams>(rest_of_launch_args)...);
       break;
     }
 #endif
     default:
       RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled");
   }
-
 }
 
-//Run-time API for new reducer interface with support of the case without a new kernel name
-template <typename POLICY_LIST, typename... ReduceParams>
-void launch(ExecPlace place, const LaunchParams &launch_params, ReduceParams&&... rest_of_launch_args)
-            //BODY const &body)
+// Run-time API for new reducer interface with support of the case without a new
+// kernel name
+template<typename POLICY_LIST, typename... ReduceParams>
+void launch(ExecPlace place,
+            const LaunchParams& launch_params,
+            ReduceParams&&... rest_of_launch_args)
+// BODY const &body)
 {
 
-  const char *kernel_name = nullptr;
+  const char* kernel_name = nullptr;
 
-  //Forward to single policy launch API - simplifies testing of plugins
-  switch (place) {
-    case ExecPlace::HOST: {
-      using Res = typename resources::get_resource<typename POLICY_LIST::host_policy_t>::type;
-      launch<LaunchPolicy<typename POLICY_LIST::host_policy_t>>
-        (Res::get_default(), launch_params, kernel_name, std::forward<ReduceParams>(rest_of_launch_args)...);
+  // Forward to single policy launch API - simplifies testing of plugins
+  switch (place)
+  {
+    case ExecPlace::HOST:
+    {
+      using Res = typename resources::get_resource<
+          typename POLICY_LIST::host_policy_t>::type;
+      launch<LaunchPolicy<typename POLICY_LIST::host_policy_t>>(
+          Res::get_default(), launch_params, kernel_name,
+          std::forward<ReduceParams>(rest_of_launch_args)...);
       break;
     }
 #if defined(RAJA_GPU_ACTIVE)
-  case ExecPlace::DEVICE: {
-      using Res = typename resources::get_resource<typename POLICY_LIST::device_policy_t>::type;
-      launch<LaunchPolicy<typename POLICY_LIST::device_policy_t>>
-        (Res::get_default(), launch_params, kernel_name, std::forward<ReduceParams>(rest_of_launch_args)...);
+    case ExecPlace::DEVICE:
+    {
+      using Res = typename resources::get_resource<
+          typename POLICY_LIST::device_policy_t>::type;
+      launch<LaunchPolicy<typename POLICY_LIST::device_policy_t>>(
+          Res::get_default(), launch_params, kernel_name,
+          std::forward<ReduceParams>(rest_of_launch_args)...);
       break;
     }
 #endif
     default:
       RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled");
   }
-
 }
 
 
-// Helper function to retrieve a resource based on the run-time policy - if a device is active
-#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) || defined(RAJA_ENABLE_SYCL)
+// Helper function to retrieve a resource based on the run-time policy - if a
+// device is active
+#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP) ||                   \
+    defined(RAJA_ENABLE_SYCL)
 template<typename T, typename U>
-RAJA::resources::Resource Get_Runtime_Resource(T host_res, U device_res, RAJA::ExecPlace device){
-  if(device == RAJA::ExecPlace::DEVICE) {return RAJA::resources::Resource(device_res);}
-  else { return RAJA::resources::Resource(host_res); }
+RAJA::resources::Resource Get_Runtime_Resource(T host_res,
+                                               U device_res,
+                                               RAJA::ExecPlace device)
+{
+  if (device == RAJA::ExecPlace::DEVICE)
+  {
+    return RAJA::resources::Resource(device_res);
+  }
+  else
+  {
+    return RAJA::resources::Resource(host_res);
+  }
 }
 #endif
 
 template<typename T>
-RAJA::resources::Resource Get_Host_Resource(T host_res, RAJA::ExecPlace device){
-  if(device == RAJA::ExecPlace::DEVICE) {RAJA_ABORT_OR_THROW("Device is not enabled");}
+RAJA::resources::Resource Get_Host_Resource(T host_res, RAJA::ExecPlace device)
+{
+  if (device == RAJA::ExecPlace::DEVICE)
+  {
+    RAJA_ABORT_OR_THROW("Device is not enabled");
+  }
 
   return RAJA::resources::Resource(host_res);
 }
 
-//Launch API which takes team resource struct and supports new reducers
-template <typename POLICY_LIST, typename ... ReduceParams>
-resources::EventProxy<resources::Resource>
-launch(RAJA::resources::Resource res, LaunchParams const &launch_params,
-       const char *kernel_name, ReduceParams&&... rest_of_launch_args)
+// Launch API which takes team resource struct and supports new reducers
+template<typename POLICY_LIST, typename... ReduceParams>
+resources::EventProxy<resources::Resource> launch(
+    RAJA::resources::Resource res,
+    LaunchParams const& launch_params,
+    const char* kernel_name,
+    ReduceParams&&... rest_of_launch_args)
 {
 
-  //Get reducers
-  auto reducers = expt::make_forall_param_pack(std::forward<ReduceParams>(rest_of_launch_args)...);
+  // Get reducers
+  auto reducers = expt::make_forall_param_pack(
+      std::forward<ReduceParams>(rest_of_launch_args)...);
 
-  auto&& launch_body = expt::get_lambda(std::forward<ReduceParams>(rest_of_launch_args)...);
+  auto&& launch_body =
+      expt::get_lambda(std::forward<ReduceParams>(rest_of_launch_args)...);
 
   ExecPlace place;
-  if(res.get_platform() == RAJA::Platform::host) {
+  if (res.get_platform() == RAJA::Platform::host)
+  {
     place = RAJA::ExecPlace::HOST;
-  } else {
+  }
+  else
+  {
     place = RAJA::ExecPlace::DEVICE;
   }
 
   //
-  //Configure plugins
+  // Configure plugins
   //
 #if defined(RAJA_GPU_ACTIVE)
-  util::PluginContext context{place == ExecPlace::HOST ?
-      util::make_context<typename POLICY_LIST::host_policy_t>() :
-      util::make_context<typename POLICY_LIST::device_policy_t>()};
+  util::PluginContext context {
+      place == ExecPlace::HOST
+          ? util::make_context<typename POLICY_LIST::host_policy_t>()
+          : util::make_context<typename POLICY_LIST::device_policy_t>()};
 #else
-  util::PluginContext context{util::make_context<typename POLICY_LIST::host_policy_t>()};
+  util::PluginContext context {
+      util::make_context<typename POLICY_LIST::host_policy_t>()};
 #endif
 
   util::callPreCapturePlugins(context);
@@ -429,22 +519,28 @@ launch(RAJA::resources::Resource res, LaunchParams const &launch_params,
 
   util::callPreLaunchPlugins(context);
 
-  switch (place) {
-    case ExecPlace::HOST: {
+  switch (place)
+  {
+    case ExecPlace::HOST:
+    {
       using launch_t = LaunchExecute<typename POLICY_LIST::host_policy_t>;
-      resources::EventProxy<resources::Resource> e_proxy = launch_t::exec(res, launch_params, kernel_name, p_body, reducers);
+      resources::EventProxy<resources::Resource> e_proxy =
+          launch_t::exec(res, launch_params, kernel_name, p_body, reducers);
       util::callPostLaunchPlugins(context);
       return e_proxy;
     }
 #if defined(RAJA_GPU_ACTIVE)
-    case ExecPlace::DEVICE: {
+    case ExecPlace::DEVICE:
+    {
       using launch_t = LaunchExecute<typename POLICY_LIST::device_policy_t>;
-      resources::EventProxy<resources::Resource> e_proxy = launch_t::exec(res, launch_params, kernel_name,  p_body, reducers);
+      resources::EventProxy<resources::Resource> e_proxy =
+          launch_t::exec(res, launch_params, kernel_name, p_body, reducers);
       util::callPostLaunchPlugins(context);
       return e_proxy;
     }
 #endif
-    default: {
+    default:
+    {
       RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled");
     }
   }
@@ -455,37 +551,45 @@ launch(RAJA::resources::Resource res, LaunchParams const &launch_params,
   return resources::EventProxy<resources::Resource>(res);
 }
 
-
-//Duplicate of API above on account that we need to handle the case that a kernel name is not provided
-template <typename POLICY_LIST, typename ... ReduceParams>
-resources::EventProxy<resources::Resource>
-launch(RAJA::resources::Resource res, LaunchParams const &launch_params,
-       ReduceParams&&... rest_of_launch_args)
+// Duplicate of API above on account that we need to handle the case that a
+// kernel name is not provided
+template<typename POLICY_LIST, typename... ReduceParams>
+resources::EventProxy<resources::Resource> launch(
+    RAJA::resources::Resource res,
+    LaunchParams const& launch_params,
+    ReduceParams&&... rest_of_launch_args)
 {
 
-  const char *kernel_name = nullptr;
+  const char* kernel_name = nullptr;
 
-  //Get reducers
-  auto reducers = expt::make_forall_param_pack(std::forward<ReduceParams>(rest_of_launch_args)...);
+  // Get reducers
+  auto reducers = expt::make_forall_param_pack(
+      std::forward<ReduceParams>(rest_of_launch_args)...);
 
-  auto&& launch_body = expt::get_lambda(std::forward<ReduceParams>(rest_of_launch_args)...);
+  auto&& launch_body =
+      expt::get_lambda(std::forward<ReduceParams>(rest_of_launch_args)...);
 
   ExecPlace place;
-  if(res.get_platform() == RAJA::Platform::host) {
+  if (res.get_platform() == RAJA::Platform::host)
+  {
     place = RAJA::ExecPlace::HOST;
-  } else {
+  }
+  else
+  {
     place = RAJA::ExecPlace::DEVICE;
   }
 
   //
-  //Configure plugins
+  // Configure plugins
   //
 #if defined(RAJA_GPU_ACTIVE)
-  util::PluginContext context{place == ExecPlace::HOST ?
-      util::make_context<typename POLICY_LIST::host_policy_t>() :
-      util::make_context<typename POLICY_LIST::device_policy_t>()};
+  util::PluginContext context {
+      place == ExecPlace::HOST
+          ? util::make_context<typename POLICY_LIST::host_policy_t>()
+          : util::make_context<typename POLICY_LIST::device_policy_t>()};
 #else
-  util::PluginContext context{util::make_context<typename POLICY_LIST::host_policy_t>()};
+  util::PluginContext context {
+      util::make_context<typename POLICY_LIST::host_policy_t>()};
 #endif
 
   util::callPreCapturePlugins(context);
@@ -497,22 +601,28 @@ launch(RAJA::resources::Resource res, LaunchParams const &launch_params,
 
   util::callPreLaunchPlugins(context);
 
-  switch (place) {
-    case ExecPlace::HOST: {
+  switch (place)
+  {
+    case ExecPlace::HOST:
+    {
       using launch_t = LaunchExecute<typename POLICY_LIST::host_policy_t>;
-      resources::EventProxy<resources::Resource> e_proxy = launch_t::exec(res, launch_params, kernel_name, p_body, reducers);
+      resources::EventProxy<resources::Resource> e_proxy =
+          launch_t::exec(res, launch_params, kernel_name, p_body, reducers);
       util::callPostLaunchPlugins(context);
       return e_proxy;
     }
 #if defined(RAJA_GPU_ACTIVE)
-    case ExecPlace::DEVICE: {
+    case ExecPlace::DEVICE:
+    {
       using launch_t = LaunchExecute<typename POLICY_LIST::device_policy_t>;
-      resources::EventProxy<resources::Resource> e_proxy = launch_t::exec(res, launch_params, kernel_name, p_body, reducers);
+      resources::EventProxy<resources::Resource> e_proxy =
+          launch_t::exec(res, launch_params, kernel_name, p_body, reducers);
       util::callPostLaunchPlugins(context);
       return e_proxy;
     }
 #endif
-    default: {
+    default:
+    {
       RAJA_ABORT_OR_THROW("Unknown launch place or device is not enabled");
     }
   }
@@ -530,183 +640,163 @@ using loop_policy = typename POLICY_LIST::device_policy_t;
 using loop_policy = typename POLICY_LIST::host_policy_t;
 #endif
 
-template <typename POLICY, typename SEGMENT>
+template<typename POLICY, typename SEGMENT>
 struct LoopExecute;
 
-template <typename POLICY, typename SEGMENT>
+template<typename POLICY, typename SEGMENT>
 struct LoopICountExecute;
 
 RAJA_SUPPRESS_HD_WARN
-template <typename POLICY_LIST,
-          typename CONTEXT,
-          typename SEGMENT,
-          typename BODY>
-RAJA_HOST_DEVICE RAJA_INLINE void loop(CONTEXT const &ctx,
-                                       SEGMENT const &segment,
-                                       BODY const &body)
+template<typename POLICY_LIST,
+         typename CONTEXT,
+         typename SEGMENT,
+         typename BODY>
+RAJA_HOST_DEVICE RAJA_INLINE void loop(CONTEXT const& ctx,
+                                       SEGMENT const& segment,
+                                       BODY const& body)
 {
 
-  LoopExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
-                                                       segment,
-                                                       body);
+  LoopExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx, segment, body);
 }
 
-template <typename POLICY_LIST,
-          typename CONTEXT,
-          typename SEGMENT,
-          typename BODY>
-RAJA_HOST_DEVICE RAJA_INLINE void loop_icount(CONTEXT const &ctx,
-                                          SEGMENT const &segment,
-                                          BODY const &body)
+template<typename POLICY_LIST,
+         typename CONTEXT,
+         typename SEGMENT,
+         typename BODY>
+RAJA_HOST_DEVICE RAJA_INLINE void loop_icount(CONTEXT const& ctx,
+                                              SEGMENT const& segment,
+                                              BODY const& body)
 {
 
-  LoopICountExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
-                                                          segment,
-                                                          body);
+  LoopICountExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx, segment,
+                                                             body);
 }
 
 namespace expt
 {
 
 RAJA_SUPPRESS_HD_WARN
-template <typename POLICY_LIST,
-          typename CONTEXT,
-          typename SEGMENT,
-          typename BODY>
-RAJA_HOST_DEVICE RAJA_INLINE void loop(CONTEXT const &ctx,
-                                       SEGMENT const &segment0,
-                                       SEGMENT const &segment1,
-                                       BODY const &body)
+template<typename POLICY_LIST,
+         typename CONTEXT,
+         typename SEGMENT,
+         typename BODY>
+RAJA_HOST_DEVICE RAJA_INLINE void loop(CONTEXT const& ctx,
+                                       SEGMENT const& segment0,
+                                       SEGMENT const& segment1,
+                                       BODY const& body)
 {
 
-  LoopExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
-                                                       segment0,
-                                                       segment1,
+  LoopExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx, segment0, segment1,
                                                        body);
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename POLICY_LIST,
-          typename CONTEXT,
-          typename SEGMENT,
-          typename BODY>
-RAJA_HOST_DEVICE RAJA_INLINE void loop(CONTEXT const &ctx,
-                                       SEGMENT const &segment0,
-                                       SEGMENT const &segment1,
-                                       SEGMENT const &segment2,
-                                       BODY const &body)
+template<typename POLICY_LIST,
+         typename CONTEXT,
+         typename SEGMENT,
+         typename BODY>
+RAJA_HOST_DEVICE RAJA_INLINE void loop(CONTEXT const& ctx,
+                                       SEGMENT const& segment0,
+                                       SEGMENT const& segment1,
+                                       SEGMENT const& segment2,
+                                       BODY const& body)
 {
 
-  LoopExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
-                                                       segment0,
-                                                       segment1,
-                                                       segment2,
-                                                       body);
+  LoopExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx, segment0, segment1,
+                                                       segment2, body);
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename POLICY_LIST,
-          typename CONTEXT,
-          typename SEGMENT,
-          typename BODY>
-RAJA_HOST_DEVICE RAJA_INLINE void loop_icount(CONTEXT const &ctx,
-                                       SEGMENT const &segment0,
-                                       SEGMENT const &segment1,
-                                       SEGMENT const &segment2,
-                                       BODY const &body)
+template<typename POLICY_LIST,
+         typename CONTEXT,
+         typename SEGMENT,
+         typename BODY>
+RAJA_HOST_DEVICE RAJA_INLINE void loop_icount(CONTEXT const& ctx,
+                                              SEGMENT const& segment0,
+                                              SEGMENT const& segment1,
+                                              SEGMENT const& segment2,
+                                              BODY const& body)
 {
 
-  LoopICountExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
-                           segment0, segment1, segment2, body);
+  LoopICountExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(
+      ctx, segment0, segment1, segment2, body);
 }
 
-} //namespace expt
+}  // namespace expt
 
-template <typename POLICY, typename SEGMENT>
+template<typename POLICY, typename SEGMENT>
 struct TileExecute;
 
-template <typename POLICY, typename SEGMENT>
+template<typename POLICY, typename SEGMENT>
 struct TileTCountExecute;
 
-template <typename POLICY_LIST,
-          typename CONTEXT,
-          typename TILE_T,
-          typename SEGMENT,
-          typename BODY>
-RAJA_HOST_DEVICE RAJA_INLINE void tile(CONTEXT const &ctx,
+template<typename POLICY_LIST,
+         typename CONTEXT,
+         typename TILE_T,
+         typename SEGMENT,
+         typename BODY>
+RAJA_HOST_DEVICE RAJA_INLINE void tile(CONTEXT const& ctx,
                                        TILE_T tile_size,
-                                       SEGMENT const &segment,
-                                       BODY const &body)
+                                       SEGMENT const& segment,
+                                       BODY const& body)
 {
 
-  TileExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
-                                                       tile_size,
-                                                       segment,
+  TileExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx, tile_size, segment,
                                                        body);
 }
 
-template <typename POLICY_LIST,
-          typename CONTEXT,
-          typename TILE_T,
-          typename SEGMENT,
-          typename BODY>
-RAJA_HOST_DEVICE RAJA_INLINE void tile_tcount(CONTEXT const &ctx,
-                                       TILE_T tile_size,
-                                       SEGMENT const &segment,
-                                       BODY const &body)
+template<typename POLICY_LIST,
+         typename CONTEXT,
+         typename TILE_T,
+         typename SEGMENT,
+         typename BODY>
+RAJA_HOST_DEVICE RAJA_INLINE void tile_tcount(CONTEXT const& ctx,
+                                              TILE_T tile_size,
+                                              SEGMENT const& segment,
+                                              BODY const& body)
 {
-  TileTCountExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
-                                                          tile_size,
-                                                          segment,
-                                                          body);
+  TileTCountExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx, tile_size,
+                                                             segment, body);
 }
 
 namespace expt
 {
 
-template <typename POLICY_LIST,
-          typename CONTEXT,
-          typename TILE_T,
-          typename SEGMENT,
-          typename BODY>
-RAJA_HOST_DEVICE RAJA_INLINE void tile(CONTEXT const &ctx,
+template<typename POLICY_LIST,
+         typename CONTEXT,
+         typename TILE_T,
+         typename SEGMENT,
+         typename BODY>
+RAJA_HOST_DEVICE RAJA_INLINE void tile(CONTEXT const& ctx,
                                        TILE_T tile_size0,
                                        TILE_T tile_size1,
-                                       SEGMENT const &segment0,
-                                       SEGMENT const &segment1,
-                                       BODY const &body)
+                                       SEGMENT const& segment0,
+                                       SEGMENT const& segment1,
+                                       BODY const& body)
 {
 
-  TileExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
-                                                       tile_size0,
-                                                       tile_size1,
-                                                       segment0,
-                                                       segment1,
-                                                       body);
+  TileExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(
+      ctx, tile_size0, tile_size1, segment0, segment1, body);
 }
 
-template <typename POLICY_LIST,
-          typename CONTEXT,
-          typename TILE_T,
-          typename SEGMENT,
-          typename BODY>
-RAJA_HOST_DEVICE RAJA_INLINE void tile_tcount(CONTEXT const &ctx,
-                                       TILE_T tile_size0,
-                                       TILE_T tile_size1,
-                                       SEGMENT const &segment0,
-                                       SEGMENT const &segment1,
-                                       BODY const &body)
+template<typename POLICY_LIST,
+         typename CONTEXT,
+         typename TILE_T,
+         typename SEGMENT,
+         typename BODY>
+RAJA_HOST_DEVICE RAJA_INLINE void tile_tcount(CONTEXT const& ctx,
+                                              TILE_T tile_size0,
+                                              TILE_T tile_size1,
+                                              SEGMENT const& segment0,
+                                              SEGMENT const& segment1,
+                                              BODY const& body)
 {
 
-  TileTCountExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(ctx,
-                                                          tile_size0,
-                                                          tile_size1,
-                                                          segment0,
-                                                          segment1,
-                                                          body);
+  TileTCountExecute<loop_policy<POLICY_LIST>, SEGMENT>::exec(
+      ctx, tile_size0, tile_size1, segment0, segment1, body);
 }
 
-} //namespace expt
+}  // namespace expt
 
 }  // namespace RAJA
 #endif
diff --git a/include/RAJA/pattern/multi_reduce.hpp b/include/RAJA/pattern/multi_reduce.hpp
index 3fbe36877c..8a1c37db2b 100644
--- a/include/RAJA/pattern/multi_reduce.hpp
+++ b/include/RAJA/pattern/multi_reduce.hpp
@@ -64,7 +64,7 @@ namespace RAJA
  *
  ******************************************************************************
  */
-template <typename MULTI_REDUCE_POLICY_T, typename T>
+template<typename MULTI_REDUCE_POLICY_T, typename T>
 struct MultiReduceMin;
 
 /*!
@@ -94,7 +94,7 @@ struct MultiReduceMin;
  *
  ******************************************************************************
  */
-template <typename MULTI_REDUCE_POLICY_T, typename T>
+template<typename MULTI_REDUCE_POLICY_T, typename T>
 struct MultiReduceMax;
 
 /*!
@@ -124,7 +124,7 @@ struct MultiReduceMax;
  *
  ******************************************************************************
  */
-template <typename MULTI_REDUCE_POLICY_T, typename T>
+template<typename MULTI_REDUCE_POLICY_T, typename T>
 struct MultiReduceSum;
 
 /*!
@@ -154,9 +154,9 @@ struct MultiReduceSum;
  *
  ******************************************************************************
  */
-template <typename MULTI_REDUCE_POLICY_T, typename T>
+template<typename MULTI_REDUCE_POLICY_T, typename T>
 struct MultiReduceBitOr;
- 
+
 
 /*!
  ******************************************************************************
@@ -171,7 +171,8 @@ struct MultiReduceBitOr;
    Index_ptr bins = ...;
    Real_ptr bit_vals = ...;
 
-   MultiReduceBitAnd<multi_reduce_policy, Real_type> my_bits(num_bins, init_val);
+   MultiReduceBitAnd<multi_reduce_policy, Real_type> my_bits(num_bins,
+ init_val);
 
    forall<exec_policy>( ..., [=] (Index_type i) {
       my_bits[bins[i]] &= (data[i]);
@@ -185,10 +186,10 @@ struct MultiReduceBitOr;
  *
  ******************************************************************************
  */
-template <typename MULTI_REDUCE_POLICY_T, typename T>
+template<typename MULTI_REDUCE_POLICY_T, typename T>
 struct MultiReduceBitAnd;
 
-} //namespace RAJA
+}  // namespace RAJA
 
 
 #endif  // closing endif for header file include guard
diff --git a/include/RAJA/pattern/params/forall.hpp b/include/RAJA/pattern/params/forall.hpp
index 5a656206f5..c98e4429a2 100644
--- a/include/RAJA/pattern/params/forall.hpp
+++ b/include/RAJA/pattern/params/forall.hpp
@@ -21,348 +21,450 @@ namespace RAJA
 namespace expt
 {
 
-  //
-  //
-  // Forall Parameter Packing type
-  //
-  //
-  struct ParamMultiplexer;
-
-  template<typename... Params>
-  struct ForallParamPack {
-
-    friend struct ParamMultiplexer;
-
-    using Base = camp::tuple<Params...>;
-    Base param_tup;
-
-    static constexpr size_t param_tup_sz = camp::tuple_size<Base>::value; 
-    using params_seq = camp::make_idx_seq_t< param_tup_sz >;
-
-  private:
-
-    // Init
-    template<typename EXEC_POL, camp::idx_t... Seq, typename ...Args>
-    static constexpr void detail_init(EXEC_POL, camp::idx_seq<Seq...>, ForallParamPack& f_params, Args&& ...args) {
-      CAMP_EXPAND(expt::detail::init<EXEC_POL>( camp::get<Seq>(f_params.param_tup), std::forward<Args>(args)... ));
-    }
-
-    // Combine
-    template<typename EXEC_POL, camp::idx_t... Seq>
-    RAJA_HOST_DEVICE
-    static constexpr void detail_combine(EXEC_POL, camp::idx_seq<Seq...>, ForallParamPack& out, const ForallParamPack& in ) {
-      CAMP_EXPAND(detail::combine<EXEC_POL>( camp::get<Seq>(out.param_tup), camp::get<Seq>(in.param_tup)));
-    }
-
-    template<typename EXEC_POL, camp::idx_t... Seq>
-    RAJA_HOST_DEVICE
-    static constexpr void detail_combine(EXEC_POL, camp::idx_seq<Seq...>, ForallParamPack& f_params ) {
-      CAMP_EXPAND(detail::combine<EXEC_POL>( camp::get<Seq>(f_params.param_tup) ));
-    }
-    
-    // Resolve
-    template<typename EXEC_POL, camp::idx_t... Seq, typename ...Args>
-    static constexpr void detail_resolve(EXEC_POL, camp::idx_seq<Seq...>, ForallParamPack& f_params, Args&& ...args) {
-      CAMP_EXPAND(detail::resolve<EXEC_POL>( camp::get<Seq>(f_params.param_tup), std::forward<Args>(args)... ));
-    }
-
-    // Used to construct the argument TYPES that will be invoked with the lambda.
-    template<typename null_t = camp::nil>
-    static constexpr auto LAMBDA_ARG_TUP_T() { return camp::tuple<>{}; };
-    template<typename null_t = camp::nil, typename First>
-    static constexpr auto LAMBDA_ARG_TUP_T() { return typename First::ARG_TUP_T(); };
-    template<typename null_t = camp::nil, typename First, typename Second, typename... Rest>
-    static constexpr auto LAMBDA_ARG_TUP_T() { return camp::tuple_cat_pair(typename First::ARG_TUP_T(), LAMBDA_ARG_TUP_T<camp::nil, Second, Rest...>()); };
-
-    using lambda_arg_tuple_t = decltype(LAMBDA_ARG_TUP_T<camp::nil, Params...>());
-    
-    //Use the size of param_tup to generate the argument list.
-    RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num<0>) { return camp::make_tuple(); }
-    RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num<1>) { return camp::get<param_tup_sz - 1>(param_tup).get_lambda_arg_tup(); }
-    template<camp::idx_t N>
-    RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num<N>) {
-      return camp::tuple_cat_pair(  camp::get<param_tup_sz - N>(param_tup).get_lambda_arg_tup(), LAMBDA_ARG_TUP_V(camp::num<N-1>())  );
-    }
-
-  public:
-    ForallParamPack(){}
-
-    RAJA_HOST_DEVICE constexpr lambda_arg_tuple_t lambda_args() {return LAMBDA_ARG_TUP_V(camp::num<sizeof...(Params)>());}
-
-    using lambda_arg_seq = camp::make_idx_seq_t<camp::tuple_size<lambda_arg_tuple_t>::value>;
-
-    template<typename... Ts>
-    ForallParamPack(camp::tuple<Ts...>&& t) : param_tup(std::move(t)) {};
-  }; // struct ForallParamPack 
-  
-
-
-  //===========================================================================
-  //
-  //
-  // ParamMultiplexer is how we hook into the individual calls within forall_impl.
-  //
-  //
-  struct ParamMultiplexer {
-    template<typename EXEC_POL, typename... Params, typename ...Args, typename FP = ForallParamPack<Params...>>
-    static void constexpr init( ForallParamPack<Params...>& f_params, Args&& ...args) {
-      FP::detail_init(EXEC_POL(),typename FP::params_seq(), f_params, std::forward<Args>(args)... );
-    }
-    template<typename EXEC_POL, typename... Params, typename ...Args, typename FP = ForallParamPack<Params...>>
-    static void constexpr combine(ForallParamPack<Params...>& f_params, Args&& ...args){
-      FP::detail_combine(EXEC_POL(), typename FP::params_seq(), f_params, std::forward<Args>(args)... );
-    }
-    template<typename EXEC_POL, typename... Params, typename ...Args, typename FP = ForallParamPack<Params...>>
-    static void constexpr resolve( ForallParamPack<Params...>& f_params, Args&& ...args){
-      FP::detail_resolve(EXEC_POL(), typename FP::params_seq(), f_params, std::forward<Args>(args)... );
-    }
-  };
-  //===========================================================================
+//
+//
+// Forall Parameter Packing type
+//
+//
+struct ParamMultiplexer;
+
+template<typename... Params>
+struct ForallParamPack
+{
 
+  friend struct ParamMultiplexer;
 
+  using Base = camp::tuple<Params...>;
+  Base param_tup;
 
-  //===========================================================================
-  //
-  //
-  // ForallParamPack generators.
-  //
-  //
-  RAJA_INLINE static auto get_empty_forall_param_pack(){
-    static ForallParamPack<> p;
-    return p;
-  }
+  static constexpr size_t param_tup_sz = camp::tuple_size<Base>::value;
+  using params_seq                     = camp::make_idx_seq_t<param_tup_sz>;
 
-  namespace detail {
-    // all_true trick to perform variadic expansion in static asserts.
-    // https://stackoverflow.com/questions/36933176/how-do-you-static-assert-the-values-in-a-parameter-pack-of-a-variadic-template
-    template<bool...> struct bool_pack;
-    template<bool... bs>
-    using all_true = std::is_same<bool_pack<bs..., true>, bool_pack<true, bs...>>;
+private:
+  // Init
+  template<typename EXEC_POL, camp::idx_t... Seq, typename... Args>
+  static constexpr void detail_init(EXEC_POL,
+                                    camp::idx_seq<Seq...>,
+                                    ForallParamPack& f_params,
+                                    Args&&... args)
+  {
+    CAMP_EXPAND(expt::detail::init<EXEC_POL>(camp::get<Seq>(f_params.param_tup),
+                                             std::forward<Args>(args)...));
+  }
 
-    template<typename Base, typename... Ts>
-    using check_types_derive_base = all_true<std::is_convertible<Ts, Base>::value...>;
-  } // namespace detail
+  // Combine
+  template<typename EXEC_POL, camp::idx_t... Seq>
+  RAJA_HOST_DEVICE static constexpr void detail_combine(
+      EXEC_POL,
+      camp::idx_seq<Seq...>,
+      ForallParamPack& out,
+      const ForallParamPack& in)
+  {
+    CAMP_EXPAND(detail::combine<EXEC_POL>(camp::get<Seq>(out.param_tup),
+                                          camp::get<Seq>(in.param_tup)));
+  }
 
+  template<typename EXEC_POL, camp::idx_t... Seq>
+  RAJA_HOST_DEVICE static constexpr void detail_combine(
+      EXEC_POL,
+      camp::idx_seq<Seq...>,
+      ForallParamPack& f_params)
+  {
+    CAMP_EXPAND(detail::combine<EXEC_POL>(camp::get<Seq>(f_params.param_tup)));
+  }
 
-  template<typename... Ts>
-  constexpr auto make_forall_param_pack_from_tuple(camp::tuple<Ts...>&& tuple) {
-    static_assert(detail::check_types_derive_base<detail::ForallParamBase, camp::decay<Ts>...>::value,
-        "Forall optional arguments do not derive ForallParamBase. Please see Reducer, ReducerLoc and KernelName for examples.") ;
-    return ForallParamPack<camp::decay<Ts>...>(std::move(tuple));
+  // Resolve
+  template<typename EXEC_POL, camp::idx_t... Seq, typename... Args>
+  static constexpr void detail_resolve(EXEC_POL,
+                                       camp::idx_seq<Seq...>,
+                                       ForallParamPack& f_params,
+                                       Args&&... args)
+  {
+    CAMP_EXPAND(detail::resolve<EXEC_POL>(camp::get<Seq>(f_params.param_tup),
+                                          std::forward<Args>(args)...));
   }
 
-  
+  // Used to construct the argument TYPES that will be invoked with the lambda.
+  template<typename null_t = camp::nil>
+  static constexpr auto LAMBDA_ARG_TUP_T()
+  {
+    return camp::tuple<> {};
+  };
 
-  namespace detail {
-    // Maybe we should do a lot of these with structs...
-    template<camp::idx_t... Seq, typename TupleType>
-    constexpr auto tuple_from_seq (const camp::idx_seq<Seq...>&, TupleType&& tuple){
-      return camp::forward_as_tuple( camp::get< Seq >(std::forward<TupleType>(tuple))... );
-    };
+  template<typename null_t = camp::nil, typename First>
+  static constexpr auto LAMBDA_ARG_TUP_T()
+  {
+    return typename First::ARG_TUP_T();
+  };
 
-    template<typename... Ts>
-    constexpr auto strip_last_elem(camp::tuple<Ts...>&& tuple){
-      return tuple_from_seq(camp::make_idx_seq_t<sizeof...(Ts)-1>{},std::move(tuple));
-    };
-  } // namespace detail
+  template<typename null_t = camp::nil,
+           typename First,
+           typename Second,
+           typename... Rest>
+  static constexpr auto LAMBDA_ARG_TUP_T()
+  {
+    return camp::tuple_cat_pair(typename First::ARG_TUP_T(),
+                                LAMBDA_ARG_TUP_T<camp::nil, Second, Rest...>());
+  };
 
+  using lambda_arg_tuple_t = decltype(LAMBDA_ARG_TUP_T<camp::nil, Params...>());
 
-  // Make a tuple of the param pack except the final element...
-  template<typename... Args>
-  constexpr auto make_forall_param_pack(Args&&... args){
-    // We assume the last element of the pack is the lambda so we need to strip it from the list.
-    auto stripped_arg_tuple = detail::strip_last_elem( camp::forward_as_tuple(std::forward<Args>(args)...) ); 
-    return make_forall_param_pack_from_tuple(std::move(stripped_arg_tuple));
+  // Use the size of param_tup to generate the argument list.
+  RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num<0>)
+  {
+    return camp::make_tuple();
   }
-  //===========================================================================
-
-
-
-  //===========================================================================
-  //
-  //
-  // Callable should be the last argument in the param pack, just extract it...
-  //
-  //
-  template<typename... Args>
-  constexpr auto&& get_lambda(Args&&... args){
-    return camp::get<sizeof...(Args)-1>( camp::forward_as_tuple(std::forward<Args>(args)...) );
-  } 
-  //===========================================================================
-
-
-
-  //===========================================================================
-  //
-  //
-  // Checking expected argument list against the assumed lambda.
-  //
-  //
-  namespace detail {
-
-    // 
-    //
-    // Lambda traits Utilities
-    // 
-    //
-    template<class F>
-    struct lambda_traits;
-
-    template<class R, class C, class First, class... Rest>
-    struct lambda_traits<R (C::*)(First, Rest...)>
-    {  // non-const specialization
-      using arg_type = First; 
-    };
-    template<class R, class C, class First, class... Rest>
-    struct lambda_traits<R (C::*)(First, Rest...) const>
-    {  // const specialization
-      using arg_type = First; 
-    };
-
-    template<class T>
-    typename lambda_traits<T>::arg_type* lambda_arg_helper(T);
-
-
-    // 
-    //
-    // List manipulation Utilities
-    // 
-    //
-    template<typename... Ts>
-    constexpr auto list_remove_pointer(const camp::list<Ts...>&){
-      return camp::list<camp::decay<typename std::remove_pointer<Ts>::type>...>{};
-    }
-    
-    template<typename... Ts>
-    constexpr auto list_add_lvalue_ref(const camp::list<Ts...>&){
-      return camp::list<typename std::add_lvalue_reference<Ts>::type...>{};
-    }
-
-    template<typename... Ts>
-    constexpr auto tuple_to_list(const camp::tuple<Ts...>&) {
-      return camp::list<Ts...>{};
-    }
-
-    // TODO : Change to std::is_invocable at c++17
-    template <typename F, typename... Args>
-    struct is_invocable :
-      std::is_constructible<
-        std::function<void(Args ...)>,
-        std::reference_wrapper<typename std::remove_reference<F>::type>
-      >{};
-
-    template<class...>
-    using void_t = void;
-
-    template<class F, class=void>
-    struct has_empty_op : std::false_type{};
-
-    template<class F>
-    struct has_empty_op<F, void_t<decltype(std::declval<F::operator()>)>> : std::true_type{};
-
-    template<class F>
-    struct get_lambda_index_type {
-      typedef typename std::remove_pointer<
-                decltype(lambda_arg_helper(
-                      &camp::decay<F>::operator())
-                )
-              >::type type;
-    };
-
-    // If LAMBDA::operator() is not available this probably isn't a generic lambda and we can't extract and check args.
-    template<typename LAMBDA, typename... EXPECTED_ARGS>
-    constexpr concepts::enable_if<concepts::negate<has_empty_op<LAMBDA>>> check_invocable(LAMBDA&&, const camp::list<EXPECTED_ARGS...>&) {}
-
-    template<typename LAMBDA, typename... EXPECTED_ARGS>
-    constexpr concepts::enable_if<has_empty_op<LAMBDA>> check_invocable(LAMBDA&&, const camp::list<EXPECTED_ARGS...>&) {
-#if !defined(RAJA_ENABLE_HIP)
-      static_assert(is_invocable<LAMBDA, typename get_lambda_index_type<LAMBDA>::type, EXPECTED_ARGS...>::value, "LAMBDA Not invocable w/ EXPECTED_ARGS. Ordering and types must match between RAJA::expt::Reduce() and ValOp arguments."); 
-#endif
-    }
-
-  } // namespace detail
 
+  RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num<1>)
+  {
+    return camp::get<param_tup_sz - 1>(param_tup).get_lambda_arg_tup();
+  }
 
-  template<typename Lambda, typename ForallParams>
-  constexpr 
-  void
-  check_forall_optional_args(Lambda&& l, ForallParams& fpp) {
+  template<camp::idx_t N>
+  RAJA_HOST_DEVICE constexpr auto LAMBDA_ARG_TUP_V(camp::num<N>)
+  {
+    return camp::tuple_cat_pair(
+        camp::get<param_tup_sz - N>(param_tup).get_lambda_arg_tup(),
+        LAMBDA_ARG_TUP_V(camp::num<N - 1>()));
+  }
 
-    using expected_arg_type_list = decltype( detail::list_add_lvalue_ref(
-                                               detail::list_remove_pointer(
-                                                 detail::tuple_to_list(
-                                                   fpp.lambda_args()
-                                                 )
-                                               )
-                                            ));
+public:
+  ForallParamPack() {}
 
-    detail::check_invocable(std::forward<Lambda>(l), expected_arg_type_list{});
+  RAJA_HOST_DEVICE constexpr lambda_arg_tuple_t lambda_args()
+  {
+    return LAMBDA_ARG_TUP_V(camp::num<sizeof...(Params)>());
   }
-  //===========================================================================
-  
 
+  using lambda_arg_seq =
+      camp::make_idx_seq_t<camp::tuple_size<lambda_arg_tuple_t>::value>;
 
-  //===========================================================================
-  //
-  //
-  // Type trailts for SFINAE work.
-  //
-  //
-  namespace type_traits
+  template<typename... Ts>
+  ForallParamPack(camp::tuple<Ts...>&& t) : param_tup(std::move(t)) {};
+};  // struct ForallParamPack
+
+//===========================================================================
+//
+//
+// ParamMultiplexer is how we hook into the individual calls within forall_impl.
+//
+//
+struct ParamMultiplexer
+{
+  template<typename EXEC_POL,
+           typename... Params,
+           typename... Args,
+           typename FP = ForallParamPack<Params...>>
+  static void constexpr init(ForallParamPack<Params...>& f_params,
+                             Args&&... args)
   {
-    template <typename T> struct is_ForallParamPack : std::false_type {};
-    template <typename... Args> struct is_ForallParamPack<ForallParamPack<Args...>> : std::true_type {};
+    FP::detail_init(EXEC_POL(), typename FP::params_seq(), f_params,
+                    std::forward<Args>(args)...);
+  }
 
-    template <typename T> struct is_ForallParamPack_empty : std::true_type {};
-    template <typename First, typename... Rest> struct is_ForallParamPack_empty<ForallParamPack<First, Rest...>> : std::false_type {};
-    template <> struct is_ForallParamPack_empty<ForallParamPack<>> : std::true_type {};
+  template<typename EXEC_POL,
+           typename... Params,
+           typename... Args,
+           typename FP = ForallParamPack<Params...>>
+  static void constexpr combine(ForallParamPack<Params...>& f_params,
+                                Args&&... args)
+  {
+    FP::detail_combine(EXEC_POL(), typename FP::params_seq(), f_params,
+                       std::forward<Args>(args)...);
   }
-  //===========================================================================
-
-
-
-  //===========================================================================
-  //
-  //
-  // Invoke Forall with Params.
-  //
-  //
-  namespace detail {
-    template<camp::idx_t Idx, typename FP>
-    RAJA_HOST_DEVICE
-    constexpr
-    auto get_lambda_args(FP& fpp)
-        -> decltype(  *camp::get<Idx>( fpp.lambda_args() )  ) {
-      return (  *camp::get<Idx>( fpp.lambda_args() )  );
-    }
-
-    CAMP_SUPPRESS_HD_WARN
-    template <typename Fn,
-              camp::idx_t... Sequence,
-              typename Params,
-              typename... Ts>
-    RAJA_HOST_DEVICE constexpr auto invoke_with_order(Params&& params,
-                                                      Fn&& f,
-                                                      camp::idx_seq<Sequence...>,
-                                                      Ts&&... extra)
-    {
-      return f(std::forward<Ts...>(extra...), ( get_lambda_args<Sequence>(params) )...);
-    }
-  } // namespace detail
-
-  //CAMP_SUPPRESS_HD_WARN
-  template <typename Params, typename Fn, typename... Ts>
-  RAJA_HOST_DEVICE constexpr auto invoke_body(Params&& params, Fn&& f, Ts&&... extra)
+
+  template<typename EXEC_POL,
+           typename... Params,
+           typename... Args,
+           typename FP = ForallParamPack<Params...>>
+  static void constexpr resolve(ForallParamPack<Params...>& f_params,
+                                Args&&... args)
   {
-    return detail::invoke_with_order(
-        camp::forward<Params>(params),
-        camp::forward<Fn>(f),
-        typename camp::decay<Params>::lambda_arg_seq(),
-        camp::forward<Ts...>(extra)...);
+    FP::detail_resolve(EXEC_POL(), typename FP::params_seq(), f_params,
+                       std::forward<Args>(args)...);
   }
-  //===========================================================================
+};
+
+//===========================================================================
+
+
+//===========================================================================
+//
+//
+// ForallParamPack generators.
+//
+//
+RAJA_INLINE static auto get_empty_forall_param_pack()
+{
+  static ForallParamPack<> p;
+  return p;
+}
+
+namespace detail
+{
+// all_true trick to perform variadic expansion in static asserts.
+// https://stackoverflow.com/questions/36933176/how-do-you-static-assert-the-values-in-a-parameter-pack-of-a-variadic-template
+template<bool...>
+struct bool_pack;
+template<bool... bs>
+using all_true = std::is_same<bool_pack<bs..., true>, bool_pack<true, bs...>>;
+
+template<typename Base, typename... Ts>
+using check_types_derive_base =
+    all_true<std::is_convertible<Ts, Base>::value...>;
+}  // namespace detail
+
+template<typename... Ts>
+constexpr auto make_forall_param_pack_from_tuple(camp::tuple<Ts...>&& tuple)
+{
+  static_assert(detail::check_types_derive_base<detail::ForallParamBase,
+                                                camp::decay<Ts>...>::value,
+                "Forall optional arguments do not derive ForallParamBase. "
+                "Please see Reducer, ReducerLoc and KernelName for examples.");
+  return ForallParamPack<camp::decay<Ts>...>(std::move(tuple));
+}
+
+namespace detail
+{
+// Maybe we should do a lot of these with structs...
+template<camp::idx_t... Seq, typename TupleType>
+constexpr auto tuple_from_seq(const camp::idx_seq<Seq...>&, TupleType&& tuple)
+{
+  return camp::forward_as_tuple(
+      camp::get<Seq>(std::forward<TupleType>(tuple))...);
+};
+
+template<typename... Ts>
+constexpr auto strip_last_elem(camp::tuple<Ts...>&& tuple)
+{
+  return tuple_from_seq(camp::make_idx_seq_t<sizeof...(Ts) - 1> {},
+                        std::move(tuple));
+};
+}  // namespace detail
+
+// Make a tuple of the param pack except the final element...
+template<typename... Args>
+constexpr auto make_forall_param_pack(Args&&... args)
+{
+  // We assume the last element of the pack is the lambda so we need to strip it
+  // from the list.
+  auto stripped_arg_tuple = detail::strip_last_elem(
+      camp::forward_as_tuple(std::forward<Args>(args)...));
+  return make_forall_param_pack_from_tuple(std::move(stripped_arg_tuple));
+}
+
+//===========================================================================
+
+
+//===========================================================================
+//
+//
+// Callable should be the last argument in the param pack, just extract it...
+//
+//
+template<typename... Args>
+constexpr auto&& get_lambda(Args&&... args)
+{
+  return camp::get<sizeof...(Args) - 1>(
+      camp::forward_as_tuple(std::forward<Args>(args)...));
+}
+
+//===========================================================================
+
+
+//===========================================================================
+//
+//
+// Checking expected argument list against the assumed lambda.
+//
+//
+namespace detail
+{
+
+//
+//
+// Lambda traits Utilities
+//
+//
+template<class F>
+struct lambda_traits;
+
+template<class R, class C, class First, class... Rest>
+struct lambda_traits<R (C::*)(First, Rest...)>
+{  // non-const specialization
+  using arg_type = First;
+};
+
+template<class R, class C, class First, class... Rest>
+struct lambda_traits<R (C::*)(First, Rest...) const>
+{  // const specialization
+  using arg_type = First;
+};
+
+template<class T>
+typename lambda_traits<T>::arg_type* lambda_arg_helper(T);
+
+//
+//
+// List manipulation Utilities
+//
+//
+template<typename... Ts>
+constexpr auto list_remove_pointer(const camp::list<Ts...>&)
+{
+  return camp::list<camp::decay<typename std::remove_pointer<Ts>::type>...> {};
+}
+
+template<typename... Ts>
+constexpr auto list_add_lvalue_ref(const camp::list<Ts...>&)
+{
+  return camp::list<typename std::add_lvalue_reference<Ts>::type...> {};
+}
+
+template<typename... Ts>
+constexpr auto tuple_to_list(const camp::tuple<Ts...>&)
+{
+  return camp::list<Ts...> {};
+}
+
+// TODO : Change to std::is_invocable at c++17
+template<typename F, typename... Args>
+struct is_invocable
+    : std::is_constructible<
+          std::function<void(Args...)>,
+          std::reference_wrapper<typename std::remove_reference<F>::type>>
+{};
+
+template<class...>
+using void_t = void;
+
+template<class F, class = void>
+struct has_empty_op : std::false_type
+{};
+
+template<class F>
+struct has_empty_op<F, void_t<decltype(std::declval<F::operator()>)>>
+    : std::true_type
+{};
+
+template<class F>
+struct get_lambda_index_type
+{
+  typedef typename std::remove_pointer<decltype(lambda_arg_helper(
+      &camp::decay<F>::operator()))>::type type;
+};
+
+// If LAMBDA::operator() is not available this probably isn't a generic lambda
+// and we can't extract and check args.
+template<typename LAMBDA, typename... EXPECTED_ARGS>
+constexpr concepts::enable_if<concepts::negate<has_empty_op<LAMBDA>>>
+check_invocable(LAMBDA&&, const camp::list<EXPECTED_ARGS...>&)
+{}
+
+template<typename LAMBDA, typename... EXPECTED_ARGS>
+constexpr concepts::enable_if<has_empty_op<LAMBDA>> check_invocable(
+    LAMBDA&&,
+    const camp::list<EXPECTED_ARGS...>&)
+{
+#if !defined(RAJA_ENABLE_HIP)
+  static_assert(
+      is_invocable<LAMBDA, typename get_lambda_index_type<LAMBDA>::type,
+                   EXPECTED_ARGS...>::value,
+      "LAMBDA Not invocable w/ EXPECTED_ARGS. Ordering and types must match "
+      "between RAJA::expt::Reduce() and ValOp arguments.");
+#endif
+}
+
+}  // namespace detail
+
+template<typename Lambda, typename ForallParams>
+constexpr void check_forall_optional_args(Lambda&& l, ForallParams& fpp)
+{
+
+  using expected_arg_type_list = decltype(detail::list_add_lvalue_ref(
+      detail::list_remove_pointer(detail::tuple_to_list(fpp.lambda_args()))));
+
+  detail::check_invocable(std::forward<Lambda>(l), expected_arg_type_list {});
+}
+
+//===========================================================================
+
+
+//===========================================================================
+//
+//
+// Type traits for SFINAE work.
+//
+//
+namespace type_traits
+{
+template<typename T>
+struct is_ForallParamPack : std::false_type
+{};
+
+template<typename... Args>
+struct is_ForallParamPack<ForallParamPack<Args...>> : std::true_type
+{};
+
+template<typename T>
+struct is_ForallParamPack_empty : std::true_type
+{};
+
+template<typename First, typename... Rest>
+struct is_ForallParamPack_empty<ForallParamPack<First, Rest...>>
+    : std::false_type
+{};
+
+template<>
+struct is_ForallParamPack_empty<ForallParamPack<>> : std::true_type
+{};
+}  // namespace type_traits
+
+//===========================================================================
+
+
+//===========================================================================
+//
+//
+// Invoke Forall with Params.
+//
+//
+namespace detail
+{
+template<camp::idx_t Idx, typename FP>
+RAJA_HOST_DEVICE constexpr auto get_lambda_args(FP& fpp)
+    -> decltype(*camp::get<Idx>(fpp.lambda_args()))
+{
+  return (*camp::get<Idx>(fpp.lambda_args()));
+}
+
+// Calls f with each extra argument forwarded on its own, followed by the
+// dereferenced lambda argument selected by every index in Sequence.
+CAMP_SUPPRESS_HD_WARN
+template<typename Fn, camp::idx_t... Sequence, typename Params, typename... Ts>
+RAJA_HOST_DEVICE constexpr auto invoke_with_order(
+    Params&& params, Fn&& f, camp::idx_seq<Sequence...>, Ts&&... extra)
+{
+  return f(std::forward<Ts>(extra)...,
+           (get_lambda_args<Sequence>(params))...);
+}
+}  // namespace detail
+
+// CAMP_SUPPRESS_HD_WARN
+// Calls f(extra..., lambda args of params), forwarding each extra by itself.
+template<typename Params, typename Fn, typename... Ts>
+RAJA_HOST_DEVICE constexpr auto
+invoke_body(Params&& params, Fn&& f, Ts&&... extra)
+{
+  return detail::invoke_with_order(
+      camp::forward<Params>(params), camp::forward<Fn>(f),
+      typename camp::decay<Params>::lambda_arg_seq(),
+      camp::forward<Ts>(extra)...);
+}
+
+//===========================================================================
 
-} //  namespace expt
-} //  namespace RAJA
+}  //  namespace expt
+}  //  namespace RAJA
 
-#endif //  FORALL_PARAM_HPP
+#endif  //  FORALL_PARAM_HPP
diff --git a/include/RAJA/pattern/params/kernel_name.hpp b/include/RAJA/pattern/params/kernel_name.hpp
index e768d8dd59..2d26436c94 100644
--- a/include/RAJA/pattern/params/kernel_name.hpp
+++ b/include/RAJA/pattern/params/kernel_name.hpp
@@ -10,23 +10,22 @@ namespace expt
 namespace detail
 {
 
-  struct KernelName : public ForallParamBase {
-    RAJA_HOST_DEVICE KernelName() {}
-    KernelName(const char* name_in) : name(name_in) {}
-    const char* name;
-  };
+struct KernelName : public ForallParamBase
+{
+  RAJA_HOST_DEVICE KernelName() {}
 
-} // namespace detail
+  KernelName(const char* name_in) : name(name_in) {}
 
-inline auto KernelName(const char * n)
-{
-  return detail::KernelName(n);
-}
-} // namespace expt
+  const char* name;
+};
+
+}  // namespace detail
 
+inline auto KernelName(const char* n) { return detail::KernelName(n); }
+}  // namespace expt
 
-} //  namespace RAJA
 
+}  //  namespace RAJA
 
 
-#endif // KERNEL_NAME_HPP
+#endif  // KERNEL_NAME_HPP
diff --git a/include/RAJA/pattern/params/params_base.hpp b/include/RAJA/pattern/params/params_base.hpp
index 98380f6ffc..27f2adec7a 100644
--- a/include/RAJA/pattern/params/params_base.hpp
+++ b/include/RAJA/pattern/params/params_base.hpp
@@ -1,135 +1,278 @@
 #ifndef RAJA_PARAMS_BASE
 #define RAJA_PARAMS_BASE
 
-
 namespace RAJA
 {
 namespace expt
 {
 
-  template<typename T, typename IndexType = RAJA::Index_type>
-  struct ValLoc {
-    using index_type = IndexType;
-    using value_type = T;
-
-    ValLoc() = default;
-    RAJA_HOST_DEVICE constexpr explicit ValLoc(value_type v) : val(v) {}
-    RAJA_HOST_DEVICE constexpr ValLoc(value_type v, index_type l) : val(v), loc(l) {}
-
-    ValLoc(ValLoc const &) = default;
-    ValLoc(ValLoc &&) = default;
-    ValLoc& operator=(ValLoc const &) = default;
-    ValLoc& operator=(ValLoc &&) = default;
-
-    RAJA_HOST_DEVICE constexpr bool operator<(const ValLoc& rhs) const { return val < rhs.val; }
-    RAJA_HOST_DEVICE constexpr bool operator>(const ValLoc& rhs) const { return val > rhs.val; }
-
-    RAJA_HOST_DEVICE constexpr const value_type& getVal() const {return val;}
-    RAJA_HOST_DEVICE constexpr const index_type& getLoc() const {return loc;}
-
-    RAJA_HOST_DEVICE void set(T inval, IndexType inindex) {val = inval; loc = inindex;}
-    RAJA_HOST_DEVICE void setVal(T inval) {val = inval;}
-    RAJA_HOST_DEVICE void setLoc(IndexType inindex) {loc = inindex;}
-
-    value_type val;
-    index_type loc = -1;
-  };
-
-  template<typename T, template <typename, typename, typename> class Op>
-  struct ValOp {
-    using value_type = T;
-    using op_type = Op<T,T,T>;
-
-    ValOp() = default;
-    RAJA_HOST_DEVICE constexpr explicit ValOp(value_type v) : val(v) {}
-
-    ValOp(ValOp const &) = default;
-    ValOp(ValOp &&) = default;
-    ValOp& operator=(ValOp const &) = default;
-    ValOp& operator=(ValOp &&) = default;
-
-    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::minimum<T,T,T>>::value> * = nullptr>
-    RAJA_HOST_DEVICE constexpr ValOp & min(value_type v) { if (v < val) { val = v; } return *this; }
-    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::maximum<T,T,T>>::value> * = nullptr>
-    RAJA_HOST_DEVICE constexpr ValOp & max(value_type v) { if (v > val) { val = v; } return *this; }
-
-    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::plus<T,T,T>>::value> * = nullptr>
-    RAJA_HOST_DEVICE constexpr ValOp & operator+=(const value_type& rhs) { val += rhs; return *this; }
-
-    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::bit_and<T,T,T>>::value> * = nullptr>
-    RAJA_HOST_DEVICE constexpr ValOp & operator&=(const value_type& rhs) { val &= rhs; return *this; }
-
-    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::bit_or<T,T,T>>::value> * = nullptr>
-    RAJA_HOST_DEVICE constexpr ValOp & operator|=(const value_type& rhs) { val |= rhs; return *this; }
-
-    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::bit_and<T,T,T>>::value> * = nullptr>
-    RAJA_HOST_DEVICE ValOp & operator&=(value_type& rhs) { val &= rhs; return *this; }
+template<typename T, typename IndexType = RAJA::Index_type>
+struct ValLoc
+{
+  using index_type = IndexType;
+  using value_type = T;
 
-    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::bit_or<T,T,T>>::value> * = nullptr>
-    RAJA_HOST_DEVICE ValOp & operator|=(value_type& rhs) { val |= rhs; return *this; }
+  ValLoc() = default;
 
-    RAJA_HOST_DEVICE constexpr bool operator<(const ValOp& rhs) const { val < rhs.val; return *this; }
-    RAJA_HOST_DEVICE constexpr bool operator>(const ValOp& rhs) const { val > rhs.val; return *this; }
+  RAJA_HOST_DEVICE constexpr explicit ValLoc(value_type v) : val(v) {}
 
-    value_type val = op_type::identity();
-  };
+  RAJA_HOST_DEVICE constexpr ValLoc(value_type v, index_type l) : val(v), loc(l)
+  {}
 
-  template<typename T, typename IndexType, template <typename, typename, typename> class Op>
-  struct ValOp <ValLoc<T,IndexType>, Op> {
-    using index_type = IndexType;
-    using value_type = ValLoc<T,index_type>;
-    using op_type = Op<value_type,value_type,value_type>;
-    using valloc_value_type = typename value_type::value_type;
-    using valloc_index_type = typename value_type::index_type;
+  ValLoc(ValLoc const&)            = default;
+  ValLoc(ValLoc&&)                 = default;
+  ValLoc& operator=(ValLoc const&) = default;
+  ValLoc& operator=(ValLoc&&)      = default;
 
-    ValOp() = default;
-    RAJA_HOST_DEVICE constexpr explicit ValOp(value_type v) : val(v) {}
-    RAJA_HOST_DEVICE constexpr ValOp(valloc_value_type v, valloc_index_type l) : val(v, l) {}
+  RAJA_HOST_DEVICE constexpr bool operator<(const ValLoc& rhs) const
+  {
+    return val < rhs.val;
+  }
 
-    ValOp(ValOp const &) = default;
-    ValOp(ValOp &&) = default;
-    ValOp& operator=(ValOp const &) = default;
-    ValOp& operator=(ValOp &&) = default;
+  RAJA_HOST_DEVICE constexpr bool operator>(const ValLoc& rhs) const
+  {
+    return val > rhs.val;
+  }
 
-    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::minimum<value_type,value_type,value_type>>::value> * = nullptr>
-    RAJA_HOST_DEVICE constexpr ValOp & min(value_type v) { if (v < val) { val = v; } return *this; }
+  RAJA_HOST_DEVICE constexpr const value_type& getVal() const { return val; }
 
-    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::maximum<value_type,value_type,value_type>>::value> * = nullptr>
-    RAJA_HOST_DEVICE constexpr ValOp & max(value_type v) { if (v > val) { val = v; } return *this; }
+  RAJA_HOST_DEVICE constexpr const index_type& getLoc() const { return loc; }
 
-    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::minimum<value_type,value_type,value_type>>::value> * = nullptr>
-    RAJA_HOST_DEVICE constexpr ValOp & minloc(valloc_value_type v, valloc_index_type l) { return min(value_type(v,l)); }
+  RAJA_HOST_DEVICE void set(T inval, IndexType inindex)
+  {
+    val = inval;
+    loc = inindex;
+  }
 
-    template <typename U = op_type, std::enable_if_t<std::is_same<U, RAJA::operators::maximum<value_type,value_type,value_type>>::value> * = nullptr>
-    RAJA_HOST_DEVICE constexpr ValOp & maxloc(valloc_value_type v, valloc_index_type l) { return max(value_type(v,l)); }
+  RAJA_HOST_DEVICE void setVal(T inval) { val = inval; }
 
-    RAJA_HOST_DEVICE constexpr bool operator<(const ValOp& rhs) const { return val < rhs.val; }
-    RAJA_HOST_DEVICE constexpr bool operator>(const ValOp& rhs) const { return val > rhs.val; }
+  RAJA_HOST_DEVICE void setLoc(IndexType inindex) { loc = inindex; }
 
-    value_type val = op_type::identity();
-  };
+  value_type val;
+  index_type loc = -1;
+};
 
-  template<typename T, typename IndexType, template <typename, typename, typename> class Op>
-  using ValLocOp = ValOp<ValLoc<T, IndexType>, Op>;
+template<typename T, template<typename, typename, typename> class Op>
+struct ValOp
+{
+  using value_type = T;
+  using op_type    = Op<T, T, T>;
+
+  ValOp() = default;
+
+  RAJA_HOST_DEVICE constexpr explicit ValOp(value_type v) : val(v) {}
+
+  ValOp(ValOp const&)            = default;
+  ValOp(ValOp&&)                 = default;
+  ValOp& operator=(ValOp const&) = default;
+  ValOp& operator=(ValOp&&)      = default;
+
+  template<
+      typename U = op_type,
+      std::enable_if_t<
+          std::is_same<U, RAJA::operators::minimum<T, T, T>>::value>* = nullptr>
+  RAJA_HOST_DEVICE constexpr ValOp& min(value_type v)
+  {
+    if (v < val)
+    {
+      val = v;
+    }
+    return *this;
+  }
+
+  template<
+      typename U = op_type,
+      std::enable_if_t<
+          std::is_same<U, RAJA::operators::maximum<T, T, T>>::value>* = nullptr>
+  RAJA_HOST_DEVICE constexpr ValOp& max(value_type v)
+  {
+    if (v > val)
+    {
+      val = v;
+    }
+    return *this;
+  }
+
+  template<
+      typename U = op_type,
+      std::enable_if_t<
+          std::is_same<U, RAJA::operators::plus<T, T, T>>::value>* = nullptr>
+  RAJA_HOST_DEVICE constexpr ValOp& operator+=(const value_type& rhs)
+  {
+    val += rhs;
+    return *this;
+  }
+
+  template<
+      typename U = op_type,
+      std::enable_if_t<
+          std::is_same<U, RAJA::operators::bit_and<T, T, T>>::value>* = nullptr>
+  RAJA_HOST_DEVICE constexpr ValOp& operator&=(const value_type& rhs)
+  {
+    val &= rhs;
+    return *this;
+  }
+
+  template<
+      typename U = op_type,
+      std::enable_if_t<
+          std::is_same<U, RAJA::operators::bit_or<T, T, T>>::value>* = nullptr>
+  RAJA_HOST_DEVICE constexpr ValOp& operator|=(const value_type& rhs)
+  {
+    val |= rhs;
+    return *this;
+  }
+
+  template<
+      typename U = op_type,
+      std::enable_if_t<
+          std::is_same<U, RAJA::operators::bit_and<T, T, T>>::value>* = nullptr>
+  RAJA_HOST_DEVICE ValOp& operator&=(value_type& rhs)
+  {
+    val &= rhs;
+    return *this;
+  }
+
+  template<
+      typename U = op_type,
+      std::enable_if_t<
+          std::is_same<U, RAJA::operators::bit_or<T, T, T>>::value>* = nullptr>
+  RAJA_HOST_DEVICE ValOp& operator|=(value_type& rhs)
+  {
+    val |= rhs;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE constexpr bool operator<(const ValOp& rhs) const
+  {
+    // Order by the wrapped value; return the comparison result, not *this.
+    return val < rhs.val;
+  }
+
+  RAJA_HOST_DEVICE constexpr bool operator>(const ValOp& rhs) const
+  {
+    // Order by the wrapped value; return the comparison result, not *this.
+    return val > rhs.val;
+  }
+
+  value_type val = op_type::identity();
+};
+
+template<typename T,
+         typename IndexType,
+         template<typename, typename, typename>
+         class Op>
+struct ValOp<ValLoc<T, IndexType>, Op>
+{
+  using index_type        = IndexType;
+  using value_type        = ValLoc<T, index_type>;
+  using op_type           = Op<value_type, value_type, value_type>;
+  using valloc_value_type = typename value_type::value_type;
+  using valloc_index_type = typename value_type::index_type;
+
+  ValOp() = default;
+
+  RAJA_HOST_DEVICE constexpr explicit ValOp(value_type v) : val(v) {}
+
+  RAJA_HOST_DEVICE constexpr ValOp(valloc_value_type v, valloc_index_type l)
+      : val(v, l)
+  {}
+
+  ValOp(ValOp const&)            = default;
+  ValOp(ValOp&&)                 = default;
+  ValOp& operator=(ValOp const&) = default;
+  ValOp& operator=(ValOp&&)      = default;
+
+  template<typename U                   = op_type,
+           std::enable_if_t<std::is_same<
+               U,
+               RAJA::operators::minimum<value_type, value_type, value_type>>::
+                                value>* = nullptr>
+  RAJA_HOST_DEVICE constexpr ValOp& min(value_type v)
+  {
+    if (v < val)
+    {
+      val = v;
+    }
+    return *this;
+  }
+
+  template<typename U                   = op_type,
+           std::enable_if_t<std::is_same<
+               U,
+               RAJA::operators::maximum<value_type, value_type, value_type>>::
+                                value>* = nullptr>
+  RAJA_HOST_DEVICE constexpr ValOp& max(value_type v)
+  {
+    if (v > val)
+    {
+      val = v;
+    }
+    return *this;
+  }
+
+  template<typename U                   = op_type,
+           std::enable_if_t<std::is_same<
+               U,
+               RAJA::operators::minimum<value_type, value_type, value_type>>::
+                                value>* = nullptr>
+  RAJA_HOST_DEVICE constexpr ValOp& minloc(valloc_value_type v,
+                                           valloc_index_type l)
+  {
+    return min(value_type(v, l));
+  }
+
+  template<typename U                   = op_type,
+           std::enable_if_t<std::is_same<
+               U,
+               RAJA::operators::maximum<value_type, value_type, value_type>>::
+                                value>* = nullptr>
+  RAJA_HOST_DEVICE constexpr ValOp& maxloc(valloc_value_type v,
+                                           valloc_index_type l)
+  {
+    return max(value_type(v, l));
+  }
+
+  RAJA_HOST_DEVICE constexpr bool operator<(const ValOp& rhs) const
+  {
+    return val < rhs.val;
+  }
+
+  RAJA_HOST_DEVICE constexpr bool operator>(const ValOp& rhs) const
+  {
+    return val > rhs.val;
+  }
+
+  value_type val = op_type::identity();
+};
+
+template<typename T,
+         typename IndexType,
+         template<typename, typename, typename>
+         class Op>
+using ValLocOp = ValOp<ValLoc<T, IndexType>, Op>;
 
 namespace detail
 {
 
-  struct ForallParamBase {
+struct ForallParamBase
+{
+
+  // Some of this can be made virtual in c++20, for now must be defined in each
+  // child class if any arguments to the forall lambda are needed (e.g.
+  // KernelName is excluded.)
+  using ARG_TUP_T  = camp::tuple<>;
+  using ARG_LIST_T = typename ARG_TUP_T::TList;
+
+  RAJA_HOST_DEVICE ARG_TUP_T get_lambda_arg_tup() { return camp::make_tuple(); }
 
-    // Some of this can be made virtual in c++20, for now must be defined in each child class
-    // if any arguments to the forall lambda are needed (e.g. KernelName is excluded.)
-    using ARG_TUP_T = camp::tuple<>; 
-    using ARG_LIST_T = typename ARG_TUP_T::TList;
-    RAJA_HOST_DEVICE ARG_TUP_T get_lambda_arg_tup() { return camp::make_tuple(); }
-    static constexpr size_t num_lambda_args = camp::tuple_size<ARG_TUP_T>::value;
-  
-  };
+  static constexpr size_t num_lambda_args = camp::tuple_size<ARG_TUP_T>::value;
+};
 
-} // namespace detail
+}  // namespace detail
 
-} // namespace expt
+}  // namespace expt
 
-} //  namespace RAJA
+}  //  namespace RAJA
 
-#endif //  RAJA_PARAMS_BASE
+#endif  //  RAJA_PARAMS_BASE
diff --git a/include/RAJA/pattern/params/reducer.hpp b/include/RAJA/pattern/params/reducer.hpp
index 78b6d7714d..bb8595f621 100644
--- a/include/RAJA/pattern/params/reducer.hpp
+++ b/include/RAJA/pattern/params/reducer.hpp
@@ -18,21 +18,25 @@ namespace RAJA
 namespace operators
 {
 
-template <typename T, typename IndexType>
-struct limits<RAJA::expt::ValLoc<T, IndexType>> {
-  RAJA_INLINE RAJA_HOST_DEVICE static constexpr RAJA::expt::ValLoc<T, IndexType> min()
+template<typename T, typename IndexType>
+struct limits<RAJA::expt::ValLoc<T, IndexType>>
+{
+  RAJA_INLINE RAJA_HOST_DEVICE static constexpr RAJA::expt::ValLoc<T, IndexType>
+  min()
   {
     return RAJA::expt::ValLoc<T, IndexType>(RAJA::operators::limits<T>::min());
   }
-  RAJA_INLINE RAJA_HOST_DEVICE static constexpr RAJA::expt::ValLoc<T, IndexType> max()
+
+  RAJA_INLINE RAJA_HOST_DEVICE static constexpr RAJA::expt::ValLoc<T, IndexType>
+  max()
   {
     return RAJA::expt::ValLoc<T, IndexType>(RAJA::operators::limits<T>::max());
   }
 };
 
-} //  namespace operators
+}  //  namespace operators
 
-} //  namespace RAJA
+}  //  namespace RAJA
 
 namespace RAJA
 {
@@ -43,159 +47,201 @@ namespace detail
 {
 
 #if defined(RAJA_CUDA_ACTIVE)
-  using device_mem_pool_t = RAJA::cuda::device_mempool_type;
+using device_mem_pool_t = RAJA::cuda::device_mempool_type;
 #elif defined(RAJA_HIP_ACTIVE)
-  using device_mem_pool_t = RAJA::hip::device_mempool_type;
+using device_mem_pool_t = RAJA::hip::device_mempool_type;
 #elif defined(RAJA_SYCL_ACTIVE)
-  using device_mem_pool_t = RAJA::sycl::device_mempool_type;
+using device_mem_pool_t = RAJA::sycl::device_mempool_type;
 #endif
 
-  //
-  //
-  // Basic Reducer
-  //
-  //
-
-  // Basic data type Reducer
-  // T must be a basic data type
-  // VOp must be ValOp<T, Op>
-  template <typename Op, typename T, typename VOp>
-  struct Reducer : public ForallParamBase {
-    using op = Op;
-    using value_type = T; // This is a basic data type
-
-    Reducer() = default;
-
-    // Basic data type constructor
-    RAJA_HOST_DEVICE Reducer(value_type *target_in) : m_valop(VOp{}), target(target_in){}
-
-    Reducer(Reducer const &) = default;
-    Reducer(Reducer &&) = default;
-    Reducer& operator=(Reducer const &) = default;
-    Reducer& operator=(Reducer &&) = default;
-
-    // Internal ValOp object that is used within RAJA::forall/launch
-    VOp m_valop = VOp{};
-
-    // Points to the user specified result variable
-    value_type *target = nullptr;
-
-    // combineTarget() performs the final op on the target data and location in resolve()
-    RAJA_HOST_DEVICE void combineTarget(value_type in)
-    {
-      value_type temp = op{}(*target, in);
-      *target = temp;
-    }
-
-    RAJA_HOST_DEVICE
-    value_type &
-    getVal() { return m_valop.val; }
-
-#if defined(RAJA_CUDA_ACTIVE) || defined(RAJA_HIP_ACTIVE) || defined(RAJA_SYCL_ACTIVE)
-    // Device related attributes.
-    value_type * devicetarget = nullptr;
-    RAJA::detail::SoAPtr<value_type, device_mem_pool_t> device_mem;
-    unsigned int * device_count = nullptr;
+//
+//
+// Basic Reducer
+//
+//
+
+// Basic data type Reducer
+// T must be a basic data type
+// VOp must be ValOp<T, Op>
+template<typename Op, typename T, typename VOp>
+struct Reducer : public ForallParamBase
+{
+  using op         = Op;
+  using value_type = T;  // This is a basic data type
+
+  Reducer() = default;
+
+  // Basic data type constructor
+  RAJA_HOST_DEVICE Reducer(value_type* target_in)
+      : m_valop(VOp {}),
+        target(target_in)
+  {}
+
+  Reducer(Reducer const&)            = default;
+  Reducer(Reducer&&)                 = default;
+  Reducer& operator=(Reducer const&) = default;
+  Reducer& operator=(Reducer&&)      = default;
+
+  // Internal ValOp object that is used within RAJA::forall/launch
+  VOp m_valop = VOp {};
+
+  // Points to the user specified result variable
+  value_type* target = nullptr;
+
+  // combineTarget() performs the final op on the target data and location in
+  // resolve()
+  RAJA_HOST_DEVICE void combineTarget(value_type in)
+  {
+    value_type temp = op {}(*target, in);
+    *target         = temp;
+  }
+
+  RAJA_HOST_DEVICE
+  value_type& getVal() { return m_valop.val; }
+
+#if defined(RAJA_CUDA_ACTIVE) || defined(RAJA_HIP_ACTIVE) ||                   \
+    defined(RAJA_SYCL_ACTIVE)
+  // Device related attributes.
+  value_type* devicetarget = nullptr;
+  RAJA::detail::SoAPtr<value_type, device_mem_pool_t> device_mem;
+  unsigned int* device_count = nullptr;
 #endif
 
-    // These are types and parameters extracted from this struct, and given to the forall.
-    using ARG_TUP_T = camp::tuple<VOp*>;
-    RAJA_HOST_DEVICE ARG_TUP_T get_lambda_arg_tup() { return camp::make_tuple(&m_valop); }
-
-    using ARG_LIST_T = typename ARG_TUP_T::TList;
-    static constexpr size_t num_lambda_args = camp::tuple_size<ARG_TUP_T>::value ;
-  };
-
-  // Partial specialization of Reducer for ValLoc
-  // T is a deduced basic data type
-  // I is a deduced index type
-  template <typename T, typename I, template <typename, typename, typename> class Op>
-  struct Reducer<Op<ValLoc<T,I>, ValLoc<T,I>, ValLoc<T,I>>, ValLoc<T,I>, ValOp<ValLoc<T,I>, Op>> : public ForallParamBase {
-    using target_value_type = T;
-    using target_index_type = I;
-    using value_type = ValLoc<T,I>;
-    using op = Op<value_type,value_type,value_type>;
-    using VOp = ValOp<ValLoc<target_value_type,target_index_type>, Op>;
-
-    Reducer() = default;
-
-    // ValLoc constructor
-    // Note that the target_ variables point to the val and loc within the user defined target ValLoc
-    RAJA_HOST_DEVICE Reducer(value_type *target_in) : m_valop(VOp{}), target_value(&target_in->val), target_index(&target_in->loc) {}
-
-    // Dual input constructor for ReduceLoc<>(data, index) case
-    // The target_ variables point to vars defined by the user
-    RAJA_HOST_DEVICE Reducer(target_value_type *data_in, target_index_type *index_in) : m_valop(VOp{}), target_value(data_in), target_index(index_in) {}
-
-    Reducer(Reducer const &) = default;
-    Reducer(Reducer &&) = default;
-    Reducer& operator=(Reducer const &) = default;
-    Reducer& operator=(Reducer &&) = default;
-
-    // The ValLoc within m_valop is initialized with data and location values from either a ValLoc, or dual data and location values, passed into the constructor
-    VOp m_valop = VOp{};
-
-    // Points to either dual value and index defined by the user, or value and index within a ValLoc defined by the user
-    target_value_type *target_value = nullptr;
-    target_index_type *target_index = nullptr;
-
-    // combineTarget() performs the final op on the target data and location in resolve()
-    RAJA_HOST_DEVICE void combineTarget(value_type in)
-    {
-      // Create a different temp ValLoc solely for combining
-      value_type temp(*target_value, *target_index);
-      temp = op{}(temp, in);
-      *target_value = temp.val;
-      *target_index = temp.loc;
-    }
-
-    RAJA_HOST_DEVICE
-    value_type &
-    getVal() { return m_valop.val; }
-
-#if defined(RAJA_CUDA_ACTIVE) || defined(RAJA_HIP_ACTIVE) || defined(RAJA_SYCL_ACTIVE)
-    // Device related attributes.
-    value_type * devicetarget = nullptr;
-    RAJA::detail::SoAPtr<value_type, device_mem_pool_t> device_mem;
-    unsigned int * device_count = nullptr;
+  // These are types and parameters extracted from this struct, and given to the
+  // forall.
+  using ARG_TUP_T = camp::tuple<VOp*>;
+
+  RAJA_HOST_DEVICE ARG_TUP_T get_lambda_arg_tup()
+  {
+    return camp::make_tuple(&m_valop);
+  }
+
+  using ARG_LIST_T                        = typename ARG_TUP_T::TList;
+  static constexpr size_t num_lambda_args = camp::tuple_size<ARG_TUP_T>::value;
+};
+
+// Partial specialization of Reducer for ValLoc
+// T is a deduced basic data type
+// I is a deduced index type
+template<typename T,
+         typename I,
+         template<typename, typename, typename>
+         class Op>
+struct Reducer<Op<ValLoc<T, I>, ValLoc<T, I>, ValLoc<T, I>>,
+               ValLoc<T, I>,
+               ValOp<ValLoc<T, I>, Op>> : public ForallParamBase
+{
+  using target_value_type = T;
+  using target_index_type = I;
+  using value_type        = ValLoc<T, I>;
+  using op                = Op<value_type, value_type, value_type>;
+  using VOp = ValOp<ValLoc<target_value_type, target_index_type>, Op>;
+
+  Reducer() = default;
+
+  // ValLoc constructor
+  // Note that the target_ variables point to the val and loc within the
+  // user-defined target ValLoc
+  RAJA_HOST_DEVICE Reducer(value_type* target_in)
+      : m_valop(VOp {}),
+        target_value(&target_in->val),
+        target_index(&target_in->loc)
+  {}
+
+  // Dual input constructor for ReduceLoc<>(data, index) case
+  // The target_ variables point to vars defined by the user
+  RAJA_HOST_DEVICE Reducer(target_value_type* data_in,
+                           target_index_type* index_in)
+      : m_valop(VOp {}),
+        target_value(data_in),
+        target_index(index_in)
+  {}
+
+  Reducer(Reducer const&)            = default;
+  Reducer(Reducer&&)                 = default;
+  Reducer& operator=(Reducer const&) = default;
+  Reducer& operator=(Reducer&&)      = default;
+
+  // m_valop wraps the working ValLoc (m_valop.val) given to the forall.
+  // Both constructors default-construct it (VOp {}); the user's data and
+  // location are reached only through the target_ pointers below.
+  VOp m_valop = VOp {};
+
+  // Points to either dual value and index defined by the user, or value and
+  // index within a ValLoc defined by the user
+  target_value_type* target_value = nullptr;
+  target_index_type* target_index = nullptr;
+
+  // combineTarget() performs the final op on the target data and location in
+  // resolve()
+  RAJA_HOST_DEVICE void combineTarget(value_type in)
+  {
+    // Create a different temp ValLoc solely for combining
+    value_type temp(*target_value, *target_index);
+    temp          = op {}(temp, in);
+    *target_value = temp.val;
+    *target_index = temp.loc;
+  }
+
+  RAJA_HOST_DEVICE
+  value_type& getVal() { return m_valop.val; }
+
+#if defined(RAJA_CUDA_ACTIVE) || defined(RAJA_HIP_ACTIVE) ||                   \
+    defined(RAJA_SYCL_ACTIVE)
+  // Device related attributes.
+  value_type* devicetarget = nullptr;
+  RAJA::detail::SoAPtr<value_type, device_mem_pool_t> device_mem;
+  unsigned int* device_count = nullptr;
 #endif
 
-    // These are types and parameters extracted from this struct, and given to the forall.
-    using ARG_TUP_T = camp::tuple<VOp*>;
-    RAJA_HOST_DEVICE ARG_TUP_T get_lambda_arg_tup() { return camp::make_tuple(&m_valop); }
+  // These are types and parameters extracted from this struct, and given to the
+  // forall.
+  using ARG_TUP_T = camp::tuple<VOp*>;
 
-    using ARG_LIST_T = typename ARG_TUP_T::TList;
-    static constexpr size_t num_lambda_args = camp::tuple_size<ARG_TUP_T>::value ;
-  };
+  RAJA_HOST_DEVICE ARG_TUP_T get_lambda_arg_tup()
+  {
+    return camp::make_tuple(&m_valop);
+  }
+
+  using ARG_LIST_T                        = typename ARG_TUP_T::TList;
+  static constexpr size_t num_lambda_args = camp::tuple_size<ARG_TUP_T>::value;
+};
 
-} // namespace detail
+}  // namespace detail
 
 // Standard use case.
-template <template <typename, typename, typename> class Op, typename T>
-auto constexpr Reduce(T *target)
+template<template<typename, typename, typename> class Op, typename T>
+auto constexpr Reduce(T* target)
 {
-  return detail::Reducer<Op<T,T,T>, T, ValOp<T, Op>>(target);
+  return detail::Reducer<Op<T, T, T>, T, ValOp<T, Op>>(target);
 }
 
 // User-defined ValLoc case.
-template <template <typename, typename, typename> class Op, typename T, typename IndexType>
-auto constexpr Reduce(ValLoc<T, IndexType> *target)
+template<template<typename, typename, typename> class Op,
+         typename T,
+         typename IndexType>
+auto constexpr Reduce(ValLoc<T, IndexType>* target)
 {
-  using VL = ValLoc<T,IndexType>;
-  return detail::Reducer<Op<VL,VL,VL>, VL, ValOp<ValLoc<T, IndexType>, Op>>(target);
+  using VL = ValLoc<T, IndexType>;
+  return detail::Reducer<Op<VL, VL, VL>, VL, ValOp<ValLoc<T, IndexType>, Op>>(
+      target);
 }
 
-// Dual input use case where reduction value and location are separate, non-ValLoc types supplied by the user.
-template <template <typename, typename, typename> class Op, typename T, typename IndexType>
-auto constexpr ReduceLoc(T *target, IndexType *index)
+// Dual input use case where reduction value and location are separate,
+// non-ValLoc types supplied by the user.
+template<template<typename, typename, typename> class Op,
+         typename T,
+         typename IndexType>
+auto constexpr ReduceLoc(T* target, IndexType* index)
 {
-  using VL = ValLoc<T,IndexType>;
-  return detail::Reducer<Op<VL,VL,VL>, VL, ValOp<ValLoc<T, IndexType>, Op>>(target, index);
+  using VL = ValLoc<T, IndexType>;
+  return detail::Reducer<Op<VL, VL, VL>, VL, ValOp<ValLoc<T, IndexType>, Op>>(
+      target, index);
 }
 
-} // namespace expt
+}  // namespace expt
 
 
-} //  namespace RAJA
+}  //  namespace RAJA
 
-#endif //  NEW_REDUCE_HPP
+#endif  //  NEW_REDUCE_HPP
diff --git a/include/RAJA/pattern/reduce.hpp b/include/RAJA/pattern/reduce.hpp
index 0c0eaf3efb..c9ff15b732 100644
--- a/include/RAJA/pattern/reduce.hpp
+++ b/include/RAJA/pattern/reduce.hpp
@@ -76,7 +76,7 @@ namespace RAJA
  *
  ******************************************************************************
  */
-template <typename REDUCE_POLICY_T, typename T>
+template<typename REDUCE_POLICY_T, typename T>
 class ReduceMin;
 
 /*!
@@ -102,7 +102,7 @@ class ReduceMin;
  *
  ******************************************************************************
  */
-template <typename REDUCE_POLICY_T, typename T, typename IndexType = Index_type>
+template<typename REDUCE_POLICY_T, typename T, typename IndexType = Index_type>
 class ReduceMinLoc;
 
 /*!
@@ -127,7 +127,7 @@ class ReduceMinLoc;
  *
  ******************************************************************************
  */
-template <typename REDUCE_POLICY_T, typename T>
+template<typename REDUCE_POLICY_T, typename T>
 class ReduceMax;
 
 /*!
@@ -153,7 +153,7 @@ class ReduceMax;
  *
  ******************************************************************************
  */
-template <typename REDUCE_POLICY_T, typename T, typename IndexType = Index_type>
+template<typename REDUCE_POLICY_T, typename T, typename IndexType = Index_type>
 class ReduceMaxLoc;
 
 /*!
@@ -178,7 +178,7 @@ class ReduceMaxLoc;
  *
  ******************************************************************************
  */
-template <typename REDUCE_POLICY_T, typename T>
+template<typename REDUCE_POLICY_T, typename T>
 class ReduceSum;
 
 /*!
@@ -203,9 +203,9 @@ class ReduceSum;
  *
  ******************************************************************************
  */
-template <typename REDUCE_POLICY_T, typename T>
+template<typename REDUCE_POLICY_T, typename T>
 class ReduceBitOr;
- 
+
 
 /*!
  ******************************************************************************
@@ -229,9 +229,9 @@ class ReduceBitOr;
  *
  ******************************************************************************
  */
-template <typename REDUCE_POLICY_T, typename T>
+template<typename REDUCE_POLICY_T, typename T>
 class ReduceBitAnd;
-} //namespace RAJA
+}  // namespace RAJA
 
 
 #endif  // closing endif for header file include guard
diff --git a/include/RAJA/pattern/region.hpp b/include/RAJA/pattern/region.hpp
index a79422fa7b..d4f78cf396 100644
--- a/include/RAJA/pattern/region.hpp
+++ b/include/RAJA/pattern/region.hpp
@@ -26,13 +26,13 @@
 namespace RAJA
 {
 
-template <typename ExecutionPolicy, typename LoopBody>
+template<typename ExecutionPolicy, typename LoopBody>
 void region(LoopBody&& loop_body)
 {
   region_impl(ExecutionPolicy(), loop_body);
 }
 
-template <typename ExecutionPolicy, typename OuterBody, typename InnerBody>
+template<typename ExecutionPolicy, typename OuterBody, typename InnerBody>
 void region(OuterBody&& outer_body, InnerBody&& inner_body)
 {
   region_impl(ExecutionPolicy(), outer_body, inner_body);
diff --git a/include/RAJA/pattern/scan.hpp b/include/RAJA/pattern/scan.hpp
index 0f46ee0a22..31fdf139c0 100644
--- a/include/RAJA/pattern/scan.hpp
+++ b/include/RAJA/pattern/scan.hpp
@@ -46,20 +46,21 @@ inline namespace policy_by_value_interface
 *
 ******************************************************************************
 */
-template <typename ExecPolicy,
-          typename Res,
-          typename Container,
-          typename Function = operators::plus<RAJA::detail::ContainerVal<Container>>>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_resource<Res>,
-                      std::is_constructible<camp::resources::Resource, Res>,
-                      type_traits::is_range<Container>>
+template<
+    typename ExecPolicy,
+    typename Res,
+    typename Container,
+    typename Function = operators::plus<RAJA::detail::ContainerVal<Container>>>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<Res>,
+    type_traits::is_execution_policy<ExecPolicy>,
+    type_traits::is_resource<Res>,
+    std::is_constructible<camp::resources::Resource, Res>,
+    type_traits::is_range<Container>>
 inclusive_scan_inplace(ExecPolicy&& p,
                        Res r,
                        Container&& c,
-                       Function binop = Function{})
+                       Function binop = Function {})
 {
   using std::begin;
   using std::end;
@@ -68,32 +69,33 @@ inclusive_scan_inplace(ExecPolicy&& p,
                 "Function must model BinaryFunction");
   static_assert(type_traits::is_random_access_range<Container>::value,
                 "Container must model RandomAccessRange");
-  if (begin(c) == end(c)) {
+  if (begin(c) == end(c))
+  {
     return resources::EventProxy<Res>(r);
   }
-  return impl::scan::inclusive_inplace(r, std::forward<ExecPolicy>(p),
-                                       begin(c), end(c), binop);
+  return impl::scan::inclusive_inplace(r, std::forward<ExecPolicy>(p), begin(c),
+                                       end(c), binop);
 }
+
 ///
-template <typename ExecPolicy,
-          typename Container,
-          typename Function = operators::plus<RAJA::detail::ContainerVal<Container>>,
-          typename Res = typename resources::get_resource<ExecPolicy>::type>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_range<Container>,
-                      concepts::negate<std::is_constructible<camp::resources::Resource, Container>>>
+template<
+    typename ExecPolicy,
+    typename Container,
+    typename Function = operators::plus<RAJA::detail::ContainerVal<Container>>,
+    typename Res      = typename resources::get_resource<ExecPolicy>::type>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<Res>,
+    type_traits::is_execution_policy<ExecPolicy>,
+    type_traits::is_range<Container>,
+    concepts::negate<
+        std::is_constructible<camp::resources::Resource, Container>>>
 inclusive_scan_inplace(ExecPolicy&& p,
                        Container&& c,
-                       Function binop = Function{})
+                       Function binop = Function {})
 {
   auto r = Res::get_default();
   return ::RAJA::policy_by_value_interface::inclusive_scan_inplace(
-      std::forward<ExecPolicy>(p),
-      r,
-      std::forward<Container>(c),
-      binop);
+      std::forward<ExecPolicy>(p), r, std::forward<Container>(c), binop);
 }
 
 /*!
@@ -108,22 +110,22 @@ inclusive_scan_inplace(ExecPolicy&& p,
 *
 ******************************************************************************
 */
-template <typename ExecPolicy,
-          typename Res,
-          typename Container,
-          typename T = RAJA::detail::ContainerVal<Container>,
-          typename Function = operators::plus<T>>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_resource<Res>,
-                      std::is_constructible<camp::resources::Resource, Res>,
-                      type_traits::is_range<Container>>
+template<typename ExecPolicy,
+         typename Res,
+         typename Container,
+         typename T        = RAJA::detail::ContainerVal<Container>,
+         typename Function = operators::plus<T>>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<Res>,
+    type_traits::is_execution_policy<ExecPolicy>,
+    type_traits::is_resource<Res>,
+    std::is_constructible<camp::resources::Resource, Res>,
+    type_traits::is_range<Container>>
 exclusive_scan_inplace(ExecPolicy&& p,
                        Res r,
                        Container&& c,
-                       Function binop = Function{},
-                       T value = Function::identity())
+                       Function binop = Function {},
+                       T value        = Function::identity())
 {
   using std::begin;
   using std::end;
@@ -132,35 +134,34 @@ exclusive_scan_inplace(ExecPolicy&& p,
                 "Function must model BinaryFunction");
   static_assert(type_traits::is_random_access_range<Container>::value,
                 "Container must model RandomAccessRange");
-  if (begin(c) == end(c)) {
+  if (begin(c) == end(c))
+  {
     return resources::EventProxy<Res>(r);
   }
-  return impl::scan::exclusive_inplace(r, std::forward<ExecPolicy>(p),
-                                       begin(c), end(c), binop, value);
+  return impl::scan::exclusive_inplace(r, std::forward<ExecPolicy>(p), begin(c),
+                                       end(c), binop, value);
 }
+
 ///
-template <typename ExecPolicy,
-          typename Container,
-          typename T = RAJA::detail::ContainerVal<Container>,
-          typename Function = operators::plus<T>,
-          typename Res = typename resources::get_resource<ExecPolicy>::type>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_range<Container>,
-                      concepts::negate<std::is_constructible<camp::resources::Resource, Container>>>
+template<typename ExecPolicy,
+         typename Container,
+         typename T        = RAJA::detail::ContainerVal<Container>,
+         typename Function = operators::plus<T>,
+         typename Res      = typename resources::get_resource<ExecPolicy>::type>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<Res>,
+    type_traits::is_execution_policy<ExecPolicy>,
+    type_traits::is_range<Container>,
+    concepts::negate<
+        std::is_constructible<camp::resources::Resource, Container>>>
 exclusive_scan_inplace(ExecPolicy&& p,
                        Container&& c,
-                       Function binop = Function{},
-                       T value = Function::identity())
+                       Function binop = Function {},
+                       T value        = Function::identity())
 {
   auto r = Res::get_default();
   return ::RAJA::policy_by_value_interface::exclusive_scan_inplace(
-      std::forward<ExecPolicy>(p),
-      r,
-      std::forward<Container>(c),
-      binop,
-      value);
+      std::forward<ExecPolicy>(p), r, std::forward<Container>(c), binop, value);
 }
 
 /*!
@@ -179,23 +180,24 @@ exclusive_scan_inplace(ExecPolicy&& p,
 *begin))}
 ******************************************************************************
 */
-template <typename ExecPolicy,
-          typename Res,
-          typename InContainer,
-          typename OutContainer,
-          typename Function = operators::plus<RAJA::detail::ContainerVal<InContainer>>>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_resource<Res>,
-                      std::is_constructible<camp::resources::Resource, Res>,
-                      type_traits::is_range<InContainer>,
-                      type_traits::is_range<OutContainer>>
+template<typename ExecPolicy,
+         typename Res,
+         typename InContainer,
+         typename OutContainer,
+         typename Function =
+             operators::plus<RAJA::detail::ContainerVal<InContainer>>>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<Res>,
+    type_traits::is_execution_policy<ExecPolicy>,
+    type_traits::is_resource<Res>,
+    std::is_constructible<camp::resources::Resource, Res>,
+    type_traits::is_range<InContainer>,
+    type_traits::is_range<OutContainer>>
 inclusive_scan(ExecPolicy&& p,
                Res r,
                InContainer&& in,
                OutContainer&& out,
-               Function binop = Function{})
+               Function binop = Function {})
 {
   using std::begin;
   using std::end;
@@ -207,36 +209,37 @@ inclusive_scan(ExecPolicy&& p,
                 "InContainer must model RandomAccessRange");
   static_assert(type_traits::is_random_access_range<OutContainer>::value,
                 "OutContainer must model RandomAccessRange");
-  if (begin(in) == end(in)) {
+  if (begin(in) == end(in))
+  {
     return resources::EventProxy<Res>(r);
   }
-  return impl::scan::inclusive(r, std::forward<ExecPolicy>(p),
-                               begin(in), end(in), begin(out), binop);
+  return impl::scan::inclusive(r, std::forward<ExecPolicy>(p), begin(in),
+                               end(in), begin(out), binop);
 }
+
 ///
-template <typename ExecPolicy,
-          typename InContainer,
-          typename OutContainer,
-          typename Function = operators::plus<RAJA::detail::ContainerVal<InContainer>>,
-          typename Res = typename resources::get_resource<ExecPolicy>::type>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_range<InContainer>,
-                      concepts::negate<std::is_constructible<camp::resources::Resource, InContainer>>,
-                      type_traits::is_range<OutContainer>>
+template<typename ExecPolicy,
+         typename InContainer,
+         typename OutContainer,
+         typename Function =
+             operators::plus<RAJA::detail::ContainerVal<InContainer>>,
+         typename Res = typename resources::get_resource<ExecPolicy>::type>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<Res>,
+    type_traits::is_execution_policy<ExecPolicy>,
+    type_traits::is_range<InContainer>,
+    concepts::negate<
+        std::is_constructible<camp::resources::Resource, InContainer>>,
+    type_traits::is_range<OutContainer>>
 inclusive_scan(ExecPolicy&& p,
                InContainer&& in,
                OutContainer&& out,
-               Function binop = Function{})
+               Function binop = Function {})
 {
   auto r = Res::get_default();
   return ::RAJA::policy_by_value_interface::inclusive_scan(
-      std::forward<ExecPolicy>(p),
-      r,
-      std::forward<InContainer>(in),
-      std::forward<OutContainer>(out),
-      binop);
+      std::forward<ExecPolicy>(p), r, std::forward<InContainer>(in),
+      std::forward<OutContainer>(out), binop);
 }
 
 /*!
@@ -255,25 +258,25 @@ inclusive_scan(ExecPolicy&& p,
 *begin))}
 ******************************************************************************
 */
-template <typename ExecPolicy,
-          typename Res,
-          typename InContainer,
-          typename OutContainer,
-          typename T = RAJA::detail::ContainerVal<InContainer>,
-          typename Function = operators::plus<T>>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_resource<Res>,
-                      std::is_constructible<camp::resources::Resource, Res>,
-                      type_traits::is_range<InContainer>,
-                      type_traits::is_range<OutContainer>>
+template<typename ExecPolicy,
+         typename Res,
+         typename InContainer,
+         typename OutContainer,
+         typename T        = RAJA::detail::ContainerVal<InContainer>,
+         typename Function = operators::plus<T>>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<Res>,
+    type_traits::is_execution_policy<ExecPolicy>,
+    type_traits::is_resource<Res>,
+    std::is_constructible<camp::resources::Resource, Res>,
+    type_traits::is_range<InContainer>,
+    type_traits::is_range<OutContainer>>
 exclusive_scan(ExecPolicy&& p,
                Res r,
                InContainer&& in,
                OutContainer&& out,
-               Function binop = Function{},
-               T value = Function::identity())
+               Function binop = Function {},
+               T value        = Function::identity())
 {
   using std::begin;
   using std::end;
@@ -285,43 +288,41 @@ exclusive_scan(ExecPolicy&& p,
                 "InContainer must model RandomAccessRange");
   static_assert(type_traits::is_random_access_range<OutContainer>::value,
                 "OutContainer must model RandomAccessRange");
-  if (begin(in) == end(in)) {
+  if (begin(in) == end(in))
+  {
     return resources::EventProxy<Res>(r);
   }
-  return impl::scan::exclusive(r, std::forward<ExecPolicy>(p),
-                               begin(in), end(in), begin(out), binop, value);
+  return impl::scan::exclusive(r, std::forward<ExecPolicy>(p), begin(in),
+                               end(in), begin(out), binop, value);
 }
+
 ///
-template <typename ExecPolicy,
-          typename InContainer,
-          typename OutContainer,
-          typename T = RAJA::detail::ContainerVal<InContainer>,
-          typename Function = operators::plus<T>,
-          typename Res = typename resources::get_resource<ExecPolicy>::type>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_range<InContainer>,
-                      concepts::negate<std::is_constructible<camp::resources::Resource, InContainer>>,
-                      type_traits::is_range<OutContainer>>
+template<typename ExecPolicy,
+         typename InContainer,
+         typename OutContainer,
+         typename T        = RAJA::detail::ContainerVal<InContainer>,
+         typename Function = operators::plus<T>,
+         typename Res      = typename resources::get_resource<ExecPolicy>::type>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<Res>,
+    type_traits::is_execution_policy<ExecPolicy>,
+    type_traits::is_range<InContainer>,
+    concepts::negate<
+        std::is_constructible<camp::resources::Resource, InContainer>>,
+    type_traits::is_range<OutContainer>>
 exclusive_scan(ExecPolicy&& p,
                InContainer&& in,
                OutContainer&& out,
-               Function binop = Function{},
-               T value = Function::identity())
+               Function binop = Function {},
+               T value        = Function::identity())
 {
   auto r = Res::get_default();
   return ::RAJA::policy_by_value_interface::exclusive_scan(
-      std::forward<ExecPolicy>(p),
-      r,
-      std::forward<InContainer>(in),
-      std::forward<OutContainer>(out),
-      binop,
-      value);
+      std::forward<ExecPolicy>(p), r, std::forward<InContainer>(in),
+      std::forward<OutContainer>(out), binop, value);
 }
 
-}  // end inline namespace policy_by_value_interface
-
+}  // namespace policy_by_value_interface
 
 /*!
  * \brief Conversion from template-based policy to value-based policy for
@@ -329,23 +330,23 @@ exclusive_scan(ExecPolicy&& p,
  *
  * this reduces implementation overhead and perfectly forwards all arguments
  */
-template <typename ExecPolicy, typename... Args,
-          typename Res = typename resources::get_resource<ExecPolicy>::type >
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>>
+template<typename ExecPolicy,
+         typename... Args,
+         typename Res = typename resources::get_resource<ExecPolicy>::type>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>,
+                                  type_traits::is_execution_policy<ExecPolicy>>
 exclusive_scan(Args&&... args)
 {
   Res r = Res::get_default();
   return ::RAJA::policy_by_value_interface::exclusive_scan<ExecPolicy>(
       ExecPolicy(), r, std::forward<Args>(args)...);
 }
+
 ///
-template <typename ExecPolicy, typename Res, typename... Args>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_resource<Res>>
+template<typename ExecPolicy, typename Res, typename... Args>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>,
+                                  type_traits::is_execution_policy<ExecPolicy>,
+                                  type_traits::is_resource<Res>>
 exclusive_scan(Res r, Args&&... args)
 {
   return ::RAJA::policy_by_value_interface::exclusive_scan(
@@ -358,23 +359,23 @@ exclusive_scan(Res r, Args&&... args)
  *
  * this reduces implementation overhead and perfectly forwards all arguments
  */
-template <typename ExecPolicy, typename... Args,
-          typename Res = typename resources::get_resource<ExecPolicy>::type>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>>
+template<typename ExecPolicy,
+         typename... Args,
+         typename Res = typename resources::get_resource<ExecPolicy>::type>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>,
+                                  type_traits::is_execution_policy<ExecPolicy>>
 inclusive_scan(Args&&... args)
 {
   Res r = Res::get_default();
   return ::RAJA::policy_by_value_interface::inclusive_scan<ExecPolicy>(
       ExecPolicy(), r, std::forward<Args>(args)...);
 }
+
 ///
-template <typename ExecPolicy, typename Res, typename... Args>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_resource<Res>>
+template<typename ExecPolicy, typename Res, typename... Args>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>,
+                                  type_traits::is_execution_policy<ExecPolicy>,
+                                  type_traits::is_resource<Res>>
 inclusive_scan(Res r, Args&&... args)
 {
   return ::RAJA::policy_by_value_interface::inclusive_scan(
@@ -387,23 +388,23 @@ inclusive_scan(Res r, Args&&... args)
  *
  * this reduces implementation overhead and perfectly forwards all arguments
  */
-template <typename ExecPolicy, typename... Args,
-          typename Res = typename resources::get_resource<ExecPolicy>::type>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>>
+template<typename ExecPolicy,
+         typename... Args,
+         typename Res = typename resources::get_resource<ExecPolicy>::type>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>,
+                                  type_traits::is_execution_policy<ExecPolicy>>
 exclusive_scan_inplace(Args&&... args)
 {
   Res r = Res::get_default();
   return ::RAJA::policy_by_value_interface::exclusive_scan_inplace<ExecPolicy>(
       ExecPolicy(), r, std::forward<Args>(args)...);
 }
+
 ///
-template <typename ExecPolicy, typename Res, typename... Args>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_resource<Res>>
+template<typename ExecPolicy, typename Res, typename... Args>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>,
+                                  type_traits::is_execution_policy<ExecPolicy>,
+                                  type_traits::is_resource<Res>>
 exclusive_scan_inplace(Res r, Args&&... args)
 {
   return ::RAJA::policy_by_value_interface::exclusive_scan_inplace(
@@ -416,23 +417,23 @@ exclusive_scan_inplace(Res r, Args&&... args)
  *
  * this reduces implementation overhead and perfectly forwards all arguments
  */
-template <typename ExecPolicy, typename... Args,
-          typename Res = typename resources::get_resource<ExecPolicy>::type>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>>
+template<typename ExecPolicy,
+         typename... Args,
+         typename Res = typename resources::get_resource<ExecPolicy>::type>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>,
+                                  type_traits::is_execution_policy<ExecPolicy>>
 inclusive_scan_inplace(Args&&... args)
 {
   Res r = Res::get_default();
   return ::RAJA::policy_by_value_interface::inclusive_scan_inplace<ExecPolicy>(
       ExecPolicy(), r, std::forward<Args>(args)...);
 }
+
 ///
-template <typename ExecPolicy, typename Res, typename... Args>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_resource<Res>>
+template<typename ExecPolicy, typename Res, typename... Args>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<Res>,
+                                  type_traits::is_execution_policy<ExecPolicy>,
+                                  type_traits::is_resource<Res>>
 inclusive_scan_inplace(Res r, Args&&... args)
 {
   return ::RAJA::policy_by_value_interface::inclusive_scan_inplace(
diff --git a/include/RAJA/pattern/sort.hpp b/include/RAJA/pattern/sort.hpp
index acf3fe5ba7..69de072168 100644
--- a/include/RAJA/pattern/sort.hpp
+++ b/include/RAJA/pattern/sort.hpp
@@ -46,23 +46,21 @@ inline namespace policy_by_value_interface
 *
 ******************************************************************************
 */
-template <typename ExecPolicy,
-          typename Res,
-          typename Container,
-          typename Compare = operators::less<RAJA::detail::ContainerVal<Container>>>
+template<
+    typename ExecPolicy,
+    typename Res,
+    typename Container,
+    typename Compare = operators::less<RAJA::detail::ContainerVal<Container>>>
 concepts::enable_if_t<resources::EventProxy<Res>,
                       type_traits::is_execution_policy<ExecPolicy>,
                       type_traits::is_resource<Res>,
                       std::is_constructible<camp::resources::Resource, Res>,
                       type_traits::is_range<Container>>
-sort(ExecPolicy&& p,
-     Res r,
-     Container&& c,
-     Compare comp = Compare{})
+sort(ExecPolicy&& p, Res r, Container&& c, Compare comp = Compare {})
 {
   using std::begin;
-  using std::end;
   using std::distance;
+  using std::end;
   using T = RAJA::detail::ContainerVal<Container>;
   static_assert(type_traits::is_binary_function<Compare, bool, T, T>::value,
                 "Compare must model BinaryFunction");
@@ -71,34 +69,36 @@ sort(ExecPolicy&& p,
 
   auto begin_it = begin(c);
   auto end_it   = end(c);
-  auto N = distance(begin_it, end_it);
+  auto N        = distance(begin_it, end_it);
 
-  if (N > 1) {
-    return impl::sort::unstable(r, std::forward<ExecPolicy>(p),
-                                begin_it, end_it, comp);
-  } else {
+  if (N > 1)
+  {
+    return impl::sort::unstable(r, std::forward<ExecPolicy>(p), begin_it,
+                                end_it, comp);
+  }
+  else
+  {
     return resources::EventProxy<Res>(r);
   }
 }
+
 ///
-template <typename ExecPolicy,
-          typename Container,
-          typename Compare = operators::less<RAJA::detail::ContainerVal<Container>>,
-          typename Res = typename resources::get_resource<ExecPolicy>::type>
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_range<Container>,
-                      concepts::negate<std::is_constructible<camp::resources::Resource, Container>>>
-sort(ExecPolicy&& p,
-     Container&& c,
-     Compare comp = Compare{})
+template<
+    typename ExecPolicy,
+    typename Container,
+    typename Compare = operators::less<RAJA::detail::ContainerVal<Container>>,
+    typename Res     = typename resources::get_resource<ExecPolicy>::type>
+concepts::enable_if_t<
+    resources::EventProxy<Res>,
+    type_traits::is_execution_policy<ExecPolicy>,
+    type_traits::is_range<Container>,
+    concepts::negate<
+        std::is_constructible<camp::resources::Resource, Container>>>
+sort(ExecPolicy&& p, Container&& c, Compare comp = Compare {})
 {
   Res r = Res::get_default();
   return ::RAJA::policy_by_value_interface::sort(
-      std::forward<ExecPolicy>(p),
-      r,
-      std::forward<Container>(c),
-      comp);
+      std::forward<ExecPolicy>(p), r, std::forward<Container>(c), comp);
 }
 
 /*!
@@ -113,23 +113,21 @@ sort(ExecPolicy&& p,
 *
 ******************************************************************************
 */
-template <typename ExecPolicy,
-          typename Res,
-          typename Container,
-          typename Compare = operators::less<RAJA::detail::ContainerVal<Container>>>
+template<
+    typename ExecPolicy,
+    typename Res,
+    typename Container,
+    typename Compare = operators::less<RAJA::detail::ContainerVal<Container>>>
 concepts::enable_if_t<resources::EventProxy<Res>,
                       type_traits::is_execution_policy<ExecPolicy>,
                       type_traits::is_resource<Res>,
                       std::is_constructible<camp::resources::Resource, Res>,
                       type_traits::is_range<Container>>
-stable_sort(ExecPolicy&& p,
-            Res r,
-            Container&& c,
-            Compare comp = Compare{})
+stable_sort(ExecPolicy&& p, Res r, Container&& c, Compare comp = Compare {})
 {
   using std::begin;
-  using std::end;
   using std::distance;
+  using std::end;
   using T = RAJA::detail::ContainerVal<Container>;
   static_assert(type_traits::is_binary_function<Compare, bool, T, T>::value,
                 "Compare must model BinaryFunction");
@@ -138,34 +136,36 @@ stable_sort(ExecPolicy&& p,
 
   auto begin_it = begin(c);
   auto end_it   = end(c);
-  auto N = distance(begin_it, end_it);
+  auto N        = distance(begin_it, end_it);
 
-  if (N > 1) {
-    return impl::sort::stable(r, std::forward<ExecPolicy>(p),
-                              begin_it, end_it, comp);
-  } else {
+  if (N > 1)
+  {
+    return impl::sort::stable(r, std::forward<ExecPolicy>(p), begin_it, end_it,
+                              comp);
+  }
+  else
+  {
     return resources::EventProxy<Res>(r);
   }
 }
+
 ///
-template <typename ExecPolicy,
-          typename Container,
-          typename Compare = operators::less<RAJA::detail::ContainerVal<Container>>,
-          typename Res = typename resources::get_resource<ExecPolicy>::type>
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_range<Container>,
-                      concepts::negate<std::is_constructible<camp::resources::Resource, Container>>>
-stable_sort(ExecPolicy&& p,
-            Container&& c,
-            Compare comp = Compare{})
+template<
+    typename ExecPolicy,
+    typename Container,
+    typename Compare = operators::less<RAJA::detail::ContainerVal<Container>>,
+    typename Res     = typename resources::get_resource<ExecPolicy>::type>
+concepts::enable_if_t<
+    resources::EventProxy<Res>,
+    type_traits::is_execution_policy<ExecPolicy>,
+    type_traits::is_range<Container>,
+    concepts::negate<
+        std::is_constructible<camp::resources::Resource, Container>>>
+stable_sort(ExecPolicy&& p, Container&& c, Compare comp = Compare {})
 {
   Res r = Res::get_default();
   return ::RAJA::policy_by_value_interface::stable_sort(
-      std::forward<ExecPolicy>(p),
-      r,
-      std::forward<Container>(c),
-      comp);
+      std::forward<ExecPolicy>(p), r, std::forward<Container>(c), comp);
 }
 
 /*!
@@ -181,11 +181,12 @@ stable_sort(ExecPolicy&& p,
 *
 ******************************************************************************
 */
-template <typename ExecPolicy,
-          typename Res,
-          typename KeyContainer,
-          typename ValContainer,
-          typename Compare = operators::less<RAJA::detail::ContainerVal<KeyContainer>>>
+template<typename ExecPolicy,
+         typename Res,
+         typename KeyContainer,
+         typename ValContainer,
+         typename Compare =
+             operators::less<RAJA::detail::ContainerVal<KeyContainer>>>
 concepts::enable_if_t<resources::EventProxy<Res>,
                       type_traits::is_execution_policy<ExecPolicy>,
                       type_traits::is_resource<Res>,
@@ -196,11 +197,11 @@ sort_pairs(ExecPolicy&& p,
            Res r,
            KeyContainer&& keys,
            ValContainer&& vals,
-           Compare comp = Compare{})
+           Compare comp = Compare {})
 {
   using std::begin;
-  using std::end;
   using std::distance;
+  using std::end;
   using T = RAJA::detail::ContainerVal<KeyContainer>;
   static_assert(type_traits::is_binary_function<Compare, bool, T, T>::value,
                 "Compare must model BinaryFunction");
@@ -211,38 +212,42 @@ sort_pairs(ExecPolicy&& p,
 
   auto begin_key = begin(keys);
   auto end_key   = end(keys);
-  auto N = distance(begin_key, end_key);
+  auto N         = distance(begin_key, end_key);
 
-  if (N > 1) {
-    return impl::sort::unstable_pairs(r, std::forward<ExecPolicy>(p),
-                                      begin_key, end_key, begin(vals), comp);
-  } else {
+  if (N > 1)
+  {
+    return impl::sort::unstable_pairs(r, std::forward<ExecPolicy>(p), begin_key,
+                                      end_key, begin(vals), comp);
+  }
+  else
+  {
     return resources::EventProxy<Res>(r);
   }
 }
+
 ///
-template <typename ExecPolicy,
-          typename KeyContainer,
-          typename ValContainer,
-          typename Compare = operators::less<RAJA::detail::ContainerVal<KeyContainer>>,
-          typename Res = typename resources::get_resource<ExecPolicy>::type>
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_range<KeyContainer>,
-                      concepts::negate<std::is_constructible<camp::resources::Resource, KeyContainer>>,
-                      type_traits::is_range<ValContainer>>
+template<typename ExecPolicy,
+         typename KeyContainer,
+         typename ValContainer,
+         typename Compare =
+             operators::less<RAJA::detail::ContainerVal<KeyContainer>>,
+         typename Res = typename resources::get_resource<ExecPolicy>::type>
+concepts::enable_if_t<
+    resources::EventProxy<Res>,
+    type_traits::is_execution_policy<ExecPolicy>,
+    type_traits::is_range<KeyContainer>,
+    concepts::negate<
+        std::is_constructible<camp::resources::Resource, KeyContainer>>,
+    type_traits::is_range<ValContainer>>
 sort_pairs(ExecPolicy&& p,
            KeyContainer&& keys,
            ValContainer&& vals,
-           Compare comp = Compare{})
+           Compare comp = Compare {})
 {
   Res r = Res::get_default();
   return ::RAJA::policy_by_value_interface::sort_pairs(
-      std::forward<ExecPolicy>(p),
-      r,
-      std::forward<KeyContainer>(keys),
-      std::forward<ValContainer>(vals),
-      comp);
+      std::forward<ExecPolicy>(p), r, std::forward<KeyContainer>(keys),
+      std::forward<ValContainer>(vals), comp);
 }
 
 /*!
@@ -258,11 +263,12 @@ sort_pairs(ExecPolicy&& p,
 *
 ******************************************************************************
 */
-template <typename ExecPolicy,
-          typename Res,
-          typename KeyContainer,
-          typename ValContainer,
-          typename Compare = operators::less<RAJA::detail::ContainerVal<KeyContainer>>>
+template<typename ExecPolicy,
+         typename Res,
+         typename KeyContainer,
+         typename ValContainer,
+         typename Compare =
+             operators::less<RAJA::detail::ContainerVal<KeyContainer>>>
 concepts::enable_if_t<resources::EventProxy<Res>,
                       type_traits::is_execution_policy<ExecPolicy>,
                       type_traits::is_resource<Res>,
@@ -273,11 +279,11 @@ stable_sort_pairs(ExecPolicy&& p,
                   Res r,
                   KeyContainer&& keys,
                   ValContainer&& vals,
-                  Compare comp = Compare{})
+                  Compare comp = Compare {})
 {
   using std::begin;
-  using std::end;
   using std::distance;
+  using std::end;
   using T = RAJA::detail::ContainerVal<KeyContainer>;
   static_assert(type_traits::is_binary_function<Compare, bool, T, T>::value,
                 "Compare must model BinaryFunction");
@@ -288,41 +294,45 @@ stable_sort_pairs(ExecPolicy&& p,
 
   auto begin_key = begin(keys);
   auto end_key   = end(keys);
-  auto N = distance(begin_key, end_key);
+  auto N         = distance(begin_key, end_key);
 
-  if (N > 1) {
-    return impl::sort::stable_pairs(r, std::forward<ExecPolicy>(p),
-                                    begin_key, end_key, begin(vals), comp);
-  } else {
+  if (N > 1)
+  {
+    return impl::sort::stable_pairs(r, std::forward<ExecPolicy>(p), begin_key,
+                                    end_key, begin(vals), comp);
+  }
+  else
+  {
     return resources::EventProxy<Res>(r);
   }
 }
+
 ///
-template <typename ExecPolicy,
-          typename KeyContainer,
-          typename ValContainer,
-          typename Compare = operators::less<RAJA::detail::ContainerVal<KeyContainer>>,
-          typename Res = typename resources::get_resource<ExecPolicy>::type>
-concepts::enable_if_t<resources::EventProxy<Res>,
-                      type_traits::is_execution_policy<ExecPolicy>,
-                      type_traits::is_range<KeyContainer>,
-                      concepts::negate<std::is_constructible<camp::resources::Resource, KeyContainer>>,
-                      type_traits::is_range<ValContainer>>
+template<typename ExecPolicy,
+         typename KeyContainer,
+         typename ValContainer,
+         typename Compare =
+             operators::less<RAJA::detail::ContainerVal<KeyContainer>>,
+         typename Res = typename resources::get_resource<ExecPolicy>::type>
+concepts::enable_if_t<
+    resources::EventProxy<Res>,
+    type_traits::is_execution_policy<ExecPolicy>,
+    type_traits::is_range<KeyContainer>,
+    concepts::negate<
+        std::is_constructible<camp::resources::Resource, KeyContainer>>,
+    type_traits::is_range<ValContainer>>
 stable_sort_pairs(ExecPolicy&& p,
                   KeyContainer&& keys,
                   ValContainer&& vals,
-                  Compare comp = Compare{})
+                  Compare comp = Compare {})
 {
   Res r = Res::get_default();
   return ::RAJA::policy_by_value_interface::stable_sort_pairs(
-      std::forward<ExecPolicy>(p),
-      r,
-      std::forward<KeyContainer>(keys),
-      std::forward<ValContainer>(vals),
-      comp);
+      std::forward<ExecPolicy>(p), r, std::forward<KeyContainer>(keys),
+      std::forward<ValContainer>(vals), comp);
 }
 
-}  // end inline namespace policy_by_value_interface
+}  // namespace policy_by_value_interface
 
 // =============================================================================
 
@@ -332,25 +342,27 @@ stable_sort_pairs(ExecPolicy&& p,
  *
  * this reduces implementation overhead and perfectly forwards all arguments
  */
-template <typename ExecPolicy, typename... Args,
-          typename Res = typename resources::get_resource<ExecPolicy>::type>
+template<typename ExecPolicy,
+         typename... Args,
+         typename Res = typename resources::get_resource<ExecPolicy>::type>
 concepts::enable_if_t<resources::EventProxy<Res>,
                       type_traits::is_execution_policy<ExecPolicy>>
-sort(Args &&... args)
+sort(Args&&... args)
 {
   Res r = Res::get_default();
   return ::RAJA::policy_by_value_interface::sort<ExecPolicy>(
       ExecPolicy(), r, std::forward<Args>(args)...);
 }
+
 ///
-template <typename ExecPolicy, typename Res, typename... Args>
+template<typename ExecPolicy, typename Res, typename... Args>
 concepts::enable_if_t<resources::EventProxy<Res>,
                       type_traits::is_execution_policy<ExecPolicy>,
                       type_traits::is_resource<Res>>
-sort(Res r, Args &&... args)
+sort(Res r, Args&&... args)
 {
-  return ::RAJA::policy_by_value_interface::sort(
-      ExecPolicy(), r, std::forward<Args>(args)...);
+  return ::RAJA::policy_by_value_interface::sort(ExecPolicy(), r,
+                                                 std::forward<Args>(args)...);
 }
 
 /*!
@@ -359,22 +371,24 @@ sort(Res r, Args &&... args)
  *
  * this reduces implementation overhead and perfectly forwards all arguments
  */
-template <typename ExecPolicy, typename... Args,
-          typename Res = typename resources::get_resource<ExecPolicy>::type>
+template<typename ExecPolicy,
+         typename... Args,
+         typename Res = typename resources::get_resource<ExecPolicy>::type>
 concepts::enable_if_t<resources::EventProxy<Res>,
                       type_traits::is_execution_policy<ExecPolicy>>
-stable_sort(Args &&... args)
+stable_sort(Args&&... args)
 {
   Res r = Res::get_default();
   return ::RAJA::policy_by_value_interface::stable_sort<ExecPolicy>(
       ExecPolicy(), r, std::forward<Args>(args)...);
 }
+
 ///
-template <typename ExecPolicy, typename Res, typename... Args>
+template<typename ExecPolicy, typename Res, typename... Args>
 concepts::enable_if_t<resources::EventProxy<Res>,
                       type_traits::is_execution_policy<ExecPolicy>,
                       type_traits::is_resource<Res>>
-stable_sort(Res r, Args &&... args)
+stable_sort(Res r, Args&&... args)
 {
   return ::RAJA::policy_by_value_interface::stable_sort(
       ExecPolicy(), r, std::forward<Args>(args)...);
@@ -386,22 +400,24 @@ stable_sort(Res r, Args &&... args)
  *
  * this reduces implementation overhead and perfectly forwards all arguments
  */
-template <typename ExecPolicy, typename... Args,
-          typename Res = typename resources::get_resource<ExecPolicy>::type>
+template<typename ExecPolicy,
+         typename... Args,
+         typename Res = typename resources::get_resource<ExecPolicy>::type>
 concepts::enable_if_t<resources::EventProxy<Res>,
                       type_traits::is_execution_policy<ExecPolicy>>
-sort_pairs(Args &&... args)
+sort_pairs(Args&&... args)
 {
   Res r = Res::get_default();
   return ::RAJA::policy_by_value_interface::sort_pairs<ExecPolicy>(
       ExecPolicy(), r, std::forward<Args>(args)...);
 }
+
 ///
-template <typename ExecPolicy, typename Res, typename... Args>
+template<typename ExecPolicy, typename Res, typename... Args>
 concepts::enable_if_t<resources::EventProxy<Res>,
                       type_traits::is_execution_policy<ExecPolicy>,
                       type_traits::is_resource<Res>>
-sort_pairs(Res r, Args &&... args)
+sort_pairs(Res r, Args&&... args)
 {
   return ::RAJA::policy_by_value_interface::sort_pairs(
       ExecPolicy(), r, std::forward<Args>(args)...);
@@ -413,22 +429,24 @@ sort_pairs(Res r, Args &&... args)
  *
  * this reduces implementation overhead and perfectly forwards all arguments
  */
-template <typename ExecPolicy, typename... Args,
-          typename Res = typename resources::get_resource<ExecPolicy>::type>
+template<typename ExecPolicy,
+         typename... Args,
+         typename Res = typename resources::get_resource<ExecPolicy>::type>
 concepts::enable_if_t<resources::EventProxy<Res>,
                       type_traits::is_execution_policy<ExecPolicy>>
-stable_sort_pairs(Args &&... args)
+stable_sort_pairs(Args&&... args)
 {
   Res r = Res::get_default();
   return ::RAJA::policy_by_value_interface::stable_sort_pairs<ExecPolicy>(
       ExecPolicy(), r, std::forward<Args>(args)...);
 }
+
 ///
-template <typename ExecPolicy, typename Res, typename... Args>
+template<typename ExecPolicy, typename Res, typename... Args>
 concepts::enable_if_t<resources::EventProxy<Res>,
                       type_traits::is_execution_policy<ExecPolicy>,
                       type_traits::is_resource<Res>>
-stable_sort_pairs(Res r, Args &&... args)
+stable_sort_pairs(Res r, Args&&... args)
 {
   return ::RAJA::policy_by_value_interface::stable_sort_pairs(
       ExecPolicy(), r, std::forward<Args>(args)...);
diff --git a/include/RAJA/pattern/synchronize.hpp b/include/RAJA/pattern/synchronize.hpp
index d3e42af81c..c812831349 100644
--- a/include/RAJA/pattern/synchronize.hpp
+++ b/include/RAJA/pattern/synchronize.hpp
@@ -38,10 +38,10 @@ namespace RAJA
  * \see RAJA::policy::omp::synchronize_impl
  * \see RAJA::policy::cuda::synchronize_impl
  */
-template <typename Policy>
+template<typename Policy>
 void synchronize()
 {
-  synchronize_impl(Policy{});
+  synchronize_impl(Policy {});
 }
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/MatrixRegister.hpp b/include/RAJA/pattern/tensor/MatrixRegister.hpp
index 9fa39f34ee..85410736ac 100644
--- a/include/RAJA/pattern/tensor/MatrixRegister.hpp
+++ b/include/RAJA/pattern/tensor/MatrixRegister.hpp
@@ -23,30 +23,31 @@
 #include "RAJA/policy/tensor/arch.hpp"
 #include "RAJA/pattern/tensor/TensorRegister.hpp"
 
-
 namespace RAJA
 {
 namespace expt
 {
-  template<typename T, typename LAYOUT, typename REGISTER_POLICY = default_register>
-  using SquareMatrixRegister =
-      TensorRegister<REGISTER_POLICY,
-                     T,
-                     LAYOUT,
-                     camp::idx_seq<RAJA::internal::expt::RegisterTraits<REGISTER_POLICY,T>::s_num_elem,
-                                   RAJA::internal::expt::RegisterTraits<REGISTER_POLICY,T>::s_num_elem>>;
-
-  template<typename T, typename LAYOUT, camp::idx_t ROWS, camp::idx_t COLS,
-           typename REGISTER_POLICY = default_register>
-  using RectMatrixRegister =
-      TensorRegister<REGISTER_POLICY,
-                     T,
-                     LAYOUT,
-                     camp::idx_seq<ROWS,COLS>>;
-
-} // namespace expt
+template<typename T,
+         typename LAYOUT,
+         typename REGISTER_POLICY = default_register>
+using SquareMatrixRegister = TensorRegister<
+    REGISTER_POLICY,
+    T,
+    LAYOUT,
+    camp::idx_seq<
+        RAJA::internal::expt::RegisterTraits<REGISTER_POLICY, T>::s_num_elem,
+        RAJA::internal::expt::RegisterTraits<REGISTER_POLICY, T>::s_num_elem>>;
+
+template<typename T,
+         typename LAYOUT,
+         camp::idx_t ROWS,
+         camp::idx_t COLS,
+         typename REGISTER_POLICY = default_register>
+using RectMatrixRegister =
+    TensorRegister<REGISTER_POLICY, T, LAYOUT, camp::idx_seq<ROWS, COLS>>;
+
+}  // namespace expt
 }  // namespace RAJA
 
 
-
 #endif
diff --git a/include/RAJA/pattern/tensor/ScalarRegister.hpp b/include/RAJA/pattern/tensor/ScalarRegister.hpp
index f6675b4ba9..1e93e8aec2 100644
--- a/include/RAJA/pattern/tensor/ScalarRegister.hpp
+++ b/include/RAJA/pattern/tensor/ScalarRegister.hpp
@@ -22,22 +22,19 @@
 #include "RAJA/pattern/tensor/VectorRegister.hpp"
 #include "RAJA/policy/tensor/arch.hpp"
 
-
 namespace RAJA
 {
 namespace expt
 {
 
-  // Convenience to describe ScalarTensors
-  template<typename T>
-  using ScalarRegister = TensorRegister<scalar_register,
-                                        T,
-                                        ScalarLayout,
-                                        camp::idx_seq<>>;
+// Convenience to describe ScalarTensors
+template<typename T>
+using ScalarRegister =
+    TensorRegister<scalar_register, T, ScalarLayout, camp::idx_seq<>>;
 
 
-} // namespace expt
-} // namespace RAJA
+}  // namespace expt
+}  // namespace RAJA
 
 
 #endif
diff --git a/include/RAJA/pattern/tensor/TensorBlock.hpp b/include/RAJA/pattern/tensor/TensorBlock.hpp
index 0e9869a772..6fc9d48897 100644
--- a/include/RAJA/pattern/tensor/TensorBlock.hpp
+++ b/include/RAJA/pattern/tensor/TensorBlock.hpp
@@ -360,7 +360,6 @@ namespace ET{
 }  // namespace RAJA
 
 
-
 #endif
 
 #endif
diff --git a/include/RAJA/pattern/tensor/TensorIndex.hpp b/include/RAJA/pattern/tensor/TensorIndex.hpp
index 8f152d92ce..7a1105d7a9 100644
--- a/include/RAJA/pattern/tensor/TensorIndex.hpp
+++ b/include/RAJA/pattern/tensor/TensorIndex.hpp
@@ -22,203 +22,204 @@
 #include "RAJA/util/macros.hpp"
 #include "RAJA/index/IndexValue.hpp"
 
-
 namespace RAJA
 {
 namespace expt
 {
 
 
-  template<typename IDX, typename TENSOR_TYPE, camp::idx_t DIM, strip_index_type_t<IDX> INDEX_VALUE, strip_index_type_t<IDX> LENGTH_VALUE>
-  struct StaticTensorIndexInner;
-
-  template<typename INNER_TYPE>
-  struct StaticTensorIndex;
-
-
-  template<typename IDX, typename TENSOR_TYPE, camp::idx_t DIM>
-  class TensorIndex {
-    public:
-      using self_type = TensorIndex<IDX, TENSOR_TYPE, DIM>;
-      using value_type = strip_index_type_t<IDX>;
-      using index_type = IDX;
-      using tensor_type = TENSOR_TYPE;
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      constexpr
-      self_type all(){
-        return self_type(index_type(-1), value_type(-1));
-      }
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      constexpr
-      StaticTensorIndex<StaticTensorIndexInner<IDX,TENSOR_TYPE,DIM,value_type(-1),value_type(-1)>> static_all(){
-        return StaticTensorIndex<StaticTensorIndexInner<IDX,TENSOR_TYPE,DIM,value_type(-1),value_type(-1)>>();
-      }
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      constexpr
-      self_type range(index_type begin, index_type end){
-        return self_type(begin, value_type(stripIndexType(end-begin)));
-      }
-
-      template<value_type TBEGIN, value_type TEND>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      constexpr
-      StaticTensorIndex<StaticTensorIndexInner<IDX,TENSOR_TYPE,DIM,TBEGIN,TEND-TBEGIN>> static_range(){
-        return StaticTensorIndex<StaticTensorIndexInner<IDX,TENSOR_TYPE,DIM,TBEGIN,TEND-TBEGIN>>();
-      }
-
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      TensorIndex() : m_index(index_type(0)), m_length(0) {}
-
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      TensorIndex(RAJA::TypedRangeSegment<IDX> const &seg) :
-      m_index(*seg.begin()), m_length(seg.size())
-      {}
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      TensorIndex(index_type value, value_type length) : m_index(value), m_length(length) {}
-
-      template<typename T, camp::idx_t D>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      TensorIndex(TensorIndex<IDX, T, D> const &c) : m_index(*c), m_length(c.size()) {}
-
-
-      template<value_type IDX_VAL, value_type LEN_VAL>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      TensorIndex(StaticTensorIndex<StaticTensorIndexInner<IDX, TENSOR_TYPE, DIM, IDX_VAL, LEN_VAL>> const RAJA_UNUSED_ARG(&c))
-          : m_index(IDX_VAL)
-          , m_length(LEN_VAL)
-      {}
-
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      index_type const &operator*() const {
-        return m_index;
-      }
-
-      // used in strip_by_value as a static cast
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      explicit operator index_type() const {
-        // return does not matter, but suppresses no-return warnings
-        return m_index;
-      }
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      index_type begin() const {
-        return m_index;
-      }
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      value_type size() const {
-        return m_length;
-      }
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      value_type dim() const {
-        return DIM;
-      }
-
-    private:
-      index_type m_index;
-      value_type m_length;
-  };
-
-
-  template<typename IDX, typename TENSOR_TYPE, camp::idx_t DIM, strip_index_type_t<IDX> INDEX_VALUE, strip_index_type_t<IDX> LENGTH_VALUE>
-  struct StaticTensorIndex<StaticTensorIndexInner<IDX,TENSOR_TYPE,DIM,INDEX_VALUE,LENGTH_VALUE>> {
-
-      using base_type  = TensorIndex<IDX,TENSOR_TYPE,DIM>;
-      using value_type = strip_index_type_t<IDX>;
-      using index_type = IDX;
-      using tensor_type = TENSOR_TYPE;
-
-      static const index_type s_index  = INDEX_VALUE;
-      static const index_type s_length = LENGTH_VALUE;
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr operator base_type() {
-        return base_type(s_index,s_length);
-      }
-    
-  };
-
-
-
-  /*!
-   * Index that specifies the starting element index of a Vector
-   */
-  template<typename IDX, typename VECTOR_TYPE>
-  using VectorIndex =  TensorIndex<IDX, VECTOR_TYPE, 0>;
-
-  /*!
-   * Index that specifies the starting Row index of a matrix
-   */
-  template<typename IDX, typename MATRIX_TYPE>
-  using RowIndex =  TensorIndex<IDX, MATRIX_TYPE, 0>;
-
-  /*!
-   * Index that specifies the starting Column index of a matrix
-   */
-  template<typename IDX, typename MATRIX_TYPE>
-  using ColIndex =  TensorIndex<IDX, MATRIX_TYPE, 1>;
-
-
-  /*!
-   * Converts a Row index to a Column index
-   */
-  template<typename IDX, typename MATRIX_TYPE>
+template<typename IDX,
+         typename TENSOR_TYPE,
+         camp::idx_t DIM,
+         strip_index_type_t<IDX> INDEX_VALUE,
+         strip_index_type_t<IDX> LENGTH_VALUE>
+struct StaticTensorIndexInner;
+
+template<typename INNER_TYPE>
+struct StaticTensorIndex;
+
+template<typename IDX, typename TENSOR_TYPE, camp::idx_t DIM>
+class TensorIndex
+{
+public:
+  using self_type   = TensorIndex<IDX, TENSOR_TYPE, DIM>;
+  using value_type  = strip_index_type_t<IDX>;
+  using index_type  = IDX;
+  using tensor_type = TENSOR_TYPE;
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static constexpr self_type all()
+  {
+    return self_type(index_type(-1), value_type(-1));
+  }
+
+  RAJA_INLINE
+
   RAJA_HOST_DEVICE
+  static constexpr StaticTensorIndex<StaticTensorIndexInner<IDX,
+                                                            TENSOR_TYPE,
+                                                            DIM,
+                                                            value_type(-1),
+                                                            value_type(-1)>>
+  static_all()
+  {
+    return StaticTensorIndex<StaticTensorIndexInner<
+        IDX, TENSOR_TYPE, DIM, value_type(-1), value_type(-1)>>();
+  }
+
   RAJA_INLINE
-  constexpr
-  ColIndex<IDX, MATRIX_TYPE> toColIndex(RowIndex<IDX, MATRIX_TYPE> const &r){
-    return ColIndex<IDX, MATRIX_TYPE>(*r, r.size());
+
+  RAJA_HOST_DEVICE
+  static constexpr self_type range(index_type begin, index_type end)
+  {
+    return self_type(begin, value_type(stripIndexType(end - begin)));
+  }
+
+  template<value_type TBEGIN, value_type TEND>
+  RAJA_INLINE RAJA_HOST_DEVICE static constexpr StaticTensorIndex<
+      StaticTensorIndexInner<IDX, TENSOR_TYPE, DIM, TBEGIN, TEND - TBEGIN>>
+  static_range()
+  {
+    return StaticTensorIndex<
+        StaticTensorIndexInner<IDX, TENSOR_TYPE, DIM, TBEGIN, TEND - TBEGIN>>();
   }
 
-  /*!
-   * Converts a Column index to a Row index
-   */
-  template<typename IDX, typename MATRIX_TYPE>
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  constexpr TensorIndex() : m_index(index_type(0)), m_length(0) {}
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  constexpr TensorIndex(RAJA::TypedRangeSegment<IDX> const& seg)
+      : m_index(*seg.begin()),
+        m_length(seg.size())
+  {}
+
+  RAJA_INLINE
+
   RAJA_HOST_DEVICE
+  constexpr TensorIndex(index_type value, value_type length)
+      : m_index(value),
+        m_length(length)
+  {}
+
+  template<typename T, camp::idx_t D>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr TensorIndex(
+      TensorIndex<IDX, T, D> const& c)
+      : m_index(*c),
+        m_length(c.size())
+  {}
+
+  template<value_type IDX_VAL, value_type LEN_VAL>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr TensorIndex(
+      StaticTensorIndex<
+          StaticTensorIndexInner<IDX, TENSOR_TYPE, DIM, IDX_VAL, LEN_VAL>> const
+          RAJA_UNUSED_ARG(&c))
+      : m_index(IDX_VAL),
+        m_length(LEN_VAL)
+  {}
+
   RAJA_INLINE
-  constexpr
-  RowIndex<IDX, MATRIX_TYPE> toRowIndex(ColIndex<IDX, MATRIX_TYPE> const &c){
-    return RowIndex<IDX, MATRIX_TYPE>(*c, c.size());
+
+  RAJA_HOST_DEVICE
+  constexpr index_type const& operator*() const { return m_index; }
+
+  // used in strip_by_value as a static cast
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  constexpr explicit operator index_type() const
+  {
+    // return does not matter, but suppresses no-return warnings
+    return m_index;
   }
 
-} // namespace expt
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  constexpr index_type begin() const { return m_index; }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  constexpr value_type size() const { return m_length; }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  constexpr value_type dim() const { return DIM; }
+
+private:
+  index_type m_index;
+  value_type m_length;
+};
+
+template<typename IDX,
+         typename TENSOR_TYPE,
+         camp::idx_t DIM,
+         strip_index_type_t<IDX> INDEX_VALUE,
+         strip_index_type_t<IDX> LENGTH_VALUE>
+struct StaticTensorIndex<
+    StaticTensorIndexInner<IDX, TENSOR_TYPE, DIM, INDEX_VALUE, LENGTH_VALUE>>
+{
+
+  using base_type   = TensorIndex<IDX, TENSOR_TYPE, DIM>;
+  using value_type  = strip_index_type_t<IDX>;
+  using index_type  = IDX;
+  using tensor_type = TENSOR_TYPE;
+
+  static const index_type s_index  = INDEX_VALUE;
+  static const index_type s_length = LENGTH_VALUE;
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  constexpr operator base_type() { return base_type(s_index, s_length); }
+};
+
+/*!
+ * Index that specifies the starting element index of a Vector
+ */
+template<typename IDX, typename VECTOR_TYPE>
+using VectorIndex = TensorIndex<IDX, VECTOR_TYPE, 0>;
+
+/*!
+ * Index that specifies the starting Row index of a matrix
+ */
+template<typename IDX, typename MATRIX_TYPE>
+using RowIndex = TensorIndex<IDX, MATRIX_TYPE, 0>;
+
+/*!
+ * Index that specifies the starting Column index of a matrix
+ */
+template<typename IDX, typename MATRIX_TYPE>
+using ColIndex = TensorIndex<IDX, MATRIX_TYPE, 1>;
+
+/*!
+ * Converts a Row index to a Column index
+ */
+template<typename IDX, typename MATRIX_TYPE>
+RAJA_HOST_DEVICE RAJA_INLINE constexpr ColIndex<IDX, MATRIX_TYPE> toColIndex(
+    RowIndex<IDX, MATRIX_TYPE> const& r)
+{
+  return ColIndex<IDX, MATRIX_TYPE>(*r, r.size());
+}
+
+/*!
+ * Converts a Column index to a Row index
+ */
+template<typename IDX, typename MATRIX_TYPE>
+RAJA_HOST_DEVICE RAJA_INLINE constexpr RowIndex<IDX, MATRIX_TYPE> toRowIndex(
+    ColIndex<IDX, MATRIX_TYPE> const& c)
+{
+  return RowIndex<IDX, MATRIX_TYPE>(*c, c.size());
+}
+
+}  // namespace expt
 }  // namespace RAJA
 
 #include "RAJA/pattern/tensor/internal/TensorIndexTraits.hpp"
diff --git a/include/RAJA/pattern/tensor/TensorLayout.hpp b/include/RAJA/pattern/tensor/TensorLayout.hpp
index 376d6b905a..c6a71584d5 100644
--- a/include/RAJA/pattern/tensor/TensorLayout.hpp
+++ b/include/RAJA/pattern/tensor/TensorLayout.hpp
@@ -28,67 +28,58 @@ namespace expt
 {
 
 
-  template<camp::idx_t ... DIM_SEQ>
-  struct TensorLayout : public camp::idx_seq<DIM_SEQ...>
-  {
+template<camp::idx_t... DIM_SEQ>
+struct TensorLayout : public camp::idx_seq<DIM_SEQ...>
+{
 
-      using seq_t = camp::idx_seq<DIM_SEQ...>;
+  using seq_t = camp::idx_seq<DIM_SEQ...>;
 
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      constexpr
-      bool is_column_major(){
-        return false;
-      }
+  RAJA_INLINE
 
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      constexpr
-      bool is_row_major(){
-        return false;
-      }
+  RAJA_HOST_DEVICE
+  static constexpr bool is_column_major() { return false; }
 
-  };
+  RAJA_INLINE
 
+  RAJA_HOST_DEVICE
+  static constexpr bool is_row_major() { return false; }
+};
 
-  // specialization for Matrix layouts, where column vs row major matters
-  template<camp::idx_t S2, camp::idx_t S1>
-  struct TensorLayout<S2, S1> : public camp::idx_seq<S2, S1>
-  {
-      using seq_t = camp::idx_seq<S2, S1>;
+// specialization for Matrix layouts, where column vs row major matters
+template<camp::idx_t S2, camp::idx_t S1>
+struct TensorLayout<S2, S1> : public camp::idx_seq<S2, S1>
+{
+  using seq_t = camp::idx_seq<S2, S1>;
 
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      constexpr
-      bool is_column_major(){
-        return S1 == 0; // Rows are stride-1
-      }
+  RAJA_INLINE
 
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      constexpr
-      bool is_row_major(){
-        return S1 == 1; // Columns are stride-1
-      }
-  };
+  RAJA_HOST_DEVICE
+  static constexpr bool is_column_major()
+  {
+    return S1 == 0;  // Rows are stride-1
+  }
+
+  RAJA_INLINE
 
+  RAJA_HOST_DEVICE
+  static constexpr bool is_row_major()
+  {
+    return S1 == 1;  // Columns are stride-1
+  }
+};
 
-  // 0d tensor (scalar) layout
-  using ScalarLayout = TensorLayout<>;
+// 0d tensor (scalar) layout
+using ScalarLayout = TensorLayout<>;
 
-  // 1d tensor (vector) layout
-  using VectorLayout = TensorLayout<0>;
+// 1d tensor (vector) layout
+using VectorLayout = TensorLayout<0>;
 
-  // 2d tensor (matrix) layouts
-  using RowMajorLayout = TensorLayout<0, 1>;
-  using ColMajorLayout = TensorLayout<1, 0>;
+// 2d tensor (matrix) layouts
+using RowMajorLayout = TensorLayout<0, 1>;
+using ColMajorLayout = TensorLayout<1, 0>;
 
 
-} // namespace expt
+}  // namespace expt
 }  // namespace RAJA
 
 
diff --git a/include/RAJA/pattern/tensor/TensorRegister.hpp b/include/RAJA/pattern/tensor/TensorRegister.hpp
index d410f46fb7..b186cbd92a 100644
--- a/include/RAJA/pattern/tensor/TensorRegister.hpp
+++ b/include/RAJA/pattern/tensor/TensorRegister.hpp
@@ -28,82 +28,91 @@
 
 namespace RAJA
 {
-namespace internal {
-namespace expt {
-    class TensorRegisterConcreteBase;
-}
+namespace internal
+{
+namespace expt
+{
+class TensorRegisterConcreteBase;
 }
+}  // namespace internal
 
 namespace expt
 {
 
 
-  template<typename REGISTER_POLICY,
-           typename T,
-           typename LAYOUT,
-           typename SIZES>
-  class TensorRegister;
+template<typename REGISTER_POLICY, typename T, typename LAYOUT, typename SIZES>
+class TensorRegister;
 
+/*
+ * Overload for:    arithmetic + TensorRegister
 
-  /*
-   * Overload for:    arithmetic + TensorRegister
-
-   */
-  template<typename LEFT, typename RIGHT,
+ */
+template<
+    typename LEFT,
+    typename RIGHT,
     typename std::enable_if<std::is_arithmetic<LEFT>::value, bool>::type = true,
-    typename std::enable_if<std::is_base_of<RAJA::internal::expt::TensorRegisterConcreteBase, RIGHT>::value, bool>::type = true>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  RIGHT operator+(LEFT const &lhs, RIGHT const &rhs)
-  {
-    return RIGHT(lhs).add(rhs);
-  }
-
-  /*
-   * Overload for:    arithmetic - TensorRegister
-
-   */
-  template<typename LEFT, typename RIGHT,
+    typename std::enable_if<
+        std::is_base_of<RAJA::internal::expt::TensorRegisterConcreteBase,
+                        RIGHT>::value,
+        bool>::type = true>
+RAJA_INLINE RAJA_HOST_DEVICE RIGHT operator+(LEFT const& lhs, RIGHT const& rhs)
+{
+  return RIGHT(lhs).add(rhs);
+}
+
+/*
+ * Overload for:    arithmetic - TensorRegister
+
+ */
+template<
+    typename LEFT,
+    typename RIGHT,
     typename std::enable_if<std::is_arithmetic<LEFT>::value, bool>::type = true,
-    typename std::enable_if<std::is_base_of<RAJA::internal::expt::TensorRegisterConcreteBase, RIGHT>::value, bool>::type = true>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  RIGHT operator-(LEFT const &lhs, RIGHT const &rhs)
-  {
-    return RIGHT(lhs).subtract(rhs);
-  }
-
-  /*
-   * Overload for:    arithmetic * TensorRegister
-
-   */
-  template<typename LEFT, typename RIGHT,
+    typename std::enable_if<
+        std::is_base_of<RAJA::internal::expt::TensorRegisterConcreteBase,
+                        RIGHT>::value,
+        bool>::type = true>
+RAJA_INLINE RAJA_HOST_DEVICE RIGHT operator-(LEFT const& lhs, RIGHT const& rhs)
+{
+  return RIGHT(lhs).subtract(rhs);
+}
+
+/*
+ * Overload for:    arithmetic * TensorRegister
+
+ */
+template<
+    typename LEFT,
+    typename RIGHT,
     typename std::enable_if<std::is_arithmetic<LEFT>::value, bool>::type = true,
-    typename std::enable_if<std::is_base_of<RAJA::internal::expt::TensorRegisterConcreteBase, RIGHT>::value, bool>::type = true>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  RIGHT operator*(LEFT const &lhs, RIGHT const &rhs)
-  {
-    return rhs.scale(lhs);
-  }
-
-  /*
-   * Overload for:    arithmetic / TensorRegister
-
-   */
-  template<typename LEFT, typename RIGHT,
+    typename std::enable_if<
+        std::is_base_of<RAJA::internal::expt::TensorRegisterConcreteBase,
+                        RIGHT>::value,
+        bool>::type = true>
+RAJA_INLINE RAJA_HOST_DEVICE RIGHT operator*(LEFT const& lhs, RIGHT const& rhs)
+{
+  return rhs.scale(lhs);
+}
+
+/*
+ * Overload for:    arithmetic / TensorRegister
+
+ */
+template<
+    typename LEFT,
+    typename RIGHT,
     typename std::enable_if<std::is_arithmetic<LEFT>::value, bool>::type = true,
-    typename std::enable_if<std::is_base_of<RAJA::internal::expt::TensorRegisterConcreteBase, RIGHT>::value, bool>::type = true>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  RIGHT operator/(LEFT const &lhs, RIGHT const &rhs)
-  {
-    return RIGHT(lhs).divide(rhs);
-  }
-
-} // namespace expt
-}  // namespace RAJA
+    typename std::enable_if<
+        std::is_base_of<RAJA::internal::expt::TensorRegisterConcreteBase,
+                        RIGHT>::value,
+        bool>::type = true>
+RAJA_INLINE RAJA_HOST_DEVICE RIGHT operator/(LEFT const& lhs, RIGHT const& rhs)
+{
+  return RIGHT(lhs).divide(rhs);
+}
 
+}  // namespace expt
+}  // namespace RAJA
 
 #include "RAJA/pattern/tensor/internal/TensorRegisterBase.hpp"
 
diff --git a/include/RAJA/pattern/tensor/VectorRegister.hpp b/include/RAJA/pattern/tensor/VectorRegister.hpp
index afab05658f..d4e0002ebd 100644
--- a/include/RAJA/pattern/tensor/VectorRegister.hpp
+++ b/include/RAJA/pattern/tensor/VectorRegister.hpp
@@ -24,16 +24,15 @@ namespace RAJA
 {
 namespace expt
 {
-  // Convenience to describe VectorTensors
-  template<typename T, typename REGISTER_POLICY = default_register, camp::idx_t NUM_ELEM = Register<T,REGISTER_POLICY>::s_num_elem>
-  using VectorRegister = TensorRegister<REGISTER_POLICY,
-                                        T,
-                                        VectorLayout,
-                                        camp::idx_seq<NUM_ELEM> >;
-} // namespace expt
-
-} // namespace RAJA
-
+// Convenience to describe VectorTensors
+template<typename T,
+         typename REGISTER_POLICY = default_register,
+         camp::idx_t NUM_ELEM     = Register<T, REGISTER_POLICY>::s_num_elem>
+using VectorRegister =
+    TensorRegister<REGISTER_POLICY, T, VectorLayout, camp::idx_seq<NUM_ELEM>>;
+}  // namespace expt
+
+}  // namespace RAJA
 
 
 #endif
diff --git a/include/RAJA/pattern/tensor/internal/ET/BinaryOperator.hpp b/include/RAJA/pattern/tensor/internal/ET/BinaryOperator.hpp
index 953f4fd4a0..e195494f71 100644
--- a/include/RAJA/pattern/tensor/internal/ET/BinaryOperator.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/BinaryOperator.hpp
@@ -25,7 +25,6 @@
 #include "RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp"
 #include "RAJA/pattern/tensor/internal/ET/BinaryOperatorTraits.hpp"
 
-
 namespace RAJA
 {
 namespace internal
@@ -34,132 +33,149 @@ namespace expt
 {
 
 
-  namespace ET
+namespace ET
+{
+
+
+template<typename OPERATOR, typename LEFT_OPERAND, typename RIGHT_OPERAND>
+class TensorBinaryOperator
+    : public TensorExpressionBase<
+          TensorBinaryOperator<OPERATOR, LEFT_OPERAND, RIGHT_OPERAND>>
+{
+public:
+  using self_type = TensorBinaryOperator<OPERATOR, LEFT_OPERAND, RIGHT_OPERAND>;
+  using operator_type      = OPERATOR;
+  using left_operand_type  = LEFT_OPERAND;
+  using right_operand_type = RIGHT_OPERAND;
+
+  using element_type = typename LEFT_OPERAND::element_type;
+  using index_type   = typename LEFT_OPERAND::index_type;
+
+  using operator_traits = OperatorTraits<LEFT_OPERAND, RIGHT_OPERAND>;
+  using result_type     = typename operator_traits::result_type;
+
+  static constexpr camp::idx_t s_num_dims = operator_traits::s_num_dims;
+
+private:
+  left_operand_type m_left_operand;
+  right_operand_type m_right_operand;
+
+public:
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  TensorBinaryOperator(left_operand_type const& left,
+                       right_operand_type const& right)
+      : m_left_operand {left},
+        m_right_operand {right}
+  {}
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  constexpr auto getDimSize(camp::idx_t dim) const
+      -> decltype(operator_traits::getDimSize(dim,
+                                              m_left_operand,
+                                              m_right_operand))
+  {
+    return operator_traits::getDimSize(dim, m_left_operand, m_right_operand);
+  }
+
+  template<typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE auto eval(TILE_TYPE const& tile) const
+      -> decltype(operator_type::eval(m_left_operand.eval(tile),
+                                      m_right_operand.eval(tile)))
   {
+    return operator_type::eval(m_left_operand.eval(tile),
+                               m_right_operand.eval(tile));
+  }
 
+  RAJA_INLINE
 
-    template<typename OPERATOR, typename LEFT_OPERAND, typename RIGHT_OPERAND>
-    class TensorBinaryOperator :
-        public TensorExpressionBase<TensorBinaryOperator<OPERATOR, LEFT_OPERAND, RIGHT_OPERAND>>
-    {
-      public:
-        using self_type = TensorBinaryOperator<OPERATOR, LEFT_OPERAND, RIGHT_OPERAND>;
-        using operator_type = OPERATOR;
-        using left_operand_type = LEFT_OPERAND;
-        using right_operand_type = RIGHT_OPERAND;
-
-        using element_type = typename LEFT_OPERAND::element_type;
-        using index_type = typename LEFT_OPERAND::index_type;
-
-        using operator_traits = OperatorTraits<LEFT_OPERAND, RIGHT_OPERAND>;
-        using result_type = typename operator_traits::result_type;
-
-        static constexpr camp::idx_t s_num_dims =
-            operator_traits::s_num_dims;
-
-      private:
-        left_operand_type m_left_operand;
-        right_operand_type m_right_operand;
-
-      public:
-
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorBinaryOperator(left_operand_type const &left, right_operand_type const &right) :
-        m_left_operand{left}, m_right_operand{right}
-        {}
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        auto getDimSize(camp::idx_t dim) const ->
-        decltype(operator_traits::getDimSize(dim, m_left_operand, m_right_operand))
-        {
-          return operator_traits::getDimSize(dim, m_left_operand, m_right_operand);
-        }
-
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        auto eval(TILE_TYPE const &tile) const ->
-          decltype(operator_type::eval(m_left_operand.eval(tile), m_right_operand.eval(tile)))
-        {
-          return operator_type::eval(m_left_operand.eval(tile), m_right_operand.eval(tile));
-        }
-
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void print_ast() const {
-          operator_type::print_ast();
-          printf("[");
-          operator_type::print_ast();
-          printf("](");
-          m_left_operand.print_ast();
-          printf(", ");
-          m_right_operand.print_ast();
-          printf(")");
-        }
-
-
-    };
-
-
-
-
-    /*
-     * Overload for:    arithmetic + tensorexpression
-
-     */
-    template<typename LEFT_OPERAND, typename RIGHT_OPERAND,
-      typename std::enable_if<std::is_arithmetic<LEFT_OPERAND>::value, bool>::type = true,
-      typename std::enable_if<std::is_base_of<TensorExpressionConcreteBase, RIGHT_OPERAND>::value, bool>::type = true>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    auto operator+(LEFT_OPERAND const &left, RIGHT_OPERAND const &right) ->
-    TensorAdd<typename NormalizeOperandHelper<LEFT_OPERAND>::return_type, RIGHT_OPERAND>
-    {
-      return TensorAdd<typename NormalizeOperandHelper<LEFT_OPERAND>::return_type, RIGHT_OPERAND>(NormalizeOperandHelper<LEFT_OPERAND>::normalize(left), right);
-    }
-
-
-    /*
-     * Overload for:    arithmetic - tensorexpression
-
-     */
-    template<typename LEFT_OPERAND, typename RIGHT_OPERAND,
-      typename std::enable_if<std::is_arithmetic<LEFT_OPERAND>::value, bool>::type = true,
-      typename std::enable_if<std::is_base_of<TensorExpressionConcreteBase, RIGHT_OPERAND>::value, bool>::type = true>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    auto operator-(LEFT_OPERAND const &left, RIGHT_OPERAND const &right) ->
-    TensorSubtract<typename NormalizeOperandHelper<LEFT_OPERAND>::return_type, RIGHT_OPERAND>
-    {
-      return TensorSubtract<typename NormalizeOperandHelper<LEFT_OPERAND>::return_type, RIGHT_OPERAND>(NormalizeOperandHelper<LEFT_OPERAND>::normalize(left), right);
-    }
+  RAJA_HOST_DEVICE
+  void print_ast() const
+  {
+    operator_type::print_ast();
+    printf("[");
+    operator_type::print_ast();
+    printf("](");
+    m_left_operand.print_ast();
+    printf(", ");
+    m_right_operand.print_ast();
+    printf(")");
+  }
+};
+
+/*
+ * Overload for:    arithmetic + tensorexpression
 
+ */
+template<typename LEFT_OPERAND,
+         typename RIGHT_OPERAND,
+         typename std::enable_if<std::is_arithmetic<LEFT_OPERAND>::value,
+                                 bool>::type = true,
+         typename std::enable_if<std::is_base_of<TensorExpressionConcreteBase,
+                                                 RIGHT_OPERAND>::value,
+                                 bool>::type = true>
+RAJA_INLINE RAJA_HOST_DEVICE auto operator+(LEFT_OPERAND const& left,
+                                            RIGHT_OPERAND const& right)
+    -> TensorAdd<typename NormalizeOperandHelper<LEFT_OPERAND>::return_type,
+                 RIGHT_OPERAND>
+{
+  return TensorAdd<typename NormalizeOperandHelper<LEFT_OPERAND>::return_type,
+                   RIGHT_OPERAND>(
+      NormalizeOperandHelper<LEFT_OPERAND>::normalize(left), right);
+}
+
+/*
+ * Overload for:    arithmetic - tensorexpression
+
+ */
+template<typename LEFT_OPERAND,
+         typename RIGHT_OPERAND,
+         typename std::enable_if<std::is_arithmetic<LEFT_OPERAND>::value,
+                                 bool>::type = true,
+         typename std::enable_if<std::is_base_of<TensorExpressionConcreteBase,
+                                                 RIGHT_OPERAND>::value,
+                                 bool>::type = true>
+RAJA_INLINE RAJA_HOST_DEVICE auto operator-(LEFT_OPERAND const& left,
+                                            RIGHT_OPERAND const& right)
+    -> TensorSubtract<
+        typename NormalizeOperandHelper<LEFT_OPERAND>::return_type,
+        RIGHT_OPERAND>
+{
+  return TensorSubtract<
+      typename NormalizeOperandHelper<LEFT_OPERAND>::return_type,
+      RIGHT_OPERAND>(NormalizeOperandHelper<LEFT_OPERAND>::normalize(left),
+                     right);
+}
 
 //    /*
 //     * Overload for:    arithmetic / tensorexpression
 //
 //     */
 //    template<typename LEFT_OPERAND, typename RIGHT_OPERAND,
-//      typename std::enable_if<std::is_arithmetic<LEFT_OPERAND>::value, bool>::type = true,
-//      typename std::enable_if<std::is_base_of<TensorExpressionConcreteBase, RIGHT_OPERAND>::value, bool>::type = true>
+//      typename std::enable_if<std::is_arithmetic<LEFT_OPERAND>::value,
+//      bool>::type = true, typename
+//      std::enable_if<std::is_base_of<TensorExpressionConcreteBase,
+//      RIGHT_OPERAND>::value, bool>::type = true>
 //    RAJA_INLINE
 //    RAJA_HOST_DEVICE
 //    auto operator/(LEFT_OPERAND const &left, RIGHT_OPERAND const &right) ->
-//    TensorDivide<typename NormalizeOperandHelper<LEFT_OPERAND>::return_type, RIGHT_OPERAND>
+//    TensorDivide<typename NormalizeOperandHelper<LEFT_OPERAND>::return_type,
+//    RIGHT_OPERAND>
 //    {
-//      return TensorDivide<typename NormalizeOperandHelper<LEFT_OPERAND>::return_type, RIGHT_OPERAND>(NormalizeOperandHelper<LEFT_OPERAND>::normalize(left), right);
+//      return TensorDivide<typename
+//      NormalizeOperandHelper<LEFT_OPERAND>::return_type,
+//      RIGHT_OPERAND>(NormalizeOperandHelper<LEFT_OPERAND>::normalize(left),
+//      right);
 //    }
 
 
-  } // namespace ET
+}  // namespace ET
 
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/BinaryOperatorTraits.hpp b/include/RAJA/pattern/tensor/internal/ET/BinaryOperatorTraits.hpp
index a1450bf19f..13408f7b26 100644
--- a/include/RAJA/pattern/tensor/internal/ET/BinaryOperatorTraits.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/BinaryOperatorTraits.hpp
@@ -27,159 +27,139 @@ namespace expt
 {
 
 
-  namespace ET
+namespace ET
+{
+
+struct TensorOperatorAdd
+{
+
+  template<typename LEFT, typename RIGHT>
+  RAJA_INLINE RAJA_HOST_DEVICE static auto eval(LEFT const& left,
+                                                RIGHT const& right)
+      -> decltype(left + right)
+  {
+    return left + right;
+  }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static void print_ast() { printf("Add"); }
+};
+
+struct TensorOperatorSubtract
+{
+
+  template<typename LEFT, typename RIGHT>
+  RAJA_INLINE RAJA_HOST_DEVICE static auto eval(LEFT const& left,
+                                                RIGHT const& right)
+      -> decltype(left - right)
   {
+    return left - right;
+  }
 
-    struct TensorOperatorAdd
-    {
-
-      template<typename LEFT, typename RIGHT>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      auto eval(LEFT const &left, RIGHT const &right) ->
-        decltype(left + right)
-      {
-        return left + right;
-      }
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      void print_ast(){
-        printf("Add");
-      }
-    };
-
-    struct TensorOperatorSubtract
-    {
-
-      template<typename LEFT, typename RIGHT>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      auto eval(LEFT const &left, RIGHT const &right) ->
-        decltype(left - right)
-      {
-        return left - right;
-      }
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      void print_ast(){
-        printf("Subtract");
-      }
-    };
-
-
-
-
+  RAJA_INLINE
 
-    template<typename OPERATOR, typename LEFT_OPERAND, typename RIGHT_OPERAND>
-    class TensorBinaryOperator;
+  RAJA_HOST_DEVICE
+  static void print_ast() { printf("Subtract"); }
+};
 
-    template<typename LHS, typename RHS>
-    using TensorAdd = TensorBinaryOperator<TensorOperatorAdd, LHS, RHS>;
 
-    template<typename LHS, typename RHS>
-    using TensorSubtract = TensorBinaryOperator<TensorOperatorSubtract, LHS, RHS>;
+template<typename OPERATOR, typename LEFT_OPERAND, typename RIGHT_OPERAND>
+class TensorBinaryOperator;
 
+template<typename LHS, typename RHS>
+using TensorAdd = TensorBinaryOperator<TensorOperatorAdd, LHS, RHS>;
 
+template<typename LHS, typename RHS>
+using TensorSubtract = TensorBinaryOperator<TensorOperatorSubtract, LHS, RHS>;
 
+/*!
+ * Provides default operations for add, subtract and divide
+ *
+ * For the most part, this is just element wise operations between
+ * compatible tensors.
+ *
+ * There are specializations that handle when one operand is a scalar
+ */
+template<typename LHS_TYPE, typename RHS_TYPE, class ENABLE = void>
+struct OperatorTraits
+{
 
-    /*!
-     * Provides default operations for add, subtract and divide
-     *
-     * For the most part, this is just element wise operations between
-     * compatible tensors.
-     *
-     * There are specializations that handle when one operand is a scalar
-     */
-    template<typename LHS_TYPE, typename RHS_TYPE, class ENABLE = void>
-    struct OperatorTraits {
+  using result_type                       = typename LHS_TYPE::result_type;
+  static constexpr camp::idx_t s_num_dims = LHS_TYPE::s_num_dims;
 
-        using result_type = typename LHS_TYPE::result_type;
-        static constexpr camp::idx_t s_num_dims = LHS_TYPE::s_num_dims;
+  RAJA_INLINE
 
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        void print_ast() {
-          printf("Elemental");
-        }
+  RAJA_HOST_DEVICE
+  static void print_ast() { printf("Elemental"); }
 
+  RAJA_INLINE
 
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        int getDimSize(int dim, LHS_TYPE const &lhs, RHS_TYPE const &rhs) {
-          return dim == 0 ? lhs.getDimSize(0) : rhs.getDimSize(1);
-        }
+  RAJA_HOST_DEVICE
+  static int getDimSize(int dim, LHS_TYPE const& lhs, RHS_TYPE const& rhs)
+  {
+    return dim == 0 ? lhs.getDimSize(0) : rhs.getDimSize(1);
+  }
+};
 
-    };
+/*!
+ * Specialization when the left operand is a scalar
+ */
+template<typename LHS_TYPE, typename RHS_TYPE>
+struct OperatorTraits<LHS_TYPE,
+                      RHS_TYPE,
+                      typename std::enable_if<LHS_TYPE::s_num_dims == 0>::type>
+{
 
-    /*!
-     * Specialization when the left operand is a scalar
-     */
-    template<typename LHS_TYPE, typename RHS_TYPE>
-    struct OperatorTraits<LHS_TYPE, RHS_TYPE,
-    typename std::enable_if<LHS_TYPE::s_num_dims == 0>::type>
-    {
+  using result_type                       = typename RHS_TYPE::result_type;
+  static constexpr camp::idx_t s_num_dims = RHS_TYPE::s_num_dims;
 
-        using result_type = typename RHS_TYPE::result_type;
-        static constexpr camp::idx_t s_num_dims = RHS_TYPE::s_num_dims;
+  RAJA_INLINE
 
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        void print_ast() {
-          printf("Scalar");
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        int getDimSize(int dim, LHS_TYPE const &, RHS_TYPE const &rhs) {
-          return rhs.getDimSize(dim);
-        }
+  RAJA_HOST_DEVICE
+  static void print_ast() { printf("Scalar"); }
 
-    };
+  RAJA_INLINE
 
-    /*!
-     * Specialization when the right operand is a scalar
-     */
-    template<typename LHS_TYPE, typename RHS_TYPE>
-    struct OperatorTraits<LHS_TYPE, RHS_TYPE,
-    typename std::enable_if<RHS_TYPE::s_num_dims == 0>::type>
-    {
+  RAJA_HOST_DEVICE
+  static int getDimSize(int dim, LHS_TYPE const&, RHS_TYPE const& rhs)
+  {
+    return rhs.getDimSize(dim);
+  }
+};
 
-        using result_type = typename LHS_TYPE::result_type;
-        static constexpr camp::idx_t s_num_dims = LHS_TYPE::s_num_dims;
+/*!
+ * Specialization when the right operand is a scalar
+ */
+template<typename LHS_TYPE, typename RHS_TYPE>
+struct OperatorTraits<LHS_TYPE,
+                      RHS_TYPE,
+                      typename std::enable_if<RHS_TYPE::s_num_dims == 0>::type>
+{
 
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        void print_ast() {
-          printf("Scalar");
-        }
+  using result_type                       = typename LHS_TYPE::result_type;
+  static constexpr camp::idx_t s_num_dims = LHS_TYPE::s_num_dims;
 
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        int getDimSize(int dim, LHS_TYPE const &lhs, RHS_TYPE const &) {
-          return lhs.getDimSize(dim);
-        }
+  RAJA_INLINE
 
+  RAJA_HOST_DEVICE
+  static void print_ast() { printf("Scalar"); }
 
+  RAJA_INLINE
 
-    };
+  RAJA_HOST_DEVICE
+  static int getDimSize(int dim, LHS_TYPE const& lhs, RHS_TYPE const&)
+  {
+    return lhs.getDimSize(dim);
+  }
+};
 
 
-  } // namespace ET
+}  // namespace ET
 
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/BlockLiteral.hpp b/include/RAJA/pattern/tensor/internal/ET/BlockLiteral.hpp
index 210414eaec..07278c3a66 100644
--- a/include/RAJA/pattern/tensor/internal/ET/BlockLiteral.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/BlockLiteral.hpp
@@ -25,8 +25,6 @@
 #include "RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp"
 #include "RAJA/pattern/tensor/internal/TensorRef.hpp"
 
-
-
 namespace RAJA
 {
 namespace internal
@@ -35,93 +33,92 @@ namespace expt
 {
 
 
-  namespace ET
-  {
-
-
-    /*!
-     * Temporary n-dimensional memory.
-     *
-     * STORAGE_TYPE defines the memory storage
-     * TENSOR_TYPE defines what kind of tensor is returned by eval()
-     */
-    template<typename STORAGE_TYPE, typename TENSOR_TYPE>
-    class BlockLiteral :  public TensorExpressionBase<BlockLiteral<STORAGE_TYPE, TENSOR_TYPE>> {
-      public:
-        using self_type = BlockLiteral<STORAGE_TYPE, TENSOR_TYPE>;
-        using storage_type = STORAGE_TYPE;
-        using tensor_type = TENSOR_TYPE;
-        using result_type = TENSOR_TYPE;
-        using ref_type = typename STORAGE_TYPE::ref_type;
-        using tile_type = typename ref_type::tile_type;
-        using index_type = camp::idx_t;
+namespace ET
+{
 
-        static constexpr camp::idx_t s_num_dims = result_type::s_num_dims;
 
+/*!
+ * Temporary n-dimensional memory.
+ *
+ * STORAGE_TYPE defines the memory storage
+ * TENSOR_TYPE defines what kind of tensor is returned by eval()
+ */
+template<typename STORAGE_TYPE, typename TENSOR_TYPE>
+class BlockLiteral
+    : public TensorExpressionBase<BlockLiteral<STORAGE_TYPE, TENSOR_TYPE>>
+{
+public:
+  using self_type    = BlockLiteral<STORAGE_TYPE, TENSOR_TYPE>;
+  using storage_type = STORAGE_TYPE;
+  using tensor_type  = TENSOR_TYPE;
+  using result_type  = TENSOR_TYPE;
+  using ref_type     = typename STORAGE_TYPE::ref_type;
+  using tile_type    = typename ref_type::tile_type;
+  using index_type   = camp::idx_t;
 
-      private:
-        storage_type m_storage;
-        tile_type m_tile_origin;
+  static constexpr camp::idx_t s_num_dims = result_type::s_num_dims;
 
-      public:
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        index_type getDimSize(index_type dim) const {
-          return storage_type::s_dim_elem(dim);
-        }
 
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        BlockLiteral(tile_type tile_origin) :
-          m_storage(),
-          m_tile_origin(tile_origin)
-        {
+private:
+  storage_type m_storage;
+  tile_type m_tile_origin;
 
-        }
+public:
+  RAJA_INLINE
 
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        result_type eval(TILE_TYPE const &tile) const {
-          result_type result;
+  RAJA_HOST_DEVICE
+  constexpr index_type getDimSize(index_type dim) const
+  {
+    return storage_type::s_dim_elem(dim);
+  }
 
-          // load result from storage
-          result.load_ref(merge_ref_tile(m_storage.get_ref(), tile - m_tile_origin));
+  RAJA_INLINE
 
-          return result;
-        }
+  RAJA_HOST_DEVICE
+  constexpr BlockLiteral(tile_type tile_origin)
+      : m_storage(),
+        m_tile_origin(tile_origin)
+  {}
 
+  template<typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE result_type eval(TILE_TYPE const& tile) const
+  {
+    result_type result;
 
-        /*!
-         *  Returns a ref that points at this data, shifted by its origin
-         */
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        ref_type get_ref() {
+    // load result from storage
+    result.load_ref(merge_ref_tile(m_storage.get_ref(), tile - m_tile_origin));
 
-          // compute shifited origin ref
-          return shift_tile_origin(m_storage.get_ref(), m_tile_origin);
+    return result;
+  }
 
-        }
+  /*!
+   *  Returns a ref that points at this data, shifted by its origin
+   */
+  RAJA_INLINE
 
+  RAJA_HOST_DEVICE
+  ref_type get_ref()
+  {
 
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void print_ast() const {
-          printf("BlockLiteral()");
-        }
+    // compute shifted origin ref
+    return shift_tile_origin(m_storage.get_ref(), m_tile_origin);
+  }
 
-    };
+  RAJA_INLINE
 
+  RAJA_HOST_DEVICE
+  void print_ast() const { printf("BlockLiteral()"); }
+};
 
 //    /*
-//     * For TensorRegister nodes, we need to wrap this in a constant value ET node
+//     * For TensorRegister nodes, we need to wrap this in a constant value ET
+//     node
 //     */
 //    template<typename RHS>
 //    struct NormalizeOperandHelper<RHS,
-//    typename std::enable_if<std::is_base_of<RAJA::internal::TensorRegisterConcreteBase, RHS>::value>::type>
+//    typename
+//    std::enable_if<std::is_base_of<RAJA::internal::TensorRegisterConcreteBase,
+//    RHS>::value>::type>
 //    {
 //        using return_type = BlockLiteral<RHS>;
 //
@@ -134,10 +131,10 @@ namespace expt
 //        }
 //    };
 
-  } // namespace ET
+}  // namespace ET
 
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp b/include/RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp
index 3e96a63462..02fa6dff6a 100644
--- a/include/RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp
@@ -27,8 +27,7 @@
 #include "RAJA/pattern/tensor/internal/ET/normalizeOperand.hpp"
 #include "RAJA/pattern/tensor/internal/ET/BinaryOperatorTraits.hpp"
 
-
-//#define RAJA_DEBUG_PRINT_ET_AST
+// #define RAJA_DEBUG_PRINT_ET_AST
 
 namespace RAJA
 {
@@ -38,128 +37,124 @@ namespace expt
 {
 
 
-    class TensorRegisterConcreteBase;
+class TensorRegisterConcreteBase;
+
+namespace ET
+{
+
+//
+// forward decls
+//
+
+template<typename TENSOR_REGISTER_TYPE, typename REF_TYPE>
+class TensorLoadStore;
+
+
+template<typename LHS_TYPE, typename RHS_TYPE>
+class TensorMultiply;
+
+template<typename LHS_TYPE, typename RHS_TYPE>
+class TensorDivide;
+
+template<typename TENSOR_TYPE>
+class TensorNegate;
+
+template<typename TENSOR_TYPE>
+class TensorTranspose;
+
+// provides a non-templated base-type for all ET's
+// this allows using things like std::is_base_of
+class TensorExpressionConcreteBase
+{};
+
+template<typename DERIVED_TYPE>
+class TensorExpressionBase : public TensorExpressionConcreteBase
+{
+public:
+  using self_type = DERIVED_TYPE;
+
+private:
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  self_type* getThis() { return static_cast<self_type*>(this); }
 
-  namespace ET
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  constexpr self_type const* getThis() const
+  {
+    return static_cast<self_type const*>(this);
+  }
+
+public:
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  constexpr camp::idx_t getDimBegin(camp::idx_t) const { return 0; }
+
+  RAJA_SUPPRESS_HD_WARN
+  template<typename RHS>
+  RAJA_INLINE RAJA_HOST_DEVICE TensorAdd<self_type, normalize_operand_t<RHS>>
+  operator+(RHS const& rhs) const
+  {
+    return TensorAdd<self_type, normalize_operand_t<RHS>>(
+        *getThis(), normalizeOperand(rhs));
+  }
+
+  RAJA_SUPPRESS_HD_WARN
+  template<typename RHS>
+  RAJA_INLINE RAJA_HOST_DEVICE
+      TensorSubtract<self_type, normalize_operand_t<RHS>>
+      operator-(RHS const& rhs) const
+  {
+    return TensorSubtract<self_type, normalize_operand_t<RHS>>(
+        *getThis(), normalizeOperand(rhs));
+  }
+
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  TensorNegate<self_type> operator-() const
+  {
+    return TensorNegate<self_type>(*getThis());
+  }
+
+  RAJA_SUPPRESS_HD_WARN
+  template<typename RHS>
+  RAJA_INLINE RAJA_HOST_DEVICE
+      TensorMultiply<self_type, normalize_operand_t<RHS>>
+      operator*(RHS const& rhs) const
+  {
+    return TensorMultiply<self_type, normalize_operand_t<RHS>>(
+        *getThis(), normalizeOperand(rhs));
+  }
+
+  RAJA_SUPPRESS_HD_WARN
+  template<typename RHS>
+  RAJA_INLINE RAJA_HOST_DEVICE TensorDivide<self_type, normalize_operand_t<RHS>>
+  operator/(RHS const& rhs) const
+  {
+    return TensorDivide<self_type, normalize_operand_t<RHS>>(
+        *getThis(), normalizeOperand(rhs));
+  }
+
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  TensorTranspose<self_type> transpose() const
   {
+    return TensorTranspose<self_type>(*getThis());
+  }
+};
+
+
+}  // namespace ET
 
-    //
-    // forward decls
-    //
-
-    template<typename TENSOR_REGISTER_TYPE, typename REF_TYPE>
-    class TensorLoadStore;
-
-
-    template<typename LHS_TYPE, typename RHS_TYPE>
-    class TensorMultiply;
-
-    template<typename LHS_TYPE, typename RHS_TYPE>
-    class TensorDivide;
-
-    template<typename TENSOR_TYPE>
-    class TensorNegate;
-
-    template<typename TENSOR_TYPE>
-    class TensorTranspose;
-
-
-
-
-    // provides a non-templated base-type for all ET's
-    // this allows using things like std::is_base_of
-    class TensorExpressionConcreteBase{};
-
-
-    template<typename DERIVED_TYPE>
-    class TensorExpressionBase :public TensorExpressionConcreteBase {
-      public:
-        using self_type = DERIVED_TYPE;
-
-      private:
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        self_type *getThis(){
-          return static_cast<self_type*>(this);
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        self_type const *getThis() const {
-          return static_cast<self_type const*>(this);
-        }
-
-      public:
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        camp::idx_t getDimBegin(camp::idx_t ) const
-        {
-          return 0;
-        }
-
-        RAJA_SUPPRESS_HD_WARN
-        template<typename RHS>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorAdd<self_type, normalize_operand_t<RHS> >
-        operator+(RHS const &rhs) const {
-          return TensorAdd<self_type, normalize_operand_t<RHS>>(*getThis(), normalizeOperand(rhs));
-        }
-
-        RAJA_SUPPRESS_HD_WARN
-        template<typename RHS>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorSubtract<self_type, normalize_operand_t<RHS>>
-        operator-(RHS const &rhs) const {
-          return TensorSubtract<self_type, normalize_operand_t<RHS>>(*getThis(), normalizeOperand(rhs));
-        }
-
-        RAJA_SUPPRESS_HD_WARN
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorNegate<self_type>
-        operator-() const {
-          return TensorNegate<self_type>(*getThis());
-        }
-
-        RAJA_SUPPRESS_HD_WARN
-        template<typename RHS>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorMultiply<self_type, normalize_operand_t<RHS>>
-        operator*(RHS const &rhs) const {
-          return TensorMultiply<self_type, normalize_operand_t<RHS>>(*getThis(), normalizeOperand(rhs));
-        }
-
-        RAJA_SUPPRESS_HD_WARN
-        template<typename RHS>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorDivide<self_type, normalize_operand_t<RHS>>
-        operator/(RHS const &rhs) const {
-          return TensorDivide<self_type, normalize_operand_t<RHS>>(*getThis(), normalizeOperand(rhs));
-        }
-
-        RAJA_SUPPRESS_HD_WARN
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorTranspose<self_type>
-        transpose() const {
-          return TensorTranspose<self_type>(*getThis());
-        }
-
-    };
-
-
-  } // namespace ET
-
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/MultiplyOperator.hpp b/include/RAJA/pattern/tensor/internal/ET/MultiplyOperator.hpp
index e7e7223ce4..e073c561ae 100644
--- a/include/RAJA/pattern/tensor/internal/ET/MultiplyOperator.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/MultiplyOperator.hpp
@@ -19,1211 +19,1238 @@
 #ifndef RAJA_pattern_tensor_ET_MultiplyOperator_HPP
 #define RAJA_pattern_tensor_ET_MultiplyOperator_HPP
 
-
-
 namespace RAJA
 {
 namespace internal
 {
 namespace expt
 {
-  //forward
-  class TensorBlockConcreteBase;
+// forward
+class TensorBlockConcreteBase;
+
+namespace ET
+{
 
 
+/*!
+ * Provides default multiply, multiply add, and multiply subtract
+ * operations.
+ *
+ * If the operands are both matrices, we perform a matrix-matrix multiply.
+ * Otherwise, we perform element-wise operations.
+ */
+template<typename LEFT_OPERAND_TYPE,
+         typename RIGHT_OPERAND_TYPE,
+         class ENABLE = void>
+struct MultiplyOperator
+{
+
+  using result_type = typename LEFT_OPERAND_TYPE::result_type;
+  static constexpr camp::idx_t s_num_dims = LEFT_OPERAND_TYPE::s_num_dims;
 
+  RAJA_INLINE
 
-  namespace ET
+  RAJA_HOST_DEVICE
+  static void print_ast()
   {
+    printf("Elemental(%d,%d)", (int)s_num_dims,
+           (int)RIGHT_OPERAND_TYPE::s_num_dims);
+  }
 
+  RAJA_INLINE
 
-    /*!
-     * Provides default multiply, multiply add, and multiply subtract
-     * operations.
-     *
-     * If the operands are both matrices, we perform a matrix-matrix multiply.
-     * Otherwise, we perform element-wise operations.
-     */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE, class ENABLE = void>
-    struct MultiplyOperator
-    {
+  RAJA_HOST_DEVICE
+  static int getDimSize(int dim,
+                        LEFT_OPERAND_TYPE const& left,
+                        RIGHT_OPERAND_TYPE const& right)
+  {
+    return dim == 0 ? left.getDimSize(0) : right.getDimSize(1);
+  }
+
+  /*!
+   * Evaluate operands and perform element-wise multiply
+   */
+  template<typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static auto multiply(
+      TILE_TYPE const& tile,
+      LEFT_OPERAND_TYPE const& left,
+      RIGHT_OPERAND_TYPE const& right)
+      -> decltype(left.eval(tile) * right.eval(tile))
+  {
+    return left.eval(tile) * right.eval(tile);
+  }
+
+  /*!
+   * Evaluate operands and perform element-wise multiply add
+   */
+  template<typename TILE_TYPE, typename ADD_OPERAND_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static auto multiply_add(
+      TILE_TYPE const& tile,
+      LEFT_OPERAND_TYPE const& left,
+      RIGHT_OPERAND_TYPE const& right,
+      ADD_OPERAND_TYPE const& add)
+      -> decltype(left.eval(tile).multiply_add(right.eval(tile),
+                                               add.eval(tile)))
+  {
+    return left.eval(tile).multiply_add(right.eval(tile), add.eval(tile));
+  }
+
+  /*!
+   * Evaluate operands and perform element-wise multiply subtract
+   */
+  template<typename TILE_TYPE, typename SUBTRACT_OPERAND_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static auto multiply_subtract(
+      TILE_TYPE const& tile,
+      LEFT_OPERAND_TYPE const& left,
+      RIGHT_OPERAND_TYPE const& right,
+      SUBTRACT_OPERAND_TYPE const& subtract)
+      -> decltype(left.eval(tile).multiply_subtract(right.eval(tile),
+                                                    subtract.eval(tile)))
+  {
+    return left.eval(tile).multiply_subtract(right.eval(tile),
+                                             subtract.eval(tile));
+  }
+};
 
-        using result_type = typename LEFT_OPERAND_TYPE::result_type;
-        static constexpr camp::idx_t s_num_dims = LEFT_OPERAND_TYPE::s_num_dims;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        void print_ast() {
-          printf("Elemental(%d,%d)", (int)s_num_dims, (int)RIGHT_OPERAND_TYPE::s_num_dims);
-        }
-
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        int getDimSize(int dim, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right) {
-          return dim == 0 ? left.getDimSize(0) : right.getDimSize(1);
-        }
-
-        /*!
-         * Evaluate operands and perform element-wise multiply
-         */
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        auto multiply(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right) ->
-          decltype(left.eval(tile) * right.eval(tile))
-        {
-          return left.eval(tile) * right.eval(tile);
-        }
-
-
-        /*!
-         * Evaluate operands and perform element-wise multiply add
-         */
-        template<typename TILE_TYPE, typename ADD_OPERAND_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        auto multiply_add(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right, ADD_OPERAND_TYPE const &add) ->
-          decltype(left.eval(tile).multiply_add(right.eval(tile), add.eval(tile)))
-        {
-          return left.eval(tile).multiply_add(right.eval(tile), add.eval(tile));
-        }
-
-
-        /*!
-         * Evaluate operands and perform element-wise multiply subtract
-         */
-        template<typename TILE_TYPE, typename SUBTRACT_OPERAND_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        auto multiply_subtract(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right, SUBTRACT_OPERAND_TYPE const &subtract) ->
-          decltype(left.eval(tile).multiply_subtract(right.eval(tile), subtract.eval(tile)))
-        {
-          return left.eval(tile).multiply_subtract(right.eval(tile), subtract.eval(tile));
-        }
-
-
-    };
-
-
-    /*!
-     * Specialization that provides multiplying a scalar * tensor
-     */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    struct MultiplyOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE,
+/*!
+ * Specialization that provides multiplying a scalar * tensor
+ */
+template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+struct MultiplyOperator<
+    LEFT_OPERAND_TYPE,
+    RIGHT_OPERAND_TYPE,
     typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 0>::type>
-    {
+{
 
-        using result_type = typename RIGHT_OPERAND_TYPE::result_type;
-        static constexpr camp::idx_t s_num_dims = RIGHT_OPERAND_TYPE::s_num_dims;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        void print_ast() {
-          printf("Scale");
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        int getDimSize(int dim, LEFT_OPERAND_TYPE const &, RIGHT_OPERAND_TYPE const &right) {
-          return right.getDimSize(dim);
-        }
-
-        /*!
-         * Evaluate operands and perform scaling operation
-         */
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        auto multiply(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right) ->
-          decltype(right.eval(tile).scale(left.eval(tile)))
-        {
-          return right.eval(tile).scale(left.eval(tile));
-        }
-
-
-
-        /*!
-         * Evaluate operands and perform element-wise multiply add
-         */
-        template<typename TILE_TYPE, typename ADD_OPERAND_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        auto multiply_add(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right, ADD_OPERAND_TYPE const &add) ->
-          decltype(right.eval(tile).scale(left.eval(tile)) + add.eval(tile))
-        {
-          return right.eval(tile).scale(left.eval(tile)) + add.eval(tile);
-        }
-
-
-        /*!
-         * Evaluate operands and perform element-wise multiply subtract
-         */
-        template<typename TILE_TYPE, typename SUBTRACT_OPERAND_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        auto multiply_subtract(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right, SUBTRACT_OPERAND_TYPE const &subtract) ->
-          decltype(right.eval(tile).scale(left.eval(tile)) - subtract.eval(tile))
-        {
-          return right.eval(tile).scale(left.eval(tile)) - subtract.eval(tile);
-        }
-    };
-
-
-    /*!
-     * Specialization that provides multiplying a tensor*scalar
-     */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    struct MultiplyOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE,
+  using result_type = typename RIGHT_OPERAND_TYPE::result_type;
+  static constexpr camp::idx_t s_num_dims = RIGHT_OPERAND_TYPE::s_num_dims;
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static void print_ast() { printf("Scale"); }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static int getDimSize(int dim,
+                        LEFT_OPERAND_TYPE const&,
+                        RIGHT_OPERAND_TYPE const& right)
+  {
+    return right.getDimSize(dim);
+  }
+
+  /*!
+   * Evaluate operands and perform scaling operation
+   */
+  template<typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static auto multiply(
+      TILE_TYPE const& tile,
+      LEFT_OPERAND_TYPE const& left,
+      RIGHT_OPERAND_TYPE const& right)
+      -> decltype(right.eval(tile).scale(left.eval(tile)))
+  {
+    return right.eval(tile).scale(left.eval(tile));
+  }
+
+  /*!
+   * Evaluate operands and perform scaling operation followed by add
+   */
+  template<typename TILE_TYPE, typename ADD_OPERAND_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static auto multiply_add(
+      TILE_TYPE const& tile,
+      LEFT_OPERAND_TYPE const& left,
+      RIGHT_OPERAND_TYPE const& right,
+      ADD_OPERAND_TYPE const& add)
+      -> decltype(right.eval(tile).scale(left.eval(tile)) + add.eval(tile))
+  {
+    return right.eval(tile).scale(left.eval(tile)) + add.eval(tile);
+  }
+
+  /*!
+   * Evaluate operands and perform scaling operation followed by subtract
+   */
+  template<typename TILE_TYPE, typename SUBTRACT_OPERAND_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static auto multiply_subtract(
+      TILE_TYPE const& tile,
+      LEFT_OPERAND_TYPE const& left,
+      RIGHT_OPERAND_TYPE const& right,
+      SUBTRACT_OPERAND_TYPE const& subtract)
+      -> decltype(right.eval(tile).scale(left.eval(tile)) - subtract.eval(tile))
+  {
+    return right.eval(tile).scale(left.eval(tile)) - subtract.eval(tile);
+  }
+};
+
+/*!
+ * Specialization that provides multiplying a tensor*scalar
+ */
+template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+struct MultiplyOperator<
+    LEFT_OPERAND_TYPE,
+    RIGHT_OPERAND_TYPE,
     typename std::enable_if<RIGHT_OPERAND_TYPE::s_num_dims == 0>::type>
+{
+
+  using result_type = typename LEFT_OPERAND_TYPE::result_type;
+  static constexpr camp::idx_t s_num_dims = LEFT_OPERAND_TYPE::s_num_dims;
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static void print_ast() { printf("Scale"); }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static int getDimSize(int dim,
+                        LEFT_OPERAND_TYPE const& left,
+                        RIGHT_OPERAND_TYPE const&)
+  {
+    return left.getDimSize(dim);
+  }
+
+  /*!
+   * Evaluate operands and perform scaling operation
+   */
+  template<typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static auto multiply(
+      TILE_TYPE const& tile,
+      LEFT_OPERAND_TYPE const& left,
+      RIGHT_OPERAND_TYPE const& right)
+      -> decltype(left.eval(tile).scale(right.eval(tile)))
+  {
+    return left.eval(tile).scale(right.eval(tile));
+  }
+
+  /*!
+   * Evaluate operands and perform scaling operation followed by add
+   */
+  template<typename TILE_TYPE, typename ADD_OPERAND_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static auto multiply_add(
+      TILE_TYPE const& tile,
+      LEFT_OPERAND_TYPE const& left,
+      RIGHT_OPERAND_TYPE const& right,
+      ADD_OPERAND_TYPE const& add)
+      -> decltype(left.eval(tile).scale(right.eval(tile)) + add.eval(tile))
+  {
+    return left.eval(tile).scale(right.eval(tile)) + add.eval(tile);
+  }
+
+  /*!
+   * Evaluate operands and perform scaling operation followed by subtract
+   */
+  template<typename TILE_TYPE, typename SUBTRACT_OPERAND_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static auto multiply_subtract(
+      TILE_TYPE const& tile,
+      LEFT_OPERAND_TYPE const& left,
+      RIGHT_OPERAND_TYPE const& right,
+      SUBTRACT_OPERAND_TYPE const& subtract)
+      -> decltype(left.eval(tile).scale(right.eval(tile)) - subtract.eval(tile))
+  {
+    return left.eval(tile).scale(right.eval(tile)) - subtract.eval(tile);
+  }
+};
+
+/*!
+ * Specialization for matrix-vector right multiplication.
+ *
+ * By default the A*x operator for a matrix A and a vector x produces a
+ * matrix-vector multiplication.
+ *
+ * The right hand side vector is always treated as a column vector.
+ *
+ * The resulting vector type is the column_vector_type of the LHS result type
+ *
+ *
+ */
+template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+struct MultiplyOperator<
+    LEFT_OPERAND_TYPE,
+    RIGHT_OPERAND_TYPE,
+    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 2 &&
+                            RIGHT_OPERAND_TYPE::s_num_dims == 1>::type>
+{
+
+  using left_type  = LEFT_OPERAND_TYPE;
+  using right_type = RIGHT_OPERAND_TYPE;
+  using result_type =
+      typename LEFT_OPERAND_TYPE::result_type::column_vector_type;
+  static constexpr camp::idx_t s_num_dims = 1;
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static void print_ast() { printf("Matrx*Vector"); }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static int getDimSize(int dim,
+                        LEFT_OPERAND_TYPE const&,
+                        RIGHT_OPERAND_TYPE const& right)
+  {
+    return dim == 0 ? right.getDimSize(0) : 0;
+  }
+
+  /*!
+   * Evaluate operands and perform matrix-vector multiply
+   */
+  template<typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static result_type multiply(
+      TILE_TYPE const& tile,
+      LEFT_OPERAND_TYPE const& left,
+      RIGHT_OPERAND_TYPE const& right)
+  {
+
+    // clear result
+    result_type result(0);
+
+    // multiply left and right into result
+    multiply_into_result(result, tile, left, right);
+
+    return result;
+  }
+
+  template<typename TILE_TYPE, typename ADD_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static result_type multiply_add(
+      TILE_TYPE const& tile,
+      LEFT_OPERAND_TYPE const& left,
+      RIGHT_OPERAND_TYPE const& right,
+      ADD_TYPE const& add)
+  {
+
+    // evaluate add into result
+    result_type result = add.eval(tile);
+
+    // multiply left and right into result
+    multiply_into_result(result, tile, left, right);
+
+    return result;
+  }
+
+private:
+  template<typename STORAGE, typename TILE_TYPE, typename INDEX = void>
+  struct MultiplyBridge;
+
+  template<typename STORAGE, typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static void multiply_into_result(
+      STORAGE& result,
+      TILE_TYPE const& tile,
+      LEFT_OPERAND_TYPE const& et_left,
+      RIGHT_OPERAND_TYPE const& et_right)
+  {
+    // using LHS_STORAGE = typename LEFT_OPERAND_TYPE::result_type;
+
+    // get tile size from matrix type
+    auto tile_size = left_type::result_type::s_dim_elem(1);
+    auto k_size    = et_left.getDimSize(1);
+    // TODO: check that left and right are compatible
+    // m_left.getDimSize(1) == m_right.getDimSize(0)
+    // how do we provide checking for this kind of error?
+
+    // tile over row of left and column of right
+    auto left_tile =
+        LEFT_OPERAND_TYPE::result_type::s_get_default_tile().nonstatic();
+    left_tile.m_begin[0] = tile.m_begin[0];
+    left_tile.m_size[0]  = tile.m_size[0];
+    left_tile.m_size[1]  = tile_size;
+
+    using RightType = typename TILE_TYPE::nonstatic_self_type;
+
+    RightType right_tile = tile;
+    right_tile.m_size[0] = tile_size;
+
+    // Do full tiles in k
+    decltype(k_size) k = 0;
+    for (; k + tile_size <= k_size; k += tile_size)
     {
 
-        using result_type = typename LEFT_OPERAND_TYPE::result_type;
-        static constexpr camp::idx_t s_num_dims = LEFT_OPERAND_TYPE::s_num_dims;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        void print_ast() {
-          printf("Scale");
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        int getDimSize(int dim, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &) {
-          return left.getDimSize(dim);
-        }
-
-        /*!
-         * Evaluate operands and perform scaling operation
-         */
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        auto multiply(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right) ->
-          decltype(left.eval(tile).scale(right.eval(tile)))
-        {
-          return left.eval(tile).scale(right.eval(tile));
-        }
-
-
-
-        /*!
-         * Evaluate operands and perform element-wise multiply add
-         */
-        template<typename TILE_TYPE, typename ADD_OPERAND_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        auto multiply_add(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right, ADD_OPERAND_TYPE const &add) ->
-          decltype(left.eval(tile).scale(right.eval(tile)) + add.eval(tile))
-        {
-          return left.eval(tile).scale(right.eval(tile)) + add.eval(tile);
-        }
-
-
-        /*!
-         * Evaluate operands and perform element-wise multiply subtract
-         */
-        template<typename TILE_TYPE, typename SUBTRACT_OPERAND_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        auto multiply_subtract(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right, SUBTRACT_OPERAND_TYPE const &subtract) ->
-          decltype(left.eval(tile).scale(right.eval(tile)) - subtract.eval(tile))
-        {
-          return left.eval(tile).scale(right.eval(tile)) - subtract.eval(tile);
-        }
-    };
-
-
-    /*!
-     * Specialization for matrix-vector right multiplication.
-     *
-     * By default the A*x operator for two matrices produces a matrix-vector
-     * multiplication.
-     *
-     * The right hand side vector is always treated as a column vector.
-     *
-     * The resulting vector type is inherited from the RHS
-     *
-     *
-     */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    struct MultiplyOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE,
-    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 2 && RIGHT_OPERAND_TYPE::s_num_dims==1>::type>
+      // evaluate both sides of operator
+      left_tile.m_begin[1] = k;
+      auto left            = et_left.eval(left_tile);
+
+      right_tile.m_begin[0] = k;
+      auto right            = et_right.eval(right_tile);
+
+      // accumulate product
+      result = left.right_multiply_vector_accumulate(right, result);
+    }
+    // remainder tile in k
+    if (k < k_size)
     {
+      auto& left_part_tile      = make_tensor_tile_partial(left_tile);
+      left_part_tile.m_begin[1] = k;
+      left_part_tile.m_size[1]  = k_size - k;
+      auto left                 = et_left.eval(left_part_tile);
+
+      auto& right_part_tile      = make_tensor_tile_partial(right_tile);
+      right_part_tile.m_begin[0] = k;
+      right_part_tile.m_size[0]  = k_size - k;
+      auto right                 = et_right.eval(right_part_tile);
+
+      // accumulate product of partial tile
+      result = left.right_multiply_vector_accumulate(right, result);
+    }
+  }
 
-      using left_type = LEFT_OPERAND_TYPE;
-      using right_type = RIGHT_OPERAND_TYPE;
-      using result_type = typename LEFT_OPERAND_TYPE::result_type::column_vector_type;
-      static constexpr camp::idx_t s_num_dims = 1;
+  template<typename T>
+  struct Diag
+  {
+    static_assert(!std::is_same<T, void>::value, "diag");
+  };
 
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      void print_ast() {
-        printf("Matrx*Vector");
-      }
+  template<typename I, TensorTileSize TTS, typename B, typename S>
+  struct Diag<StaticTensorTile<I, TTS, B, S>>
+  {
+    static_assert(std::is_same<I, void>::value, "diag");
+  };
 
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      int getDimSize(int dim, LEFT_OPERAND_TYPE const &, RIGHT_OPERAND_TYPE const &right) {
-        return dim == 0 ? right.getDimSize(0) : 0;
-      }
+  template<typename STORAGE, typename TILE_TYPE, typename INDEX>
+  struct MultiplyBridge
+  {
+
+    Diag<TILE_TYPE> diag;
 
-      /*!
-       * Evaluate operands and perform element-wise multiply
-       */
-      template<typename TILE_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      result_type multiply(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right){
+    RAJA_INLINE
 
-        // clear result
-        result_type result(0);
+    RAJA_HOST_DEVICE
+    static void multiply_into_result(STORAGE& result,
+                                     TILE_TYPE const& tile,
+                                     LEFT_OPERAND_TYPE const& et_left,
+                                     RIGHT_OPERAND_TYPE const& et_right)
+    {
+      // using LHS_STORAGE = typename LEFT_OPERAND_TYPE::result_type;
+
+      // get tile size from matrix type
+      auto tile_size = left_type::result_type::s_dim_elem(1);
+      auto k_size    = et_left.getDimSize(1);
+      // TODO: check that left and right are compatible
+      // m_left.getDimSize(1) == m_right.getDimSize(0)
+      // how do we provide checking for this kind of error?
+
+      // tile over row of left and column of right
+      auto left_tile =
+          LEFT_OPERAND_TYPE::result_type::s_get_default_tile().nonstatic();
+      left_tile.m_begin[0] = tile.m_begin[0];
+      left_tile.m_size[0]  = tile.m_size[0];
+      left_tile.m_size[1]  = tile_size;
+
+      using RightType = typename TILE_TYPE::nonstatic_self_type;
+
+      RightType right_tile = tile;
+      right_tile.m_size[0] = tile_size;
+
+      // Do full tiles in k
+      decltype(k_size) k = 0;
+      for (; k + tile_size <= k_size; k += tile_size)
+      {
 
-        // multiply left and right into result
-        multiply_into_result(result, tile, left, right);
+        // evaluate both sides of operator
+        left_tile.m_begin[1] = k;
+        auto left            = et_left.eval(left_tile);
 
-        return result;
+        right_tile.m_begin[0] = k;
+        auto right            = et_right.eval(right_tile);
+
+        // accumulate product
+        result = left.right_multiply_vector_accumulate(right, result);
+      }
+      // remainder tile in k
+      if (k < k_size)
+      {
+        auto& left_part_tile      = make_tensor_tile_partial(left_tile);
+        left_part_tile.m_begin[1] = k;
+        left_part_tile.m_size[1]  = k_size - k;
+        auto left                 = et_left.eval(left_part_tile);
+
+        auto& right_part_tile      = make_tensor_tile_partial(right_tile);
+        right_part_tile.m_begin[0] = k;
+        right_part_tile.m_size[0]  = k_size - k;
+        auto right                 = et_right.eval(right_part_tile);
+
+        // accumulate product of partial tile
+        result = left.right_multiply_vector_accumulate(right, result);
       }
+    }
+  };
+
+  template<size_t INDEX,
+           typename STORAGE,
+           typename INDEX_TYPE,
+           TensorTileSize TENSOR_SIZE,
+           INDEX_TYPE Begin0,
+           INDEX_TYPE... BeginTail,
+           INDEX_TYPE Size0,
+           INDEX_TYPE... SizeTail>
+  struct MultiplyBridge<
+      STORAGE,
+      StaticTensorTile<INDEX_TYPE,
+                       TENSOR_SIZE,
+                       camp::int_seq<INDEX_TYPE, Begin0, BeginTail...>,
+                       camp::int_seq<INDEX_TYPE, Size0, SizeTail...>>,
+      camp::integral_constant<size_t, INDEX>>
+  {
 
-      template<typename TILE_TYPE, typename ADD_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      result_type multiply_add(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right, ADD_TYPE const &add){
+    using TileType =
+        StaticTensorTile<INDEX_TYPE,
+                         TENSOR_SIZE,
+                         camp::int_seq<INDEX_TYPE, Begin0, BeginTail...>,
+                         camp::int_seq<INDEX_TYPE, Size0, SizeTail...>>;
 
-        // evaluate add into result
-        result_type result = add.eval(tile);
+    RAJA_INLINE
 
-        // multiply left and right into result
-        multiply_into_result(result, tile, left, right);
+    RAJA_HOST_DEVICE
+    static void multiply_into_result(STORAGE& result,
+                                     TileType const& tile,
+                                     LEFT_OPERAND_TYPE const& et_left,
+                                     RIGHT_OPERAND_TYPE const& et_right)
+    {
 
-        return result;
-      }
+      // get tile size from matrix type
+      const auto tile_size = left_type::result_type::s_dim_elem(1);
+      const auto k_size    = et_left.getDimSize(1);
 
-    private:
+      auto const offset = INDEX * tile_size;
 
-      template<typename STORAGE, typename TILE_TYPE, typename INDEX=void>
-      struct MultiplyBridge;
+      if ((offset + tile_size) <= k_size)
+      {
 
-      template<typename STORAGE, typename TILE_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      void multiply_into_result(STORAGE &result, TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &et_left, RIGHT_OPERAND_TYPE const &et_right)
+        using LeftType =
+            StaticTensorTile<INDEX_TYPE, TENSOR_SIZE,
+                             camp::int_seq<INDEX_TYPE, Begin0, offset>,
+                             camp::int_seq<INDEX_TYPE, Size0, tile_size>>;
+        // evaluate both sides of operator
+        auto left = et_left.eval(LeftType());
+
+        using RightType =
+            StaticTensorTile<INDEX_TYPE, TENSOR_SIZE,
+                             camp::int_seq<INDEX_TYPE, offset>,
+                             camp::int_seq<INDEX_TYPE, tile_size>>;
+
+        auto right = et_right.eval(RightType());
+
+        // accumulate product
+        auto temp = left.right_multiply_vector_accumulate(right, result);
+        MultiplyBridge<STORAGE, TileType,
+                       camp::integral_constant<size_t, INDEX - 1>>::
+            multiply_into_result(result, tile, et_left, et_right);
+        result += temp;
+      }
+      else
       {
-        //using LHS_STORAGE = typename LEFT_OPERAND_TYPE::result_type;
-
-        // get tile size from matrix type
-        auto tile_size = left_type::result_type::s_dim_elem(1);
-        auto k_size = et_left.getDimSize(1);
-        // TODO: check that left and right are compatible
-        // m_left.getDimSize(1) == m_right.getDimSize(0)
-        // how do we provide checking for this kind of error?
-
-        // tile over row of left and column of right
-        auto left_tile = LEFT_OPERAND_TYPE::result_type::s_get_default_tile().nonstatic();
-        left_tile.m_begin[0] = tile.m_begin[0];
-        left_tile.m_size[0] = tile.m_size[0];
-        left_tile.m_size[1] = tile_size;
-
-        using RightType = typename TILE_TYPE::nonstatic_self_type;
-
-        RightType right_tile = tile;
-        right_tile.m_size[0] = tile_size;
-
-        // Do full tiles in k
-        decltype(k_size) k = 0;
-        for(;k+tile_size <= k_size; k+= tile_size){
-
-          // evaluate both sides of operator
-          left_tile.m_begin[1] = k;
-          auto left = et_left.eval(left_tile);
-
-          right_tile.m_begin[0] = k;
-          auto right = et_right.eval(right_tile);
-
-          // accumulate product
-          result = left.right_multiply_vector_accumulate(right, result);
-        }
-        // remainder tile in k
-        if(k < k_size){
-          auto &left_part_tile = make_tensor_tile_partial(left_tile);
-          left_part_tile.m_begin[1] = k;
-          left_part_tile.m_size[1] = k_size-k;
-          auto left = et_left.eval(left_part_tile);
-
-          auto &right_part_tile = make_tensor_tile_partial(right_tile);
-          right_part_tile.m_begin[0] = k;
-          right_part_tile.m_size[0] = k_size-k;
-          auto right = et_right.eval(right_part_tile);
-
-          // accumulate product of partial tile
-          result = left.right_multiply_vector_accumulate(right, result);
-        }
 
+        using LeftType =
+            StaticTensorTile<INDEX_TYPE, TENSOR_PARTIAL,
+                             camp::int_seq<INDEX_TYPE, Begin0, offset>,
+                             camp::int_seq<INDEX_TYPE, Size0, k_size - offset>>;
+        auto left = et_left.eval(LeftType());
+
+        using RightType =
+            StaticTensorTile<INDEX_TYPE, TENSOR_PARTIAL,
+                             camp::int_seq<INDEX_TYPE, offset>,
+                             camp::int_seq<INDEX_TYPE, k_size - offset>>;
+        auto right = et_right.eval(RightType());
+
+        // accumulate product of partial tile
+        result = left.right_multiply_vector_accumulate(right, result);
       }
+    }
+  };
+
+  template<typename STORAGE,
+           typename INDEX_TYPE,
+           TensorTileSize TENSOR_SIZE,
+           INDEX_TYPE Begin0,
+           INDEX_TYPE... BeginTail,
+           INDEX_TYPE Size0,
+           INDEX_TYPE... SizeTail>
+  struct MultiplyBridge<
+      STORAGE,
+      StaticTensorTile<INDEX_TYPE,
+                       TENSOR_SIZE,
+                       camp::int_seq<INDEX_TYPE, Begin0, BeginTail...>,
+                       camp::int_seq<INDEX_TYPE, Size0, SizeTail...>>,
+      camp::integral_constant<size_t, 0>>
+  {
 
+    using TileType =
+        StaticTensorTile<INDEX_TYPE,
+                         TENSOR_SIZE,
+                         camp::int_seq<INDEX_TYPE, Begin0, BeginTail...>,
+                         camp::int_seq<INDEX_TYPE, Size0, SizeTail...>>;
 
-      template<typename T>
-      struct Diag{
-          static_assert(!std::is_same<T,void>::value,"diag");
-      };
-
-      template<typename I, TensorTileSize TTS, typename B, typename S>
-      struct Diag< StaticTensorTile<I,TTS,B,S> >{
-          static_assert(std::is_same<I,void>::value,"diag");
-      };
-
-      template<typename STORAGE, typename TILE_TYPE, typename INDEX>
-      struct MultiplyBridge {
-
-          Diag<TILE_TYPE> diag;
-
-          RAJA_INLINE
-          RAJA_HOST_DEVICE
-          static
-          void multiply_into_result(STORAGE &result, TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &et_left, RIGHT_OPERAND_TYPE const &et_right)
-          {
-            //using LHS_STORAGE = typename LEFT_OPERAND_TYPE::result_type;
-    
-            // get tile size from matrix type
-            auto tile_size = left_type::result_type::s_dim_elem(1);
-            auto k_size = et_left.getDimSize(1);
-            // TODO: check that left and right are compatible
-            // m_left.getDimSize(1) == m_right.getDimSize(0)
-            // how do we provide checking for this kind of error?
-    
-            // tile over row of left and column of right
-            auto left_tile = LEFT_OPERAND_TYPE::result_type::s_get_default_tile().nonstatic();
-            left_tile.m_begin[0] = tile.m_begin[0];
-            left_tile.m_size[0] = tile.m_size[0];
-            left_tile.m_size[1] = tile_size;
-    
-            using RightType = typename TILE_TYPE::nonstatic_self_type;
-
-            RightType right_tile = tile;
-            right_tile.m_size[0] = tile_size;
-    
-            // Do full tiles in k
-            decltype(k_size) k = 0;
-            for(;k+tile_size <= k_size; k+= tile_size){
-    
-              // evaluate both sides of operator
-              left_tile.m_begin[1] = k;
-              auto left = et_left.eval(left_tile);
-    
-              right_tile.m_begin[0] = k;
-              auto right = et_right.eval(right_tile);
-    
-              // accumulate product
-              result = left.right_multiply_vector_accumulate(right, result);
-            }
-            // remainder tile in k
-            if(k < k_size){
-              auto &left_part_tile = make_tensor_tile_partial(left_tile);
-              left_part_tile.m_begin[1] = k;
-              left_part_tile.m_size[1] = k_size-k;
-              auto left = et_left.eval(left_part_tile);
-    
-              auto &right_part_tile = make_tensor_tile_partial(right_tile);
-              right_part_tile.m_begin[0] = k;
-              right_part_tile.m_size[0] = k_size-k;
-              auto right = et_right.eval(right_part_tile);
-    
-              // accumulate product of partial tile
-              result = left.right_multiply_vector_accumulate(right, result);
-            }
-    
-          }
-      };
-
-
-
-
-      template<
-          size_t INDEX,
-          typename STORAGE,
-          typename INDEX_TYPE,
-          TensorTileSize TENSOR_SIZE,
-          INDEX_TYPE Begin0, INDEX_TYPE... BeginTail,
-          INDEX_TYPE  Size0, INDEX_TYPE... SizeTail
-      >
-      struct MultiplyBridge <
-          STORAGE,
-          StaticTensorTile <
-              INDEX_TYPE,
-              TENSOR_SIZE,
-              camp::int_seq<INDEX_TYPE, Begin0, BeginTail...>,
-              camp::int_seq<INDEX_TYPE,  Size0,  SizeTail...>
-          >,
-          camp::integral_constant<size_t,INDEX>
-      > {
-
-          using TileType = StaticTensorTile <
-              INDEX_TYPE,
-              TENSOR_SIZE,
-              camp::int_seq<INDEX_TYPE, Begin0, BeginTail...>,
-              camp::int_seq<INDEX_TYPE,  Size0,  SizeTail...>
-          >;
-              
-          RAJA_INLINE
-          RAJA_HOST_DEVICE
-          static
-          void multiply_into_result(STORAGE &result, TileType const &tile, LEFT_OPERAND_TYPE const &et_left, RIGHT_OPERAND_TYPE const &et_right)
-          {
-
-              // get tile size from matrix type
-              const auto tile_size = left_type::result_type::s_dim_elem(1);
-              const auto k_size = et_left.getDimSize(1);
-             
-              auto const offset = INDEX*tile_size;
-
-              if( (offset + tile_size) <= k_size ) {
-    
-                    using LeftType = StaticTensorTile <
-                        INDEX_TYPE,
-                        TENSOR_SIZE,
-                        camp::int_seq<INDEX_TYPE, Begin0,    offset>,
-                        camp::int_seq<INDEX_TYPE,  Size0, tile_size>
-                    >;
-                    // evaluate both sides of operator
-                    auto left = et_left.eval(LeftType());
-
-                    using RightType = StaticTensorTile <
-                        INDEX_TYPE,
-                        TENSOR_SIZE,
-                        camp::int_seq<INDEX_TYPE,    offset>,
-                        camp::int_seq<INDEX_TYPE, tile_size>
-                    >;
-    
-                    auto right = et_right.eval(RightType());
-    
-                    // accumulate product
-                    auto temp = left.right_multiply_vector_accumulate(right, result);
-                    MultiplyBridge<STORAGE,TileType,camp::integral_constant<size_t,INDEX-1>>::multiply_into_result(result,tile,et_left,et_right);
-                    result += temp;
-                    
-              } else {
-
-                    using LeftType = StaticTensorTile <
-                        INDEX_TYPE,
-                        TENSOR_PARTIAL,
-                        camp::int_seq<INDEX_TYPE, Begin0,        offset>,
-                        camp::int_seq<INDEX_TYPE,  Size0, k_size-offset>
-                    >;
-		    auto left = et_left.eval(LeftType());
-	    
-                    using RightType = StaticTensorTile <
-                        INDEX_TYPE,
-                        TENSOR_PARTIAL,
-                        camp::int_seq<INDEX_TYPE,        offset>,
-                        camp::int_seq<INDEX_TYPE, k_size-offset>
-                    >;
-		    auto right = et_right.eval(RightType());
-	    
-		    // accumulate product of partial tile
-		    result = left.right_multiply_vector_accumulate(right, result);
-
-              }
-
-
-            }
-          };
-
-
-      template<
-          typename STORAGE,
-          typename INDEX_TYPE,
-          TensorTileSize TENSOR_SIZE,
-          INDEX_TYPE Begin0, INDEX_TYPE... BeginTail,
-          INDEX_TYPE  Size0, INDEX_TYPE...  SizeTail
-      >
-      struct MultiplyBridge <
-          STORAGE,
-          StaticTensorTile <
-              INDEX_TYPE,
-              TENSOR_SIZE,
-              camp::int_seq<INDEX_TYPE, Begin0, BeginTail...>,
-              camp::int_seq<INDEX_TYPE,  Size0,  SizeTail...>
-          >,
-          camp::integral_constant<size_t,0>
-      > {
-
-          using TileType = StaticTensorTile <
-              INDEX_TYPE,
-              TENSOR_SIZE,
-              camp::int_seq<INDEX_TYPE, Begin0, BeginTail...>,
-              camp::int_seq<INDEX_TYPE,  Size0,  SizeTail...>
-          >;
-              
-          RAJA_INLINE
-          RAJA_HOST_DEVICE
-          static
-          void multiply_into_result(STORAGE &result, TileType const &, LEFT_OPERAND_TYPE const &et_left, RIGHT_OPERAND_TYPE const &et_right)
-          {
-
-              // get tile size from matrix type
-              const auto tile_size = left_type::result_type::s_dim_elem(1);
-              const auto k_size = et_left.getDimSize(1);
-             
-              auto const offset = 0;
-
-              if( (offset + tile_size) <= k_size ) {
-    
-                    using LeftType = StaticTensorTile <
-                        INDEX_TYPE,
-                        TENSOR_SIZE,
-                        camp::int_seq<INDEX_TYPE, Begin0,    offset>,
-                        camp::int_seq<INDEX_TYPE,  Size0, tile_size>
-                    >;
-                    // evaluate both sides of operator
-                    auto left = et_left.eval(LeftType());
-
-                    using RightType = StaticTensorTile <
-                        INDEX_TYPE,
-                        TENSOR_SIZE,
-                        camp::int_seq<INDEX_TYPE,    offset>,
-                        camp::int_seq<INDEX_TYPE, tile_size>
-                    >;
-    
-                    auto right = et_right.eval(RightType());
-    
-                    // accumulate product
-                    auto temp = left.right_multiply_vector_accumulate(right, result);
-                    result += temp;
-                    
-              } else {
-
-                    using LeftType = StaticTensorTile <
-                        INDEX_TYPE,
-                        TENSOR_PARTIAL,
-                        camp::int_seq<INDEX_TYPE, Begin0,        offset>,
-                        camp::int_seq<INDEX_TYPE,  Size0, k_size-offset>
-                    >;
-		    auto left = et_left.eval(LeftType());
-	    
-                    using RightType = StaticTensorTile <
-                        INDEX_TYPE,
-                        TENSOR_PARTIAL,
-                        camp::int_seq<INDEX_TYPE,        offset>,
-                        camp::int_seq<INDEX_TYPE, k_size-offset>
-                    >;
-		    auto right = et_right.eval(RightType());
-	    
-		    // accumulate product of partial tile
-		    result = left.right_multiply_vector_accumulate(right, result);
-
-              }
-
-
-            }
-          };
-
-      template<
-          typename STORAGE,
-          typename INDEX_TYPE,
-          TensorTileSize TENSOR_SIZE,
-          INDEX_TYPE Begin0,  INDEX_TYPE... BeginTail,
-          INDEX_TYPE  Size0,  INDEX_TYPE... SizeTail
-      >
-      struct MultiplyBridge <
-          STORAGE,
-          StaticTensorTile <
-              INDEX_TYPE,
-              TENSOR_SIZE,
-              camp::int_seq<INDEX_TYPE, Begin0, BeginTail...>,
-              camp::int_seq<INDEX_TYPE,  Size0,  SizeTail...>
-          >,
-          void
-      > {
-
-          using TileType = StaticTensorTile <
-              INDEX_TYPE,
-              TENSOR_SIZE,
-              camp::int_seq<INDEX_TYPE, Begin0, BeginTail...>,
-              camp::int_seq<INDEX_TYPE,  Size0,  SizeTail...>
-          >;
-              
-          RAJA_INLINE
-          RAJA_HOST_DEVICE
-          static
-          void multiply_into_result(STORAGE &result, TileType const &tile, LEFT_OPERAND_TYPE const &et_left, RIGHT_OPERAND_TYPE const &et_right)
-          {
-
-              const auto tile_size = left_type::result_type::s_dim_elem(1);
-              const auto k_size = et_left.getDimSize(1);
-              const size_t iter_count = (k_size/tile_size) + ( (k_size%tile_size != 0) ? 1 : 0 );
-
-              MultiplyBridge<STORAGE,TileType,camp::integral_constant<size_t,iter_count>>::multiply_into_result(result,tile,et_left,et_right);
-
-            }
-          };
-
-      };
-
-
-
-
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE, typename ADD_OPERAND_TYPE>
-    class TensorMultiplyAdd;
-
-
-    /*!
-     * Specialization for vector*matrix left multiplication.
-     *
-     * By default the x'*A operator for two matrices produces a vector-matrix
-     * multiplication.
-     *
-     * The left hand side vector is always treated as a row vector.
-     *
-     * The resulting vector type is inherited from the LHS
-     *
-     *
-     */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    struct MultiplyOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE,
-    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 1 && RIGHT_OPERAND_TYPE::s_num_dims==2>::type>
+    RAJA_INLINE
+
+    RAJA_HOST_DEVICE
+    static void multiply_into_result(STORAGE& result,
+                                     TileType const&,
+                                     LEFT_OPERAND_TYPE const& et_left,
+                                     RIGHT_OPERAND_TYPE const& et_right)
     {
 
-      using left_type = LEFT_OPERAND_TYPE;
-      using right_type = RIGHT_OPERAND_TYPE;
-      using result_type = typename RIGHT_OPERAND_TYPE::result_type::row_vector_type;
-      static constexpr camp::idx_t s_num_dims = 1;
+      // get tile size from matrix type
+      const auto tile_size = left_type::result_type::s_dim_elem(1);
+      const auto k_size    = et_left.getDimSize(1);
 
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      void print_ast() {
-        printf("Vector*Matrix");
-      }
+      auto const offset = 0;
 
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      int getDimSize(int dim, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &) {
-        return dim == 0 ? left.getDimSize(0) : 0;
-      }
+      if ((offset + tile_size) <= k_size)
+      {
 
-      /*!
-       * Evaluate operands and perform element-wise multiply
-       */
-      template<typename TILE_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      result_type multiply(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right){
-        // clear result
-        result_type result(0);
-
-        // multiply left and right into result
-        multiply_into_result(result, tile, left, right);
-
-        return result;
+        using LeftType =
+            StaticTensorTile<INDEX_TYPE, TENSOR_SIZE,
+                             camp::int_seq<INDEX_TYPE, Begin0, offset>,
+                             camp::int_seq<INDEX_TYPE, Size0, tile_size>>;
+        // evaluate both sides of operator
+        auto left = et_left.eval(LeftType());
+
+        using RightType =
+            StaticTensorTile<INDEX_TYPE, TENSOR_SIZE,
+                             camp::int_seq<INDEX_TYPE, offset>,
+                             camp::int_seq<INDEX_TYPE, tile_size>>;
+
+        auto right = et_right.eval(RightType());
+
+        // accumulate product
+        auto temp = left.right_multiply_vector_accumulate(right, result);
+        result += temp;
       }
+      else
+      {
 
-      template<typename TILE_TYPE, typename ADD_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      result_type multiply_add(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right, ADD_TYPE const &add){
-        // evaluate add into result
-        result_type result = add.eval(tile);
+        using LeftType =
+            StaticTensorTile<INDEX_TYPE, TENSOR_PARTIAL,
+                             camp::int_seq<INDEX_TYPE, Begin0, offset>,
+                             camp::int_seq<INDEX_TYPE, Size0, k_size - offset>>;
+        auto left = et_left.eval(LeftType());
 
-        // multiply left and right into result
-        multiply_into_result(result, tile, left, right);
+        using RightType =
+            StaticTensorTile<INDEX_TYPE, TENSOR_PARTIAL,
+                             camp::int_seq<INDEX_TYPE, offset>,
+                             camp::int_seq<INDEX_TYPE, k_size - offset>>;
+        auto right = et_right.eval(RightType());
 
-        return result;
+        // accumulate product of partial tile
+        result = left.right_multiply_vector_accumulate(right, result);
       }
+    }
+  };
+
+  template<typename STORAGE,
+           typename INDEX_TYPE,
+           TensorTileSize TENSOR_SIZE,
+           INDEX_TYPE Begin0,
+           INDEX_TYPE... BeginTail,
+           INDEX_TYPE Size0,
+           INDEX_TYPE... SizeTail>
+  struct MultiplyBridge<
+      STORAGE,
+      StaticTensorTile<INDEX_TYPE,
+                       TENSOR_SIZE,
+                       camp::int_seq<INDEX_TYPE, Begin0, BeginTail...>,
+                       camp::int_seq<INDEX_TYPE, Size0, SizeTail...>>,
+      void>
+  {
+    // Void-index specialization: counts the k tiles, then forwards (see below).
+    using TileType =
+        StaticTensorTile<INDEX_TYPE,
+                         TENSOR_SIZE,
+                         camp::int_seq<INDEX_TYPE, Begin0, BeginTail...>,
+                         camp::int_seq<INDEX_TYPE, Size0, SizeTail...>>;
+    // result, tile and both operands are passed through unchanged.
+    RAJA_INLINE
+
+    RAJA_HOST_DEVICE
+    static void multiply_into_result(STORAGE& result,
+                                     TileType const& tile,
+                                     LEFT_OPERAND_TYPE const& et_left,
+                                     RIGHT_OPERAND_TYPE const& et_right)
+    {
+      // iter_count = k_size / tile_size, plus one if there is a remainder.
+      const auto tile_size = left_type::result_type::s_dim_elem(1);
+      const auto k_size    = et_left.getDimSize(1);
+      const size_t iter_count =
+          (k_size / tile_size) + ((k_size % tile_size != 0) ? 1 : 0);
+      // NOTE(review): iter_count must be a constant expression (template arg).
+      MultiplyBridge<STORAGE, TileType,
+                     camp::integral_constant<size_t, iter_count>>::
+          multiply_into_result(result, tile, et_left, et_right);
+    }
+  };
+};
 
-    private:
-      template<typename STORAGE, typename TILE_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      void multiply_into_result(STORAGE &result, TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &et_left, RIGHT_OPERAND_TYPE const &et_right)
-      {
-        // get tile size from matrix type
-        auto tile_size = right_type::result_type::s_dim_elem(0);
-        auto k_size = et_right.getDimSize(0);
 
+template<typename LEFT_OPERAND_TYPE,
+         typename RIGHT_OPERAND_TYPE,
+         typename ADD_OPERAND_TYPE>
+class TensorMultiplyAdd;
 
-        // TODO: check that left and right are compatible
-        // m_left.getDimSize(1) == m_right.getDimSize(0)
-        // how do we provide checking for this kind of error?
+/*!
+ * Specialization for vector*matrix left multiplication.
+ *
+ * By default the x'*A operator for a vector and a matrix produces a
+ * vector-matrix multiplication.
+ *
+ * The left hand side vector is always treated as a row vector.
+ *
+ * The resulting vector type is the row_vector_type of the RHS matrix
+ *
+ *
+ */
+template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+struct MultiplyOperator<
+    LEFT_OPERAND_TYPE,
+    RIGHT_OPERAND_TYPE,
+    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 1 &&
+                            RIGHT_OPERAND_TYPE::s_num_dims == 2>::type>
+{
 
-        // tile over row of left and column of right
-        auto right_tile = RIGHT_OPERAND_TYPE::result_type::s_get_default_tile().nonstatic();
-        right_tile.m_begin[1] = tile.m_begin[0];
-        right_tile.m_size[1] = tile.m_size[0];
-        right_tile.m_size[0] = tile_size;
+  using left_type   = LEFT_OPERAND_TYPE;
+  using right_type  = RIGHT_OPERAND_TYPE;
+  using result_type = typename RIGHT_OPERAND_TYPE::result_type::row_vector_type;
+  static constexpr camp::idx_t s_num_dims = 1;
 
-        TILE_TYPE left_tile = tile;
-        left_tile.m_size[0] = tile_size;
+  RAJA_INLINE
 
+  RAJA_HOST_DEVICE
+  static void print_ast() { printf("Vector*Matrix"); }
 
-        // Do full tiles in k
-        decltype(k_size) k = 0;
-        for(;k+tile_size <= k_size; k+= tile_size){
+  RAJA_INLINE
 
-          // evaluate both sides of operator
-          right_tile.m_begin[0] = k;
-          auto right = et_right.eval(right_tile);
+  RAJA_HOST_DEVICE
+  static int getDimSize(int dim,
+                        LEFT_OPERAND_TYPE const& left,
+                        RIGHT_OPERAND_TYPE const&)
+  {
+    return dim == 0 ? left.getDimSize(0) : 0;
+  }
+
+  /*!
+   * Evaluate operands and perform the vector*matrix multiply
+   */
+  template<typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static result_type multiply(
+      TILE_TYPE const& tile,
+      LEFT_OPERAND_TYPE const& left,
+      RIGHT_OPERAND_TYPE const& right)
+  {
+    // clear result
+    result_type result(0);
 
-          left_tile.m_begin[0] = k;
-          auto left = et_left.eval(left_tile);
+    // multiply left and right into result
+    multiply_into_result(result, tile, left, right);
 
-          // accumulate product
-          result = right.left_multiply_vector_accumulate(left, result);
+    return result;
+  }
 
-        }
-        // remainder tile in k
-        if(k < k_size){
-          auto &right_part_tile = make_tensor_tile_partial(right_tile);
-          right_part_tile.m_begin[0] = k;
-          right_part_tile.m_size[0] = k_size-k;
-          auto right = et_right.eval(right_part_tile);
+  // Fused form: the accumulator starts from add evaluated on tile, then the
+  // left*right product is accumulated into it by multiply_into_result.
+  template<typename TILE_TYPE, typename ADD_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static result_type multiply_add(
+      TILE_TYPE const& tile,
+      LEFT_OPERAND_TYPE const& left,
+      RIGHT_OPERAND_TYPE const& right,
+      ADD_TYPE const& add)
+  {
+    // seed the accumulator with the addition term
+    result_type accum = add.eval(tile);
+    // fold the product of left and right into the accumulator
+    multiply_into_result(accum, tile, left, right);
+    return accum;
+  }
+
+private:
+  template<typename STORAGE, typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static void multiply_into_result(
+      STORAGE& result,
+      TILE_TYPE const& tile,
+      LEFT_OPERAND_TYPE const& et_left,
+      RIGHT_OPERAND_TYPE const& et_right)
+  {
+    // get tile size from matrix type
+    auto tile_size = right_type::result_type::s_dim_elem(0);
+    auto k_size    = et_right.getDimSize(0);
 
-          auto &left_part_tile = make_tensor_tile_partial(left_tile);
-          left_part_tile.m_begin[0] = k;
-          left_part_tile.m_size[0] = k_size-k;
-          auto left = et_left.eval(left_part_tile);
 
-          // compute product into x of partial tile
-          result = right.left_multiply_vector_accumulate(left, result);
-        }
+    // TODO: check that left and right are compatible
+    // m_left.getDimSize(1) == m_right.getDimSize(0)
+    // how do we provide checking for this kind of error?
 
-      }
+    // tile over row of left and column of right
+    auto right_tile =
+        RIGHT_OPERAND_TYPE::result_type::s_get_default_tile().nonstatic();
+    right_tile.m_begin[1] = tile.m_begin[0];
+    right_tile.m_size[1]  = tile.m_size[0];
+    right_tile.m_size[0]  = tile_size;
+
+    TILE_TYPE left_tile = tile;
+    left_tile.m_size[0] = tile_size;
 
-    };
 
+    // Do full tiles in k
+    decltype(k_size) k = 0;
+    for (; k + tile_size <= k_size; k += tile_size)
+    {
+
+      // evaluate both sides of operator
+      right_tile.m_begin[0] = k;
+      auto right            = et_right.eval(right_tile);
+
+      left_tile.m_begin[0] = k;
+      auto left            = et_left.eval(left_tile);
+
+      // accumulate product
+      result = right.left_multiply_vector_accumulate(left, result);
+    }
+    // remainder tile in k
+    if (k < k_size)
+    {
+      auto& right_part_tile      = make_tensor_tile_partial(right_tile);
+      right_part_tile.m_begin[0] = k;
+      right_part_tile.m_size[0]  = k_size - k;
+      auto right                 = et_right.eval(right_part_tile);
+
+      auto& left_part_tile      = make_tensor_tile_partial(left_tile);
+      left_part_tile.m_begin[0] = k;
+      left_part_tile.m_size[0]  = k_size - k;
+      auto left                 = et_left.eval(left_part_tile);
+
+      // accumulate product of partial tile
+      result = right.left_multiply_vector_accumulate(left, result);
+    }
+  }
+};
+
+/*!
+ * Specialization for matrix-matrix multiplication for TensorRegisters
+ *
+ * By default the A*B operator for two matrices produces a matrix-matrix
+ * multiplication.
+ *
+ */
+template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+struct MultiplyOperator<
+    LEFT_OPERAND_TYPE,
+    RIGHT_OPERAND_TYPE,
+    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 2 &&
+                            RIGHT_OPERAND_TYPE::s_num_dims == 2>::type>
+{
+
+  using left_type   = LEFT_OPERAND_TYPE;
+  using right_type  = RIGHT_OPERAND_TYPE;
+  using result_type = typename LEFT_OPERAND_TYPE::result_type::product_type;
+  static constexpr camp::idx_t s_num_dims = 2;
 
+  RAJA_INLINE
 
-    /*!
-     * Specialization for matrix-matrix multiplication for TensorRegisters
+  RAJA_HOST_DEVICE
+  static void print_ast() { printf("Matrx*Matrix"); }
+
+  // Size along dim: dim 0 is taken from the left operand's dim 0; any other
+  // dim is taken from the right operand's dim 1.
+  RAJA_INLINE RAJA_HOST_DEVICE static int getDimSize(
+      int dim,
+      LEFT_OPERAND_TYPE const& left,
+      RIGHT_OPERAND_TYPE const& right)
+  {
+    return dim != 0 ? right.getDimSize(1) : left.getDimSize(0);
+  }
+
+  /*!
+   * Evaluate operands and perform the matrix*matrix multiply
+   */
+  template<typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static result_type multiply(
+      TILE_TYPE const& tile,
+      LEFT_OPERAND_TYPE const& left,
+      RIGHT_OPERAND_TYPE const& right)
+  {
+
+    /*
+     *
+     * For TensorRegister:
+     *
+     *   Returns a register containing product of left and right operands
      *
-     * By default the A*B operator for two matrices produces a matrix-matrix
-     * multiplication.
+     * For TensorBlock:
+     *
+     *  Returns an ET TensorLiteral containing the left and right operands
+     *
+     *  OR
+     *
+     *  Returns an ET multiply
      *
      */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    struct MultiplyOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE,
-    typename std::enable_if<
-    LEFT_OPERAND_TYPE::s_num_dims == 2 && RIGHT_OPERAND_TYPE::s_num_dims==2>::type>
-    {
+    // create zeroed temporary
+    result_type result;
+    result.broadcast(0);
+
+    // multiply left and right operands into temporary
+    multiply_into_result(result, tile, left, right);
+
+    return result;
+  }
+
+  template<typename TILE_TYPE, typename ADD_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static result_type multiply_add(
+      TILE_TYPE const& tile,
+      LEFT_OPERAND_TYPE const& left,
+      RIGHT_OPERAND_TYPE const& right,
+      ADD_TYPE const& add)
+  {
 
-      using left_type = LEFT_OPERAND_TYPE;
-      using right_type = RIGHT_OPERAND_TYPE;
-      using result_type = typename LEFT_OPERAND_TYPE::result_type::product_type;
-      static constexpr camp::idx_t s_num_dims = 2;
+    // start accumulator with addition term
+    result_type result = add.eval(tile);
 
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      void print_ast() {
-        printf("Matrx*Matrix");
-      }
+    multiply_into_result(result, tile, left, right);
 
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      int getDimSize(int dim, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right) {
-        return dim == 0 ? left.getDimSize(0) : right.getDimSize(1);
-      }
+    return result;
+  }
 
-      /*!
-       * Evaluate operands and perform element-wise multiply
-       */
-      template<typename TILE_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      result_type multiply(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right)
-      {
+private:
+  template<typename STORAGE, typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static void multiply_into_result(
+      STORAGE& result,
+      TILE_TYPE const& tile,
+      LEFT_OPERAND_TYPE const& et_left,
+      RIGHT_OPERAND_TYPE const& et_right)
+  {
+    // get tile size from matrix type
+    using right_tensor_type = typename right_type::result_type;
+    auto tile_size          = right_tensor_type::s_dim_elem(0);
+    auto k_size             = et_left.getDimSize(1);
 
-        /*
-         *
-         * For TensorRegister:
-         *
-         *   Return's a register containing product of left and right operands
-         *
-         * For TensorBlock:
-         *
-         *  Return's an ET TensorLiteral containing the left and right operrands
-         *
-         *  OR
-         *
-         *  Returns an ET multiply
-         *
-         */
-        // create zeroed temporary
-        result_type result;
-        result.broadcast(0);
-
-        // multiply left and right operands into temporary
-        multiply_into_result(result, tile, left,right);
-
-        return result;
-      }
+    // TODO: check that left and right are compatible
+    // m_left.getDimSize(1) == m_right.getDimSize(0)
+    // how do we provide checking for this kind of error?
 
-      template<typename TILE_TYPE, typename ADD_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      result_type multiply_add(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right, ADD_TYPE const &add)
-      {
+    // tile over row of left and column of right
+    TILE_TYPE left_tile = tile;
+    left_tile.m_size[1] = tile_size;
+    auto left_begin     = et_left.getDimBegin(1);
 
-        // start accumulator with addition term
-        result_type result = add.eval(tile);
+    TILE_TYPE right_tile = tile;
+    right_tile.m_size[0] = tile_size;
+    auto right_begin     = et_right.getDimBegin(0);
 
-        multiply_into_result(result, tile, left, right);
 
-        return result;
+    // Do full tiles in k
+    decltype(k_size) k = 0;
+    for (; k + tile_size <= k_size; k += tile_size)
+    {
 
-      }
+      // evaluate both sides of operator
+      left_tile.m_begin[1] = k + left_begin;
+      auto left            = et_left.eval(left_tile);
 
-    private:
-      template<typename STORAGE, typename TILE_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      void multiply_into_result(STORAGE &result, TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &et_left, RIGHT_OPERAND_TYPE const &et_right)
-      {
-        // get tile size from matrix type
-        using right_tensor_type = typename right_type::result_type;
-        auto tile_size = right_tensor_type::s_dim_elem(0);
-        auto k_size = et_left.getDimSize(1);
-
-        // TODO: check that left and right are compatible
-        // m_left.getDimSize(1) == m_right.getDimSize(0)
-        // how do we provide checking for this kind of error?
-
-        // tile over row of left and column of right
-        TILE_TYPE left_tile = tile;
-        left_tile.m_size[1] = tile_size;
-        auto left_begin = et_left.getDimBegin(1);
-
-        TILE_TYPE right_tile = tile;
-        right_tile.m_size[0] = tile_size;
-        auto right_begin = et_right.getDimBegin(0);
-
-
-        // Do full tiles in k
-        decltype(k_size) k = 0;
-        for(;k+tile_size <= k_size; k+= tile_size){
-
-          // evaluate both sides of operator
-          left_tile.m_begin[1] = k + left_begin;
-          auto left = et_left.eval(left_tile);
-
-          right_tile.m_begin[0] = k + right_begin;
-          auto right = et_right.eval(right_tile);
-
-          // accumulate product
-          left.matrix_multiply_accumulate(result, right);
-        }
-        // remainder tile in k
-        if(k < k_size){
-
-          auto &left_part_tile = make_tensor_tile_partial(left_tile);
-          left_part_tile.m_begin[1] = k + left_begin;
-          left_part_tile.m_size[1] = k_size-k;
-          auto left = et_left.eval(left_part_tile);
-
-          auto &right_part_tile = make_tensor_tile_partial(right_tile);
-          right_part_tile.m_begin[0] = k + right_begin;
-          right_part_tile.m_size[0] = k_size-k;
-          auto right = et_right.eval(right_part_tile);
-
-          // accumulate product
-          left.matrix_multiply_accumulate(result, right);
-        }
-      }
+      right_tile.m_begin[0] = k + right_begin;
+      auto right            = et_right.eval(right_tile);
 
-    };
+      // accumulate product
+      left.matrix_multiply_accumulate(result, right);
+    }
+    // remainder tile in k
+    if (k < k_size)
+    {
 
+      auto& left_part_tile      = make_tensor_tile_partial(left_tile);
+      left_part_tile.m_begin[1] = k + left_begin;
+      left_part_tile.m_size[1]  = k_size - k;
+      auto left                 = et_left.eval(left_part_tile);
 
+      auto& right_part_tile      = make_tensor_tile_partial(right_tile);
+      right_part_tile.m_begin[0] = k + right_begin;
+      right_part_tile.m_size[0]  = k_size - k;
+      auto right                 = et_right.eval(right_part_tile);
 
+      // accumulate product
+      left.matrix_multiply_accumulate(result, right);
+    }
+  }
+};
 
+template<typename OPERAND_TYPE, typename TILE_TYPE>
+class RestrictExtents
+    : public TensorExpressionBase<RestrictExtents<OPERAND_TYPE, TILE_TYPE>>
+{
+public:
+  using self_type    = RestrictExtents<OPERAND_TYPE, TILE_TYPE>;
+  using operand_type = OPERAND_TYPE;
+  using result_type  = typename OPERAND_TYPE::result_type;
+  using index_type   = typename TILE_TYPE::index_type;
+  using tile_type    = TILE_TYPE;
+  static constexpr camp::idx_t s_num_dims = OPERAND_TYPE::s_num_dims;
+
+private:
+  operand_type m_operand;
+  tile_type m_tile;
+
+public:
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  RestrictExtents(operand_type const& operand, tile_type const& tile)
+      : m_operand {operand},
+        m_tile {tile}
+  {}
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  constexpr index_type getDimSize(index_type dim) const
+  {
+    return m_tile.m_size[dim];
+  }
 
-    template<typename OPERAND_TYPE, typename TILE_TYPE>
-    class RestrictExtents : public TensorExpressionBase<RestrictExtents<OPERAND_TYPE, TILE_TYPE>> {
-      public:
-        using self_type = RestrictExtents<OPERAND_TYPE, TILE_TYPE>;
-        using operand_type = OPERAND_TYPE;
-        using result_type = typename OPERAND_TYPE::result_type;
-        using index_type = typename TILE_TYPE::index_type;
-        using tile_type = TILE_TYPE;
-        static constexpr camp::idx_t s_num_dims = OPERAND_TYPE::s_num_dims;
+  RAJA_INLINE
 
-      private:
-        operand_type m_operand;
-        tile_type m_tile;
+  RAJA_HOST_DEVICE
+  constexpr index_type getDimBegin(camp::idx_t dim) const
+  {
+    return m_tile.m_begin[dim];
+  }
 
-      public:
+  template<typename TILE_TYPE2>
+  RAJA_INLINE RAJA_HOST_DEVICE auto eval(TILE_TYPE2 const& tile) const
+      -> decltype(m_operand.eval(tile))
+  {
+    return m_operand.eval(tile);
+  }
 
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        RestrictExtents(operand_type const &operand, tile_type const &tile) :
-        m_operand{operand}, m_tile{tile}
-        {}
+  RAJA_INLINE
 
+  RAJA_HOST_DEVICE
+  void print_ast() const
+  {
+    printf("RestrictExtents(");
+    m_operand.print_ast();
+    printf(")");
+  }
+};
+
+// Wraps operand in a RestrictExtents built from a copy of tile's extents.
+template<typename OPERAND, typename TILE>
+RestrictExtents<OPERAND, TILE> restrictExtents(OPERAND const& operand,
+                                               TILE const& tile)
+{
+  typename OPERAND::tile_type restricted;
+  restricted.copy(tile);
+  return RestrictExtents<OPERAND, TILE>(operand, restricted);
+}
 
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        index_type getDimSize(index_type dim) const {
-          return m_tile.m_size[dim];
-        }
+/*!
+ * Specialization for matrix-matrix multiplication for TensorBlocks
+ *
+ * By default the A*B operator for two matrices produces a matrix-matrix
+ * multiplication.
+ *
+ */
 
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        index_type getDimBegin(camp::idx_t dim) const {
-          return m_tile.m_begin[dim];
-        }
+template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+struct MultiplyOperator<
+    LEFT_OPERAND_TYPE,
+    RIGHT_OPERAND_TYPE,
+    typename std::enable_if<
+        std::is_base_of<TensorBlockConcreteBase,
+                        typename RIGHT_OPERAND_TYPE::tensor_type>::value &&
+        LEFT_OPERAND_TYPE::s_num_dims == 2 &&
+        RIGHT_OPERAND_TYPE::s_num_dims == 2>::type>
+{
+  using left_type   = LEFT_OPERAND_TYPE;
+  using right_type  = RIGHT_OPERAND_TYPE;
+  using result_type = typename LEFT_OPERAND_TYPE::result_type::product_type;
+  static constexpr camp::idx_t s_num_dims = 2;
 
+  //      static_assert(LEFT_OPERAND_TYPE::s_num_dims == 1, "WHAOO");
+  //      static_assert(! std::is_base_of<TensorBlockConcreteBase, typename
+  //      RIGHT_OPERAND_TYPE::tensor_type>::value, "MATCH");
 
-        template<typename TILE_TYPE2>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        auto eval(TILE_TYPE2 const &tile) const ->
-          decltype(m_operand.eval(tile))
-        {
-          return m_operand.eval(tile);
-        }
 
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void print_ast() const {
-          printf("RestrictExtents(");
-          m_operand.print_ast();
-          printf(")");
-        }
+  // This tensor type is a TensorBlock of some kind
+  using tensor_type = typename RIGHT_OPERAND_TYPE::tensor_type;
 
+  // Get the storage type from the TensorBlock
+  using storage_type = typename tensor_type::storage_type;
 
-    };
+  // Create a BlockLiteral that uses the TensorBlock's indicated storage
+  // and has an eval() that produces the TensorBlock's register type
+  using block_literal =
+      BlockLiteral<storage_type, typename tensor_type::register_type>;
 
-    template<typename OPERAND, typename TILE>
-    RestrictExtents<OPERAND, TILE> restrictExtents(OPERAND const &operand, TILE const &tile){
-      using tile_type = typename OPERAND::tile_type;
-      tile_type new_tile;
-      new_tile.copy(tile);
-      return RestrictExtents<OPERAND, TILE>(operand, new_tile);
-    }
+  RAJA_INLINE
 
+  RAJA_HOST_DEVICE
+  static void print_ast() { printf("Matrx*Matrix"); }
 
+  RAJA_INLINE
 
-    /*!
-     * Specialization for matrix-matrix multiplication for TensorBlocks
+  RAJA_HOST_DEVICE
+  static int getDimSize(int dim,
+                        LEFT_OPERAND_TYPE const& left,
+                        RIGHT_OPERAND_TYPE const& right)
+  {
+    return dim == 0 ? left.getDimSize(0) : right.getDimSize(1);
+  }
+
+  /*!
+   * Build and return a BlockLiteral for tile (operands are not evaluated yet)
+   */
+  template<typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static block_literal multiply(
+      TILE_TYPE const& tile,
+      LEFT_OPERAND_TYPE const&,
+      RIGHT_OPERAND_TYPE const&)  //->
+                                  /// decltype(TensorMultiply<decltype(left.eval(tile)),
+                                  /// decltype(right.eval(tile))>(left.eval(tile),
+                                  /// right.eval(tile)))
+  {
+
+    /*
+     * First pass:  just return a Multiply ET that evaluates the block
+     * with underlying TensorRegisters
+     *
+     *
+     * Second pass: we want to return a TensorLiteral ET node with the
+     * matrix product already evaluated.?
+     *
+     * What we really care about is improving the data reuse: so perhaps
+     * returning a Multiply ET node with TensorLiteral nodes for each
+     * of the operands
+     *
+     */
+    // create a BlockLiteral
+    block_literal result(tile);
+
+    // evaluate the block-wise product into result
+
+    // return TensorMultiply<decltype(left.eval(tile)),
+    // decltype(right.eval(tile))>(left.eval(tile), right.eval(tile));
+
+    // return the BlockLiteral ET
+    return result;
+  }
+
+  template<typename TILE_TYPE, typename ADD_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static block_literal multiply_add(
+      TILE_TYPE const& tile,
+      LEFT_OPERAND_TYPE const& left,
+      RIGHT_OPERAND_TYPE const& right,
+      ADD_TYPE const& add)  //->
+                            // decltype(TensorMultiplyAdd<decltype(left.eval(tile)),
+                            // decltype(right.eval(tile)),
+                            // decltype(add.eval(tile))>(left.eval(tile),
+                            // right.eval(tile), add.eval(tile)))
+  {
+    /*
+     * First pass:  we want to return a BlockLiteral ET node with the
+     * matrix product already evaluated.  We do this by creating
+     * a LoadStore node wrapping the BlockLiteral, and evaluating it as
+     * a sub-expression.
      *
-     * By default the A*B operator for two matrices produces a matrix-matrix
-     * multiplication.
+     * What we really care about is improving the data reuse: so perhaps
+     * returning a Multiply ET node with TensorLiteral nodes for each
+     * of the operands
      *
      */
 
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    struct MultiplyOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE,
-    typename std::enable_if<
-    std::is_base_of<TensorBlockConcreteBase, typename RIGHT_OPERAND_TYPE::tensor_type>::value &&
-    LEFT_OPERAND_TYPE::s_num_dims == 2 && RIGHT_OPERAND_TYPE::s_num_dims==2>::type>
+    // create a BlockLiteral
+    using block_tile_type = typename block_literal::tile_type;
+    block_tile_type block_tile;
+    block_tile.copy(tile);
+    block_literal result(block_tile);
+
+    using ref_type        = typename block_literal::ref_type;
+    using load_store_type = TensorLoadStore<tensor_type, ref_type>;
+
+    // initialize the result with our addition term
+    auto result_et = load_store_type(result.get_ref()).eval(tile);
+    result_et      = add.eval(tile);
+
+    // return TensorMultiplyAdd<decltype(left.eval(tile)),
+    // decltype(right.eval(tile)), decltype(add.eval(tile))>(left.eval(tile),
+    // right.eval(tile), add.eval(tile));
+
+    //          multiply_into_result(result_et, tile, restrictExtents(left,
+    //          tile), restrictExtents(right, tile));
+    multiply_into_result(result_et, tile, left, right);
+
+    // return the BlockLiteral ET
+    return result;
+  }
+
+private:
+  template<typename STORAGE, typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static void multiply_into_result(
+      STORAGE& result,
+      TILE_TYPE const& tile,
+      LEFT_OPERAND_TYPE const& et_left,
+      RIGHT_OPERAND_TYPE const& et_right)
+  {
+
+    // get tile size from matrix type
+    auto tile_size = result_type::s_dim_elem(1);
+    auto k_size    = et_left.getDimSize(1);
+
+    // TODO: check that left and right are compatible
+    // et_left.getDimSize(1) == et_right.getDimSize(0)
+    // how do we provide checking for this kind of error?
+
+    // tile over row of left and column of right
+    TILE_TYPE left_tile = tile;
+    left_tile.m_size[1] = tile_size;
+    auto left_begin     = et_left.getDimBegin(1);
+
+    TILE_TYPE right_tile = tile;
+    right_tile.m_size[0] = tile_size;
+    auto right_begin     = et_right.getDimBegin(0);
+
+
+    // Do full tiles in k
+    decltype(k_size) k = 0;
+    for (; k + tile_size <= k_size; k += tile_size)
     {
-        using left_type = LEFT_OPERAND_TYPE;
-        using right_type = RIGHT_OPERAND_TYPE;
-        using result_type = typename LEFT_OPERAND_TYPE::result_type::product_type;
-        static constexpr camp::idx_t s_num_dims = 2;
 
-  //      static_assert(LEFT_OPERAND_TYPE::s_num_dims == 1, "WHAOO");
-  //      static_assert(! std::is_base_of<TensorBlockConcreteBase, typename RIGHT_OPERAND_TYPE::tensor_type>::value, "MATCH");
-
-
-        // This tensor type is a TensorBlock of some kind
-        using tensor_type = typename RIGHT_OPERAND_TYPE::tensor_type;
-
-        // Get the storage type from the TensorBlock
-        using storage_type = typename tensor_type::storage_type;
-
-        // Create a BlockLiteral that uses the TensorBlock's indicated storage
-        // and has an eval() that produces the TensorBlock's register type
-        using block_literal = BlockLiteral<storage_type,
-                                           typename tensor_type::register_type>;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        void print_ast() {
-          printf("Matrx*Matrix");
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        int getDimSize(int dim, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right) {
-          return dim == 0 ? left.getDimSize(0) : right.getDimSize(1);
-        }
-
-        /*!
-         * Evaluate operands and perform element-wise multiply
-         */
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        block_literal multiply(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &, RIGHT_OPERAND_TYPE const &) //->
-          ///decltype(TensorMultiply<decltype(left.eval(tile)), decltype(right.eval(tile))>(left.eval(tile), right.eval(tile)))
-        {
-
-          /*
-           * First pass:  just return a Multiply ET that evaluates the block
-           * with underlying TensorRegisters
-           *
-           *
-           * Second pass: we want to return a TensorLiteral ET node with the
-           * matrix product already evaluated.?
-           *
-           * What we really care about is improving the data reuse: so perhaps
-           * returning a Multiply ET node with TensorLiteral nodes for each
-           * of the operands
-           *
-           */
-          // create a BlockLiteral
-          block_literal result(tile);
-
-          // evaluate the block-wise product into result
-
-          //return TensorMultiply<decltype(left.eval(tile)), decltype(right.eval(tile))>(left.eval(tile), right.eval(tile));
-
-          // return the BlockLiterat ET
-          return result;
-        }
-
-        template<typename TILE_TYPE, typename ADD_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        block_literal multiply_add(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right, ADD_TYPE const &add) //->
-          //decltype(TensorMultiplyAdd<decltype(left.eval(tile)), decltype(right.eval(tile)), decltype(add.eval(tile))>(left.eval(tile), right.eval(tile), add.eval(tile)))
-        {
-          /*
-           * First pass:  we want to return a BlockLiteral ET node with the
-           * matrix product already evaluated.  We do this by creating
-           * a LoadStore node wrapping the BlockLiteral, and evaluating it as
-           * a sub-expression.
-           *
-           * What we really care about is improving the data reuse: so perhaps
-           * returning a Multiply ET node with TensorLiteral nodes for each
-           * of the operands
-           *
-           */
-
-          // create a BlockLiteral
-          using block_tile_type = typename block_literal::tile_type;
-          block_tile_type block_tile;
-          block_tile.copy(tile);
-          block_literal result(block_tile);
-
-          using ref_type = typename block_literal::ref_type;
-          using load_store_type = TensorLoadStore<tensor_type, ref_type>;
-
-          // initialize the result with our addition term
-          auto result_et = load_store_type(result.get_ref()).eval(tile);
-          result_et = add.eval(tile);
-
-          //return TensorMultiplyAdd<decltype(left.eval(tile)), decltype(right.eval(tile)), decltype(add.eval(tile))>(left.eval(tile), right.eval(tile), add.eval(tile));
-
-//          multiply_into_result(result_et, tile, restrictExtents(left, tile), restrictExtents(right, tile));
-          multiply_into_result(result_et, tile, left, right);
-
-          // return the BlockLiterat ET
-          return result;
-        }
-
-      private:
-
-        template<typename STORAGE, typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        void multiply_into_result(STORAGE &result, TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &et_left, RIGHT_OPERAND_TYPE const &et_right)
-        {
-
-          // get tile size from matrix type
-          auto tile_size = result_type::s_dim_elem(1);
-          auto k_size = et_left.getDimSize(1);
-
-          // TODO: check that left and right are compatible
-          // m_left.getDimSize(1) == m_right.getDimSize(0)
-          // how do we provide checking for this kind of error?
-
-          // tile over row of left and column of right
-          TILE_TYPE left_tile = tile;
-          left_tile.m_size[1] = tile_size;
-          auto left_begin = et_left.getDimBegin(1);
-
-          TILE_TYPE right_tile = tile;
-          right_tile.m_size[0] = tile_size;
-          auto right_begin = et_right.getDimBegin(0);
-
-
-
-          // Do full tiles in k
-          decltype(k_size) k = 0;
-          for(;k+tile_size <= k_size; k+= tile_size){
-
-
-            // evaluate both sides of operator
-            left_tile.m_begin[1] = k + left_begin;
-            auto left = et_left.eval(left_tile);
-
-            right_tile.m_begin[0] = k + right_begin;
-            auto right = et_right.eval(right_tile);
-
-            // accumulate product
-            //left.matrix_multiply_accumulate(result, right);
-            result += restrictExtents(left, left_tile) * restrictExtents(right, right_tile);
-          }
-          // remainder tile in k
-          if(k < k_size){
-
-            auto &left_part_tile = make_tensor_tile_partial(left_tile);
-            left_part_tile.m_begin[1] = k + left_begin;
-            left_part_tile.m_size[1] = k_size-k;
-            auto left = et_left.eval(left_part_tile);
-
-            auto &right_part_tile = make_tensor_tile_partial(right_tile);
-            right_part_tile.m_begin[0] = k + right_begin;
-            right_part_tile.m_size[0] = k_size-k;
-            auto right = et_right.eval(right_part_tile);
-
-            // accumulate product
-            //left.matrix_multiply_accumulate(result, right);
-            result += restrictExtents(left, left_part_tile) * restrictExtents(right, right_part_tile);
-          }
-        }
-    };
-
-
-  } // namespace ET
 
-  } // namespace internal
-} // namespace expt
+      // evaluate both sides of operator
+      left_tile.m_begin[1] = k + left_begin;
+      auto left            = et_left.eval(left_tile);
+
+      right_tile.m_begin[0] = k + right_begin;
+      auto right            = et_right.eval(right_tile);
+
+      // accumulate product
+      // left.matrix_multiply_accumulate(result, right);
+      result +=
+          restrictExtents(left, left_tile) * restrictExtents(right, right_tile);
+    }
+    // remainder tile in k
+    if (k < k_size)
+    {
+
+      auto& left_part_tile      = make_tensor_tile_partial(left_tile);
+      left_part_tile.m_begin[1] = k + left_begin;
+      left_part_tile.m_size[1]  = k_size - k;
+      auto left                 = et_left.eval(left_part_tile);
+
+      auto& right_part_tile      = make_tensor_tile_partial(right_tile);
+      right_part_tile.m_begin[0] = k + right_begin;
+      right_part_tile.m_size[0]  = k_size - k;
+      auto right                 = et_right.eval(right_part_tile);
+
+      // accumulate product
+      // left.matrix_multiply_accumulate(result, right);
+      result += restrictExtents(left, left_part_tile) *
+                restrictExtents(right, right_part_tile);
+    }
+  }
+};
+
+
+}  // namespace ET
+
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/TensorDivide.hpp b/include/RAJA/pattern/tensor/internal/ET/TensorDivide.hpp
index faa92747dd..ccff2f82cb 100644
--- a/include/RAJA/pattern/tensor/internal/ET/TensorDivide.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/TensorDivide.hpp
@@ -24,7 +24,6 @@
 
 #include "RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp"
 
-
 namespace RAJA
 {
 namespace internal
@@ -33,346 +32,388 @@ namespace expt
 {
 
 
-  namespace ET
+namespace ET
+{
+
+template<typename LEFT_OPERAND_TYPE,
+         typename RIGHT_OPERAND_TYPE,
+         class ENABLE = void>
+struct DivideOperator;
+
+/*!
+ * Specialization that provides dividing a scalar by a vector
+ */
+template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+struct DivideOperator<
+    LEFT_OPERAND_TYPE,
+    RIGHT_OPERAND_TYPE,
+    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 0 &&
+                            RIGHT_OPERAND_TYPE::s_num_dims == 1>::type>
+{
+
+  using result_type = typename RIGHT_OPERAND_TYPE::result_type;
+  static constexpr camp::idx_t s_num_dims = RIGHT_OPERAND_TYPE::s_num_dims;
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static int getDimSize(int dim,
+                        LEFT_OPERAND_TYPE const&,
+                        RIGHT_OPERAND_TYPE const& right)
+  {
+    return right.getDimSize(dim);
+  }
+
+  /*!
+   * Evaluate operands and perform element-wise divide
+   */
+  template<typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static result_type divide(
+      TILE_TYPE const& tile,
+      LEFT_OPERAND_TYPE const& left,
+      RIGHT_OPERAND_TYPE const& right)
   {
+    result_type numerator(left.eval(tile));
+
+    if (tile.s_tensor_size == TENSOR_FULL)
+    {
+      return numerator.divide(right.eval(tile));
+    }
+
+    return numerator.divide_n(right.eval(tile), tile.m_size[0]);
+  }
+};
+
+/*!
+ * Specialization that provides dividing a vector by a scalar
+ */
+template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+struct DivideOperator<
+    LEFT_OPERAND_TYPE,
+    RIGHT_OPERAND_TYPE,
+    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 1 &&
+                            RIGHT_OPERAND_TYPE::s_num_dims == 0>::type>
+{
+  using result_type = typename LEFT_OPERAND_TYPE::result_type;
+  static constexpr camp::idx_t s_num_dims = LEFT_OPERAND_TYPE::s_num_dims;
 
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE, class ENABLE = void>
-    struct DivideOperator;
 
+  RAJA_INLINE
 
+  RAJA_HOST_DEVICE
+  static int getDimSize(int dim,
+                        LEFT_OPERAND_TYPE const& left,
+                        RIGHT_OPERAND_TYPE const&)
+  {
+    return left.getDimSize(dim);
+  }
+
+  /*!
+   * Evaluate operands and perform element-wise divide
+   */
+  template<typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static result_type divide(
+      TILE_TYPE const& tile,
+      LEFT_OPERAND_TYPE const& left,
+      RIGHT_OPERAND_TYPE const& right)
+  {
+    result_type denominator(right.eval(tile));
 
-    /*!
-     * Specialization that provides dividing a scalar by a vector
-     */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    struct DivideOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE,
-    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 0 && RIGHT_OPERAND_TYPE::s_num_dims == 1>::type>
+    if (tile.s_tensor_size == TENSOR_FULL)
     {
+      return left.eval(tile).divide(denominator);
+    }
+    else
+    {
+      return left.eval(tile).divide_n(denominator, tile.m_size[0]);
+    }
+  }
+};
 
-      using result_type = typename RIGHT_OPERAND_TYPE::result_type;
-      static constexpr camp::idx_t s_num_dims = RIGHT_OPERAND_TYPE::s_num_dims;
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      int getDimSize(int dim, LEFT_OPERAND_TYPE const &, RIGHT_OPERAND_TYPE const &right) {
-        return right.getDimSize(dim);
-      }
-
-      /*!
-       * Evaluate operands and perform element-wise divide
-       */
-      template<typename TILE_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      result_type divide(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right)
-      {
-        result_type numerator(left.eval(tile));
-
-        if(tile.s_tensor_size == TENSOR_FULL){
-          return numerator.divide(right.eval(tile));
-        }
-
-        return numerator.divide_n(right.eval(tile), tile.m_size[0]);
-
-      }
-    };
-
-
-    /*!
-     * Specialization that provides dividing a vector by a scalar
-     */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    struct DivideOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE,
-    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 1 && RIGHT_OPERAND_TYPE::s_num_dims == 0>::type>
+/*!
+ * Specialization that provides dividing a vector by a vector
+ */
+template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+struct DivideOperator<
+    LEFT_OPERAND_TYPE,
+    RIGHT_OPERAND_TYPE,
+    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 1 &&
+                            RIGHT_OPERAND_TYPE::s_num_dims == 1>::type>
+{
+  using result_type = typename LEFT_OPERAND_TYPE::result_type;
+  static constexpr camp::idx_t s_num_dims = LEFT_OPERAND_TYPE::s_num_dims;
+
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static int getDimSize(int dim,
+                        LEFT_OPERAND_TYPE const& left,
+                        RIGHT_OPERAND_TYPE const&)
+  {
+    return left.getDimSize(dim);
+  }
+
+  /*!
+   * Evaluate operands and perform element-wise divide
+   */
+  template<typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static result_type divide(
+      TILE_TYPE const& tile,
+      LEFT_OPERAND_TYPE const& left,
+      RIGHT_OPERAND_TYPE const& right)
+  {
+    if (tile.s_tensor_size == TENSOR_FULL)
     {
-      using result_type = typename LEFT_OPERAND_TYPE::result_type;
-      static constexpr camp::idx_t s_num_dims = LEFT_OPERAND_TYPE::s_num_dims;
-
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      int getDimSize(int dim, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &) {
-        return left.getDimSize(dim);
-      }
-
-      /*!
-       * Evaluate operands and perform element-wise divide
-       */
-      template<typename TILE_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      result_type divide(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right)
-      {
-        result_type denominator(right.eval(tile));
-
-        if(tile.s_tensor_size == TENSOR_FULL){
-          return left.eval(tile).divide(denominator);
-        }
-        else{
-          return left.eval(tile).divide_n(denominator, tile.m_size[0]);
-        }
-      }
-    };
-
-
-    /*!
-     * Specialization that provides dividing a vector by a vector
-     */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    struct DivideOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE,
-    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 1 && RIGHT_OPERAND_TYPE::s_num_dims == 1>::type>
+      return left.eval(tile).divide(right.eval(tile));
+    }
+    else
     {
-      using result_type = typename LEFT_OPERAND_TYPE::result_type;
-      static constexpr camp::idx_t s_num_dims = LEFT_OPERAND_TYPE::s_num_dims;
-
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      int getDimSize(int dim, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &) {
-        return left.getDimSize(dim);
-      }
-
-      /*!
-       * Evaluate operands and perform element-wise divide
-       */
-      template<typename TILE_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      result_type divide(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right)
-      {
-        if(tile.s_tensor_size == TENSOR_FULL){
-          return left.eval(tile).divide(right.eval(tile));
-        }
-        else{
-          return left.eval(tile).divide_n(right.eval(tile), tile.m_size[0]);
-        }
-      }
-    };
-
-
-
-
-
-
-    /*!
-     * Specialization that provides dividing a scalar by a matrix
-     */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    struct DivideOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE,
-    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 0 && RIGHT_OPERAND_TYPE::s_num_dims == 2>::type>
+      return left.eval(tile).divide_n(right.eval(tile), tile.m_size[0]);
+    }
+  }
+};
+
+/*!
+ * Specialization that provides dividing a scalar by a matrix
+ */
+template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+struct DivideOperator<
+    LEFT_OPERAND_TYPE,
+    RIGHT_OPERAND_TYPE,
+    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 0 &&
+                            RIGHT_OPERAND_TYPE::s_num_dims == 2>::type>
+{
+
+  using result_type = typename RIGHT_OPERAND_TYPE::result_type;
+  static constexpr camp::idx_t s_num_dims = RIGHT_OPERAND_TYPE::s_num_dims;
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static int getDimSize(int dim,
+                        LEFT_OPERAND_TYPE const&,
+                        RIGHT_OPERAND_TYPE const& right)
+  {
+    return right.getDimSize(dim);
+  }
+
+  /*!
+   * Evaluate operands and perform element-wise divide
+   */
+  template<typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static result_type divide(
+      TILE_TYPE const& tile,
+      LEFT_OPERAND_TYPE const& left,
+      RIGHT_OPERAND_TYPE const& right)
+  {
+    result_type numerator(left.eval(tile));
+
+    if (tile.s_tensor_size == TENSOR_FULL)
     {
+      return numerator.divide(right.eval(tile));
+    }
 
-      using result_type = typename RIGHT_OPERAND_TYPE::result_type;
-      static constexpr camp::idx_t s_num_dims = RIGHT_OPERAND_TYPE::s_num_dims;
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      int getDimSize(int dim, LEFT_OPERAND_TYPE const &, RIGHT_OPERAND_TYPE const &right) {
-        return right.getDimSize(dim);
-      }
-
-      /*!
-       * Evaluate operands and perform element-wise divide
-       */
-      template<typename TILE_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      result_type divide(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right)
-      {
-        result_type numerator(left.eval(tile));
-
-        if(tile.s_tensor_size == TENSOR_FULL){
-          return numerator.divide(right.eval(tile));
-        }
-
-        return numerator.divide_nm(right.eval(tile), tile.m_size[0], tile.m_size[1]);
-
-      }
-    };
-
-
-    /*!
-     * Specialization that provides dividing a vector by a scalar
-     */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    struct DivideOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE,
-    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 2 && RIGHT_OPERAND_TYPE::s_num_dims == 0>::type>
+    return numerator.divide_nm(right.eval(tile), tile.m_size[0],
+                               tile.m_size[1]);
+  }
+};
+
+/*!
+ * Specialization that provides dividing a matrix by a scalar
+ */
+template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+struct DivideOperator<
+    LEFT_OPERAND_TYPE,
+    RIGHT_OPERAND_TYPE,
+    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 2 &&
+                            RIGHT_OPERAND_TYPE::s_num_dims == 0>::type>
+{
+  using result_type = typename LEFT_OPERAND_TYPE::result_type;
+  static constexpr camp::idx_t s_num_dims = LEFT_OPERAND_TYPE::s_num_dims;
+
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static int getDimSize(int dim,
+                        LEFT_OPERAND_TYPE const& left,
+                        RIGHT_OPERAND_TYPE const&)
+  {
+    return left.getDimSize(dim);
+  }
+
+  /*!
+   * Evaluate operands and perform element-wise divide
+   */
+  RAJA_SUPPRESS_HD_WARN
+  template<typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static result_type divide(
+      TILE_TYPE const& tile,
+      LEFT_OPERAND_TYPE const& left,
+      RIGHT_OPERAND_TYPE const& right)
+  {
+    result_type denominator(right.eval(tile));
+
+    if (tile.s_tensor_size == TENSOR_FULL)
     {
-      using result_type = typename LEFT_OPERAND_TYPE::result_type;
-      static constexpr camp::idx_t s_num_dims = LEFT_OPERAND_TYPE::s_num_dims;
-
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      int getDimSize(int dim, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &) {
-        return left.getDimSize(dim);
-      }
-
-      /*!
-       * Evaluate operands and perform element-wise divide
-       */
-      RAJA_SUPPRESS_HD_WARN
-      template<typename TILE_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      result_type divide(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right)
-      {
-        result_type denominator(right.eval(tile));
-
-        if(tile.s_tensor_size == TENSOR_FULL){
-          return left.eval(tile).divide(denominator);
-        }
-        else{
-          return left.eval(tile).divide_nm(denominator, tile.m_size[0], tile.m_size[1]);
-        }
-      }
-    };
-
-
-    /*!
-     * Specialization that provides dividing a vector by a vector
-     */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    struct DivideOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE,
-    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 2 && RIGHT_OPERAND_TYPE::s_num_dims == 2>::type>
+      return left.eval(tile).divide(denominator);
+    }
+    else
     {
-      using result_type = typename LEFT_OPERAND_TYPE::result_type;
-      static constexpr camp::idx_t s_num_dims = LEFT_OPERAND_TYPE::s_num_dims;
-
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      int getDimSize(int dim, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &) {
-        return left.getDimSize(dim);
-      }
-
-      /*!
-       * Evaluate operands and perform element-wise divide
-       */
-      template<typename TILE_TYPE>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      result_type divide(TILE_TYPE const &tile, LEFT_OPERAND_TYPE const &left, RIGHT_OPERAND_TYPE const &right)
-      {
-        if(tile.s_tensor_size == TENSOR_FULL){
-          return left.eval(tile).divide(right.eval(tile));
-        }
-        else{
-          return left.eval(tile).divide_nm(right.eval(tile), tile.m_size[0], tile.m_size[1]);
-        }
-      }
-    };
-
-
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    class TensorDivide: public TensorExpressionBase<TensorDivide<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE>> {
-      public:
-        using self_type = TensorDivide<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE>;
-        using left_operand_type = LEFT_OPERAND_TYPE;
-        using right_operand_type = RIGHT_OPERAND_TYPE;
-        using element_type = typename LEFT_OPERAND_TYPE::element_type;
-        using index_type = typename LEFT_OPERAND_TYPE::index_type;
-
-        using divide_op = DivideOperator<left_operand_type, right_operand_type>;
-        using result_type = typename divide_op::result_type;
-        static constexpr camp::idx_t s_num_dims = divide_op::s_num_dims;
-
-
-      private:
-        left_operand_type m_left_operand;
-        right_operand_type m_right_operand;
-
-      public:
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorDivide(left_operand_type const &left_operand, right_operand_type const &right_operand) :
-        m_left_operand{left_operand}, m_right_operand{right_operand}
-        {}
-
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        index_type getDimSize(index_type dim) const {
-          return divide_op::getDimSize(dim, m_left_operand, m_right_operand);
-        }
-
-
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        result_type eval(TILE_TYPE const &tile) const
-        {
-          return divide_op::divide(tile, m_left_operand, m_right_operand);
-        }
-
-        /*!
-         * Returns the LHS of the operation, used to form contractions
-         */
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        left_operand_type const &getLeftOperand() const {
-          return m_left_operand;
-        }
-
-        /*!
-         * Returns the RHS of the operation, used to form contractions
-         */
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        right_operand_type const &getRightOperand() const {
-          return m_right_operand;
-        }
-
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void print_ast() const {
-          printf("Divide(");
-          m_left_operand.print_ast();
-          printf(", ");
-          m_right_operand.print_ast();
-          printf(")");
-        }
-
-
-    };
-
-
-    /*
-     * Overload for:    arithmetic / tensorexpression
-
-     */
-    template<typename LHS, typename RHS,
-      typename std::enable_if<std::is_arithmetic<LHS>::value, bool>::type = true,
-      typename std::enable_if<std::is_base_of<TensorExpressionConcreteBase, RHS>::value, bool>::type = true>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    auto operator/(LHS const &left_operand, RHS const &right_operand) ->
-    TensorDivide<typename NormalizeOperandHelper<LHS>::return_type, RHS>
+      return left.eval(tile).divide_nm(denominator, tile.m_size[0],
+                                       tile.m_size[1]);
+    }
+  }
+};
+
+/*!
+ * Specialization that provides dividing a matrix by a matrix
+ */
+template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+struct DivideOperator<
+    LEFT_OPERAND_TYPE,
+    RIGHT_OPERAND_TYPE,
+    typename std::enable_if<LEFT_OPERAND_TYPE::s_num_dims == 2 &&
+                            RIGHT_OPERAND_TYPE::s_num_dims == 2>::type>
+{
+  using result_type = typename LEFT_OPERAND_TYPE::result_type;
+  static constexpr camp::idx_t s_num_dims = LEFT_OPERAND_TYPE::s_num_dims;
+
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static int getDimSize(int dim,
+                        LEFT_OPERAND_TYPE const& left,
+                        RIGHT_OPERAND_TYPE const&)
+  {
+    return left.getDimSize(dim);
+  }
+
+  /*!
+   * Evaluate operands and perform element-wise divide
+   */
+  template<typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE static result_type divide(
+      TILE_TYPE const& tile,
+      LEFT_OPERAND_TYPE const& left,
+      RIGHT_OPERAND_TYPE const& right)
+  {
+    if (tile.s_tensor_size == TENSOR_FULL)
+    {
+      return left.eval(tile).divide(right.eval(tile));
+    }
+    else
     {
-      return TensorDivide<typename NormalizeOperandHelper<LHS>::return_type, RHS>(NormalizeOperandHelper<LHS>::normalize(left_operand), right_operand);
+      return left.eval(tile).divide_nm(right.eval(tile), tile.m_size[0],
+                                       tile.m_size[1]);
     }
+  }
+};
+
+template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+class TensorDivide : public TensorExpressionBase<
+                         TensorDivide<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE>>
+{
+public:
+  using self_type         = TensorDivide<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE>;
+  using left_operand_type = LEFT_OPERAND_TYPE;
+  using right_operand_type = RIGHT_OPERAND_TYPE;
+  using element_type       = typename LEFT_OPERAND_TYPE::element_type;
+  using index_type         = typename LEFT_OPERAND_TYPE::index_type;
+
+  using divide_op   = DivideOperator<left_operand_type, right_operand_type>;
+  using result_type = typename divide_op::result_type;
+  static constexpr camp::idx_t s_num_dims = divide_op::s_num_dims;
+
+
+private:
+  left_operand_type m_left_operand;
+  right_operand_type m_right_operand;
+
+public:
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  TensorDivide(left_operand_type const& left_operand,
+               right_operand_type const& right_operand)
+      : m_left_operand {left_operand},
+        m_right_operand {right_operand}
+  {}
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  constexpr index_type getDimSize(index_type dim) const
+  {
+    return divide_op::getDimSize(dim, m_left_operand, m_right_operand);
+  }
+
+  template<typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE result_type eval(TILE_TYPE const& tile) const
+  {
+    return divide_op::divide(tile, m_left_operand, m_right_operand);
+  }
+
+  /*!
+   * Returns the LHS of the operation, used to form contractions
+   */
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  constexpr left_operand_type const& getLeftOperand() const
+  {
+    return m_left_operand;
+  }
+
+  /*!
+   * Returns the RHS of the operation, used to form contractions
+   */
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  constexpr right_operand_type const& getRightOperand() const
+  {
+    return m_right_operand;
+  }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  void print_ast() const
+  {
+    printf("Divide(");
+    m_left_operand.print_ast();
+    printf(", ");
+    m_right_operand.print_ast();
+    printf(")");
+  }
+};
+
+/*
+ * Overload for:    arithmetic / tensorexpression
+
+ */
+template<
+    typename LHS,
+    typename RHS,
+    typename std::enable_if<std::is_arithmetic<LHS>::value, bool>::type = true,
+    typename std::enable_if<
+        std::is_base_of<TensorExpressionConcreteBase, RHS>::value,
+        bool>::type = true>
+RAJA_INLINE RAJA_HOST_DEVICE auto operator/(LHS const& left_operand,
+                                            RHS const& right_operand)
+    -> TensorDivide<typename NormalizeOperandHelper<LHS>::return_type, RHS>
+{
+  return TensorDivide<typename NormalizeOperandHelper<LHS>::return_type, RHS>(
+      NormalizeOperandHelper<LHS>::normalize(left_operand), right_operand);
+}
 
-  } // namespace ET
+}  // namespace ET
 
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/TensorLiteral.hpp b/include/RAJA/pattern/tensor/internal/ET/TensorLiteral.hpp
index 6720a304f2..995829f83e 100644
--- a/include/RAJA/pattern/tensor/internal/ET/TensorLiteral.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/TensorLiteral.hpp
@@ -24,7 +24,6 @@
 
 #include "RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp"
 
-
 namespace RAJA
 {
 namespace internal
@@ -33,76 +32,74 @@ namespace expt
 {
 
 
-  namespace ET
+namespace ET
+{
+
+
+template<typename TENSOR_TYPE>
+class TensorLiteral : public TensorExpressionBase<TensorLiteral<TENSOR_TYPE>>
+{
+public:
+  using self_type    = TensorLiteral<TENSOR_TYPE>;
+  using tensor_type  = TENSOR_TYPE;
+  using element_type = typename TENSOR_TYPE::element_type;
+  using result_type  = tensor_type;
+  using index_type   = RAJA::Index_type;
+
+  static constexpr camp::idx_t s_num_dims = result_type::s_num_dims;
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  constexpr index_type getDimSize(index_type dim) const
+  {
+    return tensor_type::s_dim_elem(dim);
+  }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  explicit TensorLiteral(tensor_type const& value) : m_value {value} {}
+
+  template<typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE result_type eval(TILE_TYPE const&) const
+  {
+    return result_type(m_value);
+  }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  void print_ast() const { printf("TensorLiteral()"); }
+
+private:
+  tensor_type m_value;
+};
+
+/*
+ * For TensorRegister nodes, we need to wrap this in a constant value ET node
+ */
+template<typename RHS>
+struct NormalizeOperandHelper<
+    RHS,
+    typename std::enable_if<
+        std::is_base_of<TensorRegisterConcreteBase, RHS>::value>::type>
+{
+  using return_type = TensorLiteral<RHS>;
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static constexpr return_type normalize(RHS const& rhs)
   {
+    return return_type(rhs);
+  }
+};
 
+}  // namespace ET
 
-    template<typename TENSOR_TYPE>
-    class TensorLiteral :  public TensorExpressionBase<TensorLiteral<TENSOR_TYPE>> {
-      public:
-        using self_type = TensorLiteral<TENSOR_TYPE>;
-        using tensor_type = TENSOR_TYPE;
-        using element_type = typename TENSOR_TYPE::element_type;
-        using result_type = tensor_type;
-        using index_type = RAJA::Index_type;
-
-        static constexpr camp::idx_t s_num_dims = result_type::s_num_dims;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        index_type getDimSize(index_type dim) const {
-          return tensor_type::s_dim_elem(dim);
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        explicit
-        TensorLiteral(tensor_type const &value) :
-        m_value{value}
-        {}
-
-
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        result_type eval(TILE_TYPE const &) const {
-          return result_type(m_value);
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void print_ast() const {
-          printf("TensorLiteral()");
-        }
-
-      private:
-        tensor_type m_value;
-    };
-
-
-    /*
-     * For TensorRegister nodes, we need to wrap this in a constant value ET node
-     */
-    template<typename RHS>
-    struct NormalizeOperandHelper<RHS,
-    typename std::enable_if<std::is_base_of<TensorRegisterConcreteBase, RHS>::value>::type>
-    {
-        using return_type = TensorLiteral<RHS>;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        return_type normalize(RHS const &rhs){
-          return return_type(rhs);
-        }
-    };
-
-  } // namespace ET
-
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/TensorLoadStore.hpp b/include/RAJA/pattern/tensor/internal/ET/TensorLoadStore.hpp
index 3b69552a32..0dddc0adef 100644
--- a/include/RAJA/pattern/tensor/internal/ET/TensorLoadStore.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/TensorLoadStore.hpp
@@ -25,7 +25,6 @@
 #include "RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp"
 #include "RAJA/pattern/tensor/internal/TensorTileExec.hpp"
 
-
 namespace RAJA
 {
 namespace internal
@@ -34,220 +33,187 @@ namespace expt
 {
 
 
+namespace ET
+{
 
 
+template<typename STORAGE, typename LHS_TYPE, typename RHS_TYPE>
+struct TensorStoreFunctor
+{
+  LHS_TYPE const& m_lhs;
+  RHS_TYPE const& m_rhs;
 
-  namespace ET
+  template<typename TILE_TYPE>
+  RAJA_HOST_DEVICE RAJA_INLINE void operator()(TILE_TYPE const& tile) const
   {
 
 
+    /*
+     *
+     * For recursive ET types, eval() produces a new ET, and
+     * eval_lhs() produces a new TensorLoadStore.
+     *
+     */
+
+    m_lhs.eval_lhs(tile) = m_rhs.eval(tile);
+  }
+};
+
+template<typename STORAGE, typename LHS_TYPE, typename RHS_TYPE>
+RAJA_HOST_DEVICE RAJA_INLINE constexpr auto makeTensorStoreFunctor(
+    LHS_TYPE const& lhs,
+    RHS_TYPE const& rhs) -> TensorStoreFunctor<STORAGE, LHS_TYPE, RHS_TYPE>
+{
+  return TensorStoreFunctor<STORAGE, LHS_TYPE, RHS_TYPE> {lhs, rhs};
+}
+
+template<typename TENSOR_TYPE, typename REF_TYPE>
+class TensorLoadStore
+    : public TensorExpressionBase<TensorLoadStore<TENSOR_TYPE, REF_TYPE>>
+{
+public:
+  using self_type    = TensorLoadStore<TENSOR_TYPE, REF_TYPE>;
+  using tensor_type  = TENSOR_TYPE;
+  using element_type = typename TENSOR_TYPE::element_type;
+  using index_type   = typename REF_TYPE::index_type;
+  using ref_type     = REF_TYPE;
+  using tile_type    = typename REF_TYPE::tile_type;
+  using result_type  = TENSOR_TYPE;
+
+  static constexpr camp::idx_t s_num_dims = result_type::s_num_dims;
 
-    template<typename STORAGE, typename LHS_TYPE, typename RHS_TYPE>
-    struct TensorStoreFunctor
-    {
-        LHS_TYPE const &m_lhs;
-        RHS_TYPE const &m_rhs;
-
-        template<typename TILE_TYPE>
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        void operator()(TILE_TYPE const &tile) const {
-
-
-          /*
-           *
-           * For recursive ET types, eval() produces a new ET, and
-           * eval_lhs() produces a new TensorLoadStore.
-           *
-           */
-
-          m_lhs.eval_lhs(tile) = m_rhs.eval(tile);
-
-        }
-    };
-
-    template<typename STORAGE, typename LHS_TYPE, typename RHS_TYPE>
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr
-    auto makeTensorStoreFunctor(LHS_TYPE const &lhs, RHS_TYPE const &rhs) ->
-    TensorStoreFunctor<STORAGE, LHS_TYPE, RHS_TYPE>
-    {
-      return TensorStoreFunctor<STORAGE, LHS_TYPE, RHS_TYPE>{lhs, rhs};
-    }
-
-
-    template<typename TENSOR_TYPE, typename REF_TYPE>
-    class TensorLoadStore : public TensorExpressionBase<TensorLoadStore<TENSOR_TYPE, REF_TYPE>> {
-      public:
-        using self_type = TensorLoadStore<TENSOR_TYPE, REF_TYPE>;
-        using tensor_type = TENSOR_TYPE;
-        using element_type = typename TENSOR_TYPE::element_type;
-        using index_type = typename REF_TYPE::index_type;
-        using ref_type = REF_TYPE;
-        using tile_type = typename REF_TYPE::tile_type;
-        using result_type = TENSOR_TYPE;
-
-        static constexpr camp::idx_t s_num_dims = result_type::s_num_dims;
-
-
-      private:
-        ref_type m_ref;
-
-
-      public:
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        explicit
-        TensorLoadStore(ref_type const &ref) : m_ref{ref}
-        {
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorLoadStore(self_type const &rhs) : m_ref(rhs.m_ref)
-        {}
-
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void print() const {
-          printf("TensorLoadStore: ");
-          m_ref.m_tile.print();
-        }
-
-//        RAJA_SUPPRESS_HD_WARN
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        self_type &operator=(self_type const &rhs)
-        {
-          store(rhs);
-          return *this;
-        }
-
-//        RAJA_SUPPRESS_HD_WARN
-        template<typename RHS>
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        self_type &operator=(RHS const &rhs)
-        {
-
-          store(normalizeOperand(rhs));
-
-          return *this;
-        }
-
-
-        RAJA_SUPPRESS_HD_WARN
-        template<typename RHS>
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        self_type &operator+=(RHS const &rhs)
-        {
-          store( normalizeOperand(rhs) + (*this) );
-          return *this;
-        }
-
-        RAJA_SUPPRESS_HD_WARN
-        template<typename RHS>
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        self_type &operator-=(RHS const &rhs)
-        {
-          store(TensorSubtract<self_type, RHS>(*this, normalizeOperand(rhs)) );
-          return *this;
-        }
-
-        RAJA_SUPPRESS_HD_WARN
-        template<typename RHS>
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        self_type operator*=(RHS const &rhs)
-        {
-          store(TensorMultiply<self_type, RHS>(*this, normalizeOperand(rhs)) );
-          return *this;
-        }
-
-        RAJA_SUPPRESS_HD_WARN
-        template<typename RHS>
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        self_type operator/=(RHS const &rhs)
-        {
-          store(TensorDivide<self_type, RHS>(*this, normalizeOperand(rhs)) );
-          return *this;
-        }
-
-        RAJA_SUPPRESS_HD_WARN
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        auto eval(TILE_TYPE const &tile) const ->
-          decltype(tensor_type::s_load_ref(merge_ref_tile(m_ref, tile)))
-        {
-          return tensor_type::s_load_ref(merge_ref_tile(m_ref, tile));
-        }
-
-        RAJA_SUPPRESS_HD_WARN
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        auto eval_lhs(TILE_TYPE const &tile) const ->
-          decltype(TENSOR_TYPE::create_et_store_ref(merge_ref_tile(this->m_ref, tile)))
-        {
-          return TENSOR_TYPE::create_et_store_ref(merge_ref_tile(m_ref, tile));
-        }
-
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        index_type getDimSize(index_type dim) const {
-          return m_ref.m_tile.m_size[dim];
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void print_ast() const {
-          printf("Load()");
-        }
-
-      private:
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        tile_type const &getTile() const {
-          return m_ref.m_tile;
-        }
-
-
-        template<typename RHS>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void store(RHS const &rhs)
-        {
-#ifdef RAJA_DEBUG_PRINT_ET_AST
-          printf("Store(");
-          rhs.print_ast();
-          printf(")\n");
-#endif
 
-          tensorTileExec<tensor_type>(m_ref.m_tile,
-              makeTensorStoreFunctor<tensor_type>(*this, rhs));
-        }
+private:
+  ref_type m_ref;
 
 
+public:
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  explicit TensorLoadStore(ref_type const& ref) : m_ref {ref} {}
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  TensorLoadStore(self_type const& rhs) : m_ref(rhs.m_ref) {}
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  void print() const
+  {
+    printf("TensorLoadStore: ");
+    m_ref.m_tile.print();
+  }
+
+  //        RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& operator=(self_type const& rhs)
+  {
+    store(rhs);
+    return *this;
+  }
+
+  //        RAJA_SUPPRESS_HD_WARN
+  template<typename RHS>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type& operator=(RHS const& rhs)
+  {
 
+    store(normalizeOperand(rhs));
 
+    return *this;
+  }
+
+  RAJA_SUPPRESS_HD_WARN
+  template<typename RHS>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type& operator+=(RHS const& rhs)
+  {
+    store(normalizeOperand(rhs) + (*this));
+    return *this;
+  }
+
+  RAJA_SUPPRESS_HD_WARN
+  template<typename RHS>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type& operator-=(RHS const& rhs)
+  {
+    store(TensorSubtract<self_type, RHS>(*this, normalizeOperand(rhs)));
+    return *this;
+  }
+
+  RAJA_SUPPRESS_HD_WARN
+  template<typename RHS>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type& operator*=(RHS const& rhs)
+  {  // Stores the product of *this and rhs via store(); returns *this by ref.
+    store(TensorMultiply<self_type, RHS>(*this, normalizeOperand(rhs)));
+    return *this;
+  }
+
+  RAJA_SUPPRESS_HD_WARN
+  template<typename RHS>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type& operator/=(RHS const& rhs)
+  {  // Stores the quotient of *this by rhs via store(); returns *this by ref.
+    store(TensorDivide<self_type, RHS>(*this, normalizeOperand(rhs)));
+    return *this;
+  }
+
+  RAJA_SUPPRESS_HD_WARN
+  template<typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE auto eval(TILE_TYPE const& tile) const
+      -> decltype(tensor_type::s_load_ref(merge_ref_tile(m_ref, tile)))
+  {
+    return tensor_type::s_load_ref(merge_ref_tile(m_ref, tile));
+  }
+
+  RAJA_SUPPRESS_HD_WARN
+  template<typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE auto eval_lhs(TILE_TYPE const& tile) const
+      -> decltype(TENSOR_TYPE::create_et_store_ref(merge_ref_tile(this->m_ref,
+                                                                  tile)))
+  {
+    return TENSOR_TYPE::create_et_store_ref(merge_ref_tile(m_ref, tile));
+  }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  constexpr index_type getDimSize(index_type dim) const
+  {
+    return m_ref.m_tile.m_size[dim];
+  }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  void print_ast() const { printf("Load()"); }
+
+private:
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  tile_type const& getTile() const { return m_ref.m_tile; }
+
+  template<typename RHS>
+  RAJA_INLINE RAJA_HOST_DEVICE void store(RHS const& rhs)
+  {
+#ifdef RAJA_DEBUG_PRINT_ET_AST
+    printf("Store(");
+    rhs.print_ast();
+    printf(")\n");
+#endif
 
-    };
+    tensorTileExec<tensor_type>(
+        m_ref.m_tile, makeTensorStoreFunctor<tensor_type>(*this, rhs));
+  }
+};
 
 
-  } // namespace ET
+}  // namespace ET
 
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/TensorMultiply.hpp b/include/RAJA/pattern/tensor/internal/ET/TensorMultiply.hpp
index 3e3429588f..d8b80d0853 100644
--- a/include/RAJA/pattern/tensor/internal/ET/TensorMultiply.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/TensorMultiply.hpp
@@ -25,7 +25,6 @@
 #include "RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp"
 #include "RAJA/pattern/tensor/internal/ET/MultiplyOperator.hpp"
 
-
 namespace RAJA
 {
 namespace internal
@@ -33,127 +32,136 @@ namespace internal
 namespace expt
 {
 
-  namespace ET
+namespace ET
+{
+
+// forward decl for FMA contraction
+template<typename LEFT_OPERAND_TYPE,
+         typename RIGHT_OPERAND_TYPE,
+         typename ADD_TYPE>
+class TensorMultiplyAdd;
+
+template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
+class TensorMultiply
+    : public TensorExpressionBase<
+          TensorMultiply<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE>>
+{
+public:
+  using self_type = TensorMultiply<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE>;
+  using left_operand_type  = LEFT_OPERAND_TYPE;
+  using right_operand_type = RIGHT_OPERAND_TYPE;
+  using multiply_op = MultiplyOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE>;
+
+  using element_type = typename LEFT_OPERAND_TYPE::element_type;
+  using index_type   = typename LEFT_OPERAND_TYPE::index_type;
+
+  using result_type                       = typename multiply_op::result_type;
+  static constexpr camp::idx_t s_num_dims = multiply_op::s_num_dims;
+
+private:
+  left_operand_type m_left_operand;
+  right_operand_type m_right_operand;
+
+public:
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  TensorMultiply(left_operand_type const& left_operand,
+                 right_operand_type const& right_operand)
+      : m_left_operand {left_operand},
+        m_right_operand {right_operand}
+  {}
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  constexpr int getDimSize(int dim) const
+  {
+    return multiply_op::getDimSize(dim, m_left_operand, m_right_operand);
+  }
+
+  template<typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE auto eval(TILE_TYPE const& tile) const
+      -> decltype(multiply_op::multiply(tile, m_left_operand, m_right_operand))
   {
+    return multiply_op::multiply(tile, m_left_operand, m_right_operand);
+  }
+
+  /*!
+   * Returns the LHS of the operation, used to form contractions
+   */
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  constexpr left_operand_type const& getLeftOperand() const
+  {
+    return m_left_operand;
+  }
+
+  /*!
+   * Returns the RHS of the operation, used to form contractions
+   */
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  constexpr right_operand_type const& getRightOperand() const
+  {
+    return m_right_operand;
+  }
+
+  /*!
+   * operator+ overload that forms a FMA contraction
+   */
+  RAJA_SUPPRESS_HD_WARN
+  template<typename ADD>
+  RAJA_INLINE RAJA_HOST_DEVICE TensorMultiplyAdd<left_operand_type,
+                                                 right_operand_type,
+                                                 normalize_operand_t<ADD>>
+  operator+(ADD const& add) const
+  {
+    return TensorMultiplyAdd<left_operand_type, right_operand_type,
+                             normalize_operand_t<ADD>>(
+        m_left_operand, m_right_operand, normalizeOperand(add));
+  }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  void print_ast() const
+  {
+    printf("Multiply[");
+    multiply_op::print_ast();
+    printf("](");
+    m_left_operand.print_ast();
+    printf(", ");
+    m_right_operand.print_ast();
+    printf(")");
+  }
+};
+
+/*
+ * Overload for:    arithmetic * tensorexpression
+
+ */
+template<
+    typename LHS,
+    typename RHS,
+    typename std::enable_if<std::is_arithmetic<LHS>::value, bool>::type = true,
+    typename std::enable_if<
+        std::is_base_of<TensorExpressionConcreteBase, RHS>::value,
+        bool>::type = true>
+RAJA_INLINE RAJA_HOST_DEVICE auto operator*(LHS const& left_operand,
+                                            RHS const& right_operand)
+    -> TensorMultiply<typename NormalizeOperandHelper<LHS>::return_type, RHS>
+{
+  return TensorMultiply<typename NormalizeOperandHelper<LHS>::return_type, RHS>(
+      NormalizeOperandHelper<LHS>::normalize(left_operand), right_operand);
+}
+
+}  // namespace ET
 
-    // forward decl for FMA contraction
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE, typename ADD_TYPE>
-    class TensorMultiplyAdd;
-
-
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE>
-    class TensorMultiply : public TensorExpressionBase<TensorMultiply<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE>> {
-      public:
-        using self_type = TensorMultiply<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE>;
-        using left_operand_type = LEFT_OPERAND_TYPE;
-        using right_operand_type = RIGHT_OPERAND_TYPE;
-        using multiply_op = MultiplyOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE>;
-
-        using element_type = typename LEFT_OPERAND_TYPE::element_type;
-        using index_type = typename LEFT_OPERAND_TYPE::index_type;
-
-        using result_type = typename multiply_op::result_type;
-        static constexpr camp::idx_t s_num_dims = multiply_op::s_num_dims;
-
-      private:
-        left_operand_type m_left_operand;
-        right_operand_type m_right_operand;
-
-      public:
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorMultiply(left_operand_type const &left_operand, right_operand_type const &right_operand) :
-        m_left_operand{left_operand}, m_right_operand{right_operand}
-        {}
-
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        int getDimSize(int dim) const {
-          return multiply_op::getDimSize(dim, m_left_operand, m_right_operand);
-        }
-
-
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        auto eval(TILE_TYPE const &tile) const ->
-          decltype(multiply_op::multiply(tile, m_left_operand, m_right_operand))
-        {
-          return multiply_op::multiply(tile, m_left_operand, m_right_operand);
-        }
-
-        /*!
-         * Returns the LHS of the operation, used to form contractions
-         */
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        left_operand_type const &getLeftOperand() const {
-          return m_left_operand;
-        }
-
-        /*!
-         * Returns the RHS of the operation, used to form contractions
-         */
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        right_operand_type const &getRightOperand() const {
-          return m_right_operand;
-        }
-
-
-        /*!
-         * operator+ overload that forms a FMA contraction
-         */
-        RAJA_SUPPRESS_HD_WARN
-        template<typename ADD>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorMultiplyAdd<left_operand_type, right_operand_type, normalize_operand_t<ADD>>
-        operator+(ADD const &add) const {
-          return TensorMultiplyAdd<left_operand_type, right_operand_type, normalize_operand_t<ADD>>(m_left_operand, m_right_operand, normalizeOperand(add));
-        }
-
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void print_ast() const {
-          printf("Multiply[");
-          multiply_op::print_ast();
-          printf("](");
-          m_left_operand.print_ast();
-          printf(", ");
-          m_right_operand.print_ast();
-          printf(")");
-        }
-
-
-    };
-
-
-    /*
-     * Overload for:    arithmetic * tensorexpression
-
-     */
-    template<typename LHS, typename RHS,
-      typename std::enable_if<std::is_arithmetic<LHS>::value, bool>::type = true,
-      typename std::enable_if<std::is_base_of<TensorExpressionConcreteBase, RHS>::value, bool>::type = true>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    auto operator*(LHS const &left_operand, RHS const &right_operand) ->
-    TensorMultiply<typename NormalizeOperandHelper<LHS>::return_type, RHS>
-    {
-      return TensorMultiply<typename NormalizeOperandHelper<LHS>::return_type, RHS>(NormalizeOperandHelper<LHS>::normalize(left_operand), right_operand);
-    }
-
-  } // namespace ET
-
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/TensorMultiplyAdd.hpp b/include/RAJA/pattern/tensor/internal/ET/TensorMultiplyAdd.hpp
index 44f27e92c7..b1073ee8e3 100644
--- a/include/RAJA/pattern/tensor/internal/ET/TensorMultiplyAdd.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/TensorMultiplyAdd.hpp
@@ -33,81 +33,90 @@ namespace expt
 {
 
 
-  namespace ET
-  {
-
-
-    /*!
-     * Expression for LHS*RHS+ADD, which allows for accessing FMA style
-     * operations.
-     *
-     * This ET can only be generated by contracting an Add and Multiple ET.
-     *
-     */
-    template<typename LEFT_OPERAND_TYPE, typename RIGHT_OPERAND_TYPE, typename ADD_OPERAND_TYPE>
-    class TensorMultiplyAdd : public TensorExpressionBase<TensorMultiplyAdd<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE, ADD_OPERAND_TYPE>> {
-      public:
-        using self_type = TensorMultiplyAdd<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE, ADD_OPERAND_TYPE>;
-        using left_operand_type = LEFT_OPERAND_TYPE;
-        using right_operand_type = RIGHT_OPERAND_TYPE;
-        using add_operand_type = ADD_OPERAND_TYPE;
-        using multiply_op = MultiplyOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE>;
-
-        using element_type = typename LEFT_OPERAND_TYPE::element_type;
-        using index_type = typename LEFT_OPERAND_TYPE::index_type;
-
-        using result_type = typename multiply_op::result_type;
-        static constexpr camp::idx_t s_num_dims = multiply_op::s_num_dims;
-
-      private:
-        left_operand_type m_left_operand;
-        right_operand_type m_right_operand;
-        add_operand_type m_add_operand;
-
-      public:
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorMultiplyAdd(left_operand_type const &left_operand, right_operand_type const &right_operand,
-                          add_operand_type const &add_operand) :
-        m_left_operand{left_operand}, m_right_operand{right_operand}, m_add_operand{add_operand}
-        {}
-
-
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        auto eval(TILE_TYPE const &tile) const ->
-          decltype(multiply_op::multiply_add(tile, m_left_operand, m_right_operand, m_add_operand))
-        {
-          return multiply_op::multiply_add(tile, m_left_operand, m_right_operand, m_add_operand);
-        }
-
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void print_ast() const {
-          printf("MultiplyAdd[");
-          multiply_op::print_ast();
-          printf("](");
-          m_left_operand.print_ast();
-          printf(", ");
-          m_right_operand.print_ast();
-          printf(", ");
-          m_add_operand.print_ast();
-          printf(")");
-        }
-
-
-
-    };
-
+namespace ET
+{
 
 
+/*!
+ * Expression for LHS*RHS+ADD, which allows for accessing FMA style
+ * operations.
+ *
+ * This ET can only be generated by contracting an Add and Multiply ET.
+ *
+ */
+template<typename LEFT_OPERAND_TYPE,
+         typename RIGHT_OPERAND_TYPE,
+         typename ADD_OPERAND_TYPE>
+class TensorMultiplyAdd
+    : public TensorExpressionBase<TensorMultiplyAdd<LEFT_OPERAND_TYPE,
+                                                    RIGHT_OPERAND_TYPE,
+                                                    ADD_OPERAND_TYPE>>
+{
+public:
+  using self_type          = TensorMultiplyAdd<LEFT_OPERAND_TYPE,
+                                      RIGHT_OPERAND_TYPE,
+                                      ADD_OPERAND_TYPE>;
+  using left_operand_type  = LEFT_OPERAND_TYPE;
+  using right_operand_type = RIGHT_OPERAND_TYPE;
+  using add_operand_type   = ADD_OPERAND_TYPE;
+  using multiply_op = MultiplyOperator<LEFT_OPERAND_TYPE, RIGHT_OPERAND_TYPE>;
+
+  using element_type = typename LEFT_OPERAND_TYPE::element_type;
+  using index_type   = typename LEFT_OPERAND_TYPE::index_type;
+
+  using result_type                       = typename multiply_op::result_type;
+  static constexpr camp::idx_t s_num_dims = multiply_op::s_num_dims;
+
+private:
+  left_operand_type m_left_operand;
+  right_operand_type m_right_operand;
+  add_operand_type m_add_operand;
+
+public:
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  TensorMultiplyAdd(left_operand_type const& left_operand,
+                    right_operand_type const& right_operand,
+                    add_operand_type const& add_operand)
+      : m_left_operand {left_operand},
+        m_right_operand {right_operand},
+        m_add_operand {add_operand}
+  {}
+
+  template<typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE auto eval(TILE_TYPE const& tile) const
+      -> decltype(multiply_op::multiply_add(tile,
+                                            m_left_operand,
+                                            m_right_operand,
+                                            m_add_operand))
+  {
+    return multiply_op::multiply_add(tile, m_left_operand, m_right_operand,
+                                     m_add_operand);
+  }
 
-  } // namespace ET
+  RAJA_INLINE
 
-  } // namespace internal
-} // namespace expt
+  RAJA_HOST_DEVICE
+  void print_ast() const
+  {
+    printf("MultiplyAdd[");
+    multiply_op::print_ast();
+    printf("](");
+    m_left_operand.print_ast();
+    printf(", ");
+    m_right_operand.print_ast();
+    printf(", ");
+    m_add_operand.print_ast();
+    printf(")");
+  }
+};
+
+
+}  // namespace ET
+
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/TensorNegate.hpp b/include/RAJA/pattern/tensor/internal/ET/TensorNegate.hpp
index d5211e4963..db0594b3e5 100644
--- a/include/RAJA/pattern/tensor/internal/ET/TensorNegate.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/TensorNegate.hpp
@@ -24,7 +24,6 @@
 
 #include "RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp"
 
-
 namespace RAJA
 {
 namespace internal
@@ -33,61 +32,61 @@ namespace expt
 {
 
 
-  namespace ET
+namespace ET
+{
+
+template<typename ET_TYPE>
+class TensorNegate : public TensorExpressionBase<TensorNegate<ET_TYPE>>
+{
+public:
+  using self_type    = TensorNegate<ET_TYPE>;
+  using rhs_type     = ET_TYPE;
+  using tensor_type  = typename ET_TYPE::result_type;
+  using element_type = typename tensor_type::element_type;
+  using index_type   = typename ET_TYPE::index_type;
+
+  using result_type                       = tensor_type;
+  using tile_type                         = typename ET_TYPE::tile_type;
+  static constexpr camp::idx_t s_num_dims = ET_TYPE::s_num_dims;
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  TensorNegate(rhs_type const& tensor) : m_tensor {tensor} {}
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  constexpr index_type getDimSize(index_type dim) const
   {
+    return m_tensor.getDimSize(dim);
+  }
+
+  template<typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE result_type eval(TILE_TYPE const& tile) const
+  {
+    return m_tensor.eval(tile).scale(-1);
+  }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  void print_ast() const
+  {
+    printf("Negate(");
+    m_tensor.print_ast();
+    printf(")");
+  }
+
+private:
+  rhs_type m_tensor;
+};
+
+
+}  // namespace ET
 
-    template<typename ET_TYPE>
-    class TensorNegate :  public TensorExpressionBase<TensorNegate<ET_TYPE>> {
-      public:
-        using self_type = TensorNegate<ET_TYPE>;
-        using rhs_type = ET_TYPE;
-        using tensor_type = typename ET_TYPE::result_type;
-        using element_type = typename tensor_type::element_type;
-        using index_type = typename ET_TYPE::index_type;
-
-        using result_type = tensor_type;
-        using tile_type = typename ET_TYPE::tile_type;
-        static constexpr camp::idx_t s_num_dims = ET_TYPE::s_num_dims;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorNegate(rhs_type const &tensor) :
-        m_tensor{tensor}
-        {}
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        index_type getDimSize(index_type dim) const {
-          return m_tensor.getDimSize(dim);
-        }
-
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        result_type eval(TILE_TYPE const &tile) const
-        {
-          return m_tensor.eval(tile).scale(-1);
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void print_ast() const {
-          printf("Negate(");
-          m_tensor.print_ast();
-          printf(")");
-        }
-
-      private:
-        rhs_type m_tensor;
-    };
-
-
-
-  } // namespace ET
-
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/TensorScalarLiteral.hpp b/include/RAJA/pattern/tensor/internal/ET/TensorScalarLiteral.hpp
index 4ab0a3ebc6..0f274ab811 100644
--- a/include/RAJA/pattern/tensor/internal/ET/TensorScalarLiteral.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/TensorScalarLiteral.hpp
@@ -24,7 +24,6 @@
 
 #include "RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp"
 
-
 namespace RAJA
 {
 namespace internal
@@ -33,78 +32,73 @@ namespace expt
 {
 
 
-  namespace ET
+namespace ET
+{
+
+
+template<typename T>
+class TensorScalarLiteral : public TensorExpressionBase<TensorScalarLiteral<T>>
+{
+public:
+  using self_type    = TensorScalarLiteral<T>;
+  using tensor_type  = RAJA::expt::ScalarRegister<T>;
+  using element_type = T;
+  using result_type  = T;
+  using index_type   = RAJA::Index_type;
+
+  static constexpr camp::idx_t s_num_dims = 0;
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  constexpr index_type getDimSize(index_type) const { return 0; }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  explicit constexpr TensorScalarLiteral(element_type const& value) noexcept
+      : m_value {value}
+  {}
+
+  template<typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE element_type eval(TILE_TYPE const&) const
   {
+    return m_value;
+  }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  void print_ast() const { printf("ScalarLiteral(%e)", (double)m_value); }
 
+private:
+  element_type m_value;
+};
 
-    template<typename T>
-    class TensorScalarLiteral :  public TensorExpressionBase<TensorScalarLiteral<T>> {
-      public:
-        using self_type = TensorScalarLiteral<T>;
-        using tensor_type = RAJA::expt::ScalarRegister<T>;
-        using element_type = T;
-        using result_type = T;
-        using index_type = RAJA::Index_type;
-
-        static constexpr camp::idx_t s_num_dims = 0;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        index_type getDimSize(index_type ) const {
-          return 0;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        explicit
-        constexpr
-        TensorScalarLiteral(element_type const &value) noexcept :
-        m_value{value}
-        {}
-
-
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        element_type eval(TILE_TYPE const &) const {
-          return m_value;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void print_ast() const {
-          printf("ScalarLiteral(%e)", (double)m_value);
-        }
-
-      private:
-        element_type m_value;
-    };
-
-
-    /*
-     * For arithmetic values, we need to wrap in a constant value ET node
-     */
-    template<typename RHS>
-    struct NormalizeOperandHelper<RHS,
+/*
+ * For arithmetic values, we need to wrap in a constant value ET node
+ */
+template<typename RHS>
+struct NormalizeOperandHelper<
+    RHS,
     typename std::enable_if<std::is_arithmetic<RHS>::value>::type>
-    {
-        using return_type = TensorScalarLiteral<RHS>;
+{
+  using return_type = TensorScalarLiteral<RHS>;
 
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        return_type normalize(RHS const &rhs){
-          return return_type(rhs);
-        }
-    };
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static constexpr return_type normalize(RHS const& rhs)
+  {
+    return return_type(rhs);
+  }
+};
 
 
-  } // namespace ET
+}  // namespace ET
 
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/TensorTranspose.hpp b/include/RAJA/pattern/tensor/internal/ET/TensorTranspose.hpp
index 46950eec6f..9ba01af1a9 100644
--- a/include/RAJA/pattern/tensor/internal/ET/TensorTranspose.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/TensorTranspose.hpp
@@ -24,7 +24,6 @@
 
 #include "RAJA/pattern/tensor/internal/ET/ExpressionTemplateBase.hpp"
 
-
 namespace RAJA
 {
 namespace internal
@@ -33,67 +32,66 @@ namespace expt
 {
 
 
-  namespace ET
+namespace ET
+{
+
+template<typename ET_TYPE>
+class TensorTranspose : public TensorExpressionBase<TensorTranspose<ET_TYPE>>
+{
+public:
+  using self_type    = TensorTranspose<ET_TYPE>;
+  using rhs_type     = ET_TYPE;
+  using tensor_type  = typename ET_TYPE::result_type;
+  using element_type = typename tensor_type::element_type;
+  using index_type   = typename ET_TYPE::index_type;
+
+  using result_type                       = tensor_type;
+  using tile_type                         = typename ET_TYPE::tile_type;
+  static constexpr camp::idx_t s_num_dims = ET_TYPE::s_num_dims;
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  TensorTranspose(rhs_type const& tensor) : m_tensor {tensor} {}
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  constexpr index_type getDimSize(index_type dim) const
   {
+    return m_tensor.getDimSize(dim);
+  }
+
+  template<typename TILE_TYPE>
+  RAJA_INLINE RAJA_HOST_DEVICE result_type eval(TILE_TYPE const& tile) const
+  {
+    // transpose which tile we are returning
+    TILE_TYPE trans_tile {{tile.m_begin[1], tile.m_begin[0]},
+                          {tile.m_size[1], tile.m_size[0]}};
+
+    // evaluate and return the transposed tile
+    return m_tensor.eval(trans_tile).transpose();
+  }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  void print_ast() const
+  {
+    printf("Transpose(");
+    m_tensor.print_ast();
+    printf(")");
+  }
+
+private:
+  rhs_type m_tensor;
+};
+
+
+}  // namespace ET
 
-    template<typename ET_TYPE>
-    class TensorTranspose :  public TensorExpressionBase<TensorTranspose<ET_TYPE>> {
-      public:
-        using self_type = TensorTranspose<ET_TYPE>;
-        using rhs_type = ET_TYPE;
-        using tensor_type = typename ET_TYPE::result_type;
-        using element_type = typename tensor_type::element_type;
-        using index_type = typename ET_TYPE::index_type;
-
-        using result_type = tensor_type;
-        using tile_type = typename ET_TYPE::tile_type;
-        static constexpr camp::idx_t s_num_dims = ET_TYPE::s_num_dims;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        TensorTranspose(rhs_type const &tensor) :
-        m_tensor{tensor}
-        {}
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        constexpr
-        index_type getDimSize(index_type dim) const {
-          return m_tensor.getDimSize(dim);
-        }
-
-        template<typename TILE_TYPE>
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        result_type eval(TILE_TYPE const &tile) const {
-          // transpose which tile we are returning
-          TILE_TYPE trans_tile{
-            {tile.m_begin[1], tile.m_begin[0]},
-            {tile.m_size[1],  tile.m_size[0]}
-          };
-
-          // evaluate and return the transposed tile
-          return m_tensor.eval(trans_tile).transpose();
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        void print_ast() const {
-          printf("Transpose(");
-          m_tensor.print_ast();
-          printf(")");
-        }
-
-      private:
-        rhs_type m_tensor;
-    };
-
-
-
-  } // namespace ET
-
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ET/normalizeOperand.hpp b/include/RAJA/pattern/tensor/internal/ET/normalizeOperand.hpp
index 2a868a3131..0f492c6c43 100644
--- a/include/RAJA/pattern/tensor/internal/ET/normalizeOperand.hpp
+++ b/include/RAJA/pattern/tensor/internal/ET/normalizeOperand.hpp
@@ -24,7 +24,6 @@
 
 #include "RAJA/pattern/tensor/internal/TensorRef.hpp"
 
-
 namespace RAJA
 {
 namespace internal
@@ -33,64 +32,56 @@ namespace expt
 {
 
 
-    class TensorRegisterConcreteBase;
-
-  namespace ET
-  {
-    class TensorExpressionConcreteBase;
-
-    template<typename RHS, typename enable = void>
-    struct NormalizeOperandHelper;
+class TensorRegisterConcreteBase;
 
+namespace ET
+{
+class TensorExpressionConcreteBase;
 
-    /*
-     * For TensorExpression nodes, we just return them as-is.
-     */
-    template<typename RHS>
-    struct NormalizeOperandHelper<RHS,
-    typename std::enable_if<std::is_base_of<TensorExpressionConcreteBase, RHS>::value>::type>
-    {
-        using return_type = RHS;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        return_type normalize(RHS const &rhs){
-          return rhs;
-        }
-    };
+template<typename RHS, typename enable = void>
+struct NormalizeOperandHelper;
 
+/*
+ * For TensorExpression nodes, we just return them as-is.
+ */
+template<typename RHS>
+struct NormalizeOperandHelper<
+    RHS,
+    typename std::enable_if<
+        std::is_base_of<TensorExpressionConcreteBase, RHS>::value>::type>
+{
+  using return_type = RHS;
 
+  RAJA_INLINE
 
+  RAJA_HOST_DEVICE
+  static constexpr return_type normalize(RHS const& rhs) { return rhs; }
+};
 
-    /**
-     * Allows uniform packaging up of operands into ExpressionTemplates.
-     *
-     * The NormalizeOperandHelper is specialized throughout the code in order
-     * to convert non-ET operands into ET objects
-     *
-     * ET operators can then take any operand type, and use this to convert
-     * them into ET types the same way.
-     */
-    template<typename RHS>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    auto normalizeOperand(RHS const &rhs) ->
+/**
+ * Allows uniform packaging up of operands into ExpressionTemplates.
+ *
+ * The NormalizeOperandHelper is specialized throughout the code in order
+ * to convert non-ET operands into ET objects
+ *
+ * ET operators can then take any operand type, and use this to convert
+ * them into ET types the same way.
+ */
+template<typename RHS>
+RAJA_INLINE RAJA_HOST_DEVICE auto normalizeOperand(RHS const& rhs) ->
     typename NormalizeOperandHelper<RHS>::return_type
-    {
-      return NormalizeOperandHelper<RHS>::normalize(rhs);
-    }
+{
+  return NormalizeOperandHelper<RHS>::normalize(rhs);
+}
 
-    template<typename RHS>
-    using normalize_operand_t =
-        typename NormalizeOperandHelper<RHS>::return_type;
+template<typename RHS>
+using normalize_operand_t = typename NormalizeOperandHelper<RHS>::return_type;
 
 
-  } // namespace ET
+}  // namespace ET
 
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/ExpressionTemplate.hpp b/include/RAJA/pattern/tensor/internal/ExpressionTemplate.hpp
index 2b6bf7304d..a94ec924db 100644
--- a/include/RAJA/pattern/tensor/internal/ExpressionTemplate.hpp
+++ b/include/RAJA/pattern/tensor/internal/ExpressionTemplate.hpp
@@ -31,5 +31,4 @@
 #include "RAJA/pattern/tensor/internal/ET/TensorTranspose.hpp"
 
 
-
 #endif
diff --git a/include/RAJA/pattern/tensor/internal/MatrixMatrixMultiply.hpp b/include/RAJA/pattern/tensor/internal/MatrixMatrixMultiply.hpp
index 08a9886acc..4c54b89954 100644
--- a/include/RAJA/pattern/tensor/internal/MatrixMatrixMultiply.hpp
+++ b/include/RAJA/pattern/tensor/internal/MatrixMatrixMultiply.hpp
@@ -22,7 +22,6 @@
 #include "RAJA/config.hpp"
 #include "RAJA/pattern/tensor/MatrixRegister.hpp"
 
-
 namespace RAJA
 {
 namespace internal
@@ -31,308 +30,323 @@ namespace expt
 {
 
 
+template<typename MATA, typename MATB>
+struct MatrixMatrixMultiplyHelper;
 
+/**
+ *
+ * Row-Major * Row-Major ==> Row-Major
+ *
+ */
+template<typename T,
+         typename REGISTER_POLICY,
+         camp::idx_t N_SIZE,
+         camp::idx_t M_SIZE,
+         camp::idx_t M2_SIZE,
+         camp::idx_t O_SIZE>
+struct MatrixMatrixMultiplyHelper<
+    RAJA::expt::TensorRegister<REGISTER_POLICY,
+                               T,
+                               RAJA::expt::RowMajorLayout,
+                               camp::idx_seq<N_SIZE, M_SIZE>>,
+    RAJA::expt::TensorRegister<REGISTER_POLICY,
+                               T,
+                               RAJA::expt::RowMajorLayout,
+                               camp::idx_seq<M2_SIZE, O_SIZE>>>
+{
 
-
-
-
-  template<typename MATA, typename MATB>
-  struct MatrixMatrixMultiplyHelper;
-
-
-
-  /**
-   *
-   * Row-Major * Row-Major ==> Row-Major
+  static_assert(M_SIZE == M2_SIZE,
+                "Matrices are not compatible for multiplication");
+
+  using left_type = RAJA::expt::TensorRegister<REGISTER_POLICY,
+                                               T,
+                                               RAJA::expt::RowMajorLayout,
+                                               camp::idx_seq<N_SIZE, M_SIZE>>;
+
+  using right_type = RAJA::expt::TensorRegister<REGISTER_POLICY,
+                                                T,
+                                                RAJA::expt::RowMajorLayout,
+                                                camp::idx_seq<M_SIZE, O_SIZE>>;
+
+  using result_type = RAJA::expt::TensorRegister<REGISTER_POLICY,
+                                                 T,
+                                                 RAJA::expt::RowMajorLayout,
+                                                 camp::idx_seq<N_SIZE, O_SIZE>>;
+
+  using register_type = typename result_type::register_type;
+
+  static constexpr camp::idx_t s_elements_per_register =
+      left_type::s_elements_per_register;
+  static constexpr camp::idx_t s_A_minor_dim_registers =
+      left_type::s_minor_dim_registers;
+  static constexpr camp::idx_t s_B_minor_dim_registers =
+      right_type::s_minor_dim_registers;
+  static constexpr camp::idx_t s_C_minor_dim_registers =
+      result_type::s_minor_dim_registers;
+
+  /*
+   * Matrix B (and C) have one or more registers per row
    *
    */
-  template<typename T, typename REGISTER_POLICY, camp::idx_t N_SIZE, camp::idx_t M_SIZE, camp::idx_t M2_SIZE, camp::idx_t O_SIZE>
-  struct MatrixMatrixMultiplyHelper<
-  RAJA::expt::TensorRegister<REGISTER_POLICY,
-                   T,
-                   RAJA::expt::RowMajorLayout,
-                   camp::idx_seq<N_SIZE, M_SIZE>>,
-                   RAJA::expt::TensorRegister<REGISTER_POLICY,
-                    T,
-                    RAJA::expt::RowMajorLayout,
-                    camp::idx_seq<M2_SIZE, O_SIZE>> >
-    {
-
-      static_assert(M_SIZE == M2_SIZE, "Matrices are not compatible for multiplication");
-
-      using left_type = RAJA::expt::TensorRegister<REGISTER_POLICY,
-                                       T,
-                                       RAJA::expt::RowMajorLayout,
-                                       camp::idx_seq<N_SIZE, M_SIZE>>;
-
-      using right_type = RAJA::expt::TensorRegister<REGISTER_POLICY,
-                                        T,
-                                        RAJA::expt::RowMajorLayout,
-                                        camp::idx_seq<M_SIZE, O_SIZE>> ;
-
-      using result_type = RAJA::expt::TensorRegister<REGISTER_POLICY,
-                                         T,
-                                         RAJA::expt::RowMajorLayout,
-                                         camp::idx_seq<N_SIZE, O_SIZE>> ;
-
-      using register_type = typename result_type::register_type;
-
-      static constexpr camp::idx_t s_elements_per_register = left_type::s_elements_per_register;
-      static constexpr camp::idx_t s_A_minor_dim_registers = left_type::s_minor_dim_registers;
-      static constexpr camp::idx_t s_B_minor_dim_registers = right_type::s_minor_dim_registers;
-      static constexpr camp::idx_t s_C_minor_dim_registers = result_type::s_minor_dim_registers;
-
-      /*
-       * Matrix B (and C) has 1 more more registers per row
-       *
-       */
-      template<typename dummy = void>
-      RAJA_HOST_DEVICE
-      static
-      RAJA_INLINE
+  template<typename dummy = void>
+  RAJA_HOST_DEVICE static RAJA_INLINE
       typename std::enable_if<(s_C_minor_dim_registers != 0), dummy>::type
-      multiply_accumulate(left_type const &A, right_type const &B, result_type &C)
-      {
+      multiply_accumulate(left_type const& A,
+                          right_type const& B,
+                          result_type& C)
+  {
 #if defined(RAJA_ENABLE_VECTOR_STATS) && !defined(__CUDA_ARCH__)
-        RAJA::tensor_stats::num_matrix_mm_multacc_row_row ++;
+    RAJA::tensor_stats::num_matrix_mm_multacc_row_row++;
 #endif
 
-        constexpr camp::idx_t num_bc_reg_per_row = s_C_minor_dim_registers;
-
-        RAJA_UNROLL
-        for(camp::idx_t c_reg = 0;c_reg < result_type::s_num_registers;++ c_reg){
-          camp::idx_t bc_col_reg = c_reg % num_bc_reg_per_row;
-          camp::idx_t ac_row = c_reg / num_bc_reg_per_row;
-
-          RAJA_UNROLL
-          for(camp::idx_t a_col = 0;a_col < M_SIZE;++ a_col){
-            camp::idx_t b_reg = a_col * num_bc_reg_per_row + bc_col_reg;
-
-            C.get_register(c_reg) =
-                register_type(A.get(ac_row, a_col)).multiply_add(
-                    B.get_register(b_reg),
-                    C.get_register(c_reg));
-          }
-        }
+    constexpr camp::idx_t num_bc_reg_per_row = s_C_minor_dim_registers;
 
-      }
+    RAJA_UNROLL
+    for (camp::idx_t c_reg = 0; c_reg < result_type::s_num_registers; ++c_reg)
+    {
+      camp::idx_t bc_col_reg = c_reg % num_bc_reg_per_row;
+      camp::idx_t ac_row     = c_reg / num_bc_reg_per_row;
 
-      /*
-       * Matrix B (and C) have less than one register per row
-       *
-       */
-      template<typename dummy = void>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      typename std::enable_if<(s_C_minor_dim_registers == 0), dummy>::type
-      multiply_accumulate(left_type const &A, right_type const &B, result_type &C)
+      RAJA_UNROLL
+      for (camp::idx_t a_col = 0; a_col < M_SIZE; ++a_col)
       {
-        constexpr camp::idx_t bc_segbits = result_type::s_segbits;
-        constexpr camp::idx_t a_segments_per_register = 1<<bc_segbits;
-
-        RAJA_UNROLL
-        for(camp::idx_t ac_row = 0;ac_row < N_SIZE;++ ac_row){
-          camp::idx_t c_reg     = ac_row / result_type::s_major_dim_per_register;
-          camp::idx_t c_segment = ac_row % result_type::s_major_dim_per_register;
-          register_type c_tmp;
-
-          RAJA_UNROLL
-          for(camp::idx_t b_reg = 0;b_reg < right_type::s_num_registers;++ b_reg){
-
-            camp::idx_t a_segment = ac_row*right_type::s_num_registers + b_reg;
-            camp::idx_t a_reg = a_segment / a_segments_per_register;
-            camp::idx_t a_reg_segment = a_segment % a_segments_per_register;
-
-            auto a_tmp = A.get_register(a_reg).segmented_broadcast_outer(bc_segbits, a_reg_segment);
-
-            if(b_reg == 0){
+        camp::idx_t b_reg = a_col * num_bc_reg_per_row + bc_col_reg;
 
-              c_tmp = a_tmp.multiply(B.get_register(b_reg));
-            }
-            else{
-              c_tmp = a_tmp.multiply_add(B.get_register(b_reg), c_tmp);
-            }
-
-          }
-
-          C.get_register(c_reg) += c_tmp.segmented_sum_outer(bc_segbits, c_segment);
-
-        }
-
-      }
-
-      RAJA_HOST_DEVICE
-      static
-      RAJA_INLINE
-      void multiply(left_type const &A, right_type const &B, result_type &C){
-        C = result_type(0);
-        multiply_accumulate(A, B, C);
+        C.get_register(c_reg) =
+            register_type(A.get(ac_row, a_col))
+                .multiply_add(B.get_register(b_reg), C.get_register(c_reg));
       }
-  };
+    }
+  }
 
-
-  /**
-   *
-   * Column-Major * Column-Major ==> Column-Major
+  /*
+   * Matrix B (and C) have less than one register per row
    *
    */
-  template<typename T, typename REGISTER_POLICY, camp::idx_t N_SIZE, camp::idx_t M_SIZE, camp::idx_t M2_SIZE, camp::idx_t O_SIZE>
-    struct MatrixMatrixMultiplyHelper<
-    RAJA::expt::TensorRegister<REGISTER_POLICY,
-                     T,
-                     RAJA::expt::ColMajorLayout,
-                     camp::idx_seq<N_SIZE, M_SIZE>>,
-                     RAJA::expt::TensorRegister<REGISTER_POLICY,
-                      T,
-                      RAJA::expt::ColMajorLayout,
-                      camp::idx_seq<M2_SIZE, O_SIZE>> >
-      {
-
-      using self_type = MatrixMatrixMultiplyHelper<
-          RAJA::expt::TensorRegister<REGISTER_POLICY,
-                         T,
-                         RAJA::expt::ColMajorLayout,
-                         camp::idx_seq<N_SIZE, M_SIZE>>,
-                         RAJA::expt::TensorRegister<REGISTER_POLICY,
-                          T,
-                          RAJA::expt::ColMajorLayout,
-                          camp::idx_seq<M2_SIZE, O_SIZE>> >;
-
-        static_assert(M_SIZE == M2_SIZE, "Matrices are not compatible for multiplication");
-
-        using left_type = RAJA::expt::TensorRegister<REGISTER_POLICY,
-                                         T,
-                                         RAJA::expt::ColMajorLayout,
-                                         camp::idx_seq<N_SIZE, M_SIZE>>;
-
-        using right_type = RAJA::expt::TensorRegister<REGISTER_POLICY,
-                                          T,
-                                          RAJA::expt::ColMajorLayout,
-                                          camp::idx_seq<M_SIZE, O_SIZE>> ;
-
-        using result_type = RAJA::expt::TensorRegister<REGISTER_POLICY,
-                                           T,
-                                           RAJA::expt::ColMajorLayout,
-                                           camp::idx_seq<N_SIZE, O_SIZE>> ;
-
-        using register_type = typename result_type::register_type;
-
-        static constexpr camp::idx_t s_elements_per_register = left_type::s_elements_per_register;
-        static constexpr camp::idx_t s_A_minor_dim_registers = left_type::s_minor_dim_registers;
-        static constexpr camp::idx_t s_B_minor_dim_registers = right_type::s_minor_dim_registers;
-        static constexpr camp::idx_t s_C_minor_dim_registers = result_type::s_minor_dim_registers;
-
-
-
-        /*
-         * Matrix A (and C) has 1 more more registers per column
-         *
-         */
-        template<typename dummy = void>
-        RAJA_HOST_DEVICE
-        static
-        RAJA_INLINE
-        typename std::enable_if<(s_C_minor_dim_registers != 0), dummy>::type
-        multiply_accumulate(left_type const &A, right_type const &B, result_type &C)
-        {
-
-  #if defined(RAJA_ENABLE_VECTOR_STATS) && !defined(__CUDA_ARCH__)
-          RAJA::tensor_stats::num_matrix_mm_multacc_row_row ++;
-  #endif
-
-
-          constexpr camp::idx_t num_ac_reg_per_col = s_C_minor_dim_registers;
+  template<typename dummy = void>
+  RAJA_HOST_DEVICE RAJA_INLINE static
+      typename std::enable_if<(s_C_minor_dim_registers == 0), dummy>::type
+      multiply_accumulate(left_type const& A,
+                          right_type const& B,
+                          result_type& C)
+  {
+    constexpr camp::idx_t bc_segbits              = result_type::s_segbits;
+    constexpr camp::idx_t a_segments_per_register = 1 << bc_segbits;
+
+    RAJA_UNROLL
+    for (camp::idx_t ac_row = 0; ac_row < N_SIZE; ++ac_row)
+    {
+      camp::idx_t c_reg     = ac_row / result_type::s_major_dim_per_register;
+      camp::idx_t c_segment = ac_row % result_type::s_major_dim_per_register;
+      register_type c_tmp;
 
-          RAJA_UNROLL
-          for(camp::idx_t c_reg = 0;c_reg < result_type::s_num_registers;++ c_reg){
-            camp::idx_t ac_row_reg = c_reg % num_ac_reg_per_col;
-            camp::idx_t bc_col = c_reg / num_ac_reg_per_col;
+      RAJA_UNROLL
+      for (camp::idx_t b_reg = 0; b_reg < right_type::s_num_registers; ++b_reg)
+      {
 
-            RAJA_UNROLL
-            for(camp::idx_t b_row = 0;b_row < M_SIZE;++ b_row){
-              camp::idx_t a_reg = b_row * num_ac_reg_per_col + ac_row_reg;
+        camp::idx_t a_segment = ac_row * right_type::s_num_registers + b_reg;
+        camp::idx_t a_reg     = a_segment / a_segments_per_register;
+        camp::idx_t a_reg_segment = a_segment % a_segments_per_register;
 
-              C.get_register(c_reg) =
-                  register_type(B.get(b_row, bc_col)).multiply_add(
-                      A.get_register(a_reg),
-                      C.get_register(c_reg));
-            }
-          }
+        auto a_tmp = A.get_register(a_reg).segmented_broadcast_outer(
+            bc_segbits, a_reg_segment);
 
+        if (b_reg == 0)
+        {
 
+          c_tmp = a_tmp.multiply(B.get_register(b_reg));
         }
-
-        /*
-         * Matrix A (and C) have less than one register per column
-         *
-         */
-        template<typename dummy = void>
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        static
-        typename std::enable_if<(s_C_minor_dim_registers == 0), dummy>::type
-        multiply_accumulate(left_type const &A, right_type const &B, result_type &C)
+        else
         {
-          constexpr camp::idx_t ac_segbits = result_type::s_segbits;
-          constexpr camp::idx_t b_segments_per_register = 1<<ac_segbits;
+          c_tmp = a_tmp.multiply_add(B.get_register(b_reg), c_tmp);
+        }
+      }
 
-          camp::idx_t bc_col = 0;
+      C.get_register(c_reg) += c_tmp.segmented_sum_outer(bc_segbits, c_segment);
+    }
+  }
+
+  RAJA_HOST_DEVICE
+  static RAJA_INLINE void multiply(left_type const& A,
+                                   right_type const& B,
+                                   result_type& C)
+  {
+    C = result_type(0);
+    multiply_accumulate(A, B, C);
+  }
+};
+
+/**
+ *
+ * Column-Major * Column-Major ==> Column-Major
+ *
+ */
+template<typename T,
+         typename REGISTER_POLICY,
+         camp::idx_t N_SIZE,
+         camp::idx_t M_SIZE,
+         camp::idx_t M2_SIZE,
+         camp::idx_t O_SIZE>
+struct MatrixMatrixMultiplyHelper<
+    RAJA::expt::TensorRegister<REGISTER_POLICY,
+                               T,
+                               RAJA::expt::ColMajorLayout,
+                               camp::idx_seq<N_SIZE, M_SIZE>>,
+    RAJA::expt::TensorRegister<REGISTER_POLICY,
+                               T,
+                               RAJA::expt::ColMajorLayout,
+                               camp::idx_seq<M2_SIZE, O_SIZE>>>
+{
 
-          RAJA_UNROLL
-          for(camp::idx_t c_reg = 0;c_reg < N_SIZE/result_type::s_major_dim_per_register;++ c_reg){
+  using self_type = MatrixMatrixMultiplyHelper<
+      RAJA::expt::TensorRegister<REGISTER_POLICY,
+                                 T,
+                                 RAJA::expt::ColMajorLayout,
+                                 camp::idx_seq<N_SIZE, M_SIZE>>,
+      RAJA::expt::TensorRegister<REGISTER_POLICY,
+                                 T,
+                                 RAJA::expt::ColMajorLayout,
+                                 camp::idx_seq<M2_SIZE, O_SIZE>>>;
+
+  static_assert(M_SIZE == M2_SIZE,
+                "Matrices are not compatible for multiplication");
+
+  using left_type = RAJA::expt::TensorRegister<REGISTER_POLICY,
+                                               T,
+                                               RAJA::expt::ColMajorLayout,
+                                               camp::idx_seq<N_SIZE, M_SIZE>>;
+
+  using right_type = RAJA::expt::TensorRegister<REGISTER_POLICY,
+                                                T,
+                                                RAJA::expt::ColMajorLayout,
+                                                camp::idx_seq<M_SIZE, O_SIZE>>;
+
+  using result_type = RAJA::expt::TensorRegister<REGISTER_POLICY,
+                                                 T,
+                                                 RAJA::expt::ColMajorLayout,
+                                                 camp::idx_seq<N_SIZE, O_SIZE>>;
+
+  using register_type = typename result_type::register_type;
+
+  static constexpr camp::idx_t s_elements_per_register =
+      left_type::s_elements_per_register;
+  static constexpr camp::idx_t s_A_minor_dim_registers =
+      left_type::s_minor_dim_registers;
+  static constexpr camp::idx_t s_B_minor_dim_registers =
+      right_type::s_minor_dim_registers;
+  static constexpr camp::idx_t s_C_minor_dim_registers =
+      result_type::s_minor_dim_registers;
+
+  /*
+   * Matrix A (and C) have one or more registers per column
+   *
+   */
+  template<typename dummy = void>
+  RAJA_HOST_DEVICE static RAJA_INLINE
+      typename std::enable_if<(s_C_minor_dim_registers != 0), dummy>::type
+      multiply_accumulate(left_type const& A,
+                          right_type const& B,
+                          result_type& C)
+  {
 
-            RAJA_UNROLL
-            for(camp::idx_t c_segment = 0;c_segment < result_type::s_major_dim_per_register;++ c_segment){
+#if defined(RAJA_ENABLE_VECTOR_STATS) && !defined(__CUDA_ARCH__)
+    RAJA::tensor_stats::num_matrix_mm_multacc_row_row++;
+#endif
 
-              register_type c_tmp;
 
-              RAJA_UNROLL
-              for(camp::idx_t a_reg = 0;a_reg < right_type::s_num_registers;++ a_reg){
+    constexpr camp::idx_t num_ac_reg_per_col = s_C_minor_dim_registers;
 
+    RAJA_UNROLL
+    for (camp::idx_t c_reg = 0; c_reg < result_type::s_num_registers; ++c_reg)
+    {
+      camp::idx_t ac_row_reg = c_reg % num_ac_reg_per_col;
+      camp::idx_t bc_col     = c_reg / num_ac_reg_per_col;
 
-                camp::idx_t b_segment = bc_col*right_type::s_num_registers + a_reg;
-                camp::idx_t b_reg = b_segment / b_segments_per_register;
-                camp::idx_t b_reg_segment = b_segment % b_segments_per_register;
+      RAJA_UNROLL
+      for (camp::idx_t b_row = 0; b_row < M_SIZE; ++b_row)
+      {
+        camp::idx_t a_reg = b_row * num_ac_reg_per_col + ac_row_reg;
 
-                register_type b_tmp = B.get_register(b_reg).segmented_broadcast_outer(ac_segbits, b_reg_segment);
+        C.get_register(c_reg) =
+            register_type(B.get(b_row, bc_col))
+                .multiply_add(A.get_register(a_reg), C.get_register(c_reg));
+      }
+    }
+  }
 
-                if(a_reg == 0){
-                  c_tmp = b_tmp.multiply(A.get_register(a_reg));
-                }
-                else{
-                  c_tmp = b_tmp.multiply_add(A.get_register(a_reg), c_tmp);
-                }
+  /*
+   * Matrix A (and C) have less than one register per column
+   *
+   */
+  template<typename dummy = void>
+  RAJA_HOST_DEVICE RAJA_INLINE static
+      typename std::enable_if<(s_C_minor_dim_registers == 0), dummy>::type
+      multiply_accumulate(left_type const& A,
+                          right_type const& B,
+                          result_type& C)
+  {
+    constexpr camp::idx_t ac_segbits              = result_type::s_segbits;
+    constexpr camp::idx_t b_segments_per_register = 1 << ac_segbits;
+
+    camp::idx_t bc_col = 0;
+
+    RAJA_UNROLL
+    for (camp::idx_t c_reg = 0;
+         c_reg < N_SIZE / result_type::s_major_dim_per_register; ++c_reg)
+    {
 
-              }
+      RAJA_UNROLL
+      for (camp::idx_t c_segment = 0;
+           c_segment < result_type::s_major_dim_per_register; ++c_segment)
+      {
 
-              C.get_register(c_reg) += c_tmp.segmented_sum_outer(ac_segbits, c_segment);
+        register_type c_tmp;
 
-              ++ bc_col;
-            } // c_segment
-          } // c_reg
+        RAJA_UNROLL
+        for (camp::idx_t a_reg = 0; a_reg < right_type::s_num_registers;
+             ++a_reg)
+        {
 
 
-        }
+          camp::idx_t b_segment = bc_col * right_type::s_num_registers + a_reg;
+          camp::idx_t b_reg     = b_segment / b_segments_per_register;
+          camp::idx_t b_reg_segment = b_segment % b_segments_per_register;
 
+          register_type b_tmp = B.get_register(b_reg).segmented_broadcast_outer(
+              ac_segbits, b_reg_segment);
 
-        RAJA_HOST_DEVICE
-        static
-        RAJA_INLINE
-        void multiply(left_type const &A, right_type const &B, result_type &C){
-          C = result_type(0);
-          self_type::multiply_accumulate(A, B, C);
+          if (a_reg == 0)
+          {
+            c_tmp = b_tmp.multiply(A.get_register(a_reg));
+          }
+          else
+          {
+            c_tmp = b_tmp.multiply_add(A.get_register(a_reg), c_tmp);
+          }
         }
-    };
-
 
+        C.get_register(c_reg) +=
+            c_tmp.segmented_sum_outer(ac_segbits, c_segment);
 
+        ++bc_col;
+      }  // c_segment
+    }    // c_reg
+  }
 
-} // namespace expt
-} // namespace internal
-} // namespace RAJA
+  RAJA_HOST_DEVICE
+  static RAJA_INLINE void multiply(left_type const& A,
+                                   right_type const& B,
+                                   result_type& C)
+  {
+    C = result_type(0);
+    self_type::multiply_accumulate(A, B, C);
+  }
+};
 
 
+}  // namespace expt
+}  // namespace internal
+}  // namespace RAJA
 
 
 #endif
diff --git a/include/RAJA/pattern/tensor/internal/MatrixRegisterImpl.hpp b/include/RAJA/pattern/tensor/internal/MatrixRegisterImpl.hpp
index 3036a096b5..ac1b194348 100644
--- a/include/RAJA/pattern/tensor/internal/MatrixRegisterImpl.hpp
+++ b/include/RAJA/pattern/tensor/internal/MatrixRegisterImpl.hpp
@@ -24,7 +24,7 @@
 #include "RAJA/pattern/tensor/internal/MatrixMatrixMultiply.hpp"
 #include "RAJA/util/BitMask.hpp"
 
-//#define DEBUG_MATRIX_LOAD_STORE
+// #define DEBUG_MATRIX_LOAD_STORE
 
 
 namespace RAJA
@@ -32,1121 +32,1353 @@ namespace RAJA
 namespace expt
 {
 
-  /*
-   * 2D (Matrix) specialization of TensorRegister
-   */
-  template<typename REGISTER_POLICY, typename T, camp::idx_t ROW_ORD, camp::idx_t COL_ORD, camp::idx_t ROW_SIZE, camp::idx_t COL_SIZE>
-  class TensorRegister<REGISTER_POLICY, T, TensorLayout<ROW_ORD, COL_ORD>, camp::idx_seq<ROW_SIZE, COL_SIZE>> :
-    public RAJA::internal::expt::TensorRegisterBase<TensorRegister<REGISTER_POLICY, T, TensorLayout<ROW_ORD, COL_ORD>, camp::idx_seq<ROW_SIZE, COL_SIZE>>>
+/*
+ * 2D (Matrix) specialization of TensorRegister
+ */
+template<typename REGISTER_POLICY,
+         typename T,
+         camp::idx_t ROW_ORD,
+         camp::idx_t COL_ORD,
+         camp::idx_t ROW_SIZE,
+         camp::idx_t COL_SIZE>
+class TensorRegister<REGISTER_POLICY,
+                     T,
+                     TensorLayout<ROW_ORD, COL_ORD>,
+                     camp::idx_seq<ROW_SIZE, COL_SIZE>>
+    : public RAJA::internal::expt::TensorRegisterBase<
+          TensorRegister<REGISTER_POLICY,
+                         T,
+                         TensorLayout<ROW_ORD, COL_ORD>,
+                         camp::idx_seq<ROW_SIZE, COL_SIZE>>>
+{
+public:
+  using self_type = TensorRegister<REGISTER_POLICY,
+                                   T,
+                                   TensorLayout<ROW_ORD, COL_ORD>,
+                                   camp::idx_seq<ROW_SIZE, COL_SIZE>>;
+  using base_type = RAJA::internal::expt::TensorRegisterBase<
+      TensorRegister<REGISTER_POLICY,
+                     T,
+                     TensorLayout<ROW_ORD, COL_ORD>,
+                     camp::idx_seq<ROW_SIZE, COL_SIZE>>>;
+  using register_type      = Register<T, REGISTER_POLICY>;
+  using row_vector_type    = VectorRegister<T, REGISTER_POLICY, COL_SIZE>;
+  using column_vector_type = VectorRegister<T, REGISTER_POLICY, ROW_SIZE>;
+  using register_policy    = REGISTER_POLICY;
+  using element_type       = T;
+  using layout_type        = TensorLayout<ROW_ORD, COL_ORD>;
+
+  using transpose_tensor_type =
+      TensorRegister<REGISTER_POLICY,
+                     T,
+                     TensorLayout<!ROW_ORD, !COL_ORD>,
+                     camp::idx_seq<ROW_SIZE, COL_SIZE>>;
+
+  using transpose_type = TensorRegister<REGISTER_POLICY,
+                                        T,
+                                        layout_type,
+                                        camp::idx_seq<COL_SIZE, ROW_SIZE>>;
+  using product_type   = TensorRegister<REGISTER_POLICY,
+                                      T,
+                                      layout_type,
+                                      camp::idx_seq<ROW_SIZE, ROW_SIZE>>;
+
+  static constexpr camp::idx_t s_num_rows    = ROW_SIZE;
+  static constexpr camp::idx_t s_num_columns = COL_SIZE;
+
+
+  static constexpr camp::idx_t s_elements_per_register =
+      RAJA::internal::expt::RegisterTraits<REGISTER_POLICY, T>::s_num_elem;
+
+  // number of registers to hold entire matrix
+  static constexpr camp::idx_t s_num_registers =
+      (ROW_SIZE * COL_SIZE) / s_elements_per_register;
+
+  // We only allow matrix sizes that exactly fit in some number of registers
+  static_assert((ROW_SIZE * COL_SIZE) ==
+                    s_num_registers * s_elements_per_register,
+                "MatrixRegister must be dimensioned to exactly fit an integer "
+                "number of registers");
+
+  using log_base2_t = RAJA::LogBase2<s_elements_per_register>;
+
+  static constexpr camp::idx_t s_shift_per_register = log_base2_t::value;
+
+  static constexpr camp::idx_t s_mask_per_register =
+      (1 << log_base2_t::value) - 1;
+
+
+  static constexpr camp::idx_t s_minor_dim_elements =
+      layout_type::is_row_major() ? s_num_columns : s_num_rows;
+
+  static constexpr camp::idx_t s_major_dim_elements =
+      layout_type::is_row_major() ? s_num_rows : s_num_columns;
+
+  // number of (full) registers that span the minor dim
+  // if a single register is split across multiple rows or columns, then
+  // this is 0
+  static constexpr camp::idx_t s_minor_dim_registers =
+      s_minor_dim_elements / s_elements_per_register;
+
+  static_assert(s_minor_dim_registers > 0 || log_base2_t::is_exact,
+                "Minor dimension smaller than a vector need to be a power of "
+                "two fraction");
+
+  static_assert(s_minor_dim_registers == 0 ||
+                    (s_minor_dim_elements % s_elements_per_register == 0),
+                "Minor dimensions greater than a vector length must be an "
+                "integer number of vectors");
+
+
+  static constexpr camp::idx_t s_major_dim_per_register =
+      s_elements_per_register / s_minor_dim_elements;
+
+  static constexpr camp::idx_t s_segbits =
+      RAJA::LogBase2<s_minor_dim_elements>::value;
+
+private:
+  template<typename IDX>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr static auto to_register(IDX row,
+                                                                 IDX col) -> IDX
   {
-    public:
-      using self_type = TensorRegister<REGISTER_POLICY, T, TensorLayout<ROW_ORD, COL_ORD>, camp::idx_seq<ROW_SIZE, COL_SIZE>>;
-      using base_type = RAJA::internal::expt::TensorRegisterBase<TensorRegister<REGISTER_POLICY, T, TensorLayout<ROW_ORD, COL_ORD>, camp::idx_seq<ROW_SIZE, COL_SIZE>>>;
-      using register_type = Register<T, REGISTER_POLICY>;
-      using row_vector_type = VectorRegister<T, REGISTER_POLICY, COL_SIZE>;
-      using column_vector_type = VectorRegister<T, REGISTER_POLICY, ROW_SIZE>;
-      using register_policy = REGISTER_POLICY;
-      using element_type = T;
-      using layout_type = TensorLayout<ROW_ORD, COL_ORD>;
-
-      using transpose_tensor_type = TensorRegister<REGISTER_POLICY, T, TensorLayout<!ROW_ORD, !COL_ORD>, camp::idx_seq<ROW_SIZE, COL_SIZE>>;
+    return layout_type::is_row_major()
+               ? (row * IDX(COL_SIZE) + col) >> IDX(s_shift_per_register)
+               : (col * IDX(ROW_SIZE) + row) >> IDX(s_shift_per_register);
+  }
+
+  template<typename IDX>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr static auto to_lane(IDX row, IDX col)
+      -> IDX
+  {
+    return layout_type::is_row_major()
+               ? (row * IDX(COL_SIZE) + col) & IDX(s_mask_per_register)
+               : (col * IDX(ROW_SIZE) + row) & IDX(s_mask_per_register);
+  }
 
-      using transpose_type = TensorRegister<REGISTER_POLICY, T, layout_type, camp::idx_seq<COL_SIZE, ROW_SIZE>>;
-      using product_type = TensorRegister<REGISTER_POLICY, T, layout_type, camp::idx_seq<ROW_SIZE, ROW_SIZE>>;
+  using base_type::m_registers;
 
-      static constexpr camp::idx_t s_num_rows = ROW_SIZE;
-      static constexpr camp::idx_t s_num_columns = COL_SIZE;
+public:
+  RAJA_HOST_DEVICE
 
+  RAJA_INLINE
+  constexpr TensorRegister() : base_type() {}
 
+  RAJA_HOST_DEVICE
 
+  RAJA_INLINE
+  TensorRegister(element_type c) : base_type(c) { this->broadcast(c); }
 
-      static constexpr camp::idx_t s_elements_per_register =
-          RAJA::internal::expt::RegisterTraits<REGISTER_POLICY,T>::s_num_elem;
+  RAJA_INLINE
 
-      // number of registers to hold entire matrix
-      static constexpr camp::idx_t s_num_registers =
-          (ROW_SIZE*COL_SIZE) / s_elements_per_register;
+  RAJA_HOST_DEVICE
+  TensorRegister(self_type const& c) : base_type(c) { this->copy(c); }
 
-      // We only allow matrix sizes that exactly fit in some number of registers
-      static_assert((ROW_SIZE*COL_SIZE) == s_num_registers*s_elements_per_register,
-          "MatrixRegister must be dimensioned to exactly fit an integer number of registers");
+  RAJA_HOST_DEVICE
 
-      using log_base2_t = RAJA::LogBase2<s_elements_per_register>;
+  RAJA_INLINE
+  ~TensorRegister() {}
 
-      static constexpr camp::idx_t s_shift_per_register =
-          log_base2_t::value;
+  /*!
+   * Returns true if the underlying data is packed for a given tensor ref
+   *
+   * This is true if either:
+   *   It's column major and the rows (dim 0) are stride one
+   *   It's row major and the columns (dim 1) are stride one
+   */
+  template<camp::idx_t STRIDE_ONE_DIM>
+  RAJA_HOST_DEVICE RAJA_INLINE static constexpr bool is_ref_packed()
+  {
+    return (STRIDE_ONE_DIM == 0 && layout_type::is_column_major()) ||
+           (STRIDE_ONE_DIM == 1 && layout_type::is_row_major());
+  }
 
-      static constexpr camp::idx_t s_mask_per_register =
-          (1<<log_base2_t::value)-1;
+  /*!
+   * Gets the maximum size of matrix along specified dimension
+   */
+  RAJA_HOST_DEVICE
 
+  RAJA_INLINE
+  static constexpr camp::idx_t s_dim_elem(camp::idx_t dim)
+  {
+    return dim == 0 ? ROW_SIZE : COL_SIZE;
+  }
 
-      static constexpr camp::idx_t s_minor_dim_elements =
-          layout_type::is_row_major() ? s_num_columns : s_num_rows;
+  /*!
+   * @brief Set entire matrix to a single scalar value
+   * @param value Value to set all matrix elements to
+   */
+  RAJA_HOST_DEVICE
 
-      static constexpr camp::idx_t s_major_dim_elements =
-          layout_type::is_row_major() ? s_num_rows : s_num_columns;
+  RAJA_INLINE
+  self_type& operator=(element_type value)
+  {
+    this->broadcast(value);
+    return *this;
+  }
 
-      // number of (full) registers that span the minor dim
-      // if a single register is split across multiple rows or columns, then
-      // this is 0
-      static constexpr camp::idx_t s_minor_dim_registers =
-              s_minor_dim_elements / s_elements_per_register;
+  RAJA_HOST_DEVICE
 
-      static_assert(s_minor_dim_registers >0  ||  log_base2_t::is_exact,
-          "Minor dimension smaller than a vector need to be a power of two fraction");
+  RAJA_INLINE
+  self_type& operator=(self_type const& c) { return this->copy(c); }
 
-      static_assert(s_minor_dim_registers == 0 || (s_minor_dim_elements % s_elements_per_register == 0),
-          "Minor dimensions greater than a vector length must be an integer number of vectors");
+  /*!
+   * Provide matrix-matrix multiply for operator* between two matrices
+   */
+  template<typename T2, typename L, typename RP>
+  self_type operator*(SquareMatrixRegister<T2, L, RP> const& y) const
+  {
+    return matrix_multiply(y);
+  }
 
+  /*!
+   * Provide right matrix-vector multiply for operator* between this
+   * matrix and a vector.
+   */
+  template<typename T2, typename RP>
+  VectorRegister<T2, RP> operator*(VectorRegister<T2, RP> const& y) const
+  {
+    return right_multiply_vector(y);
+  }
 
-      static constexpr camp::idx_t s_major_dim_per_register =
-          s_elements_per_register / s_minor_dim_elements;
 
-      static constexpr camp::idx_t s_segbits = RAJA::LogBase2<s_minor_dim_elements>::value;
+  template<typename REF_TYPE>
+  struct RefBridge;
 
-    private:
+  template<typename REF_TYPE>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type& load_ref(REF_TYPE const& ref)
+  {
+    RefBridge<REF_TYPE>::load_ref(*this, ref);
+    return *this;
+  }
 
-      template<typename IDX>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      static
-      auto to_register(IDX row, IDX col) -> IDX {
-        return layout_type::is_row_major() ?
-            (row*IDX(COL_SIZE) + col) >> IDX(s_shift_per_register) :
-            (col*IDX(ROW_SIZE) + row) >> IDX(s_shift_per_register);
-      }
+  template<typename REF_TYPE>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type const& store_ref(REF_TYPE& ref) const
+  {
+    RefBridge<REF_TYPE>::store_ref(*this, ref);
+    return *this;
+  }
 
-      template<typename IDX>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      static
-      auto to_lane(IDX row, IDX col) -> IDX {
-        return layout_type::is_row_major() ?
-            (row*IDX(COL_SIZE) + col) & IDX(s_mask_per_register) :
-            (col*IDX(ROW_SIZE) + row) & IDX(s_mask_per_register);
-      }
+  template<typename POINTER_TYPE,
+           typename INDEX_TYPE,
+           RAJA::internal::expt::TensorTileSize TENSOR_SIZE,
+           camp::idx_t STRIDE_ONE_DIM>
+  struct RefBridge<
+      RAJA::internal::expt::
+          TensorRef<POINTER_TYPE, INDEX_TYPE, TENSOR_SIZE, 2, STRIDE_ONE_DIM>>
+  {
 
-      using base_type::m_registers;
+    using RefType = RAJA::internal::expt::
+        TensorRef<POINTER_TYPE, INDEX_TYPE, TENSOR_SIZE, 2, STRIDE_ONE_DIM>;
 
-    public:
+    /*!
+     * @brief Performs load specified by TensorRef object.
+     */
+    RAJA_INLINE
 
+    RAJA_HOST_DEVICE
+    static void load_ref(self_type& self, RefType const& ref)
+    {
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      TensorRegister() : base_type() {}
+      auto ptr = ref.m_pointer + ref.m_tile.m_begin[0] * ref.m_stride[0] +
+                 ref.m_tile.m_begin[1] * ref.m_stride[1];
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      TensorRegister(element_type c) : base_type(c)
+      // check for packed data
+      if (self.is_ref_packed<STRIDE_ONE_DIM>())
       {
-        this->broadcast(c);
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
+        {
+          self.load_packed(ptr, ref.m_stride[0], ref.m_stride[1]);
+        }
+        // partial
+        else
+        {
+          self.load_packed_nm(ptr, ref.m_stride[0], ref.m_stride[1],
+                              ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
+        }
       }
-
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      TensorRegister(self_type const &c) : base_type(c)
+      // strided data
+      else
       {
-        this->copy(c);
-      }
-
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      ~TensorRegister(){}
-
-
-      /*!
-       * Returns true if the underlying data packed for a given tensor ref
-       *
-       * This is true if either:
-       *   It's column major and the rows are stride one
-       *   It's row major and the columns are stride one
-       */
-      template<camp::idx_t STRIDE_ONE_DIM>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      constexpr
-      bool is_ref_packed() {
-        return (STRIDE_ONE_DIM == 0 && layout_type::is_column_major()) ||
-            (STRIDE_ONE_DIM == 1 && layout_type::is_row_major());
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
+        {
+          self.load_strided(ptr, ref.m_stride[0], ref.m_stride[1]);
+        }
+        // partial
+        else
+        {
+          self.load_strided_nm(ptr, ref.m_stride[0], ref.m_stride[1],
+                               ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
+        }
       }
+    }
 
-      /*!
-       * Gets the maximum size of matrix along specified dimension
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      constexpr camp::idx_t s_dim_elem(camp::idx_t dim){
-        return dim == 0 ? ROW_SIZE : COL_SIZE;
-      }
+    /*!
+     * @brief Performs store specified by TensorRef object.
+     */
+    RAJA_INLINE
 
+    RAJA_HOST_DEVICE
+    static void store_ref(self_type const& self, RefType& ref)
+    {
 
+      auto ptr = ref.m_pointer + ref.m_tile.m_begin[0] * ref.m_stride[0] +
+                 ref.m_tile.m_begin[1] * ref.m_stride[1];
 
-      /*!
-       * @brief Set entire vector to a single scalar value
-       * @param value Value to set all vector elements to
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator=(element_type value)
+      // check for packed data
+      if (self.is_ref_packed<STRIDE_ONE_DIM>())
       {
-        this->broadcast(value);
-        return *this;
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
+        {
+          self.store_packed(ptr, ref.m_stride[0], ref.m_stride[1]);
+        }
+        // partial
+        else
+        {
+          self.store_packed_nm(ptr, ref.m_stride[0], ref.m_stride[1],
+                               ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
+        }
       }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        return this->copy(c);
+      // strided data
+      else
+      {
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
+        {
+          self.store_strided(ptr, ref.m_stride[0], ref.m_stride[1]);
+        }
+        // partial
+        else
+        {
+          self.store_strided_nm(ptr, ref.m_stride[0], ref.m_stride[1],
+                                ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
+        }
       }
+    }
+  };
 
+  template<typename POINTER_TYPE,
+           typename INDEX_TYPE,
+           RAJA::internal::expt::TensorTileSize TENSOR_SIZE,
+           INDEX_TYPE StrideInt1,
+           INDEX_TYPE StrideInt2,
+           INDEX_TYPE BeginInt1,
+           INDEX_TYPE BeginInt2,
+           INDEX_TYPE SizeInt1,
+           INDEX_TYPE SizeInt2,
+           camp::idx_t STRIDE_ONE_DIM>
+  struct RefBridge<RAJA::internal::expt::StaticTensorRef<
+      POINTER_TYPE,
+      INDEX_TYPE,
+      TENSOR_SIZE,
+      camp::int_seq<INDEX_TYPE, StrideInt1, StrideInt2>,
+      camp::int_seq<INDEX_TYPE, BeginInt1, BeginInt2>,
+      camp::int_seq<INDEX_TYPE, SizeInt1, SizeInt2>,
+      STRIDE_ONE_DIM>>
+  {
 
-      /*!
-       * Provide matrix-matrix multiply for operator* between to matrices
-       */
-      template<typename T2, typename L, typename RP>
-      self_type
-      operator*(SquareMatrixRegister<T2, L, RP> const &y) const
+    using RefType = RAJA::internal::expt::StaticTensorRef<
+        POINTER_TYPE,
+        INDEX_TYPE,
+        TENSOR_SIZE,
+        camp::int_seq<INDEX_TYPE, StrideInt1, StrideInt2>,
+        camp::int_seq<INDEX_TYPE, BeginInt1, BeginInt2>,
+        camp::int_seq<INDEX_TYPE, SizeInt1, SizeInt2>,
+        STRIDE_ONE_DIM>;
+
+    /*!
+     * @brief Performs load specified by TensorRef object.
+     */
+    RAJA_INLINE
+
+    RAJA_HOST_DEVICE
+    static void load_ref(self_type& self, RefType const& ref)
+    {
+
+      auto ptr = ref.m_pointer + ref.m_tile.m_begin[0] * ref.m_stride[0] +
+                 ref.m_tile.m_begin[1] * ref.m_stride[1];
+
+      // check for packed data
+      if (self.is_ref_packed<STRIDE_ONE_DIM>())
       {
-        return matrix_multiply(y);
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
+        {
+          self.load_packed(ptr, ref.m_stride[0], ref.m_stride[1]);
+        }
+        // partial
+        else
+        {
+          self.load_packed_nm(ptr, ref.m_stride[0], ref.m_stride[1],
+                              ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
+        }
       }
-
-      /*!
-       * Provide right matrix-vector multiply for operator* between this
-       * matrix and a vector.
-       */
-      template<typename T2, typename RP>
-      VectorRegister<T2, RP>
-      operator*(VectorRegister<T2, RP> const &y) const
+      // strided data
+      else
       {
-        return right_multiply_vector(y);
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
+        {
+          self.load_strided(ptr, ref.m_stride[0], ref.m_stride[1]);
+        }
+        // partial
+        else
+        {
+          self.load_strided_nm(ptr, ref.m_stride[0], ref.m_stride[1],
+                               ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
+        }
       }
+    }
 
+    /*!
+     * @brief Performs store specified by TensorRef object.
+     */
+    RAJA_INLINE
 
-      template<typename REF_TYPE>
-      struct RefBridge;
+    RAJA_HOST_DEVICE
+    static void store_ref(self_type const& self, RefType& ref)
+    {
 
-      template<typename REF_TYPE>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type& load_ref (REF_TYPE const &ref){
-          RefBridge<REF_TYPE>::load_ref(*this,ref);
-          return *this;
-      }
+      auto ptr = ref.m_pointer + ref.m_tile.m_begin[0] * ref.m_stride[0] +
+                 ref.m_tile.m_begin[1] * ref.m_stride[1];
 
-      template<typename REF_TYPE>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_ref (REF_TYPE &ref) const {
-          RefBridge<REF_TYPE>::store_ref(*this,ref);
-          return *this;
+      // check for packed data
+      if (self.is_ref_packed<STRIDE_ONE_DIM>())
+      {
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
+        {
+          self.store_packed(ptr, ref.m_stride[0], ref.m_stride[1]);
+        }
+        // partial
+        else
+        {
+          self.store_packed_nm(ptr, ref.m_stride[0], ref.m_stride[1],
+                               ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
+        }
       }
-
-
-
-      template<typename POINTER_TYPE, typename INDEX_TYPE, RAJA::internal::expt::TensorTileSize TENSOR_SIZE, camp::idx_t STRIDE_ONE_DIM>
-      struct RefBridge <RAJA::internal::expt::TensorRef<POINTER_TYPE, INDEX_TYPE, TENSOR_SIZE, 2, STRIDE_ONE_DIM>>
+      // strided data
+      else
       {
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
+        {
+          self.store_strided(ptr, ref.m_stride[0], ref.m_stride[1]);
+        }
+        // partial
+        else
+        {
+          self.store_strided_nm(ptr, ref.m_stride[0], ref.m_stride[1],
+                                ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
+        }
+      }
+    }
+  };
+
+  /*!
+   * Loads a dense full matrix from memory.
+   *
+   * For row-major, column entries must be stride-1
+   * For column-major, row entries must be stride-1
+   *
+   * Non-stride-1 dimension can have any striding... so this can
+   * be a "semi-dense" matrix.
+   */
+  RAJA_HOST_DEVICE
 
-          using RefType = RAJA::internal::expt::TensorRef<POINTER_TYPE, INDEX_TYPE, TENSOR_SIZE, 2, STRIDE_ONE_DIM>;
-
-          /*!
-           * @brief Performs load specified by TensorRef object.
-           */
-          RAJA_INLINE
-          RAJA_HOST_DEVICE
-          static void load_ref(self_type& self, RefType const &ref){
-    
-            auto ptr = ref.m_pointer + ref.m_tile.m_begin[0]*ref.m_stride[0] +
-                                       ref.m_tile.m_begin[1]*ref.m_stride[1];
-    
-            // check for packed data
-            if(self.is_ref_packed<STRIDE_ONE_DIM>()){
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-                self.load_packed(ptr, ref.m_stride[0], ref.m_stride[1]);
-              }
-              // partial
-              else{
-                self.load_packed_nm(ptr, ref.m_stride[0], ref.m_stride[1],
-                                    ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
-              }
-    
-            }
-            // strided data
-            else
-            {
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-                self.load_strided(ptr, ref.m_stride[0], ref.m_stride[1]);
-              }
-              // partial
-              else{
-                self.load_strided_nm(ptr, ref.m_stride[0], ref.m_stride[1],
-                                         ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
-              }
-            }
-          }
-    
-    
-          /*!
-           * @brief Performs load specified by TensorRef object.
-           */
-          RAJA_INLINE
-          RAJA_HOST_DEVICE
-          static void store_ref(self_type const &self, RefType &ref) {
-    
-            auto ptr = ref.m_pointer + ref.m_tile.m_begin[0]*ref.m_stride[0] +
-                                       ref.m_tile.m_begin[1]*ref.m_stride[1];
-    
-            // check for packed data
-            if(self.is_ref_packed<STRIDE_ONE_DIM>())
-            {
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-                self.store_packed(ptr, ref.m_stride[0], ref.m_stride[1]);
-              }
-              // partial
-              else{
-                self.store_packed_nm(ptr, ref.m_stride[0], ref.m_stride[1],
-                                         ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
-              }
-    
-            }
-            // strided data
-            else
-            {
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-                self.store_strided(ptr, ref.m_stride[0], ref.m_stride[1]);
-              }
-              // partial
-              else{
-                self.store_strided_nm(ptr, ref.m_stride[0], ref.m_stride[1],
-                                         ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
-              }
-            }
-          }
-
-      };
-
-
-
+  RAJA_INLINE
+  self_type& load_packed(element_type const* ptr,
+                         int row_stride,
+                         int col_stride)
+  {
+    // if it's dense in columns and rows, just do a dense load
+    if ((layout_type::is_row_major() && (row_stride == COL_SIZE)) ||
+        (layout_type::is_column_major() && (col_stride == ROW_SIZE)))
+    {
 
-      template<
-           typename POINTER_TYPE,
-           typename INDEX_TYPE,
-           RAJA::internal::expt::TensorTileSize TENSOR_SIZE, 
-           INDEX_TYPE StrideInt1, INDEX_TYPE StrideInt2,
-           INDEX_TYPE  BeginInt1, INDEX_TYPE  BeginInt2,
-           INDEX_TYPE   SizeInt1, INDEX_TYPE   SizeInt2,
-           camp::idx_t STRIDE_ONE_DIM
-      >
-      struct RefBridge
-      <RAJA::internal::expt::StaticTensorRef<POINTER_TYPE,INDEX_TYPE,TENSOR_SIZE,camp::int_seq<INDEX_TYPE,StrideInt1,StrideInt2>,camp::int_seq<INDEX_TYPE,BeginInt1,BeginInt2>,camp::int_seq<INDEX_TYPE,SizeInt1,SizeInt2>,STRIDE_ONE_DIM>>
+      for (camp::idx_t reg = 0; reg < s_num_registers; ++reg)
       {
+        m_registers[reg].load_packed(ptr + reg * s_elements_per_register);
+      }
+    }
+    // Do semi-dense load for row-major
+    else if (layout_type::is_row_major())
+    {
 
-          using RefType = RAJA::internal::expt::StaticTensorRef<POINTER_TYPE,INDEX_TYPE,TENSOR_SIZE,camp::int_seq<INDEX_TYPE,StrideInt1,StrideInt2>,camp::int_seq<INDEX_TYPE,BeginInt1,BeginInt2>,camp::int_seq<INDEX_TYPE,SizeInt1,SizeInt2>,STRIDE_ONE_DIM>;
-
-          /*!
-           * @brief Performs load specified by TensorRef object.
-           */
-          RAJA_INLINE
-          RAJA_HOST_DEVICE
-          static void load_ref(self_type& self, RefType const &ref){
-    
-            auto ptr = ref.m_pointer + ref.m_tile.m_begin[0]*ref.m_stride[0] +
-                                       ref.m_tile.m_begin[1]*ref.m_stride[1];
-    
-            // check for packed data
-            if(self.is_ref_packed<STRIDE_ONE_DIM>()){
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-                self.load_packed(ptr, ref.m_stride[0], ref.m_stride[1]);
-              }
-              // partial
-              else{
-                self.load_packed_nm(ptr, ref.m_stride[0], ref.m_stride[1],
-                                    ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
-              }
-    
-            }
-            // strided data
-            else
-            {
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-                self.load_strided(ptr, ref.m_stride[0], ref.m_stride[1]);
-              }
-              // partial
-              else{
-                self.load_strided_nm(ptr, ref.m_stride[0], ref.m_stride[1],
-                                         ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
-              }
-            }
-          }
-    
-    
-          /*!
-           * @brief Performs load specified by TensorRef object.
-           */
-          RAJA_INLINE
-          RAJA_HOST_DEVICE
-          static void store_ref(self_type const &self, RefType &ref) {
-    
-            auto ptr = ref.m_pointer + ref.m_tile.m_begin[0]*ref.m_stride[0] +
-                                       ref.m_tile.m_begin[1]*ref.m_stride[1];
-    
-            // check for packed data
-            if(self.is_ref_packed<STRIDE_ONE_DIM>())
-            {
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-                self.store_packed(ptr, ref.m_stride[0], ref.m_stride[1]);
-              }
-              // partial
-              else{
-                self.store_packed_nm(ptr, ref.m_stride[0], ref.m_stride[1],
-                                         ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
-              }
-    
-            }
-            // strided data
-            else
-            {
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-                self.store_strided(ptr, ref.m_stride[0], ref.m_stride[1]);
-              }
-              // partial
-              else{
-                self.store_strided_nm(ptr, ref.m_stride[0], ref.m_stride[1],
-                                         ref.m_tile.m_size[0], ref.m_tile.m_size[1]);
-              }
-            }
-          }
-
-      };
-
-
-
+      // one or more registers per row
+      if (s_minor_dim_registers)
+      {
+        camp::idx_t reg = 0;
+        for (camp::idx_t row = 0; row < ROW_SIZE; ++row)
+        {
+          for (camp::idx_t colreg = 0; colreg < s_minor_dim_registers; ++colreg)
+          {
 
+            camp::idx_t offset =
+                row * row_stride + colreg * s_elements_per_register;
 
-      /*!
-       * Loads a dense full matrix from memory.
-       *
-       * For row-major, column entries must be stride-1
-       * For column-major, row entries must be stride-1
-       *
-       * Non-stride-1 dimension can have any striding... so this is can
-       * be a "semi-dense" matrix.
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr,
-          int row_stride, int col_stride)
-      {
-        // if it's dense in columns and rows, just do a dense load
-        if((layout_type::is_row_major()&&(row_stride==COL_SIZE)) ||
-           (layout_type::is_column_major()&&(col_stride==ROW_SIZE))){
+            m_registers[reg].load_packed(ptr + offset);
 
-          for(camp::idx_t reg = 0;reg < s_num_registers;++ reg){
-            m_registers[reg].load_packed(ptr + reg*s_elements_per_register);
+            reg++;
           }
-
         }
-        // Do semi-dense load for row-major
-        else if(layout_type::is_row_major()){
-
-          // one or more registers per column
-          if(s_minor_dim_registers){
-            camp::idx_t reg = 0;
-            for(camp::idx_t row = 0;row < ROW_SIZE;++ row){
-              for(camp::idx_t colreg = 0;colreg < s_minor_dim_registers; ++ colreg){
+      }
+      // more than one row per register
+      else
+      {
+        // default to strided operation
+        return load_strided(ptr, row_stride, col_stride);
+      }
+    }
+    // Do semi-dense load for column-major
+    else
+    {
+      // one or more registers per column
+      if (s_minor_dim_registers)
+      {
 
-                camp::idx_t offset = row*row_stride + colreg*s_elements_per_register;
+        camp::idx_t reg = 0;
+        for (camp::idx_t col = 0; col < COL_SIZE; ++col)
+        {
+          for (camp::idx_t rowreg = 0; rowreg < s_minor_dim_registers; ++rowreg)
+          {
 
-                m_registers[reg].load_packed(ptr + offset);
+            camp::idx_t offset =
+                col * col_stride + rowreg * s_elements_per_register;
 
-                reg ++;
+            m_registers[reg].load_packed(ptr + offset);
 
-              }
-            }
-          }
-          // more than one column per register
-          else{
-            // default to strided operation
-            return load_strided(ptr, row_stride, col_stride);
+            reg++;
           }
         }
-        // Do semi-dense load for column-major
-        else{
-          // one or more registers per row
-          if(s_minor_dim_registers){
-
-            camp::idx_t reg = 0;
-            for(camp::idx_t col = 0;col < COL_SIZE;++ col){
-              for(camp::idx_t rowreg = 0;rowreg < s_minor_dim_registers; ++ rowreg){
+      }
+      // more than one column per register
+      else
+      {
+        // default to strided operation
+        return load_strided(ptr, row_stride, col_stride);
+      }
+    }
 
-                camp::idx_t offset = col*col_stride + rowreg*s_elements_per_register;
+    return *this;
+  }
 
-                m_registers[reg].load_packed(ptr + offset);
+  /*!
+   * Loads a strided full matrix from memory
+   */
+  RAJA_HOST_DEVICE
 
-                reg ++;
+  RAJA_INLINE
+  self_type& load_strided(element_type const* ptr,
+                          int row_stride,
+                          int col_stride)
+  {
 
-              }
-            }
-          }
-          // more than one column per register
-          else{
-            // default to strided operation
-            return load_strided(ptr, row_stride, col_stride);
-          }
+    if (layout_type::is_row_major())
+    {
+      // one or more registers per row
+      if (s_minor_dim_registers)
+      {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          camp::idx_t row =
+              i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
+          camp::idx_t col =
+              s_elements_per_register * (i - (row * s_minor_dim_registers));
+          m_registers[i].load_strided(ptr + row * row_stride + col * col_stride,
+                                      col_stride);
         }
-
-        return *this;
       }
-
-      /*!
-       * Loads a strided full matrix from memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr,
-          int row_stride, int col_stride)
+      // less than one register per row
+      else
       {
-
-        if(layout_type::is_row_major()){
-          // one or more registers per row
-          if(s_minor_dim_registers){
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              camp::idx_t row = i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
-              camp::idx_t col = s_elements_per_register * (i - (row*s_minor_dim_registers));
-              m_registers[i].load_strided(ptr+row*row_stride+col*col_stride, col_stride);
-            }
-          }
-          // less than one register per row
-          else
-          {
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              element_type const *ptr_i = ptr + i * row_stride*s_major_dim_per_register;
-              m_registers[i].segmented_load(ptr_i, s_segbits, col_stride, row_stride);
-            }
-          }
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          element_type const* ptr_i =
+              ptr + i * row_stride * s_major_dim_per_register;
+          m_registers[i].segmented_load(ptr_i, s_segbits, col_stride,
+                                        row_stride);
         }
+      }
+    }
 
-        // column major
-        else{
-
-          // one or more registers per column
-          if(s_minor_dim_registers){
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              camp::idx_t col = i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
-              camp::idx_t row = s_elements_per_register * (i - (col*s_minor_dim_registers));
+    // column major
+    else
+    {
 
-              m_registers[i].load_strided(ptr+row*row_stride+col*col_stride, row_stride);
-            }
-          }
-          // less than one register per column
-          else
-          {
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              element_type const *ptr_i = ptr + i * col_stride*s_major_dim_per_register;
-              m_registers[i].segmented_load(ptr_i, s_segbits, row_stride, col_stride);
-            }
-          }
+      // one or more registers per column
+      if (s_minor_dim_registers)
+      {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          camp::idx_t col =
+              i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
+          camp::idx_t row =
+              s_elements_per_register * (i - (col * s_minor_dim_registers));
+
+          m_registers[i].load_strided(ptr + row * row_stride + col * col_stride,
+                                      row_stride);
         }
-
-        return *this;
       }
-
-      /*!
-       * Loads a dense partial matrix from memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &load_packed_nm(element_type const *ptr,
-          int row_stride, int col_stride,
-          int num_rows, int num_cols)
+      // less than one register per column
+      else
       {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          element_type const* ptr_i =
+              ptr + i * col_stride * s_major_dim_per_register;
+          m_registers[i].segmented_load(ptr_i, s_segbits, row_stride,
+                                        col_stride);
+        }
+      }
+    }
 
-        if(layout_type::is_row_major()){
-
-          // one or more registers per column
-          if(s_minor_dim_registers){
+    return *this;
+  }
 
-            for(camp::idx_t row = 0;row < num_rows;++ row){
-              for(camp::idx_t colreg = 0;colreg < s_minor_dim_registers; ++ colreg){
+  /*!
+   * Loads a dense partial matrix from memory
+   */
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& load_packed_nm(element_type const* ptr,
+                            int row_stride,
+                            int col_stride,
+                            int num_rows,
+                            int num_cols)
+  {
 
-                camp::idx_t reg = row*s_minor_dim_registers + colreg;
+    if (layout_type::is_row_major())
+    {
 
-                camp::idx_t col0 = colreg*s_elements_per_register;
-                camp::idx_t offset = row*row_stride + col0;
+      // one or more registers per row
+      if (s_minor_dim_registers)
+      {
 
-                // loading a complete register
-                if(col0+s_elements_per_register <= num_cols){
-                  m_registers[reg].load_packed(ptr + offset);
-                }
+        for (camp::idx_t row = 0; row < num_rows; ++row)
+        {
+          for (camp::idx_t colreg = 0; colreg < s_minor_dim_registers; ++colreg)
+          {
 
-                // partial register at end of row
-                else{
-                  m_registers[reg].load_packed_n(ptr + offset, num_cols - col0);
+            camp::idx_t reg = row * s_minor_dim_registers + colreg;
 
-                  // zero out the remaining registers, if any
-                  for(camp::idx_t i = colreg+1;i < s_minor_dim_registers;++i){
-                    reg++;
-                    m_registers[reg] = element_type(0);
-                  }
+            camp::idx_t col0   = colreg * s_elements_per_register;
+            camp::idx_t offset = row * row_stride + col0;
 
-                  break; // end this row
-                }
-              }
+            // loading a complete register
+            if (col0 + s_elements_per_register <= num_cols)
+            {
+              m_registers[reg].load_packed(ptr + offset);
             }
 
-            // zero out remaining rows
-            for(camp::idx_t row = num_rows;row < ROW_SIZE;++ row){
-              for(camp::idx_t colreg = 0;colreg < s_minor_dim_registers; ++ colreg){
-
-                camp::idx_t reg = row*s_minor_dim_registers + colreg;
+            // partial register at end of row
+            else
+            {
+              m_registers[reg].load_packed_n(ptr + offset, num_cols - col0);
 
+              // zero out the remaining registers, if any
+              for (camp::idx_t i = colreg + 1; i < s_minor_dim_registers; ++i)
+              {
+                reg++;
                 m_registers[reg] = element_type(0);
               }
+
+              break;  // end this row
             }
           }
-          // more than one column per register
-          else{
-            // default to strided operation
-            return load_strided_nm(ptr, row_stride, col_stride, num_rows, num_cols);
-          }
         }
-        // Do semi-dense load for column-major
-        else{
 
-          // one or more registers per column
-          if(s_minor_dim_registers){
+        // zero out remaining rows
+        for (camp::idx_t row = num_rows; row < ROW_SIZE; ++row)
+        {
+          for (camp::idx_t colreg = 0; colreg < s_minor_dim_registers; ++colreg)
+          {
 
-            for(camp::idx_t col = 0;col < num_cols;++ col){
-              for(camp::idx_t rowreg = 0;rowreg < s_minor_dim_registers; ++ rowreg){
+            camp::idx_t reg = row * s_minor_dim_registers + colreg;
 
-                camp::idx_t reg = col*s_minor_dim_registers + rowreg;
+            m_registers[reg] = element_type(0);
+          }
+        }
+      }
+      // more than one row per register
+      else
+      {
+        // default to strided operation
+        return load_strided_nm(ptr, row_stride, col_stride, num_rows, num_cols);
+      }
+    }
+    // Do semi-dense load for column-major
+    else
+    {
 
-                camp::idx_t row0 = rowreg*s_elements_per_register;
-                camp::idx_t offset = col*col_stride + row0;
+      // one or more registers per column
+      if (s_minor_dim_registers)
+      {
 
-                // loading a complete register
-                if(row0+s_elements_per_register <= num_rows){
-                  m_registers[reg].load_packed(ptr + offset);
-                }
+        for (camp::idx_t col = 0; col < num_cols; ++col)
+        {
+          for (camp::idx_t rowreg = 0; rowreg < s_minor_dim_registers; ++rowreg)
+          {
 
-                // partial register at end of column
-                else{
-                  m_registers[reg].load_packed_n(ptr + offset, num_rows - row0);
+            camp::idx_t reg = col * s_minor_dim_registers + rowreg;
 
-                  // zero out the remaining registers, if any
-                  for(camp::idx_t i = rowreg+1;i < s_minor_dim_registers;++i){
-                    reg++;
-                    m_registers[reg] = element_type(0);
-                  }
+            camp::idx_t row0   = rowreg * s_elements_per_register;
+            camp::idx_t offset = col * col_stride + row0;
 
-                  break; // end this column
-                }
-              }
+            // loading a complete register
+            if (row0 + s_elements_per_register <= num_rows)
+            {
+              m_registers[reg].load_packed(ptr + offset);
             }
-            // zero out remaining columns
-            for(camp::idx_t col = num_cols;col < COL_SIZE;++ col){
-              for(camp::idx_t rowreg = 0;rowreg < s_minor_dim_registers; ++ rowreg){
 
-                camp::idx_t reg = col*s_minor_dim_registers + rowreg;
+            // partial register at end of column
+            else
+            {
+              m_registers[reg].load_packed_n(ptr + offset, num_rows - row0);
 
+              // zero out the remaining registers, if any
+              for (camp::idx_t i = rowreg + 1; i < s_minor_dim_registers; ++i)
+              {
+                reg++;
                 m_registers[reg] = element_type(0);
               }
-            }
 
+              break;  // end this column
+            }
           }
-          // more than one column per register
-          else{
+        }
+        // zero out remaining columns
+        for (camp::idx_t col = num_cols; col < COL_SIZE; ++col)
+        {
+          for (camp::idx_t rowreg = 0; rowreg < s_minor_dim_registers; ++rowreg)
+          {
+
+            camp::idx_t reg = col * s_minor_dim_registers + rowreg;
 
-            // default to strided operation
-            return load_strided_nm(ptr, row_stride, col_stride, num_rows, num_cols);
+            m_registers[reg] = element_type(0);
           }
         }
-
-        return *this;
       }
-
-      /*!
-       * Loads a strided partial matrix from memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &load_strided_nm(element_type const *ptr,
-          int row_stride, int col_stride,
-          int num_rows, int num_cols)
+      // more than one column per register
+      else
       {
 
-        if(layout_type::is_row_major()){
-          // one or more registers per row
-          if(s_minor_dim_registers){
-
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              camp::idx_t row = i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
-              if(row >= num_rows){
-                m_registers[i] = element_type(0);
-              }
-              else{
-                camp::idx_t col = s_elements_per_register * (i - (row*s_minor_dim_registers));
+        // default to strided operation
+        return load_strided_nm(ptr, row_stride, col_stride, num_rows, num_cols);
+      }
+    }
 
+    return *this;
+  }
 
-                camp::idx_t reg_num_cols = s_elements_per_register;
-                if(reg_num_cols+col > num_cols){
-                  reg_num_cols = num_cols-col;
-                  m_registers[i].load_strided_n(ptr+row*row_stride+col*col_stride, col_stride, reg_num_cols);
-                }
-                else{
-                  m_registers[i].load_strided(ptr+row*row_stride+col*col_stride, col_stride);
-                }
+  /*!
+   * Loads a strided partial matrix from memory
+   */
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& load_strided_nm(element_type const* ptr,
+                             int row_stride,
+                             int col_stride,
+                             int num_rows,
+                             int num_cols)
+  {
 
+    if (layout_type::is_row_major())
+    {
+      // one or more registers per row
+      if (s_minor_dim_registers)
+      {
 
-              }
-            }
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          camp::idx_t row =
+              i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
+          if (row >= num_rows)
+          {
+            m_registers[i] = element_type(0);
           }
-          // less than one register per row
           else
           {
+            camp::idx_t col =
+                s_elements_per_register * (i - (row * s_minor_dim_registers));
 
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              // figure out how many rows get loaded in this register
-              camp::idx_t reg_num_rows = num_rows - i*s_major_dim_per_register;
-              reg_num_rows = reg_num_rows > s_major_dim_per_register ? s_major_dim_per_register : reg_num_rows;
 
-              element_type const *ptr_i = ptr + i * row_stride*s_major_dim_per_register;
-              m_registers[i].segmented_load_nm(ptr_i, s_segbits, col_stride, row_stride, num_cols, reg_num_rows);
+            camp::idx_t reg_num_cols = s_elements_per_register;
+            if (reg_num_cols + col > num_cols)
+            {
+              reg_num_cols = num_cols - col;
+              m_registers[i].load_strided_n(ptr + row * row_stride +
+                                                col * col_stride,
+                                            col_stride, reg_num_cols);
+            }
+            else
+            {
+              m_registers[i].load_strided(
+                  ptr + row * row_stride + col * col_stride, col_stride);
             }
           }
         }
+      }
+      // less than one register per row
+      else
+      {
 
-        // column major
-        else{
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          // figure out how many rows get loaded in this register
+          camp::idx_t reg_num_rows = num_rows - i * s_major_dim_per_register;
+          reg_num_rows             = reg_num_rows > s_major_dim_per_register
+                                         ? s_major_dim_per_register
+                                         : reg_num_rows;
+
+          element_type const* ptr_i =
+              ptr + i * row_stride * s_major_dim_per_register;
+          m_registers[i].segmented_load_nm(ptr_i, s_segbits, col_stride,
+                                           row_stride, num_cols, reg_num_rows);
+        }
+      }
+    }
 
-          // one or more registers per column
-          if(s_minor_dim_registers){
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              camp::idx_t col = i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
-              if(col >= num_cols){
-                m_registers[i] = element_type(0);
-              }
-              else{
-                camp::idx_t row = s_elements_per_register * (i - (col*s_minor_dim_registers));
+    // column major
+    else
+    {
 
-                camp::idx_t reg_num_rows = s_elements_per_register;
-                if(reg_num_rows+row > num_rows){
-                  reg_num_rows = num_rows-row;
-                  m_registers[i].load_strided_n(ptr+row*row_stride+col*col_stride, row_stride, reg_num_rows);
-                }
-                else{
-                  m_registers[i].load_strided(ptr+row*row_stride+col*col_stride, row_stride);
-                }
-              }
-            }
+      // one or more registers per column
+      if (s_minor_dim_registers)
+      {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          camp::idx_t col =
+              i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
+          if (col >= num_cols)
+          {
+            m_registers[i] = element_type(0);
           }
-          // less than one register per column
           else
           {
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              // figure out how many columns get loaded in this register
-              camp::idx_t reg_num_cols = num_cols - i*s_major_dim_per_register;
-              reg_num_cols = reg_num_cols > s_major_dim_per_register ? s_major_dim_per_register : reg_num_cols;
+            camp::idx_t row =
+                s_elements_per_register * (i - (col * s_minor_dim_registers));
 
-              element_type const *ptr_i = ptr + i * col_stride*s_major_dim_per_register;
-              m_registers[i].segmented_load_nm(ptr_i, s_segbits, row_stride, col_stride, num_rows, reg_num_cols);
+            camp::idx_t reg_num_rows = s_elements_per_register;
+            if (reg_num_rows + row > num_rows)
+            {
+              reg_num_rows = num_rows - row;
+              m_registers[i].load_strided_n(ptr + row * row_stride +
+                                                col * col_stride,
+                                            row_stride, reg_num_rows);
+            }
+            else
+            {
+              m_registers[i].load_strided(
+                  ptr + row * row_stride + col * col_stride, row_stride);
             }
           }
         }
-
-        return *this;
       }
+      // less than one register per column
+      else
+      {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          // figure out how many columns get loaded in this register
+          camp::idx_t reg_num_cols = num_cols - i * s_major_dim_per_register;
+          reg_num_cols             = reg_num_cols > s_major_dim_per_register
+                                         ? s_major_dim_per_register
+                                         : reg_num_cols;
+
+          element_type const* ptr_i =
+              ptr + i * col_stride * s_major_dim_per_register;
+          m_registers[i].segmented_load_nm(ptr_i, s_segbits, row_stride,
+                                           col_stride, num_rows, reg_num_cols);
+        }
+      }
+    }
 
+    return *this;
+  }
 
+  /*!
+   * Store a dense full matrix to memory.
+   *
+   * Minor (contiguous) dimension must be stride-1; major may be any striding
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
 
-      /*!
-       * Store a dense full matrix to memory.
-       *
-       * Column entries must be stride-1, rows may be any striding
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr,
-          int row_stride, int col_stride) const
-      {
+  RAJA_INLINE
+  self_type const& store_packed(element_type* ptr,
+                                int row_stride,
+                                int col_stride) const
+  {
 
-        // if it's dense in columns and rows, just do a dense load
-        if((layout_type::is_row_major()&&(row_stride==COL_SIZE)) ||
-           (layout_type::is_column_major()&&(col_stride==ROW_SIZE))){
+    // if it's dense in columns and rows, just do a dense store
+    if ((layout_type::is_row_major() && (row_stride == COL_SIZE)) ||
+        (layout_type::is_column_major() && (col_stride == ROW_SIZE)))
+    {
 
-          for(camp::idx_t reg = 0;reg < s_num_registers;++ reg){
-            m_registers[reg].store_packed(ptr + reg*s_elements_per_register);
-          }
+      for (camp::idx_t reg = 0; reg < s_num_registers; ++reg)
+      {
+        m_registers[reg].store_packed(ptr + reg * s_elements_per_register);
+      }
+    }
+    // Do semi-dense store for row-major
+    else if (layout_type::is_row_major())
+    {
 
+      // one or more registers per row
+      if (s_minor_dim_registers)
+      {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          camp::idx_t row =
+              i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
+          camp::idx_t col =
+              s_elements_per_register * (i - (row * s_minor_dim_registers));
+          m_registers[i].store_packed(ptr + row * row_stride +
+                                      col * col_stride);
         }
-        // Do semi-dense store for row-major
-        else if(layout_type::is_row_major()){
-
-          // one or more registers per column
-          if(s_minor_dim_registers){
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              camp::idx_t row = i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
-              camp::idx_t col = s_elements_per_register * (i - (row*s_minor_dim_registers));
-              m_registers[i].store_packed(ptr+row*row_stride+col*col_stride);
-            }
-          }
-          // more than one column per register
-          else{
-            store_strided(ptr, row_stride, col_stride);
-          }
-        }
-        // Do semi-dense store for column-major
-        else{
-          // one or more registers per row
-          if(s_minor_dim_registers){
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              camp::idx_t col = i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
-              camp::idx_t row = s_elements_per_register * (i - (col*s_minor_dim_registers));
-              m_registers[i].store_packed(ptr+row*row_stride+col*col_stride);
-            }
-          }
-          // more than one row per register
-          else{
-            store_strided(ptr, row_stride, col_stride);
-          }
+      }
+      // more than one row per register
+      else
+      {
+        store_strided(ptr, row_stride, col_stride);
+      }
+    }
+    // Do semi-dense store for column-major
+    else
+    {
+      // one or more registers per column
+      if (s_minor_dim_registers)
+      {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          camp::idx_t col =
+              i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
+          camp::idx_t row =
+              s_elements_per_register * (i - (col * s_minor_dim_registers));
+          m_registers[i].store_packed(ptr + row * row_stride +
+                                      col * col_stride);
         }
+      }
+      // more than one column per register
+      else
+      {
+        store_strided(ptr, row_stride, col_stride);
+      }
+    }
 
 
-        return *this;
-      }
+    return *this;
+  }
 
-      /*!
-       * Store a strided full matrix to memory
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr,
-          int row_stride, int col_stride) const
-      {
+  /*!
+   * Store a strided full matrix to memory
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
 
+  RAJA_INLINE
+  self_type const& store_strided(element_type* ptr,
+                                 int row_stride,
+                                 int col_stride) const
+  {
 
-        if(layout_type::is_row_major()){
-          // one or more registers per row
-          if(s_minor_dim_registers){
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              camp::idx_t row = i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
-              camp::idx_t col = s_elements_per_register * (i - (row*s_minor_dim_registers));
-              m_registers[i].store_strided(ptr+row*row_stride+col*col_stride, col_stride);
-            }
-          }
-          // less than one register per row
-          else
-          {
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              element_type *ptr_i = ptr + i * row_stride*s_major_dim_per_register;
-              m_registers[i].segmented_store(ptr_i, s_segbits, col_stride, row_stride);
-            }
-          }
-        }
 
-        // column major
-        else{
-          // one or more registers per column
-          if(s_minor_dim_registers){
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              camp::idx_t col = i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
-              camp::idx_t row = s_elements_per_register * (i - (col*s_minor_dim_registers));
-              m_registers[i].store_strided(ptr+row*row_stride+col*col_stride, row_stride);
-            }
-          }
-          // less than one register per column
-          else
-          {
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              element_type *ptr_i = ptr + i * col_stride*s_major_dim_per_register;
-              m_registers[i].segmented_store(ptr_i, s_segbits, row_stride, col_stride);
-            }
-          }
+    if (layout_type::is_row_major())
+    {
+      // one or more registers per row
+      if (s_minor_dim_registers)
+      {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          camp::idx_t row =
+              i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
+          camp::idx_t col =
+              s_elements_per_register * (i - (row * s_minor_dim_registers));
+          m_registers[i].store_strided(
+              ptr + row * row_stride + col * col_stride, col_stride);
+        }
+      }
+      // less than one register per row
+      else
+      {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          element_type* ptr_i = ptr + i * row_stride * s_major_dim_per_register;
+          m_registers[i].segmented_store(ptr_i, s_segbits, col_stride,
+                                         row_stride);
         }
-
-        return *this;
       }
+    }
 
-      /*!
-       * Store a dense partial matrix to memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_packed_nm(element_type *ptr,
-          int row_stride, int col_stride,
-          int num_rows, int num_cols) const
+    // column major
+    else
+    {
+      // one or more registers per column
+      if (s_minor_dim_registers)
       {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          camp::idx_t col =
+              i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
+          camp::idx_t row =
+              s_elements_per_register * (i - (col * s_minor_dim_registers));
+          m_registers[i].store_strided(
+              ptr + row * row_stride + col * col_stride, row_stride);
+        }
+      }
+      // less than one register per column
+      else
+      {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          element_type* ptr_i = ptr + i * col_stride * s_major_dim_per_register;
+          m_registers[i].segmented_store(ptr_i, s_segbits, row_stride,
+                                         col_stride);
+        }
+      }
+    }
 
+    return *this;
+  }
 
-        if(layout_type::is_row_major()){
+  /*!
+   * Store a dense partial matrix to memory
+   */
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type const& store_packed_nm(element_type* ptr,
+                                   int row_stride,
+                                   int col_stride,
+                                   int num_rows,
+                                   int num_cols) const
+  {
 
-          // one or more registers per column
-          if(s_minor_dim_registers){
 
-            for(camp::idx_t row = 0;row < num_rows;++ row){
-              for(camp::idx_t colreg = 0;colreg < s_minor_dim_registers; ++ colreg){
+    if (layout_type::is_row_major())
+    {
 
-                camp::idx_t reg = row*s_minor_dim_registers + colreg;
+      // one or more registers per row
+      if (s_minor_dim_registers)
+      {
 
-                camp::idx_t col0 = colreg*s_elements_per_register;
-                camp::idx_t offset = row*row_stride + col0;
+        for (camp::idx_t row = 0; row < num_rows; ++row)
+        {
+          for (camp::idx_t colreg = 0; colreg < s_minor_dim_registers; ++colreg)
+          {
 
-                // store a complete register
-                if(col0+s_elements_per_register <= num_cols){
-                  m_registers[reg].store_packed(ptr + offset);
-                }
+            camp::idx_t reg = row * s_minor_dim_registers + colreg;
 
-                // partial register at end of row
-                else{
-                  m_registers[reg].store_packed_n(ptr + offset, num_cols - col0);
+            camp::idx_t col0   = colreg * s_elements_per_register;
+            camp::idx_t offset = row * row_stride + col0;
 
-                  break; // end this row
-                }
-              }
+            // store a complete register
+            if (col0 + s_elements_per_register <= num_cols)
+            {
+              m_registers[reg].store_packed(ptr + offset);
             }
 
-          }
-          // more than one column per register
-          else{
-            // default to strided operation
-            return store_strided_nm(ptr, row_stride, col_stride, num_rows, num_cols);
+            // partial register at end of row
+            else
+            {
+              m_registers[reg].store_packed_n(ptr + offset, num_cols - col0);
+
+              break;  // end this row
+            }
           }
         }
-        // Do semi-dense store for column-major
-        else{
-
-          // one or more registers per column
-          if(s_minor_dim_registers){
-
-            for(camp::idx_t col = 0;col < num_cols;++ col){
-              for(camp::idx_t rowreg = 0;rowreg < s_minor_dim_registers; ++ rowreg){
+      }
+      // less than one register per row
+      else
+      {
+        // default to strided operation
+        return store_strided_nm(ptr, row_stride, col_stride, num_rows,
+                                num_cols);
+      }
+    }
+    // Do semi-dense store for column-major
+    else
+    {
 
-                camp::idx_t reg = col*s_minor_dim_registers + rowreg;
+      // one or more registers per column
+      if (s_minor_dim_registers)
+      {
 
-                camp::idx_t row0 = rowreg*s_elements_per_register;
-                camp::idx_t offset = col*col_stride + row0;
+        for (camp::idx_t col = 0; col < num_cols; ++col)
+        {
+          for (camp::idx_t rowreg = 0; rowreg < s_minor_dim_registers; ++rowreg)
+          {
 
-                // loading a complete register
-                if(row0+s_elements_per_register <= num_rows){
-                  m_registers[reg].store_packed(ptr + offset);
-                }
+            camp::idx_t reg = col * s_minor_dim_registers + rowreg;
 
-                // partial register at end of column
-                else{
-                  m_registers[reg].store_packed_n(ptr + offset, num_rows - row0);
+            camp::idx_t row0   = rowreg * s_elements_per_register;
+            camp::idx_t offset = col * col_stride + row0;
 
-                  break; // end this column
-                }
-              }
+            // store a complete register
+            if (row0 + s_elements_per_register <= num_rows)
+            {
+              m_registers[reg].store_packed(ptr + offset);
             }
 
-          }
-          // more than one column per register
-          else{
+            // partial register at end of column
+            else
+            {
+              m_registers[reg].store_packed_n(ptr + offset, num_rows - row0);
 
-            // default to strided operation
-            return store_strided_nm(ptr, row_stride, col_stride, num_rows, num_cols);
+              break;  // end this column
+            }
           }
         }
-
-        return *this;
       }
-
-      /*!
-       * Store a strided partial matrix to memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_strided_nm(element_type *ptr,
-          int row_stride, int col_stride,
-          int num_rows, int num_cols) const
+      // more than one column per register
+      else
       {
 
+        // default to strided operation
+        return store_strided_nm(ptr, row_stride, col_stride, num_rows,
+                                num_cols);
+      }
+    }
 
-        if(layout_type::is_row_major()){
-          // one or more registers per row
-          if(s_minor_dim_registers){
-
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              camp::idx_t row = i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
-              if(row < num_rows){
-                camp::idx_t col = s_elements_per_register * (i - (row*s_minor_dim_registers));
+    return *this;
+  }
 
+  /*!
+   * Store a strided partial matrix to memory
+   */
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type const& store_strided_nm(element_type* ptr,
+                                    int row_stride,
+                                    int col_stride,
+                                    int num_rows,
+                                    int num_cols) const
+  {
 
-                camp::idx_t reg_num_cols = s_elements_per_register;
-                if(reg_num_cols+col > num_cols){
-                  reg_num_cols = num_cols-col;
-                  m_registers[i].store_strided_n(ptr+row*row_stride+col*col_stride, col_stride, reg_num_cols);
-                }
-                else{
-                  m_registers[i].store_strided(ptr+row*row_stride+col*col_stride, col_stride);
-                }
 
+    if (layout_type::is_row_major())
+    {
+      // one or more registers per row
+      if (s_minor_dim_registers)
+      {
 
-              }
-            }
-          }
-          // less than one register per row
-          else
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          camp::idx_t row =
+              i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
+          if (row < num_rows)
           {
+            camp::idx_t col =
+                s_elements_per_register * (i - (row * s_minor_dim_registers));
 
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              // figure out how many rows get loaded in this register
-              camp::idx_t reg_num_rows = num_rows - i*s_major_dim_per_register;
-              reg_num_rows = reg_num_rows > s_major_dim_per_register ? s_major_dim_per_register : reg_num_rows;
 
-              element_type *ptr_i = ptr + i * row_stride*s_major_dim_per_register;
-              m_registers[i].segmented_store_nm(ptr_i, s_segbits, col_stride, row_stride, num_cols, reg_num_rows);
+            camp::idx_t reg_num_cols = s_elements_per_register;
+            if (reg_num_cols + col > num_cols)
+            {
+              reg_num_cols = num_cols - col;
+              m_registers[i].store_strided_n(ptr + row * row_stride +
+                                                 col * col_stride,
+                                             col_stride, reg_num_cols);
+            }
+            else
+            {
+              m_registers[i].store_strided(
+                  ptr + row * row_stride + col * col_stride, col_stride);
             }
           }
         }
+      }
+      // less than one register per row
+      else
+      {
 
-        // column major
-        else{
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          // figure out how many rows get stored from this register
+          camp::idx_t reg_num_rows = num_rows - i * s_major_dim_per_register;
+          reg_num_rows             = reg_num_rows > s_major_dim_per_register
+                                         ? s_major_dim_per_register
+                                         : reg_num_rows;
+
+          element_type* ptr_i = ptr + i * row_stride * s_major_dim_per_register;
+          m_registers[i].segmented_store_nm(ptr_i, s_segbits, col_stride,
+                                            row_stride, num_cols, reg_num_rows);
+        }
+      }
+    }
 
-          // one or more registers per column
-          if(s_minor_dim_registers){
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              camp::idx_t col = i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
-              if(col < num_cols){
-                camp::idx_t row = s_elements_per_register * (i - (col*s_minor_dim_registers));
-
-                camp::idx_t reg_num_rows = s_elements_per_register;
-                if(reg_num_rows+row > num_rows){
-                  reg_num_rows = num_rows-row;
-                  m_registers[i].store_strided_n(ptr+row*row_stride+col*col_stride, row_stride, reg_num_rows);
-                }
-                else{
-                  m_registers[i].store_strided(ptr+row*row_stride+col*col_stride, row_stride);
-                }
-              }
-            }
-          }
-          // less than one register per column
-          else
+    // column major
+    else
+    {
+
+      // one or more registers per column
+      if (s_minor_dim_registers)
+      {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          camp::idx_t col =
+              i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
+          if (col < num_cols)
           {
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              // figure out how many columns get loaded in this register
-              camp::idx_t reg_num_cols = num_cols - i*s_major_dim_per_register;
-              reg_num_cols = reg_num_cols > s_major_dim_per_register ? s_major_dim_per_register : reg_num_cols;
+            camp::idx_t row =
+                s_elements_per_register * (i - (col * s_minor_dim_registers));
 
-              element_type *ptr_i = ptr + i * col_stride*s_major_dim_per_register;
-              m_registers[i].segmented_store_nm(ptr_i, s_segbits, row_stride, col_stride, num_rows, reg_num_cols);
+            camp::idx_t reg_num_rows = s_elements_per_register;
+            if (reg_num_rows + row > num_rows)
+            {
+              reg_num_rows = num_rows - row;
+              m_registers[i].store_strided_n(ptr + row * row_stride +
+                                                 col * col_stride,
+                                             row_stride, reg_num_rows);
+            }
+            else
+            {
+              m_registers[i].store_strided(
+                  ptr + row * row_stride + col * col_stride, row_stride);
             }
           }
         }
-
-        return *this;
       }
+      // less than one register per column
+      else
+      {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          // figure out how many columns get stored from this register
+          camp::idx_t reg_num_cols = num_cols - i * s_major_dim_per_register;
+          reg_num_cols             = reg_num_cols > s_major_dim_per_register
+                                         ? s_major_dim_per_register
+                                         : reg_num_cols;
+
+          element_type* ptr_i = ptr + i * col_stride * s_major_dim_per_register;
+          m_registers[i].segmented_store_nm(ptr_i, s_segbits, row_stride,
+                                            col_stride, num_rows, reg_num_cols);
+        }
+      }
+    }
 
+    return *this;
+  }
 
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_nm(self_type const &mat, int num_rows, int num_cols) const {
-        self_type result;
-
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
 
-        if(layout_type::is_row_major()){
-          // one or more registers per row
-          if(s_minor_dim_registers){
+  RAJA_INLINE
+  self_type divide_nm(self_type const& mat, int num_rows, int num_cols) const
+  {
+    self_type result;
 
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              camp::idx_t row = i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
-              if(row < num_rows){
-                camp::idx_t col = s_elements_per_register * (i - (row*s_minor_dim_registers));
 
+    if (layout_type::is_row_major())
+    {
+      // one or more registers per row
+      if (s_minor_dim_registers)
+      {
 
-                camp::idx_t reg_num_cols = s_elements_per_register;
-                if(reg_num_cols+col > num_cols){
-                  reg_num_cols = num_cols-col;
-                  result.m_registers[i] = m_registers[i].divide_n(mat.m_registers[i], reg_num_cols);
-                }
-                else{
-                  result.m_registers[i] = m_registers[i].divide(mat.m_registers[i]);
-                }
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          camp::idx_t row =
+              i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
+          if (row < num_rows)
+          {
+            camp::idx_t col =
+                s_elements_per_register * (i - (row * s_minor_dim_registers));
 
 
-              }
+            camp::idx_t reg_num_cols = s_elements_per_register;
+            if (reg_num_cols + col > num_cols)
+            {
+              reg_num_cols = num_cols - col;
+              result.m_registers[i] =
+                  m_registers[i].divide_n(mat.m_registers[i], reg_num_cols);
+            }
+            else
+            {
+              result.m_registers[i] = m_registers[i].divide(mat.m_registers[i]);
             }
           }
-          // less than one register per row
-          else
-          {
+        }
+      }
+      // less than one register per row
+      else
+      {
 
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              // figure out how many rows get loaded in this register
-              camp::idx_t reg_num_rows = num_rows - i*s_major_dim_per_register;
-              reg_num_rows = reg_num_rows > s_major_dim_per_register ? s_major_dim_per_register : reg_num_rows;
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          // figure out how many rows get divided in this register
+          camp::idx_t reg_num_rows = num_rows - i * s_major_dim_per_register;
+          reg_num_rows             = reg_num_rows > s_major_dim_per_register
+                                         ? s_major_dim_per_register
+                                         : reg_num_rows;
 
-              result.m_registers[i] = m_registers[i].segmented_divide_nm(mat.m_registers[i], s_segbits, num_cols, reg_num_rows);
-            }
-          }
+          result.m_registers[i] = m_registers[i].segmented_divide_nm(
+              mat.m_registers[i], s_segbits, num_cols, reg_num_rows);
         }
+      }
+    }
 
-        // column major
-        else{
+    // column major
+    else
+    {
 
-          // one or more registers per column
-          if(s_minor_dim_registers){
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              camp::idx_t col = i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
-              if(col < num_cols){
-                camp::idx_t row = s_elements_per_register * (i - (col*s_minor_dim_registers));
-
-                camp::idx_t reg_num_rows = s_elements_per_register;
-                if(reg_num_rows+row > num_rows){
-                  reg_num_rows = num_rows-row;
-                  result.m_registers[i] = m_registers[i].divide_n(mat.m_registers[i], reg_num_rows);
-                }
-                else{
-                  result.m_registers[i] = m_registers[i].divide(mat.m_registers[i]);
-                }
-              }
-            }
-          }
-          // less than one register per column
-          else
+      // one or more registers per column
+      if (s_minor_dim_registers)
+      {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          camp::idx_t col =
+              i / (s_minor_dim_registers ? s_minor_dim_registers : 1);
+          if (col < num_cols)
           {
-            for(camp::idx_t i = 0;i < s_num_registers;++ i){
-              // figure out how many columns get loaded in this register
-              camp::idx_t reg_num_cols = num_cols - i*s_major_dim_per_register;
-              reg_num_cols = reg_num_cols > s_major_dim_per_register ? s_major_dim_per_register : reg_num_cols;
+            camp::idx_t row =
+                s_elements_per_register * (i - (col * s_minor_dim_registers));
 
-              result.m_registers[i] = m_registers[i].segmented_divide_nm(mat.m_registers[i], s_segbits, num_rows, reg_num_cols);
+            camp::idx_t reg_num_rows = s_elements_per_register;
+            if (reg_num_rows + row > num_rows)
+            {
+              reg_num_rows = num_rows - row;
+              result.m_registers[i] =
+                  m_registers[i].divide_n(mat.m_registers[i], reg_num_rows);
+            }
+            else
+            {
+              result.m_registers[i] = m_registers[i].divide(mat.m_registers[i]);
             }
           }
         }
-
-
-        return result;
       }
+      // less than one register per column
+      else
+      {
+        for (camp::idx_t i = 0; i < s_num_registers; ++i)
+        {
+          // figure out how many columns get divided in this register
+          camp::idx_t reg_num_cols = num_cols - i * s_major_dim_per_register;
+          reg_num_cols             = reg_num_cols > s_major_dim_per_register
+                                         ? s_major_dim_per_register
+                                         : reg_num_cols;
+
+          result.m_registers[i] = m_registers[i].segmented_divide_nm(
+              mat.m_registers[i], s_segbits, num_rows, reg_num_cols);
+        }
+      }
+    }
 
 
+    return result;
+  }
 
-      /*!
-       * Matrix transpose, keeping layout
-       *
-       * Transpose is not completely implemented
-       */
+  /*!
+   * Matrix transpose, keeping layout
+   *
+   * Transpose is not completely implemented
+   */
 #if 0
       RAJA_HOST_DEVICE
       RAJA_INLINE
@@ -1291,386 +1523,430 @@ namespace expt
         return reinterpret_cast<transpose_tensor_type const &>(*this);
       }
 #endif
-      /*!
-       * Matrix vector product
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      column_vector_type right_multiply_vector(row_vector_type v) const {
-        column_vector_type result(0);
-        return right_multiply_vector_accumulate(v, result);
-      }
-
-      /*!
-       * Matrix vector product
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      row_vector_type left_multiply_vector(column_vector_type v) const {
-        row_vector_type result(0);
-        return left_multiply_vector_accumulate(v, result);
-      }
-
-
-      /*!
-       * Matrix vector product with accumulation into another vector
-       *
-       * acc += (this) * v
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      column_vector_type right_multiply_vector_accumulate(row_vector_type const &v, column_vector_type result) const {
-
-        if(layout_type::is_row_major()){
-
-          // 1 register is split over multiple rows
-          if(s_minor_dim_registers == 0){
-
-            // start by broadcasting the first segment in v across all of v
-            // we will use this term for all registers in the matrix
-            auto vv = v.get_register(0).segmented_broadcast_inner(s_segbits, 0);
-
-            // loop over output segments, which is also the number of
-            // registers in the matrix (no kidding!)
-            RAJA_UNROLL
-            for(camp::idx_t outseg = 0;outseg < s_num_registers;++ outseg){
+  /*!
+   * Matrix vector product
+   */
+  RAJA_HOST_DEVICE
 
-              // compute which result register we are accumulating into
-              camp::idx_t result_reg = outseg >> s_segbits;
+  RAJA_INLINE
+  column_vector_type right_multiply_vector(row_vector_type v) const
+  {
+    column_vector_type result(0);
+    return right_multiply_vector_accumulate(v, result);
+  }
 
-              // compute which segment within result_reg we are accumulating into
-              camp::idx_t result_seg = outseg - (result_reg<<s_segbits);
+  /*!
+   * Matrix vector product
+   */
+  RAJA_HOST_DEVICE
 
-              // compute segmented dot product to get output segment
-              auto value = m_registers[outseg].segmented_dot(s_segbits, result_seg, vv);
+  RAJA_INLINE
+  row_vector_type left_multiply_vector(column_vector_type v) const
+  {
+    row_vector_type result(0);
+    return left_multiply_vector_accumulate(v, result);
+  }
+
+  /*!
+   * Matrix vector product with accumulation into another vector
+   *
+   * acc += (this) * v
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
 
-              // accumulate result
-              result.get_register(result_reg) += value;
-            }
+  RAJA_INLINE
+  column_vector_type right_multiply_vector_accumulate(
+      row_vector_type const& v,
+      column_vector_type result) const
+  {
 
-          }
-          // one or more registers per row
-          else{
+    if (layout_type::is_row_major())
+    {
 
-            // Loop over rows
-            camp::idx_t reg = 0;
-            RAJA_UNROLL
-            for(camp::idx_t row = 0;row < s_num_rows;++ row){
+      // 1 register is split over multiple rows
+      if (s_minor_dim_registers == 0)
+      {
 
-              // compute partial dot products for all registers in this row
-              auto rowsum = register_type(0);
-              RAJA_UNROLL
-              for(camp::idx_t colreg = 0;colreg < s_minor_dim_registers;++ colreg){
+        // start by broadcasting the first segment in v across all of v
+        // we will use this term for all registers in the matrix
+        auto vv = v.get_register(0).segmented_broadcast_inner(s_segbits, 0);
 
-                rowsum = m_registers[reg].multiply_add(v.get_register(colreg), rowsum);
-                reg ++;
+        // loop over output segments, which is also the number of
+        // registers in the matrix (no kidding!)
+        RAJA_UNROLL
+        for (camp::idx_t outseg = 0; outseg < s_num_registers; ++outseg)
+        {
 
-              } // rowreg
+          // compute which result register we are accumulating into
+          camp::idx_t result_reg = outseg >> s_segbits;
 
-              // finish dot product by taking sum of rowsum
-              auto value = result.get(row) + rowsum.sum();
-              result.set(value, row);
+          // compute which segment within result_reg we are accumulating into
+          camp::idx_t result_seg = outseg - (result_reg << s_segbits);
 
-            } // row
-          }
+          // compute segmented dot product to get output segment
+          auto value =
+              m_registers[outseg].segmented_dot(s_segbits, result_seg, vv);
 
+          // accumulate result
+          result.get_register(result_reg) += value;
         }
-        else{
-
-
-          // 1 register is split over multiple columns
-          if(s_minor_dim_registers == 0){
-
-            auto &mv = result.get_register(0);
-
-            // Loop over registers, which are also the segments in v
-            RAJA_UNROLL
-            for(camp::idx_t m_reg = 0;m_reg < s_num_registers;++ m_reg){
-              camp::idx_t v_reg = m_reg >> s_segbits;
-              camp::idx_t v_seg = m_reg & ( (1<<s_segbits) - 1);
+      }
+      // one or more registers per row
+      else
+      {
 
-              auto v_tmp = v.get_register(v_reg).segmented_broadcast_outer(s_segbits, v_seg);
-              mv = m_registers[m_reg].multiply_add(v_tmp, mv);
+        // Loop over rows
+        camp::idx_t reg = 0;
+        RAJA_UNROLL
+        for (camp::idx_t row = 0; row < s_num_rows; ++row)
+        {
 
-            }
+          // compute partial dot products for all registers in this row
+          auto rowsum = register_type(0);
+          RAJA_UNROLL
+          for (camp::idx_t colreg = 0; colreg < s_minor_dim_registers; ++colreg)
+          {
 
-            // Now sum segments in mv together to form final result
-            mv = mv.segmented_sum_outer(s_segbits, 0);
+            rowsum =
+                m_registers[reg].multiply_add(v.get_register(colreg), rowsum);
+            reg++;
 
-          }
-          // one or more registers per column
-          else{
+          }  // colreg
 
-            // Loop over columns (which is also registers)
-            camp::idx_t reg = 0;
-            RAJA_UNROLL
-            for(camp::idx_t col = 0;col < s_num_columns;++ col){
+          // finish dot product by taking sum of rowsum
+          auto value = result.get(row) + rowsum.sum();
+          result.set(value, row);
 
-              // extract column value from v
-              auto v_col = register_type(v.get(col));
+        }  // row
+      }
+    }
+    else
+    {
 
-              // apply v_col to entire column (1 or more registers)
-              RAJA_UNROLL
-              for(camp::idx_t rowreg = 0;rowreg < s_minor_dim_registers;++ rowreg){
 
-                auto &mv = result.get_register(rowreg);
-                mv = m_registers[reg].multiply_add(v_col, mv);
+      // 1 register is split over multiple columns
+      if (s_minor_dim_registers == 0)
+      {
 
-                reg ++;
+        auto& mv = result.get_register(0);
 
-              } // rowreg
-            } // col
-          }
+        // Loop over registers, which are also the segments in v
+        RAJA_UNROLL
+        for (camp::idx_t m_reg = 0; m_reg < s_num_registers; ++m_reg)
+        {
+          camp::idx_t v_reg = m_reg >> s_segbits;
+          camp::idx_t v_seg = m_reg & ((1 << s_segbits) - 1);
 
+          auto v_tmp =
+              v.get_register(v_reg).segmented_broadcast_outer(s_segbits, v_seg);
+          mv = m_registers[m_reg].multiply_add(v_tmp, mv);
         }
-        return result;
-      }
 
-      /*!
-       * Matrix vector product with accumulation into another vector
-       *
-       * acc += v * (this)
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      row_vector_type left_multiply_vector_accumulate(column_vector_type const &v, row_vector_type result) const {
-
-        if(layout_type::is_row_major()){
-
-          // 1 register is split over multiple columns
-          if(s_minor_dim_registers == 0){
-            auto &vm = result.get_register(0);
+        // Now sum segments in mv together to form final result
+        mv = mv.segmented_sum_outer(s_segbits, 0);
+      }
+      // one or more registers per column
+      else
+      {
 
-            // Loop over registers, which are also the segments in v
-            RAJA_UNROLL
-            for(camp::idx_t m_reg = 0;m_reg < s_num_registers;++ m_reg){
-              camp::idx_t v_reg = m_reg >> s_segbits;
-              camp::idx_t v_seg = m_reg & ( (1<<s_segbits) - 1);
+        // Loop over columns (which is also registers)
+        camp::idx_t reg = 0;
+        RAJA_UNROLL
+        for (camp::idx_t col = 0; col < s_num_columns; ++col)
+        {
 
-              auto v_tmp = v.get_register(v_reg).segmented_broadcast_outer(s_segbits, v_seg);
-              vm = m_registers[m_reg].multiply_add(v_tmp, vm);
+          // extract column value from v
+          auto v_col = register_type(v.get(col));
 
-            }
+          // apply v_col to entire column (1 or more registers)
+          RAJA_UNROLL
+          for (camp::idx_t rowreg = 0; rowreg < s_minor_dim_registers; ++rowreg)
+          {
 
-            // Now sum segments in mv together to form final result
-            vm = vm.segmented_sum_outer(s_segbits, 0);
+            auto& mv = result.get_register(rowreg);
+            mv       = m_registers[reg].multiply_add(v_col, mv);
 
-          }
-          // one or more registers per row
-          else{
+            reg++;
 
-            // Loop over rows
-            camp::idx_t reg = 0;
-            RAJA_UNROLL
-            for(camp::idx_t row = 0;row < s_num_rows;++ row){
-              auto lhs_bcat = register_type(v.get(row));
-              RAJA_UNROLL
-              for(camp::idx_t colreg = 0;colreg < s_minor_dim_registers;++ colreg){
+          }  // rowreg
+        }    // col
+      }
+    }
+    return result;
+  }
+
+  /*!
+   * Matrix vector product with accumulation into another vector
+   *
+   * acc += v * (this)
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
 
-                result.get_register(colreg) =
-                    m_registers[reg].multiply_add(lhs_bcat, result.get_register(colreg));
-                reg ++;
+  RAJA_INLINE
+  row_vector_type left_multiply_vector_accumulate(column_vector_type const& v,
+                                                  row_vector_type result) const
+  {
 
-              } // rowreg
+    if (layout_type::is_row_major())
+    {
 
-            }
+      // 1 register is split over multiple rows
+      if (s_minor_dim_registers == 0)
+      {
+        auto& vm = result.get_register(0);
+
+        // Loop over registers, which are also the segments in v
+        RAJA_UNROLL
+        for (camp::idx_t m_reg = 0; m_reg < s_num_registers; ++m_reg)
+        {
+          camp::idx_t v_reg = m_reg >> s_segbits;
+          camp::idx_t v_seg = m_reg & ((1 << s_segbits) - 1);
+
+          auto v_tmp =
+              v.get_register(v_reg).segmented_broadcast_outer(s_segbits, v_seg);
+          vm = m_registers[m_reg].multiply_add(v_tmp, vm);
+        }
 
-          }
+        // Now sum segments in vm together to form final result
+        vm = vm.segmented_sum_outer(s_segbits, 0);
+      }
+      // one or more registers per row
+      else
+      {
 
+        // Loop over rows
+        camp::idx_t reg = 0;
+        RAJA_UNROLL
+        for (camp::idx_t row = 0; row < s_num_rows; ++row)
+        {
+          auto lhs_bcat = register_type(v.get(row));
+          RAJA_UNROLL
+          for (camp::idx_t colreg = 0; colreg < s_minor_dim_registers; ++colreg)
+          {
 
-        } // row-major
+            result.get_register(colreg) = m_registers[reg].multiply_add(
+                lhs_bcat, result.get_register(colreg));
+            reg++;
 
-        // Column-major:
-        else{
-          // 1 register is split over multiple rows
-          if(s_minor_dim_registers == 0){
+          }  // rowreg
+        }
+      }
 
-            // start by broadcasting the first segment in v across all of v
-            // we will use this term for all registers in the matrix
-            auto vv = v.get_register(0).segmented_broadcast_inner(s_segbits, 0);
 
-            // loop over output segments, which is also the number of
-            // registers in the matrix (no kidding!)
-            RAJA_UNROLL
-            for(camp::idx_t outseg = 0;outseg < s_num_registers;++ outseg){
+    }  // row-major
 
-              // compute which result register we are accumulating into
-              camp::idx_t result_reg = outseg >> s_segbits;
+    // Column-major:
+    else
+    {
+      // 1 register is split over multiple rows
+      if (s_minor_dim_registers == 0)
+      {
 
-              // compute which segment within result_reg we are accumulating into
-              camp::idx_t result_seg = outseg - (result_reg<<s_segbits);
+        // start by broadcasting the first segment in v across all of v
+        // we will use this term for all registers in the matrix
+        auto vv = v.get_register(0).segmented_broadcast_inner(s_segbits, 0);
 
-              // compute segmented dot product to get output segment
-              auto value = m_registers[outseg].segmented_dot(s_segbits, result_seg, vv);
+        // loop over output segments, which is also the number of
+        // registers in the matrix (no kidding!)
+        RAJA_UNROLL
+        for (camp::idx_t outseg = 0; outseg < s_num_registers; ++outseg)
+        {
 
-              // accumulate result
-              result.get_register(result_reg) += value;
-            }
+          // compute which result register we are accumulating into
+          camp::idx_t result_reg = outseg >> s_segbits;
 
-          }
-          // one or more registers per column
-          else{
-            // Loop over rows
-            camp::idx_t reg = 0;
-            RAJA_UNROLL
-            for(camp::idx_t col = 0;col < s_num_columns;++ col){
-
-              // compute partial dot products for all registers in this row
-              auto colsum = register_type(0);
-              RAJA_UNROLL
-              for(camp::idx_t rowreg = 0;rowreg < s_minor_dim_registers;++ rowreg){
-                colsum = m_registers[reg].multiply_add(v.get_register(rowreg), colsum);
-                reg ++;
-
-              } // rowreg
-
-              // finish dot product by taking sum of rowsum
-              auto value = result.get(col) + colsum.sum();
-              result.set(value, col);
-
-            } // col
-          }
+          // compute which segment within result_reg we are accumulating into
+          camp::idx_t result_seg = outseg - (result_reg << s_segbits);
 
+          // compute segmented dot product to get output segment
+          auto value =
+              m_registers[outseg].segmented_dot(s_segbits, result_seg, vv);
 
-        } // col-major
-        return result;
+          // accumulate result
+          result.get_register(result_reg) += value;
+        }
       }
+      // one or more registers per column
+      else
+      {
+        // Loop over columns
+        camp::idx_t reg = 0;
+        RAJA_UNROLL
+        for (camp::idx_t col = 0; col < s_num_columns; ++col)
+        {
+
+          // compute partial dot products for all registers in this column
+          auto colsum = register_type(0);
+          RAJA_UNROLL
+          for (camp::idx_t rowreg = 0; rowreg < s_minor_dim_registers; ++rowreg)
+          {
+            colsum =
+                m_registers[reg].multiply_add(v.get_register(rowreg), colsum);
+            reg++;
 
+          }  // rowreg
 
+          // finish dot product by taking sum of colsum
+          auto value = result.get(col) + colsum.sum();
+          result.set(value, col);
 
-
-
-      /*!
-       * Matrix-Matrix product
-       */
-      template<typename RMAT>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      typename RAJA::internal::expt::MatrixMatrixMultiplyHelper<self_type, RMAT>::result_type
-      matrix_multiply(RMAT const &mat) const {
-        typename RAJA::internal::expt::MatrixMatrixMultiplyHelper<self_type, RMAT>::result_type res(0);
-        RAJA::internal::expt::MatrixMatrixMultiplyHelper<self_type,RMAT>::multiply(*this, mat, res);
-        return res;
+        }  // col
       }
 
-      /*!
-       * Matrix-Matrix multiply add
-       */
-      template<typename RMAT>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      typename RAJA::internal::expt::MatrixMatrixMultiplyHelper<self_type, RMAT>::result_type
-      matrix_multiply_add(RMAT const &B, typename RAJA::internal::expt::MatrixMatrixMultiplyHelper<self_type, RMAT>::result_type const &C) const {
-        typename RAJA::internal::expt::MatrixMatrixMultiplyHelper<self_type, RMAT>::result_type res(C);
-        RAJA::internal::expt::MatrixMatrixMultiplyHelper<self_type,RMAT>::multiply_accumulate(*this, B, res);
-        return res;
-      }
 
-      /*!
-       * Matrix-Matrix multiply accumulate
-       */
-      template<typename ACCMAT, typename RMAT>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      void
-      matrix_multiply_accumulate(ACCMAT &acc, RMAT const &B) const {
-        RAJA::internal::expt::MatrixMatrixMultiplyHelper<self_type,RMAT>::multiply_accumulate(*this, B, acc);
-      }
+    }  // col-major
+    return result;
+  }
 
+  /*!
+   * Matrix-Matrix product
+   */
+  template<typename RMAT>
+  RAJA_HOST_DEVICE RAJA_INLINE typename RAJA::internal::expt::
+      MatrixMatrixMultiplyHelper<self_type, RMAT>::result_type
+      matrix_multiply(RMAT const& mat) const
+  {
+    typename RAJA::internal::expt::MatrixMatrixMultiplyHelper<
+        self_type, RMAT>::result_type res(0);
+    RAJA::internal::expt::MatrixMatrixMultiplyHelper<self_type, RMAT>::multiply(
+        *this, mat, res);
+    return res;
+  }
+
+  /*!
+   * Matrix-Matrix multiply add
+   */
+  template<typename RMAT>
+  RAJA_HOST_DEVICE RAJA_INLINE typename RAJA::internal::expt::
+      MatrixMatrixMultiplyHelper<self_type, RMAT>::result_type
+      matrix_multiply_add(
+          RMAT const& B,
+          typename RAJA::internal::expt::MatrixMatrixMultiplyHelper<
+              self_type,
+              RMAT>::result_type const& C) const
+  {
+    typename RAJA::internal::expt::MatrixMatrixMultiplyHelper<
+        self_type, RMAT>::result_type res(C);
+    RAJA::internal::expt::MatrixMatrixMultiplyHelper<
+        self_type, RMAT>::multiply_accumulate(*this, B, res);
+    return res;
+  }
+
+  /*!
+   * Matrix-Matrix multiply accumulate
+   */
+  template<typename ACCMAT, typename RMAT>
+  RAJA_HOST_DEVICE RAJA_INLINE void matrix_multiply_accumulate(
+      ACCMAT& acc,
+      RMAT const& B) const
+  {
+    RAJA::internal::expt::MatrixMatrixMultiplyHelper<
+        self_type, RMAT>::multiply_accumulate(*this, B, acc);
+  }
 
+  RAJA_HOST_DEVICE
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &set(element_type val, int row, int col){
-        m_registers[to_register(row, col)].set(val, to_lane(row,col));
-        return *this;
-      }
+  RAJA_INLINE
+  self_type& set(element_type val, int row, int col)
+  {
+    m_registers[to_register(row, col)].set(val, to_lane(row, col));
+    return *this;
+  }
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      element_type get(int row, int col) const {
-        return m_registers[to_register(row, col)].get(to_lane(row,col));
-      }
+  RAJA_HOST_DEVICE
 
+  RAJA_INLINE
+  element_type get(int row, int col) const
+  {
+    return m_registers[to_register(row, col)].get(to_lane(row, col));
+  }
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      register_type extract_diagonal_register(camp::idx_t starting_column, camp::idx_t segbits, camp::idx_t segment) const {
+  RAJA_HOST_DEVICE
 
-        register_type result(0);
+  RAJA_INLINE
+  register_type extract_diagonal_register(camp::idx_t starting_column,
+                                          camp::idx_t segbits,
+                                          camp::idx_t segment) const
+  {
 
-        camp::idx_t num_rows = register_type::s_num_elem >> segbits;
-        camp::idx_t num_repeats = 1 << segbits;
+    register_type result(0);
 
-        camp::idx_t col0 = (starting_column + num_rows*segment)%s_num_columns;
-        camp::idx_t row0 = num_rows*segment;
+    camp::idx_t num_rows    = register_type::s_num_elem >> segbits;
+    camp::idx_t num_repeats = 1 << segbits;
 
-        for(camp::idx_t i = 0;i < num_rows;++i){
-          camp::idx_t col = (col0 + i) % s_num_columns;
-          camp::idx_t row = row0 + i;
-          auto value = get(row,col);
-          for(camp::idx_t j = 0;j < num_repeats;++j){
-            result.set(value, (i<<segbits) + j);
-          }
-        }
+    camp::idx_t col0 = (starting_column + num_rows * segment) % s_num_columns;
+    camp::idx_t row0 = num_rows * segment;
 
-        return result;
+    for (camp::idx_t i = 0; i < num_rows; ++i)
+    {
+      camp::idx_t col = (col0 + i) % s_num_columns;
+      camp::idx_t row = row0 + i;
+      auto value      = get(row, col);
+      for (camp::idx_t j = 0; j < num_repeats; ++j)
+      {
+        result.set(value, (i << segbits) + j);
       }
+    }
 
+    return result;
+  }
 
-      /*!
-       * @brief Converts to matrix to a string
-       *
-       *
-       */
-      RAJA_INLINE
-      std::string to_string(bool one_line=false) const {
-        std::string s = "Matrix(" + std::to_string(s_num_rows) +
-            "x" + std::to_string(s_num_columns);
-        if(!one_line){
-          s +=")\n";
-        }
+  /*!
+   * @brief Converts the matrix to a string
+   *
+   *
+   */
+  RAJA_INLINE
+  std::string to_string(bool one_line = false) const
+  {
+    std::string s = "Matrix(" + std::to_string(s_num_rows) + "x" +
+                    std::to_string(s_num_columns);
+    if (!one_line)
+    {
+      s += ")\n";
+    }
 
 
-        s += "[ ";
+    s += "[ ";
 
-        //
-        for(camp::idx_t r = 0;r < s_num_rows; ++ r){
-          if(r > 0){
-            s += ", ";
-            if(!one_line){
-              s+= "\n  ";
-            }
-          }
-          s += "[";
-          for(camp::idx_t c = 0;c < s_num_columns; ++ c){
-            if(c > 0){
-              s += ", ";
-            }
-            s += std::to_string(this->get(r,c));
-          }
-          s += "]";
+    //
+    for (camp::idx_t r = 0; r < s_num_rows; ++r)
+    {
+      if (r > 0)
+      {
+        s += ", ";
+        if (!one_line)
+        {
+          s += "\n  ";
         }
-
-        s += " ]";
-        if(!one_line){
-          s+="\n";
+      }
+      s += "[";
+      for (camp::idx_t c = 0; c < s_num_columns; ++c)
+      {
+        if (c > 0)
+        {
+          s += ", ";
         }
-        return s;
+        s += std::to_string(this->get(r, c));
       }
+      s += "]";
+    }
 
-  }; // MatrixRegisterImpl
+    s += " ]";
+    if (!one_line)
+    {
+      s += "\n";
+    }
+    return s;
+  }
 
+};  // MatrixRegisterImpl
 
 
-
-
-} // namespace expt
+}  // namespace expt
 }  // namespace RAJA
 
 
-
-
 #endif
diff --git a/include/RAJA/pattern/tensor/internal/RegisterBase.hpp b/include/RAJA/pattern/tensor/internal/RegisterBase.hpp
index 3480fda10c..63a0da3f1e 100644
--- a/include/RAJA/pattern/tensor/internal/RegisterBase.hpp
+++ b/include/RAJA/pattern/tensor/internal/RegisterBase.hpp
@@ -33,1184 +33,1215 @@ namespace RAJA
 {
 namespace expt
 {
-  template<typename T, typename REGISTER_POLICY>
-  class Register;
+template<typename T, typename REGISTER_POLICY>
+class Register;
 }
 
 namespace internal
 {
 namespace expt
 {
-  class RegisterConcreteBase {};
+class RegisterConcreteBase
+{};
 
+/*
+ * Overload for:    arithmetic + TensorRegister
 
-  /*
-   * Overload for:    arithmetic + TensorRegister
-
-   */
-  template<typename LEFT, typename RIGHT,
+ */
+template<
+    typename LEFT,
+    typename RIGHT,
     typename std::enable_if<std::is_arithmetic<LEFT>::value, bool>::type = true,
-    typename std::enable_if<std::is_base_of<RegisterConcreteBase, RIGHT>::value, bool>::type = true>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  RIGHT operator+(LEFT const &lhs, RIGHT const &rhs)
-  {
-    return RIGHT(lhs).add(rhs);
-  }
+    typename std::enable_if<std::is_base_of<RegisterConcreteBase, RIGHT>::value,
+                            bool>::type                                  = true>
+RAJA_INLINE RAJA_HOST_DEVICE RIGHT operator+(LEFT const& lhs, RIGHT const& rhs)
+{
+  return RIGHT(lhs).add(rhs);
+}
 
-  /*
-   * Overload for:    arithmetic - TensorRegister
+/*
+ * Overload for:    arithmetic - TensorRegister
 
-   */
-  template<typename LEFT, typename RIGHT,
+ */
+template<
+    typename LEFT,
+    typename RIGHT,
     typename std::enable_if<std::is_arithmetic<LEFT>::value, bool>::type = true,
-    typename std::enable_if<std::is_base_of<RegisterConcreteBase, RIGHT>::value, bool>::type = true>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  RIGHT operator-(LEFT const &lhs, RIGHT const &rhs)
-  {
-    return RIGHT(lhs).subtract(rhs);
-  }
+    typename std::enable_if<std::is_base_of<RegisterConcreteBase, RIGHT>::value,
+                            bool>::type                                  = true>
+RAJA_INLINE RAJA_HOST_DEVICE RIGHT operator-(LEFT const& lhs, RIGHT const& rhs)
+{
+  return RIGHT(lhs).subtract(rhs);
+}
 
-  /*
-   * Overload for:    arithmetic * TensorRegister
+/*
+ * Overload for:    arithmetic * TensorRegister
 
-   */
-  template<typename LEFT, typename RIGHT,
+ */
+template<
+    typename LEFT,
+    typename RIGHT,
     typename std::enable_if<std::is_arithmetic<LEFT>::value, bool>::type = true,
-    typename std::enable_if<std::is_base_of<RegisterConcreteBase, RIGHT>::value, bool>::type = true>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  RIGHT operator*(LEFT const &lhs, RIGHT const &rhs)
-  {
-    return rhs.scale(lhs);
-  }
+    typename std::enable_if<std::is_base_of<RegisterConcreteBase, RIGHT>::value,
+                            bool>::type                                  = true>
+RAJA_INLINE RAJA_HOST_DEVICE RIGHT operator*(LEFT const& lhs, RIGHT const& rhs)
+{
+  return rhs.scale(lhs);
+}
 
-  /*
-   * Overload for:    arithmetic / TensorRegister
+/*
+ * Overload for:    arithmetic / TensorRegister
 
-   */
-  template<typename LEFT, typename RIGHT,
+ */
+template<
+    typename LEFT,
+    typename RIGHT,
     typename std::enable_if<std::is_arithmetic<LEFT>::value, bool>::type = true,
-    typename std::enable_if<std::is_base_of<RegisterConcreteBase, RIGHT>::value, bool>::type = true>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  RIGHT operator/(LEFT const &lhs, RIGHT const &rhs)
-  {
-    return RIGHT(lhs).divide(rhs);
-  }
-
-
+    typename std::enable_if<std::is_base_of<RegisterConcreteBase, RIGHT>::value,
+                            bool>::type                                  = true>
+RAJA_INLINE RAJA_HOST_DEVICE RIGHT operator/(LEFT const& lhs, RIGHT const& rhs)
+{
+  return RIGHT(lhs).divide(rhs);
+}
 
 
+/*!
+ * Register base class that provides some default behaviors and simplifies
+ * the implementation of new register types.
+ *
+ * This uses CRTP to provide static polymorphism
+ */
+template<typename Derived>
+class RegisterBase;
 
-  /*!
-   * Register base class that provides some default behaviors and simplifies
-   * the implementation of new register types.
-   *
-   * This uses CRTP to provide static polymorphism
-   */
-  template<typename Derived>
-  class RegisterBase;
+template<typename T, typename REGISTER_POLICY>
+class RegisterBase<RAJA::expt::Register<T, REGISTER_POLICY>>
+    : public RegisterConcreteBase
+{
+public:
+  using self_type    = RAJA::expt::Register<T, REGISTER_POLICY>;
+  using element_type = camp::decay<T>;
 
-  template<typename T, typename REGISTER_POLICY>
-  class RegisterBase<RAJA::expt::Register<T, REGISTER_POLICY>> :
-    public RegisterConcreteBase
-  {
-    public:
-      using self_type = RAJA::expt::Register<T, REGISTER_POLICY>;
-      using element_type = camp::decay<T>;
+  using index_type = camp::idx_t;
 
-      using index_type = camp::idx_t;
+  using int_element_type =
+      typename RegisterTraits<REGISTER_POLICY, T>::int_element_type;
+  using int_vector_type =
+      RAJA::expt::Register<int_element_type, REGISTER_POLICY>;
 
-      using int_element_type = typename RegisterTraits<REGISTER_POLICY, T>::int_element_type;
-      using int_vector_type = RAJA::expt::Register<int_element_type, REGISTER_POLICY>;
+private:
+  RAJA_HOST_DEVICE
 
-    private:
+  RAJA_INLINE
+  self_type* getThis() { return static_cast<self_type*>(this); }
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type *getThis(){
-        return static_cast<self_type *>(this);
-      }
+  RAJA_HOST_DEVICE
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      self_type const *getThis() const{
-        return static_cast<self_type const *>(this);
-      }
+  RAJA_INLINE
+  constexpr self_type const* getThis() const
+  {
+    return static_cast<self_type const*>(this);
+  }
 
-    public:
+public:
+  RAJA_HOST_DEVICE
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      constexpr
-      bool is_root() {
-        return true;
-      }
+  RAJA_INLINE
+  static constexpr bool is_root() { return true; }
 
+  RAJA_HOST_DEVICE
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      RegisterBase(){}
+  RAJA_INLINE
+  constexpr RegisterBase() {}
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      ~RegisterBase(){}
+  RAJA_HOST_DEVICE
 
+  RAJA_INLINE
+  ~RegisterBase() {}
 
+  RAJA_HOST_DEVICE
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      RegisterBase(RegisterBase const &){}
+  RAJA_INLINE
+  constexpr RegisterBase(RegisterBase const&) {}
 
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      RegisterBase(self_type const &){
-      }
+  RAJA_INLINE
 
+  RAJA_HOST_DEVICE
+  constexpr RegisterBase(self_type const&) {}
 
+  /*!
+   * @brief Broadcast scalar value to first N register elements
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
 
-      /*!
-       * @brief Broadcast scalar value to first N register elements
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      self_type s_broadcast_n(element_type const &value, camp::idx_t N){
-        self_type x;
-        for(camp::idx_t i = 0;i < N;++ i){
-          x.set(value, i);
-        }
-        return x;
-      }
+  RAJA_INLINE
+  static self_type s_broadcast_n(element_type const& value, camp::idx_t N)
+  {
+    self_type x;
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      x.set(value, i);
+    }
+    return x;
+  }
 
-      /*!
-       * @brief Extracts a scalar value and broadcasts to a new register
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type get_and_broadcast(int i) const {
-        self_type x;
-        x.broadcast(getThis()->get(i));
-        return x;
-      }
+  /*!
+   * @brief Extracts a scalar value and broadcasts to a new register
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
 
+  RAJA_INLINE
+  self_type get_and_broadcast(int i) const
+  {
+    self_type x;
+    x.broadcast(getThis()->get(i));
+    return x;
+  }
 
-      /*!
-       * @brief Generic gather operation for full vector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      template<typename T2>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &gather(element_type const *ptr, RAJA::expt::Register<T2, REGISTER_POLICY> offsets){
+  /*!
+   * @brief Generic gather operation for full vector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  template<typename T2>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type& gather(
+      element_type const* ptr,
+      RAJA::expt::Register<T2, REGISTER_POLICY> offsets)
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_load_strided_n ++;
+    RAJA::tensor_stats::num_vector_load_strided_n++;
 #endif
-        for(camp::idx_t i = 0;i < self_type::s_num_elem;++ i){
-          getThis()->set(ptr[offsets.get(i)], i);
-        }
-        return *getThis();
-      }
-
+    for (camp::idx_t i = 0; i < self_type::s_num_elem; ++i)
+    {
+      getThis()->set(ptr[offsets.get(i)], i);
+    }
+    return *getThis();
+  }
 
-      /*!
-       * @brief Generic gather operation for n-length subvector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      template<typename T2>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &gather_n(element_type const *ptr, RAJA::expt::Register<T2, REGISTER_POLICY> const &offsets, camp::idx_t N){
+  /*!
+   * @brief Generic gather operation for n-length subvector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  template<typename T2>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type& gather_n(
+      element_type const* ptr,
+      RAJA::expt::Register<T2, REGISTER_POLICY> const& offsets,
+      camp::idx_t N)
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_load_strided_n ++;
+    RAJA::tensor_stats::num_vector_load_strided_n++;
 #endif
-          for(camp::idx_t i = 0;i < N;++ i){
-            getThis()->set(ptr[offsets.get(i)], i);
-          }
-          return *getThis();
-      }
-
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      getThis()->set(ptr[offsets.get(i)], i);
+    }
+    return *getThis();
+  }
 
-      /*!
-       * @brief Generic segmented load operation used for loading sub-matrices
-       * from larger arrays.
-       *
-       * The default operation combines the s_segmented_offsets and gather
-       * operations.
-       *
-       *
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &segmented_load(element_type const *ptr, camp::idx_t segbits, camp::idx_t stride_inner, camp::idx_t stride_outer){
-        getThis()->gather(ptr, self_type::s_segmented_offsets(segbits, stride_inner, stride_outer));
-        return *getThis();
-      }
+  /*!
+   * @brief Generic segmented load operation used for loading sub-matrices
+   * from larger arrays.
+   *
+   * The default operation combines the s_segmented_offsets and gather
+   * operations.
+   *
+   *
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
 
-      /*!
-       * @brief Generic segmented load operation used for loading sub-matrices
-       * from larger arrays where we load partial segments.
-       *
-       *
-       *
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &segmented_load_nm(element_type const *ptr, camp::idx_t segbits,
-          camp::idx_t stride_inner, camp::idx_t stride_outer,
-          camp::idx_t num_inner, camp::idx_t num_outer)
-      {
+  RAJA_INLINE
+  self_type& segmented_load(element_type const* ptr,
+                            camp::idx_t segbits,
+                            camp::idx_t stride_inner,
+                            camp::idx_t stride_outer)
+  {
+    getThis()->gather(ptr, self_type::s_segmented_offsets(segbits, stride_inner,
+                                                          stride_outer));
+    return *getThis();
+  }
 
-        camp::idx_t num_segments = self_type::s_num_elem >> segbits;
-        camp::idx_t seg_size = 1 << segbits;
+  /*!
+   * @brief Generic segmented load operation used for loading sub-matrices
+   * from larger arrays where we load partial segments.
+   *
+   *
+   *
+   */
+  RAJA_HOST_DEVICE
 
-        camp::idx_t lane = 0;
-        for(camp::idx_t seg = 0;seg < num_segments; ++ seg){
-          for(camp::idx_t i = 0;i < seg_size; ++ i){
+  RAJA_INLINE
+  self_type& segmented_load_nm(element_type const* ptr,
+                               camp::idx_t segbits,
+                               camp::idx_t stride_inner,
+                               camp::idx_t stride_outer,
+                               camp::idx_t num_inner,
+                               camp::idx_t num_outer)
+  {
 
-            if(seg >= num_outer || i >= num_inner){
-              getThis()->set(element_type(0), lane);
-            }
-            else{
+    camp::idx_t num_segments = self_type::s_num_elem >> segbits;
+    camp::idx_t seg_size     = 1 << segbits;
 
-              camp::idx_t offset = seg*stride_outer + i*stride_inner;
+    camp::idx_t lane = 0;
+    for (camp::idx_t seg = 0; seg < num_segments; ++seg)
+    {
+      for (camp::idx_t i = 0; i < seg_size; ++i)
+      {
 
-              element_type value = ptr[offset];
+        if (seg >= num_outer || i >= num_inner)
+        {
+          getThis()->set(element_type(0), lane);
+        }
+        else
+        {
 
-              getThis()->set(value, lane);
+          camp::idx_t offset = seg * stride_outer + i * stride_inner;
 
-            }
+          element_type value = ptr[offset];
 
-            lane ++;
-          }
+          getThis()->set(value, lane);
         }
 
-        return *getThis();
+        lane++;
       }
+    }
 
+    return *getThis();
+  }
 
-
-
-
-      /*!
-       * @brief Generic scatter operation for full vector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be stored relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      template<typename T2>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &scatter(element_type *ptr, RAJA::expt::Register<T2, REGISTER_POLICY> const &offsets) const {
+  /*!
+   * @brief Generic scatter operation for full vector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be stored relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  template<typename T2>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type const& scatter(
+      element_type* ptr,
+      RAJA::expt::Register<T2, REGISTER_POLICY> const& offsets) const
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_load_strided_n ++;
+    RAJA::tensor_stats::num_vector_load_strided_n++;
 #endif
-        for(camp::idx_t i = 0;i < self_type::s_num_elem;++ i){
-          ptr[offsets.get(i)] = getThis()->get(i);
-        }
-        return *getThis();
-      }
+    for (camp::idx_t i = 0; i < self_type::s_num_elem; ++i)
+    {
+      ptr[offsets.get(i)] = getThis()->get(i);
+    }
+    return *getThis();
+  }
 
-      /*!
-       * @brief Generic scatter operation for n-length subvector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be stored relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      template<typename T2>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &scatter_n(element_type *ptr, RAJA::expt::Register<T2, REGISTER_POLICY> const &offsets, camp::idx_t N) const {
+  /*!
+   * @brief Generic scatter operation for n-length subvector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be stored relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  template<typename T2>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type const& scatter_n(
+      element_type* ptr,
+      RAJA::expt::Register<T2, REGISTER_POLICY> const& offsets,
+      camp::idx_t N) const
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_load_strided_n ++;
+    RAJA::tensor_stats::num_vector_load_strided_n++;
 #endif
-        for(camp::idx_t i = 0;i < N;++ i){
-          ptr[offsets.get(i)] = getThis()->get(i);
-        }
-        return *getThis();
-      }
-
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      ptr[offsets.get(i)] = getThis()->get(i);
+    }
+    return *getThis();
+  }
 
-      /*!
-       * @brief Generic segmented load operation used for loading sub-matrices
-       * from larger arrays.
-       *
-       * The default operation combines the s_segmented_offsets and gather
-       * operations.
-       *
-       *
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &segmented_store(element_type *ptr, camp::idx_t segbits, camp::idx_t stride_inner, camp::idx_t stride_outer) const {
-        getThis()->scatter(ptr, self_type::s_segmented_offsets(segbits, stride_inner, stride_outer));
-        return *getThis();
-      }
+  /*!
+   * @brief Generic segmented store operation used for storing sub-matrices
+   * into larger arrays.
+   *
+   * The default operation combines the s_segmented_offsets and scatter
+   * operations.
+   *
+   *
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
 
-      /*!
-       * @brief Generic segmented load operation used for loading sub-matrices
-       * from larger arrays where we load partial segments.
-       *
-       *
-       *
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &segmented_store_nm(element_type *ptr, camp::idx_t segbits,
-          camp::idx_t stride_inner, camp::idx_t stride_outer,
-          camp::idx_t num_inner, camp::idx_t num_outer) const
-      {
+  RAJA_INLINE
+  self_type const& segmented_store(element_type* ptr,
+                                   camp::idx_t segbits,
+                                   camp::idx_t stride_inner,
+                                   camp::idx_t stride_outer) const
+  {
+    getThis()->scatter(ptr, self_type::s_segmented_offsets(
+                                segbits, stride_inner, stride_outer));
+    return *getThis();
+  }
 
-        camp::idx_t num_segments = self_type::s_num_elem >> segbits;
-        camp::idx_t seg_size = 1 << segbits;
+  /*!
+   * @brief Generic segmented store operation used for storing sub-matrices
+   * into larger arrays where we store partial segments.
+   *
+   *
+   *
+   */
+  RAJA_HOST_DEVICE
 
-        camp::idx_t lane = 0;
-        for(camp::idx_t seg = 0;seg < num_segments; ++ seg){
-          for(camp::idx_t i = 0;i < seg_size; ++ i){
+  RAJA_INLINE
+  self_type const& segmented_store_nm(element_type* ptr,
+                                      camp::idx_t segbits,
+                                      camp::idx_t stride_inner,
+                                      camp::idx_t stride_outer,
+                                      camp::idx_t num_inner,
+                                      camp::idx_t num_outer) const
+  {
 
-            if(!(seg >= num_outer || i >= num_inner)){
+    camp::idx_t num_segments = self_type::s_num_elem >> segbits;
+    camp::idx_t seg_size     = 1 << segbits;
 
-              camp::idx_t offset = seg*stride_outer + i*stride_inner;
+    camp::idx_t lane = 0;
+    for (camp::idx_t seg = 0; seg < num_segments; ++seg)
+    {
+      for (camp::idx_t i = 0; i < seg_size; ++i)
+      {
 
-              ptr[offset] = getThis()->get(lane);
+        if (!(seg >= num_outer || i >= num_inner))
+        {
 
-            }
+          camp::idx_t offset = seg * stride_outer + i * stride_inner;
 
-            lane ++;
-          }
+          ptr[offset] = getThis()->get(lane);
         }
 
-        return *getThis();
-      }
-
-      /*!
-       * @brief Set entire register to a single scalar value
-       * @param value Value to set all register elements to
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator=(element_type value)
-      {
-        getThis()->broadcast(value);
-        return *getThis();
+        lane++;
       }
+    }
 
-      /*!
-       * @brief Set entire register to a single scalar value
-       * @param value Value to set all register elements to
-       */
-      RAJA_SUPPRESS_HD_WARN
-      template<typename T2>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator=(RAJA::expt::Register<T2, RAJA::expt::scalar_register> const &value)
-      {
-        getThis()->broadcast(value.get(0));
-        return *getThis();
-      }
+    return *getThis();
+  }
 
-      /*!
-       * @brief Assign one register to another
-       * @param x register to copy
-       * @return Value of (*this)
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator=(self_type const &x)
-      {
-        getThis()->copy(x);
-        return *getThis();
-      }
+  /*!
+   * @brief Set entire register to a single scalar value
+   * @param value Value to set all register elements to
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
 
+  RAJA_INLINE
+  self_type& operator=(element_type value)
+  {
+    getThis()->broadcast(value);
+    return *getThis();
+  }
 
+  /*!
+   * @brief Set entire register to a single scalar value
+   * @param value Value to set all register elements to
+   */
+  RAJA_SUPPRESS_HD_WARN
+  template<typename T2>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type& operator=(
+      RAJA::expt::Register<T2, RAJA::expt::scalar_register> const& value)
+  {
+    getThis()->broadcast(value.get(0));
+    return *getThis();
+  }
 
+  /*!
+   * @brief Assign one register to another
+   * @param x register to copy
+   * @return Value of (*this)
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
 
+  RAJA_INLINE
+  self_type& operator=(self_type const& x)
+  {
+    getThis()->copy(x);
+    return *getThis();
+  }
 
-      /*!
-       * @brief Add two registers
-       * @param x register to add
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator+(self_type const &x) const
-      {
-        return getThis()->add(x);
-      }
+  /*!
+   * @brief Add two registers
+   * @param x register to add
+   * @return Value of (*this)+x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
 
+  RAJA_INLINE
+  self_type operator+(self_type const& x) const { return getThis()->add(x); }
 
-      /*!
-       * @brief Add a register to this register
-       * @param x register to add
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator+=(self_type const &x)
-      {
-        *getThis() = getThis()->add(x);
-        return *getThis();
-      }
+  /*!
+   * @brief Add a register to this register
+   * @param x register to add
+   * @return Value of (*this)+x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
 
-      /*!
-       * @brief Add scalar to this register
-       * @param x scalar to add to this register
-       * @return Value of (*this)+x
-       *
-       * This broadcasts the scalar to all lanes, then adds to this register
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator+(element_type const &x) const
-      {
-        return getThis()->add(x);
-      }
+  RAJA_INLINE
+  self_type& operator+=(self_type const& x)
+  {
+    *getThis() = getThis()->add(x);
+    return *getThis();
+  }
 
+  /*!
+   * @brief Add scalar to this register
+   * @param x scalar to add to this register
+   * @return Value of (*this)+x
+   *
+   * This broadcasts the scalar to all lanes, then adds to this register
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
 
-      /*!
-       * @brief Add a scalar to this register
-       * @param x scalar to add to this register
-       * @return Value of (*this)+x
-       *
-       * This broadcasts the scalar to all lanes, then adds to this register
-       *
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator+=(element_type x)
-      {
-        *getThis() = getThis()->add(x);
-        return *getThis();
-      }
+  RAJA_INLINE
+  self_type operator+(element_type const& x) const { return getThis()->add(x); }
 
-      /*!
-       * @brief Negate the value of this register
-       * @return Value of -(*this)
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator-() const
-      {
-        return self_type(0).subtract(*getThis());
-      }
+  /*!
+   * @brief Add a scalar to this register
+   * @param x scalar to add to this register
+   * @return Value of (*this)+x
+   *
+   * This broadcasts the scalar to all lanes, then adds to this register
+   *
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
 
-      /*!
-       * @brief Subtract two register registers
-       * @param x register to subtract from this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator-(self_type const &x) const
-      {
-        return getThis()->subtract(x);
-      }
+  RAJA_INLINE
+  self_type& operator+=(element_type x)
+  {
+    *getThis() = getThis()->add(x);
+    return *getThis();
+  }
 
-      /*!
-       * @brief Subtract a register from this register
-       * @param x register to subtract from this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator-=(self_type const &x)
-      {
-        *getThis() = getThis()->subtract(x);
-        return *getThis();
-      }
+  /*!
+   * @brief Negate the value of this register
+   * @return Value of -(*this)
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
 
-      /*!
-       * @brief Subtract scalar from this register
-       * @param x register to subtract from this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator-(element_type const &x) const
-      {
-        return getThis()->subtract(x);
-      }
+  RAJA_INLINE
+  self_type operator-() const { return self_type(0).subtract(*getThis()); }
 
-      /*!
-       * @brief Subtract a scalar from this register
-       * @param x register to subtract from this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator-=(element_type const &x)
-      {
-        *getThis() = getThis()->subtract(x);
-        return *getThis();
-      }
+  /*!
+   * @brief Subtract two registers
+   * @param x register to subtract from this register
+   * @return Value of (*this)-x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
 
-      /*!
-       * @brief Multiply two register registers, element wise
-       * @param x register to subtract from this register
-       * @return Value of (*this)+x
-       */
-      template<typename RHS>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator*(RHS const &rhs) const
-      {
-        return getThis()->multiply(rhs);
-      }
+  RAJA_INLINE
+  self_type operator-(self_type const& x) const
+  {
+    return getThis()->subtract(x);
+  }
 
-      /*!
-       * @brief Multiply a register with this register
-       * @param x register to multiple with this register
-       * @return Value of (*this)+x
-       */
-      template<typename RHS>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator*=(RHS const &rhs)
-      {
-        *getThis() = getThis()->multiply(rhs);
-        return *getThis();
-      }
+  /*!
+   * @brief Subtract a register from this register
+   * @param x register to subtract from this register
+   * @return Value of (*this)-x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
 
-      /*!
-       * @brief Divide two register registers, element wise
-       * @param x register to subtract from this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type operator/(self_type const &x) const
-      {
-        return getThis()->divide(x);
-      }
+  RAJA_INLINE
+  self_type& operator-=(self_type const& x)
+  {
+    *getThis() = getThis()->subtract(x);
+    return *getThis();
+  }
 
-      /*!
-       * @brief Divide this register by another register
-       * @param x register to divide by
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator/=(self_type const &x)
-      {
-        *getThis() = getThis()->divide(x);
-        return *getThis();
-      }
+  /*!
+   * @brief Subtract scalar from this register
+   * @param x scalar to subtract from this register
+   * @return Value of (*this)-x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
 
+  RAJA_INLINE
+  self_type operator-(element_type const& x) const
+  {
+    return getThis()->subtract(x);
+  }
 
-      /*!
-       * @brief Divide by a scalar, element wise
-       * @param x Scalar to divide by
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type operator/(element_type const &x) const
-      {
-        return getThis()->divide(x);
-      }
+  /*!
+   * @brief Subtract a scalar from this register
+   * @param x scalar to subtract from this register
+   * @return Value of (*this)-x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
 
-      /*!
-       * @brief Divide this register by another register
-       * @param x Scalar to divide by
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator/=(element_type const &x)
-      {
-        *getThis() = getThis()->divide(x);
-        return *getThis();
-      }
+  RAJA_INLINE
+  self_type& operator-=(element_type const& x)
+  {
+    *getThis() = getThis()->subtract(x);
+    return *getThis();
+  }
 
+  /*!
+   * @brief Multiply two registers, element wise
+   * @param rhs right-hand operand to multiply with this register
+   * @return Value of (*this)*rhs
+   */
+  template<typename RHS>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type operator*(RHS const& rhs) const
+  {
+    return getThis()->multiply(rhs);
+  }
 
-      /*!
-       * @brief Divide n elements of this register by another register
-       * @param x register to divide by
-       * @param n Number of elements to divide
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t n) const {
-        self_type q(*getThis());
-        for(camp::idx_t i = 0;i < n;++i){
-          q.set(getThis()->get(i) / b.get(i), i);
-        }
-        return q;
-      }
+  /*!
+   * @brief Multiply a register with this register
+   * @param rhs right-hand operand to multiply with this register
+   * @return Value of (*this)*rhs
+   */
+  template<typename RHS>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type& operator*=(RHS const& rhs)
+  {
+    *getThis() = getThis()->multiply(rhs);
+    return *getThis();
+  }
 
-      /*!
-       * @brief Divide n elements of this register by a scalar
-       * @param x Scalar to divide by
-       * @param n Number of elements to divide
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(element_type const &b, camp::idx_t n) const {
-        self_type q(*getThis());
-        for(camp::idx_t i = 0;i < n;++i){
-          q.set(getThis()->get(i) / b, i);
-        }
-        return q;
-      }
+  /*!
+   * @brief Divide two registers, element wise
+   * @param x register to divide this register by
+   * @return Value of (*this)/x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
 
-      /*!
-       * @brief Dot product of two registers
-       * @param x Other register to dot with this register
-       * @return Value of (*this) dot x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      element_type dot(self_type const &x) const
-      {
-        return getThis()->multiply(x).sum();
-      }
+  RAJA_HOST_DEVICE
+  self_type operator/(self_type const& x) const { return getThis()->divide(x); }
 
-      /*!
-       * @brief Fused multiply add: fma(b, c) = (*this)*b+c
-       *
-       * Derived types can override this to implement intrinsic FMA's
-       *
-       * @param b Second product operand
-       * @param c Sum operand
-       * @return Value of (*this)*b+c
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_add(self_type const &b, self_type const &c) const
-      {
-        return (self_type(*getThis()) * self_type(b)) + self_type(c);
-      }
+  /*!
+   * @brief Divide this register by another register
+   * @param x register to divide by
+   * @return Value of (*this)/x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
 
-      /*!
-       * @brief Fused multiply subtract: fms(b, c) = (*this)*b-c
-       *
-       * Derived types can override this to implement intrinsic FMS's
-       *
-       * @param b Second product operand
-       * @param c Subtraction operand
-       * @return Value of (*this)*b-c
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_subtract(self_type const &b, self_type const &c) const
-      {
-        return getThis()->multiply_add(b, -c);
-      }
+  RAJA_INLINE
+  self_type& operator/=(self_type const& x)
+  {
+    *getThis() = getThis()->divide(x);
+    return *getThis();
+  }
 
-      /*!
-       * Multiply this tensor by a scalar value
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type scale(element_type c) const
-      {
-        return getThis()->multiply(self_type(c));
-      }
+  /*!
+   * @brief Divide by a scalar, element wise
+   * @param x Scalar to divide by
+   * @return Value of (*this)/x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
 
-      /*!
-       * Minimum value across first N lanes of register
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      element_type min_n(camp::idx_t N) const
-      {
-        return getThis()->min(N);
-      }
+  RAJA_HOST_DEVICE
+  self_type operator/(element_type const& x) const
+  {
+    return getThis()->divide(x);
+  }
 
-      /*!
-       * Maximum value across first N lanes of register
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      element_type max_n(camp::idx_t N) const
-      {
-        return getThis()->max(N);
-      }
+  /*!
+   * @brief Divide this register by a scalar
+   * @param x Scalar to divide by
+   * @return Value of (*this)/x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
 
-      /*!
-       * Provides vector-level building block for matrix transpose operations.
-       *
-       * This is a non-optimized reference version which will be used if
-       * no architecture specialized version is supplied
-       *
-       * This is a permute-and-shuffle left operation
-       *
-       *           X=   x0  x1  x2  x3  x4  x5  x6  x7...
-       *           Y=   y0  y1  y2  y3  y4  y5  y6  y7...
-       *
-       *  lvl=0    Z=   x0  y0  x2  y2  x4  y4  x6  y6...
-       *  lvl=1    Z=   x0  x1  y0  y1  x4  x5  y4  y5...
-       *  lvl=2    Z=   x0  x1  x2  x3  y0  y1  y2  y3...
-       */
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type transpose_shuffle_left(camp::idx_t lvl, self_type const &y) const
-      {
-        auto const &x = *getThis();
+  RAJA_INLINE
+  self_type& operator/=(element_type const& x)
+  {
+    *getThis() = getThis()->divide(x);
+    return *getThis();
+  }
 
-        self_type z;
+  /*!
+   * @brief Divide n elements of this register by another register
+   * @param b register to divide by
+   * @param n Number of elements to divide
+   * @return Copy of (*this) with the first n elements divided by b
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
 
-        for(camp::idx_t i = 0;i < self_type::s_num_elem;++ i){
+  RAJA_INLINE
+  self_type divide_n(self_type const& b, camp::idx_t n) const
+  {
+    self_type q(*getThis());
+    for (camp::idx_t i = 0; i < n; ++i)
+    {
+      q.set(getThis()->get(i) / b.get(i), i);
+    }
+    return q;
+  }
 
-          // extract value x or y
-          camp::idx_t xy_select = (i >> lvl) & 0x1;
+  /*!
+   * @brief Divide n elements of this register by a scalar
+   * @param b Scalar to divide by
+   * @param n Number of elements to divide
+   * @return Copy of (*this) with the first n elements divided by b
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
 
+  RAJA_INLINE
+  self_type divide_n(element_type const& b, camp::idx_t n) const
+  {
+    self_type q(*getThis());
+    for (camp::idx_t i = 0; i < n; ++i)
+    {
+      q.set(getThis()->get(i) / b, i);
+    }
+    return q;
+  }
 
-          z.set(xy_select == 0 ? x.get(i) : y.get(i - (1<<lvl)), i);
-        }
+  /*!
+   * @brief Dot product of two registers
+   * @param x Other register to dot with this register
+   * @return Value of (*this) dot x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
 
-        return z;
-      }
+  RAJA_HOST_DEVICE
+  element_type dot(self_type const& x) const
+  {
+    return getThis()->multiply(x).sum();
+  }
 
+  /*!
+   * @brief Fused multiply add: fma(b, c) = (*this)*b+c
+   *
+   * Derived types can override this to implement intrinsic FMA's
+   *
+   * @param b Second product operand
+   * @param c Sum operand
+   * @return Value of (*this)*b+c
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
 
-      /*!
-       * Provides vector-level building block for matrix transpose operations.
-       *
-       * This is a non-optimized reference version which will be used if
-       * no architecture specialized version is supplied
-       *
-       * This is a permute-and-shuffle right operation
-       *
-       *           X=   x0  x1  x2  x3  x4  x5  x6  x7...
-       *           Y=   y0  y1  y2  y3  y4  y5  y6  y7...
-       *
-       *  lvl=0    Z=   x1  y1  x3  y3  x5  y5  x7  y7...
-       *  lvl=1    Z=   x2  x3  y2  y3  x6  x7  y6  y7...
-       *  lvl=2    Z=   x4  x5  x6  x7  y4  y5  y6  y7...
-       */
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type transpose_shuffle_right(int lvl, self_type const &y) const
-      {
-        auto const &x = *getThis();
+  RAJA_HOST_DEVICE
+  self_type multiply_add(self_type const& b, self_type const& c) const
+  {
+    return (self_type(*getThis()) * self_type(b)) + self_type(c);
+  }
 
-        self_type z;
+  /*!
+   * @brief Fused multiply subtract: fms(b, c) = (*this)*b-c
+   *
+   * Derived types can override this to implement intrinsic FMS's
+   *
+   * @param b Second product operand
+   * @param c Subtraction operand
+   * @return Value of (*this)*b-c
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
 
-        camp::idx_t i0 = 1<<lvl;
+  RAJA_HOST_DEVICE
+  self_type multiply_subtract(self_type const& b, self_type const& c) const
+  {
+    return getThis()->multiply_add(b, -c);
+  }
 
-        for(camp::idx_t i = 0;i < self_type::s_num_elem;++ i){
+  /*!
+   * Multiply this tensor by a scalar value
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
 
-          // extract value x or y
-          camp::idx_t xy_select = (i >> lvl) & 0x1;
+  RAJA_HOST_DEVICE
+  self_type scale(element_type c) const
+  {
+    return getThis()->multiply(self_type(c));
+  }
 
-          z.set(xy_select == 0 ? x.get(i0 + i) : y.get(i0 + i - (1<<lvl)), i);
-        }
+  /*!
+   * Minimum value across first N lanes of register
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
 
-        return z;
-      }
+  RAJA_HOST_DEVICE
+  element_type min_n(camp::idx_t N) const { return getThis()->min(N); }
 
+  /*!
+   * Maximum value across first N lanes of register
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
 
+  RAJA_HOST_DEVICE
+  element_type max_n(camp::idx_t N) const { return getThis()->max(N); }
 
+  /*!
+   * Provides vector-level building block for matrix transpose operations.
+   *
+   * This is a non-optimized reference version which will be used if
+   * no architecture specialized version is supplied
+   *
+   * This is a permute-and-shuffle left operation
+   *
+   *           X=   x0  x1  x2  x3  x4  x5  x6  x7...
+   *           Y=   y0  y1  y2  y3  y4  y5  y6  y7...
+   *
+   *  lvl=0    Z=   x0  y0  x2  y2  x4  y4  x6  y6...
+   *  lvl=1    Z=   x0  x1  y0  y1  x4  x5  y4  y5...
+   *  lvl=2    Z=   x0  x1  x2  x3  y0  y1  y2  y3...
+   */
+  RAJA_INLINE
 
-      /*!
-       * Provides gather/scatter indices for segmented loads and stores
-       *
-       * THe number of segment bits (segbits) is specified, as well as the
-       * stride between elements in a segment (stride_inner),
-       * and the stride between segments (stride_outer)
-       */
-      RAJA_INLINE
-      static
-      int_vector_type s_segmented_offsets(camp::idx_t segbits, camp::idx_t stride_inner, camp::idx_t stride_outer)
-      {
-        int_vector_type result;
+  RAJA_HOST_DEVICE
+  self_type transpose_shuffle_left(camp::idx_t lvl, self_type const& y) const
+  {
+    auto const& x = *getThis();
 
-        camp::idx_t num_segments = self_type::s_num_elem >> segbits;
-        camp::idx_t seg_size = 1 << segbits;
+    self_type z;
 
-        camp::idx_t lane = 0;
-        for(camp::idx_t seg = 0;seg < num_segments; ++ seg){
-          for(camp::idx_t i = 0;i < seg_size; ++ i){
-            result.set(seg*stride_outer + i*stride_inner, lane);
-            lane ++;
-          }
-        }
+    for (camp::idx_t i = 0; i < self_type::s_num_elem; ++i)
+    {
 
-        return result;
-      }
+      // extract value x or y
+      camp::idx_t xy_select = (i >> lvl) & 0x1;
 
 
-      /*!
-       * Sum elements within each segment, with segment size defined by segbits.
-       * Stores each segments sum consecutively, but shifed to the
-       * corresponding output_segment slot.
-       *
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=0 is equivalent to the input vector,  since there are 8
-       *      outputs, there is only 1 output segment
-       *
-       *      Result= x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=1 sums neighboring pairs of values.  There are 4 output,
-       *      so there are possible output segments.
-       *
-       *      output_segment=0:
-       *      Result= x0+x1, x2+x3, x4+x5, x6+x7, 0, 0, 0, 0
-       *
-       *      output_segment=1:
-       *      Result= 0, 0, 0, 0, x0+x1, x2+x3, x4+x5, x6+x7
-       *
-       *  and so on up to segbits=3, which is a full sum of x0..x7, and the
-       *      output_segment denotes the vector position of the sum
-       *
-       */
-      RAJA_INLINE
-      self_type segmented_sum_inner(camp::idx_t segbits, camp::idx_t output_segment) const
-      {
-        self_type result(0);
+      z.set(xy_select == 0 ? x.get(i) : y.get(i - (1 << lvl)), i);
+    }
 
-        // default implementation is dumb, just sum each value into
-        // appropriate segment lane
-        int output_offset = output_segment * self_type::s_num_elem>>segbits;
+    return z;
+  }
 
-        for(camp::idx_t i = 0;i < self_type::s_num_elem; ++ i){
-          auto value = getThis()->get(i) + result.get((i >> segbits)+output_offset);
-          result.set(value, (i >> segbits)+output_offset);
-        }
+  /*!
+   * Provides vector-level building block for matrix transpose operations.
+   *
+   * This is a non-optimized reference version which will be used if
+   * no architecture specialized version is supplied
+   *
+   * This is a permute-and-shuffle right operation
+   *
+   *           X=   x0  x1  x2  x3  x4  x5  x6  x7...
+   *           Y=   y0  y1  y2  y3  y4  y5  y6  y7...
+   *
+   *  lvl=0    Z=   x1  y1  x3  y3  x5  y5  x7  y7...
+   *  lvl=1    Z=   x2  x3  y2  y3  x6  x7  y6  y7...
+   *  lvl=2    Z=   x4  x5  x6  x7  y4  y5  y6  y7...
+   */
+  RAJA_INLINE
 
-        return result;
-      }
+  RAJA_HOST_DEVICE
+  self_type transpose_shuffle_right(int lvl, self_type const& y) const
+  {
+    auto const& x = *getThis();
 
-      /*!
-       * Sum all segments as subvectors, with segment size defined by segbits
-       *
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=0 the segments are size 1, which means that this is just a
-       *      sum of all elements.  The output_segment determines where the
-       *      result is placed.
-       *
-       *      output_segment=0:
-       *      Result= x0+x1+x2+x3+x4+x5+x6+x7, 0, 0, 0, 0, 0, 0, 0, 0
-       *
-       *      output_segment=3:
-       *      Result= 0, 0, x0+x1+x2+x3+x4+x5+x6+x7, 0, 0, 0, 0, 0, 0
-       *
-       *  segbits=1 the segments are 2-wide:
-       *
-       *      output_segment=0:
-       *      Result= x0+x2+x4+x6, x1+x3+x5+x7, 0, 0, 0, 0, 0, 0
-       *
-       *      output_segment=1:
-       *      Result= 0, 0, x0+x2+x4+x6, x1+x3+x5+x7, 0, 0, 0, 0
-       *
-       *  and so on up to segbits=3, which is just the original vector:
-       *  segbits=3
-       *      Result= x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       */
-      RAJA_INLINE
-      self_type segmented_sum_outer(camp::idx_t segbits, camp::idx_t output_segment) const
-      {
-        self_type result(0);
+    self_type z;
 
-        // default implementation is dumb, just sum each value into
-        // appropriate segment lane
-        int output_offset = output_segment * (1<<segbits);
+    camp::idx_t i0 = 1 << lvl;
 
-        for(camp::idx_t i = 0;i < self_type::s_num_elem; ++ i){
-          camp::idx_t output_i = output_offset + (i&((1<<segbits)-1));
-          auto value = getThis()->get(i) + result.get(output_i);
-          result.set(value, output_i);
-        }
+    for (camp::idx_t i = 0; i < self_type::s_num_elem; ++i)
+    {
 
-        return result;
-      }
+      // extract value x or y
+      camp::idx_t xy_select = (i >> lvl) & 0x1;
 
+      z.set(xy_select == 0 ? x.get(i0 + i) : y.get(i0 + i - (1 << lvl)), i);
+    }
 
+    return z;
+  }
 
-      RAJA_INLINE
-      self_type segmented_divide_nm(self_type den, camp::idx_t segbits, camp::idx_t num_inner, camp::idx_t num_outer) const
+  /*!
+   * Provides gather/scatter indices for segmented loads and stores
+   *
+   * The number of segment bits (segbits) is specified, as well as the
+   * stride between elements in a segment (stride_inner),
+   * and the stride between segments (stride_outer)
+   */
+  RAJA_INLINE
+  static int_vector_type s_segmented_offsets(camp::idx_t segbits,
+                                             camp::idx_t stride_inner,
+                                             camp::idx_t stride_outer)
+  {
+    int_vector_type result;
+
+    camp::idx_t num_segments = self_type::s_num_elem >> segbits;
+    camp::idx_t seg_size     = 1 << segbits;
+
+    camp::idx_t lane = 0;
+    for (camp::idx_t seg = 0; seg < num_segments; ++seg)
+    {
+      for (camp::idx_t i = 0; i < seg_size; ++i)
       {
-        self_type result;
+        result.set(seg * stride_outer + i * stride_inner, lane);
+        lane++;
+      }
+    }
 
-        camp::idx_t num_segments = self_type::s_num_elem >> segbits;
-        camp::idx_t seg_size = 1 << segbits;
+    return result;
+  }
 
-        camp::idx_t lane = 0;
-        for(camp::idx_t seg = 0;seg < num_segments; ++ seg){
-          for(camp::idx_t i = 0;i < seg_size; ++ i){
+  /*!
+   * Sum elements within each segment, with segment size defined by segbits.
+   * Stores each segment's sum consecutively, but shifted to the
+   * corresponding output_segment slot.
+   *
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>seg_bits
+   *
+   *
+   *
+   *
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=0 is equivalent to the input vector,  since there are 8
+   *      outputs, there is only 1 output segment
+   *
+   *      Result= x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=1 sums neighboring pairs of values.  There are 4 outputs,
+   *      so there are 2 possible output segments.
+   *
+   *      output_segment=0:
+   *      Result= x0+x1, x2+x3, x4+x5, x6+x7, 0, 0, 0, 0
+   *
+   *      output_segment=1:
+   *      Result= 0, 0, 0, 0, x0+x1, x2+x3, x4+x5, x6+x7
+   *
+   *  and so on up to segbits=3, which is a full sum of x0..x7, and the
+   *      output_segment denotes the vector position of the sum
+   *
+   */
+  RAJA_INLINE
+  self_type segmented_sum_inner(camp::idx_t segbits,
+                                camp::idx_t output_segment) const
+  {
+    self_type result(0);
 
-            if(seg >= num_outer || i >= num_inner){
-              result.set(element_type(0), lane);
-            }
-            else{
+    // default implementation is dumb, just sum each value into
+    // appropriate segment lane
+    int output_offset = output_segment * self_type::s_num_elem >> segbits;
 
-              element_type div = getThis()->get(lane) / den.get(lane);
+    for (camp::idx_t i = 0; i < self_type::s_num_elem; ++i)
+    {
+      auto value =
+          getThis()->get(i) + result.get((i >> segbits) + output_offset);
+      result.set(value, (i >> segbits) + output_offset);
+    }
 
-              result.set(div, lane);
+    return result;
+  }
 
-            }
+  /*!
+   * Sum all segments as subvectors, with segment size defined by segbits
+   *
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>seg_bits
+   *
+   *
+   *
+   *
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=0 the segments are size 1, which means that this is just a
+   *      sum of all elements.  The output_segment determines where the
+   *      result is placed.
+   *
+   *      output_segment=0:
+   *      Result= x0+x1+x2+x3+x4+x5+x6+x7, 0, 0, 0, 0, 0, 0, 0
+   *
+   *      output_segment=3:
+   *      Result= 0, 0, 0, x0+x1+x2+x3+x4+x5+x6+x7, 0, 0, 0, 0
+   *
+   *  segbits=1 the segments are 2-wide:
+   *
+   *      output_segment=0:
+   *      Result= x0+x2+x4+x6, x1+x3+x5+x7, 0, 0, 0, 0, 0, 0
+   *
+   *      output_segment=1:
+   *      Result= 0, 0, x0+x2+x4+x6, x1+x3+x5+x7, 0, 0, 0, 0
+   *
+   *  and so on up to segbits=3, which is just the original vector:
+   *  segbits=3
+   *      Result= x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   */
+  RAJA_INLINE
+  self_type segmented_sum_outer(camp::idx_t segbits,
+                                camp::idx_t output_segment) const
+  {
+    self_type result(0);
 
-            lane ++;
-          }
-        }
+    // default implementation is dumb, just sum each value into
+    // appropriate segment lane
+    int output_offset = output_segment * (1 << segbits);
 
-        return result;
-      }
+    for (camp::idx_t i = 0; i < self_type::s_num_elem; ++i)
+    {
+      camp::idx_t output_i = output_offset + (i & ((1 << segbits) - 1));
+      auto value           = getThis()->get(i) + result.get(output_i);
+      result.set(value, output_i);
+    }
 
+    return result;
+  }
 
+  RAJA_INLINE
+  self_type segmented_divide_nm(self_type den,
+                                camp::idx_t segbits,
+                                camp::idx_t num_inner,
+                                camp::idx_t num_outer) const
+  {
+    self_type result;
 
-      /*!
-       * Segmented dot product performs dot products
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *                      Y = y0, y1, y2, y3, y4, y5, y6, y7
-       *
-       *
-       *  segbits=0 is equivalent to a vector multiply,  since there are 8
-       *      outputs, there is only 1 output segment
-       *
-       *      Result= x0*y0, x1*y1, x2*y2, x3*y3, x4*y4, x5*y5, x6*y6, x7*y7
-       *
-       *  segbits=1 sums neighboring pairs of products.  There are 4 output,
-       *      so there are possible output segments.
-       *
-       *      output_segment=0:
-       *      Result= x0*y0+x1*y1, x2*y2+x3*y3, x4*y4+x5*y5, x6*y6+x7*y7, 0, 0, 0, 0
-       *
-       *      output_segment=1:
-       *      Result= 0, 0, 0, 0, x0*y0+x1*y1, x2*y2+x3*y3, x4*y4+x5*y5, x6*y6+x7*y7
-       *
-       *  and so on up to segbits=3, which is a full dot-product of x and y, and the
-       *      output_segment denotes the vector position of the result
-       *
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type segmented_dot(camp::idx_t segbits, camp::idx_t output_segment, self_type const &x) const
-      {
-        return getThis()->multiply(x).segmented_sum_inner(segbits, output_segment);
-      }
+    camp::idx_t num_segments = self_type::s_num_elem >> segbits;
+    camp::idx_t seg_size     = 1 << segbits;
 
-      /*!
-       * Segmented broadcast copies a segment to all output segments of a vector
-       *
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=0 means the input segment size is 1, so this selects the
-       *      value at x[input_segmnet] and broadcasts it to the rest of the
-       *      vector
-       *
-       *      input segments allowed are from 0 to 7, inclusive
-       *
-       *      input_segment=0
-       *      Result= x0, x0, x0, x0, x0, x0, x0, x0
-       *
-       *      input_segment=5
-       *      Result= x5, x5, x5, x5, x5, x5, x5, x5
-       *
-       *  segbits=1 means that the input segments are each pair of x values:
-       *
-       *      input segments allowed are from 0 to 3, inclusive
-       *
-       *      input_segment=0:
-       *      Result= x0, x1, x0, x1, x0, x1, x0, x1
-       *
-       *      input_segment=1:
-       *      Result= x2, x3, x2, x3, x2, x3, x2, x3
-       *
-       *      input_segment=3:
-       *      Result= x6, x7, x6, x7, x6, x7, x6, x7
-       *
-       *  and so on up to segbits=2, the input segments are 4 wide:
-       *
-       *      input segments allowed are from 0 or 1
-       *
-       *      input_segment=0:
-       *      Result= x0, x1, x2, x3, x0, x1, x2, x3
-       *
-       *      input_segment=1:
-       *      Result= x4, x5, x6, x7, x4, x5, x6, x7
-       *
-       */
-      RAJA_INLINE
-      self_type segmented_broadcast_inner(camp::idx_t segbits, camp::idx_t input_segment) const
+    camp::idx_t lane = 0;
+    for (camp::idx_t seg = 0; seg < num_segments; ++seg)
+    {
+      for (camp::idx_t i = 0; i < seg_size; ++i)
       {
-        self_type result;
-
-        camp::idx_t mask = (1<<segbits)-1;
-        camp::idx_t offset = input_segment << segbits;
 
-        // default implementation is dumb, just sum each value into
-        // appropriate segment lane
-        for(camp::idx_t i = 0;i < self_type::s_num_elem; ++ i){
+        if (seg >= num_outer || i >= num_inner)
+        {
+          result.set(element_type(0), lane);
+        }
+        else
+        {
 
-          auto off = (i&mask) + offset;
+          element_type div = getThis()->get(lane) / den.get(lane);
 
-          result.set(getThis()->get(off), i);
+          result.set(div, lane);
         }
 
-        return result;
+        lane++;
       }
+    }
 
+    return result;
+  }
 
-      /*!
-       * Segmented broadcast spreads a segment to all output segments of a vector
-       *
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=0 means the input segment size is 1, so this selects the
-       *      value at x[input_segmnet] and broadcasts it to the rest of the
-       *      vector
-       *
-       *      input segments allowed are from 0 to 7, inclusive
-       *
-       *      input_segment=0
-       *      Result= x0, x0, x0, x0, x0, x0, x0, x0
-       *
-       *      input_segment=5
-       *      Result= x5, x5, x5, x5, x5, x5, x5, x5
-       *
-       *  segbits=1 means that the input segments are each pair of x values:
-       *
-       *      input segments allowed are from 0 to 3, inclusive
-       *
-       *      output_segment=0:
-       *      Result= x0, x0, x0, x0, x1, x1, x1, x1
-       *
-       *      output_segment=1:
-       *      Result= x2, x2, x2, x2, x3, x3, x3, x3
-       *
-       *      output_segment=3:
-       *      Result= x6, x6, x6, x6, x7, x7, x7, x7
-       */
-      RAJA_INLINE
-      self_type segmented_broadcast_outer(camp::idx_t segbits, camp::idx_t input_segment) const
-      {
-        self_type result;
+  /*!
+   * Segmented dot product performs dot products
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>seg_bits
+   *
+   *
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *                      Y = y0, y1, y2, y3, y4, y5, y6, y7
+   *
+   *
+   *  segbits=0 is equivalent to a vector multiply,  since there are 8
+   *      outputs, there is only 1 output segment
+   *
+   *      Result= x0*y0, x1*y1, x2*y2, x3*y3, x4*y4, x5*y5, x6*y6, x7*y7
+   *
+   *  segbits=1 sums neighboring pairs of products.  There are 4 outputs,
+   *      so there are 2 possible output segments.
+   *
+   *      output_segment=0:
+   *      Result= x0*y0+x1*y1, x2*y2+x3*y3, x4*y4+x5*y5, x6*y6+x7*y7, 0, 0, 0, 0
+   *
+   *      output_segment=1:
+   *      Result= 0, 0, 0, 0, x0*y0+x1*y1, x2*y2+x3*y3, x4*y4+x5*y5, x6*y6+x7*y7
+   *
+   *  and so on up to segbits=3, which is a full dot-product of x and y, and the
+   *      output_segment denotes the vector position of the result
+   *
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  self_type segmented_dot(camp::idx_t segbits,
+                          camp::idx_t output_segment,
+                          self_type const& x) const
+  {
+    return getThis()->multiply(x).segmented_sum_inner(segbits, output_segment);
+  }
+
+  /*!
+   * Segmented broadcast copies a segment to all output segments of a vector
+   *
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>seg_bits
+   *
+   *
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=0 means the input segment size is 1, so this selects the
+   *      value at x[input_segment] and broadcasts it to the rest of the
+   *      vector
+   *
+   *      input segments allowed are from 0 to 7, inclusive
+   *
+   *      input_segment=0
+   *      Result= x0, x0, x0, x0, x0, x0, x0, x0
+   *
+   *      input_segment=5
+   *      Result= x5, x5, x5, x5, x5, x5, x5, x5
+   *
+   *  segbits=1 means that the input segments are each pair of x values:
+   *
+   *      input segments allowed are from 0 to 3, inclusive
+   *
+   *      input_segment=0:
+   *      Result= x0, x1, x0, x1, x0, x1, x0, x1
+   *
+   *      input_segment=1:
+   *      Result= x2, x3, x2, x3, x2, x3, x2, x3
+   *
+   *      input_segment=3:
+   *      Result= x6, x7, x6, x7, x6, x7, x6, x7
+   *
+   *  and so on up to segbits=2, the input segments are 4 wide:
+   *
+   *      input segments allowed are 0 or 1
+   *
+   *      input_segment=0:
+   *      Result= x0, x1, x2, x3, x0, x1, x2, x3
+   *
+   *      input_segment=1:
+   *      Result= x4, x5, x6, x7, x4, x5, x6, x7
+   *
+   */
+  RAJA_INLINE
+  self_type segmented_broadcast_inner(camp::idx_t segbits,
+                                      camp::idx_t input_segment) const
+  {
+    self_type result;
 
-        camp::idx_t offset = input_segment * (self_type::s_num_elem >> segbits);
+    camp::idx_t mask   = (1 << segbits) - 1;
+    camp::idx_t offset = input_segment << segbits;
 
-        // default implementation is dumb, just sum each value into
-        // appropriate segment lane
-        for(camp::idx_t i = 0;i < self_type::s_num_elem; ++ i){
+    // default implementation is dumb, just copy each output lane from
+    // the matching lane of the selected input segment
+    for (camp::idx_t i = 0; i < self_type::s_num_elem; ++i)
+    {
 
-          auto off = (i>>segbits) + offset;
+      auto off = (i & mask) + offset;
 
-          result.set(getThis()->get(off), i);
-        }
+      result.set(getThis()->get(off), i);
+    }
 
-        return result;
-      }
+    return result;
+  }
 
+  /*!
+   * Segmented broadcast spreads a segment to all output segments of a vector
+   *
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>seg_bits
+   *
+   *  NOTE(review): examples below appear inconsistent with the code - verify
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=0 means the input segment size is 1, so this selects the
+   *      value at x[input_segment] and broadcasts it to the rest of the
+   *      vector
+   *
+   *      input segments allowed are from 0 to 7, inclusive
+   *
+   *      input_segment=0
+   *      Result= x0, x0, x0, x0, x0, x0, x0, x0
+   *
+   *      input_segment=5
+   *      Result= x5, x5, x5, x5, x5, x5, x5, x5
+   *
+   *  segbits=1 means that the input segments are each pair of x values:
+   *
+   *      input segments allowed are from 0 to 3, inclusive
+   *
+   *      input_segment=0:
+   *      Result= x0, x0, x0, x0, x1, x1, x1, x1
+   *
+   *      input_segment=1:
+   *      Result= x2, x2, x2, x2, x3, x3, x3, x3
+   *
+   *      input_segment=3:
+   *      Result= x6, x6, x6, x6, x7, x7, x7, x7
+   */
+  RAJA_INLINE
+  self_type segmented_broadcast_outer(camp::idx_t segbits,
+                                      camp::idx_t input_segment) const
+  {
+    self_type result;
 
+    camp::idx_t offset = input_segment * (self_type::s_num_elem >> segbits);
 
+    // default implementation is dumb, just copy each output lane from
+    // its computed source lane
+    for (camp::idx_t i = 0; i < self_type::s_num_elem; ++i)
+    {
 
+      auto off = (i >> segbits) + offset;
 
-      /*!
-       * @brief Converts to vector to a string
-       *
-       *
-       */
-      RAJA_INLINE
-      std::string to_string() const {
-        std::string s = "Register(" + std::to_string(self_type::s_num_elem) + ")[ ";
+      result.set(getThis()->get(off), i);
+    }
 
-        //
-        for(camp::idx_t i = 0;i < self_type::s_num_elem; ++ i){
-          s += std::to_string(getThis()->get(i)) + " ";
-        }
+    return result;
+  }
 
-        s += " ]\n";
+  /*!
+   * @brief Converts the vector to a string
+   *
+   *
+   */
+  RAJA_INLINE
+  std::string to_string() const
+  {
+    std::string s = "Register(" + std::to_string(self_type::s_num_elem) + ")[ ";
 
-        return s;
-      }
+    //
+    for (camp::idx_t i = 0; i < self_type::s_num_elem; ++i)
+    {
+      s += std::to_string(getThis()->get(i)) + " ";
+    }
 
-  };
+    s += " ]\n";
 
+    return s;
+  }
+};
 
-} // namespace expt
-} // namespace internal
-} // namespace RAJA
 
+}  // namespace expt
+}  // namespace internal
+}  // namespace RAJA
 
 
 #endif
diff --git a/include/RAJA/pattern/tensor/internal/TensorIndexTraits.hpp b/include/RAJA/pattern/tensor/internal/TensorIndexTraits.hpp
index bb53993fed..cdef3ee1c2 100644
--- a/include/RAJA/pattern/tensor/internal/TensorIndexTraits.hpp
+++ b/include/RAJA/pattern/tensor/internal/TensorIndexTraits.hpp
@@ -27,347 +27,292 @@ namespace RAJA
 
 namespace internal
 {
-    /* Partial specialization for the strip_index_type_t helper in
-       IndexValue.hpp
-    */
-    template<typename IDX, typename VECTOR_TYPE, camp::idx_t DIM>
-    struct StripIndexTypeT<RAJA::expt::TensorIndex<IDX, VECTOR_TYPE, DIM>>
-    {
-        using type = typename RAJA::expt::TensorIndex<IDX, VECTOR_TYPE, DIM>::value_type;
-    };
-
+/* Partial specialization for the strip_index_type_t helper in
+   IndexValue.hpp
+*/
+template<typename IDX, typename VECTOR_TYPE, camp::idx_t DIM>
+struct StripIndexTypeT<RAJA::expt::TensorIndex<IDX, VECTOR_TYPE, DIM>>
+{
+  using type =
+      typename RAJA::expt::TensorIndex<IDX, VECTOR_TYPE, DIM>::value_type;
+};
 
 namespace expt
 {
 
 
+// Helper that strips the Vector type from an argument
+template<typename ARG>
+struct TensorIndexTraits
+{
+  using arg_type   = ARG;
+  using value_type = strip_index_type_t<ARG>;
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static constexpr bool isTensorIndex() { return false; }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static constexpr arg_type const& strip(arg_type const& arg) { return arg; }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static constexpr arg_type const strip_by_value(arg_type const arg)
+  {
+    return arg;
+  }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static constexpr value_type size(arg_type const&) { return 1; }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static constexpr value_type begin(arg_type const&) { return 0; }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static constexpr value_type dim() { return 0; }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static constexpr value_type num_elem() { return 1; }
+};
+
+template<typename IDX, typename TENSOR_TYPE, camp::idx_t DIM>
+struct TensorIndexTraits<RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>>
+{
+  using index_type = RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>;
+  using arg_type   = IDX;
+  using value_type = strip_index_type_t<IDX>;
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static constexpr bool isTensorIndex() { return true; }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static constexpr arg_type const& strip(index_type const& arg) { return *arg; }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static constexpr arg_type const strip_by_value(index_type const arg)
+  {
+    return (arg_type)arg;
+  }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static constexpr value_type size(index_type const& arg) { return arg.size(); }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static constexpr value_type begin(index_type const& arg)
+  {
+    return arg.begin();
+  }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static constexpr value_type dim() { return DIM; }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static constexpr value_type num_elem()
+  {
+    return TENSOR_TYPE::s_dim_elem(DIM);
+  }
+};
+
+template<typename IDX,
+         typename TENSOR_TYPE,
+         camp::idx_t DIM,
+         IDX INDEX_VALUE,
+         strip_index_type_t<IDX> LENGTH_VALUE>
+struct TensorIndexTraits<RAJA::expt::StaticTensorIndex<
+    RAJA::expt::StaticTensorIndexInner<IDX,
+                                       TENSOR_TYPE,
+                                       DIM,
+                                       INDEX_VALUE,
+                                       LENGTH_VALUE>>>
+{
+  using base_type  = RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>;
+  using index_type = RAJA::expt::StaticTensorIndex<
+      RAJA::expt::StaticTensorIndexInner<IDX,
+                                         TENSOR_TYPE,
+                                         DIM,
+                                         INDEX_VALUE,
+                                         LENGTH_VALUE>>;
+  using arg_type   = IDX;
+  using value_type = strip_index_type_t<IDX>;
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static constexpr bool isTensorIndex() { return true; }
 
+  RAJA_INLINE
 
+  RAJA_HOST_DEVICE
+  static constexpr arg_type const strip_by_value(index_type const)
+  {
+    return INDEX_VALUE;
+  }
 
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static constexpr value_type size(index_type const&) { return LENGTH_VALUE; }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static constexpr value_type begin(index_type const&) { return INDEX_VALUE; }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static constexpr value_type dim() { return DIM; }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static constexpr value_type num_elem()
+  {
+    return TENSOR_TYPE::s_dim_elem(DIM);
+  }
+};
+
+/*
+ * Returns whether ARG is a tensor index type.
+ *
+ * For scalars (the unspecialized traits), always returns false.
+ *
+ * For TensorIndex and StaticTensorIndex types, returns true.
+ */
+template<typename ARG>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr bool isTensorIndex()
+{
+  return TensorIndexTraits<ARG>::isTensorIndex();
+}
+
+template<typename ARG>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr auto stripTensorIndex(ARG const& arg) ->
+    typename TensorIndexTraits<ARG>::arg_type const&
+{
+  return TensorIndexTraits<ARG>::strip(arg);
+}
+
+template<typename ARG>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr auto stripTensorIndexByValue(
+    ARG const arg) -> typename TensorIndexTraits<ARG>::arg_type const
+{
+  return TensorIndexTraits<ARG>::strip_by_value(arg);
+}
+
+/*
+ * Returns tensor dimension size of argument.
+ *
+ * For VectorIndex types, returns the number of vector lanes.
+ */
+template<typename ARG, typename IDX>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr IDX getTensorSize(ARG const& arg,
+                                                         IDX dim_size)
+{
+  return TensorIndexTraits<ARG>::size(arg) >= 0
+             ? IDX(TensorIndexTraits<ARG>::size(arg))
+             : dim_size;
+}
+
+/*
+ * Returns tensor dimension beginning index of an argument.
+ *
+ */
+template<typename ARG, typename IDX>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr IDX getTensorBegin(ARG const& arg,
+                                                          IDX dim_minval)
+{
+  return TensorIndexTraits<ARG>::begin(arg) >= 0
+             ? IDX(TensorIndexTraits<ARG>::begin(arg))
+             : dim_minval;
+}
+
+/*
+ * Returns vector dim of argument.
+ *
+ * For scalars, always returns 0.
+ *
+ * For VectorIndex types, returns the DIM argument.
+ * For vector_exec, this is always 0
+ *
+ * For matrices, DIM means:
+ *   0 : Row
+ *   1 : Column
+ */
+template<typename ARG>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr auto getTensorDim()
+    -> decltype(TensorIndexTraits<ARG>::dim())
+{
+  return TensorIndexTraits<ARG>::dim();
+}
+
+}  // namespace expt
+
+/*
+ * Lambda<N, Seg<X>>  overload that matches VectorIndex types, and properly
+ * includes the vector length with them
+ */
+template<typename IDX, typename TENSOR_TYPE, camp::idx_t DIM, camp::idx_t id>
+struct LambdaSegExtractor<RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>, id>
+{
+
+  template<typename Data>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr static RAJA::expt::
+      TensorIndex<IDX, TENSOR_TYPE, DIM>
+      extract(Data&& data)
+  {
+    return RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>(
+        camp::get<id>(data.segment_tuple)
+            .begin()[camp::get<id>(data.offset_tuple)],
+        camp::get<id>(data.vector_sizes));
+  }
+};
+
+/*
+ * LambdaOffsetExtractor specialization that matches TensorIndex types, and
+ * builds the index from the offset tuple together with the vector length
+ */
+template<typename IDX, typename TENSOR_TYPE, camp::idx_t DIM, camp::idx_t id>
+struct LambdaOffsetExtractor<RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>, id>
+{
 
-    // Helper that strips the Vector type from an argument
-    template<typename ARG>
-    struct TensorIndexTraits {
-        using arg_type = ARG;
-        using value_type = strip_index_type_t<ARG>;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        bool isTensorIndex(){
-          return false;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        arg_type const &strip(arg_type const &arg){
-          return arg;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        arg_type const strip_by_value(arg_type const arg){
-          return arg;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        value_type size(arg_type const &){
-          return 1;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        value_type begin(arg_type const &){
-          return 0;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        value_type dim(){
-          return 0;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        value_type num_elem(){
-          return 1;
-        }
-    };
-
-    template<typename IDX, typename TENSOR_TYPE, camp::idx_t DIM>
-    struct TensorIndexTraits<RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>> {
-        using index_type = RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>;
-        using arg_type = IDX;
-        using value_type = strip_index_type_t<IDX>;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        bool isTensorIndex(){
-          return true;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        arg_type const &strip(index_type const &arg){
-          return *arg;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        arg_type const strip_by_value(index_type const arg){
-          return (arg_type)arg;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        value_type size(index_type const &arg){
-          return arg.size();
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        value_type begin(index_type const &arg){
-          return arg.begin();
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        value_type dim(){
-          return DIM;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        value_type num_elem(){
-          return TENSOR_TYPE::s_dim_elem(DIM);
-        }
-    };
-
-
-
-
-    template<typename IDX, typename TENSOR_TYPE, camp::idx_t DIM, IDX INDEX_VALUE, strip_index_type_t<IDX> LENGTH_VALUE>
-    struct TensorIndexTraits<RAJA::expt::StaticTensorIndex<
-        RAJA::expt::StaticTensorIndexInner<IDX, TENSOR_TYPE, DIM, INDEX_VALUE, LENGTH_VALUE>
-    >> {
-        using base_type = RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>;
-        using index_type = RAJA::expt::StaticTensorIndex<RAJA::expt::StaticTensorIndexInner<IDX, TENSOR_TYPE, DIM, INDEX_VALUE, LENGTH_VALUE>>;
-        using arg_type = IDX;
-        using value_type = strip_index_type_t<IDX>;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        bool isTensorIndex(){
-          return true;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        arg_type const strip_by_value(index_type const){
-          return INDEX_VALUE;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        value_type size(index_type const &){
-          return LENGTH_VALUE;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        value_type begin(index_type const &){
-          return INDEX_VALUE;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        value_type dim(){
-          return DIM;
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static
-        constexpr
-        value_type num_elem(){
-          return TENSOR_TYPE::s_dim_elem(DIM);
-        }
-    };
-
-    /*
-     * Returns vector size of argument.
-     *
-     * For scalars, always returns 1.
-     *
-     * For VectorIndex types, returns the number of vector lanes.
-     */
-    template<typename ARG>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    constexpr
-    bool isTensorIndex()
-    {
-      return TensorIndexTraits<ARG>::isTensorIndex();
-    }
-
-    template<typename ARG>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    constexpr
-    auto stripTensorIndex(ARG const &arg) ->
-    typename TensorIndexTraits<ARG>::arg_type const &
-    {
-      return TensorIndexTraits<ARG>::strip(arg);
-    }
-
-
-    template<typename ARG>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    constexpr
-    auto stripTensorIndexByValue(ARG const arg) ->
-    typename TensorIndexTraits<ARG>::arg_type const
-    {
-      return TensorIndexTraits<ARG>::strip_by_value(arg);
-    }
-
-    /*
-     * Returns tensor dimension size of argument.
-     *
-     * For VectorIndex types, returns the number of vector lanes.
-     */
-    template<typename ARG, typename IDX>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    constexpr
-    IDX getTensorSize(ARG const &arg, IDX dim_size)
-    {
-      return TensorIndexTraits<ARG>::size(arg) >= 0 ?
-          IDX(TensorIndexTraits<ARG>::size(arg)) :
-          dim_size;
-    }
-
-    /*
-     * Returns tensor dimenson beginning index of an argument.
-     *
-     */
-    template<typename ARG, typename IDX>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    constexpr
-    IDX getTensorBegin(ARG const &arg, IDX dim_minval)
-    {
-      return TensorIndexTraits<ARG>::begin(arg) >= 0 ?
-          IDX(TensorIndexTraits<ARG>::begin(arg)) :
-          dim_minval;
-    }
-
-    /*
-     * Returns vector dim of argument.
-     *
-     * For scalars, always returns 0.
-     *
-     * For VectorIndex types, returns the DIM argument.
-     * For vector_exec, this is always 0
-     *
-     * For matrices, DIM means:
-     *   0 : Row
-     *   1 : Column
-     */
-    template<typename ARG>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    constexpr
-    auto getTensorDim() ->
-      decltype(TensorIndexTraits<ARG>::dim())
-    {
-      return TensorIndexTraits<ARG>::dim();
-    }
-
-} // namespace expt
-
-
-    /*
-     * Lambda<N, Seg<X>>  overload that matches VectorIndex types, and properly
-     * includes the vector length with them
-     */
-    template<typename IDX, typename TENSOR_TYPE, camp::idx_t DIM, camp::idx_t id>
-    struct LambdaSegExtractor<RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>, id>
-    {
-
-      template<typename Data>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      static RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM> extract(Data &&data)
-      {
-        return RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>(
-            camp::get<id>(data.segment_tuple).begin()[camp::get<id>(data.offset_tuple)],
-            camp::get<id>(data.vector_sizes));
-      }
-
-    };
-
-    /*
-     * Lambda<N, Seg<X>>  overload that matches VectorIndex types, and properly
-     * includes the vector length with them
-     */
-    template<typename IDX, typename TENSOR_TYPE, camp::idx_t DIM, camp::idx_t id>
-    struct LambdaOffsetExtractor<RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>, id>
-    {
-
-      template<typename Data>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      static RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM> extract(Data &&data)
-      {
-        return RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>(
-            IDX(camp::get<id>(data.offset_tuple)), // convert offset type to IDX
-            camp::get<id>(data.vector_sizes));
-      }
-
-    };
-
-} // namespace internal
+  template<typename Data>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr static RAJA::expt::
+      TensorIndex<IDX, TENSOR_TYPE, DIM>
+      extract(Data&& data)
+  {
+    return RAJA::expt::TensorIndex<IDX, TENSOR_TYPE, DIM>(
+        IDX(camp::get<id>(data.offset_tuple)),  // convert offset type to IDX
+        camp::get<id>(data.vector_sizes));
+  }
+};
+
+}  // namespace internal
 }  // namespace RAJA
 
 
diff --git a/include/RAJA/pattern/tensor/internal/TensorRef.hpp b/include/RAJA/pattern/tensor/internal/TensorRef.hpp
index 60e31f24b9..f8cf65cd0e 100644
--- a/include/RAJA/pattern/tensor/internal/TensorRef.hpp
+++ b/include/RAJA/pattern/tensor/internal/TensorRef.hpp
@@ -22,7 +22,6 @@
 
 #include "RAJA/util/macros.hpp"
 
-
 namespace RAJA
 {
 namespace internal
@@ -30,656 +29,750 @@ namespace internal
 namespace expt
 {
 
-    template<typename INT_SEQ>
-    struct StaticIndexArray;
-
-    template<typename INDEX_TYPE, INDEX_TYPE NEW_HEAD, typename ARRAY>
-    struct PrependStaticIndexArray;
-
-    template<typename INDEX_TYPE, size_t IDX, INDEX_TYPE DELTA, typename ARRAY >
-    struct AddStaticIndexArray;
-
-    template<typename INDEX_TYPE, size_t IDX, INDEX_TYPE DELTA, typename ARRAY >
-    struct SetStaticIndexArray;
-
-
-    template<typename INDEX_TYPE, INDEX_TYPE HEAD, INDEX_TYPE... TAIL>
-    struct StaticIndexArray<camp::int_seq<INDEX_TYPE,HEAD,TAIL...>> {
-        
-        using seq_type = camp::int_seq<INDEX_TYPE,HEAD,TAIL...>;
-        using Self = StaticIndexArray<camp::int_seq<INDEX_TYPE,HEAD,TAIL...>>;
-        using Tail = StaticIndexArray<camp::int_seq<INDEX_TYPE,TAIL...>>;
-
-        Tail tail;
-
-        RAJA_INLINE
-        StaticIndexArray<camp::int_seq<INDEX_TYPE,HEAD,TAIL...>>() = default;
-       
-	 
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        static constexpr INDEX_TYPE value_at(size_t index) {
-            if(index == 0){
-                return HEAD;
-            } else {
-                return Tail::value_at(index-1);
-            }
-        }
-
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        constexpr INDEX_TYPE operator[](size_t index) const {
-            if(index == 0){
-                return HEAD;
-            } else {
-                return tail[index-1];
-            }
-        }
-
-
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        void print_values() const {
-            printf("%ld ",(long)HEAD);
-            tail.print_values();
-        }
-
-
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        void print() const {
-            printf("[");
-            print_values();
-            printf("]");
-        }
-
-
-    };
-
-    template<typename INDEX_TYPE>
-    struct StaticIndexArray<camp::int_seq<INDEX_TYPE>>
-    {
-
-        using seq_type = camp::int_seq<INDEX_TYPE>;
+template<typename INT_SEQ>
+struct StaticIndexArray;
 
-        RAJA_INLINE
-        StaticIndexArray<camp::int_seq<INDEX_TYPE>>() = default;
+template<typename INDEX_TYPE, INDEX_TYPE NEW_HEAD, typename ARRAY>
+struct PrependStaticIndexArray;
 
+template<typename INDEX_TYPE, size_t IDX, INDEX_TYPE DELTA, typename ARRAY>
+struct AddStaticIndexArray;
 
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        static constexpr INDEX_TYPE value_at(size_t) {
-            return 0;
-        }
+template<typename INDEX_TYPE, size_t IDX, INDEX_TYPE DELTA, typename ARRAY>
+struct SetStaticIndexArray;
 
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        constexpr INDEX_TYPE operator[](size_t) const {
-            return 0;
-        }
+template<typename INDEX_TYPE, INDEX_TYPE HEAD, INDEX_TYPE... TAIL>
+struct StaticIndexArray<camp::int_seq<INDEX_TYPE, HEAD, TAIL...>>
+{
 
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        void print_values() const {}
+  using seq_type = camp::int_seq<INDEX_TYPE, HEAD, TAIL...>;
+  using Self     = StaticIndexArray<camp::int_seq<INDEX_TYPE, HEAD, TAIL...>>;
+  using Tail     = StaticIndexArray<camp::int_seq<INDEX_TYPE, TAIL...>>;
 
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        void print() const {
-            print("[]");
-        }
+  Tail tail;
 
-    };
+  RAJA_INLINE
+  StaticIndexArray<camp::int_seq<INDEX_TYPE, HEAD, TAIL...>>() = default;
 
-    template<typename INDEX_TYPE, INDEX_TYPE NEW_HEAD, INDEX_TYPE... ORIG_INTS>
-    struct PrependStaticIndexArray<INDEX_TYPE, NEW_HEAD, StaticIndexArray<camp::int_seq<INDEX_TYPE,ORIG_INTS...>>>
-    {
-        using Type = StaticIndexArray<camp::int_seq<INDEX_TYPE, NEW_HEAD, ORIG_INTS...>>;
-        using Seq  = camp::int_seq<INDEX_TYPE, NEW_HEAD, ORIG_INTS...>;
-    };
 
+  RAJA_HOST_DEVICE
 
+  RAJA_INLINE
+  static constexpr INDEX_TYPE value_at(size_t index)
+  {
+    if (index == 0)
+    {
+      return HEAD;
+    }
+    else
+    {
+      return Tail::value_at(index - 1);
+    }
+  }
 
+  RAJA_HOST_DEVICE
 
-    template<typename INDEX_TYPE, size_t IDX, INDEX_TYPE DELTA, INDEX_TYPE HEAD, INDEX_TYPE... TAIL>
-    struct AddStaticIndexArray<INDEX_TYPE, IDX, DELTA, StaticIndexArray<camp::int_seq<INDEX_TYPE,HEAD,TAIL...>>> 
+  RAJA_INLINE
+  constexpr INDEX_TYPE operator[](size_t index) const
+  {
+    if (index == 0)
     {
-        using Orig = StaticIndexArray<camp::int_seq<INDEX_TYPE,HEAD,TAIL...>>;
-        using AddTail = typename AddStaticIndexArray<INDEX_TYPE,IDX-1,DELTA,typename Orig::Tail>::Type;
-        using Type = typename PrependStaticIndexArray<INDEX_TYPE,HEAD,AddTail>::Type;
-        using Seq  = typename PrependStaticIndexArray<INDEX_TYPE,HEAD,AddTail>::Seq;
-    };
-
-    template<typename INDEX_TYPE, INDEX_TYPE DELTA, INDEX_TYPE HEAD, INDEX_TYPE... TAIL>
-    struct AddStaticIndexArray<INDEX_TYPE, 0, DELTA, StaticIndexArray<camp::int_seq<INDEX_TYPE,HEAD,TAIL...>>>
+      return HEAD;
+    }
+    else
     {
+      return tail[index - 1];
+    }
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  void print_values() const
+  {
+    printf("%ld ", (long)HEAD);
+    tail.print_values();
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  void print() const
+  {
+    printf("[");
+    print_values();
+    printf("]");
+  }
+};
+
+template<typename INDEX_TYPE>
+struct StaticIndexArray<camp::int_seq<INDEX_TYPE>>
+{
 
-        using Orig = StaticIndexArray<camp::int_seq<INDEX_TYPE,HEAD,TAIL...>>;
-        using Type = typename PrependStaticIndexArray<INDEX_TYPE,HEAD+DELTA,typename Orig::Tail>::Type;
-        using Seq  = typename PrependStaticIndexArray<INDEX_TYPE,HEAD+DELTA,typename Orig::Tail>::Seq;
-    };
+  using seq_type = camp::int_seq<INDEX_TYPE>;
 
+  RAJA_INLINE
+  StaticIndexArray<camp::int_seq<INDEX_TYPE>>() = default;
 
 
-    template<typename INDEX_TYPE, size_t IDX, INDEX_TYPE VALUE, INDEX_TYPE HEAD, INDEX_TYPE... TAIL>
-    struct SetStaticIndexArray<INDEX_TYPE, IDX, VALUE, StaticIndexArray<camp::int_seq<INDEX_TYPE,HEAD,TAIL...>>> 
-    {
-        using Orig    = StaticIndexArray<camp::int_seq<INDEX_TYPE,HEAD,TAIL...>>;
-        using SetTail = typename SetStaticIndexArray<INDEX_TYPE,IDX-1,VALUE,typename Orig::Tail>::Type;
-        using Type    = typename PrependStaticIndexArray<INDEX_TYPE,HEAD,SetTail>::Type;
-        using Seq     = typename PrependStaticIndexArray<INDEX_TYPE,HEAD,SetTail>::Seq;
-    };
-
-    template<typename INDEX_TYPE, INDEX_TYPE VALUE, INDEX_TYPE HEAD, INDEX_TYPE... TAIL>
-    struct SetStaticIndexArray<INDEX_TYPE, 0, VALUE, StaticIndexArray<camp::int_seq<INDEX_TYPE,HEAD,TAIL...>>>
-    {
-        using Orig = StaticIndexArray<camp::int_seq<INDEX_TYPE,HEAD,TAIL...>>;
-        using Type = typename PrependStaticIndexArray<INDEX_TYPE,VALUE,typename Orig::Tail>::Type;
-        using Seq  = typename PrependStaticIndexArray<INDEX_TYPE,VALUE,typename Orig::Tail>::Seq;
-    };
+  RAJA_HOST_DEVICE
 
+  RAJA_INLINE
+  static constexpr INDEX_TYPE value_at(size_t) { return 0; }
 
-    enum TensorTileSize
-    {
-      TENSOR_PARTIAL,  // the tile is a full TensorRegister
-      TENSOR_FULL,     // the tile is a partial TensorRegister
-      TENSOR_MULTIPLE  // the tile is multiple TennsorRegisters
-    };
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  constexpr INDEX_TYPE operator[](size_t) const { return 0; }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  void print_values() const {}
 
-    template<typename INDEX_TYPE, TensorTileSize TENSOR_SIZE, camp::idx_t NUM_DIMS>
-    struct TensorTile
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  void print() const { printf("[]"); }  // empty array: emit bare brackets
+};
+
+template<typename INDEX_TYPE, INDEX_TYPE NEW_HEAD, INDEX_TYPE... ORIG_INTS>
+struct PrependStaticIndexArray<
+    INDEX_TYPE,
+    NEW_HEAD,
+    StaticIndexArray<camp::int_seq<INDEX_TYPE, ORIG_INTS...>>>
+{
+  using Type =
+      StaticIndexArray<camp::int_seq<INDEX_TYPE, NEW_HEAD, ORIG_INTS...>>;
+  using Seq = camp::int_seq<INDEX_TYPE, NEW_HEAD, ORIG_INTS...>;
+};
+
+template<typename INDEX_TYPE,
+         size_t IDX,
+         INDEX_TYPE DELTA,
+         INDEX_TYPE HEAD,
+         INDEX_TYPE... TAIL>
+struct AddStaticIndexArray<
+    INDEX_TYPE,
+    IDX,
+    DELTA,
+    StaticIndexArray<camp::int_seq<INDEX_TYPE, HEAD, TAIL...>>>
+{
+  using Orig    = StaticIndexArray<camp::int_seq<INDEX_TYPE, HEAD, TAIL...>>;
+  using AddTail = typename AddStaticIndexArray<INDEX_TYPE,
+                                               IDX - 1,
+                                               DELTA,
+                                               typename Orig::Tail>::Type;
+  using Type =
+      typename PrependStaticIndexArray<INDEX_TYPE, HEAD, AddTail>::Type;
+  using Seq = typename PrependStaticIndexArray<INDEX_TYPE, HEAD, AddTail>::Seq;
+};
+
+template<typename INDEX_TYPE,
+         INDEX_TYPE DELTA,
+         INDEX_TYPE HEAD,
+         INDEX_TYPE... TAIL>
+struct AddStaticIndexArray<
+    INDEX_TYPE,
+    0,
+    DELTA,
+    StaticIndexArray<camp::int_seq<INDEX_TYPE, HEAD, TAIL...>>>
+{
+
+  using Orig = StaticIndexArray<camp::int_seq<INDEX_TYPE, HEAD, TAIL...>>;
+  using Type = typename PrependStaticIndexArray<INDEX_TYPE,
+                                                HEAD + DELTA,
+                                                typename Orig::Tail>::Type;
+  using Seq  = typename PrependStaticIndexArray<INDEX_TYPE,
+                                               HEAD + DELTA,
+                                               typename Orig::Tail>::Seq;
+};
+
+template<typename INDEX_TYPE,
+         size_t IDX,
+         INDEX_TYPE VALUE,
+         INDEX_TYPE HEAD,
+         INDEX_TYPE... TAIL>
+struct SetStaticIndexArray<
+    INDEX_TYPE,
+    IDX,
+    VALUE,
+    StaticIndexArray<camp::int_seq<INDEX_TYPE, HEAD, TAIL...>>>
+{
+  using Orig    = StaticIndexArray<camp::int_seq<INDEX_TYPE, HEAD, TAIL...>>;
+  using SetTail = typename SetStaticIndexArray<INDEX_TYPE,
+                                               IDX - 1,
+                                               VALUE,
+                                               typename Orig::Tail>::Type;
+  using Type =
+      typename PrependStaticIndexArray<INDEX_TYPE, HEAD, SetTail>::Type;
+  using Seq = typename PrependStaticIndexArray<INDEX_TYPE, HEAD, SetTail>::Seq;
+};
+
+template<typename INDEX_TYPE,
+         INDEX_TYPE VALUE,
+         INDEX_TYPE HEAD,
+         INDEX_TYPE... TAIL>
+struct SetStaticIndexArray<
+    INDEX_TYPE,
+    0,
+    VALUE,
+    StaticIndexArray<camp::int_seq<INDEX_TYPE, HEAD, TAIL...>>>
+{
+  using Orig = StaticIndexArray<camp::int_seq<INDEX_TYPE, HEAD, TAIL...>>;
+  using Type = typename PrependStaticIndexArray<INDEX_TYPE,
+                                                VALUE,
+                                                typename Orig::Tail>::Type;
+  using Seq  = typename PrependStaticIndexArray<INDEX_TYPE,
+                                               VALUE,
+                                               typename Orig::Tail>::Seq;
+};
+
+enum TensorTileSize
+{
+  TENSOR_PARTIAL,  // the tile is a partial TensorRegister
+  TENSOR_FULL,     // the tile is a full TensorRegister
+  TENSOR_MULTIPLE  // the tile is multiple TensorRegisters
+};
+
+template<typename INDEX_TYPE, TensorTileSize TENSOR_SIZE, camp::idx_t NUM_DIMS>
+struct TensorTile
+{
+  using self_type           = TensorTile<INDEX_TYPE, TENSOR_SIZE, NUM_DIMS>;
+  using nonstatic_self_type = self_type;
+  using index_type          = INDEX_TYPE;
+  index_type m_begin[NUM_DIMS];
+  index_type m_size[NUM_DIMS];
+
+  static constexpr camp::idx_t s_num_dims       = NUM_DIMS;
+  static constexpr TensorTileSize s_tensor_size = TENSOR_SIZE;
+
+  template<typename I, TensorTileSize S>
+  void copy(TensorTile<I, S, NUM_DIMS> const& c)
+  {
+    for (camp::idx_t i = 0; i < NUM_DIMS; ++i)
     {
-        using self_type = TensorTile<INDEX_TYPE, TENSOR_SIZE, NUM_DIMS>;
-        using nonstatic_self_type = self_type;
-        using index_type = INDEX_TYPE;
-        index_type m_begin[NUM_DIMS];
-        index_type m_size[NUM_DIMS];
-
-        static constexpr camp::idx_t s_num_dims = NUM_DIMS;
-        static constexpr TensorTileSize s_tensor_size = TENSOR_SIZE;
-
-
-        template<typename I, TensorTileSize S>
-        void copy(TensorTile<I, S, NUM_DIMS> const &c)
-        {
-          for(camp::idx_t i = 0;i < NUM_DIMS;++i){
-            m_begin[i] = c.m_begin[i];
-            m_size[i] = c.m_size[i];
-          }
-        }
-
-        /*!
-         * Subtract begin offsets of two tiles.
-         *
-         * The resulting tile has the sizes of the left operand, but has
-         * m_begin[i] = left.m_begin[i] - right.m_begin[i]
-         *
-         */
-        template<typename INDEX_TYPE2, TensorTileSize TENSOR_SIZE2>
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        self_type operator-(TensorTile<INDEX_TYPE2, TENSOR_SIZE2, NUM_DIMS> const &sub) const {
-          self_type result(*this);
-          for(camp::idx_t i = 0;i < s_num_dims; ++ i){
-            result.m_begin[i] -= sub.m_begin[i];
-          }
-          return result;
-        }
-
-
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        void print() const {
-          printf("TensorTile: dims=%d, m_begin=[",  (int)NUM_DIMS);
-
-          for(camp::idx_t i = 0;i < NUM_DIMS;++ i){
-            printf("%ld ", (long)m_begin[i]);
-          }
-
-          printf("], m_size=[");
-
-          for(camp::idx_t i = 0;i < NUM_DIMS;++ i){
-            printf("%ld ", (long)m_size[i]);
-          }
-
-          printf("]\n");
-        }
-    };
-
-
-
-
-    template< typename INDEX_TYPE, TensorTileSize TENSOR_SIZE, typename TBEGIN, typename TSIZE>
-    struct StaticTensorTile;
-
-    template< typename INDEX_TYPE,
-              TensorTileSize TENSOR_SIZE,
-              INDEX_TYPE... BeginInts,
-              INDEX_TYPE... SizeInts>
-    struct StaticTensorTile <
-              INDEX_TYPE,
-              TENSOR_SIZE,
-              camp::int_seq<INDEX_TYPE, BeginInts...>,
-              camp::int_seq<INDEX_TYPE, SizeInts...>>
+      m_begin[i] = c.m_begin[i];
+      m_size[i]  = c.m_size[i];
+    }
+  }
+
+  /*!
+   * Subtract begin offsets of two tiles.
+   *
+   * The resulting tile has the sizes of the left operand, but has
+   * m_begin[i] = left.m_begin[i] - right.m_begin[i]
+   *
+   */
+  template<typename INDEX_TYPE2, TensorTileSize TENSOR_SIZE2>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type
+  operator-(TensorTile<INDEX_TYPE2, TENSOR_SIZE2, NUM_DIMS> const& sub) const
+  {
+    self_type result(*this);
+    for (camp::idx_t i = 0; i < s_num_dims; ++i)
     {
+      result.m_begin[i] -= sub.m_begin[i];
+    }
+    return result;
+  }
 
+  RAJA_HOST_DEVICE
 
+  RAJA_INLINE
+  void print() const
+  {
+    printf("TensorTile: dims=%d, m_begin=[", (int)NUM_DIMS);
 
-        using begin_seq  = camp::int_seq<INDEX_TYPE, BeginInts...>;
-        using size_seq   = camp::int_seq<INDEX_TYPE, SizeInts... >;
-        using begin_type = StaticIndexArray<begin_seq>;
-        using size_type  = StaticIndexArray<size_seq >;
-        using self_type  = StaticTensorTile<INDEX_TYPE, TENSOR_SIZE, begin_seq,size_seq>;
-        using index_type = INDEX_TYPE;
+    for (camp::idx_t i = 0; i < NUM_DIMS; ++i)
+    {
+      printf("%ld ", (long)m_begin[i]);
+    }
 
-        using nonstatic_self_type = TensorTile<INDEX_TYPE,TENSOR_SIZE,sizeof...(BeginInts)>;
+    printf("], m_size=[");
 
-        using Partial = StaticTensorTile< INDEX_TYPE, TENSOR_PARTIAL, begin_seq, size_seq>; 
-        using Full    = StaticTensorTile< INDEX_TYPE, TENSOR_FULL   , begin_seq, size_seq>; 
+    for (camp::idx_t i = 0; i < NUM_DIMS; ++i)
+    {
+      printf("%ld ", (long)m_size[i]);
+    }
 
-        begin_type m_begin;
-        size_type  m_size;
+    printf("]\n");
+  }
+};
+
+
+template<typename INDEX_TYPE,
+         TensorTileSize TENSOR_SIZE,
+         typename TBEGIN,
+         typename TSIZE>
+struct StaticTensorTile;
+
+template<typename INDEX_TYPE,
+         TensorTileSize TENSOR_SIZE,
+         INDEX_TYPE... BeginInts,
+         INDEX_TYPE... SizeInts>
+struct StaticTensorTile<INDEX_TYPE,
+                        TENSOR_SIZE,
+                        camp::int_seq<INDEX_TYPE, BeginInts...>,
+                        camp::int_seq<INDEX_TYPE, SizeInts...>>
+{
 
-	static_assert(
-          sizeof...(BeginInts) == sizeof...(SizeInts),
-          "Mismatch between number of elements in Begin and Size series of StaticTensorTile"
-        );
 
-        static constexpr camp::idx_t s_num_dims = sizeof...(BeginInts);
-        static constexpr TensorTileSize s_tensor_size = TENSOR_SIZE;
+  using begin_seq  = camp::int_seq<INDEX_TYPE, BeginInts...>;
+  using size_seq   = camp::int_seq<INDEX_TYPE, SizeInts...>;
+  using begin_type = StaticIndexArray<begin_seq>;
+  using size_type  = StaticIndexArray<size_seq>;
+  using self_type =
+      StaticTensorTile<INDEX_TYPE, TENSOR_SIZE, begin_seq, size_seq>;
+  using index_type = INDEX_TYPE;
 
-        constexpr operator nonstatic_self_type() const {
-            return nonstatic_self_type { {BeginInts...}, {SizeInts...} };
-        }
+  using nonstatic_self_type =
+      TensorTile<INDEX_TYPE, TENSOR_SIZE, sizeof...(BeginInts)>;
 
-        constexpr nonstatic_self_type nonstatic() const {
-            return *this;
-        }
-        
-        template<TensorTileSize S>
-        constexpr void copy(StaticTensorTile<INDEX_TYPE, S, begin_seq, size_seq> const RAJA_UNUSED_ARG(&c)) const
-        {}
+  using Partial =
+      StaticTensorTile<INDEX_TYPE, TENSOR_PARTIAL, begin_seq, size_seq>;
+  using Full = StaticTensorTile<INDEX_TYPE, TENSOR_FULL, begin_seq, size_seq>;
 
+  begin_type m_begin;
+  size_type m_size;
 
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        void print() const {
-          printf("StaticTensorTile: dims=%d, m_begin=",  (int)s_num_dims);
+  static_assert(sizeof...(BeginInts) == sizeof...(SizeInts),
+                "Mismatch between number of elements in Begin and Size series "
+                "of StaticTensorTile");
 
-          m_begin.print();
+  static constexpr camp::idx_t s_num_dims       = sizeof...(BeginInts);
+  static constexpr TensorTileSize s_tensor_size = TENSOR_SIZE;
 
-          printf(", m_size=");
-          
-          m_size.print();
+  constexpr operator nonstatic_self_type() const
+  {
+    return nonstatic_self_type {{BeginInts...}, {SizeInts...}};
+  }
 
-          printf("\n");
-        }
-    };
+  constexpr nonstatic_self_type nonstatic() const { return *this; }
 
-        template< typename TILE, typename VALUE, size_t IDX>
-        struct SetStaticTensorTileBegin;
+  template<TensorTileSize S>
+  constexpr void copy(StaticTensorTile<INDEX_TYPE, S, begin_seq, size_seq> const
+                          RAJA_UNUSED_ARG(&c)) const
+  {}
 
-        template< typename INDEX_TYPE, TensorTileSize TENSOR_SIZE, typename TBEGIN, typename TSIZE, INDEX_TYPE VALUE, size_t IDX > 
-        struct SetStaticTensorTileBegin<
-              StaticTensorTile<INDEX_TYPE, TENSOR_SIZE, TBEGIN, TSIZE >,
-              camp::integral_constant<INDEX_TYPE,VALUE>,
-              IDX
-        > {
-            using BeginType = StaticIndexArray<TBEGIN>;
-            using Type = StaticTensorTile<
-                INDEX_TYPE,
-                TENSOR_SIZE,
-                typename SetStaticIndexArray<INDEX_TYPE,IDX,VALUE,BeginType>::Seq,
-                TSIZE
-            >;
-        };
+  RAJA_HOST_DEVICE
 
-        template< typename TILE, typename VALUE, size_t IDX>
-        struct SetStaticTensorTileSize;
+  RAJA_INLINE
+  void print() const
+  {
+    printf("StaticTensorTile: dims=%d, m_begin=", (int)s_num_dims);
 
-        template< typename INDEX_TYPE, TensorTileSize TENSOR_SIZE, typename TBEGIN, typename TSIZE, INDEX_TYPE VALUE, size_t IDX > 
-        struct SetStaticTensorTileSize<
-              StaticTensorTile<INDEX_TYPE, TENSOR_SIZE, TBEGIN, TSIZE >,
-              camp::integral_constant<INDEX_TYPE,VALUE>,
-              IDX
-        > {
-            using SizeType = StaticIndexArray<TSIZE>;
-            using Type = StaticTensorTile<
-                INDEX_TYPE,
-                TENSOR_SIZE,
-                TBEGIN,
-                typename SetStaticIndexArray<INDEX_TYPE,IDX,VALUE,SizeType>::Seq
-            >;
-        };
+    m_begin.print();
 
+    printf(", m_size=");
 
+    m_size.print();
 
+    printf("\n");
+  }
+};
 
-    template<typename POINTER_TYPE, typename INDEX_TYPE, TensorTileSize TENSOR_SIZE, camp::idx_t NUM_DIMS, camp::idx_t STRIDE_ONE_DIM = -1>
-    struct TensorRef
-    {
-        static constexpr camp::idx_t    s_stride_one_dim = STRIDE_ONE_DIM;
-        static constexpr camp::idx_t    s_num_dims       = NUM_DIMS;
-        static constexpr TensorTileSize s_tensor_size    = TENSOR_SIZE;
+template<typename TILE, typename VALUE, size_t IDX>
+struct SetStaticTensorTileBegin;
 
-        using self_type = TensorRef<POINTER_TYPE, INDEX_TYPE, TENSOR_SIZE, NUM_DIMS, STRIDE_ONE_DIM>;
-        using tile_type = TensorTile<INDEX_TYPE, TENSOR_SIZE, NUM_DIMS>;
-        using pointer_type = POINTER_TYPE;
-        using index_type = INDEX_TYPE;
-        
+template<typename INDEX_TYPE,
+         TensorTileSize TENSOR_SIZE,
+         typename TBEGIN,
+         typename TSIZE,
+         INDEX_TYPE VALUE,
+         size_t IDX>
+struct SetStaticTensorTileBegin<
+    StaticTensorTile<INDEX_TYPE, TENSOR_SIZE, TBEGIN, TSIZE>,
+    camp::integral_constant<INDEX_TYPE, VALUE>,
+    IDX>
+{
+  using BeginType = StaticIndexArray<TBEGIN>;
+  using Type      = StaticTensorTile<
+      INDEX_TYPE,
+      TENSOR_SIZE,
+      typename SetStaticIndexArray<INDEX_TYPE, IDX, VALUE, BeginType>::Seq,
+      TSIZE>;
+};
+
+template<typename TILE, typename VALUE, size_t IDX>
+struct SetStaticTensorTileSize;
+
+template<typename INDEX_TYPE,
+         TensorTileSize TENSOR_SIZE,
+         typename TBEGIN,
+         typename TSIZE,
+         INDEX_TYPE VALUE,
+         size_t IDX>
+struct SetStaticTensorTileSize<
+    StaticTensorTile<INDEX_TYPE, TENSOR_SIZE, TBEGIN, TSIZE>,
+    camp::integral_constant<INDEX_TYPE, VALUE>,
+    IDX>
+{
+  using SizeType = StaticIndexArray<TSIZE>;
+  using Type     = StaticTensorTile<
+      INDEX_TYPE,
+      TENSOR_SIZE,
+      TBEGIN,
+      typename SetStaticIndexArray<INDEX_TYPE, IDX, VALUE, SizeType>::Seq>;
+};
+
+template<typename POINTER_TYPE,
+         typename INDEX_TYPE,
+         TensorTileSize TENSOR_SIZE,
+         camp::idx_t NUM_DIMS,
+         camp::idx_t STRIDE_ONE_DIM = -1>
+struct TensorRef
+{
+  static constexpr camp::idx_t s_stride_one_dim = STRIDE_ONE_DIM;
+  static constexpr camp::idx_t s_num_dims       = NUM_DIMS;
+  static constexpr TensorTileSize s_tensor_size = TENSOR_SIZE;
 
-        pointer_type m_pointer;
-        index_type m_stride[NUM_DIMS];
-        tile_type m_tile;
+  using self_type    = TensorRef<POINTER_TYPE,
+                              INDEX_TYPE,
+                              TENSOR_SIZE,
+                              NUM_DIMS,
+                              STRIDE_ONE_DIM>;
+  using tile_type    = TensorTile<INDEX_TYPE, TENSOR_SIZE, NUM_DIMS>;
+  using pointer_type = POINTER_TYPE;
+  using index_type   = INDEX_TYPE;
 
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        void print() const {
-          printf("TensorRef: dims=%d, m_pointer=%p, m_stride=[", (int)NUM_DIMS, m_pointer);
 
-          for(camp::idx_t i = 0;i < NUM_DIMS;++ i){
-            printf("%ld ", (long)m_stride[i]);
-          }
+  pointer_type m_pointer;
+  index_type m_stride[NUM_DIMS];
+  tile_type m_tile;
 
-          printf("], stride_one_dim=%d\n", (int)STRIDE_ONE_DIM);
+  RAJA_HOST_DEVICE
 
-          m_tile.print();
-        }
+  RAJA_INLINE
+  void print() const
+  {
+    printf("TensorRef: dims=%d, m_pointer=%p, m_stride=[", (int)NUM_DIMS,
+           m_pointer);
 
-    };
+    for (camp::idx_t i = 0; i < NUM_DIMS; ++i)
+    {
+      printf("%ld ", (long)m_stride[i]);
+    }
 
+    printf("], stride_one_dim=%d\n", (int)STRIDE_ONE_DIM);
+
+    m_tile.print();
+  }
+};
+
+
+template<typename POINTER_TYPE,
+         typename INDEX_TYPE,
+         TensorTileSize TENSOR_SIZE,
+         typename STRIDE_TYPE,
+         typename BEGIN_TYPE,
+         typename SIZE_TYPE,
+         camp::idx_t STRIDE_ONE_DIM = -1>
+struct StaticTensorRef;
+
+template<typename POINTER_TYPE,
+         typename INDEX_TYPE,
+         TensorTileSize TENSOR_SIZE,
+         INDEX_TYPE... StrideInts,
+         INDEX_TYPE... BeginInts,
+         INDEX_TYPE... SizeInts,
+         camp::idx_t STRIDE_ONE_DIM>
+struct StaticTensorRef<POINTER_TYPE,
+                       INDEX_TYPE,
+                       TENSOR_SIZE,
+                       camp::int_seq<INDEX_TYPE, StrideInts...>,
+                       camp::int_seq<INDEX_TYPE, BeginInts...>,
+                       camp::int_seq<INDEX_TYPE, SizeInts...>,
+                       STRIDE_ONE_DIM>
+{
 
+  static constexpr camp::idx_t s_num_dims           = sizeof...(BeginInts);
+  static constexpr camp::idx_t s_stride_one_dim     = STRIDE_ONE_DIM;
+  static constexpr TensorTileSize s_ref_tensor_size = TENSOR_SIZE;
+  using pointer_type                                = POINTER_TYPE;
+  using index_type                                  = INDEX_TYPE;
 
-    template<typename POINTER_TYPE, typename INDEX_TYPE, TensorTileSize TENSOR_SIZE, typename STRIDE_TYPE, typename BEGIN_TYPE, typename SIZE_TYPE, camp::idx_t STRIDE_ONE_DIM = -1>
-    struct StaticTensorRef;
+  using stride_seq = camp::int_seq<INDEX_TYPE, StrideInts...>;
+  using begin_seq  = camp::int_seq<INDEX_TYPE, BeginInts...>;
+  using size_seq   = camp::int_seq<INDEX_TYPE, SizeInts...>;
 
-    template<typename POINTER_TYPE, typename INDEX_TYPE, TensorTileSize TENSOR_SIZE, INDEX_TYPE... StrideInts, INDEX_TYPE... BeginInts, INDEX_TYPE... SizeInts, camp::idx_t STRIDE_ONE_DIM>
-    struct StaticTensorRef<POINTER_TYPE,INDEX_TYPE,TENSOR_SIZE,camp::int_seq<INDEX_TYPE,StrideInts...>,camp::int_seq<INDEX_TYPE,BeginInts...>,camp::int_seq<INDEX_TYPE,SizeInts...>,STRIDE_ONE_DIM>
-    {
+  using stride_type = StaticIndexArray<stride_seq>;
 
-        static constexpr camp::idx_t    s_num_dims         = sizeof...(BeginInts);
-        static constexpr camp::idx_t    s_stride_one_dim   = STRIDE_ONE_DIM;
-        static constexpr TensorTileSize s_ref_tensor_size  = TENSOR_SIZE;
-        using pointer_type = POINTER_TYPE;
-        using index_type = INDEX_TYPE;
-        
-        using stride_seq = camp::int_seq<INDEX_TYPE, StrideInts...>;
-        using begin_seq  = camp::int_seq<INDEX_TYPE, BeginInts...>;
-        using size_seq   = camp::int_seq<INDEX_TYPE, SizeInts... >;
+  static_assert((sizeof...(BeginInts) == sizeof...(SizeInts)) &&
+                    (sizeof...(SizeInts) == sizeof...(StrideInts)),
+                "Mismatch between number of elements in Begin and Size series "
+                "of StaticTensorRef");
 
-        using stride_type  = StaticIndexArray<stride_seq>;
 
-	static_assert(
-          (sizeof...(BeginInts) == sizeof...(SizeInts)) && (sizeof...(SizeInts) == sizeof...(StrideInts)),
-          "Mismatch between number of elements in Begin and Size series of StaticTensorRef"
-        );
-        
+  using self_type = StaticTensorRef<POINTER_TYPE,
+                                    INDEX_TYPE,
+                                    TENSOR_SIZE,
+                                    stride_seq,
+                                    begin_seq,
+                                    size_seq>;
+  using tile_type =
+      StaticTensorTile<INDEX_TYPE, TENSOR_SIZE, begin_seq, size_seq>;
 
-        using self_type = StaticTensorRef<POINTER_TYPE,INDEX_TYPE,TENSOR_SIZE,stride_seq,begin_seq,size_seq>;
-        using tile_type = StaticTensorTile<INDEX_TYPE, TENSOR_SIZE, begin_seq, size_seq>;
 
+  pointer_type m_pointer;
+  stride_type m_stride;
+  tile_type m_tile;
 
-        pointer_type m_pointer;
-        stride_type m_stride;
-        tile_type m_tile;
+  RAJA_HOST_DEVICE
 
-        RAJA_HOST_DEVICE
-        RAJA_INLINE
-        void print() const {
-          printf("StaticTensorRef: dims=%d, m_pointer=%p, m_stride=", (int)s_num_dims, m_pointer);
-
-          m_stride.print();
-
-          printf(", stride_one_dim=%d\n", (int)STRIDE_ONE_DIM);
-
-          m_tile.print();
-        }
-
-    };
-
-
-
-
-    template<typename REF_TYPE, typename TILE_TYPE, typename DIM_SEQ>
-    struct MergeRefTile;
-
-    template<typename REF_TYPE, typename TILE_TYPE, camp::idx_t ... DIM_SEQ>
-    struct MergeRefTile <REF_TYPE, TILE_TYPE, camp::idx_seq<DIM_SEQ...>> {
-
-        static_assert( REF_TYPE::s_num_dims == TILE_TYPE::s_num_dims , "Merging a ref with a tile requires an equivalent number of dimensions.");
-
-        static constexpr camp::idx_t    s_num_dims         = REF_TYPE::s_num_dims;
-        static constexpr camp::idx_t    s_stride_one_dim   = REF_TYPE::s_stride_one_dim;
-        static constexpr TensorTileSize s_ref_tensor_size  = TILE_TYPE::s_tensor_size;
-        using pointer_type    = typename REF_TYPE::pointer_type;
-        using ref_index_type  = typename REF_TYPE::index_type;
-        
-        static constexpr TensorTileSize s_tile_tensor_size = TILE_TYPE::s_tensor_size;
-        using tile_index_type = typename TILE_TYPE::index_type;
-
-        using merge_type = TensorRef<pointer_type, tile_index_type, s_tile_tensor_size, s_num_dims, s_stride_one_dim>;
-        using shift_type = merge_type;
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static constexpr
-        merge_type merge(REF_TYPE const &ref, TILE_TYPE const &tile){
-          return merge_type{
-            ref.m_pointer,
-            {tile_index_type(ref.m_stride[DIM_SEQ])...},
-            tile
-          };
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static constexpr
-        shift_type shift_origin(REF_TYPE const &ref, TILE_TYPE const &tile_origin){
-          return shift_type{
-            ref.m_pointer - RAJA::sum<camp::idx_t>((tile_origin.m_begin[DIM_SEQ]*ref.m_stride[DIM_SEQ]) ...),
-            {tile_index_type(ref.m_stride[DIM_SEQ])...},
-            ref.m_tile
-          };
-        }
-
-    };
-
-
-
-
-
-
-
-    template<
-       typename POINTER_TYPE, typename INDEX_TYPE1, TensorTileSize RTENSOR_SIZE,
-       typename STRIDE, INDEX_TYPE1... BEGIN1, INDEX_TYPE1... SIZE1, camp::idx_t STRIDE_ONE_DIM,
-       typename INDEX_TYPE2, TensorTileSize TENSOR_SIZE, typename BEGIN2, typename SIZE2,
-       camp::idx_t ... DIM_SEQ
-    >
-    struct MergeRefTile<
-       StaticTensorRef<
-              POINTER_TYPE, INDEX_TYPE1, RTENSOR_SIZE,
-              STRIDE,
-              camp::int_seq<INDEX_TYPE1,BEGIN1...>,
-              camp::int_seq<INDEX_TYPE1,SIZE1...>,
-              STRIDE_ONE_DIM
-       >,
-       StaticTensorTile<
-              INDEX_TYPE2,
-              TENSOR_SIZE,
-              BEGIN2,
-              SIZE2
-       >,
-       camp::idx_seq<DIM_SEQ...>
-    > {
-
-        using ref_tile_type = StaticTensorTile<
-                  INDEX_TYPE1,
-                  RTENSOR_SIZE,
-                  camp::int_seq<INDEX_TYPE1,BEGIN1...>,
-                  camp::int_seq<INDEX_TYPE1, SIZE1...>
-              >;
-
-        using ref_type = StaticTensorRef<
-                  POINTER_TYPE,
-                  INDEX_TYPE1,
-                  RTENSOR_SIZE,
-                  STRIDE,
-                  camp::int_seq<INDEX_TYPE1,BEGIN1...>,
-                  camp::int_seq<INDEX_TYPE1, SIZE1...>,
-                  STRIDE_ONE_DIM
-              >;
-
-        using tile_type = StaticTensorTile<
-                  INDEX_TYPE2,
-                  TENSOR_SIZE,
-                  BEGIN2,
-                  SIZE2
-              >;
-
-        using ref_stride_type = typename ref_type ::stride_type;
-
-        using new_stride_seq  = camp::int_seq<INDEX_TYPE2,INDEX_TYPE2(ref_stride_type::value_at(DIM_SEQ))...>; 
-        
-        using shift_begin_seq = camp::int_seq<INDEX_TYPE2,INDEX_TYPE2(BEGIN1)...>; 
-        using shift_size_seq  = camp::int_seq<INDEX_TYPE2,INDEX_TYPE2(SIZE1)...>; 
-       
-        using shift_tile_type = StaticTensorTile<INDEX_TYPE2,TENSOR_SIZE,shift_begin_seq,shift_size_seq>;
- 
-        using new_stride_type = StaticIndexArray<new_stride_seq>; 
-
-        using merge_type = StaticTensorRef<
-                  POINTER_TYPE,
-                  INDEX_TYPE2,
-                  TENSOR_SIZE,
-                  new_stride_seq,
-                  BEGIN2,
-                  SIZE2,
-                  STRIDE_ONE_DIM
-              >;
-
-        using shift_type = StaticTensorRef<
-                  POINTER_TYPE,
-                  INDEX_TYPE2,
-                  TENSOR_SIZE,
-                  new_stride_seq,
-                  shift_begin_seq,
-                  shift_size_seq,
-                  STRIDE_ONE_DIM
-              >;
-
-
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static constexpr
-        merge_type merge(ref_type const &ref, tile_type const &tile){
-          return merge_type {
-            ref.m_pointer,
-            new_stride_type(),
-            tile
-          };
-        }
-
-        RAJA_INLINE
-        RAJA_HOST_DEVICE
-        static constexpr
-        shift_type shift_origin(ref_type const &ref, tile_type const &tile_origin){
-          return shift_type {
-            ref.m_pointer - RAJA::sum<camp::idx_t>((tile_origin.m_begin[DIM_SEQ]*ref.m_stride[DIM_SEQ]) ...),
-            new_stride_type(),
-            shift_tile_type()
-          };
-        }
-
-
-
-    };
-
-
-
-
-    template<typename REF_TYPE, typename TILE_TYPE>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    constexpr
-    auto merge_ref_tile(REF_TYPE const &ref, TILE_TYPE const &tile) ->
-      typename MergeRefTile<REF_TYPE, TILE_TYPE, camp::make_idx_seq_t<TILE_TYPE::s_num_dims>>::merge_type
-    {
-      return MergeRefTile<REF_TYPE, TILE_TYPE, camp::make_idx_seq_t<TILE_TYPE::s_num_dims>>::merge(ref, tile);
-    }
+  RAJA_INLINE
+  void print() const
+  {
+    printf("StaticTensorRef: dims=%d, m_pointer=%p, m_stride=", (int)s_num_dims,
+           m_pointer);
 
+    m_stride.print();
 
+    printf(", stride_one_dim=%d\n", (int)STRIDE_ONE_DIM);
 
-    /*!
-     * Modifies a ref's pointer so that the supplied tile_origin will resolve
-     * to the original pointer.
-     */
-    template<typename REF_TYPE, typename TILE_TYPE>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    constexpr
-    auto shift_tile_origin(REF_TYPE const &ref, TILE_TYPE const &tile_origin) ->
-      typename MergeRefTile<REF_TYPE, TILE_TYPE, camp::make_idx_seq_t<TILE_TYPE::s_num_dims>>::shift_type
-    {
-      return MergeRefTile<REF_TYPE, TILE_TYPE, camp::make_idx_seq_t<TILE_TYPE::s_num_dims>>::shift_origin(ref, tile_origin);
-    }
+    m_tile.print();
+  }
+};
 
 
+template<typename REF_TYPE, typename TILE_TYPE, typename DIM_SEQ>
+struct MergeRefTile;
 
-    /*!
-     * Changes TensorTile size type to FULL
-     */
-    template<typename INDEX_TYPE, TensorTileSize RTENSOR_SIZE, camp::idx_t NUM_DIMS>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    constexpr
-    TensorTile<INDEX_TYPE, TENSOR_FULL, NUM_DIMS> &
-    make_tensor_tile_full(TensorTile<INDEX_TYPE, RTENSOR_SIZE, NUM_DIMS> &tile){
-      return reinterpret_cast<TensorTile<INDEX_TYPE, TENSOR_FULL, NUM_DIMS> &>(tile);
-    }
+template<typename REF_TYPE, typename TILE_TYPE, camp::idx_t... DIM_SEQ>
+struct MergeRefTile<REF_TYPE, TILE_TYPE, camp::idx_seq<DIM_SEQ...>>
+{
 
-    /*!
-     * Changes TensorTile size type to PARTIAL
-     */
-    template<typename INDEX_TYPE, TensorTileSize RTENSOR_SIZE, camp::idx_t NUM_DIMS>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    constexpr
-    TensorTile<INDEX_TYPE, TENSOR_PARTIAL, NUM_DIMS> &
-    make_tensor_tile_partial(TensorTile<INDEX_TYPE, RTENSOR_SIZE, NUM_DIMS> &tile){
-      return reinterpret_cast<TensorTile<INDEX_TYPE, TENSOR_PARTIAL, NUM_DIMS> &>(tile);
-    }
+  static_assert(
+      REF_TYPE::s_num_dims == TILE_TYPE::s_num_dims,
+      "Merging a ref with a tile requires an equivalent number of dimensions.");
+
+  static constexpr camp::idx_t s_num_dims       = REF_TYPE::s_num_dims;
+  static constexpr camp::idx_t s_stride_one_dim = REF_TYPE::s_stride_one_dim;
+  static constexpr TensorTileSize s_ref_tensor_size = TILE_TYPE::s_tensor_size;
+  using pointer_type   = typename REF_TYPE::pointer_type;
+  using ref_index_type = typename REF_TYPE::index_type;
+
+  static constexpr TensorTileSize s_tile_tensor_size = TILE_TYPE::s_tensor_size;
+  using tile_index_type = typename TILE_TYPE::index_type;
+
+  using merge_type = TensorRef<pointer_type,
+                               tile_index_type,
+                               s_tile_tensor_size,
+                               s_num_dims,
+                               s_stride_one_dim>;
+  using shift_type = merge_type;
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static constexpr merge_type merge(REF_TYPE const& ref, TILE_TYPE const& tile)
+  {
+    return merge_type {
+        ref.m_pointer, {tile_index_type(ref.m_stride[DIM_SEQ])...}, tile};
+  }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static constexpr shift_type shift_origin(REF_TYPE const& ref,
+                                           TILE_TYPE const& tile_origin)
+  {
+    return shift_type {
+        ref.m_pointer - RAJA::sum<camp::idx_t>((tile_origin.m_begin[DIM_SEQ] *
+                                                ref.m_stride[DIM_SEQ])...),
+        {tile_index_type(ref.m_stride[DIM_SEQ])...},
+        ref.m_tile};
+  }
+};
+
+template<typename POINTER_TYPE,
+         typename INDEX_TYPE1,
+         TensorTileSize RTENSOR_SIZE,
+         typename STRIDE,
+         INDEX_TYPE1... BEGIN1,
+         INDEX_TYPE1... SIZE1,
+         camp::idx_t STRIDE_ONE_DIM,
+         typename INDEX_TYPE2,
+         TensorTileSize TENSOR_SIZE,
+         typename BEGIN2,
+         typename SIZE2,
+         camp::idx_t... DIM_SEQ>
+struct MergeRefTile<StaticTensorRef<POINTER_TYPE,
+                                    INDEX_TYPE1,
+                                    RTENSOR_SIZE,
+                                    STRIDE,
+                                    camp::int_seq<INDEX_TYPE1, BEGIN1...>,
+                                    camp::int_seq<INDEX_TYPE1, SIZE1...>,
+                                    STRIDE_ONE_DIM>,
+                    StaticTensorTile<INDEX_TYPE2, TENSOR_SIZE, BEGIN2, SIZE2>,
+                    camp::idx_seq<DIM_SEQ...>>
+{
 
+  using ref_tile_type = StaticTensorTile<INDEX_TYPE1,
+                                         RTENSOR_SIZE,
+                                         camp::int_seq<INDEX_TYPE1, BEGIN1...>,
+                                         camp::int_seq<INDEX_TYPE1, SIZE1...>>;
+
+  using ref_type = StaticTensorRef<POINTER_TYPE,
+                                   INDEX_TYPE1,
+                                   RTENSOR_SIZE,
+                                   STRIDE,
+                                   camp::int_seq<INDEX_TYPE1, BEGIN1...>,
+                                   camp::int_seq<INDEX_TYPE1, SIZE1...>,
+                                   STRIDE_ONE_DIM>;
+
+  using tile_type = StaticTensorTile<INDEX_TYPE2, TENSOR_SIZE, BEGIN2, SIZE2>;
+
+  using ref_stride_type = typename ref_type ::stride_type;
+
+  using new_stride_seq =
+      camp::int_seq<INDEX_TYPE2,
+                    INDEX_TYPE2(ref_stride_type::value_at(DIM_SEQ))...>;
+
+  using shift_begin_seq = camp::int_seq<INDEX_TYPE2, INDEX_TYPE2(BEGIN1)...>;
+  using shift_size_seq  = camp::int_seq<INDEX_TYPE2, INDEX_TYPE2(SIZE1)...>;
+
+  using shift_tile_type = StaticTensorTile<INDEX_TYPE2,
+                                           TENSOR_SIZE,
+                                           shift_begin_seq,
+                                           shift_size_seq>;
+
+  using new_stride_type = StaticIndexArray<new_stride_seq>;
+
+  using merge_type = StaticTensorRef<POINTER_TYPE,
+                                     INDEX_TYPE2,
+                                     TENSOR_SIZE,
+                                     new_stride_seq,
+                                     BEGIN2,
+                                     SIZE2,
+                                     STRIDE_ONE_DIM>;
+
+  using shift_type = StaticTensorRef<POINTER_TYPE,
+                                     INDEX_TYPE2,
+                                     TENSOR_SIZE,
+                                     new_stride_seq,
+                                     shift_begin_seq,
+                                     shift_size_seq,
+                                     STRIDE_ONE_DIM>;
+
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static constexpr merge_type merge(ref_type const& ref, tile_type const& tile)
+  {
+    return merge_type {ref.m_pointer, new_stride_type(), tile};
+  }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  static constexpr shift_type shift_origin(ref_type const& ref,
+                                           tile_type const& tile_origin)
+  {
+    return shift_type {
+        ref.m_pointer - RAJA::sum<camp::idx_t>((tile_origin.m_begin[DIM_SEQ] *
+                                                ref.m_stride[DIM_SEQ])...),
+        new_stride_type(), shift_tile_type()};
+  }
+};
+
+template<typename REF_TYPE, typename TILE_TYPE>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr auto merge_ref_tile(
+    REF_TYPE const& ref,
+    TILE_TYPE const& tile) ->
+    typename MergeRefTile<
+        REF_TYPE,
+        TILE_TYPE,
+        camp::make_idx_seq_t<TILE_TYPE::s_num_dims>>::merge_type
+{
+  return MergeRefTile<REF_TYPE, TILE_TYPE,
+                      camp::make_idx_seq_t<TILE_TYPE::s_num_dims>>::merge(ref,
+                                                                          tile);
+}
 
+/*!
+ * Modifies a ref's pointer so that the supplied tile_origin will resolve
+ * to the original pointer.
+ */
+template<typename REF_TYPE, typename TILE_TYPE>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr auto shift_tile_origin(
+    REF_TYPE const& ref,
+    TILE_TYPE const& tile_origin) ->
+    typename MergeRefTile<
+        REF_TYPE,
+        TILE_TYPE,
+        camp::make_idx_seq_t<TILE_TYPE::s_num_dims>>::shift_type
+{
+  return MergeRefTile<
+      REF_TYPE, TILE_TYPE,
+      camp::make_idx_seq_t<TILE_TYPE::s_num_dims>>::shift_origin(ref,
+                                                                 tile_origin);
+}
 
-    /*!
-     * Changes StaticTensorTile size type to FULL
-     */
-    template< typename INDEX_TYPE, TensorTileSize RTENSOR_SIZE, typename TBEGIN, typename TSIZE>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    constexpr
-    StaticTensorTile<INDEX_TYPE, TENSOR_FULL, TBEGIN, TSIZE> &
-    make_tensor_tile_full(StaticTensorTile<INDEX_TYPE, RTENSOR_SIZE, TBEGIN, TSIZE> &tile){
-      return reinterpret_cast<StaticTensorTile<INDEX_TYPE, TENSOR_FULL, TBEGIN, TSIZE> &>(tile);
-    }
+/*!
+ * Changes TensorTile size type to FULL
+ */
+template<typename INDEX_TYPE, TensorTileSize RTENSOR_SIZE, camp::idx_t NUM_DIMS>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr TensorTile<INDEX_TYPE,
+                                                  TENSOR_FULL,
+                                                  NUM_DIMS>&
+make_tensor_tile_full(TensorTile<INDEX_TYPE, RTENSOR_SIZE, NUM_DIMS>& tile)
+{
+  return reinterpret_cast<TensorTile<INDEX_TYPE, TENSOR_FULL, NUM_DIMS>&>(tile);
+}
 
-    /*!
-     * Changes StaticTensorTile size type to PARTIAL
-     */
-    template< typename INDEX_TYPE, TensorTileSize RTENSOR_SIZE, typename TBEGIN, typename TSIZE>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    constexpr
-    StaticTensorTile<INDEX_TYPE, TENSOR_PARTIAL, TBEGIN, TSIZE> &
-    make_tensor_tile_partial(StaticTensorTile<INDEX_TYPE, RTENSOR_SIZE, TBEGIN, TSIZE> &tile){
-      return reinterpret_cast<StaticTensorTile<INDEX_TYPE, TENSOR_PARTIAL, TBEGIN, TSIZE> &>(tile);
-    }
+/*!
+ * Changes TensorTile size type to PARTIAL
+ */
+template<typename INDEX_TYPE, TensorTileSize RTENSOR_SIZE, camp::idx_t NUM_DIMS>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr TensorTile<INDEX_TYPE,
+                                                  TENSOR_PARTIAL,
+                                                  NUM_DIMS>&
+make_tensor_tile_partial(TensorTile<INDEX_TYPE, RTENSOR_SIZE, NUM_DIMS>& tile)
+{
+  return reinterpret_cast<TensorTile<INDEX_TYPE, TENSOR_PARTIAL, NUM_DIMS>&>(
+      tile);
+}
+
+/*!
+ * Changes StaticTensorTile size type to FULL
+ */
+template<typename INDEX_TYPE,
+         TensorTileSize RTENSOR_SIZE,
+         typename TBEGIN,
+         typename TSIZE>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr StaticTensorTile<INDEX_TYPE,
+                                                        TENSOR_FULL,
+                                                        TBEGIN,
+                                                        TSIZE>&
+make_tensor_tile_full(
+    StaticTensorTile<INDEX_TYPE, RTENSOR_SIZE, TBEGIN, TSIZE>& tile)
+{
+  return reinterpret_cast<
+      StaticTensorTile<INDEX_TYPE, TENSOR_FULL, TBEGIN, TSIZE>&>(tile);
+}
 
+/*!
+ * Changes StaticTensorTile size type to PARTIAL
+ */
+template<typename INDEX_TYPE,
+         TensorTileSize RTENSOR_SIZE,
+         typename TBEGIN,
+         typename TSIZE>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr StaticTensorTile<INDEX_TYPE,
+                                                        TENSOR_PARTIAL,
+                                                        TBEGIN,
+                                                        TSIZE>&
+make_tensor_tile_partial(
+    StaticTensorTile<INDEX_TYPE, RTENSOR_SIZE, TBEGIN, TSIZE>& tile)
+{
+  return reinterpret_cast<
+      StaticTensorTile<INDEX_TYPE, TENSOR_PARTIAL, TBEGIN, TSIZE>&>(tile);
+}
 
 
-  } // namespace expt
-} // namespace internal
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/TensorRegisterBase.hpp b/include/RAJA/pattern/tensor/internal/TensorRegisterBase.hpp
index d2bce598ff..07d515a7db 100644
--- a/include/RAJA/pattern/tensor/internal/TensorRegisterBase.hpp
+++ b/include/RAJA/pattern/tensor/internal/TensorRegisterBase.hpp
@@ -34,815 +34,820 @@ namespace expt
 {
 
 
+namespace ET
+{
+class TensorExpressionConcreteBase;
+}  // namespace ET
+
+template<typename TENSOR, camp::idx_t DIM>
+struct TensorDimSize
+{
+  static constexpr camp::idx_t value = TENSOR::s_dim_size(DIM);
+};
+
+/*
+ * Tensor product helper class.
+ *
+ * This defines the default product operation between types when using the
+ * operator*
+ *
+ */
+template<typename LHS, typename RHS>
+struct TensorDefaultOperation
+{
+
+  using multiply_type = decltype(LHS().multiply(RHS()));
+
+  // default multiplication operator
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  static multiply_type multiply(LHS const& lhs, RHS const& rhs)
+  {
+    return lhs.multiply(rhs);
+  }
+};
+
+template<typename REF_TYPE>
+struct TensorRegisterStoreRef
+{
+  using self_type = TensorRegisterStoreRef<REF_TYPE>;
+  REF_TYPE m_ref;
+
+  RAJA_SUPPRESS_HD_WARN
+  template<typename RHS>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type operator=(RHS const& rhs)
+  {
 
+    rhs.store_ref(m_ref);
+    return *this;
+  }
+};
 
+template<camp::idx_t N, camp::idx_t D>
+struct DivideRoundUp
+{
+  static constexpr camp::idx_t value = (N % D) > 0 ? (1 + N / D) : (N / D);
+};
+
+class TensorRegisterConcreteBase
+{};
+
+/*!
+ * TensorRegister base class that provides some default behaviors and simplifies
+ * the implementation of new register types.
+ *
+ * This uses CRTP to provide static polymorphism
+ */
+template<typename Derived>
+class TensorRegisterBase;
+
+template<typename REGISTER_POLICY,
+         typename T,
+         typename LAYOUT,
+         typename camp::idx_t... SIZES>
+class TensorRegisterBase<
+    RAJA::expt::
+        TensorRegister<REGISTER_POLICY, T, LAYOUT, camp::idx_seq<SIZES...>>>
+    : public TensorRegisterConcreteBase
+{
+public:
+  using self_type = RAJA::expt::
+      TensorRegister<REGISTER_POLICY, T, LAYOUT, camp::idx_seq<SIZES...>>;
+  using element_type = camp::decay<T>;
+
+  static constexpr camp::idx_t s_num_dims = sizeof...(SIZES);
 
-  namespace ET
+  static constexpr camp::idx_t s_num_registers =
+      DivideRoundUp<RAJA::product<camp::idx_t>(SIZES...),
+                    RegisterTraits<REGISTER_POLICY, T>::s_num_elem>::value;
+
+  using index_type = camp::idx_t;
+
+  using register_type = RAJA::expt::Register<T, REGISTER_POLICY>;
+
+  using register_policy = REGISTER_POLICY;
+
+private:
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type* getThis() { return static_cast<self_type*>(this); }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  constexpr self_type const* getThis() const
   {
-    class TensorExpressionConcreteBase;
-  } // namespace ET
+    return static_cast<self_type const*>(this);
+  }
+
+protected:
+  register_type m_registers[s_num_registers];
 
+public:
+  RAJA_HOST_DEVICE
 
-  template<typename TENSOR, camp::idx_t DIM>
-  struct TensorDimSize{
-      static constexpr camp::idx_t value = TENSOR::s_dim_size(DIM);
-  };
+  RAJA_INLINE
+  constexpr TensorRegisterBase() {}
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  TensorRegisterBase(element_type c) { broadcast(c); }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  TensorRegisterBase(self_type const& c) { copy(c); }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  ~TensorRegisterBase() {}
 
   /*
-   * Tensor product helper class.
+   * Overload for:    assignment of ET to a TensorRegister
+   */
+  template<typename RHS,
+           typename std::enable_if<
+               std::is_base_of<ET::TensorExpressionConcreteBase, RHS>::value,
+               bool>::type = true>
+  RAJA_INLINE RAJA_HOST_DEVICE TensorRegisterBase(RHS const& rhs)
+  {
+    // evaluate a single tile of the ET, storing in this TensorRegister
+    *this = rhs.eval(self_type::s_get_default_tile());
+  }
+
+  template<typename... REGS>
+  explicit RAJA_HOST_DEVICE RAJA_INLINE TensorRegisterBase(register_type reg0,
+                                                           REGS const&... regs)
+      : m_registers {reg0, regs...}
+  {
+    static_assert(1 + sizeof...(REGS) == s_num_registers,
+                  "Incompatible number of registers");
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  static constexpr bool is_root() { return register_type::is_root(); }
+
+  template<typename REF_TYPE>
+  RAJA_HOST_DEVICE RAJA_INLINE static constexpr TensorRegisterStoreRef<REF_TYPE>
+  create_et_store_ref(REF_TYPE const& ref)
+  {
+    return TensorRegisterStoreRef<REF_TYPE> {ref};
+  }
+
+  RAJA_SUPPRESS_HD_WARN
+  template<typename REF_TYPE>
+  RAJA_HOST_DEVICE RAJA_INLINE static self_type s_load_ref(REF_TYPE const& ref)
+  {
+
+    self_type value;
+
+    value.load_ref(ref);
+    return value;
+  }
+
+  /*!
+   * Gets the size of the tensor
+   * Since this is a vector, just the length of the vector in dim 0
+   */
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  static constexpr int s_dim_elem(int dim)
+  {
+    return (dim == 0) ? self_type::s_num_elem : 0;
+  }
+
+  /*!
+   * Gets the default tile of this tensor
+   * That tile always starts at 0, and extends to the full tile sizes
+   */
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  static constexpr StaticTensorTile<int,
+                                    TENSOR_FULL,
+                                    camp::int_seq<int, int(SIZES * 0)...>,
+                                    camp::int_seq<int, int(SIZES)...>>
+  s_get_default_tile()
+  {
+    return StaticTensorTile<int, TENSOR_FULL,
+                            camp::int_seq<int, int(SIZES * 0)...>,
+                            camp::int_seq<int, int(SIZES)...>>();
+  }
+
+  /*!
+   * @brief convenience routine to allow Vector classes to use
+   * camp::sink() across a variety of register types, and use things like
+   * ternary operators
+   */
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  constexpr bool sink() const { return false; }
+
+  /*!
+   * Copy contents of another tensor
+   */
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& copy(self_type const& c)
+  {
+    for (camp::idx_t i = 0; i < s_num_registers; ++i)
+    {
+      m_registers[i] = c.vec(i);
+    }
+    return *getThis();
+  }
+
+  /*!
+   * Sets all elements to zero
+   */
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& clear()
+  {
+    for (camp::idx_t i = 0; i < s_num_registers; ++i)
+    {
+      m_registers[i] = register_type(0);
+    }
+
+
+    return *getThis();
+  }
+
+  /*!
+   * Broadcast a scalar value to every register of this tensor
+   */
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& broadcast(element_type v)
+  {
+    for (camp::idx_t i = 0; i < s_num_registers; ++i)
+    {
+      m_registers[i].broadcast(v);
+    }
+    return *getThis();
+  }
+
+  /*!
+   * @brief Broadcast scalar value to first N register elements
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& broadcast_n(element_type const& value, camp::idx_t N)
+  {
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      getThis()->set(value, i);
+    }
+    return *getThis();
+  }
+
+  /*!
+   * @brief Extracts a scalar value and broadcasts to a new register
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type get_and_broadcast(int i) const
+  {
+    self_type x;
+    x.broadcast(getThis()->get(i));
+    return x;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type add(self_type const& mat) const
+  {
+    self_type result;
+    for (camp::idx_t i = 0; i < s_num_registers; ++i)
+    {
+      result.vec(i) = m_registers[i].add(mat.vec(i));
+    }
+    return result;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type subtract(self_type const& mat) const
+  {
+    self_type result;
+    for (camp::idx_t i = 0; i < s_num_registers; ++i)
+    {
+      result.vec(i) = m_registers[i].subtract(mat.vec(i));
+    }
+    return result;
+  }
+
+  /*!
+   * element-wise multiplication
+   */
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type multiply(self_type const& x) const
+  {
+    self_type result;
+    for (camp::idx_t i = 0; i < s_num_registers; ++i)
+    {
+      result.vec(i) = m_registers[i].multiply(x.vec(i));
+    }
+    return result;
+  }
+
+  /*!
+   * element-wise fused multiply add
+   */
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type multiply_add(self_type const& x, self_type const& add) const
+  {
+    self_type result;
+    for (camp::idx_t i = 0; i < s_num_registers; ++i)
+    {
+      result.vec(i) = m_registers[i].multiply_add(x.vec(i), add.vec(i));
+    }
+    return result;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type divide(self_type const& mat) const
+  {
+    self_type result;
+    for (camp::idx_t reg = 0; reg < s_num_registers; ++reg)
+    {
+      result.vec(reg) = m_registers[reg].divide(mat.vec(reg));
+    }
+    return result;
+  }
+
+  /*!
+   * @brief Dot product of two vectors
+   * @param x Other vector to dot with this vector
+   * @return Value of (*this) dot x
+   */
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  element_type dot(self_type const& x) const
+  {
+    element_type result(0);
+
+    for (camp::idx_t reg = 0; reg < s_num_registers; ++reg)
+    {
+      result += m_registers[reg].multiply(x.vec(reg)).sum();
+    }
+
+    return result;
+  }
+
+  /*!
+   * @brief Set entire vector to a single scalar value
+   * @param value Value to set all vector elements to
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type const& operator=(element_type value)
+  {
+    getThis()->broadcast(value);
+    return *getThis();
+  }
+
+  /*!
+   * @brief Set entire vector to a single scalar value
+   * @param value Value to set all vector elements to
+   */
+  RAJA_SUPPRESS_HD_WARN
+  template<typename T2>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type const& operator=(
+      RAJA::expt::TensorRegister<RAJA::expt::scalar_register,
+                                 T2,
+                                 RAJA::expt::ScalarLayout,
+                                 camp::idx_seq<>> const& value)
+  {
+    getThis()->broadcast(value.get(0));
+    return *getThis();
+  }
+
+  /*!
+   * @brief Assign one register to another
+   * @param x Vector to copy
+   * @return Value of (*this)
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type const& operator=(self_type const& x)
+  {
+    getThis()->copy(x);
+    return *getThis();
+  }
+
+  /*!
+   * @brief Add two vector registers
+   * @param x Vector to add to this register
+   * @return Value of (*this)+x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type operator+(self_type const& x) const { return getThis()->add(x); }
+
+  /*!
+   * @brief Add a vector to this vector
+   * @param x Vector to add to this register
+   * @return Value of (*this)+x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& operator+=(self_type const& x)
+  {
+    *getThis() = getThis()->add(x);
+    return *getThis();
+  }
+
+  /*!
+   * @brief Add vector to a scalar
+   * @param x scalar to add to this register
+   * @return Value of (*this)+x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type operator+(element_type const& x) const { return getThis()->add(x); }
+
+  /*!
+   * @brief Add a scalar to this vector
+   * @param x scalar to add to this register
+   * @return Value of (*this)+x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& operator+=(element_type x)
+  {
+    *getThis() = getThis()->add(x);
+    return *getThis();
+  }
+
+  /*!
+   * @brief Negate the value of this vector
+   * @return Value of -(*this)
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type operator-() const { return self_type(0).subtract(*getThis()); }
+
+  /*!
+   * @brief Subtract two vector registers
+   * @param x Vector to subtract from this register
+   * @return Value of (*this)-x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type operator-(self_type const& x) const
+  {
+    return getThis()->subtract(x);
+  }
+
+  /*!
+   * @brief Subtract a vector from this vector
+   * @param x Vector to subtract from this register
+   * @return Value of (*this)-x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& operator-=(self_type const& x)
+  {
+    *getThis() = getThis()->subtract(x);
+    return *getThis();
+  }
+
+  /*!
+   * @brief Subtract scalar from this register
+   * @param x Scalar to subtract from this register
+   * @return Value of (*this)-x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type operator-(element_type const& x) const
+  {
+    return getThis()->subtract(x);
+  }
+
+  /*!
+   * @brief Subtract a scalar from this vector
+   * @param x Scalar to subtract from this register
+   * @return Value of (*this)-x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& operator-=(element_type const& x)
+  {
+    *getThis() = getThis()->subtract(x);
+    return *getThis();
+  }
+
+  /*!
+   * @brief Multiply two vector registers, element wise
+   * @param rhs Right-hand operand to multiply with this register
+   * @return Value of (*this)*rhs
+   */
+  template<typename RHS>
+  RAJA_HOST_DEVICE RAJA_INLINE
+      typename TensorDefaultOperation<self_type, RHS>::multiply_type
+      operator*(RHS const& rhs) const
+  {
+    return TensorDefaultOperation<self_type, RHS>::multiply(*getThis(), rhs);
+  }
+
+  /*!
+   * @brief Multiply a vector with this vector
+   * @param rhs Right-hand operand to multiply with this register
+   * @return Value of (*this)*rhs
+   */
+  template<typename RHS>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type& operator*=(RHS const& rhs)
+  {
+    *getThis() =
+        TensorDefaultOperation<self_type, RHS>::multiply(*getThis(), rhs);
+    return *getThis();
+  }
+
+  /*!
+   * @brief Divide two vector registers, element wise
+   * @param x Vector to divide this register by
+   * @return Value of (*this)/x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  self_type operator/(self_type const& x) const { return getThis()->divide(x); }
+
+  /*!
+   * @brief Divide this vector by another vector
+   * @param x Vector to divide by
+   * @return Value of (*this)/x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& operator/=(self_type const& x)
+  {
+    *getThis() = getThis()->divide(x);
+    return *getThis();
+  }
+
+  /*!
+   * @brief Divide by a scalar, element wise
+   * @param x Scalar to divide by
+   * @return Value of (*this)/x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  self_type operator/(element_type const& x) const
+  {
+    return getThis()->divide(x);
+  }
+
+  /*!
+   * @brief Divide this vector by a scalar
+   * @param x Scalar to divide by
+   * @return Value of (*this)/x
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& operator/=(element_type const& x)
+  {
+    *getThis() = getThis()->divide(x);
+    return *getThis();
+  }
+
+  /*!
+   * @brief Returns element wise minimum value tensor
+   */
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type vmin(self_type x) const
+  {
+    self_type result;
+    for (camp::idx_t i = 0; i < s_num_registers; ++i)
+    {
+      result.vec(i) = m_registers[i].vmin(x.vec(i));
+    }
+    return result;
+  }
+
+  /*!
+   * @brief Returns element wise maximum value tensor
+   */
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type vmax(self_type x) const
+  {
+    self_type result;
+    for (camp::idx_t i = 0; i < s_num_registers; ++i)
+    {
+      result.vec(i) = m_registers[i].vmax(x.vec(i));
+    }
+    return result;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  register_type& vec(int i) { return m_registers[i]; }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  constexpr register_type const& vec(int i) const { return m_registers[i]; }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  register_type& get_register(int reg) { return m_registers[reg]; }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  constexpr register_type const& get_register(int reg) const
+  {
+    return m_registers[reg];
+  }
+
+  /*!
+   * @brief Fused multiply subtract: fms(b, c) = (*this)*b-c
    *
-   * This defines the default product operation between types when using the
-   * operator*
+   * Derived types can override this to implement intrinsic FMS's
    *
+   * @param b Second product operand
+   * @param c Subtraction operand
+   * @return Value of (*this)*b-c
    */
-  template<typename LHS, typename RHS>
-  struct TensorDefaultOperation{
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
 
-      using multiply_type = decltype(LHS().multiply(RHS()));
+  RAJA_HOST_DEVICE
+  self_type multiply_subtract(self_type const& b, self_type const& c) const
+  {
+    return getThis()->multiply_add(b, -c);
+  }
 
-      // default multiplication operator
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      multiply_type multiply(LHS const &lhs, RHS const &rhs)
-      {
-        return lhs.multiply(rhs);
-      }
+  /*!
+   * Multiply this tensor by a scalar value
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
 
-  };
+  RAJA_HOST_DEVICE
+  self_type scale(element_type c) const
+  {
+    return getThis()->multiply(self_type(c));
+  }
 
+  /*!
+   * In-place add operation
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
 
-  template<typename REF_TYPE>
-  struct TensorRegisterStoreRef{
-      using self_type = TensorRegisterStoreRef<REF_TYPE>;
-      REF_TYPE m_ref;
+  RAJA_HOST_DEVICE
+  self_type& inplace_add(self_type x)
+  {
+    *getThis() = getThis()->add(x);
+    return *getThis();
+  }
 
-      RAJA_SUPPRESS_HD_WARN
-      template<typename RHS>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator=(RHS const &rhs)
-      {
+  /*!
+   * In-place subtract operation
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
 
-        rhs.store_ref(m_ref);
-        return *this;
-      }
-  };
+  RAJA_HOST_DEVICE
+  self_type& inplace_subtract(self_type x)
+  {
+    *getThis() = getThis()->subtract(x);
+    return *getThis();
+  }
 
-  template<camp::idx_t N, camp::idx_t D>
-  struct DivideRoundUp {
-      static constexpr camp::idx_t value =
-          (N % D) > 0 ? (1 + N/D) : (N/D);
-  };
+  /*!
+   * In-place multiply operation
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
 
+  RAJA_HOST_DEVICE
+  self_type& inplace_multiply(self_type x)
+  {
+    *getThis() = getThis()->multiply(x);
+    return *getThis();
+  }
 
-  class TensorRegisterConcreteBase {};
+  /*!
+   * In-place multiply-add operation
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  self_type& inplace_multiply_add(self_type x, self_type y)
+  {
+    *getThis() = getThis()->multiply_add(x, y);
+    return *getThis();
+  }
 
   /*!
-   * TensorRegister base class that provides some default behaviors and simplifies
-   * the implementation of new register types.
-   *
-   * This uses CRTP to provide static polymorphism
+   * In-place multiply-subtract operation
    */
-  template<typename Derived>
-  class TensorRegisterBase;
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
 
-  template<typename REGISTER_POLICY, typename T, typename LAYOUT, typename camp::idx_t ... SIZES>
-  class TensorRegisterBase<RAJA::expt::TensorRegister<REGISTER_POLICY, T, LAYOUT, camp::idx_seq<SIZES...>>> :
-    public TensorRegisterConcreteBase
+  RAJA_HOST_DEVICE
+  self_type& inplace_multiply_subtract(self_type x, self_type y)
   {
-    public:
-      using self_type = RAJA::expt::TensorRegister<REGISTER_POLICY, T, LAYOUT, camp::idx_seq<SIZES...>>;
-      using element_type = camp::decay<T>;
-
-      static constexpr camp::idx_t s_num_dims = sizeof...(SIZES);
+    *getThis() = getThis()->multiply_subtract(x, y);
+    return *getThis();
+  }
 
-      static constexpr camp::idx_t s_num_registers = DivideRoundUp<RAJA::product<camp::idx_t>(SIZES...), RegisterTraits<REGISTER_POLICY,T>::s_num_elem>::value;
-
-      using index_type = camp::idx_t;
-
-      using register_type = RAJA::expt::Register<T, REGISTER_POLICY>;
-
-      using register_policy = REGISTER_POLICY;
-
-    private:
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type *getThis(){
-        return static_cast<self_type *>(this);
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      self_type const *getThis() const{
-        return static_cast<self_type const *>(this);
-      }
-
-    protected:
-
-      register_type m_registers[s_num_registers];
-
-    public:
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      TensorRegisterBase(){}
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      TensorRegisterBase(element_type c)
-      {
-        broadcast(c);
-      }
-
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      TensorRegisterBase(self_type const &c)
-      {
-        copy(c);
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      ~TensorRegisterBase(){}
-
-
-      /*
-       * Overload for:    assignment of ET to a TensorRegister
-       */
-      template<typename RHS,
-        typename std::enable_if<std::is_base_of<ET::TensorExpressionConcreteBase, RHS>::value, bool>::type = true>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      TensorRegisterBase(RHS const &rhs)
-      {
-        // evaluate a single tile of the ET, storing in this TensorRegister
-        *this = rhs.eval(self_type::s_get_default_tile());
-      }
-
-
-      template<typename ... REGS>
-      explicit
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      TensorRegisterBase(register_type reg0, REGS const &... regs) :
-        m_registers{reg0, regs...}
-      {
-        static_assert(1+sizeof...(REGS) == s_num_registers,
-            "Incompatible number of registers");
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      constexpr
-      bool is_root() {
-        return register_type::is_root();
-      }
-
-
-      template<typename REF_TYPE>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      constexpr
-      TensorRegisterStoreRef<REF_TYPE>
-      create_et_store_ref(REF_TYPE const &ref) {
-        return TensorRegisterStoreRef<REF_TYPE>{ref};
-      }
-
-      RAJA_SUPPRESS_HD_WARN
-      template<typename REF_TYPE>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      self_type
-      s_load_ref(REF_TYPE const &ref) {
-
-        self_type value;
-
-        value.load_ref(ref);
-        return value;
-      }
-
-      /*!
-       * Gets the size of the tensor
-       * Since this is a vector, just the length of the vector in dim 0
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      constexpr int s_dim_elem(int dim){
-        return (dim==0) ? self_type::s_num_elem : 0;
-      }
-
-
-      /*!
-       * Gets the default tile of this tensor
-       * That tile always start at 0, and extends to the full tile sizes
-       */
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      constexpr StaticTensorTile<int, TENSOR_FULL, camp::int_seq<int,int(SIZES*0)...>, camp::int_seq<int,int(SIZES)...>>
-      s_get_default_tile()
-      {
-        return StaticTensorTile<int, TENSOR_FULL, camp::int_seq<int,int(SIZES*0)...>, camp::int_seq<int,int(SIZES)...>>();
-      }
-
-      /*!
-       * @brief convenience routine to allow Vector classes to use
-       * camp::sink() across a variety of register types, and use things like
-       * ternary operators
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      bool sink() const{
-        return false;
-      }
-
-
-
-
-
-
-      /*!
-       * Copy contents of another tensor
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &c){
-        for(camp::idx_t i = 0;i < s_num_registers;++ i){
-          m_registers[i] = c.vec(i);
-        }
-        return *getThis();
-      }
-
-
-
-
-      /*!
-       * Sets all elements to zero
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &clear(){
-        for(camp::idx_t i = 0;i < s_num_registers;++ i){
-          m_registers[i] = register_type(0);
-        }
-
-
-        return *getThis();
-      }
-
-
-      /*!
-       * Copy contents of another matrix operator
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type v){
-        for(camp::idx_t i = 0;i < s_num_registers;++ i){
-          m_registers[i].broadcast(v);
-        }
-        return *getThis();
-      }
-
-
-      /*!
-       * @brief Broadcast scalar value to first N register elements
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast_n(element_type const &value, camp::idx_t N){
-        for(camp::idx_t i = 0;i < N;++ i){
-          getThis()->set(value, i);
-        }
-        return *getThis();
-      }
-
-      /*!
-       * @brief Extracts a scalar value and broadcasts to a new register
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type get_and_broadcast(int i) const {
-        self_type x;
-        x.broadcast(getThis()->get(i));
-        return x;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &mat) const {
-        self_type result;
-        for(camp::idx_t i = 0;i < s_num_registers;++ i){
-          result.vec(i) = m_registers[i].add(mat.vec(i));
-        }
-        return result;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &mat) const {
-        self_type result;
-        for(camp::idx_t i = 0;i < s_num_registers;++ i){
-          result.vec(i) = m_registers[i].subtract(mat.vec(i));
-        }
-        return result;
-      }
-
-
-      /*!
-       * element-wise multiplication
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &x) const {
-        self_type result;
-        for(camp::idx_t i = 0;i < s_num_registers;++ i){
-          result.vec(i) = m_registers[i].multiply(x.vec(i));
-        }
-        return result;
-      }
-
-      /*!
-       * element-wise fused multiply add
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply_add(self_type const &x, self_type const &add) const {
-        self_type result;
-        for(camp::idx_t i = 0;i < s_num_registers;++ i){
-          result.vec(i) = m_registers[i].multiply_add(x.vec(i), add.vec(i));
-        }
-        return result;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &mat) const {
-        self_type result;
-        for(camp::idx_t reg = 0;reg < s_num_registers;++ reg){
-          result.vec(reg) = m_registers[reg].divide(mat.vec(reg));
-        }
-        return result;
-      }
-
-
-
-      /*!
-       * @brief Dot product of two vectors
-       * @param x Other vector to dot with this vector
-       * @return Value of (*this) dot x
-       */
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      element_type dot(self_type const &x) const
-      {
-        element_type result(0);
-
-        for(camp::idx_t reg = 0;reg < s_num_registers;++ reg){
-          result += m_registers[reg].multiply(x.vec(reg)).sum();
-        }
-
-        return result;
-      }
-
-
-      /*!
-       * @brief Set entire vector to a single scalar value
-       * @param value Value to set all vector elements to
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &operator=(element_type value)
-      {
-        getThis()->broadcast(value);
-        return *getThis();
-      }
-
-      /*!
-       * @brief Set entire vector to a single scalar value
-       * @param value Value to set all vector elements to
-       */
-      RAJA_SUPPRESS_HD_WARN
-      template<typename T2>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &operator=(RAJA::expt::TensorRegister<RAJA::expt::scalar_register, T2, RAJA::expt::ScalarLayout, camp::idx_seq<>> const &value)
-      {
-        getThis()->broadcast(value.get(0));
-        return *getThis();
-      }
-
-      /*!
-       * @brief Assign one register to antoher
-       * @param x Vector to copy
-       * @return Value of (*this)
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &operator=(self_type const &x)
-      {
-        getThis()->copy(x);
-        return *getThis();
-      }
-
-
-
-
-
-      /*!
-       * @brief Add two vector registers
-       * @param x Vector to add to this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator+(self_type const &x) const
-      {
-        return getThis()->add(x);
-      }
-
-
-      /*!
-       * @brief Add a vector to this vector
-       * @param x Vector to add to this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator+=(self_type const &x)
-      {
-        *getThis() = getThis()->add(x);
-        return *getThis();
-      }
-
-      /*!
-       * @brief Add vector to a scalar
-       * @param x scalar to add to this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator+(element_type const &x) const
-      {
-        return getThis()->add(x);
-      }
-
-
-      /*!
-       * @brief Add a scalar to this vector
-       * @param x scalar to add to this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator+=(element_type x)
-      {
-        *getThis() = getThis()->add(x);
-        return *getThis();
-      }
-
-      /*!
-       * @brief Negate the value of this vector
-       * @return Value of -(*this)
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator-() const
-      {
-        return self_type(0).subtract(*getThis());
-      }
-
-      /*!
-       * @brief Subtract two vector registers
-       * @param x Vector to subtract from this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator-(self_type const &x) const
-      {
-        return getThis()->subtract(x);
-      }
-
-      /*!
-       * @brief Subtract a vector from this vector
-       * @param x Vector to subtract from this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator-=(self_type const &x)
-      {
-        *getThis() = getThis()->subtract(x);
-        return *getThis();
-      }
-
-      /*!
-       * @brief Subtract scalar from this register
-       * @param x Vector to subtract from this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator-(element_type const &x) const
-      {
-        return getThis()->subtract(x);
-      }
-
-      /*!
-       * @brief Subtract a scalar from this vector
-       * @param x Vector to subtract from this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator-=(element_type const &x)
-      {
-        *getThis() = getThis()->subtract(x);
-        return *getThis();
-      }
-
-      /*!
-       * @brief Multiply two vector registers, element wise
-       * @param x Vector to subtract from this register
-       * @return Value of (*this)+x
-       */
-      template<typename RHS>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      typename TensorDefaultOperation<self_type, RHS>::multiply_type
-      operator*(RHS const &rhs) const
-      {
-        return TensorDefaultOperation<self_type, RHS>::multiply(*getThis(), rhs);
-      }
-
-      /*!
-       * @brief Multiply a vector with this vector
-       * @param x Vector to multiple with this register
-       * @return Value of (*this)+x
-       */
-      template<typename RHS>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator*=(RHS const &rhs)
-      {
-        *getThis() = TensorDefaultOperation<self_type, RHS>::multiply(*getThis(), rhs);
-        return *getThis();
-      }
-
-      /*!
-       * @brief Divide two vector registers, element wise
-       * @param x Vector to subtract from this register
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type operator/(self_type const &x) const
-      {
-        return getThis()->divide(x);
-      }
-
-      /*!
-       * @brief Divide this vector by another vector
-       * @param x Vector to divide by
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator/=(self_type const &x)
-      {
-        *getThis() = getThis()->divide(x);
-        return *getThis();
-      }
-
-
-      /*!
-       * @brief Divide by a scalar, element wise
-       * @param x Scalar to divide by
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type operator/(element_type const &x) const
-      {
-        return getThis()->divide(x);
-      }
-
-      /*!
-       * @brief Divide this vector by another vector
-       * @param x Scalar to divide by
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator/=(element_type const &x)
-      {
-        *getThis() = getThis()->divide(x);
-        return *getThis();
-      }
-
-
-      /*!
-       * @brief Returns element wise minimum value tensor
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type vmin(self_type x) const {
-        self_type result;
-        for(camp::idx_t i = 0;i < s_num_registers;++ i){
-          result.vec(i) = m_registers[i].vmin(x.vec(i));
-        }
-        return result;
-      }
-
-
-      /*!
-       * @brief Returns element wise maximum value tensor
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type vmax(self_type x) const {
-        self_type result;
-        for(camp::idx_t i = 0;i < s_num_registers;++ i){
-          result.vec(i) = m_registers[i].vmax(x.vec(i));
-        }
-        return result;
-      }
-
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      register_type &vec(int i){
-        return m_registers[i];
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      register_type const &vec(int i) const{
-        return m_registers[i];
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      register_type &get_register(int reg){
-        return m_registers[reg];
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      register_type const &get_register(int reg) const{
-        return m_registers[reg];
-      }
-
-
-
-      /*!
-       * @brief Fused multiply subtract: fms(b, c) = (*this)*b-c
-       *
-       * Derived types can override this to implement intrinsic FMS's
-       *
-       * @param b Second product operand
-       * @param c Subtraction operand
-       * @return Value of (*this)*b-c
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_subtract(self_type const &b, self_type const &c) const
-      {
-        return getThis()->multiply_add(b, -c);
-      }
-
-      /*!
-       * Multiply this tensor by a scalar value
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type scale(element_type c) const
-      {
-        return getThis()->multiply(self_type(c));
-      }
-
-
-      /*!
-       * In-place add operation
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type &inplace_add(self_type x){
-        *getThis() = getThis()->add(x);
-        return *getThis();
-      }
-
-      /*!
-       * In-place sbutract operation
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type &inplace_subtract(self_type x){
-        *getThis() = getThis()->subtract(x);
-        return *getThis();
-      }
-
-      /*!
-       * In-place multiply operation
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type &inplace_multiply(self_type x){
-        *getThis() = getThis()->multiply(x);
-        return *getThis();
-      }
-
-      /*!
-       * In-place multiply-add operation
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type &inplace_multiply_add(self_type x, self_type y){
-        *getThis() = getThis()->multiply_add(x,y);
-        return *getThis();
-      }
-
-      /*!
-       * In-place multiply-subtract operation
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type &inplace_multiply_subtract(self_type x, self_type y){
-        *getThis() = getThis()->multiply_subtract(x,y);
-        return *getThis();
-      }
-
-      /*!
-       * In-place divide operation
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type &inplace_divide(self_type x){
-        *getThis() = getThis()->divide(x);
-        return *getThis();
-      }
-
-      /*!
-       * In-place scaling operation
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type &inplace_scale(element_type x){
-        *getThis() = getThis()->scale(x);
-        return *getThis();
-      }
-
-  };
-
-} //namespace internal
-
-} // namespace expt
+  /*!
+   * In-place divide operation
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
 
-}  // namespace RAJA
+  RAJA_HOST_DEVICE
+  self_type& inplace_divide(self_type x)
+  {
+    *getThis() = getThis()->divide(x);
+    return *getThis();
+  }
 
+  /*!
+   * In-place scaling operation
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  self_type& inplace_scale(element_type x)
+  {
+    *getThis() = getThis()->scale(x);
+    return *getThis();
+  }
+};
+
+}  // namespace expt
+
+}  // namespace internal
+
+}  // namespace RAJA
 
 
 #endif
diff --git a/include/RAJA/pattern/tensor/internal/TensorTileExec.hpp b/include/RAJA/pattern/tensor/internal/TensorTileExec.hpp
index 3899a97118..6a1035adea 100644
--- a/include/RAJA/pattern/tensor/internal/TensorTileExec.hpp
+++ b/include/RAJA/pattern/tensor/internal/TensorTileExec.hpp
@@ -33,345 +33,351 @@ namespace expt
 {
 
 
+template<typename STORAGE, typename DIM_SEQ, typename IDX_SEQ>
+struct StaticTensorTileExec;
 
-    template<typename STORAGE, typename DIM_SEQ, typename IDX_SEQ>
-    struct StaticTensorTileExec;
+template<typename STORAGE, typename DIM_SEQ>
+struct TensorTileExec;
 
-    template<typename STORAGE, typename DIM_SEQ>
-    struct TensorTileExec;
-
-    /**
-     * Implement a dimension tiling loop
-     */
-    template<typename STORAGE, camp::idx_t DIM0, camp::idx_t ... DIM_REST>
-    struct TensorTileExec<STORAGE, camp::idx_seq<DIM0, DIM_REST...>>{
-
-      using inner_t = TensorTileExec<STORAGE, camp::idx_seq<DIM_REST...>>;
-
-      template<typename OTILE, typename TTYPE, typename BODY>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      void exec(OTILE const &otile, TTYPE &tile, BODY && body){
+/**
+ * Implement a dimension tiling loop
+ */
+template<typename STORAGE, camp::idx_t DIM0, camp::idx_t... DIM_REST>
+struct TensorTileExec<STORAGE, camp::idx_seq<DIM0, DIM_REST...>>
+{
 
-        auto const orig_begin = otile.m_begin[DIM0];
-        auto const orig_size =  otile.m_size[DIM0];
+  using inner_t = TensorTileExec<STORAGE, camp::idx_seq<DIM_REST...>>;
 
-        // Do the full tile sizes
-        for(tile.m_begin[DIM0] = orig_begin;
+  template<typename OTILE, typename TTYPE, typename BODY>
+  RAJA_HOST_DEVICE RAJA_INLINE static void exec(OTILE const& otile,
+                                                TTYPE& tile,
+                                                BODY&& body)
+  {
 
-            tile.m_begin[DIM0] +  STORAGE::s_dim_elem(DIM0) <=
-                orig_begin+orig_size;
+    auto const orig_begin = otile.m_begin[DIM0];
+    auto const orig_size  = otile.m_size[DIM0];
 
-            tile.m_begin[DIM0] += STORAGE::s_dim_elem(DIM0)){
+    // Do the full tile sizes
+    for (tile.m_begin[DIM0] = orig_begin;
 
-          // Do the next inner tiling loop
-          inner_t::exec(otile, tile, body);
+         tile.m_begin[DIM0] + STORAGE::s_dim_elem(DIM0) <=
+         orig_begin + orig_size;
 
-        }
+         tile.m_begin[DIM0] += STORAGE::s_dim_elem(DIM0))
+    {
 
-        // Postamble if needed
-        if(tile.m_begin[DIM0] <
-            orig_begin + orig_size)
-        {
+      // Do the next inner tiling loop
+      inner_t::exec(otile, tile, body);
+    }
 
-          // convert tile to a partial tile
-          auto &part_tile = make_tensor_tile_partial(tile);
+    // Postamble if needed
+    if (tile.m_begin[DIM0] < orig_begin + orig_size)
+    {
 
-          // store original size
-          auto tmp_size = part_tile.m_size[DIM0];
+      // convert tile to a partial tile
+      auto& part_tile = make_tensor_tile_partial(tile);
 
-          // set tile size to the remainder
-          part_tile.m_size[DIM0] =
-              orig_begin +
-              orig_size -
-              tile.m_begin[DIM0];
+      // store original size
+      auto tmp_size = part_tile.m_size[DIM0];
 
-          // Do the next inner tiling loop
-          inner_t::exec(otile, part_tile, body);
+      // set tile size to the remainder
+      part_tile.m_size[DIM0] = orig_begin + orig_size - tile.m_begin[DIM0];
 
-          // restore size
-          part_tile.m_size[DIM0] = tmp_size;
-        }
+      // Do the next inner tiling loop
+      inner_t::exec(otile, part_tile, body);
 
-        // reset tile dimension
-        tile.m_begin[DIM0] = orig_begin;
+      // restore size
+      part_tile.m_size[DIM0] = tmp_size;
+    }
 
-      }
+    // reset tile dimension
+    tile.m_begin[DIM0] = orig_begin;
+  }
 
+  template<typename OTILE, typename TTYPE, typename BODY>
+  RAJA_HOST_DEVICE RAJA_INLINE static void static_exec(OTILE const& otile,
+                                                       TTYPE const& tile,
+                                                       BODY&& body)
+  {
 
 
-      template<
-          typename OTILE,
-          typename TTYPE,
-          typename BODY
-      >
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      void
-      static_exec(
-          OTILE const &otile,
-          TTYPE const &tile,
-          BODY && body
-      ){
+    auto constexpr orig_begin = OTILE::begin_type::value_at(DIM0);
+    auto constexpr orig_size  = OTILE::size_type::value_at(DIM0);
 
+    auto constexpr tile_begin = TTYPE::begin_type::value_at(DIM0);
 
-        auto constexpr orig_begin = OTILE::begin_type::value_at(DIM0);
-        auto constexpr orig_size =  OTILE:: size_type::value_at(DIM0);
+    auto constexpr step_size = STORAGE::s_dim_elem(DIM0);
 
-        auto constexpr tile_begin = TTYPE::begin_type::value_at(DIM0);
+    auto constexpr iter_count =
+        (tile_begin >= orig_begin) && (tile_begin < (orig_begin + orig_size))
+            ? ((orig_begin + orig_size) - tile_begin + step_size - 1) /
+                  step_size
+            : 0;
 
-        auto constexpr step_size  = STORAGE::s_dim_elem(DIM0);
 
-        auto constexpr iter_count =
-               (tile_begin >= orig_begin) && (tile_begin < (orig_begin+orig_size))
-                 ? ((orig_begin + orig_size) - tile_begin + step_size - 1) / step_size
-                 : 0;
+    using IterCount =
+        camp::integral_constant<typename TTYPE::index_type, iter_count>;
+    using DimSeq = camp::idx_seq<DIM0, DIM_REST...>;
+    using IdxSeq = typename camp::detail::gen_seq<typename TTYPE::index_type,
+                                                  IterCount>::type;
 
+    StaticTensorTileExec<STORAGE, DimSeq, IdxSeq>::exec(otile, tile, body);
+  }
+};
 
-        using IterCount = camp::integral_constant<typename TTYPE::index_type,iter_count>;
-        using DimSeq = camp::idx_seq<DIM0,DIM_REST...>;
-        using IdxSeq = typename camp::detail::gen_seq<typename TTYPE::index_type,IterCount>::type;
+/**
+ * Termination of nested loop:  execute evaluation of ET
+ */
+template<typename STORAGE>
+struct TensorTileExec<STORAGE, camp::idx_seq<>>
+{
 
-        StaticTensorTileExec<STORAGE,DimSeq,IdxSeq>::exec(otile,tile,body);
-        
-      }
+  template<typename OTILE, typename TTYPE, typename BODY>
+  RAJA_HOST_DEVICE RAJA_INLINE static void exec(OTILE&,
+                                                TTYPE const& tile,
+                                                BODY&& body)
+  {
+
+    // execute body, passing in the current tile
+    body(tile);
+  }
+
+  template<typename OTILE, typename TTYPE, typename BODY>
+  RAJA_HOST_DEVICE RAJA_INLINE static void static_exec(OTILE const&,
+                                                       TTYPE const& tile,
+                                                       BODY&& body)
+  {
+
+    // execute body, passing in the current tile
+    body(tile);
+  }
+};
+
+template<typename STORAGE,
+         typename TILE_TYPE,
+         typename BODY,
+         camp::idx_t... IDX_SEQ,
+         camp::idx_t... DIM_SEQ>
+RAJA_INLINE RAJA_HOST_DEVICE void tensorTileExec_expanded(
+    TILE_TYPE const& orig_tile,
+    BODY&& body,
+    camp::idx_seq<IDX_SEQ...> const&,
+    camp::idx_seq<DIM_SEQ...> const&)
+{
 
+  // tile over full rows and columns
+  // tile_type tile{{0,0},{row_tile_size, col_tile_size}};
+  TILE_TYPE tile {
+      {orig_tile.m_begin[IDX_SEQ]...},
+      {STORAGE::s_dim_elem(IDX_SEQ)...},
+  };
 
 
-    };
+  // Promote the tile type to a "full-tile" so that the full-element
+  // register operations are used.
+  // Any of the tiling loops can demote this to a partial-tile when
+  // they do postamble execution
+  auto& full_tile = make_tensor_tile_full(tile);
 
+  // Do all of the tiling loops in layout order, this may improve
+  // cache performance
+  using layout_order       = typename STORAGE::layout_type::seq_t;
+  using tensor_tile_exec_t = TensorTileExec<STORAGE, layout_order>;
 
-    /**
-     * Termination of nested loop:  execute evaluation of ET
-     */
-    template<typename STORAGE>
-    struct TensorTileExec<STORAGE, camp::idx_seq<>>{
 
-      template<typename OTILE, typename TTYPE, typename BODY>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      void exec(OTILE &, TTYPE const &tile, BODY && body){
+  tensor_tile_exec_t::exec(orig_tile, full_tile, body);
+}
 
-        // execute body, passing in the current tile
-        body(tile);
 
-      }
+template<typename STORAGE, typename DIM_SEQ, typename IDX_SEQ>
+struct StaticTensorTileExec;
 
-      template<typename OTILE, typename TTYPE, typename BODY>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      void static_exec(OTILE const &, TTYPE const &tile, BODY && body){
+/**
+ * Implement a dimension tiling loop
+ */
 
-        // execute body, passing in the current tile
-        body(tile);
+template<typename STORAGE,
+         camp::idx_t DIM0,
+         camp::idx_t... DIM_REST,
+         camp::idx_t IDX,
+         camp::idx_t... IDX_REST>
+struct StaticTensorTileExec<STORAGE,
+                            camp::idx_seq<DIM0, DIM_REST...>,
+                            camp::idx_seq<IDX, IDX_REST...>>
+{
 
-      }
+  using DimList = camp::idx_seq<DIM0, DIM_REST...>;
+  using DimTail = camp::idx_seq<DIM_REST...>;
+  using IdxList = camp::idx_seq<IDX, IDX_REST...>;
+  using IdxTail = camp::idx_seq<IDX_REST...>;
 
-    };
+  using DownExec = TensorTileExec<STORAGE, camp::idx_seq<DIM_REST...>>;
+  using NextExec = StaticTensorTileExec<STORAGE,
+                                        camp::idx_seq<DIM0, DIM_REST...>,
+                                        camp::idx_seq<IDX_REST...>>;
 
+  static auto const step_size = STORAGE::s_dim_elem(DIM0);
 
+  template<typename OTILE, typename TTYPE, typename BODY>
+  RAJA_HOST_DEVICE RAJA_INLINE static void exec(OTILE const& otile,
+                                                TTYPE const& tile,
+                                                BODY&& body)
+  {
 
-    template<typename STORAGE, typename TILE_TYPE, typename BODY, camp::idx_t ... IDX_SEQ, camp::idx_t ... DIM_SEQ>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    void tensorTileExec_expanded(TILE_TYPE const &orig_tile, BODY && body, camp::idx_seq<IDX_SEQ...> const &, camp::idx_seq<DIM_SEQ...> const &)
-    {
+    auto constexpr orig_begin = OTILE::begin_type::value_at(DIM0);
+    auto constexpr orig_size  = OTILE::size_type::value_at(DIM0);
 
-      // tile over full rows and columns
-      // tile_type tile{{0,0},{row_tile_size, col_tile_size}};
-      TILE_TYPE tile {
-        {orig_tile.m_begin[IDX_SEQ]...},
-        {STORAGE::s_dim_elem(IDX_SEQ)...},
-      };
+    auto constexpr tile_begin = TTYPE::begin_type::value_at(DIM0);
 
+    using NextBegin =
+        camp::integral_constant<typename TTYPE::index_type,
+                                tile_begin + STORAGE::s_dim_elem(DIM0)>;
+    using TailSize =
+        camp::integral_constant<typename TTYPE::index_type,
+                                (orig_begin + orig_size) - tile_begin>;
 
-      // Promote the tile type to a "full-tile" so that the full-element
-      // register operations are used.
-      // Any of the tiling loops can demote this to a partial-tile when
-      // they do postamble execution
-      auto &full_tile = make_tensor_tile_full(tile);
+    using NextTile =
+        typename expt::SetStaticTensorTileBegin<TTYPE, NextBegin,
+                                                (size_t)DIM0>::Type;
 
-      // Do all of the tiling loops in layout order, this may improve
-      // cache performance
-      using layout_order = typename STORAGE::layout_type::seq_t;
-      using tensor_tile_exec_t =
-             TensorTileExec<STORAGE, layout_order>;
+    using TailTile = typename expt::SetStaticTensorTileSize<TTYPE, TailSize,
+                                                            (size_t)DIM0>::Type;
+    using PartTile = typename TailTile::Partial;
 
 
-      tensor_tile_exec_t::exec(orig_tile, full_tile, body);
+    static_assert((tile_begin + STORAGE::s_dim_elem(DIM0)) <=
+                      (orig_begin + orig_size + STORAGE::s_dim_elem(DIM0)),
+                  "OOB StaticTensorTileExec DOWN");
 
+    if ((tile_begin + STORAGE::s_dim_elem(DIM0)) <= (orig_begin + orig_size))
+    {
+      DownExec::static_exec(otile, tile, body);
+      NextTile next_tile;
+      NextExec::exec(otile, next_tile, body);
     }
+    else if (tile_begin < (orig_begin + orig_size))
+    {
+      PartTile part_tile;
+      DownExec::static_exec(otile, part_tile, body);
+    }
+  }
+};
+
+template<typename STORAGE,
+         camp::idx_t DIM0,
+         camp::idx_t IDX,
+         camp::idx_t... IDX_REST>
+struct StaticTensorTileExec<STORAGE,
+                            camp::idx_seq<DIM0>,
+                            camp::idx_seq<IDX, IDX_REST...>>
+{
+  using NextExec = StaticTensorTileExec<STORAGE,
+                                        camp::idx_seq<DIM0>,
+                                        camp::idx_seq<IDX_REST...>>;
 
+  template<typename OTILE, typename TTYPE, typename BODY>
+  RAJA_HOST_DEVICE RAJA_INLINE static void exec(OTILE const& otile,
+                                                TTYPE const& tile,
+                                                BODY&& body)
+  {
+    auto constexpr orig_begin = OTILE::begin_type::value_at(DIM0);
+    auto constexpr orig_size  = OTILE::size_type::value_at(DIM0);
 
-    template<typename STORAGE, typename DIM_SEQ, typename IDX_SEQ>
-    struct StaticTensorTileExec;
-
-    /**
-     * Implement a dimension tiling loop
-     */
-
-    template<typename STORAGE, camp::idx_t DIM0, camp::idx_t ... DIM_REST, camp::idx_t IDX, camp::idx_t ... IDX_REST>
-    struct StaticTensorTileExec<STORAGE, camp::idx_seq<DIM0, DIM_REST...>,camp::idx_seq<IDX,IDX_REST...>>{
-
-          using DimList  = camp::idx_seq<DIM0, DIM_REST...>;
-          using DimTail  = camp::idx_seq<      DIM_REST...>;
-          using IdxList  = camp::idx_seq<IDX , IDX_REST...>;
-          using IdxTail  = camp::idx_seq<      IDX_REST...>;
-
-          using DownExec = TensorTileExec<STORAGE,camp::idx_seq<DIM_REST...>>;
-          using NextExec = StaticTensorTileExec<STORAGE,camp::idx_seq<DIM0,DIM_REST...>,camp::idx_seq<IDX_REST...>>;
-
-          static auto const step_size = STORAGE::s_dim_elem(DIM0);
-
-          template<
-              typename OTILE,
-              typename TTYPE,
-              typename BODY
-          >
-          RAJA_HOST_DEVICE
-          RAJA_INLINE
-          static
-          void
-          exec(
-              OTILE const &otile,
-              TTYPE const &tile,
-              BODY && body
-          ){
-    
-            auto constexpr orig_begin = OTILE::begin_type::value_at(DIM0);
-            auto constexpr orig_size =  OTILE:: size_type::value_at(DIM0);
-    
-            auto constexpr tile_begin = TTYPE::begin_type::value_at(DIM0);
-
-            using NextBegin = camp::integral_constant<typename TTYPE::index_type,tile_begin+STORAGE::s_dim_elem(DIM0)>;
-            using TailSize  = camp::integral_constant<typename TTYPE::index_type,(orig_begin+orig_size)-tile_begin>;
-
-            using NextTile  = typename expt::SetStaticTensorTileBegin<TTYPE,NextBegin,(size_t)DIM0>::Type;
-
-            using TailTile  = typename expt::SetStaticTensorTileSize <TTYPE,TailSize ,(size_t)DIM0>::Type;
-            using PartTile  = typename TailTile::Partial;
-
-    
-            static_assert( (tile_begin + STORAGE::s_dim_elem(DIM0) ) <= (orig_begin + orig_size+ STORAGE::s_dim_elem(DIM0) ), "OOB StaticTensorTileExec DOWN" );
-     
-            if( (tile_begin + STORAGE::s_dim_elem(DIM0) ) <= (orig_begin + orig_size) ){
-               DownExec::static_exec(otile, tile, body);
-               NextTile next_tile;
-               NextExec::exec(otile, next_tile, body);
-            } else if ( tile_begin < (orig_begin + orig_size ) ) {
-               PartTile part_tile;
-               DownExec::static_exec(otile,part_tile,body);
-            }
-    
-          }
-
-
-
-    };
-
-
-
-    template<typename STORAGE, camp::idx_t DIM0, camp::idx_t IDX, camp::idx_t ... IDX_REST>
-    struct StaticTensorTileExec<STORAGE, camp::idx_seq<DIM0>, camp::idx_seq<IDX,IDX_REST...>>{
-      using NextExec = StaticTensorTileExec<STORAGE,camp::idx_seq<DIM0>,camp::idx_seq<IDX_REST...>>;
-
-
-      template<typename OTILE, typename TTYPE, typename BODY>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static void exec(OTILE const & otile, TTYPE const &tile, BODY && body) {
-            auto constexpr orig_begin = OTILE::begin_type::value_at(DIM0);
-            auto constexpr orig_size =  OTILE:: size_type::value_at(DIM0);
-    
-            auto constexpr tile_begin = TTYPE::begin_type::value_at(DIM0);
+    auto constexpr tile_begin = TTYPE::begin_type::value_at(DIM0);
 
-            using NextBegin = camp::integral_constant<typename TTYPE::index_type,tile_begin+STORAGE::s_dim_elem(DIM0)>;
-            using TailSize  = camp::integral_constant<typename TTYPE::index_type,(orig_begin+orig_size)-tile_begin>;
+    using NextBegin =
+        camp::integral_constant<typename TTYPE::index_type,
+                                tile_begin + STORAGE::s_dim_elem(DIM0)>;
+    using TailSize =
+        camp::integral_constant<typename TTYPE::index_type,
+                                (orig_begin + orig_size) - tile_begin>;
 
-            using NextTile  = typename expt::SetStaticTensorTileBegin<TTYPE,NextBegin,(size_t)DIM0>::Type;
+    using NextTile =
+        typename expt::SetStaticTensorTileBegin<TTYPE, NextBegin,
+                                                (size_t)DIM0>::Type;
 
-            using TailTile  = typename expt::SetStaticTensorTileSize <TTYPE,TailSize ,(size_t)DIM0>::Type;
-            using PartTile  = typename TailTile::Partial;
+    using TailTile = typename expt::SetStaticTensorTileSize<TTYPE, TailSize,
+                                                            (size_t)DIM0>::Type;
+    using PartTile = typename TailTile::Partial;
 
-    
-            static_assert( (tile_begin + STORAGE::s_dim_elem(DIM0) ) <= (orig_begin + orig_size+ STORAGE::s_dim_elem(DIM0) ), "OOB StaticTensorTileExec ACROSS" );
-     
-            if( (tile_begin + STORAGE::s_dim_elem(DIM0) ) <= (orig_begin + orig_size) ){
-               body(tile);
-               NextTile next_tile;
-               NextExec::exec(otile, next_tile, body);
-            } else if ( tile_begin < (orig_begin + orig_size ) ) {
-               PartTile part_tile;
-               body(part_tile);
-            }
-      }
 
-    };
-
-    template<typename STORAGE, camp::idx_t ... DIM_REST>
-    struct StaticTensorTileExec<STORAGE, camp::idx_seq<DIM_REST...>, camp::idx_seq<> >{
-
-      template<typename OTILE, typename TTYPE, typename BODY>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static void exec(OTILE const &, TTYPE const &, BODY &&) {}
-
-    };
+    static_assert((tile_begin + STORAGE::s_dim_elem(DIM0)) <=
+                      (orig_begin + orig_size + STORAGE::s_dim_elem(DIM0)),
+                  "OOB StaticTensorTileExec ACROSS");
 
-
-
-    template<typename STORAGE, typename INDEX_TYPE, TensorTileSize TENSOR_SIZE, typename TBEGIN, typename TSIZE, typename BODY, camp::idx_t ... IDX_SEQ, camp::idx_t ... DIM_SEQ>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    void tensorTileExec_expanded( StaticTensorTile<INDEX_TYPE,TENSOR_SIZE, TBEGIN, TSIZE> const &orig_tile, BODY && body, camp::idx_seq<IDX_SEQ...> const &, camp::idx_seq<DIM_SEQ...> const &)
+    if ((tile_begin + STORAGE::s_dim_elem(DIM0)) <= (orig_begin + orig_size))
     {
+      body(tile);
+      NextTile next_tile;
+      NextExec::exec(otile, next_tile, body);
+    }
+    else if (tile_begin < (orig_begin + orig_size))
+    {
+      PartTile part_tile;
+      body(part_tile);
+    }
+  }
+};
 
-      using InputType = StaticTensorTile<
-          INDEX_TYPE,
-          TENSOR_SIZE,
-          TBEGIN,
-          TSIZE
-      >;
-
-      using InputBegin = typename InputType::begin_type;
+template<typename STORAGE, camp::idx_t... DIM_REST>
+struct StaticTensorTileExec<STORAGE,
+                            camp::idx_seq<DIM_REST...>,
+                            camp::idx_seq<>>
+{
 
-      using Type = StaticTensorTile<
-          INDEX_TYPE,
-          TENSOR_FULL,
-          camp::int_seq<INDEX_TYPE,InputBegin::value_at(IDX_SEQ)...>,
-          camp::int_seq<INDEX_TYPE,STORAGE::s_dim_elem(IDX_SEQ)...>
-      >;
+  template<typename OTILE, typename TTYPE, typename BODY>
+  RAJA_HOST_DEVICE RAJA_INLINE static void exec(OTILE const&,
+                                                TTYPE const&,
+                                                BODY&&)
+  {}
+};
+
+template<typename STORAGE,
+         typename INDEX_TYPE,
+         TensorTileSize TENSOR_SIZE,
+         typename TBEGIN,
+         typename TSIZE,
+         typename BODY,
+         camp::idx_t... IDX_SEQ,
+         camp::idx_t... DIM_SEQ>
+RAJA_INLINE RAJA_HOST_DEVICE void tensorTileExec_expanded(
+    StaticTensorTile<INDEX_TYPE, TENSOR_SIZE, TBEGIN, TSIZE> const& orig_tile,
+    BODY&& body,
+    camp::idx_seq<IDX_SEQ...> const&,
+    camp::idx_seq<DIM_SEQ...> const&)
+{
 
-      Type full_tile;
+  using InputType = StaticTensorTile<INDEX_TYPE, TENSOR_SIZE, TBEGIN, TSIZE>;
 
-      // Do all of the tiling loops in layout order, this may improve
-      // cache performance
-      using layout_order = typename STORAGE::layout_type::seq_t;
-      using tensor_tile_exec_t =
-             TensorTileExec<STORAGE, layout_order>;
+  using InputBegin = typename InputType::begin_type;
 
+  using Type = StaticTensorTile<
+      INDEX_TYPE, TENSOR_FULL,
+      camp::int_seq<INDEX_TYPE, InputBegin::value_at(IDX_SEQ)...>,
+      camp::int_seq<INDEX_TYPE, STORAGE::s_dim_elem(IDX_SEQ)...>>;
 
-      tensor_tile_exec_t::static_exec(orig_tile, full_tile, body);
+  Type full_tile;
 
-    }
+  // Do all of the tiling loops in layout order, this may improve
+  // cache performance
+  using layout_order       = typename STORAGE::layout_type::seq_t;
+  using tensor_tile_exec_t = TensorTileExec<STORAGE, layout_order>;
 
 
+  tensor_tile_exec_t::static_exec(orig_tile, full_tile, body);
+}
 
-    template<typename STORAGE, typename TILE_TYPE, typename BODY>
-    RAJA_INLINE
-    RAJA_HOST_DEVICE
-    void tensorTileExec(TILE_TYPE const &tile, BODY && body)
-    {
-      using layout_type = typename STORAGE::layout_type;
-      tensorTileExec_expanded<STORAGE>(tile, body, camp::make_idx_seq_t<STORAGE::s_num_dims>{}, layout_type{});
-    }
+template<typename STORAGE, typename TILE_TYPE, typename BODY>
+RAJA_INLINE RAJA_HOST_DEVICE void tensorTileExec(TILE_TYPE const& tile,
+                                                 BODY&& body)
+{
+  using layout_type = typename STORAGE::layout_type;
+  tensorTileExec_expanded<STORAGE>(
+      tile, body, camp::make_idx_seq_t<STORAGE::s_num_dims> {}, layout_type {});
+}
 
-  } // namespace internal
-} // namespace expt
+}  // namespace expt
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/pattern/tensor/internal/VectorRegisterImpl.hpp b/include/RAJA/pattern/tensor/internal/VectorRegisterImpl.hpp
index 4ef4998fbe..b75457ad45 100644
--- a/include/RAJA/pattern/tensor/internal/VectorRegisterImpl.hpp
+++ b/include/RAJA/pattern/tensor/internal/VectorRegisterImpl.hpp
@@ -27,967 +27,1084 @@
 #include "RAJA/pattern/tensor/stats.hpp"
 #include "RAJA/util/BitMask.hpp"
 
-
 namespace RAJA
 {
 
 namespace expt
 {
 
-  /*!
-   * This provides a Tensor specialization for vectors
-   */
-  template<typename REGISTER_POLICY, typename T, camp::idx_t SIZE>
-  class TensorRegister<REGISTER_POLICY, T, RAJA::expt::VectorLayout, camp::idx_seq<SIZE>> :
-    public internal::expt::TensorRegisterBase<RAJA::expt::TensorRegister<REGISTER_POLICY, T, RAJA::expt::VectorLayout, camp::idx_seq<SIZE>>>
+/*!
+ * This provides a Tensor specialization for vectors
+ */
+template<typename REGISTER_POLICY, typename T, camp::idx_t SIZE>
+class TensorRegister<REGISTER_POLICY,
+                     T,
+                     RAJA::expt::VectorLayout,
+                     camp::idx_seq<SIZE>>
+    : public internal::expt::TensorRegisterBase<
+          RAJA::expt::TensorRegister<REGISTER_POLICY,
+                                     T,
+                                     RAJA::expt::VectorLayout,
+                                     camp::idx_seq<SIZE>>>
+{
+public:
+  using self_type = TensorRegister<REGISTER_POLICY,
+                                   T,
+                                   RAJA::expt::VectorLayout,
+                                   camp::idx_seq<SIZE>>;
+  using base_type = internal::expt::TensorRegisterBase<
+      RAJA::expt::TensorRegister<REGISTER_POLICY,
+                                 T,
+                                 RAJA::expt::VectorLayout,
+                                 camp::idx_seq<SIZE>>>;
+  using element_type  = camp::decay<T>;
+  using layout_type   = TensorLayout<0>;
+  using register_type = Register<T, REGISTER_POLICY>;
+
+  static constexpr camp::idx_t s_num_elem = SIZE;
+
+  using int_element_type =
+      typename register_type::int_vector_type::element_type;
+  using int_vector_type = TensorRegister<REGISTER_POLICY,
+                                         int_element_type,
+                                         RAJA::expt::VectorLayout,
+                                         camp::idx_seq<SIZE>>;
+
+private:
+  static constexpr camp::idx_t s_register_num_elem = register_type::s_num_elem;
+
+  static constexpr camp::idx_t s_num_full_registers =
+      s_num_elem / s_register_num_elem;
+
+  static constexpr camp::idx_t s_num_partial_lanes =
+      s_num_elem % s_register_num_elem;
+
+  static constexpr camp::idx_t s_num_registers = (s_num_partial_lanes > 0)
+                                                     ? s_num_full_registers + 1
+                                                     : s_num_full_registers;
+
+  using log_base2_t = RAJA::LogBase2<s_register_num_elem>;
+
+  static constexpr camp::idx_t s_shift_per_register = log_base2_t::value;
+
+  static constexpr camp::idx_t s_mask_per_register =
+      (1 << log_base2_t::value) - 1;
+
+  // Offset of last register in m_registers
+  static constexpr camp::idx_t s_final_register = s_num_partial_lanes == 0
+                                                      ? s_num_full_registers - 1
+                                                      : s_num_full_registers;
+
+  template<typename IDX>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr static auto to_register(IDX i) -> IDX
   {
-    public:
-      using self_type = TensorRegister<REGISTER_POLICY, T, RAJA::expt::VectorLayout, camp::idx_seq<SIZE>>;
-      using base_type = internal::expt::TensorRegisterBase<RAJA::expt::TensorRegister<REGISTER_POLICY, T, RAJA::expt::VectorLayout, camp::idx_seq<SIZE>>>;
-      using element_type = camp::decay<T>;
-      using layout_type = TensorLayout<0>;
-      using register_type = Register<T, REGISTER_POLICY>;
+    return i >> IDX(s_shift_per_register);
+  }
 
-      static constexpr camp::idx_t s_num_elem = SIZE;
+  template<typename IDX>
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr static auto to_lane(IDX i) -> IDX
+  {
+    return i & IDX(s_mask_per_register);
+  }
 
-      using int_element_type = typename register_type::int_vector_type::element_type;
-      using int_vector_type = TensorRegister<REGISTER_POLICY, int_element_type, RAJA::expt::VectorLayout, camp::idx_seq<SIZE>>;
+  using base_type::m_registers;
 
-    private:
+public:
+  RAJA_HOST_DEVICE
 
-      static constexpr camp::idx_t s_register_num_elem = register_type::s_num_elem;
+  RAJA_INLINE
+  constexpr TensorRegister() {}
 
-      static constexpr camp::idx_t s_num_full_registers = s_num_elem/s_register_num_elem;
+  RAJA_HOST_DEVICE
 
-      static constexpr camp::idx_t s_num_partial_lanes =  s_num_elem%s_register_num_elem;
+  RAJA_INLINE
+  TensorRegister(element_type c) { this->broadcast(c); }
 
-      static constexpr camp::idx_t s_num_registers =
-          (s_num_partial_lanes > 0) ?
-              s_num_full_registers + 1 :
-              s_num_full_registers;
+  RAJA_INLINE
 
-      using log_base2_t = RAJA::LogBase2<s_register_num_elem>;
+  RAJA_HOST_DEVICE
+  TensorRegister(self_type const& c) : base_type(c) {}
 
-      static constexpr camp::idx_t s_shift_per_register =
-          log_base2_t::value;
+  /*
+   * Overload for:    assignment of ET to a RAJA::expt::TensorRegister
+   */
+  template<typename RHS,
+           typename std::enable_if<
+               std::is_base_of<
+                   RAJA::internal::expt::ET::TensorExpressionConcreteBase,
+                   RHS>::value,
+               bool>::type = true>
+  RAJA_INLINE RAJA_HOST_DEVICE TensorRegister(RHS const& rhs)
+  {
+    // evaluate a single tile of the ET, storing in this
+    // RAJA::expt::TensorRegister
+    *this = rhs.eval(base_type::s_get_default_tile());
+  }
 
-      static constexpr camp::idx_t s_mask_per_register =
-          (1<<log_base2_t::value)-1;
+  template<typename... REGS>
+  explicit RAJA_HOST_DEVICE RAJA_INLINE TensorRegister(register_type reg0,
+                                                       REGS const&... regs)
+      : base_type(reg0, regs...)
+  {}
 
-      // Offset of last regiser in m_registers
-      static constexpr camp::idx_t s_final_register =
-          s_num_partial_lanes == 0 ?
-              s_num_full_registers-1 : s_num_full_registers;
+  RAJA_HOST_DEVICE
 
-      template<typename IDX>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      static
-      auto to_register(IDX i) -> IDX {
-        return i >> IDX(s_shift_per_register);
-      }
+  RAJA_INLINE
+  static constexpr bool is_root() { return register_type::is_root(); }
 
-      template<typename IDX>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      constexpr
-      static
-      auto to_lane(IDX i) -> IDX {
-        return i & IDX(s_mask_per_register);
-      }
+  /*!
+   * Returns true if the underlying data is packed for a given tensor ref
+   *
+   * This is true if either:
+   *   It's column major and the rows are stride one
+   *   It's row major and the columns are stride one
+   */
+  template<camp::idx_t STRIDE_ONE_DIM>
+  RAJA_HOST_DEVICE RAJA_INLINE static constexpr bool is_ref_packed()
+  {
+    return STRIDE_ONE_DIM == 0;
+  }
 
+  /*!
+   * Gets the maximum size of the vector along the specified dimension
+   */
+  RAJA_HOST_DEVICE
 
-      using base_type::m_registers;
+  RAJA_INLINE
+  static constexpr camp::idx_t s_dim_elem(camp::idx_t dim)
+  {
+    return dim == 0 ? s_num_elem : 0;
+  }
 
-    public:
+  /*!
+   * @brief Set entire vector to a single scalar value
+   * @param value Value to set all vector elements to
+   */
+  RAJA_HOST_DEVICE
 
+  RAJA_INLINE
+  self_type& operator=(element_type value)
+  {
+    this->broadcast(value);
+    return *this;
+  }
 
+  RAJA_HOST_DEVICE
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      TensorRegister(){}
+  RAJA_INLINE
+  self_type& operator=(self_type const& c) { return this->copy(c); }
 
+  /*!
+   * Provide left vector-matrix multiply for operator* between
+   * this vector and a matrix
+   */
+  template<typename T2, typename L, typename RP>
+  self_type operator*(SquareMatrixRegister<T2, L, RP> const& y) const
+  {
+    return y.left_vector_multiply(*this);
+  }
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      TensorRegister(element_type c)
-      {
-        this->broadcast(c);
-      }
 
+  template<typename REF_TYPE>
+  struct RefBridge;
 
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      TensorRegister(self_type const &c) :
-        base_type(c)
-      {
-      }
+  template<typename REF_TYPE>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type& load_ref(REF_TYPE const& ref)
+  {
+    RefBridge<REF_TYPE>::load_ref(*this, ref);
+    return *this;
+  }
 
-      /*
-       * Overload for:    assignment of ET to a RAJA::expt::TensorRegister
-       */
-      template<typename RHS,
-        typename std::enable_if<std::is_base_of<RAJA::internal::expt::ET::TensorExpressionConcreteBase, RHS>::value, bool>::type = true>
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      TensorRegister(RHS const &rhs)
-      {
-        // evaluate a single tile of the ET, storing in this RAJA::expt::TensorRegister
-        *this = rhs.eval(base_type::s_get_default_tile());
-      }
+  template<typename REF_TYPE>
+  RAJA_HOST_DEVICE RAJA_INLINE self_type const& store_ref(REF_TYPE& ref) const
+  {
+    RefBridge<REF_TYPE>::store_ref(*this, ref);
+    return *this;
+  }
+
+  template<typename POINTER_TYPE,
+           typename INDEX_TYPE,
+           RAJA::internal::expt::TensorTileSize TENSOR_SIZE,
+           camp::idx_t STRIDE_ONE_DIM>
+  struct RefBridge<
+      RAJA::internal::expt::
+          TensorRef<POINTER_TYPE, INDEX_TYPE, TENSOR_SIZE, 1, STRIDE_ONE_DIM>>
+  {
 
+    using RefType = RAJA::internal::expt::
+        TensorRef<POINTER_TYPE, INDEX_TYPE, TENSOR_SIZE, 1, STRIDE_ONE_DIM>;
 
-      template<typename ... REGS>
-      explicit
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      TensorRegister(register_type reg0, REGS const &... regs) :
-        base_type(reg0, regs...)
-      {
-      }
+    /*!
+     * @brief Performs load specified by TensorRef object.
+     */
+    RAJA_HOST_DEVICE
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      constexpr
-      bool is_root() {
-        return register_type::is_root();
-      }
+    RAJA_INLINE
+    static void load_ref(self_type& self, RefType const& ref)
+    {
 
+      auto ptr = ref.m_pointer + ref.m_tile.m_begin[0] * ref.m_stride[0];
 
-      /*!
-       * Returns true if the underlying data packed for a given tensor ref
-       *
-       * This is true if either:
-       *   It's column major and the rows are stride one
-       *   It's row major and the columns are stride one
-       */
-      template<camp::idx_t STRIDE_ONE_DIM>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      constexpr
-      bool is_ref_packed() {
-        return STRIDE_ONE_DIM == 0;
+      // check for packed data
+      if (STRIDE_ONE_DIM == 0)
+      {
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
+        {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_load_packed++;
+#endif
+          self.load_packed(ptr);
+        }
+        // partial
+        else
+        {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_load_packed_n++;
+#endif
+          self.load_packed_n(ptr, ref.m_tile.m_size[0]);
+        }
+      }
+      // strided data
+      else
+      {
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
+        {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_load_strided++;
+#endif
+          self.load_strided(ptr, ref.m_stride[0]);
+        }
+        // partial
+        else
+        {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_load_strided_n++;
+#endif
+          self.load_strided_n(ptr, ref.m_stride[0], ref.m_tile.m_size[0]);
+        }
       }
+    }
 
+    /*!
+     * @brief Performs store specified by TensorRef object.
+     */
+    RAJA_HOST_DEVICE
 
-      /*!
-       * Gets the maximum size of matrix along specified dimension
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      static
-      constexpr camp::idx_t s_dim_elem(camp::idx_t dim){
-        return dim == 0 ? s_num_elem : 0;
-      }
+    RAJA_INLINE
+    static void store_ref(self_type const& self, RefType& ref)
+    {
 
+      auto ptr = ref.m_pointer + ref.m_tile.m_begin[0] * ref.m_stride[0];
 
-      /*!
-       * @brief Set entire vector to a single scalar value
-       * @param value Value to set all vector elements to
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator=(element_type value)
+      // check for packed data
+      if (STRIDE_ONE_DIM == 0)
       {
-        this->broadcast(value);
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        return this->copy(c);
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
+        {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_store_packed++;
+#endif
+          self.store_packed(ptr);
+        }
+        // partial
+        else
+        {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_store_packed_n++;
+#endif
+          self.store_packed_n(ptr, ref.m_tile.m_size[0]);
+        }
       }
-
-      /*!
-       * Provide left vector-matrix multiply for operator* between
-       * this vector and a matrix
-       */
-      template<typename T2, typename L, typename RP>
-      self_type
-      operator*(SquareMatrixRegister<T2, L, RP> const &y) const
+      // strided data
+      else
       {
-        return y.left_vector_multiply(*this);
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
+        {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_store_strided++;
+#endif
+          self.store_strided(ptr, ref.m_stride[0]);
+        }
+        // partial
+        else
+        {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_store_strided_n++;
+#endif
+          self.store_strided_n(ptr, ref.m_stride[0], ref.m_tile.m_size[0]);
+        }
       }
+    }
+  };
 
+  template<typename POINTER_TYPE,
+           typename INDEX_TYPE,
+           RAJA::internal::expt::TensorTileSize TENSOR_SIZE,
+           INDEX_TYPE STRIDE_VALUE,
+           INDEX_TYPE BEGIN_VALUE,
+           INDEX_TYPE SIZE_VALUE,
+           camp::idx_t STRIDE_ONE_DIM>
+  struct RefBridge<RAJA::internal::expt::StaticTensorRef<
+      POINTER_TYPE,
+      INDEX_TYPE,
+      TENSOR_SIZE,
+      camp::int_seq<INDEX_TYPE, STRIDE_VALUE>,
+      camp::int_seq<INDEX_TYPE, BEGIN_VALUE>,
+      camp::int_seq<INDEX_TYPE, SIZE_VALUE>,
+      STRIDE_ONE_DIM>>
+  {
 
-      template<typename REF_TYPE>
-      struct RefBridge;
-
-
-      template<typename REF_TYPE>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type& load_ref (REF_TYPE const &ref){
-          RefBridge<REF_TYPE>::load_ref(*this,ref);
-          return *this;
-      }
-
-      template<typename REF_TYPE>
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_ref (REF_TYPE &ref) const {
-          RefBridge<REF_TYPE>::store_ref(*this,ref);
-          return *this;
-      }
+    using RefType = RAJA::internal::expt::StaticTensorRef<
+        POINTER_TYPE,
+        INDEX_TYPE,
+        TENSOR_SIZE,
+        camp::int_seq<INDEX_TYPE, STRIDE_VALUE>,
+        camp::int_seq<INDEX_TYPE, BEGIN_VALUE>,
+        camp::int_seq<INDEX_TYPE, SIZE_VALUE>,
+        STRIDE_ONE_DIM>;
 
+    /*!
+     * @brief Performs load specified by StaticTensorRef object.
+     */
+    RAJA_HOST_DEVICE
 
-      
-      template<typename POINTER_TYPE, typename INDEX_TYPE, RAJA::internal::expt::TensorTileSize TENSOR_SIZE, camp::idx_t STRIDE_ONE_DIM>
-      struct RefBridge <RAJA::internal::expt::TensorRef<POINTER_TYPE, INDEX_TYPE, TENSOR_SIZE, 1, STRIDE_ONE_DIM>>
-      {
+    RAJA_INLINE
+    static void load_ref(self_type& self, RefType const& ref)
+    {
 
-          using RefType = RAJA::internal::expt::TensorRef<POINTER_TYPE, INDEX_TYPE, TENSOR_SIZE, 1, STRIDE_ONE_DIM>;
-
-          /*!
-           * @brief Performs load specified by TensorRef object.
-           */
-          RAJA_HOST_DEVICE
-          RAJA_INLINE
-          static void load_ref (self_type& self, RefType const &ref){
-    
-            auto ptr = ref.m_pointer + ref.m_tile.m_begin[0]*ref.m_stride[0];
-    
-            // check for packed data
-            if(STRIDE_ONE_DIM == 0){
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-              #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_load_packed ++;
-              #endif
-                self.load_packed(ptr);
-              }
-              // partial
-              else{
-              #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_load_packed_n ++;
-              #endif
-                self.load_packed_n(ptr, ref.m_tile.m_size[0]);
-              }
-    
-            }
-            // strided data
-            else
-            {
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-              #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_load_strided ++;
-              #endif
-                self.load_strided(ptr, ref.m_stride[0]);
-              }
-              // partial
-              else{
-              #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_load_strided_n ++;
-              #endif
-                self.load_strided_n(ptr, ref.m_stride[0], ref.m_tile.m_size[0]);
-              }
-            }
-          }
-
-
-
-          /*!
-           * @brief Performs load specified by TensorRef object.
-           */
-          RAJA_HOST_DEVICE
-          RAJA_INLINE
-          static void store_ref(self_type const &self, RefType &ref) {
-    
-            auto ptr = ref.m_pointer + ref.m_tile.m_begin[0]*ref.m_stride[0];
-    
-            // check for packed data
-            if(STRIDE_ONE_DIM == 0){
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-    #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_store_packed ++;
-    #endif
-                self.store_packed(ptr);
-              }
-              // partial
-              else{
-    #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_store_packed_n ++;
-    #endif
-                self.store_packed_n(ptr, ref.m_tile.m_size[0]);
-              }
-    
-            }
-            // strided data
-            else
-            {
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-    #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_store_strided ++;
-    #endif
-                self.store_strided(ptr, ref.m_stride[0]);
-              }
-              // partial
-              else{
-    #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_store_strided_n ++;
-    #endif
-                self.store_strided_n(ptr, ref.m_stride[0], ref.m_tile.m_size[0]);
-              }
-            }
-          }
-           
-
-      };
-
-
-
-
-
-      
-      template<typename POINTER_TYPE, typename INDEX_TYPE, RAJA::internal::expt::TensorTileSize TENSOR_SIZE, INDEX_TYPE STRIDE_VALUE, INDEX_TYPE BEGIN_VALUE, INDEX_TYPE SIZE_VALUE, camp::idx_t STRIDE_ONE_DIM>
-      struct RefBridge <RAJA::internal::expt::StaticTensorRef<POINTER_TYPE, INDEX_TYPE, TENSOR_SIZE, camp::int_seq<INDEX_TYPE,STRIDE_VALUE>, camp::int_seq<INDEX_TYPE,BEGIN_VALUE>, camp::int_seq<INDEX_TYPE,SIZE_VALUE>, STRIDE_ONE_DIM>> 
-      {
+      auto ptr = ref.m_pointer + ref.m_tile.m_begin[0] * ref.m_stride[0];
 
-          using RefType = RAJA::internal::expt::StaticTensorRef<POINTER_TYPE, INDEX_TYPE, TENSOR_SIZE, camp::int_seq<INDEX_TYPE,STRIDE_VALUE>, camp::int_seq<INDEX_TYPE,BEGIN_VALUE>, camp::int_seq<INDEX_TYPE,SIZE_VALUE>, STRIDE_ONE_DIM>;
-
-          /*!
-           * @brief Performs load specified by StaticTensorRef object.
-           */
-          RAJA_HOST_DEVICE
-          RAJA_INLINE
-          static void load_ref (self_type &self, RefType const &ref){
-    
-            auto ptr = ref.m_pointer + ref.m_tile.m_begin[0]*ref.m_stride[0];
-    
-            // check for packed data
-            if(STRIDE_ONE_DIM == 0){
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-              #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_load_packed ++;
-              #endif
-                self.load_packed(ptr);
-              }
-              // partial
-              else{
-              #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_load_packed_n ++;
-              #endif
-                self.load_packed_n(ptr, ref.m_tile.m_size[0]);
-              }
-    
-            }
-            // strided data
-            else
-            {
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-              #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_load_strided ++;
-              #endif
-                self.load_strided(ptr, ref.m_stride[0]);
-              }
-              // partial
-              else{
-              #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_load_strided_n ++;
-              #endif
-                self.load_strided_n(ptr, ref.m_stride[0], ref.m_tile.m_size[0]);
-              }
-            }
-          }
-
-
-
-          /*!
-           * @brief Performs load specified by StaticTensorRef object.
-           */
-          RAJA_HOST_DEVICE
-          RAJA_INLINE
-          static void store_ref(self_type const &self, RefType &ref) {
-    
-            auto ptr = ref.m_pointer + ref.m_tile.m_begin[0]*ref.m_stride[0];
-    
-            // check for packed data
-            if(STRIDE_ONE_DIM == 0){
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-    #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_store_packed ++;
-    #endif
-                self.store_packed(ptr);
-              }
-              // partial
-              else{
-    #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_store_packed_n ++;
-    #endif
-                self.store_packed_n(ptr, ref.m_tile.m_size[0]);
-              }
-    
-            }
-            // strided data
-            else
-            {
-              // full vector?
-              if(TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL){
-    #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_store_strided ++;
-    #endif
-                self.store_strided(ptr, ref.m_stride[0]);
-              }
-              // partial
-              else{
-    #ifdef RAJA_ENABLE_VECTOR_STATS
-              RAJA::tensor_stats::num_vector_store_strided_n ++;
-    #endif
-                self.store_strided_n(ptr, ref.m_stride[0], ref.m_tile.m_size[0]);
-              }
-            }
-          }
-           
-
-      };
-     
-
-
-
-      /*!
-       * Loads a dense full vector from memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr)
+      // check for packed data
+      if (STRIDE_ONE_DIM == 0)
       {
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          m_registers[reg].load_packed(ptr+reg*s_register_num_elem);
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
+        {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_load_packed++;
+#endif
+          self.load_packed(ptr);
         }
-        if(s_num_partial_lanes){
-          m_registers[s_final_register].load_packed_n(ptr+s_final_register*s_register_num_elem, s_num_partial_lanes);
+        // partial
+        else
+        {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_load_packed_n++;
+#endif
+          self.load_packed_n(ptr, ref.m_tile.m_size[0]);
         }
-        return *this;
       }
-
-      /*!
-       * Loads a strided full vector from memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, int stride)
+      // strided data
+      else
       {
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          m_registers[reg].load_strided(ptr+reg*s_register_num_elem*stride, stride);
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
+        {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_load_strided++;
+#endif
+          self.load_strided(ptr, ref.m_stride[0]);
         }
-        if(s_num_partial_lanes){
-          m_registers[s_final_register].load_strided_n(ptr+s_final_register*s_register_num_elem*stride, stride, s_num_partial_lanes);
+        // partial
+        else
+        {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_load_strided_n++;
+#endif
+          self.load_strided_n(ptr, ref.m_stride[0], ref.m_tile.m_size[0]);
         }
-        return *this;
       }
+    }
 
-      /*!
-       * Loads a dense partial vector from memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, int N)
-      {
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          if(N >= reg*s_register_num_elem + s_register_num_elem){
-            m_registers[reg].load_packed(ptr+reg*s_register_num_elem);
-          }
-          else{
-            m_registers[reg].load_packed_n(ptr+reg*s_register_num_elem,
-                                           N-reg*s_register_num_elem);
-
-            for(camp::idx_t r = reg+1;r < s_num_full_registers;++ r){
-              m_registers[r].broadcast(0);
-            }
-            return *this;
-          }
+    /*!
+     * @brief Performs store specified by StaticTensorRef object.
+     */
+    RAJA_HOST_DEVICE
+
+    RAJA_INLINE
+    static void store_ref(self_type const& self, RefType& ref)
+    {
 
+      auto ptr = ref.m_pointer + ref.m_tile.m_begin[0] * ref.m_stride[0];
+
+      // check for packed data
+      if (STRIDE_ONE_DIM == 0)
+      {
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
+        {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_store_packed++;
+#endif
+          self.store_packed(ptr);
         }
-        if(s_num_partial_lanes){
-          m_registers[s_final_register].load_packed_n(
-              ptr+s_final_register*s_register_num_elem,
-              N-s_final_register*s_register_num_elem);
+        // partial
+        else
+        {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_store_packed_n++;
+#endif
+          self.store_packed_n(ptr, ref.m_tile.m_size[0]);
         }
-        return *this;
       }
-
-      /*!
-       * Loads a strided partial vector from memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr,
-          int stride, int N)
+      // strided data
+      else
       {
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          if(N >= reg*s_register_num_elem + s_register_num_elem){
-            m_registers[reg].load_strided(ptr+reg*s_register_num_elem*stride, stride);
-          }
-          else{
-            m_registers[reg].load_strided_n(ptr+reg*s_register_num_elem*stride,
-                                            stride,
-                                            N-reg*s_register_num_elem);
-            for(camp::idx_t r = reg+1;r < s_num_full_registers;++ r){
-              m_registers[r].broadcast(0);
-            }
-            return *this;
-          }
-
+        // full vector?
+        if (TENSOR_SIZE == RAJA::internal::expt::TENSOR_FULL)
+        {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_store_strided++;
+#endif
+          self.store_strided(ptr, ref.m_stride[0]);
         }
-        if(s_num_partial_lanes){
-          m_registers[s_final_register].load_strided_n(
-              ptr+s_final_register*s_register_num_elem*stride,
-              stride,
-              N-s_final_register*s_register_num_elem);
+        // partial
+        else
+        {
+#ifdef RAJA_ENABLE_VECTOR_STATS
+          RAJA::tensor_stats::num_vector_store_strided_n++;
+#endif
+          self.store_strided_n(ptr, ref.m_stride[0], ref.m_tile.m_size[0]);
         }
-        return *this;
       }
+    }
+  };
 
+  /*!
+   * Loads a dense full vector from memory
+   */
+  RAJA_HOST_DEVICE
 
-      /*!
-       * @brief Generic gather operation for full vector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type &gather(element_type const *ptr, int_vector_type offsets){
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          m_registers[reg].gather(ptr, offsets.vec(reg));
-        }
-        if(s_num_partial_lanes){
-          m_registers[s_final_register].gather_n(ptr, offsets.vec(s_final_register), s_num_partial_lanes);
-        }
-        return *this;
-      }
+  RAJA_INLINE
+  self_type& load_packed(element_type const* ptr)
+  {
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
+    {  // register reg reads starting at ptr + reg * s_register_num_elem
+      m_registers[reg].load_packed(ptr + reg * s_register_num_elem);
+    }
+    if (s_num_partial_lanes)
+    {  // final register: partial load of s_num_partial_lanes lanes
+      m_registers[s_final_register].load_packed_n(
+          ptr + s_final_register * s_register_num_elem, s_num_partial_lanes);
+    }
+    return *this;
+  }
 
-      /*!
-       * @brief Generic gather operation for n-length subvector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      self_type &gather_n(element_type const *ptr, int_vector_type offsets, camp::idx_t N){
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          if(N >= reg*s_register_num_elem + s_register_num_elem){
-            m_registers[reg].gather(ptr, offsets.vec(reg));
-          }
-          else{
-            m_registers[reg].gather_n(ptr, offsets.vec(reg), N-reg*s_register_num_elem);
-            for(camp::idx_t r = reg+1;r < s_num_full_registers;++ r){
-              m_registers[r].broadcast(0);
-            }
-            return *this;
-          }
+  /*!
+   * Loads a strided full vector from memory
+   */
+  RAJA_HOST_DEVICE
 
-        }
-        if(s_num_partial_lanes){
-          m_registers[s_final_register].gather_n(
-              ptr,
-              offsets.vec(s_final_register),
-              N-s_final_register*s_register_num_elem);
-        }
-        return *this;
-      }
+  RAJA_INLINE
+  self_type& load_strided(element_type const* ptr, int stride)
+  {
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
+    {  // each register's start offset is scaled by stride
+      m_registers[reg].load_strided(ptr + reg * s_register_num_elem * stride,
+                                    stride);
+    }
+    if (s_num_partial_lanes)
+    {  // final register: strided partial load of s_num_partial_lanes lanes
+      m_registers[s_final_register].load_strided_n(
+          ptr + s_final_register * s_register_num_elem * stride, stride,
+          s_num_partial_lanes);
+    }
+    return *this;
+  }
 
+  /*!
+   * Loads a dense partial vector from memory
+   */
+  RAJA_HOST_DEVICE
 
-      /*!
-       * Loads a dense full vector from memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const
+  RAJA_INLINE
+  self_type& load_packed_n(element_type const* ptr, int N)
+  {
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
+    {
+      if (N >= reg * s_register_num_elem + s_register_num_elem)
       {
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          m_registers[reg].store_packed(ptr+reg*s_register_num_elem);
-        }
-        if(s_num_partial_lanes){
-          m_registers[s_final_register].store_packed_n(ptr+s_final_register*s_register_num_elem, s_num_partial_lanes);
-        }
-        return *this;
+        m_registers[reg].load_packed(ptr + reg * s_register_num_elem);
       }
-
-      /*!
-       * Loads a strided full vector from memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, int stride) const
+      else
       {
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          m_registers[reg].store_strided(ptr+reg*s_register_num_elem*stride, stride);
-        }
-        if(s_num_partial_lanes){
-          m_registers[s_final_register].store_strided_n(ptr+s_final_register*s_register_num_elem*stride, stride, s_num_partial_lanes);
+        m_registers[reg].load_packed_n(ptr + reg * s_register_num_elem,
+                                       N - reg * s_register_num_elem);
+
+        for (camp::idx_t r = reg + 1; r < s_num_full_registers; ++r)
+        {
+          m_registers[r].broadcast(0);
         }
         return *this;
       }
+    }
+    if (s_num_partial_lanes)
+    {
+      m_registers[s_final_register].load_packed_n(
+          ptr + s_final_register * s_register_num_elem,
+          N - s_final_register * s_register_num_elem);
+    }
+    return *this;
+  }
 
-      /*!
-       * Loads a dense partial vector from memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, int N) const
-      {
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          if(N >= reg*s_register_num_elem + s_register_num_elem){
-            m_registers[reg].store_packed(ptr+reg*s_register_num_elem);
-          }
-          else{
-            m_registers[reg].store_packed_n(ptr+reg*s_register_num_elem,
-                                           N-reg*s_register_num_elem);
-            return *this;
-          }
+  /*!
+   * Loads a strided partial vector from memory
+   */
+  RAJA_HOST_DEVICE
 
-        }
-        if(s_num_partial_lanes){
-          m_registers[s_final_register].store_packed_n(
-              ptr+s_final_register*s_register_num_elem,
-              N-s_final_register*s_register_num_elem);
+  RAJA_INLINE
+  self_type& load_strided_n(element_type const* ptr, int stride, int N)
+  {
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
+    {
+      if (N >= reg * s_register_num_elem + s_register_num_elem)
+      {  // N covers all of register reg: full strided load
+        m_registers[reg].load_strided(ptr + reg * s_register_num_elem * stride,
+                                      stride);
+      }
+      else
+      {  // N ends inside register reg: partial load, then finish early
+        m_registers[reg].load_strided_n(ptr +
+                                            reg * s_register_num_elem * stride,
+                                        stride, N - reg * s_register_num_elem);
+        for (camp::idx_t r = reg + 1; r < s_num_full_registers; ++r)
+        {  // later full registers receive broadcast(0)
+          m_registers[r].broadcast(0);
         }
         return *this;
       }
+    }
+    if (s_num_partial_lanes)
+    {  // loop did not return early: load the remainder into the final register
+      m_registers[s_final_register].load_strided_n(
+          ptr + s_final_register * s_register_num_elem * stride, stride,
+          N - s_final_register * s_register_num_elem);
+    }
+    return *this;
+  }
 
-      /*!
-       * Loads a strided partial vector from memory
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type  *ptr,
-          int stride, int N) const
-      {
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          if(N >= reg*s_register_num_elem + s_register_num_elem){
-            m_registers[reg].store_strided(ptr+reg*s_register_num_elem*stride, stride);
-          }
-          else{
-            m_registers[reg].store_strided_n(ptr+reg*s_register_num_elem*stride,
-                                            stride,
-                                            N-reg*s_register_num_elem);
-            return *this;
-          }
+  /*!
+   * @brief Generic gather operation for full vector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
 
-        }
-        if(s_num_partial_lanes){
-          m_registers[s_final_register].store_strided_n(
-              ptr+s_final_register*s_register_num_elem*stride,
-              stride,
-              N-s_final_register*s_register_num_elem);
+  RAJA_HOST_DEVICE
+  self_type& gather(element_type const* ptr, int_vector_type offsets)
+  {
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
+    {  // register reg gathers using the matching offsets register
+      m_registers[reg].gather(ptr, offsets.vec(reg));
+    }
+    if (s_num_partial_lanes)
+    {  // final register: gather_n limited to s_num_partial_lanes lanes
+      m_registers[s_final_register].gather_n(ptr, offsets.vec(s_final_register),
+                                             s_num_partial_lanes);
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Generic gather operation for n-length subvector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   * When N ends inside a full register, later full registers get broadcast(0).
+   * Offsets are element-wise, not byte-wise.
+   * NOTE(review): unlike gather(), not marked RAJA_HOST_DEVICE; confirm.
+   */
+  RAJA_INLINE
+  self_type& gather_n(element_type const* ptr,
+                      int_vector_type offsets,
+                      camp::idx_t N)
+  {
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
+    {
+      if (N >= reg * s_register_num_elem + s_register_num_elem)
+      {  // N covers all of register reg: full gather
+        m_registers[reg].gather(ptr, offsets.vec(reg));
+      }
+      else
+      {  // N ends inside register reg: partial gather, then finish early
+        m_registers[reg].gather_n(ptr, offsets.vec(reg),
+                                  N - reg * s_register_num_elem);
+        for (camp::idx_t r = reg + 1; r < s_num_full_registers; ++r)
+        {
+          m_registers[r].broadcast(0);
         }
         return *this;
       }
+    }
+    if (s_num_partial_lanes)
+    {  // loop did not return early: gather the remainder in the final register
+      m_registers[s_final_register].gather_n(ptr, offsets.vec(s_final_register),
+                                             N - s_final_register *
+                                                     s_register_num_elem);
+    }
+    return *this;
+  }
+
+  /*!
+   * Stores a dense full vector to memory
+   */
+  RAJA_HOST_DEVICE
 
+  RAJA_INLINE
+  self_type const& store_packed(element_type* ptr) const
+  {
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
+    {  // register reg writes starting at ptr + reg * s_register_num_elem
+      m_registers[reg].store_packed(ptr + reg * s_register_num_elem);
+    }
+    if (s_num_partial_lanes)
+    {  // final register: partial store of s_num_partial_lanes lanes
+      m_registers[s_final_register].store_packed_n(
+          ptr + s_final_register * s_register_num_elem, s_num_partial_lanes);
+    }
+    return *this;
+  }
 
+  /*!
+   * Stores a strided full vector to memory
+   */
+  RAJA_HOST_DEVICE
 
-      /*!
-       * @brief Generic scatter operation for full vector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be stored relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &scatter(element_type *ptr, int_vector_type const &offsets) const {
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          m_registers[reg].scatter(ptr, offsets.vec(reg));
-        }
-        if(s_num_partial_lanes){
-          m_registers[s_final_register].scatter_n(ptr, offsets.vec(s_final_register), s_num_partial_lanes);
-        }
+  RAJA_INLINE
+  self_type const& store_strided(element_type* ptr, int stride) const
+  {
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
+    {  // each register's start offset is scaled by stride
+      m_registers[reg].store_strided(ptr + reg * s_register_num_elem * stride,
+                                     stride);
+    }
+    if (s_num_partial_lanes)
+    {  // final register: strided partial store of s_num_partial_lanes lanes
+      m_registers[s_final_register].store_strided_n(
+          ptr + s_final_register * s_register_num_elem * stride, stride,
+          s_num_partial_lanes);
+    }
+    return *this;
+  }
+
+  /*!
+   * Stores a dense partial vector (N elements) to memory
+   */
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type const& store_packed_n(element_type* ptr, int N) const
+  {
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
+    {
+      if (N >= reg * s_register_num_elem + s_register_num_elem)
+      {  // N covers all of register reg: full packed store
+        m_registers[reg].store_packed(ptr + reg * s_register_num_elem);
+      }
+      else
+      {  // N ends inside register reg: partial store, then finish early
+        m_registers[reg].store_packed_n(ptr + reg * s_register_num_elem,
+                                        N - reg * s_register_num_elem);
         return *this;
       }
+    }
+    if (s_num_partial_lanes)
+    {  // loop did not return early: store the remainder from the final register
+      m_registers[s_final_register].store_packed_n(
+          ptr + s_final_register * s_register_num_elem,
+          N - s_final_register * s_register_num_elem);
+    }
+    return *this;
+  }
 
-      /*!
-       * @brief Generic scatter operation for n-length subvector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be stored relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &scatter_n(element_type *ptr, int_vector_type const &offsets, camp::idx_t N) const {
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          if(N >= reg*s_register_num_elem + s_register_num_elem){
-            m_registers[reg].scatter(ptr, offsets.vec(reg));
-          }
-          else{
-            m_registers[reg].scatter_n(ptr, offsets.vec(reg), N-reg*s_register_num_elem);
-
-            return *this;
-          }
+  /*!
+   * Stores a strided partial vector to memory
+   */
+  RAJA_HOST_DEVICE
 
-        }
-        if(s_num_partial_lanes){
-          m_registers[s_final_register].scatter_n(
-              ptr,
-              offsets.vec(s_final_register),
-              N-s_num_full_registers*s_register_num_elem);
-        }
+  RAJA_INLINE
+  self_type const& store_strided_n(element_type* ptr, int stride, int N) const
+  {
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
+    {
+      if (N >= reg * s_register_num_elem + s_register_num_elem)
+      {  // N covers all of register reg: full strided store
+        m_registers[reg].store_strided(ptr + reg * s_register_num_elem * stride,
+                                       stride);
+      }
+      else
+      {  // N ends inside register reg: partial store, then finish early
+        m_registers[reg].store_strided_n(ptr +
+                                             reg * s_register_num_elem * stride,
+                                         stride, N - reg * s_register_num_elem);
         return *this;
       }
+    }
+    if (s_num_partial_lanes)
+    {  // loop did not return early: store the remainder from the final register
+      m_registers[s_final_register].store_strided_n(
+          ptr + s_final_register * s_register_num_elem * stride, stride,
+          N - s_final_register * s_register_num_elem);
+    }
+    return *this;
+  }
 
+  /*!
+   * @brief Generic scatter operation for full vector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be stored relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_HOST_DEVICE
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &den) const {
-        self_type result;
-        for(camp::idx_t reg = 0;reg < s_num_full_registers;++ reg){
-          result.vec(reg) = m_registers[reg].divide(den.vec(reg));
-        }
-        if(s_num_partial_lanes){
-          result.vec(s_final_register) = m_registers[s_final_register].divide_n(den.vec(s_final_register), s_num_partial_lanes);
-        }
-        return result;
-      }
+  RAJA_INLINE
+  self_type const& scatter(element_type* ptr,
+                           int_vector_type const& offsets) const
+  {
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
+    {  // register reg scatters using the matching offsets register
+      m_registers[reg].scatter(ptr, offsets.vec(reg));
+    }
+    if (s_num_partial_lanes)
+    {  // final register: scatter_n limited to s_num_partial_lanes lanes
+      m_registers[s_final_register].scatter_n(
+          ptr, offsets.vec(s_final_register), s_num_partial_lanes);
+    }
+    return *this;
+  }
 
-      /*!
-       * @brief Divide n elements of this vector by another vector
-       * @param x Vector to divide by
-       * @param n Number of elements to divide
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t n) const {
-        self_type q(*this);
-        for(camp::idx_t i = 0;i < n;++i){
-          q.set(this->get(i) / b.get(i), i);
-        }
-        return q;
+  /*!
+   * @brief Generic scatter operation for n-length subvector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be stored relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type const& scatter_n(element_type* ptr,
+                             int_vector_type const& offsets,
+                             camp::idx_t N) const
+  {
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
+    {
+      if (N >= reg * s_register_num_elem + s_register_num_elem)
+      {
+        m_registers[reg].scatter(ptr, offsets.vec(reg));
       }
+      else
+      {
+        m_registers[reg].scatter_n(ptr, offsets.vec(reg),
+                                   N - reg * s_register_num_elem);
 
-      /*!
-       * @brief Divide n elements of this vector by a scalar
-       * @param x Scalar to divide by
-       * @param n Number of elements to divide
-       * @return Value of (*this)+x
-       */
-      RAJA_SUPPRESS_HD_WARN
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(element_type const &b, camp::idx_t n) const {
-        self_type q(*this);
-        for(camp::idx_t i = 0;i < n;++i){
-          q.set(this->get(i) / b, i);
-        }
-        return q;
+        return *this;
       }
+    }
+    if (s_num_partial_lanes)
+    {
+      m_registers[s_final_register].scatter_n(
+          ptr, offsets.vec(s_final_register),
+          N - s_num_full_registers * s_register_num_elem);
+    }
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type divide(self_type const& den) const
+  {  // Divides this vector by den, one register at a time.
+    self_type result;
+    for (camp::idx_t reg = 0; reg < s_num_full_registers; ++reg)
+    {  // full registers: divide by the matching register of den
+      result.vec(reg) = m_registers[reg].divide(den.vec(reg));
+    }
+    if (s_num_partial_lanes)
+    {  // final register: divide_n limited to s_num_partial_lanes lanes
+      result.vec(s_final_register) = m_registers[s_final_register].divide_n(
+          den.vec(s_final_register), s_num_partial_lanes);
+    }
+    return result;
+  }
 
+  /*!
+   * @brief Divide n elements of this vector by another vector
+   * @param b Vector to divide by
+   * @param n Number of elements to divide
+   * @return Copy of *this with the first n elements divided by those of b
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
 
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      element_type min() const
-      {
-        // special case where there's just one parital register
-        if(s_num_full_registers == 0){
-          return m_registers[0].min_n(s_num_partial_lanes);
-        }
+  RAJA_INLINE
+  self_type divide_n(self_type const& b, camp::idx_t n) const
+  {
+    self_type q(*this);  // elements at index >= n keep this vector's values
+    for (camp::idx_t i = 0; i < n; ++i)
+    {  // scalar element-by-element quotient via get()/set()
+      q.set(this->get(i) / b.get(i), i);
+    }
+    return q;
+  }
 
-        element_type result = m_registers[0].min();
-        for(camp::idx_t i = 1;i < s_num_full_registers;++ i){
-          result = RAJA::min<element_type>(result, m_registers[i].min());
-        }
-        if(s_num_partial_lanes){
-          result = RAJA::min<element_type>(result, m_registers[s_final_register].min_n(s_num_partial_lanes));
-        }
-        return result;
-      }
+  /*!
+   * @brief Divide n elements of this vector by a scalar
+   * @param b Scalar to divide by
+   * @param n Number of elements to divide
+   * @return Copy of *this with the first n elements divided by b
+   */
+  RAJA_SUPPRESS_HD_WARN
+  RAJA_HOST_DEVICE
 
-      /*!
-       * @brief Returns the smallest element over the first N lanes
-       */
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      element_type min_n(int N) const
-      {
-        // special case where there's just one parital register
-        if(N < s_register_num_elem){
-          return m_registers[0].min_n(N);
-        }
+  RAJA_INLINE
+  self_type divide_n(element_type const& b, camp::idx_t n) const
+  {
+    self_type q(*this);  // elements at index >= n keep this vector's values
+    for (camp::idx_t i = 0; i < n; ++i)
+    {  // divide element i by the scalar b via get()/set()
+      q.set(this->get(i) / b, i);
+    }
+    return q;
+  }
 
-        element_type result = m_registers[0].min();
-        for(camp::idx_t reg = 1;reg < s_num_full_registers;++ reg){
-          if(N >= reg*s_register_num_elem + s_register_num_elem){
-            result = RAJA::min<element_type>(result, m_registers[reg].min());
-          }
-          else{
-            return RAJA::min<element_type>(result, m_registers[reg].min_n(N-reg*s_register_num_elem));
-          }
-        }
-        if(N-s_num_full_registers*s_register_num_elem > 0){
-          result = RAJA::min<element_type>(result, m_registers[s_final_register].min_n(N-s_final_register*s_register_num_elem));
-        }
-        return result;
-      }
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element across all registers
+   */
+  RAJA_INLINE
 
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      element_type max() const
-      {
-        // special case where there's just one parital register
-        if(s_num_full_registers == 0){
-          return m_registers[0].max_n(s_num_partial_lanes);
-        }
+  RAJA_HOST_DEVICE
+  element_type min() const
+  {
+    // special case where there's just one partial register
+    if (s_num_full_registers == 0)
+    {
+      return m_registers[0].min_n(s_num_partial_lanes);
+    }
+
+    element_type result = m_registers[0].min();
+    for (camp::idx_t i = 1; i < s_num_full_registers; ++i)
+    {  // fold in the minimum of each remaining full register
+      result = RAJA::min<element_type>(result, m_registers[i].min());
+    }
+    if (s_num_partial_lanes)
+    {  // fold in min_n over s_num_partial_lanes lanes of the final register
+      result = RAJA::min<element_type>(
+          result, m_registers[s_final_register].min_n(s_num_partial_lanes));
+    }
+    return result;
+  }
 
-        element_type result = m_registers[0].max();
-        for(camp::idx_t i = 1;i < s_num_full_registers;++ i){
-          result = RAJA::max<element_type>(result, m_registers[i].max());
-        }
-        if(s_num_partial_lanes){
-          result = RAJA::max<element_type>(result, m_registers[s_final_register].max_n(s_num_partial_lanes));
-        }
-        return result;
-      }
+  /*!
+   * @brief Returns the smallest element over the first N lanes
+   */
+  RAJA_INLINE
 
-      /*!
-       * @brief Returns the largest element over the first N lanes
-       */
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      element_type max_n(int N) const
+  RAJA_HOST_DEVICE
+  element_type min_n(int N) const
+  {
+    // special case where there's just one partial register
+    if (N < s_register_num_elem)
+    {
+      return m_registers[0].min_n(N);
+    }
+
+    element_type result = m_registers[0].min();
+    for (camp::idx_t reg = 1; reg < s_num_full_registers; ++reg)
+    {
+      if (N >= reg * s_register_num_elem + s_register_num_elem)
       {
-        // special case where there's just one parital register
-        if(N < s_register_num_elem){
-          return m_registers[0].max_n(N);
-        }
-
-        element_type result = m_registers[0].max();
-        for(camp::idx_t reg = 1;reg < s_num_full_registers;++ reg){
-          if(N >= reg*s_register_num_elem + s_register_num_elem){
-            result = RAJA::max<element_type>(result, m_registers[reg].max());
-          }
-          else{
-            return RAJA::max<element_type>(result, m_registers[reg].max_n(N-reg*s_register_num_elem));
-          }
-        }
-        if(N-s_num_full_registers*s_register_num_elem > 0){
-          result = RAJA::max<element_type>(result, m_registers[s_final_register].max_n(N-s_final_register*s_register_num_elem));
-        }
-        return result;
+        result = RAJA::min<element_type>(result, m_registers[reg].min());
       }
-
-      /*!
-       * @brief Returns the sum of all elements
-       */
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      element_type sum() const
+      else
       {
-        // first do a vector sum of all registers
-        register_type s = m_registers[0];
-        for(camp::idx_t i = 1;i < s_num_registers;++ i){
-          s += m_registers[i];
-        }
-        // then a horizontal sum of result
-        return s.sum();
+        return RAJA::min<element_type>(
+            result, m_registers[reg].min_n(N - reg * s_register_num_elem));
       }
+    }
+    if (N - s_num_full_registers * s_register_num_elem > 0)
+    {
+      result = RAJA::min<element_type>(
+          result, m_registers[s_final_register].min_n(
+                      N - s_final_register * s_register_num_elem));
+    }
+    return result;
+  }
 
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
 
-      /*!
-       * @brief The * operator of two vectors is a element-wise multiply
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type operator*(self_type const &x) const {
-        return this->multiply(x);
-      }
+  RAJA_HOST_DEVICE
+  element_type max() const
+  {
+    // special case where there's just one partial register
+    if (s_num_full_registers == 0)
+    {
+      return m_registers[0].max_n(s_num_partial_lanes);
+    }
+
+    element_type result = m_registers[0].max();
+    for (camp::idx_t i = 1; i < s_num_full_registers; ++i)
+    {
+      result = RAJA::max<element_type>(result, m_registers[i].max());
+    }
+    if (s_num_partial_lanes)
+    {
+      result = RAJA::max<element_type>(
+          result, m_registers[s_final_register].max_n(s_num_partial_lanes));
+    }
+    return result;
+  }
 
+  /*!
+   * @brief Returns the largest element over the first N lanes
+   */
+  RAJA_INLINE
 
-      /*!
-       * @brief The dot product of two vectors
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      element_type dot(self_type const &x) const {
-        element_type dp(0);
-        for(camp::idx_t i = 0;i < s_num_registers;++ i){
-          dp += m_registers[i].dot(x.vec(i));
-        }
-        return dp;
+  RAJA_HOST_DEVICE
+  element_type max_n(int N) const
+  {
+    // special case where there's just one partial register
+    if (N < s_register_num_elem)
+    {
+      return m_registers[0].max_n(N);
+    }
+
+    element_type result = m_registers[0].max();
+    for (camp::idx_t reg = 1; reg < s_num_full_registers; ++reg)
+    {
+      if (N >= reg * s_register_num_elem + s_register_num_elem)
+      {
+        result = RAJA::max<element_type>(result, m_registers[reg].max());
       }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &set(element_type val, int idx){
-        m_registers[to_register(idx)].set(val, to_lane(idx));
-        return *this;
+      else
+      {
+        return RAJA::max<element_type>(
+            result, m_registers[reg].max_n(N - reg * s_register_num_elem));
       }
+    }
+    if (N - s_num_full_registers * s_register_num_elem > 0)
+    {
+      result = RAJA::max<element_type>(
+          result, m_registers[s_final_register].max_n(
+                      N - s_final_register * s_register_num_elem));
+    }
+    return result;
+  }
 
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      element_type get(int idx) const {
-        return m_registers[to_register(idx)].get(to_lane(idx));
-      }
+  /*!
+   * @brief Returns the sum of all elements
+   */
+  RAJA_INLINE
 
+  RAJA_HOST_DEVICE
+  element_type sum() const
+  {
+    // first do a vector sum of all registers
+    register_type s = m_registers[0];
+    for (camp::idx_t i = 1; i < s_num_registers; ++i)
+    {
+      s += m_registers[i];
+    }
+    // then a horizontal sum of result
+    return s.sum();
+  }
 
+  /*!
+   * @brief The * operator of two vectors is an element-wise multiply
+   */
+  RAJA_HOST_DEVICE
 
-      /*!
-       * @brief Converts to vector to a string
-       *
-       *
-       */
-      RAJA_INLINE
-      std::string to_string() const {
-        std::string s = "Vector(" + std::to_string(s_num_elem) + ")[ ";
+  RAJA_INLINE
+  self_type operator*(self_type const& x) const { return this->multiply(x); }
 
-        //
-        for(camp::idx_t i = 0;i < s_num_elem; ++ i){
-          s += std::to_string(this->get(i)) + " ";
-        }
+  /*!
+   * @brief The dot product of two vectors
+   */
+  RAJA_HOST_DEVICE
 
-        camp::idx_t physical_size = s_num_registers * s_register_num_elem;
-        if(s_num_elem < physical_size){
-          s += "{";
-          for(camp::idx_t i = s_num_elem;i < physical_size; ++ i){
-            s += std::to_string(this->get(i)) + " ";
-          }
-          s += "}";
-        }
+  RAJA_INLINE
+  element_type dot(self_type const& x) const
+  {
+    element_type dp(0);
+    for (camp::idx_t i = 0; i < s_num_registers; ++i)
+    {
+      dp += m_registers[i].dot(x.vec(i));
+    }
+    return dp;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& set(element_type val, int idx)
+  {
+    m_registers[to_register(idx)].set(val, to_lane(idx));
+    return *this;
+  }
 
+  RAJA_HOST_DEVICE
 
-        s += " ]\n";
+  RAJA_INLINE
+  element_type get(int idx) const
+  {
+    return m_registers[to_register(idx)].get(to_lane(idx));
+  }
 
-        return s;
+  /*!
+   * @brief Converts the vector to a string
+   *
+   *
+   */
+  RAJA_INLINE
+  std::string to_string() const
+  {
+    std::string s = "Vector(" + std::to_string(s_num_elem) + ")[ ";
+
+    //
+    for (camp::idx_t i = 0; i < s_num_elem; ++i)
+    {
+      s += std::to_string(this->get(i)) + " ";
+    }
+
+    camp::idx_t physical_size = s_num_registers * s_register_num_elem;
+    if (s_num_elem < physical_size)
+    {
+      s += "{";
+      for (camp::idx_t i = s_num_elem; i < physical_size; ++i)
+      {
+        s += std::to_string(this->get(i)) + " ";
       }
+      s += "}";
+    }
 
 
-  };
+    s += " ]\n";
 
+    return s;
+  }
+};
 
-} // namespace expt
-}  // namespace RAJA
 
+}  // namespace expt
+}  // namespace RAJA
 
 // Bring in the register policy file so we get the default register type
 // and all of the register traits setup
diff --git a/include/RAJA/pattern/tensor/stats.hpp b/include/RAJA/pattern/tensor/stats.hpp
index 77b70faf00..643cd3ca22 100644
--- a/include/RAJA/pattern/tensor/stats.hpp
+++ b/include/RAJA/pattern/tensor/stats.hpp
@@ -33,7 +33,7 @@ namespace expt
 {
 struct tensor_stats
 {
-    static int indent;
+  static int indent;
 
   static camp::idx_t num_vector_copy;
   static camp::idx_t num_vector_copy_ctor;
@@ -77,10 +77,9 @@ struct tensor_stats
 
   static void resetVectorStats();
   static void printVectorStats();
-
 };
 
-} // namespace expt
-} // namespace RAJA
+}  // namespace expt
+}  // namespace RAJA
 
 #endif
diff --git a/include/RAJA/policy/MultiPolicy.hpp b/include/RAJA/policy/MultiPolicy.hpp
index defa08585a..d268f2962f 100644
--- a/include/RAJA/policy/MultiPolicy.hpp
+++ b/include/RAJA/policy/MultiPolicy.hpp
@@ -30,13 +30,12 @@
 #include "RAJA/util/concepts.hpp"
 #include "RAJA/util/resource.hpp"
 
-
 namespace RAJA
 {
 
 namespace detail
 {
-template <size_t index, size_t size, typename Policy, typename... rest>
+template<size_t index, size_t size, typename Policy, typename... rest>
 struct policy_invoker;
 }
 
@@ -50,22 +49,23 @@ namespace multi
 ///
 /// \tparam Selector Functor/Lambda/function type used to select policies
 /// \tparam Policies Variadic pack of policies, numbered from 0
-template <typename Selector, typename... Policies>
+template<typename Selector, typename... Policies>
 class MultiPolicy
 {
   Selector s;
 
 public:
   MultiPolicy() = delete;  // No default construction
-  MultiPolicy(Selector s) : s(s), _policies({Policies{}...}) {}
+
+  MultiPolicy(Selector s) : s(s), _policies({Policies {}...}) {}
+
   MultiPolicy(Selector s, Policies... policies) : s(s), _policies({policies...})
-  {
-  }
+  {}
 
-  MultiPolicy(const MultiPolicy &p) : s(p.s), _policies(p._policies) {}
+  MultiPolicy(const MultiPolicy& p) : s(p.s), _policies(p._policies) {}
 
-  template <typename Iterable, typename Body>
-  int invoke(Iterable &&i, Body &&b)
+  template<typename Iterable, typename Body>
+  int invoke(Iterable&& i, Body&& b)
   {
     size_t index = s(i);
     _policies.invoke(index, i, b);
@@ -82,25 +82,27 @@ class MultiPolicy
 /// \param p MultiPolicy to use for selection
 /// \param iter iterable of items to supply to body
 /// \param body functor, will receive each value produced by iterable iter
-template <typename Iterable,
-          typename Body,
-          typename Selector,
-          typename... Policies>
+template<typename Iterable,
+         typename Body,
+         typename Selector,
+         typename... Policies>
 RAJA_INLINE void forall_impl(MultiPolicy<Selector, Policies...> p,
-                             Iterable &&iter,
-                             Body &&body)
+                             Iterable&& iter,
+                             Body&& body)
 {
   p.invoke(iter, body);
 }
-template <typename Res,
-          typename Iterable,
-          typename Body,
-          typename Selector,
-          typename... Policies>
-RAJA_INLINE resources::EventProxy<Res> forall_impl(Res r,
-                                  MultiPolicy<Selector, Policies...> p,
-                                  Iterable &&iter,
-                                  Body &&body)
+
+template<typename Res,
+         typename Iterable,
+         typename Body,
+         typename Selector,
+         typename... Policies>
+RAJA_INLINE resources::EventProxy<Res> forall_impl(
+    Res r,
+    MultiPolicy<Selector, Policies...> p,
+    Iterable&& iter,
+    Body&& body)
 {
   p.invoke(iter, body);
   return resources::EventProxy<Res>(r);
@@ -114,7 +116,7 @@ using policy::multi::MultiPolicy;
 namespace detail
 {
 
-template <camp::idx_t... Indices, typename... Policies, typename Selector>
+template<camp::idx_t... Indices, typename... Policies, typename Selector>
 auto make_multi_policy(camp::idx_seq<Indices...>,
                        Selector s,
                        std::tuple<Policies...> policies)
@@ -132,11 +134,11 @@ auto make_multi_policy(camp::idx_seq<Indices...>,
 /// \param s functor called with the segment object passed to
 /// forall, must return an int in the set 0 to N-1 selecting the policy to use
 /// \return A MultiPolicy containing the given selector s
-template <typename... Policies, typename Selector>
+template<typename... Policies, typename Selector>
 RAJA_DEPRECATE("In the next RAJA Release, MultiPolicy will be deprecated.")
 auto make_multi_policy(Selector s) -> MultiPolicy<Selector, Policies...>
 {
-  return MultiPolicy<Selector, Policies...>(s, Policies{}...);
+  return MultiPolicy<Selector, Policies...>(s, Policies {}...);
 }
 
 /// make_multi_policy - Construct a MultiPolicy from the given selector and
@@ -148,32 +150,34 @@ auto make_multi_policy(Selector s) -> MultiPolicy<Selector, Policies...>
 /// \param s functor called with the segment object passed to
 /// forall, must return an int in the set 0 to N-1 selecting the policy to use
 /// \return A MultiPolicy containing the given selector s
-template <typename... Policies, typename Selector>
+template<typename... Policies, typename Selector>
 RAJA_DEPRECATE("In the next RAJA Release, MultiPolicy will be deprecated.")
 auto make_multi_policy(std::tuple<Policies...> policies, Selector s)
     -> MultiPolicy<Selector, Policies...>
 {
-  return detail::make_multi_policy(
-      camp::make_idx_seq_t<sizeof...(Policies)>{}, s, policies);
+  return detail::make_multi_policy(camp::make_idx_seq_t<sizeof...(Policies)> {},
+                                   s, policies);
 }
 
 namespace detail
 {
 
-template <size_t index, size_t size, typename Policy, typename... rest>
-struct policy_invoker : public policy_invoker<index - 1, size, rest...> {
+template<size_t index, size_t size, typename Policy, typename... rest>
+struct policy_invoker : public policy_invoker<index - 1, size, rest...>
+{
   static_assert(index < size, "index must be in the range of possibilities");
   Policy _p;
   using NextInvoker = policy_invoker<index - 1, size, rest...>;
 
   policy_invoker(Policy p, rest... args) : NextInvoker(args...), _p(p) {}
 
-  template <typename Iterable, typename LoopBody>
-  void invoke(int offset, Iterable &&iter, LoopBody &&loop_body)
+  template<typename Iterable, typename LoopBody>
+  void invoke(int offset, Iterable&& iter, LoopBody&& loop_body)
   {
-    if (offset == size - index - 1) {
+    if (offset == size - index - 1)
+    {
 
-      util::PluginContext context{util::make_context<Policy>()};
+      util::PluginContext context {util::make_context<Policy>()};
       util::callPreCapturePlugins(context);
 
       using RAJA::util::trigger_updates_before;
@@ -189,22 +193,29 @@ struct policy_invoker : public policy_invoker<index - 1, size, rest...> {
       forall_impl(r, _p, std::forward<Iterable>(iter), body);
 
       util::callPostLaunchPlugins(context);
-    } else {
-      NextInvoker::invoke(offset, std::forward<Iterable>(iter), std::forward<LoopBody>(loop_body));
+    }
+    else
+    {
+      NextInvoker::invoke(offset, std::forward<Iterable>(iter),
+                          std::forward<LoopBody>(loop_body));
     }
   }
 };
 
-template <size_t size, typename Policy, typename... rest>
-struct policy_invoker<0, size, Policy, rest...> {
+template<size_t size, typename Policy, typename... rest>
+struct policy_invoker<0, size, Policy, rest...>
+{
   Policy _p;
+
   policy_invoker(Policy p, rest...) : _p(p) {}
-  template <typename Iterable, typename LoopBody>
-  void invoke(int offset, Iterable &&iter, LoopBody &&loop_body)
+
+  template<typename Iterable, typename LoopBody>
+  void invoke(int offset, Iterable&& iter, LoopBody&& loop_body)
   {
-    if (offset == size - 1) {
+    if (offset == size - 1)
+    {
 
-      util::PluginContext context{util::make_context<Policy>()};
+      util::PluginContext context {util::make_context<Policy>()};
       util::callPreCapturePlugins(context);
 
       using RAJA::util::trigger_updates_before;
@@ -214,14 +225,16 @@ struct policy_invoker<0, size, Policy, rest...> {
 
       util::callPreLaunchPlugins(context);
 
-      //std::cout <<"policy_invoker: No index\n";
+      // std::cout <<"policy_invoker: No index\n";
       using policy::multi::forall_impl;
       RAJA_FORCEINLINE_RECURSIVE
       auto r = resources::get_resource<Policy>::type::get_default();
       forall_impl(r, _p, std::forward<Iterable>(iter), body);
 
       util::callPostLaunchPlugins(context);
-    } else {
+    }
+    else
+    {
       throw std::runtime_error("unknown offset invoked");
     }
   }
@@ -232,10 +245,11 @@ struct policy_invoker<0, size, Policy, rest...> {
 namespace type_traits
 {
 
-template <typename T>
+template<typename T>
 struct is_multi_policy
-    : ::RAJA::type_traits::SpecializationOf<RAJA::MultiPolicy, typename std::decay<T>::type> {
-};
+    : ::RAJA::type_traits::SpecializationOf<RAJA::MultiPolicy,
+                                            typename std::decay<T>::type>
+{};
 }  // namespace type_traits
 
 }  // end namespace RAJA
diff --git a/include/RAJA/policy/PolicyBase.hpp b/include/RAJA/policy/PolicyBase.hpp
index 898c92a621..5637aa6698 100644
--- a/include/RAJA/policy/PolicyBase.hpp
+++ b/include/RAJA/policy/PolicyBase.hpp
@@ -26,7 +26,8 @@
 namespace RAJA
 {
 
-enum class Policy {
+enum class Policy
+{
   undefined,
   sequential,
   simd,
@@ -37,7 +38,8 @@ enum class Policy {
   sycl
 };
 
-enum class Pattern {
+enum class Pattern
+{
   undefined,
   forall,
   region,
@@ -52,131 +54,137 @@ enum class Pattern {
   workgroup_dispatch
 };
 
-enum class Launch { undefined, sync, async };
-
-struct PolicyBase {
+enum class Launch
+{
+  undefined,
+  sync,
+  async
 };
 
-template <Policy Policy_,
-          Pattern Pattern_,
-          Launch Launch_,
-          Platform Platform_,
-          typename... Traits>
-struct PolicyBaseT : PolicyBase {
-  static constexpr Policy policy = Policy_;
-  static constexpr Pattern pattern = Pattern_;
-  static constexpr Launch launch = Launch_;
+struct PolicyBase
+{};
+
+template<Policy Policy_,
+         Pattern Pattern_,
+         Launch Launch_,
+         Platform Platform_,
+         typename... Traits>
+struct PolicyBaseT : PolicyBase
+{
+  static constexpr Policy policy     = Policy_;
+  static constexpr Pattern pattern   = Pattern_;
+  static constexpr Launch launch     = Launch_;
   static constexpr Platform platform = Platform_;
 };
 
-template <typename PolicyType>
-struct policy_of {
+template<typename PolicyType>
+struct policy_of
+{
   static constexpr Policy value = PolicyType::policy;
 };
 
-template <typename PolicyType>
-struct pattern_of {
+template<typename PolicyType>
+struct pattern_of
+{
   static constexpr Pattern value = PolicyType::pattern;
 };
 
-template <typename PolicyType>
-struct launch_of {
+template<typename PolicyType>
+struct launch_of
+{
   static constexpr Launch value = PolicyType::launch;
 };
 
-template <typename PolicyType>
-struct platform_of {
+template<typename PolicyType>
+struct platform_of
+{
   static constexpr Platform value = PolicyType::platform;
 };
 
-template <typename PolicyType, RAJA::Policy P_>
-struct policy_is : camp::num<policy_of<camp::decay<PolicyType>>::value == P_> {
-};
+template<typename PolicyType, RAJA::Policy P_>
+struct policy_is : camp::num<policy_of<camp::decay<PolicyType>>::value == P_>
+{};
 
-template <typename PolicyType, RAJA::Policy ... Ps_>
-struct policy_any_of : camp::num<camp::concepts::any_of<policy_is<PolicyType, Ps_>...>::value> {
-};
+template<typename PolicyType, RAJA::Policy... Ps_>
+struct policy_any_of
+    : camp::num<camp::concepts::any_of<policy_is<PolicyType, Ps_>...>::value>
+{};
 
-template <typename PolicyType, RAJA::Pattern P_>
-struct pattern_is
-    : camp::num<pattern_of<camp::decay<PolicyType>>::value == P_> {
-};
+template<typename PolicyType, RAJA::Pattern P_>
+struct pattern_is : camp::num<pattern_of<camp::decay<PolicyType>>::value == P_>
+{};
 
-template <typename PolicyType, RAJA::Launch L_>
-struct launch_is : camp::num<launch_of<camp::decay<PolicyType>>::value == L_> {
-};
+template<typename PolicyType, RAJA::Launch L_>
+struct launch_is : camp::num<launch_of<camp::decay<PolicyType>>::value == L_>
+{};
 
-template <typename PolicyType, RAJA::Platform P_>
+template<typename PolicyType, RAJA::Platform P_>
 struct platform_is
-    : camp::num<platform_of<camp::decay<PolicyType>>::value == P_> {
-};
+    : camp::num<platform_of<camp::decay<PolicyType>>::value == P_>
+{};
+
+template<typename PolicyType, typename Trait>
+struct policy_has_trait_impl : camp::num<false>
+{};
 
-template <typename PolicyType, typename Trait>
-struct policy_has_trait_impl
-    : camp::num<false> {
-};
 ///
-template <typename Trait, Policy Policy_,
-                          Pattern Pattern_,
-                          Launch Launch_,
-                          Platform Platform_,
-                          typename... Traits>
+template<typename Trait,
+         Policy Policy_,
+         Pattern Pattern_,
+         Launch Launch_,
+         Platform Platform_,
+         typename... Traits>
 struct policy_has_trait_impl<
-      PolicyBaseT<Policy_, Pattern_, Launch_, Platform_, Traits...>, Trait>
-    : camp::num<camp::concepts::any_of<std::is_same<Trait, Traits>...>::value> {
-};
+    PolicyBaseT<Policy_, Pattern_, Launch_, Platform_, Traits...>,
+    Trait>
+    : camp::num<camp::concepts::any_of<std::is_same<Trait, Traits>...>::value>
+{};
+
 ///
-template <typename PolicyType, typename Trait>
+template<typename PolicyType, typename Trait>
 using policy_has_trait = policy_has_trait_impl<camp::decay<PolicyType>, Trait>;
 
-
-template <typename Inner>
-struct wrapper {
+template<typename Inner>
+struct wrapper
+{
   using inner = Inner;
 };
 
 namespace reduce
 {
 
-struct ordered {
-};
+struct ordered
+{};
 
-struct unordered {
-};
+struct unordered
+{};
 
 }  // namespace reduce
 
-
-template <Policy Pol, Pattern Pat, typename... Args>
+template<Policy Pol, Pattern Pat, typename... Args>
 using make_policy_pattern_t =
     PolicyBaseT<Pol, Pat, Launch::undefined, Platform::undefined, Args...>;
 
-template <Policy Policy_,
-          Pattern Pattern_,
-          Launch Launch_,
-          Platform Platform_,
-          typename... Args>
+template<Policy Policy_,
+         Pattern Pattern_,
+         Launch Launch_,
+         Platform Platform_,
+         typename... Args>
 using make_policy_pattern_launch_platform_t =
     PolicyBaseT<Policy_, Pattern_, Launch_, Platform_, Args...>;
 
-template <Policy Policy_,
-          Pattern Pattern_,
-          Launch Launch_,
-          typename... Args>
+template<Policy Policy_, Pattern Pattern_, Launch Launch_, typename... Args>
 using make_policy_pattern_launch_t =
     PolicyBaseT<Policy_, Pattern_, Launch_, Platform::undefined, Args...>;
 
-template <Policy Policy_,
-          Pattern Pattern_,
-          Platform Platform_,
-          typename... Args>
+template<Policy Policy_, Pattern Pattern_, Platform Platform_, typename... Args>
 using make_policy_pattern_platform_t =
     PolicyBaseT<Policy_, Pattern_, Launch::undefined, Platform_, Args...>;
 
 namespace concepts
 {
 
-template <typename Pol>
+template<typename Pol>
 struct ExecutionPolicy
     : DefineConcept(::RAJA::concepts::has_type<::RAJA::Policy>(
                         camp::decay<decltype(Pol::policy)>()),
@@ -185,53 +193,59 @@ struct ExecutionPolicy
                     ::RAJA::concepts::has_type<::RAJA::Launch>(
                         camp::decay<decltype(Pol::launch)>()),
                     ::RAJA::concepts::has_type<::RAJA::Platform>(
-                        camp::decay<decltype(Pol::platform)>())) {
-};
+                        camp::decay<decltype(Pol::platform)>()))
+{};
 
 }  // end namespace concepts
 
 namespace type_traits
 {
 
-template <typename Pol>
-struct is_sequential_policy : RAJA::policy_is<Pol, RAJA::Policy::sequential> {
-};
-template <typename Pol>
-struct is_simd_policy : RAJA::policy_is<Pol, RAJA::Policy::simd> {
-};
-template <typename Pol>
-struct is_openmp_policy : RAJA::policy_is<Pol, RAJA::Policy::openmp> {
-};
-template <typename Pol>
+template<typename Pol>
+struct is_sequential_policy : RAJA::policy_is<Pol, RAJA::Policy::sequential>
+{};
+
+template<typename Pol>
+struct is_simd_policy : RAJA::policy_is<Pol, RAJA::Policy::simd>
+{};
+
+template<typename Pol>
+struct is_openmp_policy : RAJA::policy_is<Pol, RAJA::Policy::openmp>
+{};
+
+template<typename Pol>
 struct is_target_openmp_policy
-    : RAJA::policy_is<Pol, RAJA::Policy::target_openmp> {
-};
-template <typename Pol>
-struct is_cuda_policy : RAJA::policy_is<Pol, RAJA::Policy::cuda> {
-};
-template <typename Pol>
-struct is_hip_policy : RAJA::policy_is<Pol, RAJA::Policy::hip> {
-};
-template <typename Pol>
-struct is_sycl_policy : RAJA::policy_is<Pol, RAJA::Policy::sycl> {
-};
+    : RAJA::policy_is<Pol, RAJA::Policy::target_openmp>
+{};
+
+template<typename Pol>
+struct is_cuda_policy : RAJA::policy_is<Pol, RAJA::Policy::cuda>
+{};
+
+template<typename Pol>
+struct is_hip_policy : RAJA::policy_is<Pol, RAJA::Policy::hip>
+{};
 
-template <typename Pol>
+template<typename Pol>
+struct is_sycl_policy : RAJA::policy_is<Pol, RAJA::Policy::sycl>
+{};
+
+template<typename Pol>
 struct is_device_exec_policy
-    : RAJA::policy_any_of<Pol, RAJA::Policy::cuda, RAJA::Policy::hip> {
-};
+    : RAJA::policy_any_of<Pol, RAJA::Policy::cuda, RAJA::Policy::hip>
+{};
 
 DefineTypeTraitFromConcept(is_execution_policy,
                            RAJA::concepts::ExecutionPolicy);
 
+template<typename Pol>
+struct is_reduce_policy : RAJA::pattern_is<Pol, RAJA::Pattern::reduce>
+{};
 
-template <typename Pol>
-struct is_reduce_policy : RAJA::pattern_is<Pol, RAJA::Pattern::reduce> {
-};
-
-template <typename Pol>
-struct is_multi_reduce_policy : RAJA::pattern_is<Pol, RAJA::Pattern::multi_reduce> {
-};
+template<typename Pol>
+struct is_multi_reduce_policy
+    : RAJA::pattern_is<Pol, RAJA::Pattern::multi_reduce>
+{};
 
 }  // end namespace type_traits
 
diff --git a/include/RAJA/policy/WorkGroup.hpp b/include/RAJA/policy/WorkGroup.hpp
index cae78d2493..0b341dd01e 100644
--- a/include/RAJA/policy/WorkGroup.hpp
+++ b/include/RAJA/policy/WorkGroup.hpp
@@ -39,74 +39,80 @@ namespace workgroup
 /// Note this is intended for debugging, the WorkGroup abstraction is intended
 /// to allow running loops in an unordered fashion (loop fusion)
 struct ordered
-    : RAJA::make_policy_pattern_t<Policy::undefined,
-                                  Pattern::workgroup_order> {
-};
+    : RAJA::make_policy_pattern_t<Policy::undefined, Pattern::workgroup_order>
+{};
+
 /// execute the enqueued loops in the reverse order from the order that they
 /// were enqueued
 /// Note this is intended for debugging, the WorkGroup abstraction is intended
 /// to allow running loops in an unordered fashion (loop fusion)
 struct reverse_ordered
-    : RAJA::make_policy_pattern_t<Policy::undefined,
-                                  Pattern::workgroup_order> {
-};
+    : RAJA::make_policy_pattern_t<Policy::undefined, Pattern::workgroup_order>
+{};
 
 /// store an array of pointers to the enqueued objects. The enqueued objects
 /// are stored in separate allocations.
 struct array_of_pointers
-    : RAJA::make_policy_pattern_t<Policy::undefined,
-                                  Pattern::workgroup_storage> {
-};
+    : RAJA::make_policy_pattern_t<Policy::undefined, Pattern::workgroup_storage>
+{};
+
 /// store an array of pointers to the enqueued objects. The enqueued objects
 /// are stored in a single compact array.
 struct ragged_array_of_objects
-    : RAJA::make_policy_pattern_t<Policy::undefined,
-                                  Pattern::workgroup_storage> {
-};
+    : RAJA::make_policy_pattern_t<Policy::undefined, Pattern::workgroup_storage>
+{};
+
 /// store an array of the enqueued objects with padding such that the objects
 /// can be accessed using a constant stride from the beginning of the array.
 struct constant_stride_array_of_objects
-    : RAJA::make_policy_pattern_t<Policy::undefined,
-                                  Pattern::workgroup_storage> {
-};
+    : RAJA::make_policy_pattern_t<Policy::undefined, Pattern::workgroup_storage>
+{};
 
 /// Dispatch using function pointers to make indirect function calls
 struct indirect_function_call_dispatch
     : RAJA::make_policy_pattern_t<Policy::undefined,
-                                  Pattern::workgroup_dispatch> {
-};
+                                  Pattern::workgroup_dispatch>
+{};
+
 /// Dispatch using virtual functions to make indirect function calls
 struct indirect_virtual_function_dispatch
     : RAJA::make_policy_pattern_t<Policy::undefined,
-                                  Pattern::workgroup_dispatch> {
-};
+                                  Pattern::workgroup_dispatch>
+{};
+
 /// Dispatch using an implementation equivalent to a switch statement to select
 /// the type from RangeAndCallables and directly call the object.
 /// RangeAndCallables is a pack of types of the form camp::list<Range, Callable>
 /// where pairs of Range and Callable are the types of the range and callable
 /// objects that may be passed to WorkPool enqueue.
-template < typename ... RangeAndCallables >
+template<typename... RangeAndCallables>
 struct direct_dispatch
     : RAJA::make_policy_pattern_t<Policy::undefined,
-                                  Pattern::workgroup_dispatch> {
-};
-
-template < typename EXEC_POLICY_T,
-           typename ORDER_POLICY_T,
-           typename STORAGE_POLICY_T,
-           typename DISPATCH_POLICY_T = indirect_function_call_dispatch >
-struct WorkGroupPolicy
-    : public RAJA::make_policy_pattern_platform_t<
-                       policy_of<EXEC_POLICY_T>::value,
-                       Pattern::workgroup,
-                       platform_of<EXEC_POLICY_T>::value> {
-  static_assert(RAJA::pattern_is<EXEC_POLICY_T, RAJA::Pattern::workgroup_exec>::value,
+                                  Pattern::workgroup_dispatch>
+{};
+
+template<typename EXEC_POLICY_T,
+         typename ORDER_POLICY_T,
+         typename STORAGE_POLICY_T,
+         typename DISPATCH_POLICY_T = indirect_function_call_dispatch>
+struct WorkGroupPolicy : public RAJA::make_policy_pattern_platform_t<
+                             policy_of<EXEC_POLICY_T>::value,
+                             Pattern::workgroup,
+                             platform_of<EXEC_POLICY_T>::value>
+{
+  static_assert(
+      RAJA::pattern_is<EXEC_POLICY_T, RAJA::Pattern::workgroup_exec>::value,
       "WorkGroupPolicy: EXEC_POLICY_T must be a workgroup exec policy");
-  static_assert(RAJA::pattern_is<ORDER_POLICY_T, RAJA::Pattern::workgroup_order>::value,
+  static_assert(
+      RAJA::pattern_is<ORDER_POLICY_T, RAJA::Pattern::workgroup_order>::value,
       "WorkGroupPolicy: ORDER_POLICY_T must be a workgroup order policy");
-  static_assert(RAJA::pattern_is<STORAGE_POLICY_T, RAJA::Pattern::workgroup_storage>::value,
+  static_assert(
+      RAJA::pattern_is<STORAGE_POLICY_T,
+                       RAJA::Pattern::workgroup_storage>::value,
       "WorkGroupPolicy: STORAGE_POLICY_T must be a workgroup storage policy");
-  static_assert(RAJA::pattern_is<DISPATCH_POLICY_T, RAJA::Pattern::workgroup_dispatch>::value,
+  static_assert(
+      RAJA::pattern_is<DISPATCH_POLICY_T,
+                       RAJA::Pattern::workgroup_dispatch>::value,
       "WorkGroupPolicy: DISPATCH_POLICY_T must be a workgroup dispatch policy");
 };
 
@@ -117,12 +123,12 @@ using policy::workgroup::ordered;
 using policy::workgroup::reverse_ordered;
 
 using policy::workgroup::array_of_pointers;
-using policy::workgroup::ragged_array_of_objects;
 using policy::workgroup::constant_stride_array_of_objects;
+using policy::workgroup::ragged_array_of_objects;
 
+using policy::workgroup::direct_dispatch;
 using policy::workgroup::indirect_function_call_dispatch;
 using policy::workgroup::indirect_virtual_function_dispatch;
-using policy::workgroup::direct_dispatch;
 
 using policy::workgroup::WorkGroupPolicy;
 
diff --git a/include/RAJA/policy/atomic_auto.hpp b/include/RAJA/policy/atomic_auto.hpp
index e0ca557b32..45b4548c0d 100644
--- a/include/RAJA/policy/atomic_auto.hpp
+++ b/include/RAJA/policy/atomic_auto.hpp
@@ -23,7 +23,7 @@
 #include "RAJA/util/macros.hpp"
 
 #if !defined(RAJA_ENABLE_DESUL_ATOMICS)
-    #include "RAJA/policy/sequential/atomic.hpp"
+#include "RAJA/policy/sequential/atomic.hpp"
 #endif
 
 /*!
@@ -39,19 +39,19 @@
  * because we assume there is no thread safety issues (no parallel model)
  */
 #if defined(__CUDA_ARCH__) && defined(RAJA_CUDA_ACTIVE)
-#define RAJA_AUTO_ATOMIC \
+#define RAJA_AUTO_ATOMIC                                                       \
   RAJA::cuda_atomic {}
 #elif defined(__HIP_DEVICE_COMPILE__) && defined(RAJA_HIP_ACTIVE)
-#define RAJA_AUTO_ATOMIC \
+#define RAJA_AUTO_ATOMIC                                                       \
   RAJA::hip_atomic {}
 #elif defined(__SYCL_DEVICE_ONLY__)
-#define RAJA_AUTO_ATOMIC \
+#define RAJA_AUTO_ATOMIC                                                       \
   RAJA::sycl_atomic {}
 #elif defined(RAJA_ENABLE_OPENMP)
-#define RAJA_AUTO_ATOMIC \
+#define RAJA_AUTO_ATOMIC                                                       \
   RAJA::omp_atomic {}
 #else
-#define RAJA_AUTO_ATOMIC \
+#define RAJA_AUTO_ATOMIC                                                       \
   RAJA::seq_atomic {}
 #endif
 
@@ -60,102 +60,96 @@ namespace RAJA
 {
 
 //! Atomic policy that automatically does "the right thing"
-struct auto_atomic {
-};
+struct auto_atomic
+{};
 
-template <typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicLoad(auto_atomic, T *acc)
+template<typename T>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicLoad(auto_atomic, T* acc)
 {
   return atomicLoad(RAJA_AUTO_ATOMIC, acc);
 }
 
-template <typename T>
-RAJA_INLINE RAJA_HOST_DEVICE void atomicStore(auto_atomic, T *acc, T value)
+template<typename T>
+RAJA_INLINE RAJA_HOST_DEVICE void atomicStore(auto_atomic, T* acc, T value)
 {
   atomicStore(RAJA_AUTO_ATOMIC, acc, value);
 }
 
-template <typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(auto_atomic, T *acc, T value)
+template<typename T>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(auto_atomic, T* acc, T value)
 {
   return atomicAdd(RAJA_AUTO_ATOMIC, acc, value);
 }
 
-template <typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicSub(auto_atomic, T *acc, T value)
+template<typename T>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicSub(auto_atomic, T* acc, T value)
 {
   return atomicSub(RAJA_AUTO_ATOMIC, acc, value);
 }
 
-template <typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicMin(auto_atomic, T *acc, T value)
+template<typename T>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicMin(auto_atomic, T* acc, T value)
 {
   return atomicMin(RAJA_AUTO_ATOMIC, acc, value);
 }
 
-template <typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicMax(auto_atomic, T *acc, T value)
+template<typename T>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicMax(auto_atomic, T* acc, T value)
 {
   return atomicMax(RAJA_AUTO_ATOMIC, acc, value);
 }
 
-template <typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(auto_atomic, T *acc)
+template<typename T>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(auto_atomic, T* acc)
 {
   return atomicInc(RAJA_AUTO_ATOMIC, acc);
 }
 
-template <typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(auto_atomic,
-                                         T *acc,
-                                         T compare)
+template<typename T>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(auto_atomic, T* acc, T compare)
 {
   return atomicInc(RAJA_AUTO_ATOMIC, acc, compare);
 }
 
-template <typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(auto_atomic, T *acc)
+template<typename T>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(auto_atomic, T* acc)
 {
   return atomicDec(RAJA_AUTO_ATOMIC, acc);
 }
 
-template <typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(auto_atomic,
-                                         T *acc,
-                                         T compare)
+template<typename T>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(auto_atomic, T* acc, T compare)
 {
   return atomicDec(RAJA_AUTO_ATOMIC, acc, compare);
 }
 
-template <typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicAnd(auto_atomic, T *acc, T value)
+template<typename T>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicAnd(auto_atomic, T* acc, T value)
 {
   return atomicAnd(RAJA_AUTO_ATOMIC, acc, value);
 }
 
-template <typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicOr(auto_atomic, T *acc, T value)
+template<typename T>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicOr(auto_atomic, T* acc, T value)
 {
   return atomicOr(RAJA_AUTO_ATOMIC, acc, value);
 }
 
-template <typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicXor(auto_atomic, T *acc, T value)
+template<typename T>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicXor(auto_atomic, T* acc, T value)
 {
   return atomicXor(RAJA_AUTO_ATOMIC, acc, value);
 }
 
-template <typename T>
-RAJA_INLINE RAJA_HOST_DEVICE T atomicExchange(auto_atomic,
-                                              T *acc,
-                                              T value)
+template<typename T>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicExchange(auto_atomic, T* acc, T value)
 {
   return atomicExchange(RAJA_AUTO_ATOMIC, acc, value);
 }
 
-template <typename T>
+template<typename T>
 RAJA_INLINE RAJA_HOST_DEVICE T
-atomicCAS(auto_atomic, T *acc, T compare, T value)
+atomicCAS(auto_atomic, T* acc, T compare, T value)
 {
   return atomicCAS(RAJA_AUTO_ATOMIC, acc, compare, value);
 }
diff --git a/include/RAJA/policy/atomic_builtin.hpp b/include/RAJA/policy/atomic_builtin.hpp
index e43bd71386..ed2bdf0978 100644
--- a/include/RAJA/policy/atomic_builtin.hpp
+++ b/include/RAJA/policy/atomic_builtin.hpp
@@ -22,7 +22,8 @@
 
 #include <cstdint>
 
-#if defined(RAJA_COMPILER_MSVC) || ((defined(_WIN32) || defined(_WIN64)) && defined(__INTEL_COMPILER))
+#if defined(RAJA_COMPILER_MSVC) ||                                             \
+    ((defined(_WIN32) || defined(_WIN64)) && defined(__INTEL_COMPILER))
 #include <intrin.h>
 #endif
 
@@ -41,62 +42,60 @@ namespace RAJA
 
 
 //! Atomic policy that uses the compilers builtin __atomic_XXX routines
-struct builtin_atomic {
-};
-
+struct builtin_atomic
+{};
 
-namespace detail {
+namespace detail
+{
 
 
-#if defined(RAJA_COMPILER_MSVC) || ((defined(_WIN32) || defined(_WIN64)) && defined(__INTEL_COMPILER))
+#if defined(RAJA_COMPILER_MSVC) ||                                             \
+    ((defined(_WIN32) || defined(_WIN64)) && defined(__INTEL_COMPILER))
 
 
 /*!
  * Type trait for determining if the operator should be implemented
  * using an intrinsic
  */
-template <typename T>
-struct builtin_useIntrinsic {
+template<typename T>
+struct builtin_useIntrinsic
+{
   static constexpr bool value =
-    std::is_same<T, char>::value ||
-    std::is_same<T, short>::value ||
-    std::is_same<T, long>::value ||
-    std::is_same<T, long long>::value;
+      std::is_same<T, char>::value || std::is_same<T, short>::value ||
+      std::is_same<T, long>::value || std::is_same<T, long long>::value;
 };
 
-
 /*!
  * Type trait for determining if the operator should be implemented
  * by reinterpreting inputs to types that intrinsics support
  */
-template <typename T>
-struct builtin_useReinterpret {
+template<typename T>
+struct builtin_useReinterpret
+{
   static constexpr bool value =
-    !builtin_useIntrinsic<T>::value &&
-    (sizeof(T) == 1 ||
-     sizeof(T) == 2 ||
-     sizeof(T) == 4 ||
-     sizeof(T) == 8);
-
-  using type =
-    std::conditional_t<sizeof(T) == 1, char,
-    std::conditional_t<sizeof(T) == 2, short,
-    std::conditional_t<sizeof(T) == 4, long, long long>>>;
+      !builtin_useIntrinsic<T>::value &&
+      (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8);
+
+  using type = std::conditional_t<
+      sizeof(T) == 1,
+      char,
+      std::conditional_t<sizeof(T) == 2,
+                         short,
+                         std::conditional_t<sizeof(T) == 4, long, long long>>>;
 };
 
-
 /*!
  * Type trait for determining if the operator should be implemented
  * using a compare and swap loop
  */
-template <typename T>
-struct builtin_useCAS {
+template<typename T>
+struct builtin_useCAS
+{
   static constexpr bool value =
-    !builtin_useIntrinsic<T>::value &&
-    (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8);
+      !builtin_useIntrinsic<T>::value &&
+      (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8);
 };
 
-
 /*!
  * Atomics implemented using intrinsics
  */
@@ -105,24 +104,24 @@ struct builtin_useCAS {
 /*!
  * Atomic or using intrinsics
  */
-RAJA_INLINE char builtin_atomicOr(char *acc, char value)
+RAJA_INLINE char builtin_atomicOr(char* acc, char value)
 {
   return _InterlockedOr8(acc, value);
 }
 
-RAJA_INLINE short builtin_atomicOr(short *acc, short value)
+RAJA_INLINE short builtin_atomicOr(short* acc, short value)
 {
   return _InterlockedOr16(acc, value);
 }
 
-RAJA_INLINE long builtin_atomicOr(long *acc, long value)
+RAJA_INLINE long builtin_atomicOr(long* acc, long value)
 {
   return _InterlockedOr(acc, value);
 }
 
 #if defined(_WIN64)
 
-RAJA_INLINE long long builtin_atomicOr(long long *acc, long long value)
+RAJA_INLINE long long builtin_atomicOr(long long* acc, long long value)
 {
   return _InterlockedOr64(acc, value);
 }
@@ -132,35 +131,34 @@ RAJA_INLINE long long builtin_atomicOr(long long *acc, long long value)
 /*!
  * Atomic load using atomic or
  */
-template <typename T,
-          std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
-RAJA_INLINE T builtin_atomicLoad(T *acc)
+template<typename T,
+         std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
+RAJA_INLINE T builtin_atomicLoad(T* acc)
 {
   return builtin_atomicOr(acc, static_cast<T>(0));
 }
 
-
 /*!
  * Atomic exchange using intrinsics
  */
-RAJA_INLINE char builtin_atomicExchange(char *acc, char value)
+RAJA_INLINE char builtin_atomicExchange(char* acc, char value)
 {
   return _InterlockedExchange8(acc, value);
 }
 
-RAJA_INLINE short builtin_atomicExchange(short *acc, short value)
+RAJA_INLINE short builtin_atomicExchange(short* acc, short value)
 {
   return _InterlockedExchange16(acc, value);
 }
 
-RAJA_INLINE long builtin_atomicExchange(long *acc, long value)
+RAJA_INLINE long builtin_atomicExchange(long* acc, long value)
 {
   return _InterlockedExchange(acc, value);
 }
 
 #if defined(_WIN64)
 
-RAJA_INLINE long long builtin_atomicExchange(long long *acc, long long value)
+RAJA_INLINE long long builtin_atomicExchange(long long* acc, long long value)
 {
   return _InterlockedExchange64(acc, value);
 }
@@ -171,35 +169,36 @@ RAJA_INLINE long long builtin_atomicExchange(long long *acc, long long value)
 /*!
  * Atomic store using atomic exchange
  */
-template <typename T,
-          std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
-RAJA_INLINE void builtin_atomicStore(T *acc, T value)
+template<typename T,
+         std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
+RAJA_INLINE void builtin_atomicStore(T* acc, T value)
 {
   builtin_atomicExchange(acc, value);
 }
 
-
 /*!
  * Atomic compare and swap using intrinsics
  */
-RAJA_INLINE char builtin_atomicCAS(char *acc, char compare, char value)
+RAJA_INLINE char builtin_atomicCAS(char* acc, char compare, char value)
 {
   return _InterlockedCompareExchange8(acc, value, compare);
 }
 
-RAJA_INLINE short builtin_atomicCAS(short *acc, short compare, short value)
+RAJA_INLINE short builtin_atomicCAS(short* acc, short compare, short value)
 {
   return _InterlockedCompareExchange16(acc, value, compare);
 }
 
-RAJA_INLINE long builtin_atomicCAS(long *acc, long compare, long value)
+RAJA_INLINE long builtin_atomicCAS(long* acc, long compare, long value)
 {
   return _InterlockedCompareExchange(acc, value, compare);
 }
 
 #if defined(_WIN64)
 
-RAJA_INLINE long long builtin_atomicCAS(long long *acc, long long compare, long long value)
+RAJA_INLINE long long builtin_atomicCAS(long long* acc,
+                                        long long compare,
+                                        long long value)
 {
   return _InterlockedCompareExchange64(acc, value, compare);
 }
@@ -210,24 +209,24 @@ RAJA_INLINE long long builtin_atomicCAS(long long *acc, long long compare, long
 /*!
  * Atomic addition using intrinsics
  */
-RAJA_INLINE char builtin_atomicAdd(char *acc, char value)
+RAJA_INLINE char builtin_atomicAdd(char* acc, char value)
 {
   return _InterlockedExchangeAdd8(acc, value);
 }
 
-RAJA_INLINE short builtin_atomicAdd(short *acc, short value)
+RAJA_INLINE short builtin_atomicAdd(short* acc, short value)
 {
   return _InterlockedExchangeAdd16(acc, value);
 }
 
-RAJA_INLINE long builtin_atomicAdd(long *acc, long value)
+RAJA_INLINE long builtin_atomicAdd(long* acc, long value)
 {
   return _InterlockedExchangeAdd(acc, value);
 }
 
 #if defined(_WIN64)
 
-RAJA_INLINE long long builtin_atomicAdd(long long *acc, long long value)
+RAJA_INLINE long long builtin_atomicAdd(long long* acc, long long value)
 {
   return _InterlockedExchangeAdd64(acc, value);
 }
@@ -238,24 +237,24 @@ RAJA_INLINE long long builtin_atomicAdd(long long *acc, long long value)
 /*!
  * Atomic subtraction using intrinsics
  */
-RAJA_INLINE char builtin_atomicSub(char *acc, char value)
+RAJA_INLINE char builtin_atomicSub(char* acc, char value)
 {
   return _InterlockedExchangeAdd8(acc, -value);
 }
 
-RAJA_INLINE short builtin_atomicSub(short *acc, short value)
+RAJA_INLINE short builtin_atomicSub(short* acc, short value)
 {
   return _InterlockedExchangeAdd16(acc, -value);
 }
 
-RAJA_INLINE long builtin_atomicSub(long *acc, long value)
+RAJA_INLINE long builtin_atomicSub(long* acc, long value)
 {
   return _InterlockedExchangeAdd(acc, -value);
 }
 
 #if defined(_WIN64)
 
-RAJA_INLINE long long builtin_atomicSub(long long *acc, long long value)
+RAJA_INLINE long long builtin_atomicSub(long long* acc, long long value)
 {
   return _InterlockedExchangeAdd64(acc, -value);
 }
@@ -266,24 +265,24 @@ RAJA_INLINE long long builtin_atomicSub(long long *acc, long long value)
 /*!
  * Atomic and using intrinsics
  */
-RAJA_INLINE char builtin_atomicAnd(char *acc, char value)
+RAJA_INLINE char builtin_atomicAnd(char* acc, char value)
 {
   return _InterlockedAnd8(acc, value);
 }
 
-RAJA_INLINE short builtin_atomicAnd(short *acc, short value)
+RAJA_INLINE short builtin_atomicAnd(short* acc, short value)
 {
   return _InterlockedAnd16(acc, value);
 }
 
-RAJA_INLINE long builtin_atomicAnd(long *acc, long value)
+RAJA_INLINE long builtin_atomicAnd(long* acc, long value)
 {
   return _InterlockedAnd(acc, value);
 }
 
 #if defined(_WIN64)
 
-RAJA_INLINE long long builtin_atomicAnd(long long *acc, long long value)
+RAJA_INLINE long long builtin_atomicAnd(long long* acc, long long value)
 {
   return _InterlockedAnd64(acc, value);
 }
@@ -294,24 +293,24 @@ RAJA_INLINE long long builtin_atomicAnd(long long *acc, long long value)
 /*!
  * Atomic xor using intrinsics
  */
-RAJA_INLINE char builtin_atomicXor(char *acc, char value)
+RAJA_INLINE char builtin_atomicXor(char* acc, char value)
 {
   return _InterlockedXor8(acc, value);
 }
 
-RAJA_INLINE short builtin_atomicXor(short *acc, short value)
+RAJA_INLINE short builtin_atomicXor(short* acc, short value)
 {
   return _InterlockedXor16(acc, value);
 }
 
-RAJA_INLINE long builtin_atomicXor(long *acc, long value)
+RAJA_INLINE long builtin_atomicXor(long* acc, long value)
 {
   return _InterlockedXor(acc, value);
 }
 
 #if defined(_WIN64)
 
-RAJA_INLINE long long builtin_atomicXor(long long *acc, long long value)
+RAJA_INLINE long long builtin_atomicXor(long long* acc, long long value)
 {
   return _InterlockedXor64(acc, value);
 }
@@ -326,83 +325,82 @@ RAJA_INLINE long long builtin_atomicXor(long long *acc, long long value)
  * Type trait for determining if the operator should be implemented
  * using an intrinsic
  */
-template <typename T>
-struct builtin_useIntrinsic {
+template<typename T>
+struct builtin_useIntrinsic
+{
   static constexpr bool value =
-    (std::is_integral<T>::value || std::is_enum<T>::value) &&
-    (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8);
+      (std::is_integral<T>::value || std::is_enum<T>::value) &&
+      (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8);
 };
 
-
 /*!
  * Type trait for determining if the operator should be implemented
  * by reinterpreting inputs to types that intrinsics support
  */
-template <typename T>
-struct builtin_useReinterpret {
-  static constexpr bool value =
-    !std::is_integral<T>::value &&
-    !std::is_enum<T>::value &&
-    ((sizeof(T) == 1
+template<typename T>
+struct builtin_useReinterpret
+{
+  static constexpr bool value = !std::is_integral<T>::value &&
+                                !std::is_enum<T>::value &&
+                                ((sizeof(T) == 1
 #if !defined(UINT8_MAX)
-      && sizeof(unsigned char) == 1
+                                  && sizeof(unsigned char) == 1
 #endif
-     ) ||
-     (sizeof(T) == 2
+                                  ) ||
+                                 (sizeof(T) == 2
 #if !defined(UINT16_MAX)
-      && sizeof(unsigned short) == 2
+                                  && sizeof(unsigned short) == 2
 #endif
-     ) ||
-     (sizeof(T) == 4
+                                  ) ||
+                                 (sizeof(T) == 4
 #if !defined(UINT32_MAX)
-      && sizeof(unsigned int) == 4
+                                  && sizeof(unsigned int) == 4
 #endif
-     ) ||
-     (sizeof(T) == 8
+                                  ) ||
+                                 (sizeof(T) == 8
 #if !defined(UINT64_MAX)
-      && sizeof(unsigned long long) == 8
+                                  && sizeof(unsigned long long) == 8
 #endif
-     ));
+                                  ));
 
   using type =
-    std::conditional_t<sizeof(T) == 1,
+      std::conditional_t<sizeof(T) == 1,
 #if defined(UINT8_MAX)
-                       uint8_t,
+                         uint8_t,
 #else
-                       unsigned char,
+                         unsigned char,
 #endif
-    std::conditional_t<sizeof(T) == 2,
+                         std::conditional_t<sizeof(T) == 2,
 #if defined(UINT16_MAX)
-                       uint16_t,
+                                            uint16_t,
 #else
-                       unsigned short,
+                                            unsigned short,
 #endif
-    std::conditional_t<sizeof(T) == 4,
+                                            std::conditional_t<sizeof(T) == 4,
 #if defined(UINT32_MAX)
-                       uint32_t,
+                                                               uint32_t,
 #else
-                       unsigned int,
+                                                               unsigned int,
 #endif
 #if defined(UINT64_MAX)
-                       uint64_t>>>;
+                                                               uint64_t>>>;
 #else
-                       unsigned long long>>>;
+                                                               unsigned long long>>>;
 #endif
 };
 
-
 /*!
  * Type trait for determining if the operator should be implemented
  * using a compare and swap loop
  */
-template <typename T>
-struct builtin_useCAS {
+template<typename T>
+struct builtin_useCAS
+{
   static constexpr bool value =
-    !std::is_integral<T>::value && !std::is_enum<T>::value &&
-    (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8);
+      !std::is_integral<T>::value && !std::is_enum<T>::value &&
+      (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8);
 };
 
-
 /*!
  * Atomics implemented using intrinsics
  */
@@ -411,99 +409,91 @@ struct builtin_useCAS {
 /*!
  * Atomic load using intrinsic
  */
-template <typename T,
-          std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicLoad(T *acc)
+template<typename T,
+         std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicLoad(T* acc)
 {
   return __atomic_load_n(acc, __ATOMIC_RELAXED);
 }
 
-
 /*!
  * Atomic store using intrinsic
  */
-template <typename T,
-          std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE void builtin_atomicStore(T *acc, T value)
+template<typename T,
+         std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
+RAJA_DEVICE_HIP RAJA_INLINE void builtin_atomicStore(T* acc, T value)
 {
   __atomic_store_n(acc, value, __ATOMIC_RELAXED);
 }
 
-
 /*!
  * Atomic exchange using intrinsic
  */
-template <typename T,
-          std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicExchange(T *acc, T value)
+template<typename T,
+         std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicExchange(T* acc, T value)
 {
   return __atomic_exchange_n(acc, value, __ATOMIC_RELAXED);
 }
 
-
 /*!
  * Atomic compare and swap using intrinsic
  */
-template <typename T,
-          std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS(T *acc, T compare, T value)
+template<typename T,
+         std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS(T* acc, T compare, T value)
 {
-  __atomic_compare_exchange_n(
-      acc, &compare, value, false, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+  __atomic_compare_exchange_n(acc, &compare, value, false, __ATOMIC_RELAXED,
+                              __ATOMIC_RELAXED);
   return compare;
 }
 
-
 /*!
  * Atomic addition using intrinsic
  */
-template <typename T,
-          std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAdd(T *acc, T value)
+template<typename T,
+         std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAdd(T* acc, T value)
 {
   return __atomic_fetch_add(acc, value, __ATOMIC_RELAXED);
 }
 
-
 /*!
  * Atomic subtraction using intrinsic
  */
-template <typename T,
-          std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicSub(T *acc, T value)
+template<typename T,
+         std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicSub(T* acc, T value)
 {
   return __atomic_fetch_sub(acc, value, __ATOMIC_RELAXED);
 }
 
-
 /*!
  * Atomic and using intrinsic
  */
-template <typename T,
-          std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAnd(T *acc, T value)
+template<typename T,
+         std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAnd(T* acc, T value)
 {
   return __atomic_fetch_and(acc, value, __ATOMIC_RELAXED);
 }
 
-
 /*!
  * Atomic or using intrinsic
  */
-template <typename T,
-          std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicOr(T *acc, T value)
+template<typename T,
+         std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicOr(T* acc, T value)
 {
   return __atomic_fetch_or(acc, value, __ATOMIC_RELAXED);
 }
 
-
 /*!
  * Atomic xor using intrinsic
  */
-template <typename T,
-          std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicXor(T *acc, T value)
+template<typename T,
+         std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicXor(T* acc, T value)
 {
   return __atomic_fetch_xor(acc, value, __ATOMIC_RELAXED);
 }
@@ -520,30 +510,28 @@ RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicXor(T *acc, T value)
 /*!
  * Alias for determining the integral type of the same size as the given type
  */
-template <typename T>
+template<typename T>
 using builtin_useReinterpret_t = typename builtin_useReinterpret<T>::type;
 
-
 /*!
  * Atomic load using reinterpret cast
  */
-template <typename T,
-          std::enable_if_t<builtin_useReinterpret<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicLoad(T *acc)
+template<typename T,
+         std::enable_if_t<builtin_useReinterpret<T>::value, bool> = true>
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicLoad(T* acc)
 {
   using R = builtin_useReinterpret_t<T>;
 
   return RAJA::util::reinterp_A_as_B<R, T>(
-    builtin_atomicLoad(reinterpret_cast<R*>(acc)));
+      builtin_atomicLoad(reinterpret_cast<R*>(acc)));
 }
 
-
 /*!
  * Atomic store using reinterpret cast
  */
-template <typename T,
-          std::enable_if_t<builtin_useReinterpret<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE void builtin_atomicStore(T *acc, T value)
+template<typename T,
+         std::enable_if_t<builtin_useReinterpret<T>::value, bool> = true>
+RAJA_DEVICE_HIP RAJA_INLINE void builtin_atomicStore(T* acc, T value)
 {
   using R = builtin_useReinterpret_t<T>;
 
@@ -551,38 +539,33 @@ RAJA_DEVICE_HIP RAJA_INLINE void builtin_atomicStore(T *acc, T value)
                       RAJA::util::reinterp_A_as_B<T, R>(value));
 }
 
-
 /*!
  * Atomic exchange using reinterpret cast
  */
-template <typename T,
-          std::enable_if_t<builtin_useReinterpret<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicExchange(T *acc, T value)
+template<typename T,
+         std::enable_if_t<builtin_useReinterpret<T>::value, bool> = true>
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicExchange(T* acc, T value)
 {
   using R = builtin_useReinterpret_t<T>;
 
-  return RAJA::util::reinterp_A_as_B<R, T>(
-    builtin_atomicExchange(reinterpret_cast<R*>(acc),
-                           RAJA::util::reinterp_A_as_B<T, R>(value)));
+  return RAJA::util::reinterp_A_as_B<R, T>(builtin_atomicExchange(
+      reinterpret_cast<R*>(acc), RAJA::util::reinterp_A_as_B<T, R>(value)));
 }
 
-
 /*!
  * Atomic compare and swap using reinterpret cast
  */
-template <typename T,
-          std::enable_if_t<builtin_useReinterpret<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS(T *acc, T compare, T value)
+template<typename T,
+         std::enable_if_t<builtin_useReinterpret<T>::value, bool> = true>
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS(T* acc, T compare, T value)
 {
   using R = builtin_useReinterpret_t<T>;
 
-  return RAJA::util::reinterp_A_as_B<R, T>(
-    builtin_atomicCAS(reinterpret_cast<R*>(acc),
-                      RAJA::util::reinterp_A_as_B<T, R>(compare),
-                      RAJA::util::reinterp_A_as_B<T, R>(value)));
+  return RAJA::util::reinterp_A_as_B<R, T>(builtin_atomicCAS(
+      reinterpret_cast<R*>(acc), RAJA::util::reinterp_A_as_B<T, R>(compare),
+      RAJA::util::reinterp_A_as_B<T, R>(value)));
 }
 
-
 /*!
  * Implementation of compare and swap loop
  */
@@ -592,22 +575,21 @@ RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS(T *acc, T compare, T value)
  * Equality comparison for compare and swap loop using types supported by
  * intrinsics.
  */
-template <typename T,
-          std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE bool builtin_atomicCAS_equal(const T &a, const T &b)
+template<typename T,
+         std::enable_if_t<builtin_useIntrinsic<T>::value, bool> = true>
+RAJA_DEVICE_HIP RAJA_INLINE bool builtin_atomicCAS_equal(const T& a, const T& b)
 {
   return a == b;
 }
 
-
 /*!
  * Equality comparison for compare and swap loop using reinterpret cast.
  * Converts to the underlying integral type to avoid cases where the values
  * will never compare equal (most notably, NaNs).
  */
-template <typename T,
-          std::enable_if_t<builtin_useReinterpret<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE bool builtin_atomicCAS_equal(const T &a, const T &b)
+template<typename T,
+         std::enable_if_t<builtin_useReinterpret<T>::value, bool> = true>
+RAJA_DEVICE_HIP RAJA_INLINE bool builtin_atomicCAS_equal(const T& a, const T& b)
 {
   using R = builtin_useReinterpret_t<T>;
 
@@ -615,56 +597,55 @@ RAJA_DEVICE_HIP RAJA_INLINE bool builtin_atomicCAS_equal(const T &a, const T &b)
                                  RAJA::util::reinterp_A_as_B<T, R>(b));
 }
 
-
 /*!
  * Generic impementation of any atomic 8, 16, 32, or 64 bit operator
  * that can be implemented using a builtin compare and swap primitive.
  * Returns the OLD value that was replaced by the result of this operation.
  */
-template <typename T, typename Oper>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS_loop(T *acc,
-                                                     Oper &&oper)
+template<typename T, typename Oper>
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS_loop(T* acc, Oper&& oper)
 {
   T old = builtin_atomicLoad(acc);
   T expected;
 
-  do {
+  do
+  {
     expected = old;
-    old = builtin_atomicCAS(acc, expected, oper(expected));
+    old      = builtin_atomicCAS(acc, expected, oper(expected));
   } while (!builtin_atomicCAS_equal(old, expected));
 
   return old;
 }
 
-
 /*!
  * Generic impementation of any atomic 8, 16, 32, or 64 bit operator
  * that can be implemented using a builtin compare and swap primitive.
  * Uses short-circuiting for improved efficiency. Returns the OLD value
  * that was replaced by the result of this operation.
  */
-template <typename T, typename Oper, typename ShortCircuit>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS_loop(T *acc,
-                                                     Oper &&oper,
-                                                     ShortCircuit &&sc)
+template<typename T, typename Oper, typename ShortCircuit>
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS_loop(T* acc,
+                                                     Oper&& oper,
+                                                     ShortCircuit&& sc)
 {
   T old = builtin_atomicLoad(acc);
 
-  if (sc(old)) {
+  if (sc(old))
+  {
     return old;
   }
 
   T expected;
 
-  do {
+  do
+  {
     expected = old;
-    old = builtin_atomicCAS(acc, expected, oper(expected));
+    old      = builtin_atomicCAS(acc, expected, oper(expected));
   } while (!builtin_atomicCAS_equal(old, expected) && !sc(old));
 
   return old;
 }
 
-
 /*!
  * Atomics implemented using compare and swap loop
  */
@@ -673,63 +654,54 @@ RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicCAS_loop(T *acc,
 /*!
  * Atomic addition using compare and swap loop
  */
-template <typename T,
-          std::enable_if_t<builtin_useCAS<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAdd(T *acc, T value)
+template<typename T, std::enable_if_t<builtin_useCAS<T>::value, bool> = true>
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAdd(T* acc, T value)
 {
-  return builtin_atomicCAS_loop(acc, [value] (T old) {
+  return builtin_atomicCAS_loop(acc, [value](T old) {
     return old + value;
   });
 }
 
-
 /*!
  * Atomic subtraction using compare and swap loop
  */
-template <typename T,
-          std::enable_if_t<builtin_useCAS<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicSub(T *acc, T value)
+template<typename T, std::enable_if_t<builtin_useCAS<T>::value, bool> = true>
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicSub(T* acc, T value)
 {
-  return builtin_atomicCAS_loop(acc, [value] (T old) {
+  return builtin_atomicCAS_loop(acc, [value](T old) {
     return old - value;
   });
 }
 
-
 /*!
  * Atomic and using compare and swap loop
  */
-template <typename T,
-          std::enable_if_t<builtin_useCAS<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAnd(T *acc, T value)
+template<typename T, std::enable_if_t<builtin_useCAS<T>::value, bool> = true>
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicAnd(T* acc, T value)
 {
-  return builtin_atomicCAS_loop(acc, [value] (T old) {
+  return builtin_atomicCAS_loop(acc, [value](T old) {
     return old & value;
   });
 }
 
-
 /*!
  * Atomic or using compare and swap loop
  */
-template <typename T,
-          std::enable_if_t<builtin_useCAS<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicOr(T *acc, T value)
+template<typename T, std::enable_if_t<builtin_useCAS<T>::value, bool> = true>
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicOr(T* acc, T value)
 {
-  return builtin_atomicCAS_loop(acc, [value] (T old) {
+  return builtin_atomicCAS_loop(acc, [value](T old) {
     return old | value;
   });
 }
 
-
 /*!
  * Atomic xor using compare and swap loop
  */
-template <typename T,
-          std::enable_if_t<builtin_useCAS<T>::value, bool> = true>
-RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicXor(T *acc, T value)
+template<typename T, std::enable_if_t<builtin_useCAS<T>::value, bool> = true>
+RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicXor(T* acc, T value)
 {
-  return builtin_atomicCAS_loop(acc, [value] (T old) {
+  return builtin_atomicCAS_loop(acc, [value](T old) {
     return old ^ value;
   });
 }
@@ -737,111 +709,112 @@ RAJA_DEVICE_HIP RAJA_INLINE T builtin_atomicXor(T *acc, T value)
 
 }  // namespace detail
 
-
-template <typename T>
-RAJA_DEVICE_HIP RAJA_INLINE T atomicLoad(builtin_atomic, T *acc)
+template<typename T>
+RAJA_DEVICE_HIP RAJA_INLINE T atomicLoad(builtin_atomic, T* acc)
 {
   return detail::builtin_atomicLoad(acc);
 }
 
-template <typename T>
-RAJA_DEVICE_HIP RAJA_INLINE void atomicStore(builtin_atomic, T *acc, T value)
+template<typename T>
+RAJA_DEVICE_HIP RAJA_INLINE void atomicStore(builtin_atomic, T* acc, T value)
 {
   detail::builtin_atomicStore(acc, value);
 }
 
-template <typename T>
-RAJA_DEVICE_HIP RAJA_INLINE T atomicAdd(builtin_atomic, T *acc, T value)
+template<typename T>
+RAJA_DEVICE_HIP RAJA_INLINE T atomicAdd(builtin_atomic, T* acc, T value)
 {
   return detail::builtin_atomicAdd(acc, value);
 }
 
-template <typename T>
-RAJA_DEVICE_HIP RAJA_INLINE T atomicSub(builtin_atomic, T *acc, T value)
+template<typename T>
+RAJA_DEVICE_HIP RAJA_INLINE T atomicSub(builtin_atomic, T* acc, T value)
 {
   return detail::builtin_atomicSub(acc, value);
 }
 
-template <typename T>
-RAJA_DEVICE_HIP RAJA_INLINE T atomicMin(builtin_atomic, T *acc, T value)
+template<typename T>
+RAJA_DEVICE_HIP RAJA_INLINE T atomicMin(builtin_atomic, T* acc, T value)
 {
   return detail::builtin_atomicCAS_loop(
-    acc,
-    [value] (T old) {
-      return value < old ? value : old;
-    },
-    [value] (T current) {
-      return current <= value;
-    });
+      acc,
+      [value](T old) {
+        return value < old ? value : old;
+      },
+      [value](T current) {
+        return current <= value;
+      });
 }
 
-template <typename T>
-RAJA_DEVICE_HIP RAJA_INLINE T atomicMax(builtin_atomic, T *acc, T value)
+template<typename T>
+RAJA_DEVICE_HIP RAJA_INLINE T atomicMax(builtin_atomic, T* acc, T value)
 {
   return detail::builtin_atomicCAS_loop(
-    acc,
-    [value] (T old) {
-      return old < value ? value : old;
-    },
-    [value] (T current) {
-      return value <= current;
-    });
+      acc,
+      [value](T old) {
+        return old < value ? value : old;
+      },
+      [value](T current) {
+        return value <= current;
+      });
 }
 
-template <typename T>
-RAJA_DEVICE_HIP RAJA_INLINE T atomicInc(builtin_atomic, T *acc)
+template<typename T>
+RAJA_DEVICE_HIP RAJA_INLINE T atomicInc(builtin_atomic, T* acc)
 {
   return detail::builtin_atomicAdd(acc, static_cast<T>(1));
 }
 
-template <typename T>
-RAJA_DEVICE_HIP RAJA_INLINE T atomicInc(builtin_atomic, T *acc, T value)
+template<typename T>
+RAJA_DEVICE_HIP RAJA_INLINE T atomicInc(builtin_atomic, T* acc, T value)
 {
-  return detail::builtin_atomicCAS_loop(acc, [value] (T old) {
+  return detail::builtin_atomicCAS_loop(acc, [value](T old) {
     return value <= old ? static_cast<T>(0) : old + static_cast<T>(1);
   });
 }
 
-template <typename T>
-RAJA_DEVICE_HIP RAJA_INLINE T atomicDec(builtin_atomic, T *acc)
+template<typename T>
+RAJA_DEVICE_HIP RAJA_INLINE T atomicDec(builtin_atomic, T* acc)
 {
   return detail::builtin_atomicSub(acc, static_cast<T>(1));
 }
 
-template <typename T>
-RAJA_DEVICE_HIP RAJA_INLINE T atomicDec(builtin_atomic, T *acc, T value)
+template<typename T>
+RAJA_DEVICE_HIP RAJA_INLINE T atomicDec(builtin_atomic, T* acc, T value)
 {
-  return detail::builtin_atomicCAS_loop(acc, [value] (T old) {
-    return old == static_cast<T>(0) || value < old ? value : old - static_cast<T>(1);
+  return detail::builtin_atomicCAS_loop(acc, [value](T old) {
+    return old == static_cast<T>(0) || value < old ? value
+                                                   : old - static_cast<T>(1);
   });
 }
 
-template <typename T>
-RAJA_DEVICE_HIP RAJA_INLINE T atomicAnd(builtin_atomic, T *acc, T value)
+template<typename T>
+RAJA_DEVICE_HIP RAJA_INLINE T atomicAnd(builtin_atomic, T* acc, T value)
 {
   return detail::builtin_atomicAnd(acc, value);
 }
 
-template <typename T>
-RAJA_DEVICE_HIP RAJA_INLINE T atomicOr(builtin_atomic, T *acc, T value)
+template<typename T>
+RAJA_DEVICE_HIP RAJA_INLINE T atomicOr(builtin_atomic, T* acc, T value)
 {
   return detail::builtin_atomicOr(acc, value);
 }
 
-template <typename T>
-RAJA_DEVICE_HIP RAJA_INLINE T atomicXor(builtin_atomic, T *acc, T value)
+template<typename T>
+RAJA_DEVICE_HIP RAJA_INLINE T atomicXor(builtin_atomic, T* acc, T value)
 {
   return detail::builtin_atomicXor(acc, value);
 }
 
-template <typename T>
-RAJA_DEVICE_HIP RAJA_INLINE T atomicExchange(builtin_atomic, T *acc, T value)
+template<typename T>
+RAJA_DEVICE_HIP RAJA_INLINE T atomicExchange(builtin_atomic, T* acc, T value)
 {
   return detail::builtin_atomicExchange(acc, value);
 }
 
-template <typename T>
-RAJA_DEVICE_HIP RAJA_INLINE T atomicCAS(builtin_atomic, T *acc, T compare, T value)
+template<typename T>
+RAJA_DEVICE_HIP RAJA_INLINE T
+atomicCAS(builtin_atomic, T* acc, T compare, T value)
 {
   return detail::builtin_atomicCAS(acc, compare, value);
 }
diff --git a/include/RAJA/policy/cuda.hpp b/include/RAJA/policy/cuda.hpp
index e9d5bc454f..40d5e68e4c 100644
--- a/include/RAJA/policy/cuda.hpp
+++ b/include/RAJA/policy/cuda.hpp
@@ -28,7 +28,7 @@
 #include <cuda_runtime.h>
 
 #if !defined(RAJA_ENABLE_DESUL_ATOMICS)
-    #include "RAJA/policy/cuda/atomic.hpp"
+#include "RAJA/policy/cuda/atomic.hpp"
 #endif
 
 #include "RAJA/policy/cuda/forall.hpp"
diff --git a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp
index 88a89d5362..284c15e117 100644
--- a/include/RAJA/policy/cuda/MemUtils_CUDA.hpp
+++ b/include/RAJA/policy/cuda/MemUtils_CUDA.hpp
@@ -69,9 +69,9 @@ cudaDeviceProp& device_prop()
   return prop;
 }
 
-
 //! Allocator for pinned memory for use in basic_mempool
-struct PinnedAllocator {
+struct PinnedAllocator
+{
 
   // returns a valid pointer on success, nullptr on failure
   void* malloc(size_t nbytes)
@@ -90,7 +90,8 @@ struct PinnedAllocator {
 };
 
 //! Allocator for device memory for use in basic_mempool
-struct DeviceAllocator {
+struct DeviceAllocator
+{
 
   // returns a valid pointer on success, nullptr on failure
   void* malloc(size_t nbytes)
@@ -110,7 +111,8 @@ struct DeviceAllocator {
 
 //! Allocator for pre-zeroed device memory for use in basic_mempool
 //  Note: Memory must be zero when returned to mempool
-struct DeviceZeroedAllocator {
+struct DeviceZeroedAllocator
+{
 
   // returns a valid pointer on success, nullptr on failure
   void* malloc(size_t nbytes)
@@ -132,7 +134,8 @@ struct DeviceZeroedAllocator {
 };
 
 //! Allocator for device pinned memory for use in basic_mempool
-struct DevicePinnedAllocator {
+struct DevicePinnedAllocator
+{
 
   // returns a valid pointer on success, nullptr on failure
   void* malloc(size_t nbytes)
@@ -141,8 +144,10 @@ struct DevicePinnedAllocator {
     cudaErrchk(cudaGetDevice(&device));
     void* ptr;
     cudaErrchk(cudaMallocManaged(&ptr, nbytes, cudaMemAttachGlobal));
-    cudaErrchk(cudaMemAdvise(ptr, nbytes, cudaMemAdviseSetPreferredLocation, device));
-    cudaErrchk(cudaMemAdvise(ptr, nbytes, cudaMemAdviseSetAccessedBy, cudaCpuDeviceId));
+    cudaErrchk(
+        cudaMemAdvise(ptr, nbytes, cudaMemAdviseSetPreferredLocation, device));
+    cudaErrchk(cudaMemAdvise(ptr, nbytes, cudaMemAdviseSetAccessedBy,
+                             cudaCpuDeviceId));
 
     return ptr;
   }
@@ -158,22 +163,26 @@ struct DevicePinnedAllocator {
 using device_mempool_type = basic_mempool::MemPool<DeviceAllocator>;
 using device_zeroed_mempool_type =
     basic_mempool::MemPool<DeviceZeroedAllocator>;
-using device_pinned_mempool_type = basic_mempool::MemPool<DevicePinnedAllocator>;
+using device_pinned_mempool_type =
+    basic_mempool::MemPool<DevicePinnedAllocator>;
 using pinned_mempool_type = basic_mempool::MemPool<PinnedAllocator>;
 
 namespace detail
 {
 
 //! struct containing data necessary to coordinate kernel launches with reducers
-struct cudaInfo {
+struct cudaInfo
+{
   const void* func = nullptr;
-  cuda_dim_t gridDim{0, 0, 0};
-  cuda_dim_t blockDim{0, 0, 0};
+  cuda_dim_t gridDim {0, 0, 0};
+  cuda_dim_t blockDim {0, 0, 0};
   size_t* dynamic_smem = nullptr;
-  ::RAJA::resources::Cuda res{::RAJA::resources::Cuda::CudaFromStream(0,0)};
+  ::RAJA::resources::Cuda res {::RAJA::resources::Cuda::CudaFromStream(0, 0)};
   bool setup_reducers = false;
 };
-struct cudaStatusInfo : cudaInfo {
+
+struct cudaStatusInfo : cudaInfo
+{
 #if defined(RAJA_ENABLE_OPENMP)
   omp::mutex lock;
 #endif
@@ -190,10 +199,7 @@ extern cudaStatusInfo tl_status;
 extern std::unordered_map<cudaStream_t, bool> g_stream_info_map;
 
 RAJA_INLINE
-void synchronize_impl(::RAJA::resources::Cuda res)
-{
-  res.wait();
-}
+void synchronize_impl(::RAJA::resources::Cuda res) { res.wait(); }
 
 }  // namespace detail
 
@@ -205,13 +211,16 @@ void synchronize()
   lock_guard<omp::mutex> lock(detail::g_status.lock);
 #endif
   bool synchronize = false;
-  for (auto& val : detail::g_stream_info_map) {
-    if (!val.second) {
+  for (auto& val : detail::g_stream_info_map)
+  {
+    if (!val.second)
+    {
       synchronize = true;
-      val.second = true;
+      val.second  = true;
     }
   }
-  if (synchronize) {
+  if (synchronize)
+  {
     cudaErrchk(cudaDeviceSynchronize());
   }
 }
@@ -224,12 +233,16 @@ void synchronize(::RAJA::resources::Cuda res)
   lock_guard<omp::mutex> lock(detail::g_status.lock);
 #endif
   auto iter = detail::g_stream_info_map.find(res.get_stream());
-  if (iter != detail::g_stream_info_map.end()) {
-    if (!iter->second) {
+  if (iter != detail::g_stream_info_map.end())
+  {
+    if (!iter->second)
+    {
       iter->second = true;
       detail::synchronize_impl(res);
     }
-  } else {
+  }
+  else
+  {
     RAJA_ABORT_OR_THROW("Cannot synchronize unknown resource.");
   }
 }
@@ -242,29 +255,40 @@ void launch(::RAJA::resources::Cuda res, bool async = true)
   lock_guard<omp::mutex> lock(detail::g_status.lock);
 #endif
   auto iter = detail::g_stream_info_map.find(res.get_stream());
-  if (iter != detail::g_stream_info_map.end()) {
+  if (iter != detail::g_stream_info_map.end())
+  {
     iter->second = !async;
-  } else {
+  }
+  else
+  {
     detail::g_stream_info_map.emplace(res.get_stream(), !async);
   }
-  if (!async) {
+  if (!async)
+  {
     detail::synchronize_impl(res);
   }
 }
 
 //! Launch kernel and indicate resource synchronization status
 RAJA_INLINE
-void launch(const void* func, cuda_dim_t gridDim, cuda_dim_t blockDim, void** args, size_t shmem,
-            ::RAJA::resources::Cuda res, bool async = true, const char *name = nullptr)
+void launch(const void* func,
+            cuda_dim_t gridDim,
+            cuda_dim_t blockDim,
+            void** args,
+            size_t shmem,
+            ::RAJA::resources::Cuda res,
+            bool async       = true,
+            const char* name = nullptr)
 {
 #if defined(RAJA_ENABLE_NV_TOOLS_EXT)
-  if(name) nvtxRangePushA(name);
+  if (name) nvtxRangePushA(name);
 #else
   RAJA_UNUSED_VAR(name);
 #endif
-  cudaErrchk(cudaLaunchKernel(func, gridDim, blockDim, args, shmem, res.get_stream()));
+  cudaErrchk(
+      cudaLaunchKernel(func, gridDim, blockDim, args, shmem, res.get_stream()));
 #if defined(RAJA_ENABLE_NV_TOOLS_EXT)
-  if(name) nvtxRangePop();
+  if (name) nvtxRangePop();
 #endif
   launch(res, async);
 }
@@ -283,9 +307,11 @@ cuda_dim_t currentGridDim() { return detail::tl_status.gridDim; }
 
 //! get grid size of current launch
 RAJA_INLINE
-cuda_dim_member_t currentGridSize() { return detail::tl_status.gridDim.x *
-                                             detail::tl_status.gridDim.y *
-                                             detail::tl_status.gridDim.z; }
+cuda_dim_member_t currentGridSize()
+{
+  return detail::tl_status.gridDim.x * detail::tl_status.gridDim.y *
+         detail::tl_status.gridDim.z;
+}
 
 //! get blockDim of current launch
 RAJA_INLINE
@@ -293,9 +319,11 @@ cuda_dim_t currentBlockDim() { return detail::tl_status.blockDim; }
 
 //! get block size of current launch
 RAJA_INLINE
-cuda_dim_member_t currentBlockSize() { return detail::tl_status.blockDim.x *
-                                              detail::tl_status.blockDim.y *
-                                              detail::tl_status.blockDim.z; }
+cuda_dim_member_t currentBlockSize()
+{
+  return detail::tl_status.blockDim.x * detail::tl_status.blockDim.y *
+         detail::tl_status.blockDim.z;
+}
 
 //! get dynamic shared memory usage for current launch
 RAJA_INLINE
@@ -310,7 +338,8 @@ size_t maxDynamicShmem()
   return func_attr.maxDynamicSharedSizeBytes;
 }
 
-constexpr size_t dynamic_smem_allocation_failure = std::numeric_limits<size_t>::max();
+constexpr size_t dynamic_smem_allocation_failure =
+    std::numeric_limits<size_t>::max();
 
 //! Allocate dynamic shared memory for current launch
 //
@@ -322,24 +351,27 @@ constexpr size_t dynamic_smem_allocation_failure = std::numeric_limits<size_t>::
 //  Returns an offset into dynamic shared memory aligned to align on success,
 //  or dynamic_smem_allocation_failure on failure. Note that asking for 0 memory
 //  takes the failure return path.
-template < typename T, typename GetNFromMax >
-RAJA_INLINE
-size_t allocateDynamicShmem(GetNFromMax&& get_n_from_max, size_t align = alignof(T))
+template<typename T, typename GetNFromMax>
+RAJA_INLINE size_t allocateDynamicShmem(GetNFromMax&& get_n_from_max,
+                                        size_t align = alignof(T))
 {
   const size_t unaligned_shmem = *detail::tl_status.dynamic_smem;
-  const size_t align_offset = ((unaligned_shmem % align) != size_t(0))
-      ? align - (unaligned_shmem % align)
-      : size_t(0);
-  const size_t aligned_shmem = unaligned_shmem + align_offset;
+  const size_t align_offset    = ((unaligned_shmem % align) != size_t(0))
+                                     ? align - (unaligned_shmem % align)
+                                     : size_t(0);
+  const size_t aligned_shmem   = unaligned_shmem + align_offset;
 
   const size_t max_shmem_bytes = maxDynamicShmem() - aligned_shmem;
-  const size_t n_bytes = sizeof(T) *
-      std::forward<GetNFromMax>(get_n_from_max)(max_shmem_bytes / sizeof(T));
+  const size_t n_bytes = sizeof(T) * std::forward<GetNFromMax>(get_n_from_max)(
+                                         max_shmem_bytes / sizeof(T));
 
-  if (size_t(0) < n_bytes && n_bytes <= max_shmem_bytes) {
+  if (size_t(0) < n_bytes && n_bytes <= max_shmem_bytes)
+  {
     *detail::tl_status.dynamic_smem = aligned_shmem + n_bytes;
     return aligned_shmem;
-  } else {
+  }
+  else
+  {
     return dynamic_smem_allocation_failure;
   }
 }
@@ -353,7 +385,7 @@ ::RAJA::resources::Cuda currentResource() { return detail::tl_status.res; }
 // Note: This is done to setup the Reducer and MultiReducer objects through
 // their copy constructors. Both look at tl_status to setup per kernel launch
 // resources.
-template <typename LOOP_BODY>
+template<typename LOOP_BODY>
 RAJA_INLINE typename std::remove_reference<LOOP_BODY>::type make_launch_body(
     const void* func,
     cuda_dim_t gridDim,
@@ -362,14 +394,14 @@ RAJA_INLINE typename std::remove_reference<LOOP_BODY>::type make_launch_body(
     ::RAJA::resources::Cuda res,
     LOOP_BODY&& loop_body)
 {
-  ::RAJA::detail::ScopedAssignment<detail::cudaInfo> info_sa(detail::tl_status,
-      detail::cudaInfo{func, gridDim, blockDim, &dynamic_smem, res, true});
+  ::RAJA::detail::ScopedAssignment<detail::cudaInfo> info_sa(
+      detail::tl_status,
+      detail::cudaInfo {func, gridDim, blockDim, &dynamic_smem, res, true});
 
   using return_type = typename std::remove_reference<LOOP_BODY>::type;
   return return_type(std::forward<LOOP_BODY>(loop_body));
 }
 
-
 static constexpr int cuda_occupancy_uninitialized_int = -1;
 static constexpr size_t cuda_occupancy_uninitialized_size_t =
     std::numeric_limits<size_t>::max();
@@ -378,7 +410,8 @@ static constexpr size_t cuda_occupancy_uninitialized_size_t =
 struct CudaFixedMaxBlocksData
 {
   int device_sm_per_device = cuda::device_prop().multiProcessorCount;
-  int device_max_threads_per_sm = cuda::device_prop().maxThreadsPerMultiProcessor;
+  int device_max_threads_per_sm =
+      cuda::device_prop().maxThreadsPerMultiProcessor;
 };
 
 //! Get the maximum theoretical occupancy of the device
@@ -394,25 +427,26 @@ CudaFixedMaxBlocksData cuda_max_blocks()
 struct CudaOccMaxBlocksThreadsData
 {
   size_t func_dynamic_shmem_per_block = cuda_occupancy_uninitialized_size_t;
-  int func_max_blocks_per_device = cuda_occupancy_uninitialized_int;
-  int func_max_threads_per_block = cuda_occupancy_uninitialized_int;
+  int func_max_blocks_per_device      = cuda_occupancy_uninitialized_int;
+  int func_max_threads_per_block      = cuda_occupancy_uninitialized_int;
 };
 
 //! Get the maximum occupancy of a kernel with unknown threads per block
-template < typename RAJA_UNUSED_ARG(UniqueMarker) >
-RAJA_INLINE
-CudaOccMaxBlocksThreadsData cuda_occupancy_max_blocks_threads(const void* func,
-    size_t func_dynamic_shmem_per_block)
+template<typename RAJA_UNUSED_ARG(UniqueMarker)>
+RAJA_INLINE CudaOccMaxBlocksThreadsData
+cuda_occupancy_max_blocks_threads(const void* func,
+                                  size_t func_dynamic_shmem_per_block)
 {
   static thread_local CudaOccMaxBlocksThreadsData data;
 
-  if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block) {
+  if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block)
+  {
 
     data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block;
 
     cudaErrchk(cudaOccupancyMaxPotentialBlockSize(
-        &data.func_max_blocks_per_device, &data.func_max_threads_per_block, func, func_dynamic_shmem_per_block));
-
+        &data.func_max_blocks_per_device, &data.func_max_threads_per_block,
+        func, func_dynamic_shmem_per_block));
   }
 
   return data;
@@ -422,54 +456,55 @@ CudaOccMaxBlocksThreadsData cuda_occupancy_max_blocks_threads(const void* func,
 struct CudaOccMaxBlocksData : CudaFixedMaxBlocksData
 {
   size_t func_dynamic_shmem_per_block = cuda_occupancy_uninitialized_size_t;
-  int func_threads_per_block = cuda_occupancy_uninitialized_int;
-  int func_max_blocks_per_sm = cuda_occupancy_uninitialized_int;
+  int func_threads_per_block          = cuda_occupancy_uninitialized_int;
+  int func_max_blocks_per_sm          = cuda_occupancy_uninitialized_int;
 };
 
 //! Get the maximum occupancy of a kernel with compile time threads per block
-template < typename RAJA_UNUSED_ARG(UniqueMarker), int func_threads_per_block >
-RAJA_INLINE
-CudaOccMaxBlocksData cuda_occupancy_max_blocks(const void* func,
-    size_t func_dynamic_shmem_per_block)
+template<typename RAJA_UNUSED_ARG(UniqueMarker), int func_threads_per_block>
+RAJA_INLINE CudaOccMaxBlocksData
+cuda_occupancy_max_blocks(const void* func, size_t func_dynamic_shmem_per_block)
 {
   static thread_local CudaOccMaxBlocksData data;
 
-  if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block) {
+  if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block)
+  {
 
     data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block;
-    data.func_threads_per_block = func_threads_per_block;
+    data.func_threads_per_block       = func_threads_per_block;
 
     cudaErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-        &data.func_max_blocks_per_sm, func, func_threads_per_block, func_dynamic_shmem_per_block));
-
+        &data.func_max_blocks_per_sm, func, func_threads_per_block,
+        func_dynamic_shmem_per_block));
   }
 
   return data;
 }
 
 //! Get the maximum occupancy of a kernel with runtime threads per block
-template < typename RAJA_UNUSED_ARG(UniqueMarker) >
-RAJA_INLINE
-CudaOccMaxBlocksData cuda_occupancy_max_blocks(const void* func,
-    size_t func_dynamic_shmem_per_block, int func_threads_per_block)
+template<typename RAJA_UNUSED_ARG(UniqueMarker)>
+RAJA_INLINE CudaOccMaxBlocksData
+cuda_occupancy_max_blocks(const void* func,
+                          size_t func_dynamic_shmem_per_block,
+                          int func_threads_per_block)
 {
   static thread_local CudaOccMaxBlocksData data;
 
-  if ( data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block ||
-       data.func_threads_per_block != func_threads_per_block ) {
+  if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block ||
+      data.func_threads_per_block != func_threads_per_block)
+  {
 
     data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block;
-    data.func_threads_per_block = func_threads_per_block;
+    data.func_threads_per_block       = func_threads_per_block;
 
     cudaErrchk(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-    &data.func_max_blocks_per_sm, func, func_threads_per_block, func_dynamic_shmem_per_block));
-
+        &data.func_max_blocks_per_sm, func, func_threads_per_block,
+        func_dynamic_shmem_per_block));
   }
 
   return data;
 }
 
-
 /*!
  ******************************************************************************
  *
@@ -496,14 +531,16 @@ CudaOccMaxBlocksData cuda_occupancy_max_blocks(const void* func,
  *
  ******************************************************************************
  */
-template < typename IdxT, typename Concretizer, typename UniqueMarker>
+template<typename IdxT, typename Concretizer, typename UniqueMarker>
 struct ConcretizerImpl
 {
-  ConcretizerImpl(const void* func, size_t func_dynamic_shmem_per_block, IdxT len)
-    : m_func(func)
-    , m_func_dynamic_shmem_per_block(func_dynamic_shmem_per_block)
-    , m_len(len)
-  { }
+  ConcretizerImpl(const void* func,
+                  size_t func_dynamic_shmem_per_block,
+                  IdxT len)
+      : m_func(func),
+        m_func_dynamic_shmem_per_block(func_dynamic_shmem_per_block),
+        m_len(len)
+  {}
 
   IdxT get_max_block_size() const
   {
@@ -517,10 +554,14 @@ struct ConcretizerImpl
   IdxT get_block_size_to_fit_len(IdxT func_blocks_per_device) const
   {
     IdxT func_max_threads_per_block = this->get_max_block_size();
-    IdxT func_threads_per_block = RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device);
-    if (func_threads_per_block <= func_max_threads_per_block) {
+    IdxT func_threads_per_block =
+        RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device);
+    if (func_threads_per_block <= func_max_threads_per_block)
+    {
       return func_threads_per_block;
-    } else {
+    }
+    else
+    {
       return IdxT(0);
     }
   }
@@ -528,7 +569,8 @@ struct ConcretizerImpl
   //! Get a grid size when block size is specified
   IdxT get_grid_size_to_fit_len(IdxT func_threads_per_block) const
   {
-    IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block);
+    IdxT func_blocks_per_device =
+        RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block);
     return func_blocks_per_device;
   }
 
@@ -536,16 +578,17 @@ struct ConcretizerImpl
   auto get_block_and_grid_size_to_fit_len() const
   {
     IdxT func_max_threads_per_block = this->get_max_block_size();
-    IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_max_threads_per_block);
-    return std::make_pair(func_max_threads_per_block,
-                          func_blocks_per_device);
+    IdxT func_blocks_per_device =
+        RAJA_DIVIDE_CEILING_INT(m_len, func_max_threads_per_block);
+    return std::make_pair(func_max_threads_per_block, func_blocks_per_device);
   }
 
   //! Get a block size when grid size is specified
   IdxT get_block_size_to_fit_device(IdxT func_blocks_per_device) const
   {
     IdxT func_max_threads_per_block = this->get_max_block_size();
-    IdxT func_threads_per_block = RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device);
+    IdxT func_threads_per_block =
+        RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device);
     return std::min(func_threads_per_block, func_max_threads_per_block);
   }
 
@@ -554,8 +597,10 @@ struct ConcretizerImpl
   {
     auto data = cuda_occupancy_max_blocks<UniqueMarker>(
         m_func, m_func_dynamic_shmem_per_block, func_threads_per_block);
-    IdxT func_max_blocks_per_device = Concretizer::template get_max_grid_size<IdxT>(data);
-    IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block);
+    IdxT func_max_blocks_per_device =
+        Concretizer::template get_max_grid_size<IdxT>(data);
+    IdxT func_blocks_per_device =
+        RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block);
     return std::min(func_blocks_per_device, func_max_blocks_per_device);
   }
 
@@ -563,9 +608,9 @@ struct ConcretizerImpl
   auto get_block_and_grid_size_to_fit_device() const
   {
     IdxT func_max_threads_per_block = this->get_max_block_size();
-    IdxT func_blocks_per_device = this->get_grid_size_to_fit_device(func_max_threads_per_block);
-    return std::make_pair(func_max_threads_per_block,
-                          func_blocks_per_device);
+    IdxT func_blocks_per_device =
+        this->get_grid_size_to_fit_device(func_max_threads_per_block);
+    return std::make_pair(func_max_threads_per_block, func_blocks_per_device);
   }
 
 private:
diff --git a/include/RAJA/policy/cuda/WorkGroup/Dispatcher.hpp b/include/RAJA/policy/cuda/WorkGroup/Dispatcher.hpp
index f6269b36e4..32e439b6bb 100644
--- a/include/RAJA/policy/cuda/WorkGroup/Dispatcher.hpp
+++ b/include/RAJA/policy/cuda/WorkGroup/Dispatcher.hpp
@@ -29,7 +29,6 @@
 #include <thread>
 #include <mutex>
 
-
 namespace RAJA
 {
 
@@ -41,9 +40,9 @@ namespace cuda
 
 // global function that creates the value on the device using the
 // factory and writes it into a pinned ptr
-template < typename Factory >
-__global__ void get_value_global(
-    typename Factory::value_type* ptr, Factory factory)
+template<typename Factory>
+__global__ void get_value_global(typename Factory::value_type* ptr,
+                                 Factory factory)
 {
   *ptr = factory();
 }
@@ -52,8 +51,9 @@ __global__ void get_value_global(
 inline void* get_cached_value_ptr(size_t nbytes)
 {
   static size_t cached_nbytes = 0;
-  static void* ptr = nullptr;
-  if (nbytes > cached_nbytes) {
+  static void* ptr            = nullptr;
+  if (nbytes > cached_nbytes)
+  {
     cached_nbytes = 0;
     cudaErrchk(cudaFreeHost(ptr));
     cudaErrchk(cudaMallocHost(&ptr, nbytes));
@@ -73,7 +73,7 @@ inline std::mutex& get_value_mutex()
 // get the device function pointer by calling a global function to
 // write it into a pinned ptr, beware different instantiates of this
 // function may run concurrently
-template < typename Factory >
+template<typename Factory>
 inline auto get_value(Factory&& factory)
 {
   using value_type = typename std::decay_t<Factory>::value_type;
@@ -81,8 +81,9 @@ inline auto get_value(Factory&& factory)
 
   auto res = ::camp::resources::Cuda::get_default();
   auto ptr = static_cast<value_type*>(get_cached_value_ptr(sizeof(value_type)));
-  auto func = reinterpret_cast<const void*>(&get_value_global<std::decay_t<Factory>>);
-  void *args[] = {(void*)&ptr, (void*)&factory};
+  auto func =
+      reinterpret_cast<const void*>(&get_value_global<std::decay_t<Factory>>);
+  void* args[] = {(void*)&ptr, (void*)&factory};
   cudaErrchk(cudaLaunchKernel(func, 1, 1, args, 0, res.get_stream()));
   cudaErrchk(cudaStreamSynchronize(res.get_stream()));
 
@@ -91,7 +92,7 @@ inline auto get_value(Factory&& factory)
 
 // get the device function pointer and store it so it can be used
 // multiple times
-template < typename Factory >
+template<typename Factory>
 inline auto get_cached_value(Factory&& factory)
 {
   static auto value = get_value(std::forward<Factory>(factory));
@@ -101,17 +102,20 @@ inline auto get_cached_value(Factory&& factory)
 }  // namespace cuda
 
 /*!
-* Populate and return a Dispatcher object that can be used in device code
-*/
-template < typename T, typename Dispatcher_T, size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async >
-inline const Dispatcher_T* get_Dispatcher(cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async> const&)
+ * Populate and return a Dispatcher object that can be used in device code
+ */
+template<typename T,
+         typename Dispatcher_T,
+         size_t BLOCK_SIZE,
+         size_t BLOCKS_PER_SM,
+         bool Async>
+inline const Dispatcher_T* get_Dispatcher(
+    cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async> const&)
 {
-  static Dispatcher_T dispatcher{
-        Dispatcher_T::template makeDispatcher<T>(
-          [](auto&& factory) {
-            return cuda::get_cached_value(
-                std::forward<decltype(factory)>(factory));
-          }) };
+  static Dispatcher_T dispatcher {
+      Dispatcher_T::template makeDispatcher<T>([](auto&& factory) {
+        return cuda::get_cached_value(std::forward<decltype(factory)>(factory));
+      })};
   return &dispatcher;
 }
 
diff --git a/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp b/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp
index 41fe17c84a..ead475e923 100644
--- a/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp
+++ b/include/RAJA/policy/cuda/WorkGroup/WorkRunner.hpp
@@ -25,7 +25,6 @@
 
 #include "RAJA/pattern/WorkGroup/WorkRunner.hpp"
 
-
 namespace RAJA
 {
 
@@ -36,46 +35,48 @@ namespace detail
  * Runs work in a storage container in order
  * and returns any per run resources
  */
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async,
-          typename DISPATCH_POLICY_T,
-          typename ALLOCATOR_T,
-          typename INDEX_T,
-          typename ... Args>
-struct WorkRunner<
-        RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
-        RAJA::ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
+template<size_t BLOCK_SIZE,
+         size_t BLOCKS_PER_SM,
+         bool Async,
+         typename DISPATCH_POLICY_T,
+         typename ALLOCATOR_T,
+         typename INDEX_T,
+         typename... Args>
+struct WorkRunner<RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
+                  RAJA::ordered,
+                  DISPATCH_POLICY_T,
+                  ALLOCATOR_T,
+                  INDEX_T,
+                  Args...>
     : WorkRunnerForallOrdered<
-        RAJA::cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
-        RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
-        RAJA::ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
+          RAJA::cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
+          RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
+          RAJA::ordered,
+          DISPATCH_POLICY_T,
+          ALLOCATOR_T,
+          INDEX_T,
+          Args...>
 {
   using base = WorkRunnerForallOrdered<
-        RAJA::cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
-        RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
-        RAJA::ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>;
+      RAJA::cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
+      RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
+      RAJA::ordered,
+      DISPATCH_POLICY_T,
+      ALLOCATOR_T,
+      INDEX_T,
+      Args...>;
   using base::base;
-  using IndexType = INDEX_T;
+  using IndexType       = INDEX_T;
   using per_run_storage = typename base::per_run_storage;
 
   ///
   /// run the loops in the given work container in order using forall
   /// run all loops asynchronously and synchronize after is necessary
   ///
-  template < typename WorkContainer >
+  template<typename WorkContainer>
   per_run_storage run(WorkContainer const& storage,
-                      typename base::resource_type r, Args... args) const
+                      typename base::resource_type r,
+                      Args... args) const
   {
     per_run_storage run_storage =
         base::run(storage, r, std::forward<Args>(args)...);
@@ -83,8 +84,12 @@ struct WorkRunner<
     IndexType num_loops = std::distance(std::begin(storage), std::end(storage));
 
     // Only synchronize if we had something to iterate over
-    if (num_loops > 0 && BLOCK_SIZE > 0) {
-      if (!Async) { RAJA::cuda::synchronize(r); }
+    if (num_loops > 0 && BLOCK_SIZE > 0)
+    {
+      if (!Async)
+      {
+        RAJA::cuda::synchronize(r);
+      }
     }
 
     return run_storage;
@@ -95,46 +100,48 @@ struct WorkRunner<
  * Runs work in a storage container in reverse order
  * and returns any per run resources
  */
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async,
-          typename DISPATCH_POLICY_T,
-          typename ALLOCATOR_T,
-          typename INDEX_T,
-          typename ... Args>
-struct WorkRunner<
-        RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
-        RAJA::reverse_ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
+template<size_t BLOCK_SIZE,
+         size_t BLOCKS_PER_SM,
+         bool Async,
+         typename DISPATCH_POLICY_T,
+         typename ALLOCATOR_T,
+         typename INDEX_T,
+         typename... Args>
+struct WorkRunner<RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
+                  RAJA::reverse_ordered,
+                  DISPATCH_POLICY_T,
+                  ALLOCATOR_T,
+                  INDEX_T,
+                  Args...>
     : WorkRunnerForallReverse<
-        RAJA::cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
-        RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
-        RAJA::reverse_ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
+          RAJA::cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
+          RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
+          RAJA::reverse_ordered,
+          DISPATCH_POLICY_T,
+          ALLOCATOR_T,
+          INDEX_T,
+          Args...>
 {
   using base = WorkRunnerForallReverse<
-        RAJA::cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
-        RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
-        RAJA::reverse_ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>;
+      RAJA::cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
+      RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
+      RAJA::reverse_ordered,
+      DISPATCH_POLICY_T,
+      ALLOCATOR_T,
+      INDEX_T,
+      Args...>;
   using base::base;
-  using IndexType = INDEX_T;
+  using IndexType       = INDEX_T;
   using per_run_storage = typename base::per_run_storage;
 
   ///
   /// run the loops in the given work container in reverse order using forall
   /// run all loops asynchronously and synchronize after is necessary
   ///
-  template < typename WorkContainer >
+  template<typename WorkContainer>
   per_run_storage run(WorkContainer const& storage,
-                      typename base::resource_type r, Args... args) const
+                      typename base::resource_type r,
+                      Args... args) const
   {
     per_run_storage run_storage =
         base::run(storage, r, std::forward<Args>(args)...);
@@ -142,28 +149,33 @@ struct WorkRunner<
     IndexType num_loops = std::distance(std::begin(storage), std::end(storage));
 
     // Only synchronize if we had something to iterate over
-    if (num_loops > 0 && BLOCK_SIZE > 0) {
-      if (!Async) { RAJA::cuda::synchronize(r); }
+    if (num_loops > 0 && BLOCK_SIZE > 0)
+    {
+      if (!Async)
+      {
+        RAJA::cuda::synchronize(r);
+      }
     }
 
     return run_storage;
   }
 };
 
-
 /*!
  * A body and segment holder for storing loops that will be executed
  * on the device
  */
-template <typename Segment_type, typename LoopBody,
-          typename index_type, typename ... Args>
+template<typename Segment_type,
+         typename LoopBody,
+         typename index_type,
+         typename... Args>
 struct HoldCudaDeviceXThreadblockLoop
 {
-  template < typename segment_in, typename body_in >
+  template<typename segment_in, typename body_in>
   HoldCudaDeviceXThreadblockLoop(segment_in&& segment, body_in&& body)
-    : m_segment(std::forward<segment_in>(segment))
-    , m_body(std::forward<body_in>(body))
-  { }
+      : m_segment(std::forward<segment_in>(segment)),
+        m_body(std::forward<body_in>(body))
+  {}
 
   RAJA_DEVICE RAJA_INLINE void operator()(Args... args) const
   {
@@ -171,10 +183,11 @@ struct HoldCudaDeviceXThreadblockLoop
     // TODO:: decide whether or not to privatize the loop body
     const index_type i_begin = threadIdx.x + blockIdx.x * blockDim.x;
     const index_type stride  = blockDim.x * gridDim.x;
-    const auto begin = m_segment.begin();
-    const auto end   = m_segment.end();
+    const auto begin         = m_segment.begin();
+    const auto end           = m_segment.end();
     const index_type len(end - begin);
-    for ( index_type i = i_begin; i < len; i += stride ) {
+    for (index_type i = i_begin; i < len; i += stride)
+    {
       m_body(begin[i], std::forward<Args>(args)...);
     }
   }
@@ -184,12 +197,12 @@ struct HoldCudaDeviceXThreadblockLoop
   LoopBody m_body;
 };
 
-template < size_t BLOCK_SIZE,
-           size_t BLOCKS_PER_SM,
-           typename StorageIter,
-           typename value_type,
-           typename index_type,
-           typename ... Args >
+template<size_t BLOCK_SIZE,
+         size_t BLOCKS_PER_SM,
+         typename StorageIter,
+         typename value_type,
+         typename index_type,
+         typename... Args>
 __launch_bounds__(BLOCK_SIZE, BLOCKS_PER_SM) __global__
     void cuda_unordered_y_block_global(StorageIter iter, Args... args)
 {
@@ -199,43 +212,49 @@ __launch_bounds__(BLOCK_SIZE, BLOCKS_PER_SM) __global__
   value_type::device_call(&iter[i_loop], args...);
 }
 
-
 /*!
  * Runs work in a storage container out of order with loops mapping to
  * cuda blocks in the y direction and iterations mapping to threads in
  * the x direction, with the number of threads in the x dimension determined
  * by the average number of iterates per loop
  */
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async,
-          typename DISPATCH_POLICY_T,
-          typename ALLOCATOR_T,
-          typename INDEX_T,
-          typename ... Args>
+template<size_t BLOCK_SIZE,
+         size_t BLOCKS_PER_SM,
+         bool Async,
+         typename DISPATCH_POLICY_T,
+         typename ALLOCATOR_T,
+         typename INDEX_T,
+         typename... Args>
 struct WorkRunner<
-        RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
-        RAJA::policy::cuda::unordered_cuda_loop_y_block_iter_x_threadblock_average,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
+    RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
+    RAJA::policy::cuda::unordered_cuda_loop_y_block_iter_x_threadblock_average,
+    DISPATCH_POLICY_T,
+    ALLOCATOR_T,
+    INDEX_T,
+    Args...>
 {
-  using exec_policy = RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>;
-  using order_policy = RAJA::policy::cuda::unordered_cuda_loop_y_block_iter_x_threadblock_average;
+  using exec_policy =
+      RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>;
+  using order_policy = RAJA::policy::cuda::
+      unordered_cuda_loop_y_block_iter_x_threadblock_average;
   using dispatch_policy = DISPATCH_POLICY_T;
-  using Allocator = ALLOCATOR_T;
-  using index_type = INDEX_T;
-  using resource_type = resources::Cuda;
+  using Allocator       = ALLOCATOR_T;
+  using index_type      = INDEX_T;
+  using resource_type   = resources::Cuda;
 
   // The type that will hold the segment and loop body in work storage
-  struct holder_type {
-    template < typename T >
+  struct holder_type
+  {
+    template<typename T>
     using type = HoldCudaDeviceXThreadblockLoop<
-        typename camp::at<T, camp::num<0>>::type, // ITERABLE
-        typename camp::at<T, camp::num<1>>::type, // LOOP_BODY
-        index_type, Args...>;
+        typename camp::at<T, camp::num<0>>::type,  // ITERABLE
+        typename camp::at<T, camp::num<1>>::type,  // LOOP_BODY
+        index_type,
+        Args...>;
   };
+
   ///
-  template < typename T >
+  template<typename T>
   using holder_type_t = typename holder_type::template type<T>;
 
   // The policy indicating where the call function is invoked
@@ -244,21 +263,26 @@ struct WorkRunner<
 
   // The Dispatcher policy with holder_types used internally to handle the
   // ranges and callables passed in by the user.
-  using dispatcher_holder_policy = dispatcher_transform_types_t<dispatch_policy, holder_type>;
+  using dispatcher_holder_policy =
+      dispatcher_transform_types_t<dispatch_policy, holder_type>;
 
-  using dispatcher_type = Dispatcher<Platform::cuda, dispatcher_holder_policy, RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, true>, Args...>;
+  using dispatcher_type =
+      Dispatcher<Platform::cuda,
+                 dispatcher_holder_policy,
+                 RAJA::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, true>,
+                 Args...>;
 
   WorkRunner() = default;
 
-  WorkRunner(WorkRunner const&) = delete;
+  WorkRunner(WorkRunner const&)            = delete;
   WorkRunner& operator=(WorkRunner const&) = delete;
 
-  WorkRunner(WorkRunner && o)
-    : m_total_iterations(o.m_total_iterations)
+  WorkRunner(WorkRunner&& o) : m_total_iterations(o.m_total_iterations)
   {
     o.m_total_iterations = 0;
   }
-  WorkRunner& operator=(WorkRunner && o)
+
+  WorkRunner& operator=(WorkRunner&& o)
   {
     m_total_iterations = o.m_total_iterations;
 
@@ -268,35 +292,42 @@ struct WorkRunner<
 
   // runner interfaces with storage to enqueue so the runner can get
   // information from the segment and loop at enqueue time
-  template < typename WorkContainer, typename Iterable, typename LoopBody >
-  inline void enqueue(WorkContainer& storage, Iterable&& iter, LoopBody&& loop_body)
+  template<typename WorkContainer, typename Iterable, typename LoopBody>
+  inline void enqueue(WorkContainer& storage,
+                      Iterable&& iter,
+                      LoopBody&& loop_body)
   {
     using Iterator  = camp::decay<decltype(std::begin(iter))>;
     using LOOP_BODY = camp::decay<LoopBody>;
     using ITERABLE  = camp::decay<Iterable>;
-    using IndexType = camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
+    using IndexType =
+        camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
 
     using holder = holder_type_t<camp::list<ITERABLE, LOOP_BODY>>;
 
-    // using true_value_type = typename WorkContainer::template true_value_type<holder>;
+    // using true_value_type = typename WorkContainer::template
+    // true_value_type<holder>;
 
     Iterator begin = std::begin(iter);
-    Iterator end = std::end(iter);
-    IndexType len = std::distance(begin, end);
+    Iterator end   = std::end(iter);
+    IndexType len  = std::distance(begin, end);
 
     // Only launch kernel if we have something to iterate over
-    if (len > 0 && BLOCK_SIZE > 0) {
+    if (len > 0 && BLOCK_SIZE > 0)
+    {
 
       m_total_iterations += len;
 
       //
-      // TODO: Privatize the loop_body, using make_launch_body to setup reductions
+      // TODO: Privatize the loop_body, using make_launch_body to set up
+      // reductions
       //
       // LOOP_BODY body = RAJA::cuda::make_launch_body(func,
-      //     gridSize, blockSize, shmem, stream, std::forward<LoopBody>(loop_body));
+      //     gridSize, blockSize, shmem, stream,
+      //     std::forward<LoopBody>(loop_body));
 
       storage.template emplace<holder>(
-          get_Dispatcher<holder, dispatcher_type>(dispatcher_exec_policy{}),
+          get_Dispatcher<holder, dispatcher_type>(dispatcher_exec_policy {}),
           std::forward<Iterable>(iter), std::forward<LoopBody>(loop_body));
     }
   }
@@ -304,37 +335,45 @@ struct WorkRunner<
   // no extra storage required here
   using per_run_storage = int;
 
-  template < typename WorkContainer >
-  per_run_storage run(WorkContainer const& storage, resource_type r, Args... args) const
+  template<typename WorkContainer>
+  per_run_storage run(WorkContainer const& storage,
+                      resource_type r,
+                      Args... args) const
   {
-    using Iterator  = camp::decay<decltype(std::begin(storage))>;
-    using IndexType = camp::decay<decltype(std::distance(std::begin(storage), std::end(storage)))>;
+    using Iterator   = camp::decay<decltype(std::begin(storage))>;
+    using IndexType  = camp::decay<decltype(std::distance(std::begin(storage),
+                                                          std::end(storage)))>;
     using value_type = typename WorkContainer::value_type;
 
-    per_run_storage run_storage{};
+    per_run_storage run_storage {};
 
-    auto func = cuda_unordered_y_block_global<BLOCK_SIZE, BLOCKS_PER_SM, Iterator, value_type, index_type, Args...>;
+    auto func =
+        cuda_unordered_y_block_global<BLOCK_SIZE, BLOCKS_PER_SM, Iterator,
+                                      value_type, index_type, Args...>;
 
     //
     // Compute the requested iteration space size
     //
-    Iterator begin = std::begin(storage);
-    Iterator end = std::end(storage);
+    Iterator begin      = std::begin(storage);
+    Iterator end        = std::end(storage);
     IndexType num_loops = std::distance(begin, end);
 
     // Only launch kernel if we have something to iterate over
-    if (num_loops > 0 && BLOCK_SIZE > 0) {
+    if (num_loops > 0 && BLOCK_SIZE > 0)
+    {
 
-      index_type average_iterations = m_total_iterations / static_cast<index_type>(num_loops);
+      index_type average_iterations =
+          m_total_iterations / static_cast<index_type>(num_loops);
 
       //
       // Compute the number of blocks
       //
       constexpr index_type block_size = static_cast<index_type>(BLOCK_SIZE);
-      cuda_dim_t blockSize{static_cast<cuda_dim_member_t>(block_size), 1, 1};
-      cuda_dim_t gridSize{static_cast<cuda_dim_member_t>((average_iterations + block_size - 1) / block_size),
-                          static_cast<cuda_dim_member_t>(num_loops),
-                          1};
+      cuda_dim_t blockSize {static_cast<cuda_dim_member_t>(block_size), 1, 1};
+      cuda_dim_t gridSize {
+          static_cast<cuda_dim_member_t>((average_iterations + block_size - 1) /
+                                         block_size),
+          static_cast<cuda_dim_member_t>(num_loops), 1};
 
       RAJA_FT_BEGIN;
 
@@ -347,8 +386,9 @@ struct WorkRunner<
         //
         // Launch the kernel
         //
-        void* func_args[] = { (void*)&begin, (void*)&args... };
-        RAJA::cuda::launch((const void*)func, gridSize, blockSize, func_args, shmem, r, Async);
+        void* func_args[] = {(void*)&begin, (void*)&args...};
+        RAJA::cuda::launch((const void*)func, gridSize, blockSize, func_args,
+                           shmem, r, Async);
       }
 
       RAJA_FT_END;
@@ -358,10 +398,7 @@ struct WorkRunner<
   }
 
   // clear any state so ready to be destroyed or reused
-  void clear()
-  {
-    m_total_iterations = 0;
-  }
+  void clear() { m_total_iterations = 0; }
 
 private:
   index_type m_total_iterations = 0;
diff --git a/include/RAJA/policy/cuda/atomic.hpp b/include/RAJA/policy/cuda/atomic.hpp
index aedfe91a03..1efebe7736 100644
--- a/include/RAJA/policy/cuda/atomic.hpp
+++ b/include/RAJA/policy/cuda/atomic.hpp
@@ -25,7 +25,8 @@
 #include <stdexcept>
 #include <type_traits>
 
-#if __CUDA__ARCH__ >= 600 && __CUDACC_VER_MAJOR__ >= 11 && __CUDACC_VER_MINOR__ >= 6
+#if __CUDA__ARCH__ >= 600 && __CUDACC_VER_MAJOR__ >= 11 &&                     \
+    __CUDACC_VER_MINOR__ >= 6
 #define RAJA_ENABLE_CUDA_ATOMIC_REF
 #endif
 
@@ -46,7 +47,6 @@
 #include "RAJA/util/TypeConvert.hpp"
 #include "RAJA/util/macros.hpp"
 
-
 // TODO: When we can use if constexpr in C++17, this file can be cleaned up
 
 
@@ -64,15 +64,14 @@ namespace detail
  * operators. More specific type traits are added when needed, such as
  * cuda_useBuiltinExchange below.
  */
-template <typename T>
-struct cuda_useBuiltinCommon {
-  static constexpr bool value =
-    std::is_same<T, int>::value ||
-    std::is_same<T, unsigned int>::value ||
-    std::is_same<T, unsigned long long>::value;
+template<typename T>
+struct cuda_useBuiltinCommon
+{
+  static constexpr bool value = std::is_same<T, int>::value ||
+                                std::is_same<T, unsigned int>::value ||
+                                std::is_same<T, unsigned long long>::value;
 };
 
-
 /*!
  * Type trait for determining if atomic operators should be implemented
  * by reinterpreting inputs to types that the builtin functions support.
@@ -80,26 +79,24 @@ struct cuda_useBuiltinCommon {
  * type traits are added when needed, such as cuda_useReinterpretExchange
  * below.
  */
-template <typename T>
-struct cuda_useReinterpretCommon {
-  static constexpr bool value =
-    !cuda_useBuiltinCommon<T>::value &&
-    (sizeof(T) == sizeof(unsigned int) ||
-     sizeof(T) == sizeof(unsigned long long));
+template<typename T>
+struct cuda_useReinterpretCommon
+{
+  static constexpr bool value = !cuda_useBuiltinCommon<T>::value &&
+                                (sizeof(T) == sizeof(unsigned int) ||
+                                 sizeof(T) == sizeof(unsigned long long));
 
-  using type =
-    std::conditional_t<sizeof(T) == sizeof(unsigned int),
-                       unsigned int, unsigned long long>;
+  using type = std::conditional_t<sizeof(T) == sizeof(unsigned int),
+                                  unsigned int,
+                                  unsigned long long>;
 };
 
-
 /*!
  * Alias for determining the integral type of the same size as the given type
  */
-template <typename T>
+template<typename T>
 using cuda_useReinterpretCommon_t = typename cuda_useReinterpretCommon<T>::type;
 
-
 /*!
  * Performs an atomic bitwise or using a builtin function. Stores the new value
  * in the given address and returns the old value.
@@ -107,14 +104,13 @@ using cuda_useReinterpretCommon_t = typename cuda_useReinterpretCommon<T>::type;
  * This overload using builtin functions is used to implement atomic loads
  * under some build configurations.
  */
-template <typename T,
-          std::enable_if_t<cuda_useBuiltinCommon<T>::value, bool> = true>
-RAJA_INLINE __device__ T cuda_atomicOr(T *acc, T value)
+template<typename T,
+         std::enable_if_t<cuda_useBuiltinCommon<T>::value, bool> = true>
+RAJA_INLINE __device__ T cuda_atomicOr(T* acc, T value)
 {
   return ::atomicOr(acc, value);
 }
 
-
 /*!
  * Atomic exchange
  */
@@ -123,44 +119,45 @@ RAJA_INLINE __device__ T cuda_atomicOr(T *acc, T value)
  * Type trait for determining if the exchange operator should be implemented
  * using a builtin
  */
-template <typename T>
-struct cuda_useBuiltinExchange {
-  static constexpr bool value =
-    std::is_same<T, int>::value ||
-    std::is_same<T, unsigned int>::value ||
-    std::is_same<T, unsigned long long>::value ||
-    std::is_same<T, float>::value;
+template<typename T>
+struct cuda_useBuiltinExchange
+{
+  static constexpr bool value = std::is_same<T, int>::value ||
+                                std::is_same<T, unsigned int>::value ||
+                                std::is_same<T, unsigned long long>::value ||
+                                std::is_same<T, float>::value;
 };
 
 /*!
  * Type trait for determining if the exchange operator should be implemented
  * by reinterpreting inputs to types that the builtin exchange supports
  */
-template <typename T>
-struct cuda_useReinterpretExchange {
-  static constexpr bool value =
-    !cuda_useBuiltinExchange<T>::value &&
-    (sizeof(T) == sizeof(unsigned int) ||
-     sizeof(T) == sizeof(unsigned long long));
+template<typename T>
+struct cuda_useReinterpretExchange
+{
+  static constexpr bool value = !cuda_useBuiltinExchange<T>::value &&
+                                (sizeof(T) == sizeof(unsigned int) ||
+                                 sizeof(T) == sizeof(unsigned long long));
 
-  using type =
-    std::conditional_t<sizeof(T) == sizeof(unsigned int),
-                       unsigned int, unsigned long long>;
+  using type = std::conditional_t<sizeof(T) == sizeof(unsigned int),
+                                  unsigned int,
+                                  unsigned long long>;
 };
 
 /*!
  * Alias for determining the integral type of the same size as the given type
  */
-template <typename T>
-using cuda_useReinterpretExchange_t = typename cuda_useReinterpretExchange<T>::type;
+template<typename T>
+using cuda_useReinterpretExchange_t =
+    typename cuda_useReinterpretExchange<T>::type;
 
 /*!
  * Performs an atomic exchange using a builtin function. Stores the new value
  * in the given address and returns the old value.
  */
-template <typename T,
-          std::enable_if_t<cuda_useBuiltinExchange<T>::value, bool> = true>
-RAJA_INLINE __device__ T cuda_atomicExchange(T *acc, T value)
+template<typename T,
+         std::enable_if_t<cuda_useBuiltinExchange<T>::value, bool> = true>
+RAJA_INLINE __device__ T cuda_atomicExchange(T* acc, T value)
 {
   return ::atomicExch(acc, value);
 }
@@ -169,59 +166,56 @@ RAJA_INLINE __device__ T cuda_atomicExchange(T *acc, T value)
  * Performs an atomic exchange using a reinterpret cast. Stores the new value
  * in the given address and returns the old value.
  */
-template <typename T,
-          std::enable_if_t<cuda_useReinterpretExchange<T>::value, bool> = true>
-RAJA_INLINE __device__ T cuda_atomicExchange(T *acc, T value)
+template<typename T,
+         std::enable_if_t<cuda_useReinterpretExchange<T>::value, bool> = true>
+RAJA_INLINE __device__ T cuda_atomicExchange(T* acc, T value)
 {
   using R = cuda_useReinterpretExchange_t<T>;
 
-  return RAJA::util::reinterp_A_as_B<R, T>(
-    cuda_atomicExchange(reinterpret_cast<R*>(acc),
-                        RAJA::util::reinterp_A_as_B<T, R>(value)));
+  return RAJA::util::reinterp_A_as_B<R, T>(cuda_atomicExchange(
+      reinterpret_cast<R*>(acc), RAJA::util::reinterp_A_as_B<T, R>(value)));
 }
 
-
 /*!
  * Atomic load and store
  */
 #if defined(RAJA_ENABLE_CUDA_ATOMIC_REF)
 
-template <typename T>
-RAJA_INLINE __device__ T cuda_atomicLoad(T *acc)
+template<typename T>
+RAJA_INLINE __device__ T cuda_atomicLoad(T* acc)
 {
   return cuda::atomic_ref<T, cuda::thread_scope_device>(*acc).load(
-    cuda::memory_order_relaxed{});
+      cuda::memory_order_relaxed {});
 }
 
-
-template <typename T>
-RAJA_INLINE __device__ void cuda_atomicStore(T *acc, T value)
+template<typename T>
+RAJA_INLINE __device__ void cuda_atomicStore(T* acc, T value)
 {
   cuda::atomic_ref<T, cuda::thread_scope_device>(*acc).store(
-    value, cuda::memory_order_relaxed{});
+      value, cuda::memory_order_relaxed {});
 }
 
 #else
 
-template <typename T,
-          std::enable_if_t<cuda_useBuiltinCommon<T>::value, bool> = true>
-RAJA_INLINE __device__ T cuda_atomicLoad(T *acc)
+template<typename T,
+         std::enable_if_t<cuda_useBuiltinCommon<T>::value, bool> = true>
+RAJA_INLINE __device__ T cuda_atomicLoad(T* acc)
 {
   return cuda_atomicOr(acc, static_cast<T>(0));
 }
 
-template <typename T,
-          std::enable_if_t<cuda_useReinterpretCommon<T>::value, bool> = true>
-RAJA_INLINE __device__ T cuda_atomicLoad(T *acc)
+template<typename T,
+         std::enable_if_t<cuda_useReinterpretCommon<T>::value, bool> = true>
+RAJA_INLINE __device__ T cuda_atomicLoad(T* acc)
 {
   using R = cuda_useReinterpretCommon_t<T>;
 
   return RAJA::util::reinterp_A_as_B<R, T>(
-    cuda_atomicLoad(reinterpret_cast<R*>(acc)));
+      cuda_atomicLoad(reinterpret_cast<R*>(acc)));
 }
 
-template <typename T>
-RAJA_INLINE __device__ void cuda_atomicStore(T *acc, T value)
+template<typename T>
+RAJA_INLINE __device__ void cuda_atomicStore(T* acc, T value)
 {
   cuda_atomicExchange(acc, value);
 }
@@ -237,15 +231,15 @@ RAJA_INLINE __device__ void cuda_atomicStore(T *acc, T value)
  * Type trait for determining if the compare and swap operator should be
  * implemented using a builtin
  */
-template <typename T>
-struct cuda_useBuiltinCAS {
+template<typename T>
+struct cuda_useBuiltinCAS
+{
   static constexpr bool value =
 #if __CUDA_ARCH__ >= 700
-    std::is_same<T, unsigned short int>::value ||
+      std::is_same<T, unsigned short int>::value ||
 #endif
-    std::is_same<T, int>::value ||
-    std::is_same<T, unsigned int>::value ||
-    std::is_same<T, unsigned long long>::value;
+      std::is_same<T, int>::value || std::is_same<T, unsigned int>::value ||
+      std::is_same<T, unsigned long long>::value;
 };
 
 /*!
@@ -253,55 +247,53 @@ struct cuda_useBuiltinCAS {
  * implemented by reinterpreting inputs to types that the builtin compare
  * and swap supports
  */
-template <typename T>
-struct cuda_useReinterpretCAS {
-  static constexpr bool value =
-    !cuda_useBuiltinCAS<T>::value &&
-    (
+template<typename T>
+struct cuda_useReinterpretCAS
+{
+  static constexpr bool value = !cuda_useBuiltinCAS<T>::value &&
+                                (
 #if __CUDA_ARCH__ >= 700
-     sizeof(T) == sizeof(unsigned short) ||
+                                    sizeof(T) == sizeof(unsigned short) ||
 #endif
-     sizeof(T) == sizeof(unsigned int) ||
-     sizeof(T) == sizeof(unsigned long long)
-    );
+                                    sizeof(T) == sizeof(unsigned int) ||
+                                    sizeof(T) == sizeof(unsigned long long));
 
   using type =
 #if __CUDA_ARCH__ >= 700
-    std::conditional_t<sizeof(T) == sizeof(unsigned short),
-                       unsigned short,
+      std::conditional_t<sizeof(T) == sizeof(unsigned short),
+                         unsigned short,
 #endif
-    std::conditional_t<sizeof(T) == sizeof(unsigned int),
-                       unsigned int,
-                       unsigned long long>
+                         std::conditional_t<sizeof(T) == sizeof(unsigned int),
+                                            unsigned int,
+                                            unsigned long long>
 #if __CUDA_ARCH__ >= 700
-                      >
+                         >
 #endif
-    ;
+      ;
 };
 
 /*!
  * Alias for determining the integral type of the same size as the given type
  */
-template <typename T>
+template<typename T>
 using cuda_useReinterpretCAS_t = typename cuda_useReinterpretCAS<T>::type;
 
-template <typename T,
-          std::enable_if_t<cuda_useBuiltinCAS<T>::value, bool> = true>
-RAJA_INLINE __device__ T cuda_atomicCAS(T *acc, T compare, T value)
+template<typename T,
+         std::enable_if_t<cuda_useBuiltinCAS<T>::value, bool> = true>
+RAJA_INLINE __device__ T cuda_atomicCAS(T* acc, T compare, T value)
 {
   return ::atomicCAS(acc, compare, value);
 }
 
-template <typename T,
-          std::enable_if_t<cuda_useReinterpretCAS<T>::value, bool> = true>
-RAJA_INLINE __device__ T cuda_atomicCAS(T *acc, T compare, T value)
+template<typename T,
+         std::enable_if_t<cuda_useReinterpretCAS<T>::value, bool> = true>
+RAJA_INLINE __device__ T cuda_atomicCAS(T* acc, T compare, T value)
 {
   using R = cuda_useReinterpretCAS_t<T>;
 
-  return RAJA::util::reinterp_A_as_B<R, T>(
-    cuda_atomicCAS(reinterpret_cast<R*>(acc),
-                   RAJA::util::reinterp_A_as_B<T, R>(compare),
-                   RAJA::util::reinterp_A_as_B<T, R>(value)));
+  return RAJA::util::reinterp_A_as_B<R, T>(cuda_atomicCAS(
+      reinterpret_cast<R*>(acc), RAJA::util::reinterp_A_as_B<T, R>(compare),
+      RAJA::util::reinterp_A_as_B<T, R>(value)));
 }
 
 /*!
@@ -309,15 +301,15 @@ RAJA_INLINE __device__ T cuda_atomicCAS(T *acc, T compare, T value)
  * integral type to avoid cases where the values will never compare equal
  * (most notably, NaNs).
  */
-template <typename T,
-          std::enable_if_t<cuda_useBuiltinCommon<T>::value, bool> = true>
+template<typename T,
+         std::enable_if_t<cuda_useBuiltinCommon<T>::value, bool> = true>
 RAJA_INLINE __device__ bool cuda_atomicCAS_equal(const T& a, const T& b)
 {
   return a == b;
 }
 
-template <typename T,
-          std::enable_if_t<cuda_useReinterpretCommon<T>::value, bool> = true>
+template<typename T,
+         std::enable_if_t<cuda_useReinterpretCommon<T>::value, bool> = true>
 RAJA_INLINE __device__ bool cuda_atomicCAS_equal(const T& a, const T& b)
 {
   using R = cuda_useReinterpretCommon_t<T>;
@@ -326,304 +318,306 @@ RAJA_INLINE __device__ bool cuda_atomicCAS_equal(const T& a, const T& b)
                               RAJA::util::reinterp_A_as_B<T, R>(b));
 }
 
-
 /*!
  * Generic impementation of any atomic 32-bit or 64-bit operator.
  * Implementation uses the existing CUDA supplied unsigned 32-bit or 64-bit CAS
  * operator. Returns the OLD value that was replaced by the result of this
  * operation.
  */
-template <typename T, typename Oper>
-RAJA_INLINE __device__ T cuda_atomicCAS_loop(T *acc,
-                                             Oper&& oper)
+template<typename T, typename Oper>
+RAJA_INLINE __device__ T cuda_atomicCAS_loop(T* acc, Oper&& oper)
 {
   T old = cuda_atomicLoad(acc);
   T expected;
 
-  do {
+  do
+  {
     expected = old;
-    old = cuda_atomicCAS(acc, expected, oper(expected));
+    old      = cuda_atomicCAS(acc, expected, oper(expected));
   } while (!cuda_atomicCAS_equal(old, expected));
 
   return old;
 }
 
 /*!
- * Generic impementation of any atomic 32-bit or 64-bit operator with short-circuiting.
- * Implementation uses the existing CUDA supplied unsigned 32-bit or 64-bit CAS
- * operator. Returns the OLD value that was replaced by the result of this
- * operation.
+ * Generic implementation of any atomic 32-bit or 64-bit operator with
+ * short-circuiting. Implementation uses the existing CUDA supplied unsigned
+ * 32-bit or 64-bit CAS operator. Returns the OLD value that was replaced by the
+ * result of this operation.
  */
-template <typename T, typename Oper, typename ShortCircuit>
-RAJA_INLINE __device__ T cuda_atomicCAS_loop(T *acc,
+template<typename T, typename Oper, typename ShortCircuit>
+RAJA_INLINE __device__ T cuda_atomicCAS_loop(T* acc,
                                              Oper&& oper,
                                              ShortCircuit&& sc)
 {
   T old = cuda_atomicLoad(acc);
 
-  if (sc(old)) {
+  if (sc(old))
+  {
     return old;
   }
 
   T expected;
 
-  do {
+  do
+  {
     expected = old;
-    old = cuda_atomicCAS(acc, expected, oper(expected));
+    old      = cuda_atomicCAS(acc, expected, oper(expected));
   } while (!cuda_atomicCAS_equal(old, expected) && !sc(old));
 
   return old;
 }
 
-
 /*!
  * Atomic addition
  */
-using cuda_atomicAdd_builtin_types = ::camp::list<
-  int,
-  unsigned int,
-  unsigned long long int,
-  float
+using cuda_atomicAdd_builtin_types = ::camp::list<int,
+                                                  unsigned int,
+                                                  unsigned long long int,
+                                                  float
 #if __CUDA_ARCH__ >= 600
-  ,
-  double
+                                                  ,
+                                                  double
 #endif
->;
+                                                  >;
 
-template <typename T,
-          RAJA::util::enable_if_is_none_of<T, cuda_atomicAdd_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T cuda_atomicAdd(T *acc, T value)
+template<typename T,
+         RAJA::util::enable_if_is_none_of<T, cuda_atomicAdd_builtin_types>* =
+             nullptr>
+RAJA_INLINE __device__ T cuda_atomicAdd(T* acc, T value)
 {
-  return cuda_atomicCAS_loop(acc, [value] (T old) {
+  return cuda_atomicCAS_loop(acc, [value](T old) {
     return old + value;
   });
 }
 
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, cuda_atomicAdd_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T cuda_atomicAdd(T *acc, T value)
+template<
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, cuda_atomicAdd_builtin_types>* = nullptr>
+RAJA_INLINE __device__ T cuda_atomicAdd(T* acc, T value)
 {
   return ::atomicAdd(acc, value);
 }
 
-
 /*!
  * Atomic subtract
  */
 using cuda_atomicSub_builtin_types = cuda_atomicAdd_builtin_types;
 
-using cuda_atomicSub_via_Sub_builtin_types = ::camp::list<
-  int,
-  unsigned int
->;
+using cuda_atomicSub_via_Sub_builtin_types = ::camp::list<int, unsigned int>;
 
-using cuda_atomicSub_via_Add_builtin_types = ::camp::list<
-  unsigned long long int,
-  float
+using cuda_atomicSub_via_Add_builtin_types =
+    ::camp::list<unsigned long long int,
+                 float
 #if __CUDA_ARCH__ >= 600
-  ,
-  double
+                 ,
+                 double
 #endif
->;
+                 >;
 
-template <typename T,
-          RAJA::util::enable_if_is_none_of<T, cuda_atomicSub_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T cuda_atomicSub(T *acc, T value)
+template<typename T,
+         RAJA::util::enable_if_is_none_of<T, cuda_atomicSub_builtin_types>* =
+             nullptr>
+RAJA_INLINE __device__ T cuda_atomicSub(T* acc, T value)
 {
-  return cuda_atomicCAS_loop(acc, [value] (T old) {
+  return cuda_atomicCAS_loop(acc, [value](T old) {
     return old - value;
   });
 }
 
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, cuda_atomicSub_via_Sub_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T cuda_atomicSub(T *acc, T value)
+template<
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, cuda_atomicSub_via_Sub_builtin_types>* =
+        nullptr>
+RAJA_INLINE __device__ T cuda_atomicSub(T* acc, T value)
 {
   return ::atomicSub(acc, value);
 }
 
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, cuda_atomicSub_via_Add_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T cuda_atomicSub(T *acc, T value)
+template<
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, cuda_atomicSub_via_Add_builtin_types>* =
+        nullptr>
+RAJA_INLINE __device__ T cuda_atomicSub(T* acc, T value)
 {
   return ::atomicAdd(acc, -value);
 }
 
-
 /*!
  * Atomic min/max
  */
-using cuda_atomicMinMax_builtin_types = ::camp::list<
-  int,
-  unsigned int
+using cuda_atomicMinMax_builtin_types = ::camp::list<int,
+                                                     unsigned int
 #if __CUDA_ARCH__ >= 500
-  ,
-  long long int,
-  unsigned long long int
+                                                     ,
+                                                     long long int,
+                                                     unsigned long long int
 #endif
->;
+                                                     >;
 
 
 /*!
  * Atomic min
  */
-template <typename T,
-          RAJA::util::enable_if_is_none_of<T, cuda_atomicMinMax_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T cuda_atomicMin(T *acc, T value)
+template<typename T,
+         RAJA::util::enable_if_is_none_of<T, cuda_atomicMinMax_builtin_types>* =
+             nullptr>
+RAJA_INLINE __device__ T cuda_atomicMin(T* acc, T value)
 {
   return cuda_atomicCAS_loop(
-    acc,
-    [value] (T old) {
-      return value < old ? value : old;
-    },
-    [value] (T current) {
-      return current <= value;
-    });
+      acc,
+      [value](T old) {
+        return value < old ? value : old;
+      },
+      [value](T current) {
+        return current <= value;
+      });
 }
 
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, cuda_atomicMinMax_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T cuda_atomicMin(T *acc, T value)
+template<typename T,
+         RAJA::util::enable_if_is_any_of<T, cuda_atomicMinMax_builtin_types>* =
+             nullptr>
+RAJA_INLINE __device__ T cuda_atomicMin(T* acc, T value)
 {
   return ::atomicMin(acc, value);
 }
 
-
 /*!
  * Atomic max
  */
-template <typename T,
-          RAJA::util::enable_if_is_none_of<T, cuda_atomicMinMax_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T cuda_atomicMax(T *acc, T value)
+template<typename T,
+         RAJA::util::enable_if_is_none_of<T, cuda_atomicMinMax_builtin_types>* =
+             nullptr>
+RAJA_INLINE __device__ T cuda_atomicMax(T* acc, T value)
 {
   return cuda_atomicCAS_loop(
-    acc,
-    [value] (T old) {
-      return old < value ? value : old;
-    },
-    [value] (T current) {
-      return value <= current;
-    });
+      acc,
+      [value](T old) {
+        return old < value ? value : old;
+      },
+      [value](T current) {
+        return value <= current;
+      });
 }
 
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, cuda_atomicMinMax_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T cuda_atomicMax(T *acc, T value)
+template<typename T,
+         RAJA::util::enable_if_is_any_of<T, cuda_atomicMinMax_builtin_types>* =
+             nullptr>
+RAJA_INLINE __device__ T cuda_atomicMax(T* acc, T value)
 {
   return ::atomicMax(acc, value);
 }
 
-
 /*!
  * Atomic increment/decrement with reset
  */
-using cuda_atomicIncDecReset_builtin_types = ::camp::list<
-  unsigned int
->;
-
+using cuda_atomicIncDecReset_builtin_types = ::camp::list<unsigned int>;
 
 /*!
  * Atomic increment with reset
  */
-template <typename T,
-          RAJA::util::enable_if_is_none_of<T, cuda_atomicIncDecReset_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T cuda_atomicInc(T *acc, T value)
+template<
+    typename T,
+    RAJA::util::enable_if_is_none_of<T, cuda_atomicIncDecReset_builtin_types>* =
+        nullptr>
+RAJA_INLINE __device__ T cuda_atomicInc(T* acc, T value)
 {
   // See:
   // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicinc
-  return cuda_atomicCAS_loop(acc, [value] (T old) {
+  return cuda_atomicCAS_loop(acc, [value](T old) {
     return value <= old ? static_cast<T>(0) : old + static_cast<T>(1);
   });
 }
 
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, cuda_atomicIncDecReset_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T cuda_atomicInc(T *acc, T value)
+template<
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, cuda_atomicIncDecReset_builtin_types>* =
+        nullptr>
+RAJA_INLINE __device__ T cuda_atomicInc(T* acc, T value)
 {
   return ::atomicInc(acc, value);
 }
 
-
 /*!
  * Atomic increment (implemented in terms of atomic addition)
  */
-template <typename T>
-RAJA_INLINE __device__ T cuda_atomicInc(T *acc)
+template<typename T>
+RAJA_INLINE __device__ T cuda_atomicInc(T* acc)
 {
   return cuda_atomicAdd(acc, static_cast<T>(1));
 }
 
-
 /*!
  * Atomic decrement with reset
  */
-template <typename T,
-          RAJA::util::enable_if_is_none_of<T, cuda_atomicIncDecReset_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T cuda_atomicDec(T *acc, T value)
+template<
+    typename T,
+    RAJA::util::enable_if_is_none_of<T, cuda_atomicIncDecReset_builtin_types>* =
+        nullptr>
+RAJA_INLINE __device__ T cuda_atomicDec(T* acc, T value)
 {
   // See:
   // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicdec
-  return cuda_atomicCAS_loop(acc, [value] (T old) {
-    return old == static_cast<T>(0) || value < old ? value : old - static_cast<T>(1);
+  return cuda_atomicCAS_loop(acc, [value](T old) {
+    return old == static_cast<T>(0) || value < old ? value
+                                                   : old - static_cast<T>(1);
   });
 }
 
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, cuda_atomicIncDecReset_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T cuda_atomicDec(T *acc, T value)
+template<
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, cuda_atomicIncDecReset_builtin_types>* =
+        nullptr>
+RAJA_INLINE __device__ T cuda_atomicDec(T* acc, T value)
 {
   return ::atomicDec(acc, value);
 }
 
-
 /*!
  * Atomic decrement (implemented in terms of atomic subtraction)
  */
-template <typename T>
-RAJA_INLINE __device__ T cuda_atomicDec(T *acc)
+template<typename T>
+RAJA_INLINE __device__ T cuda_atomicDec(T* acc)
 {
   return cuda_atomicSub(acc, static_cast<T>(1));
 }
 
-
 /*!
  * Atomic bitwise functions (and, or, xor)
  */
-using cuda_atomicBit_builtin_types = ::camp::list<
-  int,
-  unsigned int,
-  unsigned long long int
->;
-
+using cuda_atomicBit_builtin_types =
+    ::camp::list<int, unsigned int, unsigned long long int>;
 
 /*!
  * Atomic and
  */
-template <typename T,
-          RAJA::util::enable_if_is_none_of<T, cuda_atomicBit_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T cuda_atomicAnd(T *acc, T value)
+template<typename T,
+         RAJA::util::enable_if_is_none_of<T, cuda_atomicBit_builtin_types>* =
+             nullptr>
+RAJA_INLINE __device__ T cuda_atomicAnd(T* acc, T value)
 {
-  return cuda_atomicCAS_loop(acc, [value] (T old) {
+  return cuda_atomicCAS_loop(acc, [value](T old) {
     return old & value;
   });
 }
 
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, cuda_atomicBit_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T cuda_atomicAnd(T *acc, T value)
+template<
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, cuda_atomicBit_builtin_types>* = nullptr>
+RAJA_INLINE __device__ T cuda_atomicAnd(T* acc, T value)
 {
   return ::atomicAnd(acc, value);
 }
 
-
 /*!
  * Atomic or
  */
-template <typename T,
-          RAJA::util::enable_if_is_none_of<T, cuda_atomicBit_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T cuda_atomicOr(T *acc, T value)
+template<typename T,
+         RAJA::util::enable_if_is_none_of<T, cuda_atomicBit_builtin_types>* =
+             nullptr>
+RAJA_INLINE __device__ T cuda_atomicOr(T* acc, T value)
 {
-  return cuda_atomicCAS_loop(acc, [value] (T old) {
+  return cuda_atomicCAS_loop(acc, [value](T old) {
     return old | value;
   });
 }
@@ -637,18 +631,20 @@ RAJA_INLINE __device__ T cuda_atomicOr(T *acc, T value)
 /*!
  * Atomic xor
  */
-template <typename T,
-          RAJA::util::enable_if_is_none_of<T, cuda_atomicBit_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T cuda_atomicXor(T *acc, T value)
+template<typename T,
+         RAJA::util::enable_if_is_none_of<T, cuda_atomicBit_builtin_types>* =
+             nullptr>
+RAJA_INLINE __device__ T cuda_atomicXor(T* acc, T value)
 {
-  return cuda_atomicCAS_loop(acc, [value] (T old) {
+  return cuda_atomicCAS_loop(acc, [value](T old) {
     return old ^ value;
   });
 }
 
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, cuda_atomicBit_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T cuda_atomicXor(T *acc, T value)
+template<
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, cuda_atomicBit_builtin_types>* = nullptr>
+RAJA_INLINE __device__ T cuda_atomicXor(T* acc, T value)
 {
   return ::atomicXor(acc, value);
 }
@@ -656,7 +652,6 @@ RAJA_INLINE __device__ T cuda_atomicXor(T *acc, T value)
 
 }  // namespace detail
 
-
 /*!
  * Catch-all policy passes off to CUDA's builtin atomics.
  *
@@ -666,186 +661,197 @@ RAJA_INLINE __device__ T cuda_atomicXor(T *acc, T value)
  * These are atomic in cuda device code and non-atomic otherwise
  */
 RAJA_SUPPRESS_HD_WARN
-template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicLoad(cuda_atomic_explicit<host_policy>, T *acc)
+template<typename T, typename host_policy>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicLoad(cuda_atomic_explicit<host_policy>,
+                                          T* acc)
 {
 #ifdef __CUDA_ARCH__
   return detail::cuda_atomicLoad(acc);
 #else
-  return RAJA::atomicLoad(host_policy{}, acc);
+  return RAJA::atomicLoad(host_policy {}, acc);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE void
-atomicStore(cuda_atomic_explicit<host_policy>, T *acc, T value)
+template<typename T, typename host_policy>
+RAJA_INLINE RAJA_HOST_DEVICE void atomicStore(cuda_atomic_explicit<host_policy>,
+                                              T* acc,
+                                              T value)
 {
 #ifdef __CUDA_ARCH__
   detail::cuda_atomicStore(acc, value);
 #else
-  RAJA::atomicStore(host_policy{}, acc, value);
+  RAJA::atomicStore(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicAdd(cuda_atomic_explicit<host_policy>, T *acc, T value)
+template<typename T, typename host_policy>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(cuda_atomic_explicit<host_policy>,
+                                         T* acc,
+                                         T value)
 {
 #ifdef __CUDA_ARCH__
   return detail::cuda_atomicAdd(acc, value);
 #else
-  return RAJA::atomicAdd(host_policy{}, acc, value);
+  return RAJA::atomicAdd(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicSub(cuda_atomic_explicit<host_policy>, T *acc, T value)
+template<typename T, typename host_policy>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicSub(cuda_atomic_explicit<host_policy>,
+                                         T* acc,
+                                         T value)
 {
 #ifdef __CUDA_ARCH__
   return detail::cuda_atomicSub(acc, value);
 #else
-  return RAJA::atomicSub(host_policy{}, acc, value);
+  return RAJA::atomicSub(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicMin(cuda_atomic_explicit<host_policy>, T *acc, T value)
+template<typename T, typename host_policy>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicMin(cuda_atomic_explicit<host_policy>,
+                                         T* acc,
+                                         T value)
 {
 #ifdef __CUDA_ARCH__
   return detail::cuda_atomicMin(acc, value);
 #else
-  return RAJA::atomicMin(host_policy{}, acc, value);
+  return RAJA::atomicMin(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicMax(cuda_atomic_explicit<host_policy>, T *acc, T value)
+template<typename T, typename host_policy>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicMax(cuda_atomic_explicit<host_policy>,
+                                         T* acc,
+                                         T value)
 {
 #ifdef __CUDA_ARCH__
   return detail::cuda_atomicMax(acc, value);
 #else
-  return RAJA::atomicMax(host_policy{}, acc, value);
+  return RAJA::atomicMax(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicInc(cuda_atomic_explicit<host_policy>, T *acc, T value)
+template<typename T, typename host_policy>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(cuda_atomic_explicit<host_policy>,
+                                         T* acc,
+                                         T value)
 {
 #ifdef __CUDA_ARCH__
   // See:
   // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicinc
   return detail::cuda_atomicInc(acc, value);
 #else
-  return RAJA::atomicInc(host_policy{}, acc, value);
+  return RAJA::atomicInc(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicInc(cuda_atomic_explicit<host_policy>, T *acc)
+template<typename T, typename host_policy>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(cuda_atomic_explicit<host_policy>,
+                                         T* acc)
 {
 #ifdef __CUDA_ARCH__
   return detail::cuda_atomicInc(acc);
 #else
-  return RAJA::atomicInc(host_policy{}, acc);
+  return RAJA::atomicInc(host_policy {}, acc);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicDec(cuda_atomic_explicit<host_policy>, T *acc, T value)
+template<typename T, typename host_policy>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(cuda_atomic_explicit<host_policy>,
+                                         T* acc,
+                                         T value)
 {
 #ifdef __CUDA_ARCH__
   // See:
   // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicdec
   return detail::cuda_atomicDec(acc, value);
 #else
-  return RAJA::atomicDec(host_policy{}, acc, value);
+  return RAJA::atomicDec(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicDec(cuda_atomic_explicit<host_policy>, T *acc)
+template<typename T, typename host_policy>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(cuda_atomic_explicit<host_policy>,
+                                         T* acc)
 {
 #ifdef __CUDA_ARCH__
   return detail::cuda_atomicDec(acc);
 #else
-  return RAJA::atomicDec(host_policy{}, acc);
+  return RAJA::atomicDec(host_policy {}, acc);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicAnd(cuda_atomic_explicit<host_policy>, T *acc, T value)
+template<typename T, typename host_policy>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicAnd(cuda_atomic_explicit<host_policy>,
+                                         T* acc,
+                                         T value)
 {
 #ifdef __CUDA_ARCH__
   return detail::cuda_atomicAnd(acc, value);
 #else
-  return RAJA::atomicAnd(host_policy{}, acc, value);
+  return RAJA::atomicAnd(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicOr(cuda_atomic_explicit<host_policy>, T *acc, T value)
+template<typename T, typename host_policy>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicOr(cuda_atomic_explicit<host_policy>,
+                                        T* acc,
+                                        T value)
 {
 #ifdef __CUDA_ARCH__
   return detail::cuda_atomicOr(acc, value);
 #else
-  return RAJA::atomicOr(host_policy{}, acc, value);
+  return RAJA::atomicOr(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicXor(cuda_atomic_explicit<host_policy>, T *acc, T value)
+template<typename T, typename host_policy>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicXor(cuda_atomic_explicit<host_policy>,
+                                         T* acc,
+                                         T value)
 {
 #ifdef __CUDA_ARCH__
   return detail::cuda_atomicXor(acc, value);
 #else
-  return RAJA::atomicXor(host_policy{}, acc, value);
+  return RAJA::atomicXor(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicExchange(cuda_atomic_explicit<host_policy>, T *acc, T value)
+template<typename T, typename host_policy>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicExchange(cuda_atomic_explicit<host_policy>,
+                                              T* acc,
+                                              T value)
 {
 #ifdef __CUDA_ARCH__
   return detail::cuda_atomicExchange(acc, value);
 #else
-  return RAJA::atomicExchange(host_policy{}, acc, value);
+  return RAJA::atomicExchange(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T, typename host_policy>
+template<typename T, typename host_policy>
 RAJA_INLINE RAJA_HOST_DEVICE T
-atomicCAS(cuda_atomic_explicit<host_policy>, T *acc, T compare, T value)
+atomicCAS(cuda_atomic_explicit<host_policy>, T* acc, T compare, T value)
 {
 #ifdef __CUDA_ARCH__
   return detail::cuda_atomicCAS(acc, compare, value);
 #else
-  return RAJA::atomicCAS(host_policy{}, acc, compare, value);
+  return RAJA::atomicCAS(host_policy {}, acc, compare, value);
 #endif
 }
 
diff --git a/include/RAJA/policy/cuda/forall.hpp b/include/RAJA/policy/cuda/forall.hpp
index 493136400c..d305e0c1e4 100644
--- a/include/RAJA/policy/cuda/forall.hpp
+++ b/include/RAJA/policy/cuda/forall.hpp
@@ -70,61 +70,91 @@ namespace impl
  *
  ******************************************************************************
  */
-template<typename IterationMapping, typename IterationGetter, typename Concretizer, typename UniqueMarker>
+template<typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         typename UniqueMarker>
 struct ForallDimensionCalculator;
 
 // The general cases handle fixed BLOCK_SIZE > 0 and/or GRID_SIZE > 0
 // there are specializations for named_usage::unspecified
 // but named_usage::ignored is not supported so no specializations are provided
 // and static_asserts in the general case catch unsupported values
-template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
-                                 ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>,
-                                 Concretizer,
-                                 UniqueMarker>
+template<named_dim dim,
+         int BLOCK_SIZE,
+         int GRID_SIZE,
+         typename Concretizer,
+         typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::Direct,
+    ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>,
+    Concretizer,
+    UniqueMarker>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall");
-  static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall");
+  static_assert(
+      BLOCK_SIZE > 0,
+      "block size must be > 0 or named_usage::unspecified with forall");
+  static_assert(
+      GRID_SIZE > 0,
+      "grid size must be > 0 or named_usage::unspecified with forall");
 
   using IndexGetter = ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>;
 
-  template < typename IdxT >
-  static void set_dimensions(internal::CudaDims& dims, IdxT len,
-                             const void* RAJA_UNUSED_ARG(func), size_t RAJA_UNUSED_ARG(dynamic_shmem_size))
+  template<typename IdxT>
+  static void set_dimensions(internal::CudaDims& dims,
+                             IdxT len,
+                             const void* RAJA_UNUSED_ARG(func),
+                             size_t RAJA_UNUSED_ARG(dynamic_shmem_size))
   {
     const IdxT block_size = static_cast<IdxT>(IndexGetter::block_size);
-    const IdxT grid_size = static_cast<IdxT>(IndexGetter::grid_size);
+    const IdxT grid_size  = static_cast<IdxT>(IndexGetter::grid_size);
 
-    if ( len > (block_size * grid_size) ) {
-      RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space");
+    if (len > (block_size * grid_size))
+    {
+      RAJA_ABORT_OR_THROW(
+          "len exceeds the size of the directly mapped index space");
     }
 
-    internal::set_cuda_dim<dim>(dims.threads, static_cast<IdxT>(IndexGetter::block_size));
-    internal::set_cuda_dim<dim>(dims.blocks, static_cast<IdxT>(IndexGetter::grid_size));
+    internal::set_cuda_dim<dim>(dims.threads,
+                                static_cast<IdxT>(IndexGetter::block_size));
+    internal::set_cuda_dim<dim>(dims.blocks,
+                                static_cast<IdxT>(IndexGetter::grid_size));
   }
 };
 
-template<named_dim dim, int GRID_SIZE, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
-                                 ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>,
-                                 Concretizer,
-                                 UniqueMarker>
+template<named_dim dim,
+         int GRID_SIZE,
+         typename Concretizer,
+         typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::Direct,
+    ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>,
+    Concretizer,
+    UniqueMarker>
 {
-  static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall");
-
-  using IndexGetter = ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
-
-  template < typename IdxT >
-  static void set_dimensions(internal::CudaDims& dims, IdxT len,
-                             const void* func, size_t dynamic_shmem_size)
+  static_assert(
+      GRID_SIZE > 0,
+      "grid size must be > 0 or named_usage::unspecified with forall");
+
+  using IndexGetter =
+      ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
+
+  template<typename IdxT>
+  static void set_dimensions(internal::CudaDims& dims,
+                             IdxT len,
+                             const void* func,
+                             size_t dynamic_shmem_size)
   {
-    ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{func, dynamic_shmem_size, len};
+    ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer {
+        func, dynamic_shmem_size, len};
 
-    const IdxT grid_size = static_cast<IdxT>(IndexGetter::grid_size);
+    const IdxT grid_size  = static_cast<IdxT>(IndexGetter::grid_size);
     const IdxT block_size = concretizer.get_block_size_to_fit_len(grid_size);
 
-    if ( block_size == IdxT(0) ) {
-      RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space");
+    if (block_size == IdxT(0))
+    {
+      RAJA_ABORT_OR_THROW(
+          "len exceeds the size of the directly mapped index space");
     }
 
     internal::set_cuda_dim<dim>(dims.threads, block_size);
@@ -132,24 +162,34 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
   }
 };
 
-template<named_dim dim, int BLOCK_SIZE, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
-                                 ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>,
-                                 Concretizer,
-                                 UniqueMarker>
+template<named_dim dim,
+         int BLOCK_SIZE,
+         typename Concretizer,
+         typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::Direct,
+    ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>,
+    Concretizer,
+    UniqueMarker>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall");
-
-  using IndexGetter = ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
-
-  template < typename IdxT >
-  static void set_dimensions(internal::CudaDims& dims, IdxT len,
-                             const void* func, size_t dynamic_shmem_size)
+  static_assert(
+      BLOCK_SIZE > 0,
+      "block size must be > 0 or named_usage::unspecified with forall");
+
+  using IndexGetter =
+      ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
+
+  template<typename IdxT>
+  static void set_dimensions(internal::CudaDims& dims,
+                             IdxT len,
+                             const void* func,
+                             size_t dynamic_shmem_size)
   {
-    ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{func, dynamic_shmem_size, len};
+    ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer {
+        func, dynamic_shmem_size, len};
 
     const IdxT block_size = static_cast<IdxT>(IndexGetter::block_size);
-    const IdxT grid_size = concretizer.get_grid_size_to_fit_len(block_size);
+    const IdxT grid_size  = concretizer.get_grid_size_to_fit_len(block_size);
 
     internal::set_cuda_dim<dim>(dims.threads, block_size);
     internal::set_cuda_dim<dim>(dims.blocks, grid_size);
@@ -157,18 +197,24 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
 };
 
 template<named_dim dim, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
-                                 ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>,
-                                 Concretizer,
-                                 UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::Direct,
+    ::RAJA::cuda::
+        IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>,
+    Concretizer,
+    UniqueMarker>
 {
-  using IndexGetter = ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
-
-  template < typename IdxT >
-  static void set_dimensions(internal::CudaDims& dims, IdxT len,
-                             const void* func, size_t dynamic_shmem_size)
+  using IndexGetter = ::RAJA::cuda::
+      IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
+
+  template<typename IdxT>
+  static void set_dimensions(internal::CudaDims& dims,
+                             IdxT len,
+                             const void* func,
+                             size_t dynamic_shmem_size)
   {
-    ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{func, dynamic_shmem_size, len};
+    ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer {
+        func, dynamic_shmem_size, len};
 
     const auto sizes = concretizer.get_block_and_grid_size_to_fit_len();
 
@@ -177,46 +223,67 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
   }
 };
 
-template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                 ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>,
-                                 Concretizer,
-                                 UniqueMarker>
+template<named_dim dim,
+         int BLOCK_SIZE,
+         int GRID_SIZE,
+         typename Concretizer,
+         typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+    ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>,
+    Concretizer,
+    UniqueMarker>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall");
-  static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall");
+  static_assert(
+      BLOCK_SIZE > 0,
+      "block size must be > 0 or named_usage::unspecified with forall");
+  static_assert(
+      GRID_SIZE > 0,
+      "grid size must be > 0 or named_usage::unspecified with forall");
 
   using IndexGetter = ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>;
 
-  template < typename IdxT >
-  static void set_dimensions(internal::CudaDims& dims, IdxT RAJA_UNUSED_ARG(len),
-                             const void* RAJA_UNUSED_ARG(func), size_t RAJA_UNUSED_ARG(dynamic_shmem_size))
+  template<typename IdxT>
+  static void set_dimensions(internal::CudaDims& dims,
+                             IdxT RAJA_UNUSED_ARG(len),
+                             const void* RAJA_UNUSED_ARG(func),
+                             size_t RAJA_UNUSED_ARG(dynamic_shmem_size))
   {
     const IdxT block_size = static_cast<IdxT>(IndexGetter::block_size);
-    const IdxT grid_size = static_cast<IdxT>(IndexGetter::grid_size);
+    const IdxT grid_size  = static_cast<IdxT>(IndexGetter::grid_size);
 
     internal::set_cuda_dim<dim>(dims.threads, block_size);
     internal::set_cuda_dim<dim>(dims.blocks, grid_size);
   }
 };
 
-template<named_dim dim, int GRID_SIZE, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                 ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>,
-                                 Concretizer,
-                                 UniqueMarker>
+template<named_dim dim,
+         int GRID_SIZE,
+         typename Concretizer,
+         typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+    ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>,
+    Concretizer,
+    UniqueMarker>
 {
-  static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall");
-
-  using IndexGetter = ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
-
-  template < typename IdxT >
-  static void set_dimensions(internal::CudaDims& dims, IdxT len,
-                             const void* func, size_t dynamic_shmem_size)
+  static_assert(
+      GRID_SIZE > 0,
+      "grid size must be > 0 or named_usage::unspecified with forall");
+
+  using IndexGetter =
+      ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
+
+  template<typename IdxT>
+  static void set_dimensions(internal::CudaDims& dims,
+                             IdxT len,
+                             const void* func,
+                             size_t dynamic_shmem_size)
   {
-    ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{func, dynamic_shmem_size, len};
+    ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer {
+        func, dynamic_shmem_size, len};
 
-    const IdxT grid_size = static_cast<IdxT>(IndexGetter::grid_size);
+    const IdxT grid_size  = static_cast<IdxT>(IndexGetter::grid_size);
     const IdxT block_size = concretizer.get_block_size_to_fit_device(grid_size);
 
     internal::set_cuda_dim<dim>(dims.threads, block_size);
@@ -224,24 +291,34 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_us
   }
 };
 
-template<named_dim dim, int BLOCK_SIZE, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                 ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>,
-                                 Concretizer,
-                                 UniqueMarker>
+template<named_dim dim,
+         int BLOCK_SIZE,
+         typename Concretizer,
+         typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+    ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>,
+    Concretizer,
+    UniqueMarker>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall");
-
-  using IndexGetter = ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
-
-  template < typename IdxT >
-  static void set_dimensions(internal::CudaDims& dims, IdxT len,
-                             const void* func, size_t dynamic_shmem_size)
+  static_assert(
+      BLOCK_SIZE > 0,
+      "block size must be > 0 or named_usage::unspecified with forall");
+
+  using IndexGetter =
+      ::RAJA::cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
+
+  template<typename IdxT>
+  static void set_dimensions(internal::CudaDims& dims,
+                             IdxT len,
+                             const void* func,
+                             size_t dynamic_shmem_size)
   {
-    ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{func, dynamic_shmem_size, len};
+    ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer {
+        func, dynamic_shmem_size, len};
 
     const IdxT block_size = static_cast<IdxT>(IndexGetter::block_size);
-    const IdxT grid_size = concretizer.get_grid_size_to_fit_device(block_size);
+    const IdxT grid_size  = concretizer.get_grid_size_to_fit_device(block_size);
 
     internal::set_cuda_dim<dim>(dims.threads, block_size);
     internal::set_cuda_dim<dim>(dims.blocks, grid_size);
@@ -249,18 +326,24 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_us
 };
 
 template<named_dim dim, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                 ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>,
-                                 Concretizer,
-                                 UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+    ::RAJA::cuda::
+        IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>,
+    Concretizer,
+    UniqueMarker>
 {
-  using IndexGetter = ::RAJA::cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
-
-  template < typename IdxT >
-  static void set_dimensions(internal::CudaDims& dims, IdxT len,
-                             const void* func, size_t dynamic_shmem_size)
+  using IndexGetter = ::RAJA::cuda::
+      IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
+
+  template<typename IdxT>
+  static void set_dimensions(internal::CudaDims& dims,
+                             IdxT len,
+                             const void* func,
+                             size_t dynamic_shmem_size)
   {
-    ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{func, dynamic_shmem_size, len};
+    ::RAJA::cuda::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer {
+        func, dynamic_shmem_size, len};
 
     const auto sizes = concretizer.get_block_and_grid_size_to_fit_device();
 
@@ -284,223 +367,235 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_us
  *
  ******************************************************************************
  */
-template <typename EXEC_POL,
-          size_t BlocksPerSM,
-          typename Iterator,
-          typename LOOP_BODY,
-          typename IndexType,
-          typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::DirectBase, IterationMapping>::value &&
-                (IterationGetter::block_size > 0),
-              size_t > BlockSize = IterationGetter::block_size>
+template<typename EXEC_POL,
+         size_t BlocksPerSM,
+         typename Iterator,
+         typename LOOP_BODY,
+         typename IndexType,
+         typename IterationMapping = typename EXEC_POL::IterationMapping,
+         typename IterationGetter  = typename EXEC_POL::IterationGetter,
+         std::enable_if_t<std::is_base_of<iteration_mapping::DirectBase,
+                                          IterationMapping>::value &&
+                              (IterationGetter::block_size > 0),
+                          size_t> BlockSize = IterationGetter::block_size>
 __launch_bounds__(BlockSize, BlocksPerSM) __global__
-void forall_cuda_kernel(LOOP_BODY loop_body,
-                       const Iterator idx,
-                       IndexType length)
+    void forall_cuda_kernel(LOOP_BODY loop_body,
+                            const Iterator idx,
+                            IndexType length)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
-  auto& body = privatizer.get_priv();
-  auto ii = IterationGetter::template index<IndexType>();
-  if (ii < length) {
+  auto& body      = privatizer.get_priv();
+  auto ii         = IterationGetter::template index<IndexType>();
+  if (ii < length)
+  {
     body(idx[ii]);
   }
 }
+
 ///
-template <typename EXEC_POL,
-          size_t BlocksPerSM,
-          typename Iterator,
-          typename LOOP_BODY,
-          typename IndexType,
-          typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::DirectBase, IterationMapping>::value &&
-                (IterationGetter::block_size <= 0),
-              size_t > RAJA_UNUSED_ARG(BlockSize) = 0>
-__global__
-void forall_cuda_kernel(LOOP_BODY loop_body,
-                       const Iterator idx,
-                       IndexType length)
+template<typename EXEC_POL,
+         size_t BlocksPerSM,
+         typename Iterator,
+         typename LOOP_BODY,
+         typename IndexType,
+         typename IterationMapping = typename EXEC_POL::IterationMapping,
+         typename IterationGetter  = typename EXEC_POL::IterationGetter,
+         std::enable_if_t<std::is_base_of<iteration_mapping::DirectBase,
+                                          IterationMapping>::value &&
+                              (IterationGetter::block_size <= 0),
+                          size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
+__global__ void forall_cuda_kernel(LOOP_BODY loop_body,
+                                   const Iterator idx,
+                                   IndexType length)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
-  auto& body = privatizer.get_priv();
-  auto ii = IterationGetter::template index<IndexType>();
-  if (ii < length) {
+  auto& body      = privatizer.get_priv();
+  auto ii         = IterationGetter::template index<IndexType>();
+  if (ii < length)
+  {
     body(idx[ii]);
   }
 }
 
-template <typename EXEC_POL,
-          size_t BlocksPerSM,
-          typename Iterator,
-          typename LOOP_BODY,
-          typename IndexType,
-          typename ForallParam,
-          typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::DirectBase, IterationMapping>::value &&
-                (IterationGetter::block_size > 0),
-              size_t > BlockSize = IterationGetter::block_size>
+template<typename EXEC_POL,
+         size_t BlocksPerSM,
+         typename Iterator,
+         typename LOOP_BODY,
+         typename IndexType,
+         typename ForallParam,
+         typename IterationMapping = typename EXEC_POL::IterationMapping,
+         typename IterationGetter  = typename EXEC_POL::IterationGetter,
+         std::enable_if_t<std::is_base_of<iteration_mapping::DirectBase,
+                                          IterationMapping>::value &&
+                              (IterationGetter::block_size > 0),
+                          size_t> BlockSize = IterationGetter::block_size>
 __launch_bounds__(BlockSize, BlocksPerSM) __global__
-void forallp_cuda_kernel(LOOP_BODY loop_body,
-                        const Iterator idx,
-                        IndexType length,
-                        ForallParam f_params)
+    void forallp_cuda_kernel(LOOP_BODY loop_body,
+                             const Iterator idx,
+                             IndexType length,
+                             ForallParam f_params)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
-  auto& body = privatizer.get_priv();
-  auto ii = IterationGetter::template index<IndexType>();
-  if ( ii < length ) {
-    RAJA::expt::invoke_body( f_params, body, idx[ii] );
+  auto& body      = privatizer.get_priv();
+  auto ii         = IterationGetter::template index<IndexType>();
+  if (ii < length)
+  {
+    RAJA::expt::invoke_body(f_params, body, idx[ii]);
   }
   RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(f_params);
 }
+
 ///
-template <typename EXEC_POL,
-          size_t BlocksPerSM,
-          typename Iterator,
-          typename LOOP_BODY,
-          typename IndexType,
-          typename ForallParam,
-          typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::DirectBase, IterationMapping>::value &&
-                (IterationGetter::block_size <= 0),
-              size_t > RAJA_UNUSED_ARG(BlockSize) = 0>
-__global__
-void forallp_cuda_kernel(LOOP_BODY loop_body,
-                        const Iterator idx,
-                        IndexType length,
-                        ForallParam f_params)
+template<typename EXEC_POL,
+         size_t BlocksPerSM,
+         typename Iterator,
+         typename LOOP_BODY,
+         typename IndexType,
+         typename ForallParam,
+         typename IterationMapping = typename EXEC_POL::IterationMapping,
+         typename IterationGetter  = typename EXEC_POL::IterationGetter,
+         std::enable_if_t<std::is_base_of<iteration_mapping::DirectBase,
+                                          IterationMapping>::value &&
+                              (IterationGetter::block_size <= 0),
+                          size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
+__global__ void forallp_cuda_kernel(LOOP_BODY loop_body,
+                                    const Iterator idx,
+                                    IndexType length,
+                                    ForallParam f_params)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
-  auto& body = privatizer.get_priv();
-  auto ii = IterationGetter::template index<IndexType>();
-  if ( ii < length ) {
-    RAJA::expt::invoke_body( f_params, body, idx[ii] );
+  auto& body      = privatizer.get_priv();
+  auto ii         = IterationGetter::template index<IndexType>();
+  if (ii < length)
+  {
+    RAJA::expt::invoke_body(f_params, body, idx[ii]);
   }
   RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(f_params);
 }
 
-template <typename EXEC_POL,
-          size_t BlocksPerSM,
-          typename Iterator,
-          typename LOOP_BODY,
-          typename IndexType,
-          typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::StridedLoopBase, IterationMapping>::value &&
-                std::is_base_of<iteration_mapping::UnsizedLoopBase, IterationMapping>::value &&
-                (IterationGetter::block_size > 0),
-              size_t > BlockSize = IterationGetter::block_size>
+template<
+    typename EXEC_POL,
+    size_t BlocksPerSM,
+    typename Iterator,
+    typename LOOP_BODY,
+    typename IndexType,
+    typename IterationMapping          = typename EXEC_POL::IterationMapping,
+    typename IterationGetter           = typename EXEC_POL::IterationGetter,
+    std::enable_if_t<std::is_base_of<iteration_mapping::StridedLoopBase,
+                                     IterationMapping>::value &&
+                         std::is_base_of<iteration_mapping::UnsizedLoopBase,
+                                         IterationMapping>::value &&
+                         (IterationGetter::block_size > 0),
+                     size_t> BlockSize = IterationGetter::block_size>
 __launch_bounds__(BlockSize, BlocksPerSM) __global__
-void forall_cuda_kernel(LOOP_BODY loop_body,
-                       const Iterator idx,
-                       IndexType length)
+    void forall_cuda_kernel(LOOP_BODY loop_body,
+                            const Iterator idx,
+                            IndexType length)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
-  auto& body = privatizer.get_priv();
-  for (auto ii = IterationGetter::template index<IndexType>();
-       ii < length;
-       ii += IterationGetter::template size<IndexType>()) {
+  auto& body      = privatizer.get_priv();
+  for (auto ii = IterationGetter::template index<IndexType>(); ii < length;
+       ii += IterationGetter::template size<IndexType>())
+  {
     body(idx[ii]);
   }
 }
+
 ///
-template <typename EXEC_POL,
-          size_t BlocksPerSM,
-          typename Iterator,
-          typename LOOP_BODY,
-          typename IndexType,
-          typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::StridedLoopBase, IterationMapping>::value &&
-                std::is_base_of<iteration_mapping::UnsizedLoopBase, IterationMapping>::value &&
-                (IterationGetter::block_size <= 0),
-              size_t > RAJA_UNUSED_ARG(BlockSize) = 0>
-__global__
-void forall_cuda_kernel(LOOP_BODY loop_body,
-                       const Iterator idx,
-                       IndexType length)
+template<
+    typename EXEC_POL,
+    size_t BlocksPerSM,
+    typename Iterator,
+    typename LOOP_BODY,
+    typename IndexType,
+    typename IterationMapping = typename EXEC_POL::IterationMapping,
+    typename IterationGetter  = typename EXEC_POL::IterationGetter,
+    std::enable_if_t<std::is_base_of<iteration_mapping::StridedLoopBase,
+                                     IterationMapping>::value &&
+                         std::is_base_of<iteration_mapping::UnsizedLoopBase,
+                                         IterationMapping>::value &&
+                         (IterationGetter::block_size <= 0),
+                     size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
+__global__ void forall_cuda_kernel(LOOP_BODY loop_body,
+                                   const Iterator idx,
+                                   IndexType length)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
-  auto& body = privatizer.get_priv();
-  for (auto ii = IterationGetter::template index<IndexType>();
-       ii < length;
-       ii += IterationGetter::template size<IndexType>()) {
+  auto& body      = privatizer.get_priv();
+  for (auto ii = IterationGetter::template index<IndexType>(); ii < length;
+       ii += IterationGetter::template size<IndexType>())
+  {
     body(idx[ii]);
   }
 }
 
 ///
-template <typename EXEC_POL,
-          size_t BlocksPerSM,
-          typename Iterator,
-          typename LOOP_BODY,
-          typename IndexType,
-          typename ForallParam,
-          typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::StridedLoopBase, IterationMapping>::value &&
-                std::is_base_of<iteration_mapping::UnsizedLoopBase, IterationMapping>::value &&
-                (IterationGetter::block_size > 0),
-              size_t > BlockSize = IterationGetter::block_size>
+template<
+    typename EXEC_POL,
+    size_t BlocksPerSM,
+    typename Iterator,
+    typename LOOP_BODY,
+    typename IndexType,
+    typename ForallParam,
+    typename IterationMapping          = typename EXEC_POL::IterationMapping,
+    typename IterationGetter           = typename EXEC_POL::IterationGetter,
+    std::enable_if_t<std::is_base_of<iteration_mapping::StridedLoopBase,
+                                     IterationMapping>::value &&
+                         std::is_base_of<iteration_mapping::UnsizedLoopBase,
+                                         IterationMapping>::value &&
+                         (IterationGetter::block_size > 0),
+                     size_t> BlockSize = IterationGetter::block_size>
 __launch_bounds__(BlockSize, BlocksPerSM) __global__
-void forallp_cuda_kernel(LOOP_BODY loop_body,
-                        const Iterator idx,
-                        IndexType length,
-                        ForallParam f_params)
+    void forallp_cuda_kernel(LOOP_BODY loop_body,
+                             const Iterator idx,
+                             IndexType length,
+                             ForallParam f_params)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
-  auto& body = privatizer.get_priv();
-  for (auto ii = IterationGetter::template index<IndexType>();
-       ii < length;
-       ii += IterationGetter::template size<IndexType>()) {
-    RAJA::expt::invoke_body( f_params, body, idx[ii] );
+  auto& body      = privatizer.get_priv();
+  for (auto ii = IterationGetter::template index<IndexType>(); ii < length;
+       ii += IterationGetter::template size<IndexType>())
+  {
+    RAJA::expt::invoke_body(f_params, body, idx[ii]);
   }
   RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(f_params);
 }
+
 ///
-template <typename EXEC_POL,
-          size_t BlocksPerSM,
-          typename Iterator,
-          typename LOOP_BODY,
-          typename IndexType,
-          typename ForallParam,
-          typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::StridedLoopBase, IterationMapping>::value &&
-                std::is_base_of<iteration_mapping::UnsizedLoopBase, IterationMapping>::value &&
-                (IterationGetter::block_size <= 0),
-              size_t > RAJA_UNUSED_ARG(BlockSize) = 0>
-__global__
-void forallp_cuda_kernel(LOOP_BODY loop_body,
-                        const Iterator idx,
-                        IndexType length,
-                        ForallParam f_params)
+template<
+    typename EXEC_POL,
+    size_t BlocksPerSM,
+    typename Iterator,
+    typename LOOP_BODY,
+    typename IndexType,
+    typename ForallParam,
+    typename IterationMapping = typename EXEC_POL::IterationMapping,
+    typename IterationGetter  = typename EXEC_POL::IterationGetter,
+    std::enable_if_t<std::is_base_of<iteration_mapping::StridedLoopBase,
+                                     IterationMapping>::value &&
+                         std::is_base_of<iteration_mapping::UnsizedLoopBase,
+                                         IterationMapping>::value &&
+                         (IterationGetter::block_size <= 0),
+                     size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
+__global__ void forallp_cuda_kernel(LOOP_BODY loop_body,
+                                    const Iterator idx,
+                                    IndexType length,
+                                    ForallParam f_params)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
-  auto& body = privatizer.get_priv();
-  for (auto ii = IterationGetter::template index<IndexType>();
-       ii < length;
-       ii += IterationGetter::template size<IndexType>()) {
-    RAJA::expt::invoke_body( f_params, body, idx[ii] );
+  auto& body      = privatizer.get_priv();
+  for (auto ii = IterationGetter::template index<IndexType>(); ii < length;
+       ii += IterationGetter::template size<IndexType>())
+  {
+    RAJA::expt::invoke_body(f_params, body, idx[ii]);
   }
   RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(f_params);
 }
@@ -515,37 +610,50 @@ void forallp_cuda_kernel(LOOP_BODY loop_body,
 ////////////////////////////////////////////////////////////////////////
 //
 
-template <typename Iterable, typename LoopBody,
-          typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BlocksPerSM, bool Async,
-          typename ForallParam>
-RAJA_INLINE 
-concepts::enable_if_t<
-  resources::EventProxy<resources::Cuda>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
+template<typename Iterable,
+         typename LoopBody,
+         typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         size_t BlocksPerSM,
+         bool Async,
+         typename ForallParam>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Cuda>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
 forall_impl(resources::Cuda cuda_res,
-            ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BlocksPerSM, Async>const&,
+            ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                     IterationGetter,
+                                                     Concretizer,
+                                                     BlocksPerSM,
+                                                     Async> const&,
             Iterable&& iter,
             LoopBody&& loop_body,
             ForallParam)
 {
   using Iterator  = camp::decay<decltype(std::begin(iter))>;
   using LOOP_BODY = camp::decay<LoopBody>;
-  using IndexType = camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
-  using EXEC_POL = ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BlocksPerSM, Async>;
-  using UniqueMarker = ::camp::list<IterationMapping, IterationGetter, LOOP_BODY, Iterator, ForallParam>;
-  using DimensionCalculator = impl::ForallDimensionCalculator<IterationMapping, IterationGetter, Concretizer, UniqueMarker>;
+  using IndexType =
+      camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
+  using EXEC_POL = ::RAJA::policy::cuda::cuda_exec_explicit<
+      IterationMapping, IterationGetter, Concretizer, BlocksPerSM, Async>;
+  using UniqueMarker = ::camp::list<IterationMapping, IterationGetter,
+                                    LOOP_BODY, Iterator, ForallParam>;
+  using DimensionCalculator =
+      impl::ForallDimensionCalculator<IterationMapping, IterationGetter,
+                                      Concretizer, UniqueMarker>;
 
   //
   // Compute the requested iteration space size
   //
   Iterator begin = std::begin(iter);
-  Iterator end = std::end(iter);
-  IndexType len = std::distance(begin, end);
+  Iterator end   = std::end(iter);
+  IndexType len  = std::distance(begin, end);
 
   // Only launch kernel if we have something to iterate over
-  if (len > 0) {
+  if (len > 0)
+  {
 
     auto func = reinterpret_cast<const void*>(
         &impl::forall_cuda_kernel<EXEC_POL, BlocksPerSM, Iterator, LOOP_BODY,
@@ -568,14 +676,16 @@ forall_impl(resources::Cuda cuda_res,
       //
       // Privatize the loop_body, using make_launch_body to setup reductions
       //
-      LOOP_BODY body = RAJA::cuda::make_launch_body(func,
-          dims.blocks, dims.threads, shmem, cuda_res, std::forward<LoopBody>(loop_body));
+      LOOP_BODY body = RAJA::cuda::make_launch_body(
+          func, dims.blocks, dims.threads, shmem, cuda_res,
+          std::forward<LoopBody>(loop_body));
 
       //
       // Launch the kernels
       //
-      void *args[] = {(void*)&body, (void*)&begin, (void*)&len};
-      RAJA::cuda::launch(func, dims.blocks, dims.threads, args, shmem, cuda_res, Async);
+      void* args[] = {(void*)&body, (void*)&begin, (void*)&len};
+      RAJA::cuda::launch(func, dims.blocks, dims.threads, args, shmem, cuda_res,
+                         Async);
     }
 
     RAJA_FT_END;
@@ -584,42 +694,56 @@ forall_impl(resources::Cuda cuda_res,
   return resources::EventProxy<resources::Cuda>(cuda_res);
 }
 
-
-template <typename Iterable, typename LoopBody,
-          typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BlocksPerSM, bool Async,
-          typename ForallParam>
-RAJA_INLINE 
-concepts::enable_if_t<
-  resources::EventProxy<resources::Cuda>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>> >
+template<typename Iterable,
+         typename LoopBody,
+         typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         size_t BlocksPerSM,
+         bool Async,
+         typename ForallParam>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Cuda>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    concepts::negate<
+        RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>>
 forall_impl(resources::Cuda cuda_res,
-            ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BlocksPerSM, Async> const&,
+            ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                     IterationGetter,
+                                                     Concretizer,
+                                                     BlocksPerSM,
+                                                     Async> const&,
             Iterable&& iter,
             LoopBody&& loop_body,
             ForallParam f_params)
 {
   using Iterator  = camp::decay<decltype(std::begin(iter))>;
   using LOOP_BODY = camp::decay<LoopBody>;
-  using IndexType = camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
-  using EXEC_POL = ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BlocksPerSM, Async>;
-  using UniqueMarker = ::camp::list<IterationMapping, IterationGetter, camp::num<BlocksPerSM>, LOOP_BODY, Iterator, ForallParam>;
-  using DimensionCalculator = impl::ForallDimensionCalculator<IterationMapping, IterationGetter, Concretizer, UniqueMarker>;
+  using IndexType =
+      camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
+  using EXEC_POL = ::RAJA::policy::cuda::cuda_exec_explicit<
+      IterationMapping, IterationGetter, Concretizer, BlocksPerSM, Async>;
+  using UniqueMarker =
+      ::camp::list<IterationMapping, IterationGetter, camp::num<BlocksPerSM>,
+                   LOOP_BODY, Iterator, ForallParam>;
+  using DimensionCalculator =
+      impl::ForallDimensionCalculator<IterationMapping, IterationGetter,
+                                      Concretizer, UniqueMarker>;
 
   //
   // Compute the requested iteration space size
   //
   Iterator begin = std::begin(iter);
-  Iterator end = std::end(iter);
-  IndexType len = std::distance(begin, end);
+  Iterator end   = std::end(iter);
+  IndexType len  = std::distance(begin, end);
 
   // Only launch kernel if we have something to iterate over
-  if (len > 0) {
+  if (len > 0)
+  {
 
     auto func = reinterpret_cast<const void*>(
-        &impl::forallp_cuda_kernel< EXEC_POL, BlocksPerSM, Iterator, LOOP_BODY,
-                                   IndexType, camp::decay<ForallParam> >);
+        &impl::forallp_cuda_kernel<EXEC_POL, BlocksPerSM, Iterator, LOOP_BODY,
+                                   IndexType, camp::decay<ForallParam>>);
 
     //
     // Setup shared memory buffers
@@ -635,9 +759,9 @@ forall_impl(resources::Cuda cuda_res,
     RAJA_FT_BEGIN;
 
     RAJA::cuda::detail::cudaInfo launch_info;
-    launch_info.gridDim = dims.blocks;
+    launch_info.gridDim  = dims.blocks;
     launch_info.blockDim = dims.threads;
-    launch_info.res = cuda_res;
+    launch_info.res      = cuda_res;
 
     {
       RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params, launch_info);
@@ -645,14 +769,17 @@ forall_impl(resources::Cuda cuda_res,
       //
       // Privatize the loop_body, using make_launch_body to setup reductions
       //
-      LOOP_BODY body = RAJA::cuda::make_launch_body(func,
-          dims.blocks, dims.threads, shmem, cuda_res, std::forward<LoopBody>(loop_body));
+      LOOP_BODY body = RAJA::cuda::make_launch_body(
+          func, dims.blocks, dims.threads, shmem, cuda_res,
+          std::forward<LoopBody>(loop_body));
 
       //
       // Launch the kernels
       //
-      void *args[] = {(void*)&body, (void*)&begin, (void*)&len, (void*)&f_params};
-      RAJA::cuda::launch(func, dims.blocks, dims.threads, args, shmem, cuda_res, Async);
+      void* args[] = {(void*)&body, (void*)&begin, (void*)&len,
+                      (void*)&f_params};
+      RAJA::cuda::launch(func, dims.blocks, dims.threads, args, shmem, cuda_res,
+                         Async);
 
       RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params, launch_info);
     }
@@ -663,7 +790,6 @@ forall_impl(resources::Cuda cuda_res,
   return resources::EventProxy<resources::Cuda>(cuda_res);
 }
 
-
 //
 //////////////////////////////////////////////////////////////////////
 //
@@ -682,23 +808,33 @@ forall_impl(resources::Cuda cuda_res,
  *
  ******************************************************************************
  */
-template <typename LoopBody,
-          typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BlocksPerSM, bool Async,
-          typename... SegmentTypes>
-RAJA_INLINE resources::EventProxy<resources::Cuda>
-forall_impl(resources::Cuda r,
-            ExecPolicy<seq_segit, ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BlocksPerSM, Async>>,
-            const TypedIndexSet<SegmentTypes...>& iset,
-            LoopBody&& loop_body)
+template<typename LoopBody,
+         typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         size_t BlocksPerSM,
+         bool Async,
+         typename... SegmentTypes>
+RAJA_INLINE resources::EventProxy<resources::Cuda> forall_impl(
+    resources::Cuda r,
+    ExecPolicy<seq_segit,
+               ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                        IterationGetter,
+                                                        Concretizer,
+                                                        BlocksPerSM,
+                                                        Async>>,
+    const TypedIndexSet<SegmentTypes...>& iset,
+    LoopBody&& loop_body)
 {
   int num_seg = iset.getNumSegments();
-  for (int isi = 0; isi < num_seg; ++isi) {
-    iset.segmentCall(r,
-                     isi,
-                     detail::CallForall(),
-                     ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BlocksPerSM, true>(),
-                     loop_body);
+  for (int isi = 0; isi < num_seg; ++isi)
+  {
+    iset.segmentCall(
+        r, isi, detail::CallForall(),
+        ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                 IterationGetter, Concretizer,
+                                                 BlocksPerSM, true>(),
+        loop_body);
   }  // iterate over segments of index set
 
   if (!Async) RAJA::cuda::synchronize(r);
diff --git a/include/RAJA/policy/cuda/intrinsics.hpp b/include/RAJA/policy/cuda/intrinsics.hpp
index b2daa3a23e..6953140358 100644
--- a/include/RAJA/policy/cuda/intrinsics.hpp
+++ b/include/RAJA/policy/cuda/intrinsics.hpp
@@ -35,7 +35,6 @@
 
 #include "RAJA/policy/cuda/policy.hpp"
 
-
 namespace RAJA
 {
 
@@ -59,15 +58,9 @@ namespace impl
  */
 struct AccessorDeviceScopeUseDeviceFence : RAJA::detail::DefaultAccessor
 {
-  static RAJA_DEVICE RAJA_INLINE void fence_acquire()
-  {
-    __threadfence();
-  }
+  static RAJA_DEVICE RAJA_INLINE void fence_acquire() { __threadfence(); }
 
-  static RAJA_DEVICE RAJA_INLINE void fence_release()
-  {
-    __threadfence();
-  }
+  static RAJA_DEVICE RAJA_INLINE void fence_release() { __threadfence(); }
 };
 
 /*!
@@ -96,49 +89,47 @@ struct AccessorDeviceScopeUseBlockFence
   static constexpr size_t min_atomic_int_type_size = sizeof(unsigned int);
   static constexpr size_t max_atomic_int_type_size = sizeof(unsigned long long);
 
-  template < typename T >
+  template<typename T>
   static RAJA_DEVICE RAJA_INLINE T get(T* in_ptr, size_t idx)
   {
-    using ArrayType = RAJA::detail::AsIntegerArray<T, min_atomic_int_type_size, max_atomic_int_type_size>;
+    using ArrayType = RAJA::detail::AsIntegerArray<T, min_atomic_int_type_size,
+                                                   max_atomic_int_type_size>;
     using integer_type = typename ArrayType::integer_type;
 
     ArrayType u;
-    auto ptr = const_cast<integer_type*>(reinterpret_cast<const integer_type*>(in_ptr + idx));
+    auto ptr = const_cast<integer_type*>(
+        reinterpret_cast<const integer_type*>(in_ptr + idx));
 
-    for (size_t i = 0; i < u.array_size(); ++i) {
+    for (size_t i = 0; i < u.array_size(); ++i)
+    {
       u.array[i] = atomicAdd(&ptr[i], integer_type(0));
     }
 
     return u.get_value();
   }
 
-  template < typename T >
+  template<typename T>
   static RAJA_DEVICE RAJA_INLINE void set(T* in_ptr, size_t idx, T val)
   {
-    using ArrayType = RAJA::detail::AsIntegerArray<T, min_atomic_int_type_size, max_atomic_int_type_size>;
+    using ArrayType = RAJA::detail::AsIntegerArray<T, min_atomic_int_type_size,
+                                                   max_atomic_int_type_size>;
     using integer_type = typename ArrayType::integer_type;
 
     ArrayType u;
     u.set_value(val);
     auto ptr = reinterpret_cast<integer_type*>(in_ptr + idx);
 
-    for (size_t i = 0; i < u.array_size(); ++i) {
+    for (size_t i = 0; i < u.array_size(); ++i)
+    {
       atomicExch(&ptr[i], u.array[i]);
     }
   }
 
-  static RAJA_DEVICE RAJA_INLINE void fence_acquire()
-  {
-    __threadfence();
-  }
+  static RAJA_DEVICE RAJA_INLINE void fence_acquire() { __threadfence(); }
 
-  static RAJA_DEVICE RAJA_INLINE void fence_release()
-  {
-    __threadfence();
-  }
+  static RAJA_DEVICE RAJA_INLINE void fence_release() { __threadfence(); }
 };
 
-
 // cuda 8 only has shfl primitives for 32 bits while cuda 9 has 32 and 64 bits
 constexpr size_t min_shfl_int_type_size = sizeof(unsigned int);
 #if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
@@ -157,13 +148,16 @@ constexpr size_t max_shfl_int_type_size = sizeof(unsigned int);
  *
  ******************************************************************************
  */
-template <typename T>
+template<typename T>
 RAJA_DEVICE RAJA_INLINE T shfl_xor_sync(T var, int laneMask)
 {
-  RAJA::detail::AsIntegerArray<T, min_shfl_int_type_size, max_shfl_int_type_size> u;
+  RAJA::detail::AsIntegerArray<T, min_shfl_int_type_size,
+                               max_shfl_int_type_size>
+      u;
   u.set_value(var);
 
-  for (size_t i = 0; i < u.array_size(); ++i) {
+  for (size_t i = 0; i < u.array_size(); ++i)
+  {
 #if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
     u.array[i] = ::__shfl_xor_sync(0xffffffffu, u.array[i], laneMask);
 #else
@@ -173,13 +167,16 @@ RAJA_DEVICE RAJA_INLINE T shfl_xor_sync(T var, int laneMask)
   return u.get_value();
 }
 
-template <typename T>
+template<typename T>
 RAJA_DEVICE RAJA_INLINE T shfl_sync(T var, int srcLane)
 {
-  RAJA::detail::AsIntegerArray<T, min_shfl_int_type_size, max_shfl_int_type_size> u;
+  RAJA::detail::AsIntegerArray<T, min_shfl_int_type_size,
+                               max_shfl_int_type_size>
+      u;
   u.set_value(var);
 
-  for (size_t i = 0; i < u.array_size(); ++i) {
+  for (size_t i = 0; i < u.array_size(); ++i)
+  {
 #if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
     u.array[i] = ::__shfl_sync(0xffffffffu, u.array[i], srcLane);
 #else
@@ -191,49 +188,56 @@ RAJA_DEVICE RAJA_INLINE T shfl_sync(T var, int srcLane)
 
 #if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
 
-template <>
+template<>
 RAJA_DEVICE RAJA_INLINE int shfl_xor_sync<int>(int var, int laneMask)
 {
   return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
 }
 
-template <>
-RAJA_DEVICE RAJA_INLINE unsigned int shfl_xor_sync<unsigned int>(unsigned int var, int laneMask)
+template<>
+RAJA_DEVICE RAJA_INLINE unsigned int shfl_xor_sync<unsigned int>(
+    unsigned int var,
+    int laneMask)
 {
   return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
 }
 
-template <>
+template<>
 RAJA_DEVICE RAJA_INLINE long shfl_xor_sync<long>(long var, int laneMask)
 {
   return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
 }
 
-template <>
-RAJA_DEVICE RAJA_INLINE unsigned long shfl_xor_sync<unsigned long>(unsigned long var, int laneMask)
+template<>
+RAJA_DEVICE RAJA_INLINE unsigned long shfl_xor_sync<unsigned long>(
+    unsigned long var,
+    int laneMask)
 {
   return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
 }
 
-template <>
-RAJA_DEVICE RAJA_INLINE long long shfl_xor_sync<long long>(long long var, int laneMask)
+template<>
+RAJA_DEVICE RAJA_INLINE long long shfl_xor_sync<long long>(long long var,
+                                                           int laneMask)
 {
   return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
 }
 
-template <>
-RAJA_DEVICE RAJA_INLINE unsigned long long shfl_xor_sync<unsigned long long>(unsigned long long var, int laneMask)
+template<>
+RAJA_DEVICE RAJA_INLINE unsigned long long shfl_xor_sync<unsigned long long>(
+    unsigned long long var,
+    int laneMask)
 {
   return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
 }
 
-template <>
+template<>
 RAJA_DEVICE RAJA_INLINE float shfl_xor_sync<float>(float var, int laneMask)
 {
   return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
 }
 
-template <>
+template<>
 RAJA_DEVICE RAJA_INLINE double shfl_xor_sync<double>(double var, int laneMask)
 {
   return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
@@ -241,13 +245,13 @@ RAJA_DEVICE RAJA_INLINE double shfl_xor_sync<double>(double var, int laneMask)
 
 #else
 
-template <>
+template<>
 RAJA_DEVICE RAJA_INLINE int shfl_xor_sync<int>(int var, int laneMask)
 {
   return ::__shfl_xor(var, laneMask);
 }
 
-template <>
+template<>
 RAJA_DEVICE RAJA_INLINE float shfl_xor_sync<float>(float var, int laneMask)
 {
   return ::__shfl_xor(var, laneMask);
@@ -258,49 +262,55 @@ RAJA_DEVICE RAJA_INLINE float shfl_xor_sync<float>(float var, int laneMask)
 
 #if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
 
-template <>
+template<>
 RAJA_DEVICE RAJA_INLINE int shfl_sync<int>(int var, int srcLane)
 {
   return ::__shfl_sync(0xffffffffu, var, srcLane);
 }
 
-template <>
-RAJA_DEVICE RAJA_INLINE unsigned int shfl_sync<unsigned int>(unsigned int var, int srcLane)
+template<>
+RAJA_DEVICE RAJA_INLINE unsigned int shfl_sync<unsigned int>(unsigned int var,
+                                                             int srcLane)
 {
   return ::__shfl_sync(0xffffffffu, var, srcLane);
 }
 
-template <>
+template<>
 RAJA_DEVICE RAJA_INLINE long shfl_sync<long>(long var, int srcLane)
 {
   return ::__shfl_sync(0xffffffffu, var, srcLane);
 }
 
-template <>
-RAJA_DEVICE RAJA_INLINE unsigned long shfl_sync<unsigned long>(unsigned long var, int srcLane)
+template<>
+RAJA_DEVICE RAJA_INLINE unsigned long shfl_sync<unsigned long>(
+    unsigned long var,
+    int srcLane)
 {
   return ::__shfl_sync(0xffffffffu, var, srcLane);
 }
 
-template <>
-RAJA_DEVICE RAJA_INLINE long long shfl_sync<long long>(long long var, int srcLane)
+template<>
+RAJA_DEVICE RAJA_INLINE long long shfl_sync<long long>(long long var,
+                                                       int srcLane)
 {
   return ::__shfl_sync(0xffffffffu, var, srcLane);
 }
 
-template <>
-RAJA_DEVICE RAJA_INLINE unsigned long long shfl_sync<unsigned long long>(unsigned long long var, int srcLane)
+template<>
+RAJA_DEVICE RAJA_INLINE unsigned long long shfl_sync<unsigned long long>(
+    unsigned long long var,
+    int srcLane)
 {
   return ::__shfl_sync(0xffffffffu, var, srcLane);
 }
 
-template <>
+template<>
 RAJA_DEVICE RAJA_INLINE float shfl_sync<float>(float var, int srcLane)
 {
   return ::__shfl_sync(0xffffffffu, var, srcLane);
 }
 
-template <>
+template<>
 RAJA_DEVICE RAJA_INLINE double shfl_sync<double>(double var, int srcLane)
 {
   return ::__shfl_sync(0xffffffffu, var, srcLane);
@@ -308,13 +318,13 @@ RAJA_DEVICE RAJA_INLINE double shfl_sync<double>(double var, int srcLane)
 
 #else
 
-template <>
+template<>
 RAJA_DEVICE RAJA_INLINE int shfl_sync<int>(int var, int srcLane)
 {
   return ::__shfl(var, srcLane);
 }
 
-template <>
+template<>
 RAJA_DEVICE RAJA_INLINE float shfl_sync<float>(float var, int srcLane)
 {
   return ::__shfl(var, srcLane);
@@ -324,7 +334,7 @@ RAJA_DEVICE RAJA_INLINE float shfl_sync<float>(float var, int srcLane)
 
 
 //! reduce values in block into thread 0
-template <typename Combiner, typename T>
+template<typename Combiner, typename T>
 RAJA_DEVICE RAJA_INLINE T warp_reduce(T val, T RAJA_UNUSED_ARG(identity))
 {
   int numThreads = blockDim.x * blockDim.y * blockDim.z;
@@ -334,23 +344,28 @@ RAJA_DEVICE RAJA_INLINE T warp_reduce(T val, T RAJA_UNUSED_ARG(identity))
 
   T temp = val;
 
-  if (numThreads % policy::cuda::device_constants.WARP_SIZE == 0) {
+  if (numThreads % policy::cuda::device_constants.WARP_SIZE == 0)
+  {
 
     // reduce each warp
-    for (int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2) {
+    for (int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2)
+    {
       T rhs = shfl_xor_sync(temp, i);
-      Combiner{}(temp, rhs);
+      Combiner {}(temp, rhs);
     }
-
-  } else {
+  }
+  else
+  {
 
     // reduce each warp
-    for (int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2) {
+    for (int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2)
+    {
       int srcLane = threadId ^ i;
-      T rhs = shfl_sync(temp, srcLane);
+      T rhs       = shfl_sync(temp, srcLane);
       // only add from threads that exist (don't double count own value)
-      if (srcLane < numThreads) {
-        Combiner{}(temp, rhs);
+      if (srcLane < numThreads)
+      {
+        Combiner {}(temp, rhs);
       }
     }
   }
@@ -365,22 +380,22 @@ RAJA_DEVICE RAJA_INLINE T warp_reduce(T val, T RAJA_UNUSED_ARG(identity))
  * This does a butterfly pattern leaving each lane with the full reduction
  *
  */
-template <typename Combiner, typename T>
+template<typename Combiner, typename T>
 RAJA_DEVICE RAJA_INLINE T warp_allreduce(T val)
 {
   T temp = val;
 
-  for (int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2) {
+  for (int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2)
+  {
     T rhs = __shfl_xor_sync(0xffffffff, temp, i);
-    Combiner{}(temp, rhs);
+    Combiner {}(temp, rhs);
   }
 
   return temp;
 }
 
-
 //! reduce values in block into thread 0
-template <typename Combiner, typename T>
+template<typename Combiner, typename T>
 RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity)
 {
   int numThreads = blockDim.x * blockDim.y * blockDim.z;
@@ -388,65 +403,81 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity)
   int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                  (blockDim.x * blockDim.y) * threadIdx.z;
 
-  int warpId = threadId % policy::cuda::device_constants.WARP_SIZE;
+  int warpId  = threadId % policy::cuda::device_constants.WARP_SIZE;
   int warpNum = threadId / policy::cuda::device_constants.WARP_SIZE;
 
   T temp = val;
 
-  if (numThreads % policy::cuda::device_constants.WARP_SIZE == 0) {
+  if (numThreads % policy::cuda::device_constants.WARP_SIZE == 0)
+  {
 
     // reduce each warp
-    for (int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2) {
+    for (int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2)
+    {
       T rhs = shfl_xor_sync(temp, i);
-      Combiner{}(temp, rhs);
+      Combiner {}(temp, rhs);
     }
-
-  } else {
+  }
+  else
+  {
 
     // reduce each warp
-    for (int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2) {
+    for (int i = 1; i < policy::cuda::device_constants.WARP_SIZE; i *= 2)
+    {
       int srcLane = threadId ^ i;
-      T rhs = shfl_sync(temp, srcLane);
+      T rhs       = shfl_sync(temp, srcLane);
       // only add from threads that exist (don't double count own value)
-      if (srcLane < numThreads) {
-        Combiner{}(temp, rhs);
+      if (srcLane < numThreads)
+      {
+        Combiner {}(temp, rhs);
       }
     }
   }
 
   // reduce per warp values
-  if (numThreads > policy::cuda::device_constants.WARP_SIZE) {
+  if (numThreads > policy::cuda::device_constants.WARP_SIZE)
+  {
 
-    static_assert(policy::cuda::device_constants.MAX_WARPS <= policy::cuda::device_constants.WARP_SIZE,
-        "This algorithms assumes a warp of WARP_SIZE threads can reduce MAX_WARPS values");
+    static_assert(policy::cuda::device_constants.MAX_WARPS <=
+                      policy::cuda::device_constants.WARP_SIZE,
+                  "This algorithms assumes a warp of WARP_SIZE threads can "
+                  "reduce MAX_WARPS values");
 
     // Need to separate declaration and initialization for clang-cuda
-    __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray<T, policy::cuda::device_constants.MAX_WARPS>)];
+    __shared__ unsigned char tmpsd[sizeof(
+        RAJA::detail::SoAArray<T, policy::cuda::device_constants.MAX_WARPS>)];
 
     // Partial placement new: Should call new(tmpsd) here but recasting memory
     // to avoid calling constructor/destructor in shared memory.
     RAJA::detail::SoAArray<T, policy::cuda::device_constants.MAX_WARPS>* sd =
-      reinterpret_cast<RAJA::detail::SoAArray<T, policy::cuda::device_constants.MAX_WARPS> *>(tmpsd);
+        reinterpret_cast<RAJA::detail::SoAArray<
+            T, policy::cuda::device_constants.MAX_WARPS>*>(tmpsd);
 
     // write per warp values to shared memory
-    if (warpId == 0) {
+    if (warpId == 0)
+    {
       sd->set(warpNum, temp);
     }
 
     __syncthreads();
 
-    if (warpNum == 0) {
+    if (warpNum == 0)
+    {
 
       // read per warp values
-      if (warpId * policy::cuda::device_constants.WARP_SIZE < numThreads) {
+      if (warpId * policy::cuda::device_constants.WARP_SIZE < numThreads)
+      {
         temp = sd->get(warpId);
-      } else {
+      }
+      else
+      {
         temp = identity;
       }
 
-      for (int i = 1; i < policy::cuda::device_constants.MAX_WARPS; i *= 2) {
+      for (int i = 1; i < policy::cuda::device_constants.MAX_WARPS; i *= 2)
+      {
         T rhs = shfl_xor_sync(temp, i);
-        Combiner{}(temp, rhs);
+        Combiner {}(temp, rhs);
       }
     }
 
diff --git a/include/RAJA/policy/cuda/kernel/Conditional.hpp b/include/RAJA/policy/cuda/kernel/Conditional.hpp
index ff15848bcb..7a5d3f5c84 100644
--- a/include/RAJA/policy/cuda/kernel/Conditional.hpp
+++ b/include/RAJA/policy/cuda/kernel/Conditional.hpp
@@ -36,35 +36,29 @@ namespace internal
 {
 
 
-template <typename Data,
-          typename Conditional,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         typename Conditional,
+         typename... EnclosedStmts,
+         typename Types>
 struct CudaStatementExecutor<Data,
                              statement::If<Conditional, EnclosedStmts...>,
-                             Types> {
+                             Types>
+{
 
-  using stmt_list_t = StatementList<EnclosedStmts...>;
+  using stmt_list_t      = StatementList<EnclosedStmts...>;
   using enclosed_stmts_t = CudaStatementListExecutor<Data, stmt_list_t, Types>;
 
-
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
-    if (Conditional::eval(data)) {
+    if (Conditional::eval(data))
+    {
 
       // execute enclosed statements
       enclosed_stmts_t::exec(data, thread_active);
     }
   }
 
-
-
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     return enclosed_stmts_t::calculateDimensions(data);
   }
diff --git a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp
index 7465f515b0..cd7e33643b 100644
--- a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp
+++ b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp
@@ -50,8 +50,9 @@ namespace RAJA
  * Num_blocks is chosen to maximize the number of blocks running concurrently.
  * Blocks per SM must be chosen by the user.
  */
-template <bool async0, int num_blocks, int num_threads, int blocks_per_sm>
-struct cuda_explicit_launch {};
+template<bool async0, int num_blocks, int num_threads, int blocks_per_sm>
+struct cuda_explicit_launch
+{};
 
 /*!
  * CUDA kernel launch policy where the user specifies the number of physical
@@ -66,16 +67,23 @@ struct cuda_explicit_launch {};
  * Num_threads is 1024, which may not be appropriate for all kernels.
  * Blocks per SM defaults to 1.
  */
-template <bool async0, int num_blocks, int num_threads>
-using cuda_launch = cuda_explicit_launch<async0, num_blocks, num_threads, policy::cuda::MIN_BLOCKS_PER_SM>;
+template<bool async0, int num_blocks, int num_threads>
+using cuda_launch = cuda_explicit_launch<async0,
+                                         num_blocks,
+                                         num_threads,
+                                         policy::cuda::MIN_BLOCKS_PER_SM>;
 
 /*!
  * CUDA kernel launch policy where the number of physical blocks and threads
  * are determined by the CUDA occupancy calculator.
  * If num_threads is 0 then num_threads is chosen at runtime.
  */
-template <int num_threads0, bool async0>
-using cuda_occ_calc_launch = cuda_explicit_launch<async0, 0, num_threads0, policy::cuda::MIN_BLOCKS_PER_SM>;
+template<int num_threads0, bool async0>
+using cuda_occ_calc_launch =
+    cuda_explicit_launch<async0,
+                         0,
+                         num_threads0,
+                         policy::cuda::MIN_BLOCKS_PER_SM>;
 
 namespace statement
 {
@@ -85,11 +93,13 @@ namespace statement
  * Note - Statement requires a placeholder cuda_exec policy for the sake of
  * object oriented inheritance.
  */
-template <typename LaunchConfig, typename... EnclosedStmts>
+template<typename LaunchConfig, typename... EnclosedStmts>
 struct CudaKernelExt
-    : public internal::Statement<::RAJA::policy::cuda::cuda_exec_explicit<LaunchConfig, void, void, 0, true>, EnclosedStmts...> {
-};
-
+    : public internal::Statement<
+          ::RAJA::policy::cuda::
+              cuda_exec_explicit<LaunchConfig, void, void, 0, true>,
+          EnclosedStmts...>
+{};
 
 /*!
  * A RAJA::kernel statement that launches a CUDA kernel with the flexibility
@@ -97,9 +107,9 @@ struct CudaKernelExt
  * calculator determine the unspecified values.
  * The kernel launch is synchronous.
  */
-template <int num_blocks, int num_threads, typename... EnclosedStmts>
-using CudaKernelExp =
-    CudaKernelExt<cuda_launch<false, num_blocks, num_threads>, EnclosedStmts...>;
+template<int num_blocks, int num_threads, typename... EnclosedStmts>
+using CudaKernelExp = CudaKernelExt<cuda_launch<false, num_blocks, num_threads>,
+                                    EnclosedStmts...>;
 
 /*!
  * A RAJA::kernel statement that launches a CUDA kernel with the flexibility
@@ -107,7 +117,7 @@ using CudaKernelExp =
  * calculator determine the unspecified values.
  * The kernel launch is asynchronous.
  */
-template <int num_blocks, int num_threads, typename... EnclosedStmts>
+template<int num_blocks, int num_threads, typename... EnclosedStmts>
 using CudaKernelExpAsync =
     CudaKernelExt<cuda_launch<true, num_blocks, num_threads>, EnclosedStmts...>;
 
@@ -116,7 +126,7 @@ using CudaKernelExpAsync =
  * CUDA occupancy calculator to determine the optimal number of threads.
  * The kernel launch is synchronous.
  */
-template <typename... EnclosedStmts>
+template<typename... EnclosedStmts>
 using CudaKernelOcc =
     CudaKernelExt<cuda_occ_calc_launch<1024, false>, EnclosedStmts...>;
 
@@ -125,7 +135,7 @@ using CudaKernelOcc =
  * CUDA occupancy calculator to determine the optimal number of threads.
  * The kernel launch is asynchronous.
  */
-template <typename... EnclosedStmts>
+template<typename... EnclosedStmts>
 using CudaKernelOccAsync =
     CudaKernelExt<cuda_occ_calc_launch<1024, true>, EnclosedStmts...>;
 
@@ -134,17 +144,17 @@ using CudaKernelOccAsync =
  * number of threads (specified by num_threads)
  * The kernel launch is synchronous.
  */
-template <int num_threads, typename... EnclosedStmts>
-using CudaKernelFixed =
-    CudaKernelExt<cuda_launch<false, operators::limits<int>::max(), num_threads>,
-                  EnclosedStmts...>;
+template<int num_threads, typename... EnclosedStmts>
+using CudaKernelFixed = CudaKernelExt<
+    cuda_launch<false, operators::limits<int>::max(), num_threads>,
+    EnclosedStmts...>;
 
 /*!
  * A RAJA::kernel statement that launches a CUDA kernel with a fixed
  * number of threads (specified by num_threads)
  * The kernel launch is asynchronous.
  */
-template <int num_threads, typename... EnclosedStmts>
+template<int num_threads, typename... EnclosedStmts>
 using CudaKernelFixedAsync =
     CudaKernelExt<cuda_launch<true, operators::limits<int>::max(), num_threads>,
                   EnclosedStmts...>;
@@ -154,9 +164,12 @@ using CudaKernelFixedAsync =
  * number of threads (specified by num_threads) and min blocks per sm.
  * The kernel launch is synchronous.
  */
-template <int num_threads, int blocks_per_sm, typename... EnclosedStmts>
+template<int num_threads, int blocks_per_sm, typename... EnclosedStmts>
 using CudaKernelFixedSM =
-    CudaKernelExt<cuda_explicit_launch<false, operators::limits<int>::max(), num_threads, blocks_per_sm>,
+    CudaKernelExt<cuda_explicit_launch<false,
+                                       operators::limits<int>::max(),
+                                       num_threads,
+                                       blocks_per_sm>,
                   EnclosedStmts...>;
 
 /*!
@@ -164,23 +177,26 @@ using CudaKernelFixedSM =
  * number of threads (specified by num_threads) and min blocks per sm.
  * The kernel launch is asynchronous.
  */
-template <int num_threads, int blocks_per_sm, typename... EnclosedStmts>
+template<int num_threads, int blocks_per_sm, typename... EnclosedStmts>
 using CudaKernelFixedSMAsync =
-    CudaKernelExt<cuda_explicit_launch<true, operators::limits<int>::max(), num_threads, blocks_per_sm>,
+    CudaKernelExt<cuda_explicit_launch<true,
+                                       operators::limits<int>::max(),
+                                       num_threads,
+                                       blocks_per_sm>,
                   EnclosedStmts...>;
 
 /*!
  * A RAJA::kernel statement that launches a CUDA kernel with 1024 threads
  * The kernel launch is synchronous.
  */
-template <typename... EnclosedStmts>
+template<typename... EnclosedStmts>
 using CudaKernel = CudaKernelFixed<1024, EnclosedStmts...>;
 
 /*!
  * A RAJA::kernel statement that launches a CUDA kernel with 1024 threads
  * The kernel launch is asynchronous.
  */
-template <typename... EnclosedStmts>
+template<typename... EnclosedStmts>
 using CudaKernelAsync = CudaKernelFixedAsync<1024, EnclosedStmts...>;
 
 }  // namespace statement
@@ -192,17 +208,16 @@ namespace internal
 /*!
  * CUDA global function for launching CudaKernel policies
  */
-template <typename Data, typename Exec>
+template<typename Data, typename Exec>
 __global__ void CudaKernelLauncher(Data data)
 {
 
-  using data_t = camp::decay<Data>;
+  using data_t        = camp::decay<Data>;
   data_t private_data = data;
 
   Exec::exec(private_data, true);
 }
 
-
 /*!
  * CUDA global function for launching CudaKernel policies
  * This is annotated to guarantee that device code generated
@@ -210,19 +225,18 @@ __global__ void CudaKernelLauncher(Data data)
  *
  * This launcher is used by the CudaKerelFixed policies.
  */
-template <int BlockSize, int BlocksPerSM, typename Data, typename Exec>
+template<int BlockSize, int BlocksPerSM, typename Data, typename Exec>
 __launch_bounds__(BlockSize, BlocksPerSM) __global__
     void CudaKernelLauncherFixed(Data data)
 {
 
-  using data_t = camp::decay<Data>;
+  using data_t        = camp::decay<Data>;
   data_t private_data = data;
 
   // execute the the object
   Exec::exec(private_data, true);
 }
 
-
 /*!
  * Helper class that handles getting the correct global function for
  * CudaKernel policies. This class is specialized on whether or not BlockSize
@@ -234,10 +248,16 @@ __launch_bounds__(BlockSize, BlocksPerSM) __global__
 template<int BlockSize, int BlocksPerSM, typename Data, typename executor_t>
 struct CudaKernelLauncherGetter
 {
-  using type = camp::decay<decltype(&internal::CudaKernelLauncherFixed<BlockSize, BlocksPerSM, Data, executor_t>)>;
+  using type =
+      camp::decay<decltype(&internal::CudaKernelLauncherFixed<BlockSize,
+                                                              BlocksPerSM,
+                                                              Data,
+                                                              executor_t>)>;
+
   static constexpr type get() noexcept
   {
-    return &internal::CudaKernelLauncherFixed<BlockSize, BlocksPerSM, Data, executor_t>;
+    return &internal::CudaKernelLauncherFixed<BlockSize, BlocksPerSM, Data,
+                                              executor_t>;
   }
 };
 
@@ -248,7 +268,9 @@ struct CudaKernelLauncherGetter
 template<typename Data, typename executor_t>
 struct CudaKernelLauncherGetter<0, 0, Data, executor_t>
 {
-  using type = camp::decay<decltype(&internal::CudaKernelLauncher<Data, executor_t>)>;
+  using type =
+      camp::decay<decltype(&internal::CudaKernelLauncher<Data, executor_t>)>;
+
   static constexpr type get() noexcept
   {
     return &internal::CudaKernelLauncher<Data, executor_t>;
@@ -256,30 +278,46 @@ struct CudaKernelLauncherGetter<0, 0, Data, executor_t>
 };
 
 
-
 /*!
  * Helper class that handles CUDA kernel launching, and computing
  * maximum number of threads/blocks
  */
-template<typename LaunchPolicy, typename StmtList, typename Data, typename Types>
+template<typename LaunchPolicy,
+         typename StmtList,
+         typename Data,
+         typename Types>
 struct CudaLaunchHelper;
 
-
 /*!
  * Helper class specialization to determine the number of threads and blocks.
  * The user may specify the number of threads and blocks or let one or both be
  * determined at runtime using the CUDA occupancy calculator.
  */
-template<bool async0, int num_blocks, int num_threads, int blocks_per_sm, typename StmtList, typename Data, typename Types>
-struct CudaLaunchHelper<cuda_explicit_launch<async0, num_blocks, num_threads, blocks_per_sm>,StmtList,Data,Types>
+template<bool async0,
+         int num_blocks,
+         int num_threads,
+         int blocks_per_sm,
+         typename StmtList,
+         typename Data,
+         typename Types>
+struct CudaLaunchHelper<
+    cuda_explicit_launch<async0, num_blocks, num_threads, blocks_per_sm>,
+    StmtList,
+    Data,
+    Types>
 {
   using Self = CudaLaunchHelper;
 
   static constexpr bool async = async0;
 
-  using executor_t = internal::cuda_statement_list_executor_t<StmtList, Data, Types>;
+  using executor_t =
+      internal::cuda_statement_list_executor_t<StmtList, Data, Types>;
 
-  using kernelGetter_t = CudaKernelLauncherGetter<(num_threads <= 0) ? 0 : num_threads, (blocks_per_sm <= 0) ? 0 : blocks_per_sm, Data, executor_t>;
+  using kernelGetter_t =
+      CudaKernelLauncherGetter<(num_threads <= 0) ? 0 : num_threads,
+                               (blocks_per_sm <= 0) ? 0 : blocks_per_sm,
+                               Data,
+                               executor_t>;
 
   inline static const void* get_func()
   {
@@ -287,13 +325,16 @@ struct CudaLaunchHelper<cuda_explicit_launch<async0, num_blocks, num_threads, bl
   }
 
   inline static void recommended_blocks_threads(size_t shmem_size,
-      int &recommended_blocks, int &recommended_threads)
+                                                int& recommended_blocks,
+                                                int& recommended_threads)
   {
     auto func = Self::get_func();
 
-    if (num_blocks <= 0) {
+    if (num_blocks <= 0)
+    {
 
-      if (num_threads <= 0) {
+      if (num_threads <= 0)
+      {
 
         //
         // determine blocks at runtime
@@ -301,10 +342,11 @@ struct CudaLaunchHelper<cuda_explicit_launch<async0, num_blocks, num_threads, bl
         //
         auto data = ::RAJA::cuda::cuda_occupancy_max_blocks_threads<Self>(
             func, shmem_size);
-        recommended_blocks = data.func_max_blocks_per_device;
+        recommended_blocks  = data.func_max_blocks_per_device;
         recommended_threads = data.func_max_threads_per_block;
-
-      } else {
+      }
+      else
+      {
 
         //
         // determine blocks at runtime
@@ -314,69 +356,74 @@ struct CudaLaunchHelper<cuda_explicit_launch<async0, num_blocks, num_threads, bl
 
         auto data = ::RAJA::cuda::cuda_occupancy_max_blocks<Self, num_threads>(
             func, shmem_size);
-        recommended_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device;
-
+        recommended_blocks =
+            data.func_max_blocks_per_sm * data.device_sm_per_device;
       }
+    }
+    else
+    {
 
-    } else {
-
-      if (num_threads <= 0) {
+      if (num_threads <= 0)
+      {
 
         //
-        // determine threads at runtime, unsure what use 1024
+        // determine threads at runtime; unsure what to use, so use 1024
         // this value may be invalid for kernels with high register pressure
         //
         recommended_threads = 1024;
-
-      } else {
+      }
+      else
+      {
 
         //
         // threads determined at compile-time
         //
         recommended_threads = num_threads;
-
       }
 
       //
       // blocks determined at compile-time
       //
       recommended_blocks = num_blocks;
-
     }
   }
 
-  inline static void max_threads(size_t RAJA_UNUSED_ARG(shmem_size), int &max_threads)
+  inline static void max_threads(size_t RAJA_UNUSED_ARG(shmem_size),
+                                 int& max_threads)
   {
-    if (num_threads <= 0) {
+    if (num_threads <= 0)
+    {
 
       //
-      // determine threads at runtime, unsure what use 1024
+      // determine threads at runtime; unsure what to use, so use 1024
       // this value may be invalid for kernels with high register pressure
       //
       max_threads = 1024;
-
-    } else {
+    }
+    else
+    {
 
       //
       // threads determined at compile-time
       //
       max_threads = num_threads;
-
     }
   }
 
   inline static void max_blocks(size_t shmem_size,
-      int &max_blocks, int actual_threads)
+                                int& max_blocks,
+                                int actual_threads)
   {
     auto func = Self::get_func();
 
-    if (num_blocks <= 0) {
+    if (num_blocks <= 0)
+    {
 
       //
       // determine blocks at runtime
       //
-      if (num_threads <= 0 ||
-          num_threads != actual_threads) {
+      if (num_threads <= 0 || num_threads != actual_threads)
+      {
 
         //
         // determine blocks when actual_threads != num_threads
@@ -384,8 +431,9 @@ struct CudaLaunchHelper<cuda_explicit_launch<async0, num_blocks, num_threads, bl
         auto data = ::RAJA::cuda::cuda_occupancy_max_blocks<Self>(
             func, shmem_size, actual_threads);
         max_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device;
-
-      } else {
+      }
+      else
+      {
 
         //
         // determine blocks when actual_threads == num_threads
@@ -393,16 +441,15 @@ struct CudaLaunchHelper<cuda_explicit_launch<async0, num_blocks, num_threads, bl
         auto data = ::RAJA::cuda::cuda_occupancy_max_blocks<Self, num_threads>(
             func, shmem_size);
         max_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device;
-
       }
-
-    } else {
+    }
+    else
+    {
 
       //
       // blocks determined at compile-time
       //
       max_blocks = num_blocks;
-
     }
   }
 };
@@ -416,8 +463,10 @@ struct CudaLaunchHelper<cuda_explicit_launch<async0, num_blocks, num_threads, bl
  * The algorithm is greedy (and probably could be improved), and favors
  * maximizing the number of threads (or blocks) in x, y, then z.
  */
-inline
-cuda_dim_t fitCudaDims(cuda_dim_member_t limit, cuda_dim_t result, cuda_dim_t minimum = cuda_dim_t()){
+inline cuda_dim_t fitCudaDims(cuda_dim_member_t limit,
+                              cuda_dim_t result,
+                              cuda_dim_t minimum = cuda_dim_t())
+{
 
 
   // clamp things to at least 1
@@ -430,12 +479,13 @@ cuda_dim_t fitCudaDims(cuda_dim_member_t limit, cuda_dim_t result, cuda_dim_t mi
   minimum.z = minimum.z ? minimum.z : 1;
 
   // if we are under the limit, we're done
-  if(result.x * result.y * result.z <= limit) return result;
+  if (result.x * result.y * result.z <= limit) return result;
 
   // Can we reduce z to fit?
-  if(result.x * result.y * minimum.z < limit){
+  if (result.x * result.y * minimum.z < limit)
+  {
     // compute a new z
-    result.z = limit / (result.x*result.y);
+    result.z = limit / (result.x * result.y);
     return result;
   }
-  // we don't fit, so reduce z to it's minimum and continue on to y
+  // we don't fit, so reduce z to its minimum and continue on to y
@@ -443,9 +493,10 @@ cuda_dim_t fitCudaDims(cuda_dim_member_t limit, cuda_dim_t result, cuda_dim_t mi
 
 
   // Can we reduce y to fit?
-  if(result.x * minimum.y * result.z < limit){
+  if (result.x * minimum.y * result.z < limit)
+  {
     // compute a new y
-    result.y = limit / (result.x*result.z);
+    result.y = limit / (result.x * result.z);
     return result;
   }
-  // we don't fit, so reduce y to it's minimum and continue on to x
+  // we don't fit, so reduce y to its minimum and continue on to x
@@ -453,9 +504,10 @@ cuda_dim_t fitCudaDims(cuda_dim_member_t limit, cuda_dim_t result, cuda_dim_t mi
 
 
-  // Can we reduce y to fit?
+  // Can we reduce x to fit?
-  if(minimum.x * result.y * result.z < limit){
+  if (minimum.x * result.y * result.z < limit)
+  {
     // compute a new x
-    result.x = limit / (result.y*result.z);
+    result.x = limit / (result.y * result.z);
     return result;
   }
   // we don't fit, so we'll return the smallest possible thing
@@ -464,24 +516,26 @@ cuda_dim_t fitCudaDims(cuda_dim_member_t limit, cuda_dim_t result, cuda_dim_t mi
   return result;
 }
 
-
 /*!
  * Specialization that launches CUDA kernels for RAJA::kernel from host code
  */
-template <typename LaunchConfig, typename... EnclosedStmts, typename Types>
+template<typename LaunchConfig, typename... EnclosedStmts, typename Types>
 struct StatementExecutor<
-    statement::CudaKernelExt<LaunchConfig, EnclosedStmts...>, Types> {
+    statement::CudaKernelExt<LaunchConfig, EnclosedStmts...>,
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
   using StatementType =
       statement::CudaKernelExt<LaunchConfig, EnclosedStmts...>;
 
-  template <typename Data>
-  static inline void exec(Data &&data)
+  template<typename Data>
+  static inline void exec(Data&& data)
   {
 
     using data_t = camp::decay<Data>;
-    using executor_t = cuda_statement_list_executor_t<stmt_list_t, data_t, Types>;
+    using executor_t =
+        cuda_statement_list_executor_t<stmt_list_t, data_t, Types>;
     using launch_t = CudaLaunchHelper<LaunchConfig, stmt_list_t, data_t, Types>;
 
 
@@ -495,9 +549,10 @@ struct StatementExecutor<
 
 
     // Only launch kernel if we have something to iterate over
-    int num_blocks = launch_dims.num_blocks();
+    int num_blocks  = launch_dims.num_blocks();
     int num_threads = launch_dims.num_threads();
-    if (num_blocks > 0 || num_threads > 0) {
+    if (num_blocks > 0 || num_threads > 0)
+    {
 
       //
       // Setup shared memory buffers
@@ -510,8 +565,8 @@ struct StatementExecutor<
       //
       int recommended_blocks;
       int recommended_threads;
-      launch_t::recommended_blocks_threads(
-          shmem, recommended_blocks, recommended_threads);
+      launch_t::recommended_blocks_threads(shmem, recommended_blocks,
+                                           recommended_threads);
 
 
       //
@@ -524,24 +579,24 @@ struct StatementExecutor<
       //
       // Fit the requested threads
       //
-      cuda_dim_t fit_threads{0,0,0};
-
-      if ( recommended_threads >= get_size(launch_dims.min_dims.threads) ) {
+      cuda_dim_t fit_threads {0, 0, 0};
 
-        fit_threads = fitCudaDims(
-            recommended_threads, launch_dims.dims.threads, launch_dims.min_dims.threads);
+      if (recommended_threads >= get_size(launch_dims.min_dims.threads))
+      {
 
+        fit_threads = fitCudaDims(recommended_threads, launch_dims.dims.threads,
+                                  launch_dims.min_dims.threads);
       }
 
       //
       // Redo fit with max threads
       //
-      if ( recommended_threads < max_threads &&
-           get_size(fit_threads) != recommended_threads ) {
-
-        fit_threads = fitCudaDims(
-            max_threads, launch_dims.dims.threads, launch_dims.min_dims.threads);
+      if (recommended_threads < max_threads &&
+          get_size(fit_threads) != recommended_threads)
+      {
 
+        fit_threads = fitCudaDims(max_threads, launch_dims.dims.threads,
+                                  launch_dims.min_dims.threads);
       }
 
       launch_dims.dims.threads = fit_threads;
@@ -555,24 +610,25 @@ struct StatementExecutor<
 
       int use_blocks;
 
-      if ( launch_dims.num_threads() == recommended_threads ) {
+      if (launch_dims.num_threads() == recommended_threads)
+      {
 
         //
         // Fit the requested blocks
         //
         use_blocks = recommended_blocks;
-
-      } else {
+      }
+      else
+      {
 
         //
         // Fit the max blocks
         //
         use_blocks = max_blocks;
-
       }
 
-      launch_dims.dims.blocks = fitCudaDims(
-          use_blocks, launch_dims.dims.blocks, launch_dims.min_dims.blocks);
+      launch_dims.dims.blocks = fitCudaDims(use_blocks, launch_dims.dims.blocks,
+                                            launch_dims.min_dims.blocks);
 
       //
       // make sure that we fit
@@ -581,7 +637,8 @@ struct StatementExecutor<
       if(launch_dims.num_blocks() > max_blocks){
         RAJA_ABORT_OR_THROW("RAJA::kernel exceeds max num blocks");
       }*/
-      if(launch_dims.num_threads() > max_threads){
+      if (launch_dims.num_threads() > max_threads)
+      {
         RAJA_ABORT_OR_THROW("RAJA::kernel exceeds max num threads");
       }
 
@@ -595,14 +652,17 @@ struct StatementExecutor<
         // of the launch_dims and potential changes to shmem here that is
         // currently an unresolved issue.
         //
-        auto cuda_data = RAJA::cuda::make_launch_body(func,
-            launch_dims.dims.blocks, launch_dims.dims.threads, shmem, res, data);
+        auto cuda_data = RAJA::cuda::make_launch_body(
+            func, launch_dims.dims.blocks, launch_dims.dims.threads, shmem, res,
+            data);
 
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&cuda_data};
-        RAJA::cuda::launch(func, launch_dims.dims.blocks, launch_dims.dims.threads, args, shmem, res, launch_t::async);
+        void* args[] = {(void*)&cuda_data};
+        RAJA::cuda::launch(func, launch_dims.dims.blocks,
+                           launch_dims.dims.threads, args, shmem, res,
+                           launch_t::async);
       }
     }
   }
diff --git a/include/RAJA/policy/cuda/kernel/For.hpp b/include/RAJA/policy/cuda/kernel/For.hpp
index 58ffa1ba14..a56719f3e2 100644
--- a/include/RAJA/policy/cuda/kernel/For.hpp
+++ b/include/RAJA/policy/cuda/kernel/For.hpp
@@ -23,7 +23,6 @@
 
 #include "RAJA/policy/cuda/kernel/internal.hpp"
 
-
 namespace RAJA
 {
 
@@ -36,18 +35,21 @@ namespace internal
  * Assigns the loop index to offset ArgumentId
  * Meets all sync requirements
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename IndexMapper,
-          kernel_sync_requirement sync,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename IndexMapper,
+         kernel_sync_requirement sync,
+         typename... EnclosedStmts,
+         typename Types>
 struct CudaStatementExecutor<
     Data,
     statement::For<ArgumentId,
-                   RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+                   RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
+                                                    sync,
+                                                    IndexMapper>,
                    EnclosedStmts...>,
-    Types> {
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -60,13 +62,13 @@ struct CudaStatementExecutor<
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
   using DimensionCalculator = RAJA::internal::KernelDimensionCalculator<
-      RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>>;
+      RAJA::policy::cuda::
+          cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>>;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i = IndexMapper::template index<diff_t>();
+    const diff_t i   = IndexMapper::template index<diff_t>();
 
     // execute enclosed statements if any thread will
     // but mask off threads without work
@@ -79,14 +81,13 @@ struct CudaStatementExecutor<
     enclosed_stmts_t::exec(data, thread_active && have_work);
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
     CudaDims my_dims(0), my_min_dims(0);
     DimensionCalculator::set_dimensions(my_dims, my_min_dims, len);
-    LaunchDims dims{my_dims, my_min_dims};
+    LaunchDims dims {my_dims, my_min_dims};
 
     // combine with enclosed statements
     LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
@@ -100,17 +101,21 @@ struct CudaStatementExecutor<
  * Assigns the loop index to offset ArgumentId.
  * Meets all sync requirements
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename IndexMapper,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename IndexMapper,
+         typename... EnclosedStmts,
+         typename Types>
 struct CudaStatementExecutor<
     Data,
     statement::For<ArgumentId,
-                   RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
+                   RAJA::policy::cuda::cuda_indexer<
+                       iteration_mapping::StridedLoop<named_usage::unspecified>,
+                       kernel_sync_requirement::sync,
+                       IndexMapper>,
                    EnclosedStmts...>,
-    Types> {
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -123,20 +128,22 @@ struct CudaStatementExecutor<
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
   using DimensionCalculator = RAJA::internal::KernelDimensionCalculator<
-      RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>>;
-
+      RAJA::policy::cuda::cuda_indexer<
+          iteration_mapping::StridedLoop<named_usage::unspecified>,
+          kernel_sync_requirement::sync,
+          IndexMapper>>;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // grid stride loop
-    const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i_init = IndexMapper::template index<diff_t>();
+    const diff_t len      = segment_length<ArgumentId>(data);
+    const diff_t i_init   = IndexMapper::template index<diff_t>();
     const diff_t i_stride = IndexMapper::template size<diff_t>();
 
     // Iterate through in chunks
     // threads will have the same numbers of iterations
-    for (diff_t ii = 0; ii < len; ii += i_stride) {
+    for (diff_t ii = 0; ii < len; ii += i_stride)
+    {
       const diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -151,14 +158,13 @@ struct CudaStatementExecutor<
     }
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     diff_t len = segment_length<ArgumentId>(data);
 
     CudaDims my_dims(0), my_min_dims(0);
-    DimensionCalculator{}.set_dimensions(my_dims, my_min_dims, len);
-    LaunchDims dims{my_dims, my_min_dims};
+    DimensionCalculator {}.set_dimensions(my_dims, my_min_dims, len);
+    LaunchDims dims {my_dims, my_min_dims};
 
     // combine with enclosed statements
     LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
@@ -172,17 +178,21 @@ struct CudaStatementExecutor<
  * Assigns the loop index to offset ArgumentId.
  * Meets no sync requirements
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename IndexMapper,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename IndexMapper,
+         typename... EnclosedStmts,
+         typename Types>
 struct CudaStatementExecutor<
     Data,
     statement::For<ArgumentId,
-                   RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
+                   RAJA::policy::cuda::cuda_indexer<
+                       iteration_mapping::StridedLoop<named_usage::unspecified>,
+                       kernel_sync_requirement::none,
+                       IndexMapper>,
                    EnclosedStmts...>,
-    Types> {
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -195,20 +205,22 @@ struct CudaStatementExecutor<
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
   using DimensionCalculator = RAJA::internal::KernelDimensionCalculator<
-      RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>>;
+      RAJA::policy::cuda::cuda_indexer<
+          iteration_mapping::StridedLoop<named_usage::unspecified>,
+          kernel_sync_requirement::none,
+          IndexMapper>>;
 
-
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // grid stride loop
-    const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i_init = IndexMapper::template index<diff_t>();
+    const diff_t len      = segment_length<ArgumentId>(data);
+    const diff_t i_init   = IndexMapper::template index<diff_t>();
     const diff_t i_stride = IndexMapper::template size<diff_t>();
 
     // Iterate through one at a time
     // threads will have different numbers of iterations
-    for (diff_t i = i_init; i < len; i += i_stride) {
+    for (diff_t i = i_init; i < len; i += i_stride)
+    {
 
       // Assign the index to the argument
       data.template assign_offset<ArgumentId>(i);
@@ -218,14 +230,13 @@ struct CudaStatementExecutor<
     }
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
     CudaDims my_dims(0), my_min_dims(0);
-    DimensionCalculator{}.set_dimensions(my_dims, my_min_dims, len);
-    LaunchDims dims{my_dims, my_min_dims};
+    DimensionCalculator {}.set_dimensions(my_dims, my_min_dims, len);
+    LaunchDims dims {my_dims, my_min_dims};
 
     // combine with enclosed statements
     LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
@@ -233,63 +244,65 @@ struct CudaStatementExecutor<
   }
 };
 
-
 /*
  * Executor for sequential loops inside of a CudaKernel.
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename... EnclosedStmts,
+         typename Types>
 struct CudaStatementExecutor<
     Data,
     statement::For<ArgumentId, seq_exec, EnclosedStmts...>,
     Types>
-: CudaStatementExecutor<Data, statement::For<ArgumentId,
-      RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                     kernel_sync_requirement::none,
-                                     cuda::IndexGlobal<named_dim::x, named_usage::ignored, named_usage::ignored>>,
-      EnclosedStmts...>, Types>
-{
-
-};
-
+    : CudaStatementExecutor<
+          Data,
+          statement::For<
+              ArgumentId,
+              RAJA::policy::cuda::cuda_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::none,
+                  cuda::IndexGlobal<named_dim::x,
+                                    named_usage::ignored,
+                                    named_usage::ignored>>,
+              EnclosedStmts...>,
+          Types>
+{};
 
 /*
  * Executor for thread work sharing loop inside CudaKernel.
  * Mapping directly from a warp lane
  * Assigns the loop index to offset ArgumentId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename Mask,
-          typename ... EnclosedStmts,
-          typename Types>
-struct CudaStatementExecutor<
-  Data,
-  statement::For<ArgumentId, RAJA::cuda_warp_masked_direct<Mask>,
-                 EnclosedStmts ...>,
-  Types> {
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename Mask,
+         typename... EnclosedStmts,
+         typename Types>
+struct CudaStatementExecutor<Data,
+                             statement::For<ArgumentId,
+                                            RAJA::cuda_warp_masked_direct<Mask>,
+                                            EnclosedStmts...>,
+                             Types>
+{
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static_assert(mask_t::max_masked_size <= RAJA::policy::cuda::device_constants.WARP_SIZE,
+  static_assert(mask_t::max_masked_size <=
+                    RAJA::policy::cuda::device_constants.WARP_SIZE,
                 "BitMask is too large for CUDA warp size");
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
@@ -299,13 +312,10 @@ struct CudaStatementExecutor<
     data.template assign_offset<ArgumentId>(i);
 
     // execute enclosed statements if in bounds
-    enclosed_stmts_t::exec(data, thread_active && (i<len));
+    enclosed_stmts_t::exec(data, thread_active && (i < len));
   }
 
-
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // Get enclosed statements
     LaunchDims dims = enclosed_stmts_t::calculateDimensions(data);
@@ -320,7 +330,7 @@ struct CudaStatementExecutor<
     // since we are direct-mapping, we REQUIRE len
     set_cuda_dim<named_dim::x>(dims.min_dims.threads, len);
 
-    return(dims);
+    return (dims);
   }
 };
 
@@ -329,44 +339,44 @@ struct CudaStatementExecutor<
  * Mapping directly from a warp lane
  * Assigns the loop index to offset ArgumentId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename Mask,
-          typename ... EnclosedStmts,
-          typename Types>
-struct CudaStatementExecutor<
-  Data,
-  statement::For<ArgumentId, RAJA::cuda_warp_masked_loop<Mask>,
-                 EnclosedStmts ...>,
-  Types> {
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename Mask,
+         typename... EnclosedStmts,
+         typename Types>
+struct CudaStatementExecutor<Data,
+                             statement::For<ArgumentId,
+                                            RAJA::cuda_warp_masked_loop<Mask>,
+                                            EnclosedStmts...>,
+                             Types>
+{
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static_assert(mask_t::max_masked_size <= RAJA::policy::cuda::device_constants.WARP_SIZE,
+  static_assert(mask_t::max_masked_size <=
+                    RAJA::policy::cuda::device_constants.WARP_SIZE,
                 "BitMask is too large for CUDA warp size");
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // masked size strided loop
-    const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i_init = mask_t::maskValue((diff_t)threadIdx.x);
-    const diff_t i_stride = (diff_t) mask_t::max_masked_size;
+    const diff_t len      = segment_length<ArgumentId>(data);
+    const diff_t i_init   = mask_t::maskValue((diff_t)threadIdx.x);
+    const diff_t i_stride = (diff_t)mask_t::max_masked_size;
 
     // Iterate through grid stride of chunks
-    for (diff_t ii = 0; ii < len; ii += i_stride) {
+    for (diff_t ii = 0; ii < len; ii += i_stride)
+    {
       const diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -381,10 +391,7 @@ struct CudaStatementExecutor<
     }
   }
 
-
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // Get enclosed statements
     LaunchDims dims = enclosed_stmts_t::calculateDimensions(data);
@@ -399,7 +406,7 @@ struct CudaStatementExecutor<
     // since we are direct-mapping, we REQUIRE len
     set_cuda_dim<named_dim::x>(dims.min_dims.threads, len);
 
-    return(dims);
+    return (dims);
   }
 };
 
@@ -408,33 +415,32 @@ struct CudaStatementExecutor<
  * Mapping directly from raw threadIdx.x
  * Assigns the loop index to offset ArgumentId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename Mask,
-          typename ... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename Mask,
+         typename... EnclosedStmts,
+         typename Types>
 struct CudaStatementExecutor<
-  Data,
-  statement::For<ArgumentId, RAJA::cuda_thread_masked_direct<Mask>,
-                 EnclosedStmts ...>,
-  Types> {
+    Data,
+    statement::For<ArgumentId,
+                   RAJA::cuda_thread_masked_direct<Mask>,
+                   EnclosedStmts...>,
+    Types>
+{
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
@@ -444,13 +450,10 @@ struct CudaStatementExecutor<
     data.template assign_offset<ArgumentId>(i);
 
     // execute enclosed statements if in bounds
-    enclosed_stmts_t::exec(data, thread_active && (i<len));
+    enclosed_stmts_t::exec(data, thread_active && (i < len));
   }
 
-
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // Get enclosed statements
     LaunchDims dims;
@@ -466,7 +469,7 @@ struct CudaStatementExecutor<
     set_cuda_dim<named_dim::x>(dims.min_dims.threads, len);
 
     LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
-    return(dims.max(enclosed_dims));
+    return (dims.max(enclosed_dims));
   }
 };
 
@@ -475,42 +478,40 @@ struct CudaStatementExecutor<
  * Mapping directly from a warp lane
  * Assigns the loop index to offset ArgumentId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename Mask,
-          typename ... EnclosedStmts,
-          typename Types>
-struct CudaStatementExecutor<
-  Data,
-  statement::For<ArgumentId, RAJA::cuda_thread_masked_loop<Mask>,
-                 EnclosedStmts ...>,
-  Types> {
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename Mask,
+         typename... EnclosedStmts,
+         typename Types>
+struct CudaStatementExecutor<Data,
+                             statement::For<ArgumentId,
+                                            RAJA::cuda_thread_masked_loop<Mask>,
+                                            EnclosedStmts...>,
+                             Types>
+{
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // masked size strided loop
-    const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i_init = mask_t::maskValue((diff_t)threadIdx.x);
-    const diff_t i_stride = (diff_t) mask_t::max_masked_size;
+    const diff_t len      = segment_length<ArgumentId>(data);
+    const diff_t i_init   = mask_t::maskValue((diff_t)threadIdx.x);
+    const diff_t i_stride = (diff_t)mask_t::max_masked_size;
 
     // Iterate through grid stride of chunks
-    for (diff_t ii = 0; ii < len; ii += i_stride) {
+    for (diff_t ii = 0; ii < len; ii += i_stride)
+    {
       const diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -525,10 +526,7 @@ struct CudaStatementExecutor<
     }
   }
 
-
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // Get enclosed statements
     LaunchDims dims;
@@ -544,7 +542,7 @@ struct CudaStatementExecutor<
     set_cuda_dim<named_dim::x>(dims.min_dims.threads, len);
 
     LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
-    return(dims.max(enclosed_dims));
+    return (dims.max(enclosed_dims));
   }
 };
 
diff --git a/include/RAJA/policy/cuda/kernel/ForICount.hpp b/include/RAJA/policy/cuda/kernel/ForICount.hpp
index 87556ed8b1..500177353b 100644
--- a/include/RAJA/policy/cuda/kernel/ForICount.hpp
+++ b/include/RAJA/policy/cuda/kernel/ForICount.hpp
@@ -23,7 +23,6 @@
 
 #include "RAJA/policy/cuda/kernel/internal.hpp"
 
-
 namespace RAJA
 {
 
@@ -37,42 +36,49 @@ namespace internal
  * Assigns the loop index to param ParamId
  * Meets all sync requirements
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename ParamId,
-          typename IndexMapper,
-          kernel_sync_requirement sync,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename ParamId,
+         typename IndexMapper,
+         kernel_sync_requirement sync,
+         typename... EnclosedStmts,
+         typename Types>
 struct CudaStatementExecutor<
     Data,
-    statement::ForICount<ArgumentId, ParamId,
-                         RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
-                         EnclosedStmts...>,
+    statement::ForICount<
+        ArgumentId,
+        ParamId,
+        RAJA::policy::cuda::
+            cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+        EnclosedStmts...>,
     Types>
     : CudaStatementExecutor<
-        Data,
-        statement::For<ArgumentId,
-                       RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
-                       EnclosedStmts...>,
-        Types> {
+          Data,
+          statement::For<
+              ArgumentId,
+              RAJA::policy::cuda::
+                  cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+              EnclosedStmts...>,
+          Types>
+{
 
   using Base = CudaStatementExecutor<
       Data,
       statement::For<ArgumentId,
-                     RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+                     RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
+                                                      sync,
+                                                      IndexMapper>,
                      EnclosedStmts...>,
       Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // grid stride loop
     const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i = IndexMapper::template index<diff_t>();
+    const diff_t i   = IndexMapper::template index<diff_t>();
 
     // execute enclosed statements if any thread will
     // but mask off threads without work
@@ -94,46 +100,60 @@ struct CudaStatementExecutor<
  * Assigns the loop index to param ParamId
  * Meets all sync requirements
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename ParamId,
-          typename IndexMapper,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename ParamId,
+         typename IndexMapper,
+         typename... EnclosedStmts,
+         typename Types>
 struct CudaStatementExecutor<
     Data,
-    statement::ForICount<ArgumentId, ParamId,
-                         RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                         EnclosedStmts...>,
+    statement::ForICount<
+        ArgumentId,
+        ParamId,
+        RAJA::policy::cuda::cuda_indexer<
+            iteration_mapping::StridedLoop<named_usage::unspecified>,
+            kernel_sync_requirement::sync,
+            IndexMapper>,
+        EnclosedStmts...>,
     Types>
     : public CudaStatementExecutor<
-        Data,
-        statement::For<ArgumentId,
-                       RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                       EnclosedStmts...>,
-        Types> {
+          Data,
+          statement::For<
+              ArgumentId,
+              RAJA::policy::cuda::cuda_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::sync,
+                  IndexMapper>,
+              EnclosedStmts...>,
+          Types>
+{
 
   using Base = CudaStatementExecutor<
       Data,
-      statement::For<ArgumentId,
-                     RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                     EnclosedStmts...>,
+      statement::For<
+          ArgumentId,
+          RAJA::policy::cuda::cuda_indexer<
+              iteration_mapping::StridedLoop<named_usage::unspecified>,
+              kernel_sync_requirement::sync,
+              IndexMapper>,
+          EnclosedStmts...>,
       Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // grid stride loop
-    const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i_init = IndexMapper::template index<diff_t>();
+    const diff_t len      = segment_length<ArgumentId>(data);
+    const diff_t i_init   = IndexMapper::template index<diff_t>();
     const diff_t i_stride = IndexMapper::template size<diff_t>();
 
     // Iterate through in chunks
     // threads will have the same numbers of iterations
-    for (diff_t ii = 0; ii < len; ii += i_stride) {
+    for (diff_t ii = 0; ii < len; ii += i_stride)
+    {
       const diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -157,46 +177,60 @@ struct CudaStatementExecutor<
  * Assigns the loop index to param ParamId
  * Meets no sync requirements
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename ParamId,
-          typename IndexMapper,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename ParamId,
+         typename IndexMapper,
+         typename... EnclosedStmts,
+         typename Types>
 struct CudaStatementExecutor<
     Data,
-    statement::ForICount<ArgumentId, ParamId,
-                         RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                         EnclosedStmts...>,
+    statement::ForICount<
+        ArgumentId,
+        ParamId,
+        RAJA::policy::cuda::cuda_indexer<
+            iteration_mapping::StridedLoop<named_usage::unspecified>,
+            kernel_sync_requirement::none,
+            IndexMapper>,
+        EnclosedStmts...>,
     Types>
     : public CudaStatementExecutor<
-        Data,
-        statement::For<ArgumentId,
-                       RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                       EnclosedStmts...>,
-        Types> {
+          Data,
+          statement::For<
+              ArgumentId,
+              RAJA::policy::cuda::cuda_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::none,
+                  IndexMapper>,
+              EnclosedStmts...>,
+          Types>
+{
 
   using Base = CudaStatementExecutor<
       Data,
-      statement::For<ArgumentId,
-                     RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                     EnclosedStmts...>,
+      statement::For<
+          ArgumentId,
+          RAJA::policy::cuda::cuda_indexer<
+              iteration_mapping::StridedLoop<named_usage::unspecified>,
+              kernel_sync_requirement::none,
+              IndexMapper>,
+          EnclosedStmts...>,
       Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // grid stride loop
-    const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i_init = IndexMapper::template index<diff_t>();
+    const diff_t len      = segment_length<ArgumentId>(data);
+    const diff_t i_init   = IndexMapper::template index<diff_t>();
     const diff_t i_stride = IndexMapper::template size<diff_t>();
 
     // Iterate through one at a time
     // threads will have the different numbers of iterations
-    for (diff_t i = i_init; i < len; i += i_stride) {
+    for (diff_t i = i_init; i < len; i += i_stride)
+    {
 
       // Assign the index to the argument and param
       data.template assign_offset<ArgumentId>(i);
@@ -208,7 +242,6 @@ struct CudaStatementExecutor<
   }
 };
 
-
 /*
  * Executor for sequential loops inside of a CudaKernel.
  *
@@ -216,68 +249,79 @@ struct CudaStatementExecutor<
  * Assigns the loop index to offset ArgumentId
  * Assigns the loop index to param ParamId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename ParamId,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename ParamId,
+         typename... EnclosedStmts,
+         typename Types>
 struct CudaStatementExecutor<
     Data,
     statement::ForICount<ArgumentId, ParamId, seq_exec, EnclosedStmts...>,
     Types>
-: CudaStatementExecutor<Data, statement::ForICount<ArgumentId,
-      RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                     kernel_sync_requirement::none,
-                                     cuda::IndexGlobal<named_dim::x, named_usage::ignored, named_usage::ignored>>,
-      EnclosedStmts...>, Types>
-{
-
-};
-
+    : CudaStatementExecutor<
+          Data,
+          statement::ForICount<
+              ArgumentId, ParamId,  // ParamId must be forwarded to the base
+              RAJA::policy::cuda::cuda_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::none,
+                  cuda::IndexGlobal<named_dim::x,
+                                    named_usage::ignored,
+                                    named_usage::ignored>>,
+              EnclosedStmts...>,
+          Types>
+{};
 
 /*
  * Executor for thread work sharing loop inside CudaKernel.
  * Mapping directly from a warp lane
  * Assigns the loop index to offset ArgumentId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename ParamId,
-          typename Mask,
-          typename ... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename ParamId,
+         typename Mask,
+         typename... EnclosedStmts,
+         typename Types>
 struct CudaStatementExecutor<
-  Data,
-  statement::ForICount<ArgumentId, ParamId,
-                       RAJA::cuda_warp_masked_direct<Mask>,
-                       EnclosedStmts ...>, Types >
-  : public CudaStatementExecutor<
     Data,
-    statement::For<ArgumentId, RAJA::cuda_warp_masked_direct<Mask>,
-                   EnclosedStmts ...>, Types > {
-
-  using Base = CudaStatementExecutor<
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::cuda_warp_masked_direct<Mask>,
+                         EnclosedStmts...>,
+    Types>
+    : public CudaStatementExecutor<
           Data,
-          statement::For<ArgumentId, RAJA::cuda_warp_masked_direct<Mask>,
-                         EnclosedStmts ...>, Types >;
+          statement::For<ArgumentId,
+                         RAJA::cuda_warp_masked_direct<Mask>,
+                         EnclosedStmts...>,
+          Types>
+{
+
+  using Base =
+      CudaStatementExecutor<Data,
+                            statement::For<ArgumentId,
+                                           RAJA::cuda_warp_masked_direct<Mask>,
+                                           EnclosedStmts...>,
+                            Types>;
 
   using typename Base::diff_t;
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
-  static_assert(mask_t::max_masked_size <= RAJA::policy::cuda::device_constants.WARP_SIZE,
+  static_assert(mask_t::max_masked_size <=
+                    RAJA::policy::cuda::device_constants.WARP_SIZE,
                 "BitMask is too large for CUDA warp size");
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
@@ -288,63 +332,69 @@ struct CudaStatementExecutor<
     data.template assign_param<ParamId>(i);
 
     // execute enclosed statements if in bounds
-    enclosed_stmts_t::exec(data, thread_active && (i<len));
+    enclosed_stmts_t::exec(data, thread_active && (i < len));
   }
-
 };
 
-
 /*
  * Executor for thread work sharing loop inside CudaKernel.
  * Mapping directly from a warp lane
  * Assigns the loop index to offset ArgumentId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename ParamId,
-          typename Mask,
-          typename ... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename ParamId,
+         typename Mask,
+         typename... EnclosedStmts,
+         typename Types>
 struct CudaStatementExecutor<
-  Data,
-  statement::ForICount<ArgumentId, ParamId,
-                       RAJA::cuda_warp_masked_loop<Mask>,
-                       EnclosedStmts ...>, Types >
-  : public CudaStatementExecutor<
     Data,
-    statement::For<ArgumentId, RAJA::cuda_warp_masked_loop<Mask>,
-                   EnclosedStmts ...>, Types > {
-
-  using Base = CudaStatementExecutor<
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::cuda_warp_masked_loop<Mask>,
+                         EnclosedStmts...>,
+    Types>
+    : public CudaStatementExecutor<
           Data,
-          statement::For<ArgumentId, RAJA::cuda_warp_masked_loop<Mask>,
-                         EnclosedStmts ...>, Types >;
+          statement::For<ArgumentId,
+                         RAJA::cuda_warp_masked_loop<Mask>,
+                         EnclosedStmts...>,
+          Types>
+{
+
+  using Base =
+      CudaStatementExecutor<Data,
+                            statement::For<ArgumentId,
+                                           RAJA::cuda_warp_masked_loop<Mask>,
+                                           EnclosedStmts...>,
+                            Types>;
 
   using typename Base::diff_t;
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
-  static_assert(mask_t::max_masked_size <= RAJA::policy::cuda::device_constants.WARP_SIZE,
+  static_assert(mask_t::max_masked_size <=
+                    RAJA::policy::cuda::device_constants.WARP_SIZE,
                 "BitMask is too large for CUDA warp size");
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // masked size strided loop
-    const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i_init = mask_t::maskValue((diff_t)threadIdx.x);
-    const diff_t i_stride = (diff_t) mask_t::max_masked_size;
+    const diff_t len      = segment_length<ArgumentId>(data);
+    const diff_t i_init   = mask_t::maskValue((diff_t)threadIdx.x);
+    const diff_t i_stride = (diff_t)mask_t::max_masked_size;
 
     // Iterate through grid stride of chunks
-    for (diff_t ii = 0; ii < len; ii += i_stride) {
+    for (diff_t ii = 0; ii < len; ii += i_stride)
+    {
       const diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -359,50 +409,54 @@ struct CudaStatementExecutor<
       enclosed_stmts_t::exec(data, thread_active && have_work);
     }
   }
-
 };
 
-
 /*
  * Executor for thread work sharing loop inside CudaKernel.
  * Mapping directly from a warp lane
  * Assigns the loop index to offset ArgumentId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename ParamId,
-          typename Mask,
-          typename ... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename ParamId,
+         typename Mask,
+         typename... EnclosedStmts,
+         typename Types>
 struct CudaStatementExecutor<
-  Data,
-  statement::ForICount<ArgumentId, ParamId,
-                       RAJA::cuda_thread_masked_direct<Mask>,
-                       EnclosedStmts ...>, Types >
-  : public CudaStatementExecutor<
     Data,
-    statement::For<ArgumentId, RAJA::cuda_thread_masked_direct<Mask>,
-                   EnclosedStmts ...>, Types > {
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::cuda_thread_masked_direct<Mask>,
+                         EnclosedStmts...>,
+    Types>
+    : public CudaStatementExecutor<
+          Data,
+          statement::For<ArgumentId,
+                         RAJA::cuda_thread_masked_direct<Mask>,
+                         EnclosedStmts...>,
+          Types>
+{
 
   using Base = CudaStatementExecutor<
-          Data,
-          statement::For<ArgumentId, RAJA::cuda_thread_masked_direct<Mask>,
-                         EnclosedStmts ...>, Types >;
+      Data,
+      statement::For<ArgumentId,
+                     RAJA::cuda_thread_masked_direct<Mask>,
+                     EnclosedStmts...>,
+      Types>;
 
   using typename Base::diff_t;
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
@@ -413,60 +467,65 @@ struct CudaStatementExecutor<
     data.template assign_param<ParamId>(i);
 
     // execute enclosed statements if in bounds
-    enclosed_stmts_t::exec(data, thread_active && (i<len));
+    enclosed_stmts_t::exec(data, thread_active && (i < len));
   }
-
 };
 
-
 /*
  * Executor for thread work sharing loop inside CudaKernel.
  * Mapping directly from a warp lane
  * Assigns the loop index to offset ArgumentId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename ParamId,
-          typename Mask,
-          typename ... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename ParamId,
+         typename Mask,
+         typename... EnclosedStmts,
+         typename Types>
 struct CudaStatementExecutor<
-  Data,
-  statement::ForICount<ArgumentId, ParamId,
-                       RAJA::cuda_thread_masked_loop<Mask>,
-                       EnclosedStmts ...>, Types >
-  : public CudaStatementExecutor<
     Data,
-    statement::For<ArgumentId, RAJA::cuda_thread_masked_loop<Mask>,
-                   EnclosedStmts ...>, Types > {
-
-  using Base = CudaStatementExecutor<
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::cuda_thread_masked_loop<Mask>,
+                         EnclosedStmts...>,
+    Types>
+    : public CudaStatementExecutor<
           Data,
-          statement::For<ArgumentId, RAJA::cuda_thread_masked_loop<Mask>,
-                         EnclosedStmts ...>, Types >;
+          statement::For<ArgumentId,
+                         RAJA::cuda_thread_masked_loop<Mask>,
+                         EnclosedStmts...>,
+          Types>
+{
+
+  using Base =
+      CudaStatementExecutor<Data,
+                            statement::For<ArgumentId,
+                                           RAJA::cuda_thread_masked_loop<Mask>,
+                                           EnclosedStmts...>,
+                            Types>;
 
   using typename Base::diff_t;
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // masked size strided loop
-    const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i_init = mask_t::maskValue((diff_t)threadIdx.x);
-    const diff_t i_stride = (diff_t) mask_t::max_masked_size;
+    const diff_t len      = segment_length<ArgumentId>(data);
+    const diff_t i_init   = mask_t::maskValue((diff_t)threadIdx.x);
+    const diff_t i_stride = (diff_t)mask_t::max_masked_size;
 
     // Iterate through grid stride of chunks
-    for (diff_t ii = 0; ii < len; ii += i_stride) {
+    for (diff_t ii = 0; ii < len; ii += i_stride)
+    {
       const diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -481,7 +540,6 @@ struct CudaStatementExecutor<
       enclosed_stmts_t::exec(data, thread_active && have_work);
     }
   }
-
 };
 
 }  // namespace internal
diff --git a/include/RAJA/policy/cuda/kernel/Hyperplane.hpp b/include/RAJA/policy/cuda/kernel/Hyperplane.hpp
index fd33192a65..2957c53d33 100644
--- a/include/RAJA/policy/cuda/kernel/Hyperplane.hpp
+++ b/include/RAJA/policy/cuda/kernel/Hyperplane.hpp
@@ -36,38 +36,36 @@ namespace internal
 {
 
 
-template <typename Data,
-          camp::idx_t HpArgumentId,
-          camp::idx_t... Args,
-          typename... EnclosedStmts,
-          typename Types>
-struct CudaStatementExecutor<Data,
-                             statement::Hyperplane<HpArgumentId,
-                                                   seq_exec,
-                                                   ArgList<Args...>,
-                                                   EnclosedStmts...>,
-                             Types> {
+template<typename Data,
+         camp::idx_t HpArgumentId,
+         camp::idx_t... Args,
+         typename... EnclosedStmts,
+         typename Types>
+struct CudaStatementExecutor<
+    Data,
+    statement::
+        Hyperplane<HpArgumentId, seq_exec, ArgList<Args...>, EnclosedStmts...>,
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, HpArgumentId, Data>;
 
-  using enclosed_stmts_t = CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
+  using enclosed_stmts_t =
+      CudaStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // compute Manhattan distance of iteration space to determine
     // as:  hp_len = l0 + l1 + l2 + ...
-    int hp_len = segment_length<HpArgumentId>(data) +
-                 foldl(RAJA::operators::plus<int>(),
-                               segment_length<Args>(data)...);
+    int hp_len =
+        segment_length<HpArgumentId>(data) +
+        foldl(RAJA::operators::plus<int>(), segment_length<Args>(data)...);
 
     int h_args = foldl(RAJA::operators::plus<idx_t>(),
-        camp::get<Args>(data.offset_tuple)...);
+                       camp::get<Args>(data.offset_tuple)...);
 
     // get length of i dimension
     auto i_len = segment_length<HpArgumentId>(data);
@@ -79,7 +77,8 @@ struct CudaStatementExecutor<Data,
      * We reject the iterations that lie outside of the specified rectangular
      * region we are sweeping.
      */
-    for (int h = 0; h < hp_len; ++h) {
+    for (int h = 0; h < hp_len; ++h)
+    {
 
       // compute actual iterate for HpArgumentId
       // as:  i0 = h - (i1 + i2 + i3 + ...)
@@ -92,19 +91,13 @@ struct CudaStatementExecutor<Data,
     }
   }
 
-
-
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     return enclosed_stmts_t::calculateDimensions(data);
   }
 };
 
 
-
-
 }  // end namespace internal
 
 }  // end namespace RAJA
diff --git a/include/RAJA/policy/cuda/kernel/InitLocalMem.hpp b/include/RAJA/policy/cuda/kernel/InitLocalMem.hpp
index 258cd204d6..bd2663bf07 100644
--- a/include/RAJA/policy/cuda/kernel/InitLocalMem.hpp
+++ b/include/RAJA/policy/cuda/kernel/InitLocalMem.hpp
@@ -39,27 +39,29 @@ struct cuda_shared_mem;
 namespace internal
 {
 
-//Intialize thread shared array
-template <typename Data, camp::idx_t... Indices, typename... EnclosedStmts, typename Types>
+// Initialize thread shared array
+template<typename Data,
+         camp::idx_t... Indices,
+         typename... EnclosedStmts,
+         typename Types>
 struct CudaStatementExecutor<Data,
                              statement::InitLocalMem<RAJA::cuda_shared_mem,
-                             camp::idx_seq<Indices...>, EnclosedStmts...>,
+                                                     camp::idx_seq<Indices...>,
+                                                     EnclosedStmts...>,
                              Types>
 {
 
-  using stmt_list_t = StatementList<EnclosedStmts...>;
+  using stmt_list_t      = StatementList<EnclosedStmts...>;
   using enclosed_stmts_t = CudaStatementListExecutor<Data, stmt_list_t, Types>;
 
-
-  //Launch loops
+  // Launch loops
   template<camp::idx_t Pos>
-  static
-  inline
-  RAJA_DEVICE
-  void initMem(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void initMem(Data& data, bool thread_active)
   {
-    using varType = typename camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
-    const camp::idx_t NumElem = camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
+    using varType = typename camp::tuple_element_t<
+        Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
+    const camp::idx_t NumElem = camp::tuple_element_t<
+        Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
 
     __shared__ varType Array[NumElem];
     camp::get<Pos>(data.param_tuple).set_data(&Array[0]);
@@ -67,88 +69,77 @@ struct CudaStatementExecutor<Data,
     enclosed_stmts_t::exec(data, thread_active);
   }
 
-  //Intialize local array
-  //Identifies type + number of elements needed
+  // Initialize local array
+  // Identifies type + number of elements needed
   template<camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
-  static
-  inline
-  RAJA_DEVICE
-  void initMem(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void initMem(Data& data, bool thread_active)
   {
-    using varType = typename camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
-    const camp::idx_t NumElem = camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
+    using varType = typename camp::tuple_element_t<
+        Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
+    const camp::idx_t NumElem = camp::tuple_element_t<
+        Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
 
     __shared__ varType Array[NumElem];
     camp::get<Pos>(data.param_tuple).set_data(&Array[0]);
     initMem<other0, others...>(data, thread_active);
   }
 
-  //Set pointer to null base case
+  // Set pointer to null base case
   template<camp::idx_t Pos>
-  static
-  inline
-  RAJA_DEVICE
-  void setPtrToNull(Data &data)
+  static inline RAJA_DEVICE void setPtrToNull(Data& data)
   {
 
     camp::get<Pos>(data.param_tuple).set_data(nullptr);
   }
 
-
-  //Set pointer to null recursive case
+  // Set pointer to null recursive case
   template<camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
-  static
-  inline
-  RAJA_DEVICE
-  void setPtrToNull(Data &data)
+  static inline RAJA_DEVICE void setPtrToNull(Data& data)
   {
 
     camp::get<Pos>(data.param_tuple).set_data(nullptr);
     setPtrToNull<other0, others...>(data);
   }
 
-
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
 
-    //Intialize scoped arrays + launch loops
+    // Initialize scoped arrays + launch loops
     initMem<Indices...>(data, thread_active);
 
-    //set pointers in scoped arrays to null
+    // set pointers in scoped arrays to null
     setPtrToNull<Indices...>(data);
   }
 
-
-  inline
-  static
-  LaunchDims calculateDimensions(Data const &data)
+  inline static LaunchDims calculateDimensions(Data const& data)
   {
     return enclosed_stmts_t::calculateDimensions(data);
   }
-
 };
 
-//Intialize thread private array
-template <typename Data, camp::idx_t... Indices, typename... EnclosedStmts, typename Types>
-struct CudaStatementExecutor<Data, statement::InitLocalMem<RAJA::cuda_thread_mem, camp::idx_seq<Indices...>, EnclosedStmts...>, Types>
+// Initialize thread private array
+template<typename Data,
+         camp::idx_t... Indices,
+         typename... EnclosedStmts,
+         typename Types>
+struct CudaStatementExecutor<Data,
+                             statement::InitLocalMem<RAJA::cuda_thread_mem,
+                                                     camp::idx_seq<Indices...>,
+                                                     EnclosedStmts...>,
+                             Types>
 {
 
-  using stmt_list_t = StatementList<EnclosedStmts...>;
+  using stmt_list_t      = StatementList<EnclosedStmts...>;
   using enclosed_stmts_t = CudaStatementListExecutor<Data, stmt_list_t, Types>;
 
-
-  //Launch loops
+  // Launch loops
   template<camp::idx_t Pos>
-  static
-  inline
-  RAJA_DEVICE
-  void initMem(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void initMem(Data& data, bool thread_active)
   {
-    using varType = typename camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
-    const camp::idx_t NumElem = camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
+    using varType = typename camp::tuple_element_t<
+        Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
+    const camp::idx_t NumElem = camp::tuple_element_t<
+        Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
 
     varType Array[NumElem];
     camp::get<Pos>(data.param_tuple).set_data(&Array[0]);
@@ -156,72 +147,55 @@ struct CudaStatementExecutor<Data, statement::InitLocalMem<RAJA::cuda_thread_mem
     enclosed_stmts_t::exec(data, thread_active);
   }
 
-  //Intialize local array
-  //Identifies type + number of elements needed
+  // Initialize local array
+  // Identifies type + number of elements needed
   template<camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
-  static
-  inline
-  RAJA_DEVICE
-  void initMem(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void initMem(Data& data, bool thread_active)
   {
-    using varType = typename camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
-    const camp::idx_t NumElem = camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
+    using varType = typename camp::tuple_element_t<
+        Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
+    const camp::idx_t NumElem = camp::tuple_element_t<
+        Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
 
     varType Array[NumElem];
     camp::get<Pos>(data.param_tuple).set_data(&Array[0]);
     initMem<other0, others...>(data, thread_active);
   }
 
-  //Set pointer to null base case
+  // Set pointer to null base case
   template<camp::idx_t Pos>
-  static
-  inline
-  RAJA_DEVICE
-  void setPtrToNull(Data &data)
+  static inline RAJA_DEVICE void setPtrToNull(Data& data)
   {
 
     camp::get<Pos>(data.param_tuple).set_data(nullptr);
   }
 
-
-  //Set pointer to null recursive case
+  // Set pointer to null recursive case
   template<camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
-  static
-  inline
-  RAJA_DEVICE
-  void setPtrToNull(Data &data)
+  static inline RAJA_DEVICE void setPtrToNull(Data& data)
   {
 
     camp::get<Pos>(data.param_tuple).set_data(nullptr);
     setPtrToNull<other0, others...>(data);
   }
 
-
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
 
-    //Intialize scoped arrays + launch loops
+    // Initialize scoped arrays + launch loops
     initMem<Indices...>(data, thread_active);
 
-    //set pointers in scoped arrays to null
+    // set pointers in scoped arrays to null
     setPtrToNull<Indices...>(data);
   }
 
-
-  inline
-  static
-  LaunchDims calculateDimensions(Data const &data)
+  inline static LaunchDims calculateDimensions(Data const& data)
   {
     return enclosed_stmts_t::calculateDimensions(data);
   }
-
 };
 
 
-
 }  // namespace internal
 }  // end namespace RAJA
 
diff --git a/include/RAJA/policy/cuda/kernel/Lambda.hpp b/include/RAJA/policy/cuda/kernel/Lambda.hpp
index e932a3e270..28c41db378 100644
--- a/include/RAJA/policy/cuda/kernel/Lambda.hpp
+++ b/include/RAJA/policy/cuda/kernel/Lambda.hpp
@@ -34,36 +34,38 @@
 #include "RAJA/pattern/kernel.hpp"
 #include "RAJA/pattern/kernel/Lambda.hpp"
 
-
 namespace RAJA
 {
 namespace internal
 {
 
-template <typename Data, camp::idx_t LambdaIndex, typename... Args, typename Types>
-struct CudaStatementExecutor<Data, statement::Lambda<LambdaIndex, Args...>, Types> {
+template<typename Data,
+         camp::idx_t LambdaIndex,
+         typename... Args,
+         typename Types>
+struct CudaStatementExecutor<Data,
+                             statement::Lambda<LambdaIndex, Args...>,
+                             Types>
+{
 
-  static
-  inline RAJA_DEVICE void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Only execute the lambda if it hasn't been masked off
-    if(thread_active){
-      StatementExecutor<statement::Lambda<LambdaIndex, Args...>, Types>::exec(data);
+    if (thread_active)
+    {
+      StatementExecutor<statement::Lambda<LambdaIndex, Args...>, Types>::exec(
+          data);
     }
   }
 
-
-  static
-  inline
-  LaunchDims calculateDimensions(Data const & RAJA_UNUSED_ARG(data))
+  static inline LaunchDims calculateDimensions(
+      Data const& RAJA_UNUSED_ARG(data))
   {
     return LaunchDims();
   }
 };
 
 
-
-
 }  // namespace internal
 }  // namespace RAJA
 
diff --git a/include/RAJA/policy/cuda/kernel/Reduce.hpp b/include/RAJA/policy/cuda/kernel/Reduce.hpp
index 7e46748991..c7916e56df 100644
--- a/include/RAJA/policy/cuda/kernel/Reduce.hpp
+++ b/include/RAJA/policy/cuda/kernel/Reduce.hpp
@@ -23,7 +23,6 @@
 
 #include "RAJA/policy/cuda/kernel/internal.hpp"
 
-
 namespace RAJA
 {
 
@@ -34,32 +33,34 @@ namespace internal
 //
 // Executor that handles reductions across a single CUDA thread block
 //
-template <typename Data,
-          template <typename...> class ReduceOperator,
-          typename ParamId,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         template<typename...>
+         class ReduceOperator,
+         typename ParamId,
+         typename... EnclosedStmts,
+         typename Types>
 struct CudaStatementExecutor<Data,
                              statement::Reduce<RAJA::cuda_block_reduce,
                                                ReduceOperator,
                                                ParamId,
                                                EnclosedStmts...>,
-                             Types> {
+                             Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
   using enclosed_stmts_t = CudaStatementListExecutor<Data, stmt_list_t, Types>;
 
-
-  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // block reduce on the specified parameter
-    auto value = data.template get_param<ParamId>();
+    auto value    = data.template get_param<ParamId>();
     using value_t = decltype(value);
     value_t ident = value_t();
 
     // if this thread isn't active, just set it to the identity
-    if (!thread_active) {
+    if (!thread_active)
+    {
       value = ident;
     }
 
@@ -73,15 +74,15 @@ struct CudaStatementExecutor<Data,
 
     // execute enclosed statements, and mask off everyone but thread 0
     thread_active = threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0;
-    if(thread_active){
+    if (thread_active)
+    {
       // Only update to new value on root thread
       data.template assign_param<ParamId>(new_value);
     }
     enclosed_stmts_t::exec(data, thread_active);
   }
 
-
-  static inline LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // combine with enclosed statements
     LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
@@ -89,57 +90,57 @@ struct CudaStatementExecutor<Data,
   }
 };
 
-
 //
 // Executor that handles reductions across a single CUDA thread warp
 //
-template <typename Data,
-          template <typename...> class ReduceOperator,
-          typename ParamId,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         template<typename...>
+         class ReduceOperator,
+         typename ParamId,
+         typename... EnclosedStmts,
+         typename Types>
 struct CudaStatementExecutor<Data,
                              statement::Reduce<RAJA::cuda_warp_reduce,
                                                ReduceOperator,
                                                ParamId,
                                                EnclosedStmts...>,
-                             Types> {
+                             Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
   using enclosed_stmts_t = CudaStatementListExecutor<Data, stmt_list_t, Types>;
 
-
-  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // block reduce on the specified parameter
-    auto value = data.template get_param<ParamId>();
+    auto value    = data.template get_param<ParamId>();
     using value_t = decltype(value);
     value_t ident = value_t();
 
     // if this thread isn't active, just set it to the identity
-    if (!thread_active) {
+    if (!thread_active)
+    {
       value = ident;
     }
 
     // Call warp reduction routine
     using combiner_t =
         RAJA::reduce::detail::op_adapter<value_t, ReduceOperator>;
-    value_t new_value =
-        RAJA::cuda::impl::warp_reduce<combiner_t>(value, ident);
+    value_t new_value = RAJA::cuda::impl::warp_reduce<combiner_t>(value, ident);
     data.template assign_param<ParamId>(new_value);
 
     // execute enclosed statements, and mask off everyone but lane 0
     thread_active = threadIdx.x == 0;
-    if(thread_active){
+    if (thread_active)
+    {
       // Only update to new value on root thread
       data.template assign_param<ParamId>(new_value);
     }
     enclosed_stmts_t::exec(data, thread_active);
   }
 
-
-  static inline LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // combine with enclosed statements
     LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
@@ -148,7 +149,6 @@ struct CudaStatementExecutor<Data,
 };
 
 
-
 }  // namespace internal
 }  // end namespace RAJA
 
diff --git a/include/RAJA/policy/cuda/kernel/Sync.hpp b/include/RAJA/policy/cuda/kernel/Sync.hpp
index 7dd45d8837..9c25b56e77 100644
--- a/include/RAJA/policy/cuda/kernel/Sync.hpp
+++ b/include/RAJA/policy/cuda/kernel/Sync.hpp
@@ -34,7 +34,6 @@
 #include "RAJA/util/macros.hpp"
 #include "RAJA/util/types.hpp"
 
-
 namespace RAJA
 {
 namespace statement
@@ -43,52 +42,53 @@ namespace statement
 /*!
  * A RAJA::kernel statement that performs a CUDA __syncthreads().
  */
-struct CudaSyncThreads : public internal::Statement<camp::nil> {
-};
+struct CudaSyncThreads : public internal::Statement<camp::nil>
+{};
 
 /*!
  * A RAJA::kernel statement that performs a CUDA __syncwarp().
  */
-struct CudaSyncWarp : public internal::Statement<camp::nil> {
-};
+struct CudaSyncWarp : public internal::Statement<camp::nil>
+{};
 
 }  // namespace statement
 
 namespace internal
 {
 
-template <typename Data, typename Types>
-struct CudaStatementExecutor<Data, statement::CudaSyncThreads, Types> {
-
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &, bool) { __syncthreads(); }
+template<typename Data, typename Types>
+struct CudaStatementExecutor<Data, statement::CudaSyncThreads, Types>
+{
 
+  static inline RAJA_DEVICE void exec(Data&, bool) { __syncthreads(); }
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const & RAJA_UNUSED_ARG(data))
+  static inline LaunchDims calculateDimensions(
+      Data const& RAJA_UNUSED_ARG(data))
   {
     return LaunchDims();
   }
 };
 
-template <typename Data, typename Types>
-struct CudaStatementExecutor<Data, statement::CudaSyncWarp, Types> {
+template<typename Data, typename Types>
+struct CudaStatementExecutor<Data, statement::CudaSyncWarp, Types>
+{
 
-  static
-  inline
-  RAJA_DEVICE
+  static inline RAJA_DEVICE
 #if CUDART_VERSION >= 9000
-  void exec(Data &, bool) { __syncwarp(); }
+      void
+      exec(Data&, bool)
+  {
+    __syncwarp();
+  }
 #else
-  void exec(Data &, bool) {  }
+      void
+      exec(Data&, bool)
+  {
+  }
 #endif
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const & RAJA_UNUSED_ARG(data))
+  static inline LaunchDims calculateDimensions(
+      Data const& RAJA_UNUSED_ARG(data))
   {
     return LaunchDims();
   }
diff --git a/include/RAJA/policy/cuda/kernel/Tile.hpp b/include/RAJA/policy/cuda/kernel/Tile.hpp
index ad901f6b02..35d841414e 100644
--- a/include/RAJA/policy/cuda/kernel/Tile.hpp
+++ b/include/RAJA/policy/cuda/kernel/Tile.hpp
@@ -47,21 +47,23 @@ namespace internal
  * Assigns the tile segment to segment ArgumentId
  * Meets all sync requirements
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          camp::idx_t chunk_size,
-          typename IndexMapper,
-          kernel_sync_requirement sync,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         camp::idx_t chunk_size,
+         typename IndexMapper,
+         kernel_sync_requirement sync,
+         typename... EnclosedStmts,
+         typename Types>
 struct CudaStatementExecutor<
     Data,
     statement::Tile<ArgumentId,
                     RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+                    RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
+                                                     sync,
+                                                     IndexMapper>,
                     EnclosedStmts...>,
-                    Types>
-  {
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -69,19 +71,21 @@ struct CudaStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  using DimensionCalculator = KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>>;
+  using DimensionCalculator = KernelDimensionCalculator<
+      RAJA::policy::cuda::
+          cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>>;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     using segment_t = camp::decay<decltype(segment)>;
 
     // compute trip count
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i = IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
+    const diff_t i =
+        IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
 
     // execute enclosed statements if any thread will
     // but mask off threads without work
@@ -100,23 +104,23 @@ struct CudaStatementExecutor<
     segment = orig_segment;
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // Compute how many chunks
     const diff_t full_len = segment_length<ArgumentId>(data);
-    const diff_t len = RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
+    const diff_t len =
+        RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
 
     CudaDims my_dims(0), my_min_dims(0);
-    DimensionCalculator{}.set_dimensions(my_dims, my_min_dims, len);
-    LaunchDims dims{my_dims, my_min_dims};
+    DimensionCalculator {}.set_dimensions(my_dims, my_min_dims, len);
+    LaunchDims dims {my_dims, my_min_dims};
 
     // privatize data, so we can mess with the segments
-    using data_t = camp::decay<Data>;
+    using data_t        = camp::decay<Data>;
     data_t private_data = data;
 
     // Get original segment
-    auto &segment = camp::get<ArgumentId>(private_data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(private_data.segment_tuple);
 
     // restrict to first tile
     segment = segment.slice(0, static_cast<diff_t>(chunk_size));
@@ -133,19 +137,24 @@ struct CudaStatementExecutor<
  * Assigns the tile segment to segment ArgumentId
  * Meets all sync requirements
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          camp::idx_t chunk_size,
-          typename IndexMapper,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         camp::idx_t chunk_size,
+         typename IndexMapper,
+         typename... EnclosedStmts,
+         typename Types>
 struct CudaStatementExecutor<
     Data,
-    statement::Tile<ArgumentId,
-                    RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                    EnclosedStmts...>, Types>
-  {
+    statement::Tile<
+        ArgumentId,
+        RAJA::tile_fixed<chunk_size>,
+        RAJA::policy::cuda::cuda_indexer<
+            iteration_mapping::StridedLoop<named_usage::unspecified>,
+            kernel_sync_requirement::sync,
+            IndexMapper>,
+        EnclosedStmts...>,
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -153,26 +162,32 @@ struct CudaStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  using DimensionCalculator = KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>>;
+  using DimensionCalculator =
+      KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+          iteration_mapping::StridedLoop<named_usage::unspecified>,
+          kernel_sync_requirement::sync,
+          IndexMapper>>;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Keep copy of original segment, so we can restore it
-    using segment_t = camp::decay<decltype(segment)>;
+    using segment_t        = camp::decay<decltype(segment)>;
     segment_t orig_segment = segment;
 
     // compute trip count
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i_init = IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
-    const diff_t i_stride = IndexMapper::template size<diff_t>() * static_cast<diff_t>(chunk_size);
+    const diff_t i_init =
+        IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
+    const diff_t i_stride =
+        IndexMapper::template size<diff_t>() * static_cast<diff_t>(chunk_size);
 
     // Iterate through in chunks
     // threads will have the same numbers of iterations
-    for (diff_t ii = 0; ii < len; ii += i_stride) {
+    for (diff_t ii = 0; ii < len; ii += i_stride)
+    {
       const diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -190,23 +205,23 @@ struct CudaStatementExecutor<
     segment = orig_segment;
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // Compute how many chunks
     const diff_t full_len = segment_length<ArgumentId>(data);
-    const diff_t len = RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
+    const diff_t len =
+        RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
 
     CudaDims my_dims(0), my_min_dims(0);
-    DimensionCalculator{}.set_dimensions(my_dims, my_min_dims, len);
-    LaunchDims dims{my_dims, my_min_dims};
+    DimensionCalculator {}.set_dimensions(my_dims, my_min_dims, len);
+    LaunchDims dims {my_dims, my_min_dims};
 
     // privatize data, so we can mess with the segments
-    using data_t = camp::decay<Data>;
+    using data_t        = camp::decay<Data>;
     data_t private_data = data;
 
     // Get original segment
-    auto &segment = camp::get<ArgumentId>(private_data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(private_data.segment_tuple);
 
     // restrict to first tile
     segment = segment.slice(0, chunk_size);
@@ -223,19 +238,24 @@ struct CudaStatementExecutor<
  * Assigns the tile segment to segment ArgumentId
  * Meets no sync requirements
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          camp::idx_t chunk_size,
-          typename IndexMapper,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         camp::idx_t chunk_size,
+         typename IndexMapper,
+         typename... EnclosedStmts,
+         typename Types>
 struct CudaStatementExecutor<
     Data,
-    statement::Tile<ArgumentId,
-                    RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                    EnclosedStmts...>, Types>
-  {
+    statement::Tile<
+        ArgumentId,
+        RAJA::tile_fixed<chunk_size>,
+        RAJA::policy::cuda::cuda_indexer<
+            iteration_mapping::StridedLoop<named_usage::unspecified>,
+            kernel_sync_requirement::none,
+            IndexMapper>,
+        EnclosedStmts...>,
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -243,26 +263,32 @@ struct CudaStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  using DimensionCalculator = KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>>;
+  using DimensionCalculator =
+      KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+          iteration_mapping::StridedLoop<named_usage::unspecified>,
+          kernel_sync_requirement::none,
+          IndexMapper>>;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Keep copy of original segment, so we can restore it
-    using segment_t = camp::decay<decltype(segment)>;
+    using segment_t        = camp::decay<decltype(segment)>;
     segment_t orig_segment = segment;
 
     // compute trip count
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i_init = IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
-    const diff_t i_stride = IndexMapper::template size<diff_t>() * static_cast<diff_t>(chunk_size);
+    const diff_t i_init =
+        IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
+    const diff_t i_stride =
+        IndexMapper::template size<diff_t>() * static_cast<diff_t>(chunk_size);
 
     // Iterate through one at a time
     // threads will have the different numbers of iterations
-    for (diff_t i = i_init; i < len; i += i_stride) {
+    for (diff_t i = i_init; i < len; i += i_stride)
+    {
 
       // Assign our new tiled segment
       segment = orig_segment.slice(i, static_cast<diff_t>(chunk_size));
@@ -275,23 +301,23 @@ struct CudaStatementExecutor<
     segment = orig_segment;
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // Compute how many chunks
     const diff_t full_len = segment_length<ArgumentId>(data);
-    const diff_t len = RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
+    const diff_t len =
+        RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
 
     CudaDims my_dims(0), my_min_dims(0);
-    DimensionCalculator{}.set_dimensions(my_dims, my_min_dims, len);
-    LaunchDims dims{my_dims, my_min_dims};
+    DimensionCalculator {}.set_dimensions(my_dims, my_min_dims, len);
+    LaunchDims dims {my_dims, my_min_dims};
 
     // privatize data, so we can mess with the segments
-    using data_t = camp::decay<Data>;
+    using data_t        = camp::decay<Data>;
     data_t private_data = data;
 
     // Get original segment
-    auto &segment = camp::get<ArgumentId>(private_data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(private_data.segment_tuple);
 
     // restrict to first tile
     segment = segment.slice(0, chunk_size);
@@ -303,28 +329,34 @@ struct CudaStatementExecutor<
   }
 };
 
-
 /*!
  * A specialized RAJA::kernel cuda_impl executor for statement::Tile
  * Assigns the tile segment to segment ArgumentId
  *
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename TPol,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename TPol,
+         typename... EnclosedStmts,
+         typename Types>
 struct CudaStatementExecutor<
     Data,
-    statement::Tile<ArgumentId, TPol, seq_exec, EnclosedStmts...>, Types>
-: CudaStatementExecutor<Data, statement::Tile<ArgumentId, TPol,
-    RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                   kernel_sync_requirement::none,
-                                   cuda::IndexGlobal<named_dim::x, named_usage::ignored, named_usage::ignored>>,
-    EnclosedStmts...>, Types>
-{
-
-};
+    statement::Tile<ArgumentId, TPol, seq_exec, EnclosedStmts...>,
+    Types>
+    : CudaStatementExecutor<
+          Data,
+          statement::Tile<
+              ArgumentId,
+              TPol,
+              RAJA::policy::cuda::cuda_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::none,
+                  cuda::IndexGlobal<named_dim::x,
+                                    named_usage::ignored,
+                                    named_usage::ignored>>,
+              EnclosedStmts...>,
+          Types>
+{};
 
 }  // end namespace internal
 }  // end namespace RAJA
diff --git a/include/RAJA/policy/cuda/kernel/TileTCount.hpp b/include/RAJA/policy/cuda/kernel/TileTCount.hpp
index c611346d46..0782ce6aac 100644
--- a/include/RAJA/policy/cuda/kernel/TileTCount.hpp
+++ b/include/RAJA/policy/cuda/kernel/TileTCount.hpp
@@ -48,52 +48,59 @@ namespace internal
  * Assigns the tile index to param ParamId
  * Meets all sync requirements
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename ParamId,
-          camp::idx_t chunk_size,
-          typename IndexMapper,
-          kernel_sync_requirement sync,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename ParamId,
+         camp::idx_t chunk_size,
+         typename IndexMapper,
+         kernel_sync_requirement sync,
+         typename... EnclosedStmts,
+         typename Types>
 struct CudaStatementExecutor<
     Data,
-    statement::TileTCount<ArgumentId, ParamId,
-                    RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
-                    EnclosedStmts...>,
-                    Types>
+    statement::TileTCount<
+        ArgumentId,
+        ParamId,
+        RAJA::tile_fixed<chunk_size>,
+        RAJA::policy::cuda::
+            cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+        EnclosedStmts...>,
+    Types>
     : public CudaStatementExecutor<
-        Data,
-        statement::Tile<ArgumentId,
-                        RAJA::tile_fixed<chunk_size>,
-                        RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
-                        EnclosedStmts...>,
-                        Types> {
+          Data,
+          statement::Tile<
+              ArgumentId,
+              RAJA::tile_fixed<chunk_size>,
+              RAJA::policy::cuda::
+                  cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+              EnclosedStmts...>,
+          Types>
+{
 
   using Base = CudaStatementExecutor<
       Data,
-      statement::Tile<ArgumentId,
-                      RAJA::tile_fixed<chunk_size>,
-                      RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
-                      EnclosedStmts...>,
-                      Types>;
+      statement::Tile<
+          ArgumentId,
+          RAJA::tile_fixed<chunk_size>,
+          RAJA::policy::cuda::
+              cuda_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+          EnclosedStmts...>,
+      Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     using segment_t = camp::decay<decltype(segment)>;
 
     // compute trip count
     const diff_t len = segment.end() - segment.begin();
-    const diff_t t = IndexMapper::template index<diff_t>();
-    const diff_t i = t * static_cast<diff_t>(chunk_size);
+    const diff_t t   = IndexMapper::template index<diff_t>();
+    const diff_t i   = t * static_cast<diff_t>(chunk_size);
 
     // execute enclosed statements if any thread will
     // but mask off threads without work
@@ -120,59 +127,73 @@ struct CudaStatementExecutor<
  * Assigns the tile index to param ParamId
  * Meets all sync requirements
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename ParamId,
-          camp::idx_t chunk_size,
-          typename IndexMapper,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename ParamId,
+         camp::idx_t chunk_size,
+         typename IndexMapper,
+         typename... EnclosedStmts,
+         typename Types>
 struct CudaStatementExecutor<
     Data,
-    statement::TileTCount<ArgumentId, ParamId,
-                    RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                    EnclosedStmts...>,
-                    Types>
+    statement::TileTCount<
+        ArgumentId,
+        ParamId,
+        RAJA::tile_fixed<chunk_size>,
+        RAJA::policy::cuda::cuda_indexer<
+            iteration_mapping::StridedLoop<named_usage::unspecified>,
+            kernel_sync_requirement::sync,
+            IndexMapper>,
+        EnclosedStmts...>,
+    Types>
     : public CudaStatementExecutor<
-        Data,
-        statement::Tile<ArgumentId,
-                        RAJA::tile_fixed<chunk_size>,
-                        RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                        EnclosedStmts...>,
-                        Types> {
+          Data,
+          statement::Tile<
+              ArgumentId,
+              RAJA::tile_fixed<chunk_size>,
+              RAJA::policy::cuda::cuda_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::sync,
+                  IndexMapper>,
+              EnclosedStmts...>,
+          Types>
+{
 
   using Base = CudaStatementExecutor<
       Data,
-      statement::Tile<ArgumentId,
-                      RAJA::tile_fixed<chunk_size>,
-                      RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                      EnclosedStmts...>,
-                      Types>;
+      statement::Tile<
+          ArgumentId,
+          RAJA::tile_fixed<chunk_size>,
+          RAJA::policy::cuda::cuda_indexer<
+              iteration_mapping::StridedLoop<named_usage::unspecified>,
+              kernel_sync_requirement::sync,
+              IndexMapper>,
+          EnclosedStmts...>,
+      Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Keep copy of original segment, so we can restore it
-    using segment_t = camp::decay<decltype(segment)>;
+    using segment_t        = camp::decay<decltype(segment)>;
     segment_t orig_segment = segment;
 
     // compute trip count
-    const diff_t len = segment.end() - segment.begin();
-    const diff_t t_init = IndexMapper::template index<diff_t>();
-    const diff_t i_init = t_init * static_cast<diff_t>(chunk_size);
+    const diff_t len      = segment.end() - segment.begin();
+    const diff_t t_init   = IndexMapper::template index<diff_t>();
+    const diff_t i_init   = t_init * static_cast<diff_t>(chunk_size);
     const diff_t t_stride = IndexMapper::template size<diff_t>();
     const diff_t i_stride = t_stride * static_cast<diff_t>(chunk_size);
 
     // Iterate through in chunks
     // threads will have the same numbers of iterations
-    for(diff_t ii = 0, t = t_init; ii < len; ii += i_stride, t += t_stride) {
+    for (diff_t ii = 0, t = t_init; ii < len; ii += i_stride, t += t_stride)
+    {
       const diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -198,59 +219,73 @@ struct CudaStatementExecutor<
  * Assigns the tile index to param ParamId
  * Meets no sync requirements
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename ParamId,
-          camp::idx_t chunk_size,
-          typename IndexMapper,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename ParamId,
+         camp::idx_t chunk_size,
+         typename IndexMapper,
+         typename... EnclosedStmts,
+         typename Types>
 struct CudaStatementExecutor<
     Data,
-    statement::TileTCount<ArgumentId, ParamId,
-                    RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                    EnclosedStmts...>,
-                    Types>
+    statement::TileTCount<
+        ArgumentId,
+        ParamId,
+        RAJA::tile_fixed<chunk_size>,
+        RAJA::policy::cuda::cuda_indexer<
+            iteration_mapping::StridedLoop<named_usage::unspecified>,
+            kernel_sync_requirement::none,
+            IndexMapper>,
+        EnclosedStmts...>,
+    Types>
     : public CudaStatementExecutor<
-        Data,
-        statement::Tile<ArgumentId,
-                        RAJA::tile_fixed<chunk_size>,
-                        RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                        EnclosedStmts...>,
-                        Types> {
+          Data,
+          statement::Tile<
+              ArgumentId,
+              RAJA::tile_fixed<chunk_size>,
+              RAJA::policy::cuda::cuda_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::none,
+                  IndexMapper>,
+              EnclosedStmts...>,
+          Types>
+{
 
   using Base = CudaStatementExecutor<
       Data,
-      statement::Tile<ArgumentId,
-                      RAJA::tile_fixed<chunk_size>,
-                      RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                      EnclosedStmts...>,
-                      Types>;
+      statement::Tile<
+          ArgumentId,
+          RAJA::tile_fixed<chunk_size>,
+          RAJA::policy::cuda::cuda_indexer<
+              iteration_mapping::StridedLoop<named_usage::unspecified>,
+              kernel_sync_requirement::none,
+              IndexMapper>,
+          EnclosedStmts...>,
+      Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Keep copy of original segment, so we can restore it
-    using segment_t = camp::decay<decltype(segment)>;
+    using segment_t        = camp::decay<decltype(segment)>;
     segment_t orig_segment = segment;
 
     // compute trip count
-    const diff_t len = segment.end() - segment.begin();
-    const diff_t t_init = IndexMapper::template index<diff_t>();
-    const diff_t i_init = t_init * static_cast<diff_t>(chunk_size);
+    const diff_t len      = segment.end() - segment.begin();
+    const diff_t t_init   = IndexMapper::template index<diff_t>();
+    const diff_t i_init   = t_init * static_cast<diff_t>(chunk_size);
     const diff_t t_stride = IndexMapper::template size<diff_t>();
     const diff_t i_stride = t_stride * static_cast<diff_t>(chunk_size);
 
     // Iterate through one at a time
     // threads will have the different numbers of iterations
-    for(diff_t i = i_init, t = t_init; i < len; i += i_stride, t += t_stride) {
+    for (diff_t i = i_init, t = t_init; i < len; i += i_stride, t += t_stride)
+    {
 
       // Assign our new tiled segment
       segment = orig_segment.slice(i, static_cast<diff_t>(chunk_size));
@@ -265,29 +300,37 @@ struct CudaStatementExecutor<
   }
 };
 
-
 /*!
  * A specialized RAJA::kernel cuda_impl executor for statement::TileTCount
  * Assigns the tile segment to segment ArgumentId
  * Assigns the tile index to param ParamId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename ParamId,
-          typename TPol,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename ParamId,
+         typename TPol,
+         typename... EnclosedStmts,
+         typename Types>
 struct CudaStatementExecutor<
     Data,
-    statement::TileTCount<ArgumentId, ParamId, TPol, seq_exec, EnclosedStmts...>, Types>
-: CudaStatementExecutor<Data, statement::TileTCount<ArgumentId, ParamId, TPol,
-    RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                   kernel_sync_requirement::none,
-                                   cuda::IndexGlobal<named_dim::x, named_usage::ignored, named_usage::ignored>>,
-    EnclosedStmts...>, Types>
-{
-
-};
+    statement::
+        TileTCount<ArgumentId, ParamId, TPol, seq_exec, EnclosedStmts...>,
+    Types>
+    : CudaStatementExecutor<
+          Data,
+          statement::TileTCount<
+              ArgumentId,
+              ParamId,
+              TPol,
+              RAJA::policy::cuda::cuda_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::none,
+                  cuda::IndexGlobal<named_dim::x,
+                                    named_usage::ignored,
+                                    named_usage::ignored>>,
+              EnclosedStmts...>,
+          Types>
+{};
 
 }  // end namespace internal
 }  // end namespace RAJA
diff --git a/include/RAJA/policy/cuda/kernel/internal.hpp b/include/RAJA/policy/cuda/kernel/internal.hpp
index 9c904ea45a..36f1fdcbbf 100644
--- a/include/RAJA/policy/cuda/kernel/internal.hpp
+++ b/include/RAJA/policy/cuda/kernel/internal.hpp
@@ -37,36 +37,33 @@
 #include "RAJA/policy/cuda/MemUtils_CUDA.hpp"
 #include "RAJA/policy/cuda/policy.hpp"
 
-
 namespace RAJA
 {
 
 namespace internal
 {
 
-struct LaunchDims {
+struct LaunchDims
+{
 
   CudaDims dims;
   CudaDims min_dims;
 
-  LaunchDims() = default;
-  LaunchDims(LaunchDims const&) = default;
+  LaunchDims()                             = default;
+  LaunchDims(LaunchDims const&)            = default;
   LaunchDims& operator=(LaunchDims const&) = default;
 
   RAJA_INLINE
-  LaunchDims(CudaDims _dims)
-    : dims{_dims}
-    , min_dims{}
-  { }
+  LaunchDims(CudaDims _dims) : dims {_dims}, min_dims {} {}
 
   RAJA_INLINE
   LaunchDims(CudaDims _dims, CudaDims _min_dims)
-    : dims{_dims}
-    , min_dims{_min_dims}
-  { }
+      : dims {_dims},
+        min_dims {_min_dims}
+  {}
 
   RAJA_INLINE
-  LaunchDims max(LaunchDims const &c) const
+  LaunchDims max(LaunchDims const& c) const
   {
     LaunchDims result;
 
@@ -82,51 +79,50 @@ struct LaunchDims {
     result.dims.threads.y = std::max(c.dims.threads.y, dims.threads.y);
     result.dims.threads.z = std::max(c.dims.threads.z, dims.threads.z);
 
-    result.min_dims.threads.x = std::max(c.min_dims.threads.x, min_dims.threads.x);
-    result.min_dims.threads.y = std::max(c.min_dims.threads.y, min_dims.threads.y);
-    result.min_dims.threads.z = std::max(c.min_dims.threads.z, min_dims.threads.z);
+    result.min_dims.threads.x =
+        std::max(c.min_dims.threads.x, min_dims.threads.x);
+    result.min_dims.threads.y =
+        std::max(c.min_dims.threads.y, min_dims.threads.y);
+    result.min_dims.threads.z =
+        std::max(c.min_dims.threads.z, min_dims.threads.z);
 
     return result;
   }
 
   RAJA_INLINE
-  int num_blocks() const {
-    return dims.num_blocks();
-  }
+  int num_blocks() const { return dims.num_blocks(); }
 
   RAJA_INLINE
-  int num_threads() const {
-    return dims.num_threads();
-  }
-
+  int num_threads() const { return dims.num_threads(); }
 
   RAJA_INLINE
-  void clamp_to_min_blocks() {
+  void clamp_to_min_blocks()
+  {
     dims.blocks.x = std::max(min_dims.blocks.x, dims.blocks.x);
     dims.blocks.y = std::max(min_dims.blocks.y, dims.blocks.y);
     dims.blocks.z = std::max(min_dims.blocks.z, dims.blocks.z);
   };
 
   RAJA_INLINE
-  void clamp_to_min_threads() {
+  void clamp_to_min_threads()
+  {
     dims.threads.x = std::max(min_dims.threads.x, dims.threads.x);
     dims.threads.y = std::max(min_dims.threads.y, dims.threads.y);
     dims.threads.z = std::max(min_dims.threads.z, dims.threads.z);
   };
-
 };
 
-
-template <camp::idx_t cur_stmt, camp::idx_t num_stmts, typename StmtList>
-struct CudaStatementListExecutorHelper {
+template<camp::idx_t cur_stmt, camp::idx_t num_stmts, typename StmtList>
+struct CudaStatementListExecutorHelper
+{
 
   using next_helper_t =
       CudaStatementListExecutorHelper<cur_stmt + 1, num_stmts, StmtList>;
 
   using cur_stmt_t = camp::at_v<StmtList, cur_stmt>;
 
-  template <typename Data>
-  inline static RAJA_DEVICE void exec(Data &data, bool thread_active)
+  template<typename Data>
+  inline static RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Execute stmt
     cur_stmt_t::exec(data, thread_active);
@@ -135,9 +131,8 @@ struct CudaStatementListExecutorHelper {
     next_helper_t::exec(data, thread_active);
   }
 
-
-  template <typename Data>
-  inline static LaunchDims calculateDimensions(Data &data)
+  template<typename Data>
+  inline static LaunchDims calculateDimensions(Data& data)
   {
     // Compute this statements launch dimensions
     LaunchDims statement_dims = cur_stmt_t::calculateDimensions(data);
@@ -150,65 +145,57 @@ struct CudaStatementListExecutorHelper {
   }
 };
 
-template <camp::idx_t num_stmts, typename StmtList>
-struct CudaStatementListExecutorHelper<num_stmts, num_stmts, StmtList> {
+template<camp::idx_t num_stmts, typename StmtList>
+struct CudaStatementListExecutorHelper<num_stmts, num_stmts, StmtList>
+{
 
-  template <typename Data>
-  inline static RAJA_DEVICE void exec(Data &, bool)
+  template<typename Data>
+  inline static RAJA_DEVICE void exec(Data&, bool)
   {
     // nop terminator
   }
 
-  template <typename Data>
-  inline static LaunchDims calculateDimensions(Data &)
+  template<typename Data>
+  inline static LaunchDims calculateDimensions(Data&)
   {
     return LaunchDims();
   }
 };
 
 
-template <typename Data, typename Policy, typename Types>
+template<typename Data, typename Policy, typename Types>
 struct CudaStatementExecutor;
 
-template <typename Data, typename StmtList, typename Types>
+template<typename Data, typename StmtList, typename Types>
 struct CudaStatementListExecutor;
 
-
-template <typename Data, typename... Stmts, typename Types>
-struct CudaStatementListExecutor<Data, StatementList<Stmts...>, Types> {
+template<typename Data, typename... Stmts, typename Types>
+struct CudaStatementListExecutor<Data, StatementList<Stmts...>, Types>
+{
 
   using enclosed_stmts_t =
       camp::list<CudaStatementExecutor<Data, Stmts, Types>...>;
 
   static constexpr size_t num_stmts = sizeof...(Stmts);
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Execute statements in order with helper class
-    CudaStatementListExecutorHelper<0, num_stmts, enclosed_stmts_t>::exec(data, thread_active);
+    CudaStatementListExecutorHelper<0, num_stmts, enclosed_stmts_t>::exec(
+        data, thread_active);
   }
 
-
-
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // Compute this statements launch dimensions
-    return CudaStatementListExecutorHelper<0, num_stmts, enclosed_stmts_t>::
-        calculateDimensions(data);
+    return CudaStatementListExecutorHelper<
+        0, num_stmts, enclosed_stmts_t>::calculateDimensions(data);
   }
 };
 
-
-template <typename StmtList, typename Data, typename Types>
-using cuda_statement_list_executor_t = CudaStatementListExecutor<
-    Data,
-    StmtList,
-    Types>;
+template<typename StmtList, typename Data, typename Types>
+using cuda_statement_list_executor_t =
+    CudaStatementListExecutor<Data, StmtList, Types>;
 
 
 // specialization for direct sequential policies
@@ -217,268 +204,356 @@ struct KernelDimensionCalculator;
 
 // specialization for direct sequential policies
 template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>>>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::Direct,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>>>
 {
-  using IndexMapper = cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>;
+  using IndexMapper =
+      cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>;
 
-  template < typename IdxT >
-  static void set_dimensions(CudaDims& RAJA_UNUSED_ARG(dims), CudaDims& RAJA_UNUSED_ARG(min_dims), IdxT len)
+  template<typename IdxT>
+  static void set_dimensions(CudaDims& RAJA_UNUSED_ARG(dims),
+                             CudaDims& RAJA_UNUSED_ARG(min_dims),
+                             IdxT len)
   {
-    if ( len > static_cast<IdxT>(1) ) {
-      RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space");
+    if (len > static_cast<IdxT>(1))
+    {
+      RAJA_ABORT_OR_THROW(
+          "len exceeds the size of the directly mapped index space");
     }
   }
 };
 
 // specialization for direct thread policies
 template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>>>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::Direct,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>>>
 {
-  using IndexMapper = cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>;
+  using IndexMapper =
+      cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>;
 
-  template < typename IdxT >
+  template<typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
-    // BEWARE: if calculated block_size is too high then the kernel launch will fail
+    // BEWARE: if calculated block_size is too high then the kernel launch will
+    // fail
     set_cuda_dim<dim>(dims.threads, static_cast<IdxT>(len));
     set_cuda_dim<dim>(min_dims.threads, static_cast<IdxT>(len));
   }
 };
+
 ///
 template<named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>>>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::Direct,
+    sync,
+    cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>>>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>;
 
-  template < typename IdxT >
+  template<typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
-    if ( len > static_cast<IdxT>(IndexMapper::block_size) ) {
-      RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space");
+    if (len > static_cast<IdxT>(IndexMapper::block_size))
+    {
+      RAJA_ABORT_OR_THROW(
+          "len exceeds the size of the directly mapped index space");
     }
     set_cuda_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_cuda_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
+    set_cuda_dim<dim>(min_dims.threads,
+                      static_cast<IdxT>(IndexMapper::block_size));
   }
 };
 
 // specialization for direct block policies
 template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>>>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::Direct,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>>>
 {
-  using IndexMapper = cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>;
+  using IndexMapper =
+      cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>;
 
-  template < typename IdxT >
+  template<typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
     set_cuda_dim<dim>(dims.blocks, static_cast<IdxT>(len));
     set_cuda_dim<dim>(min_dims.blocks, static_cast<IdxT>(len));
   }
 };
+
 ///
 template<named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>>>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::Direct,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>>>
 {
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = cuda::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>;
 
-  template < typename IdxT >
+  template<typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
-    if ( len > static_cast<IdxT>(IndexMapper::grid_size) ) {
-      RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space");
+    if (len > static_cast<IdxT>(IndexMapper::grid_size))
+    {
+      RAJA_ABORT_OR_THROW(
+          "len exceeds the size of the directly mapped index space");
     }
     set_cuda_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
-    set_cuda_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_cuda_dim<dim>(min_dims.blocks,
+                      static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 
 // specialization for direct global policies
 template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>>>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::Direct,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>>>
 {
-  using IndexMapper = cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
+  using IndexMapper = cuda::
+      IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
 
-  template < typename IdxT >
-  static void set_dimensions(CudaDims& RAJA_UNUSED_ARG(dims), CudaDims& RAJA_UNUSED_ARG(min_dims), IdxT len)
+  template<typename IdxT>
+  static void set_dimensions(CudaDims& RAJA_UNUSED_ARG(dims),
+                             CudaDims& RAJA_UNUSED_ARG(min_dims),
+                             IdxT len)
   {
-    if (len > static_cast<IdxT>(0)) {
+    if (len > static_cast<IdxT>(0))
+    {
       RAJA_ABORT_OR_THROW("must know one of block_size or grid_size");
     }
   }
 };
+
 ///
 template<named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>>>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::Direct,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>>>
 {
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
-  using IndexMapper = cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
+  using IndexMapper =
+      cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
 
-  template < typename IdxT >
+  template<typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
-    // BEWARE: if calculated block_size is too high then the kernel launch will fail
-    set_cuda_dim<dim>(dims.threads, RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::grid_size)));
+    // BEWARE: if calculated block_size is too high then the kernel launch will
+    // fail
+    set_cuda_dim<dim>(dims.threads,
+                      RAJA_DIVIDE_CEILING_INT(
+                          len, static_cast<IdxT>(IndexMapper::grid_size)));
     set_cuda_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
-    set_cuda_dim<dim>(min_dims.threads, RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::grid_size)));
-    set_cuda_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_cuda_dim<dim>(min_dims.threads,
+                      RAJA_DIVIDE_CEILING_INT(
+                          len, static_cast<IdxT>(IndexMapper::grid_size)));
+    set_cuda_dim<dim>(min_dims.blocks,
+                      static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
+
 ///
 template<named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>>>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::Direct,
+    sync,
+    cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>>>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
-  using IndexMapper = cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
+  using IndexMapper =
+      cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
 
-  template < typename IdxT >
+  template<typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
     set_cuda_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_cuda_dim<dim>(dims.blocks, RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::block_size)));
-    set_cuda_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_cuda_dim<dim>(min_dims.blocks, RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::block_size)));
+    set_cuda_dim<dim>(dims.blocks,
+                      RAJA_DIVIDE_CEILING_INT(
+                          len, static_cast<IdxT>(IndexMapper::block_size)));
+    set_cuda_dim<dim>(min_dims.threads,
+                      static_cast<IdxT>(IndexMapper::block_size));
+    set_cuda_dim<dim>(min_dims.blocks,
+                      RAJA_DIVIDE_CEILING_INT(
+                          len, static_cast<IdxT>(IndexMapper::block_size)));
   }
 };
+
 ///
-template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>>
+template<named_dim dim,
+         int BLOCK_SIZE,
+         int GRID_SIZE,
+         kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::Direct,
+    sync,
+    cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>;
 
-  template < typename IdxT >
+  template<typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
-    if ( len > (static_cast<IdxT>(IndexMapper::block_size) *
-                static_cast<IdxT>(IndexMapper::grid_size)) ) {
-      RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space");
+    if (len > (static_cast<IdxT>(IndexMapper::block_size) *
+               static_cast<IdxT>(IndexMapper::grid_size)))
+    {
+      RAJA_ABORT_OR_THROW(
+          "len exceeds the size of the directly mapped index space");
     }
     set_cuda_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
     set_cuda_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
-    set_cuda_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_cuda_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_cuda_dim<dim>(min_dims.threads,
+                      static_cast<IdxT>(IndexMapper::block_size));
+    set_cuda_dim<dim>(min_dims.blocks,
+                      static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 
-
 // specialization for strided loop sequential policies
 template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>>>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>>>
 {
-  using IndexMapper = cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>;
-
-  template < typename IdxT >
-  static void set_dimensions(CudaDims& RAJA_UNUSED_ARG(dims), CudaDims& RAJA_UNUSED_ARG(min_dims), IdxT RAJA_UNUSED_ARG(len))
-  {
-  }
+  using IndexMapper =
+      cuda::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>;
+
+  template<typename IdxT>
+  static void set_dimensions(CudaDims& RAJA_UNUSED_ARG(dims),
+                             CudaDims& RAJA_UNUSED_ARG(min_dims),
+                             IdxT RAJA_UNUSED_ARG(len))
+  {}
 };
 
 // specialization for strided loop thread policies
 template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>>>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>>>
 {
-  using IndexMapper = cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>;
+  using IndexMapper =
+      cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>;
 
-  template < typename IdxT >
+  template<typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
-    // BEWARE: if calculated block_size is too high then the kernel launch will fail
+    // BEWARE: if calculated block_size is too high then the kernel launch will
+    // fail
     set_cuda_dim<dim>(dims.threads, static_cast<IdxT>(len));
     set_cuda_dim<dim>(min_dims.threads, static_cast<IdxT>(1));
   }
 };
+
 ///
 template<named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>>>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>>>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>;
 
-  template < typename IdxT >
-  static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT RAJA_UNUSED_ARG(len))
+  template<typename IdxT>
+  static void set_dimensions(CudaDims& dims,
+                             CudaDims& min_dims,
+                             IdxT RAJA_UNUSED_ARG(len))
   {
     set_cuda_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_cuda_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
+    set_cuda_dim<dim>(min_dims.threads,
+                      static_cast<IdxT>(IndexMapper::block_size));
   }
 };
 
 // specialization for strided loop block policies
 template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>>>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>>>
 {
-  using IndexMapper = cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>;
+  using IndexMapper =
+      cuda::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>;
 
-  template < typename IdxT >
+  template<typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
     set_cuda_dim<dim>(dims.blocks, static_cast<IdxT>(len));
     set_cuda_dim<dim>(min_dims.blocks, static_cast<IdxT>(1));
   }
 };
+
 ///
 template<named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>>>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>>>
 {
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = cuda::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>;
 
-  template < typename IdxT >
-  static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT RAJA_UNUSED_ARG(len))
+  template<typename IdxT>
+  static void set_dimensions(CudaDims& dims,
+                             CudaDims& min_dims,
+                             IdxT RAJA_UNUSED_ARG(len))
   {
     set_cuda_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
-    set_cuda_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_cuda_dim<dim>(min_dims.blocks,
+                      static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 
 // specialization for strided loop global policies
 template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>>>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>>>
 {
-  using IndexMapper = cuda::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
+  using IndexMapper = cuda::
+      IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
 
-  template < typename IdxT >
+  template<typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
-    if (len > static_cast<IdxT>(0)) {
+    if (len > static_cast<IdxT>(0))
+    {
       set_cuda_dim<dim>(dims.threads, static_cast<IdxT>(1));
       set_cuda_dim<dim>(dims.blocks, static_cast<IdxT>(1));
       set_cuda_dim<dim>(min_dims.threads, static_cast<IdxT>(1));
@@ -486,63 +561,93 @@ struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapp
     }
   }
 };
+
 ///
 template<named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>>>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>>>
 {
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
-  using IndexMapper = cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
+  using IndexMapper =
+      cuda::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
 
-  template < typename IdxT >
+  template<typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
-    // BEWARE: if calculated block_size is too high then the kernel launch will fail
-    set_cuda_dim<dim>(dims.threads, RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::grid_size)));
+    // BEWARE: if calculated block_size is too high then the kernel launch will
+    // fail
+    set_cuda_dim<dim>(dims.threads,
+                      RAJA_DIVIDE_CEILING_INT(
+                          len, static_cast<IdxT>(IndexMapper::grid_size)));
     set_cuda_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
     set_cuda_dim<dim>(min_dims.threads, static_cast<IdxT>(1));
-    set_cuda_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_cuda_dim<dim>(min_dims.blocks,
+                      static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
+
 ///
 template<named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>>>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>>>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
-  using IndexMapper = cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
+  using IndexMapper =
+      cuda::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
 
-  template < typename IdxT >
+  template<typename IdxT>
   static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT len)
   {
     set_cuda_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_cuda_dim<dim>(dims.blocks, RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::block_size)));
-    set_cuda_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
+    set_cuda_dim<dim>(dims.blocks,
+                      RAJA_DIVIDE_CEILING_INT(
+                          len, static_cast<IdxT>(IndexMapper::block_size)));
+    set_cuda_dim<dim>(min_dims.threads,
+                      static_cast<IdxT>(IndexMapper::block_size));
     set_cuda_dim<dim>(min_dims.blocks, static_cast<IdxT>(1));
   }
 };
+
 ///
-template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>>
+template<named_dim dim,
+         int BLOCK_SIZE,
+         int GRID_SIZE,
+         kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::cuda::cuda_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = cuda::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>;
 
-  template < typename IdxT >
-  static void set_dimensions(CudaDims& dims, CudaDims& min_dims, IdxT RAJA_UNUSED_ARG(len))
+  template<typename IdxT>
+  static void set_dimensions(CudaDims& dims,
+                             CudaDims& min_dims,
+                             IdxT RAJA_UNUSED_ARG(len))
   {
     set_cuda_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
     set_cuda_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
-    set_cuda_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_cuda_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_cuda_dim<dim>(min_dims.threads,
+                      static_cast<IdxT>(IndexMapper::block_size));
+    set_cuda_dim<dim>(min_dims.blocks,
+                      static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 
diff --git a/include/RAJA/policy/cuda/launch.hpp b/include/RAJA/policy/cuda/launch.hpp
index 75e5f6902b..15666e42d9 100644
--- a/include/RAJA/policy/cuda/launch.hpp
+++ b/include/RAJA/policy/cuda/launch.hpp
@@ -28,55 +28,64 @@
 namespace RAJA
 {
 
-template <typename BODY>
+template<typename BODY>
 __global__ void launch_global_fcn(BODY body_in)
 {
   LaunchContext ctx;
 
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
-  auto& body = privatizer.get_priv();
+  auto& body      = privatizer.get_priv();
 
-  //Set pointer to shared memory
+  // Set pointer to shared memory
   extern __shared__ char raja_shmem_ptr[];
   ctx.shared_mem_ptr = raja_shmem_ptr;
 
   body(ctx);
 }
 
-template <typename BODY, typename ReduceParams>
-__global__ void launch_new_reduce_global_fcn(BODY body_in, ReduceParams reduce_params)
+template<typename BODY, typename ReduceParams>
+__global__ void launch_new_reduce_global_fcn(BODY body_in,
+                                             ReduceParams reduce_params)
 {
   LaunchContext ctx;
 
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
-  auto& body = privatizer.get_priv();
+  auto& body      = privatizer.get_priv();
 
-  //Set pointer to shared memory
+  // Set pointer to shared memory
   extern __shared__ char raja_shmem_ptr[];
   ctx.shared_mem_ptr = raja_shmem_ptr;
 
-  RAJA::expt::invoke_body( reduce_params, body, ctx );
+  RAJA::expt::invoke_body(reduce_params, body, ctx);
 
-  //Using a flatten global policy as we may use all dimensions
-  RAJA::expt::ParamMultiplexer::combine<RAJA::cuda_flatten_global_xyz_direct>(reduce_params);
+  // Using a flatten global policy as we may use all dimensions
+  RAJA::expt::ParamMultiplexer::combine<RAJA::cuda_flatten_global_xyz_direct>(
+      reduce_params);
 }
 
-template <bool async>
-struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, named_usage::unspecified, named_usage::unspecified>> {
+template<bool async>
+struct LaunchExecute<
+    RAJA::policy::cuda::cuda_launch_explicit_t<async,
+                                               named_usage::unspecified,
+                                               named_usage::unspecified>>
+{
 
-  template <typename BODY_IN, typename ReduceParams>
-  static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                               RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                               RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
-  exec(RAJA::resources::Resource res, const LaunchParams &params,
-       const char *kernel_name, BODY_IN &&body_in, ReduceParams &RAJA_UNUSED_ARG(launch_reducers))
+  template<typename BODY_IN, typename ReduceParams>
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
+  exec(RAJA::resources::Resource res,
+       const LaunchParams& params,
+       const char* kernel_name,
+       BODY_IN&& body_in,
+       ReduceParams& RAJA_UNUSED_ARG(launch_reducers))
   {
     using BODY = camp::decay<BODY_IN>;
 
-    auto func = reinterpret_cast<const void*>(
-        &launch_global_fcn<BODY>);
+    auto func = reinterpret_cast<const void*>(&launch_global_fcn<BODY>);
 
     resources::Cuda cuda_res = res.get<RAJA::resources::Cuda>();
 
@@ -84,18 +93,20 @@ struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, named_usa
     // Compute the number of blocks and threads
     //
 
-    cuda_dim_t gridSize{ static_cast<cuda_dim_member_t>(params.teams.value[0]),
+    cuda_dim_t gridSize {static_cast<cuda_dim_member_t>(params.teams.value[0]),
                          static_cast<cuda_dim_member_t>(params.teams.value[1]),
-                         static_cast<cuda_dim_member_t>(params.teams.value[2]) };
+                         static_cast<cuda_dim_member_t>(params.teams.value[2])};
 
-    cuda_dim_t blockSize{ static_cast<cuda_dim_member_t>(params.threads.value[0]),
-                          static_cast<cuda_dim_member_t>(params.threads.value[1]),
-                          static_cast<cuda_dim_member_t>(params.threads.value[2]) };
+    cuda_dim_t blockSize {
+        static_cast<cuda_dim_member_t>(params.threads.value[0]),
+        static_cast<cuda_dim_member_t>(params.threads.value[1]),
+        static_cast<cuda_dim_member_t>(params.threads.value[2])};
 
     // Only launch kernel if we have something to iterate over
     constexpr cuda_dim_member_t zero = 0;
-    if ( gridSize.x  > zero && gridSize.y  > zero && gridSize.z  > zero &&
-         blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) {
+    if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero &&
+        blockSize.x > zero && blockSize.y > zero && blockSize.z > zero)
+    {
 
       RAJA_FT_BEGIN;
 
@@ -105,14 +116,16 @@ struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, named_usa
         //
         // Privatize the loop_body, using make_launch_body to setup reductions
         //
-        BODY body = RAJA::cuda::make_launch_body(func,
-            gridSize, blockSize, shared_mem_size, cuda_res, std::forward<BODY_IN>(body_in));
+        BODY body = RAJA::cuda::make_launch_body(
+            func, gridSize, blockSize, shared_mem_size, cuda_res,
+            std::forward<BODY_IN>(body_in));
 
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&body};
-        RAJA::cuda::launch(func, gridSize, blockSize, args, shared_mem_size, cuda_res, async, kernel_name);
+        void* args[] = {(void*)&body};
+        RAJA::cuda::launch(func, gridSize, blockSize, args, shared_mem_size,
+                           cuda_res, async, kernel_name);
       }
 
       RAJA_FT_END;
@@ -121,13 +134,18 @@ struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, named_usa
     return resources::EventProxy<resources::Resource>(res);
   }
 
-  //Version with explicit reduction parameters..
-  template <typename BODY_IN, typename ReduceParams>
-  static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                               RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                               concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
-  exec(RAJA::resources::Resource res, const LaunchParams &launch_params,
-       const char *kernel_name, BODY_IN &&body_in, ReduceParams &launch_reducers)
+  // Version with explicit reduction parameters..
+  template<typename BODY_IN, typename ReduceParams>
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      concepts::negate<
+          RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
+  exec(RAJA::resources::Resource res,
+       const LaunchParams& launch_params,
+       const char* kernel_name,
+       BODY_IN&& body_in,
+       ReduceParams& launch_reducers)
   {
     using BODY = camp::decay<BODY_IN>;
 
@@ -140,46 +158,54 @@ struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, named_usa
     // Compute the number of blocks and threads
     //
 
-    cuda_dim_t gridSize{ static_cast<cuda_dim_member_t>(launch_params.teams.value[0]),
-                         static_cast<cuda_dim_member_t>(launch_params.teams.value[1]),
-                         static_cast<cuda_dim_member_t>(launch_params.teams.value[2]) };
+    cuda_dim_t gridSize {
+        static_cast<cuda_dim_member_t>(launch_params.teams.value[0]),
+        static_cast<cuda_dim_member_t>(launch_params.teams.value[1]),
+        static_cast<cuda_dim_member_t>(launch_params.teams.value[2])};
 
-    cuda_dim_t blockSize{ static_cast<cuda_dim_member_t>(launch_params.threads.value[0]),
-                          static_cast<cuda_dim_member_t>(launch_params.threads.value[1]),
-                          static_cast<cuda_dim_member_t>(launch_params.threads.value[2]) };
+    cuda_dim_t blockSize {
+        static_cast<cuda_dim_member_t>(launch_params.threads.value[0]),
+        static_cast<cuda_dim_member_t>(launch_params.threads.value[1]),
+        static_cast<cuda_dim_member_t>(launch_params.threads.value[2])};
 
     // Only launch kernel if we have something to iterate over
     constexpr cuda_dim_member_t zero = 0;
-    if ( gridSize.x  > zero && gridSize.y  > zero && gridSize.z  > zero &&
-         blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) {
+    if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero &&
+        blockSize.x > zero && blockSize.y > zero && blockSize.z > zero)
+    {
 
       RAJA_FT_BEGIN;
 
       size_t shared_mem_size = launch_params.shared_mem_size;
       RAJA::cuda::detail::cudaInfo launch_info;
-      launch_info.gridDim = gridSize;
-      launch_info.blockDim = blockSize;
+      launch_info.gridDim      = gridSize;
+      launch_info.blockDim     = blockSize;
       launch_info.dynamic_smem = &shared_mem_size;
-      launch_info.res = cuda_res;
+      launch_info.res          = cuda_res;
 
       {
-        using EXEC_POL = RAJA::policy::cuda::cuda_launch_explicit_t<async, named_usage::unspecified, named_usage::unspecified>;
-        RAJA::expt::ParamMultiplexer::init<EXEC_POL>(launch_reducers, launch_info);
+        using EXEC_POL = RAJA::policy::cuda::cuda_launch_explicit_t<
+            async, named_usage::unspecified, named_usage::unspecified>;
+        RAJA::expt::ParamMultiplexer::init<EXEC_POL>(launch_reducers,
+                                                     launch_info);
 
 
         //
         // Privatize the loop_body, using make_launch_body to setup reductions
         //
-        BODY body = RAJA::cuda::make_launch_body(func,
-            gridSize, blockSize, shared_mem_size, cuda_res, std::forward<BODY_IN>(body_in));
+        BODY body = RAJA::cuda::make_launch_body(
+            func, gridSize, blockSize, shared_mem_size, cuda_res,
+            std::forward<BODY_IN>(body_in));
 
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&body, (void*)&launch_reducers};
-        RAJA::cuda::launch(func, gridSize, blockSize, args, shared_mem_size, cuda_res, async, kernel_name);
+        void* args[] = {(void*)&body, (void*)&launch_reducers};
+        RAJA::cuda::launch(func, gridSize, blockSize, args, shared_mem_size,
+                           cuda_res, async, kernel_name);
 
-        RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(launch_reducers, launch_info);
+        RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(launch_reducers,
+                                                        launch_info);
       }
 
       RAJA_FT_END;
@@ -187,56 +213,65 @@ struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, named_usa
 
     return resources::EventProxy<resources::Resource>(res);
   }
-
 };
 
-
-template <typename BODY, int num_threads, size_t BLOCKS_PER_SM>
+template<typename BODY, int num_threads, size_t BLOCKS_PER_SM>
 __launch_bounds__(num_threads, BLOCKS_PER_SM) __global__
-void launch_global_fcn_fixed(BODY body_in)
+    void launch_global_fcn_fixed(BODY body_in)
 {
   LaunchContext ctx;
 
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
-  auto& body = privatizer.get_priv();
+  auto& body      = privatizer.get_priv();
 
-  //Set pointer to shared memory
+  // Set pointer to shared memory
   extern __shared__ char raja_shmem_ptr[];
   ctx.shared_mem_ptr = raja_shmem_ptr;
 
   body(ctx);
 }
 
-template <typename BODY, int num_threads, size_t BLOCKS_PER_SM, typename ReduceParams>
+template<typename BODY,
+         int num_threads,
+         size_t BLOCKS_PER_SM,
+         typename ReduceParams>
 __launch_bounds__(num_threads, BLOCKS_PER_SM) __global__
-void launch_new_reduce_global_fcn_fixed(BODY body_in, ReduceParams reduce_params)
+    void launch_new_reduce_global_fcn_fixed(BODY body_in,
+                                            ReduceParams reduce_params)
 {
   LaunchContext ctx;
 
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
-  auto& body = privatizer.get_priv();
+  auto& body      = privatizer.get_priv();
 
-  //Set pointer to shared memory
+  // Set pointer to shared memory
   extern __shared__ char raja_shmem_ptr[];
   ctx.shared_mem_ptr = raja_shmem_ptr;
 
-  RAJA::expt::invoke_body( reduce_params, body, ctx );
+  RAJA::expt::invoke_body(reduce_params, body, ctx);
 
-  //Using a flatten global policy as we may use all dimensions
-  RAJA::expt::ParamMultiplexer::combine<RAJA::cuda_flatten_global_xyz_direct>(reduce_params);
+  // Using a flatten global policy as we may use all dimensions
+  RAJA::expt::ParamMultiplexer::combine<RAJA::cuda_flatten_global_xyz_direct>(
+      reduce_params);
 }
 
-template <bool async, int nthreads, size_t BLOCKS_PER_SM>
-struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads, BLOCKS_PER_SM>> {
+template<bool async, int nthreads, size_t BLOCKS_PER_SM>
+struct LaunchExecute<
+    RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads, BLOCKS_PER_SM>>
+{
 
-  template <typename BODY_IN, typename ReduceParams>
-  static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                               RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                               RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
-  exec(RAJA::resources::Resource res, const LaunchParams &params,
-       const char *kernel_name, BODY_IN &&body_in, ReduceParams &RAJA_UNUSED_ARG(launch_reducers))
+  template<typename BODY_IN, typename ReduceParams>
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
+  exec(RAJA::resources::Resource res,
+       const LaunchParams& params,
+       const char* kernel_name,
+       BODY_IN&& body_in,
+       ReduceParams& RAJA_UNUSED_ARG(launch_reducers))
   {
 
     using BODY = camp::decay<BODY_IN>;
@@ -250,18 +285,20 @@ struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads,
     // Compute the number of blocks and threads
     //
 
-    cuda_dim_t gridSize{ static_cast<cuda_dim_member_t>(params.teams.value[0]),
+    cuda_dim_t gridSize {static_cast<cuda_dim_member_t>(params.teams.value[0]),
                          static_cast<cuda_dim_member_t>(params.teams.value[1]),
-                         static_cast<cuda_dim_member_t>(params.teams.value[2]) };
+                         static_cast<cuda_dim_member_t>(params.teams.value[2])};
 
-    cuda_dim_t blockSize{ static_cast<cuda_dim_member_t>(params.threads.value[0]),
-                          static_cast<cuda_dim_member_t>(params.threads.value[1]),
-                          static_cast<cuda_dim_member_t>(params.threads.value[2]) };
+    cuda_dim_t blockSize {
+        static_cast<cuda_dim_member_t>(params.threads.value[0]),
+        static_cast<cuda_dim_member_t>(params.threads.value[1]),
+        static_cast<cuda_dim_member_t>(params.threads.value[2])};
 
     // Only launch kernel if we have something to iterate over
     constexpr cuda_dim_member_t zero = 0;
-    if ( gridSize.x  > zero && gridSize.y  > zero && gridSize.z  > zero &&
-         blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) {
+    if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero &&
+        blockSize.x > zero && blockSize.y > zero && blockSize.z > zero)
+    {
 
       RAJA_FT_BEGIN;
 
@@ -271,14 +308,16 @@ struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads,
         //
         // Privatize the loop_body, using make_launch_body to setup reductions
         //
-        BODY body = RAJA::cuda::make_launch_body(func,
-            gridSize, blockSize, shared_mem_size, cuda_res, std::forward<BODY_IN>(body_in));
+        BODY body = RAJA::cuda::make_launch_body(
+            func, gridSize, blockSize, shared_mem_size, cuda_res,
+            std::forward<BODY_IN>(body_in));
 
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&body};
-        RAJA::cuda::launch(func, gridSize, blockSize, args, shared_mem_size, cuda_res, async, kernel_name);
+        void* args[] = {(void*)&body};
+        RAJA::cuda::launch(func, gridSize, blockSize, args, shared_mem_size,
+                           cuda_res, async, kernel_name);
       }
 
       RAJA_FT_END;
@@ -287,19 +326,25 @@ struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads,
     return resources::EventProxy<resources::Resource>(res);
   }
 
-  //Version with explicit reduction parameters..
+  // Version with explicit reduction parameters..
   template<typename BODY_IN, typename ReduceParams>
-    static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                                 RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                                 concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
-  exec(RAJA::resources::Resource res, const LaunchParams &launch_params,
-       const char *kernel_name, BODY_IN && body_in, ReduceParams &launch_reducers)
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      concepts::negate<
+          RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
+  exec(RAJA::resources::Resource res,
+       const LaunchParams& launch_params,
+       const char* kernel_name,
+       BODY_IN&& body_in,
+       ReduceParams& launch_reducers)
   {
 
     using BODY = camp::decay<BODY_IN>;
 
     auto func = reinterpret_cast<const void*>(
-        &launch_new_reduce_global_fcn_fixed<BODY, nthreads, BLOCKS_PER_SM, camp::decay<ReduceParams>>);
+        &launch_new_reduce_global_fcn_fixed<BODY, nthreads, BLOCKS_PER_SM,
+                                            camp::decay<ReduceParams>>);
 
     resources::Cuda cuda_res = res.get<RAJA::resources::Cuda>();
 
@@ -307,97 +352,111 @@ struct LaunchExecute<RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads,
     // Compute the number of blocks and threads
     //
 
-    cuda_dim_t gridSize{ static_cast<cuda_dim_member_t>(launch_params.teams.value[0]),
-                         static_cast<cuda_dim_member_t>(launch_params.teams.value[1]),
-                         static_cast<cuda_dim_member_t>(launch_params.teams.value[2]) };
+    cuda_dim_t gridSize {
+        static_cast<cuda_dim_member_t>(launch_params.teams.value[0]),
+        static_cast<cuda_dim_member_t>(launch_params.teams.value[1]),
+        static_cast<cuda_dim_member_t>(launch_params.teams.value[2])};
 
-    cuda_dim_t blockSize{ static_cast<cuda_dim_member_t>(launch_params.threads.value[0]),
-                          static_cast<cuda_dim_member_t>(launch_params.threads.value[1]),
-                          static_cast<cuda_dim_member_t>(launch_params.threads.value[2]) };
+    cuda_dim_t blockSize {
+        static_cast<cuda_dim_member_t>(launch_params.threads.value[0]),
+        static_cast<cuda_dim_member_t>(launch_params.threads.value[1]),
+        static_cast<cuda_dim_member_t>(launch_params.threads.value[2])};
 
 
     // Only launch kernel if we have something to iterate over
     constexpr cuda_dim_member_t zero = 0;
-    if ( gridSize.x  > zero && gridSize.y  > zero && gridSize.z  > zero &&
-         blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) {
+    if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero &&
+        blockSize.x > zero && blockSize.y > zero && blockSize.z > zero)
+    {
 
       RAJA_FT_BEGIN;
 
       size_t shared_mem_size = launch_params.shared_mem_size;
       RAJA::cuda::detail::cudaInfo launch_info;
-      launch_info.gridDim = gridSize;
-      launch_info.blockDim = blockSize;
+      launch_info.gridDim      = gridSize;
+      launch_info.blockDim     = blockSize;
       launch_info.dynamic_smem = &shared_mem_size;
-      launch_info.res = cuda_res;
+      launch_info.res          = cuda_res;
       {
 
-        using EXEC_POL = RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads, BLOCKS_PER_SM>;
-        RAJA::expt::ParamMultiplexer::init<EXEC_POL>(launch_reducers, launch_info);
+        using EXEC_POL =
+            RAJA::policy::cuda::cuda_launch_explicit_t<async, nthreads,
+                                                       BLOCKS_PER_SM>;
+        RAJA::expt::ParamMultiplexer::init<EXEC_POL>(launch_reducers,
+                                                     launch_info);
 
         //
         // Privatize the loop_body, using make_launch_body to setup reductions
         //
-        BODY body = RAJA::cuda::make_launch_body(func,
-            gridSize, blockSize, shared_mem_size, cuda_res, std::forward<BODY_IN>(body_in));
+        BODY body = RAJA::cuda::make_launch_body(
+            func, gridSize, blockSize, shared_mem_size, cuda_res,
+            std::forward<BODY_IN>(body_in));
 
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&body, (void*)&launch_reducers};
-        RAJA::cuda::launch(func, gridSize, blockSize, args, shared_mem_size, cuda_res, async, kernel_name);
+        void* args[] = {(void*)&body, (void*)&launch_reducers};
+        RAJA::cuda::launch(func, gridSize, blockSize, args, shared_mem_size,
+                           cuda_res, async, kernel_name);
 
-        RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(launch_reducers, launch_info);
+        RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(launch_reducers,
+                                                        launch_info);
       }
 
       RAJA_FT_END;
     }
     return resources::EventProxy<resources::Resource>(res);
   }
-
 };
 
-
 /*
    CUDA generic loop implementations
 */
-template <typename SEGMENT, typename IndexMapper>
-struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper>,
-                   SEGMENT> {
-
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
-
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE
-  void exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
-            SEGMENT const &segment,
-            BODY const &body)
+template<typename SEGMENT, typename IndexMapper>
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper>,
+    SEGMENT>
+{
+
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
+
+  template<typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      SEGMENT const& segment,
+      BODY const& body)
   {
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i = IndexMapper::template index<diff_t>();
+    const diff_t i   = IndexMapper::template index<diff_t>();
 
-    if (i < len) {
+    if (i < len)
+    {
       body(*(segment.begin() + i));
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1>,
-                   SEGMENT> {
+template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper0,
+                                     IndexMapper1>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+      SEGMENT const& segment0,
+      SEGMENT const& segment1,
+      BODY const& body)
   {
     const int len0 = segment0.end() - segment0.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -405,29 +464,36 @@ struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Dir
     const diff_t i0 = IndexMapper0::template index<diff_t>();
     const diff_t i1 = IndexMapper1::template index<diff_t>();
 
-    if (i0 < len0 && i1 < len1) {
+    if (i0 < len0 && i1 < len1)
+    {
       body(*(segment0.begin() + i0), *(segment1.begin() + i1));
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1,
-                                                  IndexMapper2>,
-                   SEGMENT> {
+template<typename SEGMENT,
+         typename IndexMapper0,
+         typename IndexMapper1,
+         typename IndexMapper2>
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper0,
+                                     IndexMapper1,
+                                     IndexMapper2>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+      SEGMENT const& segment0,
+      SEGMENT const& segment1,
+      SEGMENT const& segment2,
+      BODY const& body)
   {
     const int len0 = segment0.end() - segment0.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -437,53 +503,62 @@ struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Dir
     const diff_t i1 = IndexMapper1::template index<diff_t>();
     const diff_t i2 = IndexMapper2::template index<diff_t>();
 
-    if (i0 < len0 && i1 < len1 && i2 < len2) {
-      body(*(segment0.begin() + i0),
-           *(segment1.begin() + i1),
+    if (i0 < len0 && i1 < len1 && i2 < len2)
+    {
+      body(*(segment0.begin() + i0), *(segment1.begin() + i1),
            *(segment2.begin() + i2));
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper>
-struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper>,
-                   SEGMENT> {
+template<typename SEGMENT, typename IndexMapper>
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE
-  void exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
-            SEGMENT const &segment,
-            BODY const &body)
+  template<typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      SEGMENT const& segment,
+      BODY const& body)
   {
-    const diff_t len = segment.end() - segment.begin();
-    const diff_t i_init = IndexMapper::template index<diff_t>();
+    const diff_t len      = segment.end() - segment.begin();
+    const diff_t i_init   = IndexMapper::template index<diff_t>();
     const diff_t i_stride = IndexMapper::template size<diff_t>();
 
-    for (diff_t i = i_init; i < len; i += i_stride) {
+    for (diff_t i = i_init; i < len; i += i_stride)
+    {
       body(*(segment.begin() + i));
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1>,
-                   SEGMENT> {
+template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+      SEGMENT const& segment0,
+      SEGMENT const& segment1,
+      BODY const& body)
   {
     const int len0 = segment0.end() - segment0.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -494,34 +569,42 @@ struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Str
     const diff_t i0_stride = IndexMapper0::template size<diff_t>();
     const diff_t i1_stride = IndexMapper1::template size<diff_t>();
 
-    for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride) {
+    for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride)
+    {
 
-      for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride) {
+      for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride)
+      {
 
-        body(*(segment0.begin() + i0),
-             *(segment1.begin() + i1));
+        body(*(segment0.begin() + i0), *(segment1.begin() + i1));
       }
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1,
-                                                  IndexMapper2>,
-                   SEGMENT> {
+template<typename SEGMENT,
+         typename IndexMapper0,
+         typename IndexMapper1,
+         typename IndexMapper2>
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1,
+        IndexMapper2>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+      SEGMENT const& segment0,
+      SEGMENT const& segment1,
+      SEGMENT const& segment2,
+      BODY const& body)
   {
     const int len0 = segment0.end() - segment0.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -535,14 +618,16 @@ struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Str
     const diff_t i1_stride = IndexMapper1::template size<diff_t>();
     const diff_t i2_stride = IndexMapper2::template size<diff_t>();
 
-    for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride) {
+    for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride)
+    {
 
-      for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride) {
+      for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride)
+      {
 
-        for (diff_t i2 = i2_init; i2 < len2; i2 += i2_stride) {
+        for (diff_t i2 = i2_init; i2 < len2; i2 += i2_stride)
+        {
 
-          body(*(segment0.begin() + i0),
-               *(segment1.begin() + i1),
+          body(*(segment0.begin() + i0), *(segment1.begin() + i1),
                *(segment2.begin() + i2));
         }
       }
@@ -550,43 +635,51 @@ struct LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Str
   }
 };
 
-template <typename SEGMENT, typename IndexMapper>
-struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper>,
-                         SEGMENT> {
+template<typename SEGMENT, typename IndexMapper>
+struct LoopICountExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+      SEGMENT const& segment,
+      BODY const& body)
   {
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i = IndexMapper::template index<diff_t>();
+    const diff_t i   = IndexMapper::template index<diff_t>();
 
-    if (i < len) {
+    if (i < len)
+    {
       body(*(segment.begin() + i), i);
     }
   }
 };
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1>,
-                         SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
+struct LoopICountExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper0,
+                                     IndexMapper1>,
+    SEGMENT>
+{
 
-  template <typename BODY>
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
+
+  template<typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+      SEGMENT const& segment0,
+      SEGMENT const& segment1,
+      BODY const& body)
   {
     const int len0 = segment0.end() - segment0.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -594,31 +687,36 @@ struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mappin
     const diff_t i0 = IndexMapper0::template index<diff_t>();
     const diff_t i1 = IndexMapper1::template index<diff_t>();
 
-    if (i0 < len0 && i1 < len1) {
-      body(*(segment0.begin() + i0),
-           *(segment1.begin() + i1),
-           i0, i1);
+    if (i0 < len0 && i1 < len1)
+    {
+      body(*(segment0.begin() + i0), *(segment1.begin() + i1), i0, i1);
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1,
-                                                        IndexMapper2>,
-                         SEGMENT> {
+template<typename SEGMENT,
+         typename IndexMapper0,
+         typename IndexMapper1,
+         typename IndexMapper2>
+struct LoopICountExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper0,
+                                     IndexMapper1,
+                                     IndexMapper2>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+      SEGMENT const& segment0,
+      SEGMENT const& segment1,
+      SEGMENT const& segment2,
+      BODY const& body)
   {
     const int len0 = segment0.end() - segment0.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -628,54 +726,62 @@ struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mappin
     const diff_t i1 = IndexMapper1::template index<diff_t>();
     const diff_t i2 = IndexMapper2::template index<diff_t>();
 
-    if (i0 < len0 && i1 < len1 && i2 < len2) {
-      body(*(segment0.begin() + i0),
-           *(segment1.begin() + i1),
-           *(segment2.begin() + i2),
-           i0, i1, i2);
+    if (i0 < len0 && i1 < len1 && i2 < len2)
+    {
+      body(*(segment0.begin() + i0), *(segment1.begin() + i1),
+           *(segment2.begin() + i2), i0, i1, i2);
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper>
-struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper>,
-                         SEGMENT> {
+template<typename SEGMENT, typename IndexMapper>
+struct LoopICountExecute<
+    RAJA::policy::cuda::cuda_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+      SEGMENT const& segment,
+      BODY const& body)
   {
-    const diff_t len = segment.end() - segment.begin();
-    const diff_t i_init = IndexMapper::template index<diff_t>();
+    const diff_t len      = segment.end() - segment.begin();
+    const diff_t i_init   = IndexMapper::template index<diff_t>();
     const diff_t i_stride = IndexMapper::template size<diff_t>();
 
-    for (diff_t i = i_init; i < len; i += i_stride) {
+    for (diff_t i = i_init; i < len; i += i_stride)
+    {
       body(*(segment.begin() + i), i);
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1>,
-                         SEGMENT> {
+template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
+struct LoopICountExecute<
+    RAJA::policy::cuda::cuda_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+      SEGMENT const& segment0,
+      SEGMENT const& segment1,
+      BODY const& body)
   {
     const int len0 = segment0.end() - segment0.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -686,35 +792,42 @@ struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mappin
     const diff_t i0_stride = IndexMapper0::template size<diff_t>();
     const diff_t i1_stride = IndexMapper1::template size<diff_t>();
 
-    for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride) {
+    for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride)
+    {
 
-      for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride) {
+      for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride)
+      {
 
-        body(*(segment0.begin() + i0),
-             *(segment1.begin() + i1),
-             i0, i1);
+        body(*(segment0.begin() + i0), *(segment1.begin() + i1), i0, i1);
       }
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1,
-                                                        IndexMapper2>,
-                         SEGMENT> {
+template<typename SEGMENT,
+         typename IndexMapper0,
+         typename IndexMapper1,
+         typename IndexMapper2>
+struct LoopICountExecute<
+    RAJA::policy::cuda::cuda_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1,
+        IndexMapper2>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+      SEGMENT const& segment0,
+      SEGMENT const& segment1,
+      SEGMENT const& segment2,
+      BODY const& body)
   {
     const int len0 = segment0.end() - segment0.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -728,51 +841,54 @@ struct LoopICountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mappin
     const diff_t i1_stride = IndexMapper1::template size<diff_t>();
     const diff_t i2_stride = IndexMapper2::template size<diff_t>();
 
-    for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride) {
+    for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride)
+    {
 
-      for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride) {
+      for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride)
+      {
 
-        for (diff_t i2 = i2_init; i2 < len2; i2 += i2_stride) {
+        for (diff_t i2 = i2_init; i2 < len2; i2 += i2_stride)
+        {
 
-          body(*(segment0.begin() + i0),
-               *(segment1.begin() + i1),
-               *(segment2.begin() + i2),
-               i0, i1, i2);
+          body(*(segment0.begin() + i0), *(segment1.begin() + i1),
+               *(segment2.begin() + i2), i0, i1, i2);
         }
       }
     }
   }
 };
 
-
 /*
    CUDA generic flattened loop implementations
 */
 template<typename SEGMENT, kernel_sync_requirement sync, typename IndexMapper0>
-struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapping::Direct,
-                                                          sync,
-                                                          IndexMapper0>,
-                   SEGMENT>
-    :  LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
-                                                  sync,
-                                                  IndexMapper0>,
-                   SEGMENT>
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapping::Direct,
+                                             sync,
+                                             IndexMapper0>,
+    SEGMENT>
+    : LoopExecute<
+          RAJA::policy::cuda::
+              cuda_indexer<RAJA::iteration_mapping::Direct, sync, IndexMapper0>,
+          SEGMENT>
 {};
 
 template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapping::Direct,
-                                                          kernel_sync_requirement::none,
-                                                          IndexMapper0,
-                                                          IndexMapper1>,
-                   SEGMENT>
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapping::Direct,
+                                             kernel_sync_requirement::none,
+                                             IndexMapper0,
+                                             IndexMapper1>,
+    SEGMENT>
 {
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template<typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+      SEGMENT const& segment,
+      BODY const& body)
   {
     const int len = segment.end() - segment.begin();
 
@@ -781,29 +897,35 @@ struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapp
 
     const diff_t i0_stride = IndexMapper0::template size<diff_t>();
 
-    const int i = i0 + i0_stride*i1;
+    const int i = i0 + i0_stride * i1;
 
-    if (i < len) {
+    if (i < len)
+    {
       body(*(segment.begin() + i));
     }
   }
 };
 
-template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapping::Direct,
-                                                          kernel_sync_requirement::none,
-                                                          IndexMapper0,
-                                                          IndexMapper1,
-                                                          IndexMapper2>,
-                   SEGMENT>
+template<typename SEGMENT,
+         typename IndexMapper0,
+         typename IndexMapper1,
+         typename IndexMapper2>
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapping::Direct,
+                                             kernel_sync_requirement::none,
+                                             IndexMapper0,
+                                             IndexMapper1,
+                                             IndexMapper2>,
+    SEGMENT>
 {
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template<typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+      SEGMENT const& segment,
+      BODY const& body)
   {
     const int len = segment.end() - segment.begin();
 
@@ -814,39 +936,47 @@ struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapp
     const diff_t i0_stride = IndexMapper0::template size<diff_t>();
     const diff_t i1_stride = IndexMapper1::template size<diff_t>();
 
-    const int i = i0 + i0_stride*(i1 + i1_stride*i2);
+    const int i = i0 + i0_stride * (i1 + i1_stride * i2);
 
-    if (i < len) {
+    if (i < len)
+    {
       body(*(segment.begin() + i));
     }
   }
 };
 
 template<typename SEGMENT, kernel_sync_requirement sync, typename IndexMapper0>
-struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                          sync,
-                                                          IndexMapper0>,
-                   SEGMENT>
-    :  LoopExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                  sync,
-                                                  IndexMapper0>,
-                   SEGMENT>
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_flatten_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        sync,
+        IndexMapper0>,
+    SEGMENT>
+    : LoopExecute<
+          RAJA::policy::cuda::cuda_indexer<
+              RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+              sync,
+              IndexMapper0>,
+          SEGMENT>
 {};
 
 template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                          kernel_sync_requirement::none,
-                                                          IndexMapper0,
-                                                          IndexMapper1>,
-                   SEGMENT>
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_flatten_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1>,
+    SEGMENT>
 {
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template<typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+      SEGMENT const& segment,
+      BODY const& body)
   {
     const int len = segment.end() - segment.begin();
 
@@ -856,29 +986,34 @@ struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapp
     const int i0_stride = IndexMapper0::template size<diff_t>();
     const int i1_stride = IndexMapper1::template size<diff_t>();
 
-    for (int i = i0 + i0_stride*i1;
-         i < len;
-         i += i0_stride*i1_stride) {
+    for (int i = i0 + i0_stride * i1; i < len; i += i0_stride * i1_stride)
+    {
       body(*(segment.begin() + i));
     }
   }
 };
 
-template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                          kernel_sync_requirement::none,
-                                                          IndexMapper0,
-                                                          IndexMapper1,
-                                                          IndexMapper2>,
-                   SEGMENT>
+template<typename SEGMENT,
+         typename IndexMapper0,
+         typename IndexMapper1,
+         typename IndexMapper2>
+struct LoopExecute<
+    RAJA::policy::cuda::cuda_flatten_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1,
+        IndexMapper2>,
+    SEGMENT>
 {
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template<typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+      SEGMENT const& segment,
+      BODY const& body)
   {
     const int len = segment.end() - segment.begin();
 
@@ -890,114 +1025,134 @@ struct LoopExecute<RAJA::policy::cuda::cuda_flatten_indexer<RAJA::iteration_mapp
     const int i1_stride = IndexMapper1::template size<diff_t>();
     const int i2_stride = IndexMapper2::template size<diff_t>();
 
-    for (int i = i0 + i0_stride*(i1 + i1_stride*i2);
-         i < len;
-         i += i0_stride*i1_stride*i2_stride) {
+    for (int i = i0 + i0_stride * (i1 + i1_stride * i2); i < len;
+         i += i0_stride * i1_stride * i2_stride)
+    {
       body(*(segment.begin() + i));
     }
   }
 };
 
-
 /*
    CUDA generic tile implementations
 */
-template <typename SEGMENT, typename IndexMapper>
-struct TileExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper>,
-                   SEGMENT> {
+template<typename SEGMENT, typename IndexMapper>
+struct TileExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template <typename TILE_T, typename BODY>
+  template<typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
       TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+      SEGMENT const& segment,
+      BODY const& body)
   {
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i = IndexMapper::template index<diff_t>() * static_cast<diff_t>(tile_size);
+    const diff_t i =
+        IndexMapper::template index<diff_t>() * static_cast<diff_t>(tile_size);
 
-    if (i < len) {
+    if (i < len)
+    {
       body(segment.slice(i, static_cast<diff_t>(tile_size)));
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper>
-struct TileExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper>,
-                   SEGMENT> {
+template<typename SEGMENT, typename IndexMapper>
+struct TileExecute<
+    RAJA::policy::cuda::cuda_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template <typename TILE_T, typename BODY>
+  template<typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
       TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+      SEGMENT const& segment,
+      BODY const& body)
   {
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i_init = IndexMapper::template index<diff_t>() * static_cast<diff_t>(tile_size);
-    const diff_t i_stride = IndexMapper::template size<diff_t>() * static_cast<diff_t>(tile_size);
+    const diff_t i_init =
+        IndexMapper::template index<diff_t>() * static_cast<diff_t>(tile_size);
+    const diff_t i_stride =
+        IndexMapper::template size<diff_t>() * static_cast<diff_t>(tile_size);
 
-    for (diff_t i = i_init; i < len; i += i_stride) {
+    for (diff_t i = i_init; i < len; i += i_stride)
+    {
       body(segment.slice(i, static_cast<diff_t>(tile_size)));
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper>
-struct TileTCountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper>,
-                         SEGMENT> {
+template<typename SEGMENT, typename IndexMapper>
+struct TileTCountExecute<
+    RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::Direct,
+                                     kernel_sync_requirement::none,
+                                     IndexMapper>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template <typename TILE_T, typename BODY>
+  template<typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
       TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+      SEGMENT const& segment,
+      BODY const& body)
   {
     const diff_t len = segment.end() - segment.begin();
-    const diff_t t = IndexMapper::template index<diff_t>();
-    const diff_t i = t * static_cast<diff_t>(tile_size);
+    const diff_t t   = IndexMapper::template index<diff_t>();
+    const diff_t i   = t * static_cast<diff_t>(tile_size);
 
-    if (i < len) {
+    if (i < len)
+    {
       body(segment.slice(i, static_cast<diff_t>(tile_size)), t);
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper>
-struct TileTCountExecute<RAJA::policy::cuda::cuda_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper>,
-                         SEGMENT> {
+template<typename SEGMENT, typename IndexMapper>
+struct TileTCountExecute<
+    RAJA::policy::cuda::cuda_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template <typename TILE_T, typename BODY>
+  template<typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
       TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+      SEGMENT const& segment,
+      BODY const& body)
   {
-    const diff_t len = segment.end() - segment.begin();
-    const diff_t t_init = IndexMapper::template index<diff_t>();
-    const diff_t i_init = t_init * static_cast<diff_t>(tile_size);
+    const diff_t len      = segment.end() - segment.begin();
+    const diff_t t_init   = IndexMapper::template index<diff_t>();
+    const diff_t i_init   = t_init * static_cast<diff_t>(tile_size);
     const diff_t t_stride = IndexMapper::template size<diff_t>();
     const diff_t i_stride = t_stride * static_cast<diff_t>(tile_size);
 
-    for (diff_t i = i_init, t = t_init; i < len; i += i_stride, t += t_stride) {
+    for (diff_t i = i_init, t = t_init; i < len; i += i_stride, t += t_stride)
+    {
       body(segment.slice(i, static_cast<diff_t>(tile_size)), t);
     }
   }
diff --git a/include/RAJA/policy/cuda/multi_reduce.hpp b/include/RAJA/policy/cuda/multi_reduce.hpp
index f9f60f730e..d70fac27a8 100644
--- a/include/RAJA/policy/cuda/multi_reduce.hpp
+++ b/include/RAJA/policy/cuda/multi_reduce.hpp
@@ -46,9 +46,9 @@
 #include "RAJA/policy/cuda/intrinsics.hpp"
 
 #if defined(RAJA_ENABLE_DESUL_ATOMICS)
-  #include "RAJA/policy/desul/atomic.hpp"
+#include "RAJA/policy/desul/atomic.hpp"
 #else
-  #include "RAJA/policy/cuda/atomic.hpp"
+#include "RAJA/policy/cuda/atomic.hpp"
 #endif
 
 #include "RAJA/policy/cuda/policy.hpp"
@@ -73,100 +73,123 @@ namespace impl
 //
 
 //! combine value into global memory
-template <typename Combiner, typename GetTallyIndex,
-          typename T, typename GetTallyOffset>
-RAJA_DEVICE RAJA_INLINE void block_multi_reduce_combine_global_atomic(int RAJA_UNUSED_ARG(num_bins),
-                                                                      T identity,
-                                                                      int bin,
-                                                                      T value,
-                                                                      T* tally_mem,
-                                                                      GetTallyOffset get_tally_offset,
-                                                                      int tally_replication,
-                                                                      int tally_bins)
+template<typename Combiner,
+         typename GetTallyIndex,
+         typename T,
+         typename GetTallyOffset>
+RAJA_DEVICE RAJA_INLINE void block_multi_reduce_combine_global_atomic(
+    int RAJA_UNUSED_ARG(num_bins),
+    T identity,
+    int bin,
+    T value,
+    T* tally_mem,
+    GetTallyOffset get_tally_offset,
+    int tally_replication,
+    int tally_bins)
 {
-  if (value == identity) { return; }
+  if (value == identity)
+  {
+    return;
+  }
 
-  int tally_index = GetTallyIndex::template index<int>(); // globalWarpId by default
+  int tally_index =
+      GetTallyIndex::template index<int>();  // globalWarpId by default
   int tally_rep = ::RAJA::power_of_2_mod(tally_index, tally_replication);
-  int tally_offset = get_tally_offset(bin, tally_bins, tally_rep, tally_replication);
-  RAJA::reduce::cuda::atomic<Combiner>{}(tally_mem[tally_offset], value);
+  int tally_offset =
+      get_tally_offset(bin, tally_bins, tally_rep, tally_replication);
+  RAJA::reduce::cuda::atomic<Combiner> {}(tally_mem[tally_offset], value);
 }
 
-
 //! initialize shared memory
-template <typename T>
-RAJA_DEVICE RAJA_INLINE void block_multi_reduce_init_shmem(int num_bins,
-                                                           T identity,
-                                                           T* shared_mem,
-                                                           int shared_replication)
+template<typename T>
+RAJA_DEVICE RAJA_INLINE void block_multi_reduce_init_shmem(
+    int num_bins,
+    T identity,
+    T* shared_mem,
+    int shared_replication)
 {
   int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                  (blockDim.x * blockDim.y) * threadIdx.z;
   int numThreads = blockDim.x * blockDim.y * blockDim.z;
 
   for (int shmem_offset = threadId;
-       shmem_offset < shared_replication * num_bins;
-       shmem_offset += numThreads) {
+       shmem_offset < shared_replication * num_bins; shmem_offset += numThreads)
+  {
     shared_mem[shmem_offset] = identity;
   }
   __syncthreads();
 }
 
 //! combine value into shared memory
-template <typename Combiner, typename GetSharedIndex,
-          typename T, typename GetSharedOffset>
-RAJA_DEVICE RAJA_INLINE void block_multi_reduce_combine_shmem_atomic(int num_bins,
-                                                                     T identity,
-                                                                     int bin,
-                                                                     T value,
-                                                                     T* shared_mem,
-                                                                     GetSharedOffset get_shared_offset,
-                                                                     int shared_replication)
+template<typename Combiner,
+         typename GetSharedIndex,
+         typename T,
+         typename GetSharedOffset>
+RAJA_DEVICE RAJA_INLINE void block_multi_reduce_combine_shmem_atomic(
+    int num_bins,
+    T identity,
+    int bin,
+    T value,
+    T* shared_mem,
+    GetSharedOffset get_shared_offset,
+    int shared_replication)
 {
-  if (value == identity) { return; }
+  if (value == identity)
+  {
+    return;
+  }
 
-  int shared_index = GetSharedIndex::template index<int>(); // threadId by default
+  int shared_index =
+      GetSharedIndex::template index<int>();  // threadId by default
   int shared_rep = ::RAJA::power_of_2_mod(shared_index, shared_replication);
-  int shmem_offset = get_shared_offset(bin, num_bins, shared_rep, shared_replication);
+  int shmem_offset =
+      get_shared_offset(bin, num_bins, shared_rep, shared_replication);
 
-  RAJA::reduce::cuda::atomic<Combiner>{}(shared_mem[shmem_offset], value);
+  RAJA::reduce::cuda::atomic<Combiner> {}(shared_mem[shmem_offset], value);
 }
 
 //! combine value into shared memory
-template <typename Combiner,
-          typename T, typename GetSharedOffset, typename GetTallyOffset>
-RAJA_DEVICE RAJA_INLINE void grid_multi_reduce_shmem_to_global_atomic(int num_bins,
-                                                                      T identity,
-                                                                      T* shared_mem,
-                                                                      GetSharedOffset get_shared_offset,
-                                                                      int shared_replication,
-                                                                      T* tally_mem,
-                                                                      GetTallyOffset get_tally_offset,
-                                                                      int tally_replication,
-                                                                      int tally_bins)
+template<typename Combiner,
+         typename T,
+         typename GetSharedOffset,
+         typename GetTallyOffset>
+RAJA_DEVICE RAJA_INLINE void grid_multi_reduce_shmem_to_global_atomic(
+    int num_bins,
+    T identity,
+    T* shared_mem,
+    GetSharedOffset get_shared_offset,
+    int shared_replication,
+    T* tally_mem,
+    GetTallyOffset get_tally_offset,
+    int tally_replication,
+    int tally_bins)
 {
   int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                  (blockDim.x * blockDim.y) * threadIdx.z;
   int numThreads = blockDim.x * blockDim.y * blockDim.z;
 
   int blockId = blockIdx.x + gridDim.x * blockIdx.y +
-                 (gridDim.x * gridDim.y) * blockIdx.z;
+                (gridDim.x * gridDim.y) * blockIdx.z;
 
   __syncthreads();
-  for (int bin = threadId; bin < num_bins; bin += numThreads) {
+  for (int bin = threadId; bin < num_bins; bin += numThreads)
+  {
 
     T value = identity;
-    for (int shared_rep = 0; shared_rep < shared_replication; ++shared_rep) {
-      int shmem_offset = get_shared_offset(bin, num_bins, shared_rep, shared_replication);
-      Combiner{}(value, shared_mem[shmem_offset]);
+    for (int shared_rep = 0; shared_rep < shared_replication; ++shared_rep)
+    {
+      int shmem_offset =
+          get_shared_offset(bin, num_bins, shared_rep, shared_replication);
+      Combiner {}(value, shared_mem[shmem_offset]);
     }
 
-    if (value != identity) {
+    if (value != identity)
+    {
       int tally_rep = ::RAJA::power_of_2_mod(blockId, tally_replication);
-      int tally_offset = get_tally_offset(bin, tally_bins, tally_rep, tally_replication);
-      RAJA::reduce::cuda::atomic<Combiner>{}(tally_mem[tally_offset], value);
+      int tally_offset =
+          get_tally_offset(bin, tally_bins, tally_rep, tally_replication);
+      RAJA::reduce::cuda::atomic<Combiner> {}(tally_mem[tally_offset], value);
     }
-
   }
 }
 
@@ -181,52 +204,66 @@ RAJA_DEVICE RAJA_INLINE void grid_multi_reduce_shmem_to_global_atomic(int num_bi
 //
 
 //! MultiReduction data for Cuda Offload -- stores value, host pointer
-template <typename Combiner, typename T, typename tuning>
+template<typename Combiner, typename T, typename tuning>
 struct MultiReduceGridAtomicHostInit_TallyData
 {
   //! setup permanent settings, allocate and initialize tally memory
-  template < typename Container >
-  MultiReduceGridAtomicHostInit_TallyData(Container const& container, T const& identity)
-      : m_tally_mem(nullptr)
-      , m_identity(identity)
-      , m_num_bins(container.size())
-      , m_tally_bins(get_tally_bins(m_num_bins))
-      , m_tally_replication(get_tally_replication())
+  template<typename Container>
+  MultiReduceGridAtomicHostInit_TallyData(Container const& container,
+                                          T const& identity)
+      : m_tally_mem(nullptr),
+        m_identity(identity),
+        m_num_bins(container.size()),
+        m_tally_bins(get_tally_bins(m_num_bins)),
+        m_tally_replication(get_tally_replication())
   {
-    m_tally_mem = create_tally(container, identity, m_num_bins, m_tally_bins, m_tally_replication);
+    m_tally_mem = create_tally(container, identity, m_num_bins, m_tally_bins,
+                               m_tally_replication);
   }
 
   MultiReduceGridAtomicHostInit_TallyData() = delete;
-  MultiReduceGridAtomicHostInit_TallyData(MultiReduceGridAtomicHostInit_TallyData const&) = default;
-  MultiReduceGridAtomicHostInit_TallyData(MultiReduceGridAtomicHostInit_TallyData &&) = delete;
-  MultiReduceGridAtomicHostInit_TallyData& operator=(MultiReduceGridAtomicHostInit_TallyData const&) = default;
-  MultiReduceGridAtomicHostInit_TallyData& operator=(MultiReduceGridAtomicHostInit_TallyData &&) = delete;
-  ~MultiReduceGridAtomicHostInit_TallyData() = default;
-
+  MultiReduceGridAtomicHostInit_TallyData(
+      MultiReduceGridAtomicHostInit_TallyData const&) = default;
+  MultiReduceGridAtomicHostInit_TallyData(
+      MultiReduceGridAtomicHostInit_TallyData&&) = delete;
+  MultiReduceGridAtomicHostInit_TallyData& operator=(
+      MultiReduceGridAtomicHostInit_TallyData const&) = default;
+  MultiReduceGridAtomicHostInit_TallyData& operator=(
+      MultiReduceGridAtomicHostInit_TallyData&&) = delete;
+  ~MultiReduceGridAtomicHostInit_TallyData()     = default;
 
   //! reset permanent settings, reallocate and reset tally memory
-  template < typename Container >
+  template<typename Container>
   void reset_permanent(Container const& container, T const& identity)
   {
     int new_num_bins = container.size();
-    if (new_num_bins != m_num_bins) {
+    if (new_num_bins != m_num_bins)
+    {
       teardown_permanent();
-      m_num_bins = new_num_bins;
-      m_tally_bins = get_tally_bins(m_num_bins);
+      m_num_bins          = new_num_bins;
+      m_tally_bins        = get_tally_bins(m_num_bins);
       m_tally_replication = get_tally_replication();
-      m_tally_mem = create_tally(container, identity, m_num_bins, m_tally_bins, m_tally_replication);
-    } else {
+      m_tally_mem = create_tally(container, identity, m_num_bins, m_tally_bins,
+                                 m_tally_replication);
+    }
+    else
+    {
       {
         int tally_rep = 0;
-        int bin = 0;
-        for (auto const& value : container) {
-          m_tally_mem[GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication)] = value;
+        int bin       = 0;
+        for (auto const& value : container)
+        {
+          m_tally_mem[GetTallyOffset {}(bin, m_tally_bins, tally_rep,
+                                        m_tally_replication)] = value;
           ++bin;
         }
       }
-      for (int tally_rep = 1; tally_rep < m_tally_replication; ++tally_rep) {
-        for (int bin = 0; bin < m_num_bins; ++bin) {
-          m_tally_mem[GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication)] = identity;
+      for (int tally_rep = 1; tally_rep < m_tally_replication; ++tally_rep)
+      {
+        for (int bin = 0; bin < m_num_bins; ++bin)
+        {
+          m_tally_mem[GetTallyOffset {}(bin, m_tally_bins, tally_rep,
+                                        m_tally_replication)] = identity;
         }
       }
     }
@@ -239,39 +276,45 @@ struct MultiReduceGridAtomicHostInit_TallyData
     destroy_tally(m_tally_mem, m_num_bins, m_tally_bins, m_tally_replication);
   }
 
-
   //! get value for bin, assumes synchronization occurred elsewhere
   T get(int bin) const
   {
     ::RAJA::detail::HighAccuracyReduce<T, typename Combiner::operator_type>
-          reducer(m_identity);
-    for (int tally_rep = 0; tally_rep < m_tally_replication; ++tally_rep) {
-      int tally_offset = GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication);
+        reducer(m_identity);
+    for (int tally_rep = 0; tally_rep < m_tally_replication; ++tally_rep)
+    {
+      int tally_offset =
+          GetTallyOffset {}(bin, m_tally_bins, tally_rep, m_tally_replication);
       reducer.combine(m_tally_mem[tally_offset]);
     }
     return reducer.get_and_clear();
   }
 
-
   int num_bins() const { return m_num_bins; }
 
   T identity() const { return m_identity; }
 
 private:
-  static constexpr size_t s_tally_alignment = std::max(size_t(policy::cuda::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE),
-                                                       size_t(RAJA::DATA_ALIGN));
-  static constexpr size_t s_tally_bunch_size = RAJA_DIVIDE_CEILING_INT(s_tally_alignment, sizeof(T));
+  static constexpr size_t s_tally_alignment = std::max(
+      size_t(
+          policy::cuda::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE),
+      size_t(RAJA::DATA_ALIGN));
+  static constexpr size_t s_tally_bunch_size =
+      RAJA_DIVIDE_CEILING_INT(s_tally_alignment, sizeof(T));
 
   using tally_mempool_type = device_pinned_mempool_type;
-  using tally_tuning = typename tuning::GlobalAtomicReplicationTuning;
-  using TallyAtomicReplicationConcretizer = typename tally_tuning::AtomicReplicationConcretizer;
+  using tally_tuning       = typename tuning::GlobalAtomicReplicationTuning;
+  using TallyAtomicReplicationConcretizer =
+      typename tally_tuning::AtomicReplicationConcretizer;
   using GetTallyOffset_rebind_rebunch = typename tally_tuning::OffsetCalculator;
-  using GetTallyOffset_rebind = typename GetTallyOffset_rebind_rebunch::template rebunch<s_tally_bunch_size>;
-
+  using GetTallyOffset_rebind =
+      typename GetTallyOffset_rebind_rebunch::template rebunch<
+          s_tally_bunch_size>;
 
   static int get_tally_bins(int num_bins)
   {
-    return RAJA_DIVIDE_CEILING_INT(num_bins, s_tally_bunch_size) * s_tally_bunch_size;
+    return RAJA_DIVIDE_CEILING_INT(num_bins, s_tally_bunch_size) *
+           s_tally_bunch_size;
   }
 
   static int get_tally_replication()
@@ -281,39 +324,50 @@ struct MultiReduceGridAtomicHostInit_TallyData
     min_tally_replication = omp_get_max_threads();
 #endif
 
-    struct {
+    struct
+    {
       int func_min_global_replication;
-    } func_data{min_tally_replication};
+    } func_data {min_tally_replication};
 
-    return TallyAtomicReplicationConcretizer{}.template
-        get_global_replication<int>(func_data);
+    return TallyAtomicReplicationConcretizer {}
+        .template get_global_replication<int>(func_data);
   }
 
-  template < typename Container >
-  static T* create_tally(Container const& container, T const& identity,
-                         int num_bins, int tally_bins, int tally_replication)
+  template<typename Container>
+  static T* create_tally(Container const& container,
+                         T const& identity,
+                         int num_bins,
+                         int tally_bins,
+                         int tally_replication)
   {
-    if (num_bins == size_t(0)) {
+    if (num_bins == size_t(0))
+    {
       return nullptr;
     }
 
     T* tally_mem = tally_mempool_type::getInstance().template malloc<T>(
-        tally_replication*tally_bins, s_tally_alignment);
+        tally_replication * tally_bins, s_tally_alignment);
 
-    if (tally_replication > 0) {
+    if (tally_replication > 0)
+    {
       {
         int tally_rep = 0;
-        int bin = 0;
-        for (auto const& value : container) {
-          int tally_offset = GetTallyOffset{}(bin, tally_bins, tally_rep, tally_replication);
-          new(&tally_mem[tally_offset]) T(value);
+        int bin       = 0;
+        for (auto const& value : container)
+        {
+          int tally_offset =
+              GetTallyOffset {}(bin, tally_bins, tally_rep, tally_replication);
+          new (&tally_mem[tally_offset]) T(value);
           ++bin;
         }
       }
-      for (int tally_rep = 1; tally_rep < tally_replication; ++tally_rep) {
-        for (int bin = 0; bin < num_bins; ++bin) {
-          int tally_offset = GetTallyOffset{}(bin, tally_bins, tally_rep, tally_replication);
-          new(&tally_mem[tally_offset]) T(identity);
+      for (int tally_rep = 1; tally_rep < tally_replication; ++tally_rep)
+      {
+        for (int bin = 0; bin < num_bins; ++bin)
+        {
+          int tally_offset =
+              GetTallyOffset {}(bin, tally_bins, tally_rep, tally_replication);
+          new (&tally_mem[tally_offset]) T(identity);
         }
       }
     }
@@ -321,15 +375,21 @@ struct MultiReduceGridAtomicHostInit_TallyData
   }
 
   static void destroy_tally(T*& tally_mem,
-                            int num_bins, int tally_bins, int tally_replication)
+                            int num_bins,
+                            int tally_bins,
+                            int tally_replication)
   {
-    if (num_bins == size_t(0)) {
+    if (num_bins == size_t(0))
+    {
       return;
     }
 
-    for (int tally_rep = tally_replication+1; tally_rep > 0; --tally_rep) {
-      for (int bin = num_bins; bin > 0; --bin) {
-        int tally_offset = GetTallyOffset{}(bin-1, tally_bins, tally_rep-1, tally_replication);
+    for (int tally_rep = tally_replication + 1; tally_rep > 0; --tally_rep)
+    {
+      for (int bin = num_bins; bin > 0; --bin)
+      {
+        int tally_offset = GetTallyOffset {}(bin - 1, tally_bins, tally_rep - 1,
+                                             tally_replication);
         tally_mem[tally_offset].~T();
       }
     }
@@ -338,60 +398,54 @@ struct MultiReduceGridAtomicHostInit_TallyData
   }
 
 protected:
-  using GetTallyIndex = typename tally_tuning::ReplicationIndexer;
+  using GetTallyIndex  = typename tally_tuning::ReplicationIndexer;
   using GetTallyOffset = typename GetTallyOffset_rebind::template rebind<int>;
 
   T* m_tally_mem;
   T m_identity;
   int m_num_bins;
   int m_tally_bins;
-  int m_tally_replication; // power of 2, at least the max number of omp threads
+  int m_tally_replication;  // power of 2, at least the max number of omp
+                            // threads
 };
 
-
 //! MultiReduction data for Cuda Offload -- stores value, host pointer
-template <typename Combiner, typename T, typename tuning>
+template<typename Combiner, typename T, typename tuning>
 struct MultiReduceGridAtomicHostInit_Data
     : MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>
 {
-  using TallyData = MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>;
+  using TallyData =
+      MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>;
 
   //! defer to tally data for some functions
-  using TallyData::TallyData;
-  using TallyData::reset_permanent;
-  using TallyData::teardown_permanent;
   using TallyData::get;
-  using TallyData::num_bins;
   using TallyData::identity;
+  using TallyData::num_bins;
+  using TallyData::reset_permanent;
+  using TallyData::TallyData;
+  using TallyData::teardown_permanent;
 
   //! setup per launch, do nothing
-  void setup_launch(size_t RAJA_UNUSED_ARG(block_size))
-  { }
+  void setup_launch(size_t RAJA_UNUSED_ARG(block_size)) {}
 
   //! teardown per launch, do nothing
-  void teardown_launch()
-  { }
-
+  void teardown_launch() {}
 
   //! setup on device, do nothing
   RAJA_DEVICE
-  void setup_device()
-  { }
+  void setup_device() {}
 
   //! finalize on device, do nothing
   RAJA_DEVICE
-  void finalize_device()
-  { }
-
+  void finalize_device() {}
 
   //! combine value on device, combine a value into the tally atomically
   RAJA_DEVICE
   void combine_device(int bin, T value)
   {
     impl::block_multi_reduce_combine_global_atomic<Combiner, GetTallyIndex>(
-        m_num_bins, m_identity,
-        bin, value,
-        m_tally_mem, GetTallyOffset{}, m_tally_replication, m_tally_bins);
+        m_num_bins, m_identity, bin, value, m_tally_mem, GetTallyOffset {},
+        m_tally_replication, m_tally_bins);
   }
 
   //! combine value on host, combine a value into the tally
@@ -401,78 +455,89 @@ struct MultiReduceGridAtomicHostInit_Data
 #if defined(RAJA_ENABLE_OPENMP)
     tally_rep = omp_get_thread_num();
 #endif
-    int tally_offset = GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication);
-    Combiner{}(m_tally_mem[tally_offset], value);
+    int tally_offset =
+        GetTallyOffset {}(bin, m_tally_bins, tally_rep, m_tally_replication);
+    Combiner {}(m_tally_mem[tally_offset], value);
   }
 
 private:
   using typename TallyData::GetTallyIndex;
   using typename TallyData::GetTallyOffset;
 
-  using TallyData::m_tally_mem;
   using TallyData::m_identity;
   using TallyData::m_num_bins;
   using TallyData::m_tally_bins;
+  using TallyData::m_tally_mem;
   using TallyData::m_tally_replication;
 };
 
-
 //! MultiReduction data for Cuda Offload -- stores value, host pointer
-template <typename Combiner, typename T, typename tuning>
+template<typename Combiner, typename T, typename tuning>
 struct MultiReduceBlockThenGridAtomicHostInit_Data
     : MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>
 {
-  using TallyData = MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>;
+  using TallyData =
+      MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>;
 
   //! setup permanent settings, defer to tally data
-  template < typename Container >
-  MultiReduceBlockThenGridAtomicHostInit_Data(Container const& container, T const& identity)
-      : TallyData(container, identity)
-      , m_shared_offset(s_shared_offset_unknown)
-      , m_shared_replication(0)
-  { }
+  template<typename Container>
+  MultiReduceBlockThenGridAtomicHostInit_Data(Container const& container,
+                                              T const& identity)
+      : TallyData(container, identity),
+        m_shared_offset(s_shared_offset_unknown),
+        m_shared_replication(0)
+  {}
 
   MultiReduceBlockThenGridAtomicHostInit_Data() = delete;
-  MultiReduceBlockThenGridAtomicHostInit_Data(MultiReduceBlockThenGridAtomicHostInit_Data const&) = default;
-  MultiReduceBlockThenGridAtomicHostInit_Data(MultiReduceBlockThenGridAtomicHostInit_Data &&) = delete;
-  MultiReduceBlockThenGridAtomicHostInit_Data& operator=(MultiReduceBlockThenGridAtomicHostInit_Data const&) = default;
-  MultiReduceBlockThenGridAtomicHostInit_Data& operator=(MultiReduceBlockThenGridAtomicHostInit_Data &&) = delete;
-  ~MultiReduceBlockThenGridAtomicHostInit_Data() = default;
+  MultiReduceBlockThenGridAtomicHostInit_Data(
+      MultiReduceBlockThenGridAtomicHostInit_Data const&) = default;
+  MultiReduceBlockThenGridAtomicHostInit_Data(
+      MultiReduceBlockThenGridAtomicHostInit_Data&&) = delete;
+  MultiReduceBlockThenGridAtomicHostInit_Data& operator=(
+      MultiReduceBlockThenGridAtomicHostInit_Data const&) = default;
+  MultiReduceBlockThenGridAtomicHostInit_Data& operator=(
+      MultiReduceBlockThenGridAtomicHostInit_Data&&) = delete;
+  ~MultiReduceBlockThenGridAtomicHostInit_Data()     = default;
 
 
   //! defer to tally data for some functions
-  using TallyData::reset_permanent;
-  using TallyData::teardown_permanent;
   using TallyData::get;
-  using TallyData::num_bins;
   using TallyData::identity;
+  using TallyData::num_bins;
+  using TallyData::reset_permanent;
+  using TallyData::teardown_permanent;
 
   //! setup per launch, setup shared memory parameters
   void setup_launch(size_t block_size)
   {
-    if (m_num_bins == size_t(0)) {
+    if (m_num_bins == size_t(0))
+    {
       m_shared_offset = s_shared_offset_invalid;
       return;
     }
 
     size_t shared_replication = 0;
-    const size_t shared_offset = allocateDynamicShmem<T>(
-        [&](size_t max_shmem_size) {
-
-      struct {
-        size_t func_threads_per_block;
-        size_t func_max_shared_replication_per_block;
-      } func_data{block_size, max_shmem_size / m_num_bins};
-
-      shared_replication = SharedAtomicReplicationConcretizer{}.template
-          get_shared_replication<size_t>(func_data);
-      return m_num_bins * shared_replication;
-    });
-
-    if (shared_offset != dynamic_smem_allocation_failure) {
+    const size_t shared_offset =
+        allocateDynamicShmem<T>([&](size_t max_shmem_size) {
+          struct
+          {
+            size_t func_threads_per_block;
+            size_t func_max_shared_replication_per_block;
+          } func_data {block_size, max_shmem_size / m_num_bins};
+
+          shared_replication =
+              SharedAtomicReplicationConcretizer {}
+                  .template get_shared_replication<size_t>(func_data);
+          return m_num_bins * shared_replication;
+        });
+
+    if (shared_offset != dynamic_smem_allocation_failure)
+    {
       m_shared_replication = static_cast<int>(shared_replication);
-      m_shared_offset = static_cast<int>(shared_offset);
-    } else {
+      m_shared_offset      = static_cast<int>(shared_offset);
+    }
+    else
+    {
       m_shared_offset = s_shared_offset_invalid;
     }
   }
@@ -481,19 +546,18 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
   void teardown_launch()
   {
     m_shared_replication = 0;
-    m_shared_offset = s_shared_offset_unknown;
+    m_shared_offset      = s_shared_offset_unknown;
   }
 
-
   //! setup on device, initialize shared memory
   RAJA_DEVICE
   void setup_device()
   {
     T* shared_mem = get_shared_mem();
-    if (shared_mem != nullptr) {
-      impl::block_multi_reduce_init_shmem(
-          m_num_bins, m_identity,
-          shared_mem, m_shared_replication);
+    if (shared_mem != nullptr)
+    {
+      impl::block_multi_reduce_init_shmem(m_num_bins, m_identity, shared_mem,
+                                          m_shared_replication);
     }
   }
 
@@ -502,30 +566,31 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
   void finalize_device()
   {
     T* shared_mem = get_shared_mem();
-    if (shared_mem != nullptr) {
+    if (shared_mem != nullptr)
+    {
       impl::grid_multi_reduce_shmem_to_global_atomic<Combiner>(
-          m_num_bins, m_identity,
-          shared_mem, GetSharedOffset{}, m_shared_replication,
-          m_tally_mem, GetTallyOffset{}, m_tally_replication, m_tally_bins);
+          m_num_bins, m_identity, shared_mem, GetSharedOffset {},
+          m_shared_replication, m_tally_mem, GetTallyOffset {},
+          m_tally_replication, m_tally_bins);
     }
   }
 
-
   //! combine value on device, combine a value into shared memory
   RAJA_DEVICE
   void combine_device(int bin, T value)
   {
     T* shared_mem = get_shared_mem();
-    if (shared_mem != nullptr) {
+    if (shared_mem != nullptr)
+    {
       impl::block_multi_reduce_combine_shmem_atomic<Combiner, GetSharedIndex>(
-          m_num_bins, m_identity,
-          bin, value,
-          shared_mem, GetSharedOffset{}, m_shared_replication);
-    } else {
+          m_num_bins, m_identity, bin, value, shared_mem, GetSharedOffset {},
+          m_shared_replication);
+    }
+    else
+    {
       impl::block_multi_reduce_combine_global_atomic<Combiner, GetTallyIndex>(
-          m_num_bins, m_identity,
-          bin, value,
-          m_tally_mem, GetTallyOffset{}, m_tally_replication, m_tally_bins);
+          m_num_bins, m_identity, bin, value, m_tally_mem, GetTallyOffset {},
+          m_tally_replication, m_tally_bins);
     }
   }
 
@@ -536,14 +601,16 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
 #if defined(RAJA_ENABLE_OPENMP)
     tally_rep = omp_get_thread_num();
 #endif
-    int tally_offset = GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication);
-    Combiner{}(m_tally_mem[tally_offset], value);
+    int tally_offset =
+        GetTallyOffset {}(bin, m_tally_bins, tally_rep, m_tally_replication);
+    Combiner {}(m_tally_mem[tally_offset], value);
   }
 
 private:
   using shared_tuning = typename tuning::SharedAtomicReplicationTuning;
-  using SharedAtomicReplicationConcretizer = typename shared_tuning::AtomicReplicationConcretizer;
-  using GetSharedIndex = typename shared_tuning::ReplicationIndexer;
+  using SharedAtomicReplicationConcretizer =
+      typename shared_tuning::AtomicReplicationConcretizer;
+  using GetSharedIndex         = typename shared_tuning::ReplicationIndexer;
   using GetSharedOffset_rebind = typename shared_tuning::OffsetCalculator;
   using GetSharedOffset = typename GetSharedOffset_rebind::template rebind<int>;
 
@@ -551,24 +618,26 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
   using typename TallyData::GetTallyOffset;
 
 
-  static constexpr int s_shared_offset_unknown = std::numeric_limits<int>::max();
-  static constexpr int s_shared_offset_invalid = std::numeric_limits<int>::max() - 1;
+  static constexpr int s_shared_offset_unknown =
+      std::numeric_limits<int>::max();
+  static constexpr int s_shared_offset_invalid =
+      std::numeric_limits<int>::max() - 1;
 
 
-  using TallyData::m_tally_mem;
   using TallyData::m_identity;
   using TallyData::m_num_bins;
   using TallyData::m_tally_bins;
+  using TallyData::m_tally_mem;
   using TallyData::m_tally_replication;
 
-  int m_shared_offset; // in bytes
-  int m_shared_replication; // power of 2
-
+  int m_shared_offset;       // in bytes
+  int m_shared_replication;  // power of 2
 
   RAJA_DEVICE
   T* get_shared_mem() const
   {
-    if (m_shared_offset == s_shared_offset_invalid) {
+    if (m_shared_offset == s_shared_offset_invalid)
+    {
       return nullptr;
     }
     extern __shared__ char shared_mem[];
@@ -576,7 +645,6 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
   }
 };
 
-
 /*!
  **************************************************************************
  *
@@ -595,39 +663,49 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
  *
  **************************************************************************
  */
-template < typename T, typename t_MultiReduceOp, typename tuning >
+template<typename T, typename t_MultiReduceOp, typename tuning>
 struct MultiReduceDataCuda
 {
-  static constexpr bool atomic_available = RAJA::reduce::cuda::cuda_atomic_available<T>::value;
+  static constexpr bool atomic_available =
+      RAJA::reduce::cuda::cuda_atomic_available<T>::value;
 
   //! cuda reduction data storage class and folding algorithm
-  using reduce_data_type =
-      std::conditional_t<(atomic_available),
-        std::conditional_t<(tuning::algorithm == multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic),
-          cuda::MultiReduceBlockThenGridAtomicHostInit_Data<t_MultiReduceOp, T, tuning>,
-          std::conditional_t<(tuning::algorithm == multi_reduce_algorithm::init_host_combine_global_atomic),
-            cuda::MultiReduceGridAtomicHostInit_Data<t_MultiReduceOp, T, tuning>,
-            void>>,
+  using reduce_data_type = std::conditional_t<
+      (atomic_available),
+      std::conditional_t<
+          (tuning::algorithm ==
+           multi_reduce_algorithm::
+               init_host_combine_block_atomic_then_grid_atomic),
+          cuda::MultiReduceBlockThenGridAtomicHostInit_Data<t_MultiReduceOp,
+                                                            T,
+                                                            tuning>,
+          std::conditional_t<
+              (tuning::algorithm ==
+               multi_reduce_algorithm::init_host_combine_global_atomic),
+              cuda::MultiReduceGridAtomicHostInit_Data<t_MultiReduceOp,
+                                                       T,
+                                                       tuning>,
+              void>>,
       void>;
 
 
   using SyncList = std::vector<resources::Cuda>;
 
 public:
-  using value_type = T;
+  using value_type    = T;
   using MultiReduceOp = t_MultiReduceOp;
 
   MultiReduceDataCuda() = delete;
 
-  template < typename Container,
-             std::enable_if_t<!std::is_same<Container, MultiReduceDataCuda>::value>* = nullptr >
+  template<typename Container,
+           std::enable_if_t<
+               !std::is_same<Container, MultiReduceDataCuda>::value>* = nullptr>
   MultiReduceDataCuda(Container const& container, T identity)
-      : m_parent(this)
-      , m_sync_list(new SyncList)
-      , m_data(container, identity)
-      , m_own_launch_data(false)
-  {
-  }
+      : m_parent(this),
+        m_sync_list(new SyncList),
+        m_data(container, identity),
+        m_own_launch_data(false)
+  {}
 
   //! copy and on host attempt to setup for device
   //  init val_ptr to avoid uninitialized read caused by host copy of
@@ -639,31 +717,35 @@ struct MultiReduceDataCuda
 #else
       : m_parent(&other)
 #endif
-      , m_sync_list(other.m_sync_list)
-      , m_data(other.m_data)
-      , m_own_launch_data(false)
+        ,
+        m_sync_list(other.m_sync_list),
+        m_data(other.m_data),
+        m_own_launch_data(false)
   {
 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
-    if (m_parent) {
-      if (setupReducers()) {
+    if (m_parent)
+    {
+      if (setupReducers())
+      {
         // the copy made in make_launch_body does this setup
         add_resource_to_synchronization_list(currentResource());
         m_data.setup_launch(currentBlockSize());
         m_own_launch_data = true;
-        m_parent = nullptr;
+        m_parent          = nullptr;
       }
     }
 #else
-    if (!m_parent->m_parent) {
+    if (!m_parent->m_parent)
+    {
       // the first copy on device enters this branch
       m_data.setup_device();
     }
 #endif
   }
 
-  MultiReduceDataCuda(MultiReduceDataCuda &&) = delete;
+  MultiReduceDataCuda(MultiReduceDataCuda&&)                 = delete;
   MultiReduceDataCuda& operator=(MultiReduceDataCuda const&) = delete;
-  MultiReduceDataCuda& operator=(MultiReduceDataCuda &&) = delete;
+  MultiReduceDataCuda& operator=(MultiReduceDataCuda&&)      = delete;
 
   //! cleanup resources owned by this copy
   //  on device store in pinned buffer on host
@@ -671,38 +753,43 @@ struct MultiReduceDataCuda
   ~MultiReduceDataCuda()
   {
 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
-    if (m_parent == this) {
+    if (m_parent == this)
+    {
       // the original object, owns permanent storage
       synchronize_resources_and_clear_list();
       delete m_sync_list;
       m_sync_list = nullptr;
       m_data.teardown_permanent();
-    } else if (m_parent) {
+    }
+    else if (m_parent)
+    {
       // do nothing
-    } else {
-      if (m_own_launch_data) {
+    }
+    else
+    {
+      if (m_own_launch_data)
+      {
         // the copy made in make_launch_body, owns launch data
         m_data.teardown_launch();
         m_own_launch_data = false;
       }
     }
 #else
-    if (!m_parent->m_parent) {
+    if (!m_parent->m_parent)
+    {
       // the first copy on device, does finalization on the device
       m_data.finalize_device();
     }
 #endif
   }
 
-
-  template < typename Container >
+  template<typename Container>
   void reset(Container const& container, T identity)
   {
     synchronize_resources_and_clear_list();
     m_data.reset_permanent(container, identity);
   }
 
-
   //! apply reduction (const version) -- still combines internal values
   RAJA_HOST_DEVICE
   void combine(int bin, T const& value)
@@ -714,7 +801,6 @@ struct MultiReduceDataCuda
 #endif
   }
 
-
   //! map result value back to host if not done already; return aggregate value
   T get(int bin)
   {
@@ -722,22 +808,23 @@ struct MultiReduceDataCuda
     return m_data.get(bin);
   }
 
-
   size_t num_bins() const { return m_data.num_bins(); }
 
   T identity() const { return m_data.identity(); }
 
 
 private:
-  MultiReduceDataCuda const *m_parent;
+  MultiReduceDataCuda const* m_parent;
   SyncList* m_sync_list;
   reduce_data_type m_data;
   bool m_own_launch_data;
 
   void add_resource_to_synchronization_list(resources::Cuda res)
   {
-    for (resources::Cuda& list_res : *m_sync_list) {
-      if (list_res.get_stream() == res.get_stream()) {
+    for (resources::Cuda& list_res : *m_sync_list)
+    {
+      if (list_res.get_stream() == res.get_stream())
+      {
         return;
       }
     }
@@ -746,7 +833,8 @@ struct MultiReduceDataCuda
 
   void synchronize_resources_and_clear_list()
   {
-    for (resources::Cuda& list_res : *m_sync_list) {
+    for (resources::Cuda& list_res : *m_sync_list)
+    {
       ::RAJA::cuda::synchronize(list_res);
     }
     m_sync_list->clear();
@@ -755,7 +843,8 @@ struct MultiReduceDataCuda
 
 }  // end namespace cuda
 
-RAJA_DECLARE_ALL_MULTI_REDUCERS(policy::cuda::cuda_multi_reduce_policy, cuda::MultiReduceDataCuda)
+RAJA_DECLARE_ALL_MULTI_REDUCERS(policy::cuda::cuda_multi_reduce_policy,
+                                cuda::MultiReduceDataCuda)
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/policy/cuda/params/kernel_name.hpp b/include/RAJA/policy/cuda/params/kernel_name.hpp
index 4edf645ed3..382f72b541 100644
--- a/include/RAJA/policy/cuda/params/kernel_name.hpp
+++ b/include/RAJA/policy/cuda/params/kernel_name.hpp
@@ -7,42 +7,48 @@
 #include "RAJA/policy/cuda/MemUtils_CUDA.hpp"
 #include "RAJA/pattern/params/kernel_name.hpp"
 
-namespace RAJA {
-namespace expt {
-namespace detail {
-
-  // Init
-  template<typename EXEC_POL>
-  camp::concepts::enable_if< type_traits::is_cuda_policy<EXEC_POL> >
-  init(KernelName& kn, const RAJA::cuda::detail::cudaInfo &)
-  {
+namespace RAJA
+{
+namespace expt
+{
+namespace detail
+{
+
+// Init
+template<typename EXEC_POL>
+camp::concepts::enable_if<type_traits::is_cuda_policy<EXEC_POL>> init(
+    KernelName& kn,
+    const RAJA::cuda::detail::cudaInfo&)
+{
 #if defined(RAJA_ENABLE_NV_TOOLS_EXT)
-    nvtxRangePush(kn.name);
+  nvtxRangePush(kn.name);
 #else
-    RAJA_UNUSED_VAR(kn);
+  RAJA_UNUSED_VAR(kn);
 #endif
-  }
-
-  // Combine
-  template<typename EXEC_POL>
-  RAJA_HOST_DEVICE
-  camp::concepts::enable_if< type_traits::is_cuda_policy<EXEC_POL> >
-  combine(KernelName&) {}
-
-  // Resolve
-  template<typename EXEC_POL>
-  camp::concepts::enable_if< type_traits::is_cuda_policy<EXEC_POL> >
-  resolve(KernelName&, const RAJA::cuda::detail::cudaInfo &)
-  {
+}
+
+// Combine
+template<typename EXEC_POL>
+RAJA_HOST_DEVICE camp::concepts::enable_if<
+    type_traits::is_cuda_policy<EXEC_POL>>
+combine(KernelName&)
+{}
+
+// Resolve
+template<typename EXEC_POL>
+camp::concepts::enable_if<type_traits::is_cuda_policy<EXEC_POL>> resolve(
+    KernelName&,
+    const RAJA::cuda::detail::cudaInfo&)
+{
 #if defined(RAJA_ENABLE_NV_TOOLS_EXT)
-    nvtxRangePop();
+  nvtxRangePop();
 #endif
-  }
+}
 
-} //  namespace detail
-} //  namespace expt
-} //  namespace RAJA
+}  //  namespace detail
+}  //  namespace expt
+}  //  namespace RAJA
 
 #endif
 
-#endif //  NEW_REDUCE_CUDA_REDUCE_HPP
+#endif  //  NEW_REDUCE_CUDA_REDUCE_HPP
diff --git a/include/RAJA/policy/cuda/params/reduce.hpp b/include/RAJA/policy/cuda/params/reduce.hpp
index 6ab3372aaa..126eb87463 100644
--- a/include/RAJA/policy/cuda/params/reduce.hpp
+++ b/include/RAJA/policy/cuda/params/reduce.hpp
@@ -10,54 +10,59 @@
 
 #include "RAJA/policy/cuda/policy.hpp"
 
-namespace RAJA {
-namespace expt {
-namespace detail {
-
-  // Init
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_cuda_policy<EXEC_POL> >
-  init(Reducer<OP, T, VOp>& red, RAJA::cuda::detail::cudaInfo& ci)
-  {
-    red.devicetarget = RAJA::cuda::pinned_mempool_type::getInstance().template malloc<T>(1);
-    red.device_mem.allocate(ci.gridDim.x * ci.gridDim.y * ci.gridDim.z);
-    red.device_count = RAJA::cuda::device_zeroed_mempool_type::getInstance().template malloc<unsigned int>(1);
-  }
-
-  // Combine
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  RAJA_HOST_DEVICE
-  camp::concepts::enable_if< type_traits::is_cuda_policy<EXEC_POL> >
-  combine(Reducer<OP, T, VOp>& red)
-  {
-    RAJA::cuda::impl::expt::grid_reduce<typename EXEC_POL::IterationGetter, OP>(red.devicetarget,
-                                                                            red.getVal(),
-                                                                            red.device_mem,
-                                                                            red.device_count);
-  }
-
-  // Resolve
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_cuda_policy<EXEC_POL> >
-  resolve(Reducer<OP, T, VOp>& red, RAJA::cuda::detail::cudaInfo& ci)
-  {
-    // complete reduction
-    ci.res.wait();
-
-    red.combineTarget(*red.devicetarget);
-
-    // free memory
-    RAJA::cuda::device_zeroed_mempool_type::getInstance().free(red.device_count);
-    red.device_count = nullptr;
-    red.device_mem.deallocate();
-    RAJA::cuda::pinned_mempool_type::getInstance().free(red.devicetarget);
-    red.devicetarget = nullptr;
-  }
-
-} //  namespace detail
-} //  namespace expt
-} //  namespace RAJA
+namespace RAJA
+{
+namespace expt
+{
+namespace detail
+{
+
+// Init
+template<typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_cuda_policy<EXEC_POL>> init(
+    Reducer<OP, T, VOp>& red,
+    RAJA::cuda::detail::cudaInfo& ci)
+{
+  red.devicetarget =
+      RAJA::cuda::pinned_mempool_type::getInstance().template malloc<T>(1);
+  red.device_mem.allocate(ci.gridDim.x * ci.gridDim.y * ci.gridDim.z);
+  red.device_count = RAJA::cuda::device_zeroed_mempool_type::getInstance()
+                         .template malloc<unsigned int>(1);
+}
+
+// Combine
+template<typename EXEC_POL, typename OP, typename T, typename VOp>
+RAJA_HOST_DEVICE camp::concepts::enable_if<
+    type_traits::is_cuda_policy<EXEC_POL>>
+combine(Reducer<OP, T, VOp>& red)
+{
+  RAJA::cuda::impl::expt::grid_reduce<typename EXEC_POL::IterationGetter, OP>(
+      red.devicetarget, red.getVal(), red.device_mem, red.device_count);
+}
+
+// Resolve
+template<typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_cuda_policy<EXEC_POL>> resolve(
+    Reducer<OP, T, VOp>& red,
+    RAJA::cuda::detail::cudaInfo& ci)
+{
+  // complete reduction
+  ci.res.wait();
+
+  red.combineTarget(*red.devicetarget);
+
+  // free memory
+  RAJA::cuda::device_zeroed_mempool_type::getInstance().free(red.device_count);
+  red.device_count = nullptr;
+  red.device_mem.deallocate();
+  RAJA::cuda::pinned_mempool_type::getInstance().free(red.devicetarget);
+  red.devicetarget = nullptr;
+}
+
+}  //  namespace detail
+}  //  namespace expt
+}  //  namespace RAJA
 
 #endif
 
-#endif //  NEW_REDUCE_CUDA_REDUCE_HPP
+#endif  //  NEW_REDUCE_CUDA_REDUCE_HPP
diff --git a/include/RAJA/policy/cuda/policy.hpp b/include/RAJA/policy/cuda/policy.hpp
index cd71a37480..865f0e095a 100644
--- a/include/RAJA/policy/cuda/policy.hpp
+++ b/include/RAJA/policy/cuda/policy.hpp
@@ -60,13 +60,15 @@ using cuda_dim_member_t = camp::decay<decltype(std::declval<cuda_dim_t>().x)>;
 
 namespace detail
 {
-template <bool Async>
-struct get_launch {
+template<bool Async>
+struct get_launch
+{
   static constexpr RAJA::Launch value = RAJA::Launch::async;
 };
 
-template <>
-struct get_launch<false> {
+template<>
+struct get_launch<false>
+{
   static constexpr RAJA::Launch value = RAJA::Launch::sync;
 };
 }  // end namespace detail
@@ -78,7 +80,7 @@ namespace cuda
 template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
 struct IndexGlobal;
 
-template<typename ...indexers>
+template<typename... indexers>
 struct IndexFlatten;
 
 template<size_t divisor, typename index>
@@ -87,7 +89,6 @@ struct IndexDivide;
 template<size_t divisor, typename index>
 struct IndexModulo;
 
-
 /*!
  * Use the max occupancy of a kernel on the current device when launch
  * parameters are not fully determined.
@@ -96,13 +97,14 @@ struct IndexModulo;
  */
 struct MaxOccupancyConcretizer
 {
-  template < typename IdxT, typename Data >
+  template<typename IdxT, typename Data>
   static IdxT get_max_grid_size(Data const& data)
   {
-    IdxT device_sm_per_device = data.device_sm_per_device;
+    IdxT device_sm_per_device   = data.device_sm_per_device;
     IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm;
 
-    IdxT func_max_blocks_per_device = func_max_blocks_per_sm * device_sm_per_device;
+    IdxT func_max_blocks_per_device =
+        func_max_blocks_per_sm * device_sm_per_device;
 
     return func_max_blocks_per_device;
   }
@@ -115,26 +117,31 @@ struct MaxOccupancyConcretizer
  * maximum grid size:
  * (Fraction * kernel_max_blocks_per_sm + BLOCKS_PER_SM_OFFSET) * device_sm
  */
-template < typename t_Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET >
+template<typename t_Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET>
 struct FractionOffsetOccupancyConcretizer
 {
-  template < typename IdxT, typename Data >
+  template<typename IdxT, typename Data>
   static IdxT get_max_grid_size(Data const& data)
   {
     using Fraction = typename t_Fraction::template rebind<IdxT>;
 
-    IdxT device_sm_per_device = data.device_sm_per_device;
+    IdxT device_sm_per_device   = data.device_sm_per_device;
     IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm;
 
-    if (Fraction::multiply(func_max_blocks_per_sm) > IdxT(0)) {
+    if (Fraction::multiply(func_max_blocks_per_sm) > IdxT(0))
+    {
       func_max_blocks_per_sm = Fraction::multiply(func_max_blocks_per_sm);
     }
 
-    if (IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET) > IdxT(0)) {
-      func_max_blocks_per_sm = IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET);
+    if (IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET) >
+        IdxT(0))
+    {
+      func_max_blocks_per_sm =
+          IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET);
     }
 
-    IdxT func_max_blocks_per_device = func_max_blocks_per_sm * device_sm_per_device;
+    IdxT func_max_blocks_per_device =
+        func_max_blocks_per_sm * device_sm_per_device;
 
     return func_max_blocks_per_device;
   }
@@ -148,34 +155,38 @@ struct FractionOffsetOccupancyConcretizer
  * Otherwise use the given AvoidMaxOccupancyCalculator to determine the
  * maximum grid size.
  */
-template < typename AvoidMaxOccupancyConcretizer >
+template<typename AvoidMaxOccupancyConcretizer>
 struct AvoidDeviceMaxThreadOccupancyConcretizer
 {
-  template < typename IdxT, typename Data >
+  template<typename IdxT, typename Data>
   static IdxT get_max_grid_size(Data const& data)
   {
     IdxT device_max_threads_per_sm = data.device_max_threads_per_sm;
-    IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm;
-    IdxT func_threads_per_block = data.func_threads_per_block;
+    IdxT func_max_blocks_per_sm    = data.func_max_blocks_per_sm;
+    IdxT func_threads_per_block    = data.func_threads_per_block;
 
-    IdxT func_max_threads_per_sm = func_threads_per_block * func_max_blocks_per_sm;
+    IdxT func_max_threads_per_sm =
+        func_threads_per_block * func_max_blocks_per_sm;
 
-    if (func_max_threads_per_sm < device_max_threads_per_sm) {
+    if (func_max_threads_per_sm < device_max_threads_per_sm)
+    {
       return MaxOccupancyConcretizer::template get_max_grid_size<IdxT>(data);
-    } else {
-      return AvoidMaxOccupancyConcretizer::template get_max_grid_size<IdxT>(data);
+    }
+    else
+    {
+      return AvoidMaxOccupancyConcretizer::template get_max_grid_size<IdxT>(
+          data);
     }
   }
 };
 
-
 /*!
  * Get an amount of replication that is preferred_replication.
  */
-template < size_t preferred_replication >
+template<size_t preferred_replication>
 struct ConstantPreferredReplicationConcretizer
 {
-  template < typename IdxT, typename Data >
+  template<typename IdxT, typename Data>
   static IdxT get_preferred_replication(Data const& RAJA_UNUSED_ARG(data))
   {
     return IdxT(preferred_replication);
@@ -187,19 +198,23 @@ struct ConstantPreferredReplicationConcretizer
  * data.func_threads_per_block is less than t_cutoff or
  * preferred_replication_after_cutoff otherwise.
  */
-template < size_t t_cutoff, size_t preferred_replication_before_cutoff,
-                            size_t preferred_replication_after_cutoff >
+template<size_t t_cutoff,
+         size_t preferred_replication_before_cutoff,
+         size_t preferred_replication_after_cutoff>
 struct ThreadsPerBlockCutoffPreferredReplicationConcretizer
 {
-  template < typename IdxT, typename Data >
+  template<typename IdxT, typename Data>
   static IdxT get_preferred_replication(Data const& data)
   {
-    IdxT cutoff = t_cutoff;
+    IdxT cutoff                 = t_cutoff;
     IdxT func_threads_per_block = data.func_threads_per_block;
 
-    if (func_threads_per_block < cutoff) {
+    if (func_threads_per_block < cutoff)
+    {
       return IdxT(preferred_replication_before_cutoff);
-    } else {
+    }
+    else
+    {
       return IdxT(preferred_replication_after_cutoff);
     }
   }
@@ -210,19 +225,21 @@ struct ThreadsPerBlockCutoffPreferredReplicationConcretizer
  * most the amount given by data.func_max_shared_replication_per_block or the
  * amount given by GetPreferredReplication.
  */
-template < typename GetPreferredReplication >
+template<typename GetPreferredReplication>
 struct SharedAtomicReplicationMaxPow2Concretizer
 {
-  template < typename IdxT, typename Data >
+  template<typename IdxT, typename Data>
   static IdxT get_shared_replication(Data const& data)
   {
-    IdxT func_max_shared_replication_per_block = data.func_max_shared_replication_per_block;
+    IdxT func_max_shared_replication_per_block =
+        data.func_max_shared_replication_per_block;
 
-    IdxT preferred_replication = GetPreferredReplication{}.template
-        get_preferred_replication<IdxT>(data);
+    IdxT preferred_replication =
+        GetPreferredReplication {}.template get_preferred_replication<IdxT>(
+            data);
 
-    return prev_pow2(std::min(preferred_replication,
-                              func_max_shared_replication_per_block));
+    return prev_pow2(
+        std::min(preferred_replication, func_max_shared_replication_per_block));
   }
 };
 
@@ -231,18 +248,20 @@ struct SharedAtomicReplicationMaxPow2Concretizer
  * least the amount given by data.func_min_global_replication or the
  * amount given by GetPreferredReplication.
  */
-template < typename GetPreferredReplication >
+template<typename GetPreferredReplication>
 struct GlobalAtomicReplicationMinPow2Concretizer
 {
-  template < typename IdxT, typename Data >
+  template<typename IdxT, typename Data>
   static IdxT get_global_replication(Data const& data)
   {
     IdxT func_min_global_replication = data.func_min_global_replication;
 
-    IdxT preferred_replication = GetPreferredReplication{}.template
-        get_preferred_replication<IdxT>(data);
+    IdxT preferred_replication =
+        GetPreferredReplication {}.template get_preferred_replication<IdxT>(
+            data);
 
-    return next_pow2(std::max(preferred_replication, func_min_global_replication));
+    return next_pow2(
+        std::max(preferred_replication, func_min_global_replication));
   }
 };
 
@@ -260,14 +279,16 @@ enum struct block_communication_mode : int
   block_fence
 };
 
-template < reduce_algorithm t_algorithm, block_communication_mode t_comm_mode,
-           size_t t_replication, size_t t_atomic_stride >
+template<reduce_algorithm t_algorithm,
+         block_communication_mode t_comm_mode,
+         size_t t_replication,
+         size_t t_atomic_stride>
 struct ReduceTuning
 {
-  static constexpr reduce_algorithm algorithm = t_algorithm;
+  static constexpr reduce_algorithm algorithm         = t_algorithm;
   static constexpr block_communication_mode comm_mode = t_comm_mode;
-  static constexpr size_t replication = t_replication;
-  static constexpr size_t atomic_stride = t_atomic_stride;
+  static constexpr size_t replication                 = t_replication;
+  static constexpr size_t atomic_stride               = t_atomic_stride;
   static constexpr bool consistent =
       (algorithm == reduce_algorithm::combine_last_block);
 };
@@ -279,25 +300,25 @@ enum struct multi_reduce_algorithm : int
   init_host_combine_global_atomic
 };
 
-template < typename t_AtomicReplicationConcretizer,
-           typename t_ReplicationIndexer,
-           typename t_OffsetCalculator >
+template<typename t_AtomicReplicationConcretizer,
+         typename t_ReplicationIndexer,
+         typename t_OffsetCalculator>
 struct AtomicReplicationTuning
 {
   using AtomicReplicationConcretizer = t_AtomicReplicationConcretizer;
-  using ReplicationIndexer = t_ReplicationIndexer;
-  using OffsetCalculator = t_OffsetCalculator;
+  using ReplicationIndexer           = t_ReplicationIndexer;
+  using OffsetCalculator             = t_OffsetCalculator;
 };
 
-template < multi_reduce_algorithm t_algorithm,
-           typename t_SharedAtomicReplicationTuning,
-           typename t_GlobalAtomicReplicationTuning >
+template<multi_reduce_algorithm t_algorithm,
+         typename t_SharedAtomicReplicationTuning,
+         typename t_GlobalAtomicReplicationTuning>
 struct MultiReduceTuning
 {
   static constexpr multi_reduce_algorithm algorithm = t_algorithm;
   using SharedAtomicReplicationTuning = t_SharedAtomicReplicationTuning;
   using GlobalAtomicReplicationTuning = t_GlobalAtomicReplicationTuning;
-  static constexpr bool consistent = false;
+  static constexpr bool consistent    = false;
 };
 
 }  // namespace cuda
@@ -312,25 +333,29 @@ struct DeviceConstants
   RAJA::Index_type WARP_SIZE;
   RAJA::Index_type MAX_BLOCK_SIZE;
   RAJA::Index_type MAX_WARPS;
-  RAJA::Index_type ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE; // basically the cache line size of the cache level that handles atomics
+  // Basically the cache line size, in bytes, of the cache level that handles
+  // atomics (initialized from the constructor's atomic_cache_line_bytes
+  // argument).
+  RAJA::Index_type ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE;
 
   constexpr DeviceConstants(RAJA::Index_type warp_size,
                             RAJA::Index_type max_block_size,
                             RAJA::Index_type atomic_cache_line_bytes) noexcept
-    : WARP_SIZE(warp_size)
-    , MAX_BLOCK_SIZE(max_block_size)
-    , MAX_WARPS(max_block_size / warp_size)
-    , ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE(atomic_cache_line_bytes)
-  { }
+      : WARP_SIZE(warp_size),
+        MAX_BLOCK_SIZE(max_block_size),
+        MAX_WARPS(max_block_size / warp_size),
+        ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE(atomic_cache_line_bytes)
+  {}
 };
 
 //
 // Operations in the included files are parametrized using the following
 // values for CUDA warp size and max block size.
 //
-constexpr DeviceConstants device_constants(32, 1024, 32); // V100
+constexpr DeviceConstants device_constants(32, 1024, 32);  // V100
 static_assert(device_constants.WARP_SIZE >= device_constants.MAX_WARPS,
-              "RAJA Assumption Broken: device_constants.WARP_SIZE < device_constants.MAX_WARPS");
+              "RAJA Assumption Broken: device_constants.WARP_SIZE < "
+              "device_constants.MAX_WARPS");
 static_assert(device_constants.MAX_BLOCK_SIZE % device_constants.WARP_SIZE == 0,
               "RAJA Assumption Broken: device_constants.MAX_BLOCK_SIZE not "
               "a multiple of device_constants.WARP_SIZE");
@@ -338,40 +363,51 @@ static_assert(device_constants.MAX_BLOCK_SIZE % device_constants.WARP_SIZE == 0,
 constexpr const size_t MIN_BLOCKS_PER_SM = 1;
 constexpr const size_t MAX_BLOCKS_PER_SM = 32;
 
-
-template <typename _IterationMapping, kernel_sync_requirement sync, typename ... _IterationGetters>
-struct cuda_indexer {};
-
-template <typename _IterationMapping, kernel_sync_requirement sync, typename ... _IterationGetters>
-struct cuda_flatten_indexer : public RAJA::make_policy_pattern_launch_platform_t<
-  RAJA::Policy::cuda,
-  RAJA::Pattern::region,
-  detail::get_launch<true /*async */>::value,
-  RAJA::Platform::cuda> {
+template<typename _IterationMapping,
+         kernel_sync_requirement sync,
+         typename... _IterationGetters>
+struct cuda_indexer
+{};
+
+template<typename _IterationMapping,
+         kernel_sync_requirement sync,
+         typename... _IterationGetters>
+struct cuda_flatten_indexer
+    : public RAJA::make_policy_pattern_launch_platform_t<
+          RAJA::Policy::cuda,
+          RAJA::Pattern::region,
+          detail::get_launch<true /*async */>::value,
+          RAJA::Platform::cuda>
+{
   using IterationGetter = RAJA::cuda::IndexFlatten<_IterationGetters...>;
 };
 
-template <typename _IterationMapping, typename _IterationGetter, typename _LaunchConcretizer,
-          size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM, bool Async = false>
+template<typename _IterationMapping,
+         typename _IterationGetter,
+         typename _LaunchConcretizer,
+         size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM,
+         bool Async           = false>
 struct cuda_exec_explicit : public RAJA::make_policy_pattern_launch_platform_t<
-                       RAJA::Policy::cuda,
-                       RAJA::Pattern::forall,
-                       detail::get_launch<Async>::value,
-                       RAJA::Platform::cuda> {
-  using IterationMapping = _IterationMapping;
-  using IterationGetter = _IterationGetter;
-  using LaunchConcretizer = _LaunchConcretizer;
-};
-
-template <bool Async, int num_threads = named_usage::unspecified,
-          size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM>
-struct cuda_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform_t<
                                 RAJA::Policy::cuda,
-                                RAJA::Pattern::region,
+                                RAJA::Pattern::forall,
                                 detail::get_launch<Async>::value,
-                                RAJA::Platform::cuda> {
+                                RAJA::Platform::cuda>
+{
+  using IterationMapping  = _IterationMapping;
+  using IterationGetter   = _IterationGetter;
+  using LaunchConcretizer = _LaunchConcretizer;
 };
 
+template<bool Async,
+         int num_threads      = named_usage::unspecified,
+         size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM>
+struct cuda_launch_explicit_t
+    : public RAJA::make_policy_pattern_launch_platform_t<
+          RAJA::Policy::cuda,
+          RAJA::Pattern::region,
+          detail::get_launch<Async>::value,
+          RAJA::Platform::cuda>
+{};
 
 //
 // NOTE: There is no Index set segment iteration policy for CUDA
@@ -380,13 +416,15 @@ struct cuda_launch_explicit_t : public RAJA::make_policy_pattern_launch_platform
 ///
 /// WorkGroup execution policies
 ///
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM, bool Async = false>
+template<size_t BLOCK_SIZE,
+         size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM,
+         bool Async           = false>
 struct cuda_work_explicit : public RAJA::make_policy_pattern_launch_platform_t<
-                       RAJA::Policy::cuda,
-                       RAJA::Pattern::workgroup_exec,
-                       detail::get_launch<Async>::value,
-                       RAJA::Platform::cuda> {
-};
+                                RAJA::Policy::cuda,
+                                RAJA::Pattern::workgroup_exec,
+                                detail::get_launch<Async>::value,
+                                RAJA::Platform::cuda>
+{};
 
 /// execute the enqueued loops in an unordered fashion by mapping loops to
 /// blocks in the y direction and loop iterations to threads in the x direction
@@ -394,11 +432,10 @@ struct cuda_work_explicit : public RAJA::make_policy_pattern_launch_platform_t<
 /// of all the loops
 struct unordered_cuda_loop_y_block_iter_x_threadblock_average
     : public RAJA::make_policy_pattern_platform_t<
-                       RAJA::Policy::cuda,
-                       RAJA::Pattern::workgroup_order,
-                       RAJA::Platform::cuda> {
-};
-
+          RAJA::Policy::cuda,
+          RAJA::Pattern::workgroup_order,
+          RAJA::Platform::cuda>
+{};
 
 ///
 ///////////////////////////////////////////////////////////////////////
@@ -408,36 +445,36 @@ struct unordered_cuda_loop_y_block_iter_x_threadblock_average
 ///////////////////////////////////////////////////////////////////////
 ///
 
-template < typename tuning >
-struct cuda_reduce_policy
-    : public RAJA::
-          make_policy_pattern_launch_platform_t<RAJA::Policy::cuda,
-                                                RAJA::Pattern::reduce,
-                                                detail::get_launch<false>::value,
-                                                RAJA::Platform::cuda,
-                                                std::conditional_t<tuning::consistent,
-                                                                   reduce::ordered,
-                                                                   reduce::unordered>> {
-};
-
-template < typename tuning >
+template<typename tuning>
+struct cuda_reduce_policy : public RAJA::make_policy_pattern_launch_platform_t<
+                                RAJA::Policy::cuda,
+                                RAJA::Pattern::reduce,
+                                detail::get_launch<false>::value,
+                                RAJA::Platform::cuda,
+                                std::conditional_t<tuning::consistent,
+                                                   reduce::ordered,
+                                                   reduce::unordered>>
+{};
+
+template<typename tuning>
 struct cuda_multi_reduce_policy
-    : public RAJA::
-          make_policy_pattern_launch_platform_t<RAJA::Policy::cuda,
-                                                RAJA::Pattern::multi_reduce,
-                                                detail::get_launch<false>::value,
-                                                RAJA::Platform::cuda,
-                                                std::conditional_t<tuning::consistent,
-                                                                   reduce::ordered,
-                                                                   reduce::unordered>> {
-};
+    : public RAJA::make_policy_pattern_launch_platform_t<
+          RAJA::Policy::cuda,
+          RAJA::Pattern::multi_reduce,
+          detail::get_launch<false>::value,
+          RAJA::Platform::cuda,
+          std::conditional_t<tuning::consistent,
+                             reduce::ordered,
+                             reduce::unordered>>
+{};
 
 /*!
  * Cuda atomic policy for using cuda atomics on the device and
  * the provided policy on the host
  */
 template<typename host_policy>
-struct cuda_atomic_explicit{};
+struct cuda_atomic_explicit
+{};
 
 /*!
  * Default cuda atomic policy uses cuda atomics on the device and non-atomics
@@ -445,27 +482,28 @@ struct cuda_atomic_explicit{};
  */
 using cuda_atomic = cuda_atomic_explicit<seq_atomic>;
 
-
 // Policy for RAJA::statement::Reduce that reduces threads in a block
 // down to threadIdx 0
-struct cuda_block_reduce{};
+struct cuda_block_reduce
+{};
 
 // Policy for RAJA::statement::Reduce that reduces threads in a warp
 // down to the first lane of the warp
-struct cuda_warp_reduce{};
+struct cuda_warp_reduce
+{};
 
 // Policy to map work directly to threads within a warp
 // Maximum iteration count is WARP_SIZE
 // Cannot be used in conjunction with cuda_thread_x_*
 // Multiple warps have to be created by using cuda_thread_{yz}_*
-struct cuda_warp_direct{};
+struct cuda_warp_direct
+{};
 
 // Policy to map work to threads within a warp using a warp-stride loop
 // Cannot be used in conjunction with cuda_thread_x_*
 // Multiple warps have to be created by using cuda_thread_{yz}_*
-struct cuda_warp_loop{};
-
-
+struct cuda_warp_loop
+{};
 
 // Policy to map work to threads within a warp using a bit mask
 // Cannot be used in conjunction with cuda_thread_x_*
@@ -474,7 +512,8 @@ struct cuda_warp_loop{};
 // cuda_warp_masked
 // can be used to create complex thread interleaving patterns
 template<typename Mask>
-struct cuda_warp_masked_direct {};
+struct cuda_warp_masked_direct
+{};
 
 // Policy to map work to threads within a warp using a bit mask
 // Cannot be used in conjunction with cuda_thread_x_*
@@ -483,82 +522,84 @@ struct cuda_warp_masked_direct {};
 // cuda_warp_masked
 // can be used to create complex thread interleaving patterns
 template<typename Mask>
-struct cuda_warp_masked_loop {};
-
+struct cuda_warp_masked_loop
+{};
 
 template<typename Mask>
-struct cuda_thread_masked_direct {};
+struct cuda_thread_masked_direct
+{};
 
 template<typename Mask>
-struct cuda_thread_masked_loop {};
-
+struct cuda_thread_masked_loop
+{};
 
 struct cuda_synchronize : make_policy_pattern_launch_t<Policy::cuda,
                                                        Pattern::synchronize,
-                                                       Launch::sync> {
-};
+                                                       Launch::sync>
+{};
 
 }  // end namespace cuda
 }  // end namespace policy
 
-
 namespace internal
 {
 
 RAJA_INLINE
 int get_size(cuda_dim_t dims)
 {
-  if(dims.x == 0 && dims.y == 0 && dims.z == 0){
+  if (dims.x == 0 && dims.y == 0 && dims.z == 0)
+  {
     return 0;
   }
-  return (dims.x ? dims.x : 1) *
-         (dims.y ? dims.y : 1) *
-         (dims.z ? dims.z : 1);
+  return (dims.x ? dims.x : 1) * (dims.y ? dims.y : 1) * (dims.z ? dims.z : 1);
 }
 
-struct CudaDims {
+struct CudaDims
+{
 
-  cuda_dim_t blocks{0,0,0};
-  cuda_dim_t threads{0,0,0};
+  cuda_dim_t blocks {0, 0, 0};
+  cuda_dim_t threads {0, 0, 0};
 
-  CudaDims() = default;
-  CudaDims(CudaDims const&) = default;
+  CudaDims()                           = default;
+  CudaDims(CudaDims const&)            = default;
   CudaDims& operator=(CudaDims const&) = default;
 
   RAJA_INLINE
   CudaDims(cuda_dim_member_t default_val)
-    : blocks{default_val, default_val, default_val}
-    , threads{default_val, default_val, default_val}
-  { }
+      : blocks {default_val, default_val, default_val},
+        threads {default_val, default_val, default_val}
+  {}
 
   RAJA_INLINE
-  int num_blocks() const {
-    return get_size(blocks);
-  }
+  int num_blocks() const { return get_size(blocks); }
 
   RAJA_INLINE
-  int num_threads() const {
-    return get_size(threads);
-  }
+  int num_threads() const { return get_size(threads); }
 
   RAJA_INLINE
-  cuda_dim_t get_blocks() const {
-    if (num_blocks() != 0) {
-      return {(blocks.x ? blocks.x : 1),
-              (blocks.y ? blocks.y : 1),
+  cuda_dim_t get_blocks() const
+  {
+    if (num_blocks() != 0)
+    {
+      return {(blocks.x ? blocks.x : 1), (blocks.y ? blocks.y : 1),
               (blocks.z ? blocks.z : 1)};
-    } else {
+    }
+    else
+    {
       return blocks;
     }
   }
 
   RAJA_INLINE
-  cuda_dim_t get_threads() const {
-    if (num_threads() != 0) {
-      return {(threads.x ? threads.x : 1),
-              (threads.y ? threads.y : 1),
+  cuda_dim_t get_threads() const
+  {
+    if (num_threads() != 0)
+    {
+      return {(threads.x ? threads.x : 1), (threads.y ? threads.y : 1),
               (threads.z ? threads.z : 1)};
-    } else {
+    }
+    else
+    {
       return threads;
     }
   }
@@ -568,81 +609,69 @@ template<named_dim dim>
 struct CudaDimHelper;
 
 template<>
-struct CudaDimHelper<named_dim::x>{
+struct CudaDimHelper<named_dim::x>
+{
 
   template<typename dim_t>
-  RAJA_HOST_DEVICE
-  inline static constexpr
-  cuda_dim_member_t get(dim_t const &d)
+  RAJA_HOST_DEVICE inline static constexpr cuda_dim_member_t get(dim_t const& d)
   {
     return d.x;
   }
 
   template<typename dim_t>
-  RAJA_HOST_DEVICE
-  inline static
-  void set(dim_t &d, cuda_dim_member_t value)
+  RAJA_HOST_DEVICE inline static void set(dim_t& d, cuda_dim_member_t value)
   {
     d.x = value;
   }
 };
 
 template<>
-struct CudaDimHelper<named_dim::y>{
+struct CudaDimHelper<named_dim::y>
+{
 
   template<typename dim_t>
-  RAJA_HOST_DEVICE
-  inline static constexpr
-  cuda_dim_member_t get(dim_t const &d)
+  RAJA_HOST_DEVICE inline static constexpr cuda_dim_member_t get(dim_t const& d)
   {
     return d.y;
   }
 
   template<typename dim_t>
-  RAJA_HOST_DEVICE
-  inline static
-  void set(dim_t &d, cuda_dim_member_t value)
+  RAJA_HOST_DEVICE inline static void set(dim_t& d, cuda_dim_member_t value)
   {
     d.y = value;
   }
 };
 
 template<>
-struct CudaDimHelper<named_dim::z>{
+struct CudaDimHelper<named_dim::z>
+{
 
   template<typename dim_t>
-  RAJA_HOST_DEVICE
-  inline static constexpr
-  cuda_dim_member_t get(dim_t const &d)
+  RAJA_HOST_DEVICE inline static constexpr cuda_dim_member_t get(dim_t const& d)
   {
     return d.z;
   }
 
   template<typename dim_t>
-  RAJA_HOST_DEVICE
-  inline static
-  void set(dim_t &d, cuda_dim_member_t value)
+  RAJA_HOST_DEVICE inline static void set(dim_t& d, cuda_dim_member_t value)
   {
     d.z = value;
   }
 };
 
 template<named_dim dim, typename dim_t>
-RAJA_HOST_DEVICE
-constexpr
-cuda_dim_member_t get_cuda_dim(dim_t const &d)
+RAJA_HOST_DEVICE constexpr cuda_dim_member_t get_cuda_dim(dim_t const& d)
 {
   return CudaDimHelper<dim>::get(d);
 }
 
 template<named_dim dim, typename dim_t>
-RAJA_HOST_DEVICE
-void set_cuda_dim(dim_t &d, cuda_dim_member_t value)
+RAJA_HOST_DEVICE void set_cuda_dim(dim_t& d, cuda_dim_member_t value)
 {
   return CudaDimHelper<dim>::set(d, value);
 }
 
-} // namespace internal
+}  // namespace internal
 
 namespace cuda
 {
@@ -651,14 +680,14 @@ namespace cuda
 struct IndexSize
 {
   cuda_dim_member_t block_size = named_usage::unspecified;
-  cuda_dim_member_t grid_size = named_usage::unspecified;
-
-  RAJA_HOST_DEVICE constexpr
-  IndexSize(cuda_dim_member_t _block_size = named_usage::unspecified,
-            cuda_dim_member_t _grid_size = named_usage::unspecified)
-    : block_size(_block_size)
-    , grid_size(_grid_size)
-  { }
+  cuda_dim_member_t grid_size  = named_usage::unspecified;
+
+  RAJA_HOST_DEVICE constexpr IndexSize(
+      cuda_dim_member_t _block_size = named_usage::unspecified,
+      cuda_dim_member_t _grid_size  = named_usage::unspecified)
+      : block_size(_block_size),
+        grid_size(_grid_size)
+  {}
 };
 
 /// Type representing thread indexing within a grid
@@ -673,23 +702,25 @@ struct IndexGlobal
   static_assert(GRID_SIZE > 0, "grid size must not be negative");
 
   static constexpr int block_size = BLOCK_SIZE;
-  static constexpr int grid_size = GRID_SIZE;
+  static constexpr int grid_size  = GRID_SIZE;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(threadIdx)) +
+    return static_cast<IdxT>(
+               ::RAJA::internal::CudaDimHelper<dim>::get(threadIdx)) +
            static_cast<IdxT>(block_size) *
-           static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockIdx)) ;
+               static_cast<IdxT>(
+                   ::RAJA::internal::CudaDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static constexpr IdxT size()
   {
-    return static_cast<IdxT>(block_size) *
-           static_cast<IdxT>(grid_size) ;
+    return static_cast<IdxT>(block_size) * static_cast<IdxT>(grid_size);
   }
 };
+
 /// with fixed block size of 1 and fixed grid size
 template<named_dim dim, int GRID_SIZE>
 struct IndexGlobal<dim, 1, GRID_SIZE>
@@ -697,20 +728,22 @@ struct IndexGlobal<dim, 1, GRID_SIZE>
   static_assert(GRID_SIZE > 0, "grid size must not be negative");
 
   static constexpr int block_size = 1;
-  static constexpr int grid_size = GRID_SIZE;
+  static constexpr int grid_size  = GRID_SIZE;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::CudaDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static constexpr IdxT size()
   {
-    return static_cast<IdxT>(grid_size) ;
+    return static_cast<IdxT>(grid_size);
   }
 };
+
 /// with fixed block size and fixed grid size of 1
 template<named_dim dim, int BLOCK_SIZE>
 struct IndexGlobal<dim, BLOCK_SIZE, 1>
@@ -718,37 +751,39 @@ struct IndexGlobal<dim, BLOCK_SIZE, 1>
   static_assert(BLOCK_SIZE > 0, "block size must not be negative");
 
   static constexpr int block_size = BLOCK_SIZE;
-  static constexpr int grid_size = 1;
+  static constexpr int grid_size  = 1;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(threadIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::CudaDimHelper<dim>::get(threadIdx));
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static constexpr IdxT size()
   {
-    return static_cast<IdxT>(block_size) ;
+    return static_cast<IdxT>(block_size);
   }
 };
+
 /// with fixed block size and fixed grid size of 1
 template<named_dim dim>
 struct IndexGlobal<dim, 1, 1>
 {
   static constexpr int block_size = 1;
-  static constexpr int grid_size = 1;
+  static constexpr int grid_size  = 1;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(0) ;
+    return static_cast<IdxT>(0);
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(1) ;
+    return static_cast<IdxT>(1);
   }
 };
 
@@ -759,40 +794,47 @@ struct IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>
   static_assert(GRID_SIZE > 0, "grid size must not be negative");
 
   static constexpr int block_size = named_usage::unspecified;
-  static constexpr int grid_size = GRID_SIZE;
+  static constexpr int grid_size  = GRID_SIZE;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(threadIdx)) +
-           static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockDim)) *
-           static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockIdx)) ;
+    return static_cast<IdxT>(
+               ::RAJA::internal::CudaDimHelper<dim>::get(threadIdx)) +
+           static_cast<IdxT>(
+               ::RAJA::internal::CudaDimHelper<dim>::get(blockDim)) *
+               static_cast<IdxT>(
+                   ::RAJA::internal::CudaDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockDim)) *
-           static_cast<IdxT>(grid_size) ;
+    return static_cast<IdxT>(
+               ::RAJA::internal::CudaDimHelper<dim>::get(blockDim)) *
+           static_cast<IdxT>(grid_size);
   }
 };
+
 /// with dynamic block size and fixed grid size of 1
 template<named_dim dim>
 struct IndexGlobal<dim, named_usage::unspecified, 1>
 {
   static constexpr int block_size = named_usage::unspecified;
-  static constexpr int grid_size = 1;
+  static constexpr int grid_size  = 1;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(threadIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::CudaDimHelper<dim>::get(threadIdx));
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockDim)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::CudaDimHelper<dim>::get(blockDim));
   }
 };
 
@@ -803,40 +845,46 @@ struct IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>
   static_assert(BLOCK_SIZE > 0, "block size must not be negative");
 
   static constexpr int block_size = BLOCK_SIZE;
-  static constexpr int grid_size = named_usage::unspecified;
+  static constexpr int grid_size  = named_usage::unspecified;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(threadIdx)) +
+    return static_cast<IdxT>(
+               ::RAJA::internal::CudaDimHelper<dim>::get(threadIdx)) +
            static_cast<IdxT>(block_size) *
-           static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockIdx)) ;
+               static_cast<IdxT>(
+                   ::RAJA::internal::CudaDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
     return static_cast<IdxT>(block_size) *
-           static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(gridDim)) ;
+           static_cast<IdxT>(
+               ::RAJA::internal::CudaDimHelper<dim>::get(gridDim));
   }
 };
+
 /// with fixed block size of 1 and dynamic grid size
 template<named_dim dim>
 struct IndexGlobal<dim, 1, named_usage::unspecified>
 {
   static constexpr int block_size = 1;
-  static constexpr int grid_size = named_usage::unspecified;
+  static constexpr int grid_size  = named_usage::unspecified;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::CudaDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(gridDim)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::CudaDimHelper<dim>::get(gridDim));
   }
 };
 
@@ -845,21 +893,26 @@ template<named_dim dim>
 struct IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>
 {
   static constexpr int block_size = named_usage::unspecified;
-  static constexpr int grid_size = named_usage::unspecified;
+  static constexpr int grid_size  = named_usage::unspecified;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(threadIdx)) +
-           static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockDim)) *
-           static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockIdx)) ;
+    return static_cast<IdxT>(
+               ::RAJA::internal::CudaDimHelper<dim>::get(threadIdx)) +
+           static_cast<IdxT>(
+               ::RAJA::internal::CudaDimHelper<dim>::get(blockDim)) *
+               static_cast<IdxT>(
+                   ::RAJA::internal::CudaDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockDim)) *
-           static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(gridDim)) ;
+    return static_cast<IdxT>(
+               ::RAJA::internal::CudaDimHelper<dim>::get(blockDim)) *
+           static_cast<IdxT>(
+               ::RAJA::internal::CudaDimHelper<dim>::get(gridDim));
   }
 };
 
@@ -871,56 +924,61 @@ struct IndexGlobal<dim, named_usage::ignored, GRID_SIZE>
   static_assert(GRID_SIZE > 0, "grid size must not be negative");
 
   static constexpr int block_size = named_usage::ignored;
-  static constexpr int grid_size = GRID_SIZE;
+  static constexpr int grid_size  = GRID_SIZE;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::CudaDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static constexpr IdxT size()
   {
-    return static_cast<IdxT>(grid_size) ;
+    return static_cast<IdxT>(grid_size);
   }
 };
+
 /// with fixed grid sized of 1
 template<named_dim dim>
 struct IndexGlobal<dim, named_usage::ignored, 1>
 {
   static constexpr int block_size = named_usage::ignored;
-  static constexpr int grid_size = 1;
+  static constexpr int grid_size  = 1;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(0) ;
+    return static_cast<IdxT>(0);
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(1) ;
+    return static_cast<IdxT>(1);
   }
 };
+
 /// with dynamic grid size
 template<named_dim dim>
 struct IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>
 {
   static constexpr int block_size = named_usage::ignored;
-  static constexpr int grid_size = named_usage::unspecified;
+  static constexpr int grid_size  = named_usage::unspecified;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::CudaDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(gridDim)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::CudaDimHelper<dim>::get(gridDim));
   }
 };
 
@@ -932,56 +990,61 @@ struct IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>
   static_assert(BLOCK_SIZE > 0, "block size must not be negative");
 
   static constexpr int block_size = BLOCK_SIZE;
-  static constexpr int grid_size = named_usage::ignored;
+  static constexpr int grid_size  = named_usage::ignored;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(threadIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::CudaDimHelper<dim>::get(threadIdx));
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static constexpr IdxT size()
   {
-    return static_cast<IdxT>(block_size) ;
+    return static_cast<IdxT>(block_size);
   }
 };
+
 /// with fixed block size of 1
 template<named_dim dim>
 struct IndexGlobal<dim, 1, named_usage::ignored>
 {
   static constexpr int block_size = 1;
-  static constexpr int grid_size = named_usage::ignored;
+  static constexpr int grid_size  = named_usage::ignored;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(0) ;
+    return static_cast<IdxT>(0);
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(1) ;
+    return static_cast<IdxT>(1);
   }
 };
+
 /// with dynamic block size
 template<named_dim dim>
 struct IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>
 {
   static constexpr int block_size = named_usage::unspecified;
-  static constexpr int grid_size = named_usage::ignored;
+  static constexpr int grid_size  = named_usage::ignored;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(threadIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::CudaDimHelper<dim>::get(threadIdx));
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(::RAJA::internal::CudaDimHelper<dim>::get(blockDim)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::CudaDimHelper<dim>::get(blockDim));
   }
 };
 
@@ -991,18 +1054,18 @@ template<named_dim dim>
 struct IndexGlobal<dim, named_usage::ignored, named_usage::ignored>
 {
   static constexpr int block_size = named_usage::ignored;
-  static constexpr int grid_size = named_usage::ignored;
+  static constexpr int grid_size  = named_usage::ignored;
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(0) ;
+    return static_cast<IdxT>(0);
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(1) ;
+    return static_cast<IdxT>(1);
   }
 };
 
@@ -1011,19 +1074,18 @@ template<typename x_index>
 struct IndexFlatten<x_index>
 {
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
 
     return x_index::template index<IdxT>();
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return  x_index::template size<IdxT>();
+    return x_index::template size<IdxT>();
   }
-
 };
 
 // useful for flatten global index (includes x,y)
@@ -1031,21 +1093,19 @@ template<typename x_index, typename y_index>
 struct IndexFlatten<x_index, y_index>
 {
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
 
     return x_index::template index<IdxT>() +
-      x_index::template size<IdxT>() * ( y_index::template index<IdxT>());
-
+           x_index::template size<IdxT>() * (y_index::template index<IdxT>());
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return  x_index::template size<IdxT>() * y_index::template size<IdxT> ();
+    return x_index::template size<IdxT>() * y_index::template size<IdxT>();
   }
-
 };
 
 // useful for flatten global index (includes x,y,z)
@@ -1053,49 +1113,52 @@ template<typename x_index, typename y_index, typename z_index>
 struct IndexFlatten<x_index, y_index, z_index>
 {
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
 
     return x_index::template index<IdxT>() +
-      x_index::template size<IdxT>() * ( y_index::template index<IdxT>() +
-                                         y_index::template size<IdxT>() * z_index::template index<IdxT>());
+           x_index::template size<IdxT>() *
+               (y_index::template index<IdxT>() +
+                y_index::template size<IdxT>() *
+                    z_index::template index<IdxT>());
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return  x_index::template size<IdxT>() * y_index::template size<IdxT> () * z_index::template size<IdxT> ();
+    return x_index::template size<IdxT>() * y_index::template size<IdxT>() *
+           z_index::template size<IdxT>();
   }
-
 };
 
 template<size_t divisor, typename indexer>
 struct IndexDivide
 {
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
     return indexer::template index<IdxT>() / static_cast<IdxT>(divisor);
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return RAJA_DIVIDE_CEILING_INT(indexer::template size<IdxT>(), static_cast<IdxT>(divisor));
+    return RAJA_DIVIDE_CEILING_INT(indexer::template size<IdxT>(),
+                                   static_cast<IdxT>(divisor));
   }
 };
 
 template<size_t divisor, typename indexer>
 struct IndexModulo
 {
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
     return indexer::template index<IdxT>() % static_cast<IdxT>(divisor);
   }
 
-  template < typename IdxT = cuda_dim_member_t >
+  template<typename IdxT = cuda_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
     return static_cast<IdxT>(divisor);
@@ -1104,16 +1167,18 @@ struct IndexModulo
 
 
 // helper to get just the thread indexing part of IndexGlobal
-template < typename index_global >
+template<typename index_global>
 struct get_index_thread;
+
 ///
-template < named_dim dim, int BLOCK_SIZE, int GRID_SIZE >
+template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
 struct get_index_thread<IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>
 {
   using type = IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>;
 };
+
 ///
-template <typename x_index, typename y_index, typename z_index>
+template<typename x_index, typename y_index, typename z_index>
 struct get_index_thread<IndexFlatten<x_index, y_index, z_index>>
 {
   using type = IndexFlatten<typename get_index_thread<x_index>::type,
@@ -1122,16 +1187,18 @@ struct get_index_thread<IndexFlatten<x_index, y_index, z_index>>
 };
 
 // helper to get just the block indexing part of IndexGlobal
-template < typename index_global >
+template<typename index_global>
 struct get_index_block;
+
 ///
-template < named_dim dim, int BLOCK_SIZE, int GRID_SIZE >
+template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
 struct get_index_block<IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>
 {
   using type = IndexGlobal<dim, named_usage::ignored, GRID_SIZE>;
 };
+
 ///
-template <typename x_index, typename y_index, typename z_index>
+template<typename x_index, typename y_index, typename z_index>
 struct get_index_block<IndexFlatten<x_index, y_index, z_index>>
 {
   using type = IndexFlatten<typename get_index_block<x_index>::type,
@@ -1139,86 +1206,83 @@ struct get_index_block<IndexFlatten<x_index, y_index, z_index>>
                             typename get_index_block<z_index>::type>;
 };
 
-
-template <size_t BLOCK_SIZE=named_usage::unspecified>
+template<size_t BLOCK_SIZE = named_usage::unspecified>
 using thread_x = IndexGlobal<named_dim::x, BLOCK_SIZE, named_usage::ignored>;
-template <size_t BLOCK_SIZE=named_usage::unspecified>
+template<size_t BLOCK_SIZE = named_usage::unspecified>
 using thread_y = IndexGlobal<named_dim::y, BLOCK_SIZE, named_usage::ignored>;
-template <size_t BLOCK_SIZE=named_usage::unspecified>
+template<size_t BLOCK_SIZE = named_usage::unspecified>
 using thread_z = IndexGlobal<named_dim::z, BLOCK_SIZE, named_usage::ignored>;
 
-template <size_t BLOCK_SIZE_X=named_usage::unspecified,
-          size_t BLOCK_SIZE_Y=named_usage::unspecified,
-          size_t BLOCK_SIZE_Z=named_usage::unspecified>
+template<size_t BLOCK_SIZE_X = named_usage::unspecified,
+         size_t BLOCK_SIZE_Y = named_usage::unspecified,
+         size_t BLOCK_SIZE_Z = named_usage::unspecified>
 using thread_xyz = IndexFlatten<thread_x<BLOCK_SIZE_X>,
                                 thread_y<BLOCK_SIZE_Y>,
                                 thread_z<BLOCK_SIZE_Z>>;
 
-template <size_t GRID_SIZE=named_usage::unspecified>
+template<size_t GRID_SIZE = named_usage::unspecified>
 using block_x = IndexGlobal<named_dim::x, named_usage::ignored, GRID_SIZE>;
-template <size_t GRID_SIZE=named_usage::unspecified>
+template<size_t GRID_SIZE = named_usage::unspecified>
 using block_y = IndexGlobal<named_dim::y, named_usage::ignored, GRID_SIZE>;
-template <size_t GRID_SIZE=named_usage::unspecified>
+template<size_t GRID_SIZE = named_usage::unspecified>
 using block_z = IndexGlobal<named_dim::z, named_usage::ignored, GRID_SIZE>;
 
-template <size_t GRID_SIZE_X=named_usage::unspecified,
-          size_t GRID_SIZE_Y=named_usage::unspecified,
-          size_t GRID_SIZE_Z=named_usage::unspecified>
+template<size_t GRID_SIZE_X = named_usage::unspecified,
+         size_t GRID_SIZE_Y = named_usage::unspecified,
+         size_t GRID_SIZE_Z = named_usage::unspecified>
 using block_xyz = IndexFlatten<block_x<GRID_SIZE_X>,
                                block_y<GRID_SIZE_Y>,
                                block_z<GRID_SIZE_Z>>;
 
-template <size_t BLOCK_SIZE, size_t GRID_SIZE=named_usage::unspecified>
+template<size_t BLOCK_SIZE, size_t GRID_SIZE = named_usage::unspecified>
 using global_x = IndexGlobal<named_dim::x, BLOCK_SIZE, GRID_SIZE>;
-template <size_t BLOCK_SIZE, size_t GRID_SIZE=named_usage::unspecified>
+template<size_t BLOCK_SIZE, size_t GRID_SIZE = named_usage::unspecified>
 using global_y = IndexGlobal<named_dim::y, BLOCK_SIZE, GRID_SIZE>;
-template <size_t BLOCK_SIZE, size_t GRID_SIZE=named_usage::unspecified>
+template<size_t BLOCK_SIZE, size_t GRID_SIZE = named_usage::unspecified>
 using global_z = IndexGlobal<named_dim::z, BLOCK_SIZE, GRID_SIZE>;
 
 
-template <size_t BLOCK_SIZE_X,
-          size_t BLOCK_SIZE_Y,
-          size_t BLOCK_SIZE_Z,
-          size_t GRID_SIZE_X=named_usage::unspecified,
-          size_t GRID_SIZE_Y=named_usage::unspecified,
-          size_t GRID_SIZE_Z=named_usage::unspecified>
+template<size_t BLOCK_SIZE_X,
+         size_t BLOCK_SIZE_Y,
+         size_t BLOCK_SIZE_Z,
+         size_t GRID_SIZE_X = named_usage::unspecified,
+         size_t GRID_SIZE_Y = named_usage::unspecified,
+         size_t GRID_SIZE_Z = named_usage::unspecified>
 using global_xyz = IndexFlatten<global_x<BLOCK_SIZE_X, GRID_SIZE_X>,
                                 global_y<BLOCK_SIZE_Y, GRID_SIZE_Y>,
                                 global_z<BLOCK_SIZE_Z, GRID_SIZE_Z>>;
 
 
-template <size_t WARP_SIZE=RAJA::policy::cuda::device_constants.WARP_SIZE,
-          size_t BLOCK_SIZE_X=named_usage::unspecified,
-          size_t BLOCK_SIZE_Y=named_usage::unspecified,
-          size_t BLOCK_SIZE_Z=named_usage::unspecified>
-using warp_xyz = IndexDivide<WARP_SIZE,
-                             thread_xyz<BLOCK_SIZE_X,
-                                        BLOCK_SIZE_Y,
-                                        BLOCK_SIZE_Z>>;
-
-template <size_t WARP_SIZE=RAJA::policy::cuda::device_constants.WARP_SIZE,
-          size_t BLOCK_SIZE_X=named_usage::unspecified,
-          size_t BLOCK_SIZE_Y=named_usage::unspecified,
-          size_t BLOCK_SIZE_Z=named_usage::unspecified,
-          size_t GRID_SIZE_X=named_usage::unspecified,
-          size_t GRID_SIZE_Y=named_usage::unspecified,
-          size_t GRID_SIZE_Z=named_usage::unspecified>
-using warp_global_xyz = IndexFlatten<warp_xyz<WARP_SIZE,
-                                              BLOCK_SIZE_X,
-                                              BLOCK_SIZE_Y,
-                                              BLOCK_SIZE_Z>,
-                                     block_xyz<GRID_SIZE_X,
-                                               GRID_SIZE_Y,
-                                               GRID_SIZE_Z>>;
-
-} // namespace cuda
+template<size_t WARP_SIZE    = RAJA::policy::cuda::device_constants.WARP_SIZE,
+         size_t BLOCK_SIZE_X = named_usage::unspecified,
+         size_t BLOCK_SIZE_Y = named_usage::unspecified,
+         size_t BLOCK_SIZE_Z = named_usage::unspecified>
+using warp_xyz =
+    IndexDivide<WARP_SIZE,
+                thread_xyz<BLOCK_SIZE_X, BLOCK_SIZE_Y, BLOCK_SIZE_Z>>;
+
+template<size_t WARP_SIZE    = RAJA::policy::cuda::device_constants.WARP_SIZE,
+         size_t BLOCK_SIZE_X = named_usage::unspecified,
+         size_t BLOCK_SIZE_Y = named_usage::unspecified,
+         size_t BLOCK_SIZE_Z = named_usage::unspecified,
+         size_t GRID_SIZE_X  = named_usage::unspecified,
+         size_t GRID_SIZE_Y  = named_usage::unspecified,
+         size_t GRID_SIZE_Z  = named_usage::unspecified>
+using warp_global_xyz =
+    IndexFlatten<warp_xyz<WARP_SIZE, BLOCK_SIZE_X, BLOCK_SIZE_Y, BLOCK_SIZE_Z>,
+                 block_xyz<GRID_SIZE_X, GRID_SIZE_Y, GRID_SIZE_Z>>;
+
+}  // namespace cuda
 
 // contretizers used in forall, scan, and sort policies
 
-using CudaAvoidDeviceMaxThreadOccupancyConcretizer = cuda::AvoidDeviceMaxThreadOccupancyConcretizer<cuda::FractionOffsetOccupancyConcretizer<Fraction<size_t, 1, 1>, -1>>;
+using CudaAvoidDeviceMaxThreadOccupancyConcretizer =
+    cuda::AvoidDeviceMaxThreadOccupancyConcretizer<
+        cuda::FractionOffsetOccupancyConcretizer<Fraction<size_t, 1, 1>, -1>>;
 
-template < typename Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET >
-using CudaFractionOffsetOccupancyConcretizer = cuda::FractionOffsetOccupancyConcretizer<Fraction, BLOCKS_PER_SM_OFFSET>;
+template<typename Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET>
+using CudaFractionOffsetOccupancyConcretizer =
+    cuda::FractionOffsetOccupancyConcretizer<Fraction, BLOCKS_PER_SM_OFFSET>;
 
 using CudaMaxOccupancyConcretizer = cuda::MaxOccupancyConcretizer;
 
@@ -1228,179 +1292,286 @@ using CudaDefaultConcretizer = CudaMaxOccupancyConcretizer;
 
 // policies usable with forall, scan, and sort
 
-template <size_t BLOCK_SIZE, size_t GRID_SIZE, size_t BLOCKS_PER_SM, bool Async = false>
+template<size_t BLOCK_SIZE,
+         size_t GRID_SIZE,
+         size_t BLOCKS_PER_SM,
+         bool Async = false>
 using cuda_exec_grid_explicit = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE, GRID_SIZE>,
-    CudaDefaultConcretizer, BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE, GRID_SIZE>,
+    CudaDefaultConcretizer,
+    BLOCKS_PER_SM,
+    Async>;
 
-template <size_t BLOCK_SIZE, size_t GRID_SIZE, size_t BLOCKS_PER_SM>
+template<size_t BLOCK_SIZE, size_t GRID_SIZE, size_t BLOCKS_PER_SM>
 using cuda_exec_grid_explicit_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE, GRID_SIZE>,
-    CudaDefaultConcretizer, BLOCKS_PER_SM, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE, GRID_SIZE>,
+    CudaDefaultConcretizer,
+    BLOCKS_PER_SM,
+    true>;
 
-template <size_t BLOCK_SIZE, size_t GRID_SIZE, bool Async = false>
+template<size_t BLOCK_SIZE, size_t GRID_SIZE, bool Async = false>
 using cuda_exec_grid = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE, GRID_SIZE>,
-    CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE, GRID_SIZE>,
+    CudaDefaultConcretizer,
+    policy::cuda::MIN_BLOCKS_PER_SM,
+    Async>;
 
-template <size_t BLOCK_SIZE, size_t GRID_SIZE>
+template<size_t BLOCK_SIZE, size_t GRID_SIZE>
 using cuda_exec_grid_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE, GRID_SIZE>,
-    CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>;
-
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async = false>
-using cuda_exec_explicit = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::Direct, cuda::global_x<BLOCK_SIZE>,
-    CudaDefaultConcretizer, BLOCKS_PER_SM, Async>;
-
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM>
-using cuda_exec_explicit_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::Direct, cuda::global_x<BLOCK_SIZE>,
-    CudaDefaultConcretizer, BLOCKS_PER_SM, true>;
-
-template <size_t BLOCK_SIZE, bool Async = false>
-using cuda_exec = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::Direct, cuda::global_x<BLOCK_SIZE>,
-    CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>;
-
-template <size_t BLOCK_SIZE>
-using cuda_exec_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::Direct, cuda::global_x<BLOCK_SIZE>,
-    CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>;
-
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async = false>
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE, GRID_SIZE>,
+    CudaDefaultConcretizer,
+    policy::cuda::MIN_BLOCKS_PER_SM,
+    true>;
+
+template<size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async = false>
+using cuda_exec_explicit =
+    policy::cuda::cuda_exec_explicit<iteration_mapping::Direct,
+                                     cuda::global_x<BLOCK_SIZE>,
+                                     CudaDefaultConcretizer,
+                                     BLOCKS_PER_SM,
+                                     Async>;
+
+template<size_t BLOCK_SIZE, size_t BLOCKS_PER_SM>
+using cuda_exec_explicit_async =
+    policy::cuda::cuda_exec_explicit<iteration_mapping::Direct,
+                                     cuda::global_x<BLOCK_SIZE>,
+                                     CudaDefaultConcretizer,
+                                     BLOCKS_PER_SM,
+                                     true>;
+
+template<size_t BLOCK_SIZE, bool Async = false>
+using cuda_exec =
+    policy::cuda::cuda_exec_explicit<iteration_mapping::Direct,
+                                     cuda::global_x<BLOCK_SIZE>,
+                                     CudaDefaultConcretizer,
+                                     policy::cuda::MIN_BLOCKS_PER_SM,
+                                     Async>;
+
+template<size_t BLOCK_SIZE>
+using cuda_exec_async =
+    policy::cuda::cuda_exec_explicit<iteration_mapping::Direct,
+                                     cuda::global_x<BLOCK_SIZE>,
+                                     CudaDefaultConcretizer,
+                                     policy::cuda::MIN_BLOCKS_PER_SM,
+                                     true>;
+
+template<size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async = false>
 using cuda_exec_occ_calc_explicit = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaDefaultConcretizer, BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaDefaultConcretizer,
+    BLOCKS_PER_SM,
+    Async>;
 
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM>
+template<size_t BLOCK_SIZE, size_t BLOCKS_PER_SM>
 using cuda_exec_occ_calc_explicit_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaDefaultConcretizer, BLOCKS_PER_SM, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaDefaultConcretizer,
+    BLOCKS_PER_SM,
+    true>;
 
-template <size_t BLOCK_SIZE, bool Async = false>
+template<size_t BLOCK_SIZE, bool Async = false>
 using cuda_exec_occ_calc = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaDefaultConcretizer,
+    policy::cuda::MIN_BLOCKS_PER_SM,
+    Async>;
 
-template <size_t BLOCK_SIZE>
+template<size_t BLOCK_SIZE>
 using cuda_exec_occ_calc_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaDefaultConcretizer,
+    policy::cuda::MIN_BLOCKS_PER_SM,
+    true>;
 
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async = false>
+template<size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async = false>
 using cuda_exec_occ_max_explicit = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaMaxOccupancyConcretizer, BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaMaxOccupancyConcretizer,
+    BLOCKS_PER_SM,
+    Async>;
 
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM>
+template<size_t BLOCK_SIZE, size_t BLOCKS_PER_SM>
 using cuda_exec_occ_max_explicit_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaMaxOccupancyConcretizer, BLOCKS_PER_SM, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaMaxOccupancyConcretizer,
+    BLOCKS_PER_SM,
+    true>;
 
-template <size_t BLOCK_SIZE, bool Async = false>
+template<size_t BLOCK_SIZE, bool Async = false>
 using cuda_exec_occ_max = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaMaxOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaMaxOccupancyConcretizer,
+    policy::cuda::MIN_BLOCKS_PER_SM,
+    Async>;
 
-template <size_t BLOCK_SIZE>
+template<size_t BLOCK_SIZE>
 using cuda_exec_occ_max_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaMaxOccupancyConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>;
-
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, typename Fraction, bool Async = false>
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaMaxOccupancyConcretizer,
+    policy::cuda::MIN_BLOCKS_PER_SM,
+    true>;
+
+template<size_t BLOCK_SIZE,
+         size_t BLOCKS_PER_SM,
+         typename Fraction,
+         bool Async = false>
 using cuda_exec_occ_fraction_explicit = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaFractionOffsetOccupancyConcretizer<Fraction, 0>, BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaFractionOffsetOccupancyConcretizer<Fraction, 0>,
+    BLOCKS_PER_SM,
+    Async>;
 
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, typename Fraction>
+template<size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, typename Fraction>
 using cuda_exec_occ_fraction_explicit_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaFractionOffsetOccupancyConcretizer<Fraction, 0>, BLOCKS_PER_SM, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaFractionOffsetOccupancyConcretizer<Fraction, 0>,
+    BLOCKS_PER_SM,
+    true>;
 
-template <size_t BLOCK_SIZE, typename Fraction, bool Async = false>
+template<size_t BLOCK_SIZE, typename Fraction, bool Async = false>
 using cuda_exec_occ_fraction = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaFractionOffsetOccupancyConcretizer<Fraction, 0>, policy::cuda::MIN_BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaFractionOffsetOccupancyConcretizer<Fraction, 0>,
+    policy::cuda::MIN_BLOCKS_PER_SM,
+    Async>;
 
-template <size_t BLOCK_SIZE, typename Fraction>
+template<size_t BLOCK_SIZE, typename Fraction>
 using cuda_exec_occ_fraction_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaFractionOffsetOccupancyConcretizer<Fraction, 0>, policy::cuda::MIN_BLOCKS_PER_SM, true>;
-
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, typename Concretizer, bool Async = false>
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaFractionOffsetOccupancyConcretizer<Fraction, 0>,
+    policy::cuda::MIN_BLOCKS_PER_SM,
+    true>;
+
+template<size_t BLOCK_SIZE,
+         size_t BLOCKS_PER_SM,
+         typename Concretizer,
+         bool Async = false>
 using cuda_exec_occ_custom_explicit = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    Concretizer, BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    Concretizer,
+    BLOCKS_PER_SM,
+    Async>;
 
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, typename Concretizer>
+template<size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, typename Concretizer>
 using cuda_exec_occ_custom_explicit_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    Concretizer, BLOCKS_PER_SM, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    Concretizer,
+    BLOCKS_PER_SM,
+    true>;
 
-template <size_t BLOCK_SIZE, typename Concretizer, bool Async = false>
+template<size_t BLOCK_SIZE, typename Concretizer, bool Async = false>
 using cuda_exec_occ_custom = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    Concretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    Concretizer,
+    policy::cuda::MIN_BLOCKS_PER_SM,
+    Async>;
 
-template <size_t BLOCK_SIZE, typename Concretizer>
+template<size_t BLOCK_SIZE, typename Concretizer>
 using cuda_exec_occ_custom_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    Concretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    Concretizer,
+    policy::cuda::MIN_BLOCKS_PER_SM,
+    true>;
 
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async = false>
+template<size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async = false>
 using cuda_exec_with_reduce_explicit = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaReduceDefaultConcretizer, BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaReduceDefaultConcretizer,
+    BLOCKS_PER_SM,
+    Async>;
 
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM>
+template<size_t BLOCK_SIZE, size_t BLOCKS_PER_SM>
 using cuda_exec_with_reduce_explicit_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaReduceDefaultConcretizer, BLOCKS_PER_SM, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaReduceDefaultConcretizer,
+    BLOCKS_PER_SM,
+    true>;
 
-template <size_t BLOCK_SIZE, bool Async = false>
+template<size_t BLOCK_SIZE, bool Async = false>
 using cuda_exec_with_reduce = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaReduceDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaReduceDefaultConcretizer,
+    policy::cuda::MIN_BLOCKS_PER_SM,
+    Async>;
 
-template <size_t BLOCK_SIZE>
+template<size_t BLOCK_SIZE>
 using cuda_exec_with_reduce_async = policy::cuda::cuda_exec_explicit<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, cuda::global_x<BLOCK_SIZE>,
-    CudaReduceDefaultConcretizer, policy::cuda::MIN_BLOCKS_PER_SM, true>;
-
-template <bool with_reduce, size_t BLOCK_SIZE, size_t BLOCKS_PER_SM, bool Async = false>
-using cuda_exec_base_explicit = std::conditional_t<with_reduce,
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    cuda::global_x<BLOCK_SIZE>,
+    CudaReduceDefaultConcretizer,
+    policy::cuda::MIN_BLOCKS_PER_SM,
+    true>;
+
+template<bool with_reduce,
+         size_t BLOCK_SIZE,
+         size_t BLOCKS_PER_SM,
+         bool Async = false>
+using cuda_exec_base_explicit = std::conditional_t<
+    with_reduce,
     cuda_exec_with_reduce_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>,
     cuda_exec_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>>;
 
-template <bool with_reduce, size_t BLOCK_SIZE, size_t BLOCKS_PER_SM>
-using cuda_exec_base_explicit_async = std::conditional_t<with_reduce,
+template<bool with_reduce, size_t BLOCK_SIZE, size_t BLOCKS_PER_SM>
+using cuda_exec_base_explicit_async = std::conditional_t<
+    with_reduce,
     cuda_exec_with_reduce_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>,
     cuda_exec_explicit_async<BLOCK_SIZE, BLOCKS_PER_SM>>;
 
-template <bool with_reduce, size_t BLOCK_SIZE, bool Async = false>
-using cuda_exec_base = std::conditional_t<with_reduce,
-    cuda_exec_with_reduce<BLOCK_SIZE, Async>,
-    cuda_exec<BLOCK_SIZE, Async>>;
+template<bool with_reduce, size_t BLOCK_SIZE, bool Async = false>
+using cuda_exec_base =
+    std::conditional_t<with_reduce,
+                       cuda_exec_with_reduce<BLOCK_SIZE, Async>,
+                       cuda_exec<BLOCK_SIZE, Async>>;
 
-template <bool with_reduce, size_t BLOCK_SIZE>
-using cuda_exec_base_async = std::conditional_t<with_reduce,
-    cuda_exec_with_reduce_async<BLOCK_SIZE>,
-    cuda_exec_async<BLOCK_SIZE>>;
+template<bool with_reduce, size_t BLOCK_SIZE>
+using cuda_exec_base_async =
+    std::conditional_t<with_reduce,
+                       cuda_exec_with_reduce_async<BLOCK_SIZE>,
+                       cuda_exec_async<BLOCK_SIZE>>;
 
 
 // policies usable with WorkGroup
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM, bool Async = false>
-using cuda_work_explicit = policy::cuda::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>;
+template<size_t BLOCK_SIZE,
+         size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM,
+         bool Async           = false>
+using cuda_work_explicit =
+    policy::cuda::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, Async>;
 
-template <size_t BLOCK_SIZE, size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM>
-using cuda_work_explicit_async = policy::cuda::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, true>;
+template<size_t BLOCK_SIZE,
+         size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM>
+using cuda_work_explicit_async =
+    policy::cuda::cuda_work_explicit<BLOCK_SIZE, BLOCKS_PER_SM, true>;
 
-template <size_t BLOCK_SIZE, bool Async = false>
-using cuda_work = policy::cuda::cuda_work_explicit<BLOCK_SIZE, policy::cuda::MIN_BLOCKS_PER_SM, Async>;
+template<size_t BLOCK_SIZE, bool Async = false>
+using cuda_work = policy::cuda::
+    cuda_work_explicit<BLOCK_SIZE, policy::cuda::MIN_BLOCKS_PER_SM, Async>;
 
-template <size_t BLOCK_SIZE>
-using cuda_work_async = policy::cuda::cuda_work_explicit<BLOCK_SIZE, policy::cuda::MIN_BLOCKS_PER_SM, true>;
+template<size_t BLOCK_SIZE>
+using cuda_work_async = policy::cuda::
+    cuda_work_explicit<BLOCK_SIZE, policy::cuda::MIN_BLOCKS_PER_SM, true>;
 
 using policy::cuda::unordered_cuda_loop_y_block_iter_x_threadblock_average;
 
@@ -1410,10 +1581,10 @@ using policy::cuda::cuda_atomic_explicit;
 
 
 // policies usable with reducers
-template < cuda::reduce_algorithm algorithm,
-           cuda::block_communication_mode comm_mode,
-           size_t replication = named_usage::unspecified,
-           size_t atomic_stride = named_usage::unspecified >
+template<cuda::reduce_algorithm algorithm,
+         cuda::block_communication_mode comm_mode,
+         size_t replication   = named_usage::unspecified,
+         size_t atomic_stride = named_usage::unspecified>
 using cuda_reduce_tuning = policy::cuda::cuda_reduce_policy<
     cuda::ReduceTuning<algorithm, comm_mode, replication, atomic_stride>>;
 
@@ -1436,35 +1607,41 @@ using cuda_reduce_tuning = policy::cuda::cuda_reduce_policy<
 //                 a cache shared by the whole device to avoid having to use
 //                 device scope fences. This improves performance on some HW but
 //                 is more difficult to code correctly.
-using cuda_reduce_device_fence = cuda_reduce_tuning<
-    cuda::reduce_algorithm::combine_last_block,
-    cuda::block_communication_mode::device_fence,
-    named_usage::unspecified, named_usage::unspecified>;
+using cuda_reduce_device_fence =
+    cuda_reduce_tuning<cuda::reduce_algorithm::combine_last_block,
+                       cuda::block_communication_mode::device_fence,
+                       named_usage::unspecified,
+                       named_usage::unspecified>;
 ///
-using cuda_reduce_block_fence = cuda_reduce_tuning<
-    cuda::reduce_algorithm::combine_last_block,
-    cuda::block_communication_mode::block_fence,
-    named_usage::unspecified, named_usage::unspecified>;
+using cuda_reduce_block_fence =
+    cuda_reduce_tuning<cuda::reduce_algorithm::combine_last_block,
+                       cuda::block_communication_mode::block_fence,
+                       named_usage::unspecified,
+                       named_usage::unspecified>;
 ///
-using cuda_reduce_atomic_device_init_device_fence = cuda_reduce_tuning<
-    cuda::reduce_algorithm::init_device_combine_atomic_block,
-    cuda::block_communication_mode::device_fence,
-    named_usage::unspecified, named_usage::unspecified>;
+using cuda_reduce_atomic_device_init_device_fence =
+    cuda_reduce_tuning<cuda::reduce_algorithm::init_device_combine_atomic_block,
+                       cuda::block_communication_mode::device_fence,
+                       named_usage::unspecified,
+                       named_usage::unspecified>;
 ///
-using cuda_reduce_atomic_device_init_block_fence = cuda_reduce_tuning<
-    cuda::reduce_algorithm::init_device_combine_atomic_block,
-    cuda::block_communication_mode::block_fence,
-    named_usage::unspecified, named_usage::unspecified>;
+using cuda_reduce_atomic_device_init_block_fence =
+    cuda_reduce_tuning<cuda::reduce_algorithm::init_device_combine_atomic_block,
+                       cuda::block_communication_mode::block_fence,
+                       named_usage::unspecified,
+                       named_usage::unspecified>;
 ///
-using cuda_reduce_atomic_host_init_device_fence = cuda_reduce_tuning<
-    cuda::reduce_algorithm::init_host_combine_atomic_block,
-    cuda::block_communication_mode::device_fence,
-    named_usage::unspecified, named_usage::unspecified>;
+using cuda_reduce_atomic_host_init_device_fence =
+    cuda_reduce_tuning<cuda::reduce_algorithm::init_host_combine_atomic_block,
+                       cuda::block_communication_mode::device_fence,
+                       named_usage::unspecified,
+                       named_usage::unspecified>;
 ///
-using cuda_reduce_atomic_host_init_block_fence = cuda_reduce_tuning<
-    cuda::reduce_algorithm::init_host_combine_atomic_block,
-    cuda::block_communication_mode::block_fence,
-    named_usage::unspecified, named_usage::unspecified>;
+using cuda_reduce_atomic_host_init_block_fence =
+    cuda_reduce_tuning<cuda::reduce_algorithm::init_host_combine_atomic_block,
+                       cuda::block_communication_mode::block_fence,
+                       named_usage::unspecified,
+                       named_usage::unspecified>;
 
 // Policy for RAJA::Reduce* objects that gives the same answer every time when
 // used in the same way
@@ -1476,25 +1653,26 @@ using cuda_reduce_atomic = cuda_reduce_atomic_host_init_device_fence;
 
 // Policy for RAJA::Reduce* objects that lets you select the default atomic or
 // non-atomic policy with a bool
-template < bool with_atomic >
-using cuda_reduce_base = std::conditional_t<with_atomic, cuda_reduce_atomic, cuda_reduce>;
+template<bool with_atomic>
+using cuda_reduce_base =
+    std::conditional_t<with_atomic, cuda_reduce_atomic, cuda_reduce>;
 
 
 // policies usable with multi_reducers
-template < cuda::multi_reduce_algorithm algorithm,
-           typename SharedAtomicReplicationConcretizer,
-           typename SharedAtomicReplicationIndexer,
-           typename GlobalAtomicReplicationConcretizer,
-           typename GlobalAtomicReplicationIndexer >
-using cuda_multi_reduce_tuning = policy::cuda::cuda_multi_reduce_policy<
-    cuda::MultiReduceTuning<
-      algorithm,
-      cuda::AtomicReplicationTuning<SharedAtomicReplicationConcretizer,
-                                    SharedAtomicReplicationIndexer,
-                                    GetOffsetRight<int>>,
-      cuda::AtomicReplicationTuning<GlobalAtomicReplicationConcretizer,
-                                    GlobalAtomicReplicationIndexer,
-                                    GetOffsetLeft<int>>>>;
+template<cuda::multi_reduce_algorithm algorithm,
+         typename SharedAtomicReplicationConcretizer,
+         typename SharedAtomicReplicationIndexer,
+         typename GlobalAtomicReplicationConcretizer,
+         typename GlobalAtomicReplicationIndexer>
+using cuda_multi_reduce_tuning =
+    policy::cuda::cuda_multi_reduce_policy<cuda::MultiReduceTuning<
+        algorithm,
+        cuda::AtomicReplicationTuning<SharedAtomicReplicationConcretizer,
+                                      SharedAtomicReplicationIndexer,
+                                      GetOffsetRight<int>>,
+        cuda::AtomicReplicationTuning<GlobalAtomicReplicationConcretizer,
+                                      GlobalAtomicReplicationIndexer,
+                                      GetOffsetLeft<int>>>>;
 
 // Policies for RAJA::MultiReduce* objects with specific behaviors.
 // - *atomic* policies may use atomics to combine partial results. The
@@ -1508,44 +1686,51 @@ using cuda_multi_reduce_tuning = policy::cuda::cuda_multi_reduce_policy<
 //   This is faster overall than other policies on HW with direct host access
 //   to device memory such as the IBM power 9 + Nvidia V100 Sierra/Lassen
 //   systems.
-using cuda_multi_reduce_atomic_block_then_atomic_grid_host_init = cuda_multi_reduce_tuning<
-    cuda::multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic,
-    cuda::SharedAtomicReplicationMaxPow2Concretizer<
-        cuda::ConstantPreferredReplicationConcretizer<16>>,
-    cuda::thread_xyz<>,
-    cuda::GlobalAtomicReplicationMinPow2Concretizer<
-        cuda::ConstantPreferredReplicationConcretizer<2>>,
-    cuda::warp_global_xyz<>>;
+using cuda_multi_reduce_atomic_block_then_atomic_grid_host_init =
+    cuda_multi_reduce_tuning<
+        cuda::multi_reduce_algorithm::
+            init_host_combine_block_atomic_then_grid_atomic,
+        cuda::SharedAtomicReplicationMaxPow2Concretizer<
+            cuda::ConstantPreferredReplicationConcretizer<16>>,
+        cuda::thread_xyz<>,
+        cuda::GlobalAtomicReplicationMinPow2Concretizer<
+            cuda::ConstantPreferredReplicationConcretizer<2>>,
+        cuda::warp_global_xyz<>>;
 // special policy to test that multi-reducers work if there is not enough shmem
-using cuda_multi_reduce_atomic_block_then_atomic_grid_host_init_fallback_testing = cuda_multi_reduce_tuning<
-    cuda::multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic,
-    cuda::SharedAtomicReplicationMaxPow2Concretizer<
-        cuda::ConstantPreferredReplicationConcretizer<0>>,
-    cuda::thread_xyz<>,
-    cuda::GlobalAtomicReplicationMinPow2Concretizer<
-        cuda::ConstantPreferredReplicationConcretizer<2>>,
-    cuda::warp_global_xyz<>>;
+using cuda_multi_reduce_atomic_block_then_atomic_grid_host_init_fallback_testing =
+    cuda_multi_reduce_tuning<
+        cuda::multi_reduce_algorithm::
+            init_host_combine_block_atomic_then_grid_atomic,
+        cuda::SharedAtomicReplicationMaxPow2Concretizer<
+            cuda::ConstantPreferredReplicationConcretizer<0>>,
+        cuda::thread_xyz<>,
+        cuda::GlobalAtomicReplicationMinPow2Concretizer<
+            cuda::ConstantPreferredReplicationConcretizer<2>>,
+        cuda::warp_global_xyz<>>;
 //
 using cuda_multi_reduce_atomic_global_host_init = cuda_multi_reduce_tuning<
     cuda::multi_reduce_algorithm::init_host_combine_global_atomic,
-    void, // unused with this algorithm
-    void, // unused with this algorithm
+    void,  // unused with this algorithm
+    void,  // unused with this algorithm
     cuda::GlobalAtomicReplicationMinPow2Concretizer<
         cuda::ConstantPreferredReplicationConcretizer<2>>,
     cuda::warp_global_xyz<>>;
 //
-using cuda_multi_reduce_atomic_global_no_replication_host_init = cuda_multi_reduce_tuning<
-    cuda::multi_reduce_algorithm::init_host_combine_global_atomic,
-    void, // unused with this algorithm
-    void, // unused with this algorithm
-    cuda::GlobalAtomicReplicationMinPow2Concretizer<
-        cuda::ConstantPreferredReplicationConcretizer<1>>,
-    cuda::block_xyz<>>;
-
-// Policy for RAJA::MultiReduce* objects that may use atomics and may not give the
-// same answer every time when used in the same way
-using cuda_multi_reduce_atomic = cuda_multi_reduce_atomic_block_then_atomic_grid_host_init;
-// Similar to above but optimized for low overhead in cases where it is rarely used
+using cuda_multi_reduce_atomic_global_no_replication_host_init =
+    cuda_multi_reduce_tuning<
+        cuda::multi_reduce_algorithm::init_host_combine_global_atomic,
+        void,  // unused with this algorithm
+        void,  // unused with this algorithm
+        cuda::GlobalAtomicReplicationMinPow2Concretizer<
+            cuda::ConstantPreferredReplicationConcretizer<1>>,
+        cuda::block_xyz<>>;
+
+// Policy for RAJA::MultiReduce* objects that may use atomics and may not give
+// the same answer every time when used in the same way
+using cuda_multi_reduce_atomic =
+    cuda_multi_reduce_atomic_block_then_atomic_grid_host_init;
+// Similar to above but optimized for low overhead in cases where it is rarely
+// used
 using cuda_multi_reduce_atomic_low_performance_low_overhead =
     cuda_multi_reduce_atomic_global_no_replication_host_init;
 
@@ -1573,41 +1758,49 @@ using policy::cuda::cuda_thread_masked_loop;
 using policy::cuda::cuda_synchronize;
 
 // policies usable with launch
-template <bool Async, int num_threads = named_usage::unspecified, size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM>
-using cuda_launch_explicit_t = policy::cuda::cuda_launch_explicit_t<Async, num_threads, BLOCKS_PER_SM>;
-
-//CUDA will emit warnings if we specify BLOCKS_PER_SM but not num of threads
-template <bool Async, int num_threads = named_usage::unspecified>
-using cuda_launch_t = policy::cuda::cuda_launch_explicit_t<Async, num_threads,
-    (num_threads == named_usage::unspecified) ? named_usage::unspecified : policy::cuda::MIN_BLOCKS_PER_SM>;
+template<bool Async,
+         int num_threads      = named_usage::unspecified,
+         size_t BLOCKS_PER_SM = policy::cuda::MIN_BLOCKS_PER_SM>
+using cuda_launch_explicit_t =
+    policy::cuda::cuda_launch_explicit_t<Async, num_threads, BLOCKS_PER_SM>;
+
+// CUDA will emit warnings if we specify BLOCKS_PER_SM but not num of threads
+template<bool Async, int num_threads = named_usage::unspecified>
+using cuda_launch_t =
+    policy::cuda::cuda_launch_explicit_t<Async,
+                                         num_threads,
+                                         (num_threads ==
+                                          named_usage::unspecified)
+                                             ? named_usage::unspecified
+                                             : policy::cuda::MIN_BLOCKS_PER_SM>;
 
 
 // policies usable with kernel and launch
-template < typename ... indexers >
-using cuda_indexer_direct = policy::cuda::cuda_indexer<
-    iteration_mapping::Direct,
-    kernel_sync_requirement::none,
-    indexers...>;
+template<typename... indexers>
+using cuda_indexer_direct =
+    policy::cuda::cuda_indexer<iteration_mapping::Direct,
+                               kernel_sync_requirement::none,
+                               indexers...>;
 
-template < typename ... indexers >
+template<typename... indexers>
 using cuda_indexer_loop = policy::cuda::cuda_indexer<
     iteration_mapping::StridedLoop<named_usage::unspecified>,
     kernel_sync_requirement::none,
     indexers...>;
 
-template < typename ... indexers >
+template<typename... indexers>
 using cuda_indexer_syncable_loop = policy::cuda::cuda_indexer<
     iteration_mapping::StridedLoop<named_usage::unspecified>,
     kernel_sync_requirement::sync,
     indexers...>;
 
-template < typename ... indexers >
-using cuda_flatten_indexer_direct = policy::cuda::cuda_flatten_indexer<
-    iteration_mapping::Direct,
-    kernel_sync_requirement::none,
-    indexers...>;
+template<typename... indexers>
+using cuda_flatten_indexer_direct =
+    policy::cuda::cuda_flatten_indexer<iteration_mapping::Direct,
+                                       kernel_sync_requirement::none,
+                                       indexers...>;
 
-template < typename ... indexers >
+template<typename... indexers>
 using cuda_flatten_indexer_loop = policy::cuda::cuda_flatten_indexer<
     iteration_mapping::StridedLoop<named_usage::unspecified>,
     kernel_sync_requirement::none,
@@ -1620,7 +1813,7 @@ using cuda_flatten_indexer_loop = policy::cuda::cuda_flatten_indexer<
  * For example, a segment of size 2000 will not fit, and trigger a runtime
  * error.
  */
-template < named_dim ... dims >
+template<named_dim... dims>
 using cuda_thread_direct = cuda_indexer_direct<
     cuda::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
 
@@ -1635,22 +1828,28 @@ using cuda_thread_yz_direct = cuda_thread_direct<named_dim::y, named_dim::z>;
 using cuda_thread_zx_direct = cuda_thread_direct<named_dim::z, named_dim::x>;
 using cuda_thread_zy_direct = cuda_thread_direct<named_dim::z, named_dim::y>;
 
-using cuda_thread_xyz_direct = cuda_thread_direct<named_dim::x, named_dim::y, named_dim::z>;
-using cuda_thread_xzy_direct = cuda_thread_direct<named_dim::x, named_dim::z, named_dim::y>;
-using cuda_thread_yxz_direct = cuda_thread_direct<named_dim::y, named_dim::x, named_dim::z>;
-using cuda_thread_yzx_direct = cuda_thread_direct<named_dim::y, named_dim::z, named_dim::x>;
-using cuda_thread_zxy_direct = cuda_thread_direct<named_dim::z, named_dim::x, named_dim::y>;
-using cuda_thread_zyx_direct = cuda_thread_direct<named_dim::z, named_dim::y, named_dim::x>;
+using cuda_thread_xyz_direct =
+    cuda_thread_direct<named_dim::x, named_dim::y, named_dim::z>;
+using cuda_thread_xzy_direct =
+    cuda_thread_direct<named_dim::x, named_dim::z, named_dim::y>;
+using cuda_thread_yxz_direct =
+    cuda_thread_direct<named_dim::y, named_dim::x, named_dim::z>;
+using cuda_thread_yzx_direct =
+    cuda_thread_direct<named_dim::y, named_dim::z, named_dim::x>;
+using cuda_thread_zxy_direct =
+    cuda_thread_direct<named_dim::z, named_dim::x, named_dim::y>;
+using cuda_thread_zyx_direct =
+    cuda_thread_direct<named_dim::z, named_dim::y, named_dim::x>;
 
 /*!
  * Maps segment indices to CUDA threads.
  * Uses block-stride looping to exceed the maximum number of physical threads
  */
-template < named_dim ... dims >
+template<named_dim... dims>
 using cuda_thread_loop = cuda_indexer_loop<
     cuda::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
 
-template < named_dim ... dims >
+template<named_dim... dims>
 using cuda_thread_syncable_loop = cuda_indexer_syncable_loop<
     cuda::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
 
@@ -1665,12 +1864,18 @@ using cuda_thread_yz_loop = cuda_thread_loop<named_dim::y, named_dim::z>;
 using cuda_thread_zx_loop = cuda_thread_loop<named_dim::z, named_dim::x>;
 using cuda_thread_zy_loop = cuda_thread_loop<named_dim::z, named_dim::y>;
 
-using cuda_thread_xyz_loop = cuda_thread_loop<named_dim::x, named_dim::y, named_dim::z>;
-using cuda_thread_xzy_loop = cuda_thread_loop<named_dim::x, named_dim::z, named_dim::y>;
-using cuda_thread_yxz_loop = cuda_thread_loop<named_dim::y, named_dim::x, named_dim::z>;
-using cuda_thread_yzx_loop = cuda_thread_loop<named_dim::y, named_dim::z, named_dim::x>;
-using cuda_thread_zxy_loop = cuda_thread_loop<named_dim::z, named_dim::x, named_dim::y>;
-using cuda_thread_zyx_loop = cuda_thread_loop<named_dim::z, named_dim::y, named_dim::x>;
+using cuda_thread_xyz_loop =
+    cuda_thread_loop<named_dim::x, named_dim::y, named_dim::z>;
+using cuda_thread_xzy_loop =
+    cuda_thread_loop<named_dim::x, named_dim::z, named_dim::y>;
+using cuda_thread_yxz_loop =
+    cuda_thread_loop<named_dim::y, named_dim::x, named_dim::z>;
+using cuda_thread_yzx_loop =
+    cuda_thread_loop<named_dim::y, named_dim::z, named_dim::x>;
+using cuda_thread_zxy_loop =
+    cuda_thread_loop<named_dim::z, named_dim::x, named_dim::y>;
+using cuda_thread_zyx_loop =
+    cuda_thread_loop<named_dim::z, named_dim::y, named_dim::x>;
 
 /*
  * Maps segment indices to flattened CUDA threads.
@@ -1678,7 +1883,7 @@ using cuda_thread_zyx_loop = cuda_thread_loop<named_dim::z, named_dim::y, named_
  * physical threads to fit all of the direct map requests.
  * Reshapes multiple physical threads into a 1D iteration space
  */
-template < named_dim ... dims >
+template<named_dim... dims>
 using cuda_flatten_thread_direct = cuda_flatten_indexer_direct<
     cuda::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
 
@@ -1686,26 +1891,38 @@ using cuda_flatten_thread_x_direct = cuda_flatten_thread_direct<named_dim::x>;
 using cuda_flatten_thread_y_direct = cuda_flatten_thread_direct<named_dim::y>;
 using cuda_flatten_thread_z_direct = cuda_flatten_thread_direct<named_dim::z>;
 
-using cuda_flatten_thread_xy_direct = cuda_flatten_thread_direct<named_dim::x, named_dim::y>;
-using cuda_flatten_thread_xz_direct = cuda_flatten_thread_direct<named_dim::x, named_dim::z>;
-using cuda_flatten_thread_yx_direct = cuda_flatten_thread_direct<named_dim::y, named_dim::x>;
-using cuda_flatten_thread_yz_direct = cuda_flatten_thread_direct<named_dim::y, named_dim::z>;
-using cuda_flatten_thread_zx_direct = cuda_flatten_thread_direct<named_dim::z, named_dim::x>;
-using cuda_flatten_thread_zy_direct = cuda_flatten_thread_direct<named_dim::z, named_dim::y>;
-
-using cuda_flatten_thread_xyz_direct = cuda_flatten_thread_direct<named_dim::x, named_dim::y, named_dim::z>;
-using cuda_flatten_thread_xzy_direct = cuda_flatten_thread_direct<named_dim::x, named_dim::z, named_dim::y>;
-using cuda_flatten_thread_yxz_direct = cuda_flatten_thread_direct<named_dim::y, named_dim::x, named_dim::z>;
-using cuda_flatten_thread_yzx_direct = cuda_flatten_thread_direct<named_dim::y, named_dim::z, named_dim::x>;
-using cuda_flatten_thread_zxy_direct = cuda_flatten_thread_direct<named_dim::z, named_dim::x, named_dim::y>;
-using cuda_flatten_thread_zyx_direct = cuda_flatten_thread_direct<named_dim::z, named_dim::y, named_dim::x>;
+using cuda_flatten_thread_xy_direct =
+    cuda_flatten_thread_direct<named_dim::x, named_dim::y>;
+using cuda_flatten_thread_xz_direct =
+    cuda_flatten_thread_direct<named_dim::x, named_dim::z>;
+using cuda_flatten_thread_yx_direct =
+    cuda_flatten_thread_direct<named_dim::y, named_dim::x>;
+using cuda_flatten_thread_yz_direct =
+    cuda_flatten_thread_direct<named_dim::y, named_dim::z>;
+using cuda_flatten_thread_zx_direct =
+    cuda_flatten_thread_direct<named_dim::z, named_dim::x>;
+using cuda_flatten_thread_zy_direct =
+    cuda_flatten_thread_direct<named_dim::z, named_dim::y>;
+
+using cuda_flatten_thread_xyz_direct =
+    cuda_flatten_thread_direct<named_dim::x, named_dim::y, named_dim::z>;
+using cuda_flatten_thread_xzy_direct =
+    cuda_flatten_thread_direct<named_dim::x, named_dim::z, named_dim::y>;
+using cuda_flatten_thread_yxz_direct =
+    cuda_flatten_thread_direct<named_dim::y, named_dim::x, named_dim::z>;
+using cuda_flatten_thread_yzx_direct =
+    cuda_flatten_thread_direct<named_dim::y, named_dim::z, named_dim::x>;
+using cuda_flatten_thread_zxy_direct =
+    cuda_flatten_thread_direct<named_dim::z, named_dim::x, named_dim::y>;
+using cuda_flatten_thread_zyx_direct =
+    cuda_flatten_thread_direct<named_dim::z, named_dim::y, named_dim::x>;
 
 /*
  * Maps segment indices to flattened CUDA threads.
  * Reshapes multiple physical threads into a 1D iteration space
  * Uses block-stride looping to exceed the maximum number of physical threads
  */
-template < named_dim ... dims >
+template<named_dim... dims>
 using cuda_flatten_thread_loop = cuda_flatten_indexer_loop<
     cuda::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
 
@@ -1713,19 +1930,31 @@ using cuda_flatten_thread_x_loop = cuda_flatten_thread_loop<named_dim::x>;
 using cuda_flatten_thread_y_loop = cuda_flatten_thread_loop<named_dim::y>;
 using cuda_flatten_thread_z_loop = cuda_flatten_thread_loop<named_dim::z>;
 
-using cuda_flatten_thread_xy_loop = cuda_flatten_thread_loop<named_dim::x, named_dim::y>;
-using cuda_flatten_thread_xz_loop = cuda_flatten_thread_loop<named_dim::x, named_dim::z>;
-using cuda_flatten_thread_yx_loop = cuda_flatten_thread_loop<named_dim::y, named_dim::x>;
-using cuda_flatten_thread_yz_loop = cuda_flatten_thread_loop<named_dim::y, named_dim::z>;
-using cuda_flatten_thread_zx_loop = cuda_flatten_thread_loop<named_dim::z, named_dim::x>;
-using cuda_flatten_thread_zy_loop = cuda_flatten_thread_loop<named_dim::z, named_dim::y>;
-
-using cuda_flatten_thread_xyz_loop = cuda_flatten_thread_loop<named_dim::x, named_dim::y, named_dim::z>;
-using cuda_flatten_thread_xzy_loop = cuda_flatten_thread_loop<named_dim::x, named_dim::z, named_dim::y>;
-using cuda_flatten_thread_yxz_loop = cuda_flatten_thread_loop<named_dim::y, named_dim::x, named_dim::z>;
-using cuda_flatten_thread_yzx_loop = cuda_flatten_thread_loop<named_dim::y, named_dim::z, named_dim::x>;
-using cuda_flatten_thread_zxy_loop = cuda_flatten_thread_loop<named_dim::z, named_dim::x, named_dim::y>;
-using cuda_flatten_thread_zyx_loop = cuda_flatten_thread_loop<named_dim::z, named_dim::y, named_dim::x>;
+using cuda_flatten_thread_xy_loop =
+    cuda_flatten_thread_loop<named_dim::x, named_dim::y>;
+using cuda_flatten_thread_xz_loop =
+    cuda_flatten_thread_loop<named_dim::x, named_dim::z>;
+using cuda_flatten_thread_yx_loop =
+    cuda_flatten_thread_loop<named_dim::y, named_dim::x>;
+using cuda_flatten_thread_yz_loop =
+    cuda_flatten_thread_loop<named_dim::y, named_dim::z>;
+using cuda_flatten_thread_zx_loop =
+    cuda_flatten_thread_loop<named_dim::z, named_dim::x>;
+using cuda_flatten_thread_zy_loop =
+    cuda_flatten_thread_loop<named_dim::z, named_dim::y>;
+
+using cuda_flatten_thread_xyz_loop =
+    cuda_flatten_thread_loop<named_dim::x, named_dim::y, named_dim::z>;
+using cuda_flatten_thread_xzy_loop =
+    cuda_flatten_thread_loop<named_dim::x, named_dim::z, named_dim::y>;
+using cuda_flatten_thread_yxz_loop =
+    cuda_flatten_thread_loop<named_dim::y, named_dim::x, named_dim::z>;
+using cuda_flatten_thread_yzx_loop =
+    cuda_flatten_thread_loop<named_dim::y, named_dim::z, named_dim::x>;
+using cuda_flatten_thread_zxy_loop =
+    cuda_flatten_thread_loop<named_dim::z, named_dim::x, named_dim::y>;
+using cuda_flatten_thread_zyx_loop =
+    cuda_flatten_thread_loop<named_dim::z, named_dim::y, named_dim::x>;
 
 
 /*!
@@ -1733,7 +1962,7 @@ using cuda_flatten_thread_zyx_loop = cuda_flatten_thread_loop<named_dim::z, name
  * This is the lowest overhead mapping, but requires that there are enough
  * physical blocks to fit all of the direct map requests.
  */
-template < named_dim ... dims >
+template<named_dim... dims>
 using cuda_block_direct = cuda_indexer_direct<
     cuda::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
 
@@ -1748,22 +1977,28 @@ using cuda_block_yz_direct = cuda_block_direct<named_dim::y, named_dim::z>;
 using cuda_block_zx_direct = cuda_block_direct<named_dim::z, named_dim::x>;
 using cuda_block_zy_direct = cuda_block_direct<named_dim::z, named_dim::y>;
 
-using cuda_block_xyz_direct = cuda_block_direct<named_dim::x, named_dim::y, named_dim::z>;
-using cuda_block_xzy_direct = cuda_block_direct<named_dim::x, named_dim::z, named_dim::y>;
-using cuda_block_yxz_direct = cuda_block_direct<named_dim::y, named_dim::x, named_dim::z>;
-using cuda_block_yzx_direct = cuda_block_direct<named_dim::y, named_dim::z, named_dim::x>;
-using cuda_block_zxy_direct = cuda_block_direct<named_dim::z, named_dim::x, named_dim::y>;
-using cuda_block_zyx_direct = cuda_block_direct<named_dim::z, named_dim::y, named_dim::x>;
+using cuda_block_xyz_direct =
+    cuda_block_direct<named_dim::x, named_dim::y, named_dim::z>;
+using cuda_block_xzy_direct =
+    cuda_block_direct<named_dim::x, named_dim::z, named_dim::y>;
+using cuda_block_yxz_direct =
+    cuda_block_direct<named_dim::y, named_dim::x, named_dim::z>;
+using cuda_block_yzx_direct =
+    cuda_block_direct<named_dim::y, named_dim::z, named_dim::x>;
+using cuda_block_zxy_direct =
+    cuda_block_direct<named_dim::z, named_dim::x, named_dim::y>;
+using cuda_block_zyx_direct =
+    cuda_block_direct<named_dim::z, named_dim::y, named_dim::x>;
 
 /*!
  * Maps segment indices to CUDA blocks.
  * Uses grid-stride looping to exceed the maximum number of blocks
  */
-template < named_dim ... dims >
+template<named_dim... dims>
 using cuda_block_loop = cuda_indexer_loop<
     cuda::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
 
-template < named_dim ... dims >
+template<named_dim... dims>
 using cuda_block_syncable_loop = cuda_indexer_syncable_loop<
     cuda::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
 
@@ -1778,12 +2013,18 @@ using cuda_block_yz_loop = cuda_block_loop<named_dim::y, named_dim::z>;
 using cuda_block_zx_loop = cuda_block_loop<named_dim::z, named_dim::x>;
 using cuda_block_zy_loop = cuda_block_loop<named_dim::z, named_dim::y>;
 
-using cuda_block_xyz_loop = cuda_block_loop<named_dim::x, named_dim::y, named_dim::z>;
-using cuda_block_xzy_loop = cuda_block_loop<named_dim::x, named_dim::z, named_dim::y>;
-using cuda_block_yxz_loop = cuda_block_loop<named_dim::y, named_dim::x, named_dim::z>;
-using cuda_block_yzx_loop = cuda_block_loop<named_dim::y, named_dim::z, named_dim::x>;
-using cuda_block_zxy_loop = cuda_block_loop<named_dim::z, named_dim::x, named_dim::y>;
-using cuda_block_zyx_loop = cuda_block_loop<named_dim::z, named_dim::y, named_dim::x>;
+using cuda_block_xyz_loop =
+    cuda_block_loop<named_dim::x, named_dim::y, named_dim::z>;
+using cuda_block_xzy_loop =
+    cuda_block_loop<named_dim::x, named_dim::z, named_dim::y>;
+using cuda_block_yxz_loop =
+    cuda_block_loop<named_dim::y, named_dim::x, named_dim::z>;
+using cuda_block_yzx_loop =
+    cuda_block_loop<named_dim::y, named_dim::z, named_dim::x>;
+using cuda_block_zxy_loop =
+    cuda_block_loop<named_dim::z, named_dim::x, named_dim::y>;
+using cuda_block_zyx_loop =
+    cuda_block_loop<named_dim::z, named_dim::y, named_dim::x>;
 
 /*
  * Maps segment indices to flattened CUDA blocks.
@@ -1791,7 +2032,7 @@ using cuda_block_zyx_loop = cuda_block_loop<named_dim::z, named_dim::y, named_di
  * physical blocks to fit all of the direct map requests.
  * Reshapes multiple physical blocks into a 1D iteration space
  */
-template < named_dim ... dims >
+template<named_dim... dims>
 using cuda_flatten_block_direct = cuda_flatten_indexer_direct<
     cuda::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
 
@@ -1799,26 +2040,38 @@ using cuda_flatten_block_x_direct = cuda_flatten_block_direct<named_dim::x>;
 using cuda_flatten_block_y_direct = cuda_flatten_block_direct<named_dim::y>;
 using cuda_flatten_block_z_direct = cuda_flatten_block_direct<named_dim::z>;
 
-using cuda_flatten_block_xy_direct = cuda_flatten_block_direct<named_dim::x, named_dim::y>;
-using cuda_flatten_block_xz_direct = cuda_flatten_block_direct<named_dim::x, named_dim::z>;
-using cuda_flatten_block_yx_direct = cuda_flatten_block_direct<named_dim::y, named_dim::x>;
-using cuda_flatten_block_yz_direct = cuda_flatten_block_direct<named_dim::y, named_dim::z>;
-using cuda_flatten_block_zx_direct = cuda_flatten_block_direct<named_dim::z, named_dim::x>;
-using cuda_flatten_block_zy_direct = cuda_flatten_block_direct<named_dim::z, named_dim::y>;
-
-using cuda_flatten_block_xyz_direct = cuda_flatten_block_direct<named_dim::x, named_dim::y, named_dim::z>;
-using cuda_flatten_block_xzy_direct = cuda_flatten_block_direct<named_dim::x, named_dim::z, named_dim::y>;
-using cuda_flatten_block_yxz_direct = cuda_flatten_block_direct<named_dim::y, named_dim::x, named_dim::z>;
-using cuda_flatten_block_yzx_direct = cuda_flatten_block_direct<named_dim::y, named_dim::z, named_dim::x>;
-using cuda_flatten_block_zxy_direct = cuda_flatten_block_direct<named_dim::z, named_dim::x, named_dim::y>;
-using cuda_flatten_block_zyx_direct = cuda_flatten_block_direct<named_dim::z, named_dim::y, named_dim::x>;
+using cuda_flatten_block_xy_direct =
+    cuda_flatten_block_direct<named_dim::x, named_dim::y>;
+using cuda_flatten_block_xz_direct =
+    cuda_flatten_block_direct<named_dim::x, named_dim::z>;
+using cuda_flatten_block_yx_direct =
+    cuda_flatten_block_direct<named_dim::y, named_dim::x>;
+using cuda_flatten_block_yz_direct =
+    cuda_flatten_block_direct<named_dim::y, named_dim::z>;
+using cuda_flatten_block_zx_direct =
+    cuda_flatten_block_direct<named_dim::z, named_dim::x>;
+using cuda_flatten_block_zy_direct =
+    cuda_flatten_block_direct<named_dim::z, named_dim::y>;
+
+using cuda_flatten_block_xyz_direct =
+    cuda_flatten_block_direct<named_dim::x, named_dim::y, named_dim::z>;
+using cuda_flatten_block_xzy_direct =
+    cuda_flatten_block_direct<named_dim::x, named_dim::z, named_dim::y>;
+using cuda_flatten_block_yxz_direct =
+    cuda_flatten_block_direct<named_dim::y, named_dim::x, named_dim::z>;
+using cuda_flatten_block_yzx_direct =
+    cuda_flatten_block_direct<named_dim::y, named_dim::z, named_dim::x>;
+using cuda_flatten_block_zxy_direct =
+    cuda_flatten_block_direct<named_dim::z, named_dim::x, named_dim::y>;
+using cuda_flatten_block_zyx_direct =
+    cuda_flatten_block_direct<named_dim::z, named_dim::y, named_dim::x>;
 
 /*
  * Maps segment indices to flattened CUDA blocks.
  * Reshapes multiple physical blocks into a 1D iteration space
  * Uses block-stride looping to exceed the maximum number of physical blocks
  */
-template < named_dim ... dims >
+template<named_dim... dims>
 using cuda_flatten_block_loop = cuda_flatten_indexer_loop<
     cuda::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
 
@@ -1826,19 +2079,31 @@ using cuda_flatten_block_x_loop = cuda_flatten_block_loop<named_dim::x>;
 using cuda_flatten_block_y_loop = cuda_flatten_block_loop<named_dim::y>;
 using cuda_flatten_block_z_loop = cuda_flatten_block_loop<named_dim::z>;
 
-using cuda_flatten_block_xy_loop = cuda_flatten_block_loop<named_dim::x, named_dim::y>;
-using cuda_flatten_block_xz_loop = cuda_flatten_block_loop<named_dim::x, named_dim::z>;
-using cuda_flatten_block_yx_loop = cuda_flatten_block_loop<named_dim::y, named_dim::x>;
-using cuda_flatten_block_yz_loop = cuda_flatten_block_loop<named_dim::y, named_dim::z>;
-using cuda_flatten_block_zx_loop = cuda_flatten_block_loop<named_dim::z, named_dim::x>;
-using cuda_flatten_block_zy_loop = cuda_flatten_block_loop<named_dim::z, named_dim::y>;
-
-using cuda_flatten_block_xyz_loop = cuda_flatten_block_loop<named_dim::x, named_dim::y, named_dim::z>;
-using cuda_flatten_block_xzy_loop = cuda_flatten_block_loop<named_dim::x, named_dim::z, named_dim::y>;
-using cuda_flatten_block_yxz_loop = cuda_flatten_block_loop<named_dim::y, named_dim::x, named_dim::z>;
-using cuda_flatten_block_yzx_loop = cuda_flatten_block_loop<named_dim::y, named_dim::z, named_dim::x>;
-using cuda_flatten_block_zxy_loop = cuda_flatten_block_loop<named_dim::z, named_dim::x, named_dim::y>;
-using cuda_flatten_block_zyx_loop = cuda_flatten_block_loop<named_dim::z, named_dim::y, named_dim::x>;
+using cuda_flatten_block_xy_loop =
+    cuda_flatten_block_loop<named_dim::x, named_dim::y>;
+using cuda_flatten_block_xz_loop =
+    cuda_flatten_block_loop<named_dim::x, named_dim::z>;
+using cuda_flatten_block_yx_loop =
+    cuda_flatten_block_loop<named_dim::y, named_dim::x>;
+using cuda_flatten_block_yz_loop =
+    cuda_flatten_block_loop<named_dim::y, named_dim::z>;
+using cuda_flatten_block_zx_loop =
+    cuda_flatten_block_loop<named_dim::z, named_dim::x>;
+using cuda_flatten_block_zy_loop =
+    cuda_flatten_block_loop<named_dim::z, named_dim::y>;
+
+using cuda_flatten_block_xyz_loop =
+    cuda_flatten_block_loop<named_dim::x, named_dim::y, named_dim::z>;
+using cuda_flatten_block_xzy_loop =
+    cuda_flatten_block_loop<named_dim::x, named_dim::z, named_dim::y>;
+using cuda_flatten_block_yxz_loop =
+    cuda_flatten_block_loop<named_dim::y, named_dim::x, named_dim::z>;
+using cuda_flatten_block_yzx_loop =
+    cuda_flatten_block_loop<named_dim::y, named_dim::z, named_dim::x>;
+using cuda_flatten_block_zxy_loop =
+    cuda_flatten_block_loop<named_dim::z, named_dim::x, named_dim::y>;
+using cuda_flatten_block_zyx_loop =
+    cuda_flatten_block_loop<named_dim::z, named_dim::y, named_dim::x>;
 
 
 /*!
@@ -1846,9 +2111,11 @@ using cuda_flatten_block_zyx_loop = cuda_flatten_block_loop<named_dim::z, named_
  * This is the lowest overhead mapping, but requires that there are enough
  * physical threads to fit all of the direct map requests.
  */
-template < named_dim ... dims >
-using cuda_global_direct = cuda_indexer_direct<
-    cuda::IndexGlobal<dims, named_usage::unspecified, named_usage::unspecified>...>;
+template<named_dim... dims>
+using cuda_global_direct =
+    cuda_indexer_direct<cuda::IndexGlobal<dims,
+                                          named_usage::unspecified,
+                                          named_usage::unspecified>...>;
 
 using cuda_global_x_direct = cuda_global_direct<named_dim::x>;
 using cuda_global_y_direct = cuda_global_direct<named_dim::y>;
@@ -1861,24 +2128,34 @@ using cuda_global_yz_direct = cuda_global_direct<named_dim::y, named_dim::z>;
 using cuda_global_zx_direct = cuda_global_direct<named_dim::z, named_dim::x>;
 using cuda_global_zy_direct = cuda_global_direct<named_dim::z, named_dim::y>;
 
-using cuda_global_xyz_direct = cuda_global_direct<named_dim::x, named_dim::y, named_dim::z>;
-using cuda_global_xzy_direct = cuda_global_direct<named_dim::x, named_dim::z, named_dim::y>;
-using cuda_global_yxz_direct = cuda_global_direct<named_dim::y, named_dim::x, named_dim::z>;
-using cuda_global_yzx_direct = cuda_global_direct<named_dim::y, named_dim::z, named_dim::x>;
-using cuda_global_zxy_direct = cuda_global_direct<named_dim::z, named_dim::x, named_dim::y>;
-using cuda_global_zyx_direct = cuda_global_direct<named_dim::z, named_dim::y, named_dim::x>;
+using cuda_global_xyz_direct =
+    cuda_global_direct<named_dim::x, named_dim::y, named_dim::z>;
+using cuda_global_xzy_direct =
+    cuda_global_direct<named_dim::x, named_dim::z, named_dim::y>;
+using cuda_global_yxz_direct =
+    cuda_global_direct<named_dim::y, named_dim::x, named_dim::z>;
+using cuda_global_yzx_direct =
+    cuda_global_direct<named_dim::y, named_dim::z, named_dim::x>;
+using cuda_global_zxy_direct =
+    cuda_global_direct<named_dim::z, named_dim::x, named_dim::y>;
+using cuda_global_zyx_direct =
+    cuda_global_direct<named_dim::z, named_dim::y, named_dim::x>;
 
 /*!
  * Maps segment indices to CUDA global threads.
  * Uses grid-stride looping to exceed the maximum number of global threads
  */
-template < named_dim ... dims >
-using cuda_global_loop = cuda_indexer_loop<
-    cuda::IndexGlobal<dims, named_usage::unspecified, named_usage::unspecified>...>;
-
-template < named_dim ... dims >
-using cuda_global_syncable_loop = cuda_indexer_syncable_loop<
-    cuda::IndexGlobal<dims, named_usage::unspecified, named_usage::unspecified>...>;
+template<named_dim... dims>
+using cuda_global_loop =
+    cuda_indexer_loop<cuda::IndexGlobal<dims,
+                                        named_usage::unspecified,
+                                        named_usage::unspecified>...>;
+
+template<named_dim... dims>
+using cuda_global_syncable_loop =
+    cuda_indexer_syncable_loop<cuda::IndexGlobal<dims,
+                                                 named_usage::unspecified,
+                                                 named_usage::unspecified>...>;
 
 using cuda_global_x_loop = cuda_global_loop<named_dim::x>;
 using cuda_global_y_loop = cuda_global_loop<named_dim::y>;
@@ -1891,12 +2168,18 @@ using cuda_global_yz_loop = cuda_global_loop<named_dim::y, named_dim::z>;
 using cuda_global_zx_loop = cuda_global_loop<named_dim::z, named_dim::x>;
 using cuda_global_zy_loop = cuda_global_loop<named_dim::z, named_dim::y>;
 
-using cuda_global_xyz_loop = cuda_global_loop<named_dim::x, named_dim::y, named_dim::z>;
-using cuda_global_xzy_loop = cuda_global_loop<named_dim::x, named_dim::z, named_dim::y>;
-using cuda_global_yxz_loop = cuda_global_loop<named_dim::y, named_dim::x, named_dim::z>;
-using cuda_global_yzx_loop = cuda_global_loop<named_dim::y, named_dim::z, named_dim::x>;
-using cuda_global_zxy_loop = cuda_global_loop<named_dim::z, named_dim::x, named_dim::y>;
-using cuda_global_zyx_loop = cuda_global_loop<named_dim::z, named_dim::y, named_dim::x>;
+using cuda_global_xyz_loop =
+    cuda_global_loop<named_dim::x, named_dim::y, named_dim::z>;
+using cuda_global_xzy_loop =
+    cuda_global_loop<named_dim::x, named_dim::z, named_dim::y>;
+using cuda_global_yxz_loop =
+    cuda_global_loop<named_dim::y, named_dim::x, named_dim::z>;
+using cuda_global_yzx_loop =
+    cuda_global_loop<named_dim::y, named_dim::z, named_dim::x>;
+using cuda_global_zxy_loop =
+    cuda_global_loop<named_dim::z, named_dim::x, named_dim::y>;
+using cuda_global_zyx_loop =
+    cuda_global_loop<named_dim::z, named_dim::y, named_dim::x>;
 
 /*
  * Maps segment indices to flattened CUDA global threads.
@@ -1904,54 +2187,83 @@ using cuda_global_zyx_loop = cuda_global_loop<named_dim::z, named_dim::y, named_
  * physical global threads to fit all of the direct map requests.
  * Reshapes multiple physical global threads into a 1D iteration space
  */
-template < named_dim ... dims >
-using cuda_flatten_global_direct = cuda_flatten_indexer_direct<
-    cuda::IndexGlobal<dims, named_usage::unspecified, named_usage::unspecified>...>;
+template<named_dim... dims>
+using cuda_flatten_global_direct =
+    cuda_flatten_indexer_direct<cuda::IndexGlobal<dims,
+                                                  named_usage::unspecified,
+                                                  named_usage::unspecified>...>;
 
 using cuda_flatten_global_x_direct = cuda_flatten_global_direct<named_dim::x>;
 using cuda_flatten_global_y_direct = cuda_flatten_global_direct<named_dim::y>;
 using cuda_flatten_global_z_direct = cuda_flatten_global_direct<named_dim::z>;
 
-using cuda_flatten_global_xy_direct = cuda_flatten_global_direct<named_dim::x, named_dim::y>;
-using cuda_flatten_global_xz_direct = cuda_flatten_global_direct<named_dim::x, named_dim::z>;
-using cuda_flatten_global_yx_direct = cuda_flatten_global_direct<named_dim::y, named_dim::x>;
-using cuda_flatten_global_yz_direct = cuda_flatten_global_direct<named_dim::y, named_dim::z>;
-using cuda_flatten_global_zx_direct = cuda_flatten_global_direct<named_dim::z, named_dim::x>;
-using cuda_flatten_global_zy_direct = cuda_flatten_global_direct<named_dim::z, named_dim::y>;
-
-using cuda_flatten_global_xyz_direct = cuda_flatten_global_direct<named_dim::x, named_dim::y, named_dim::z>;
-using cuda_flatten_global_xzy_direct = cuda_flatten_global_direct<named_dim::x, named_dim::z, named_dim::y>;
-using cuda_flatten_global_yxz_direct = cuda_flatten_global_direct<named_dim::y, named_dim::x, named_dim::z>;
-using cuda_flatten_global_yzx_direct = cuda_flatten_global_direct<named_dim::y, named_dim::z, named_dim::x>;
-using cuda_flatten_global_zxy_direct = cuda_flatten_global_direct<named_dim::z, named_dim::x, named_dim::y>;
-using cuda_flatten_global_zyx_direct = cuda_flatten_global_direct<named_dim::z, named_dim::y, named_dim::x>;
+using cuda_flatten_global_xy_direct =
+    cuda_flatten_global_direct<named_dim::x, named_dim::y>;
+using cuda_flatten_global_xz_direct =
+    cuda_flatten_global_direct<named_dim::x, named_dim::z>;
+using cuda_flatten_global_yx_direct =
+    cuda_flatten_global_direct<named_dim::y, named_dim::x>;
+using cuda_flatten_global_yz_direct =
+    cuda_flatten_global_direct<named_dim::y, named_dim::z>;
+using cuda_flatten_global_zx_direct =
+    cuda_flatten_global_direct<named_dim::z, named_dim::x>;
+using cuda_flatten_global_zy_direct =
+    cuda_flatten_global_direct<named_dim::z, named_dim::y>;
+
+using cuda_flatten_global_xyz_direct =
+    cuda_flatten_global_direct<named_dim::x, named_dim::y, named_dim::z>;
+using cuda_flatten_global_xzy_direct =
+    cuda_flatten_global_direct<named_dim::x, named_dim::z, named_dim::y>;
+using cuda_flatten_global_yxz_direct =
+    cuda_flatten_global_direct<named_dim::y, named_dim::x, named_dim::z>;
+using cuda_flatten_global_yzx_direct =
+    cuda_flatten_global_direct<named_dim::y, named_dim::z, named_dim::x>;
+using cuda_flatten_global_zxy_direct =
+    cuda_flatten_global_direct<named_dim::z, named_dim::x, named_dim::y>;
+using cuda_flatten_global_zyx_direct =
+    cuda_flatten_global_direct<named_dim::z, named_dim::y, named_dim::x>;
 
 /*
  * Maps segment indices to flattened CUDA global threads.
  * Reshapes multiple physical global threads into a 1D iteration space
- * Uses global thread-stride looping to exceed the maximum number of physical global threads
+ * Uses global thread-stride looping to exceed the maximum number of physical
+ * global threads
  */
-template < named_dim ... dims >
-using cuda_flatten_global_loop = cuda_flatten_indexer_loop<
-    cuda::IndexGlobal<dims, named_usage::unspecified, named_usage::unspecified>...>;
+template<named_dim... dims>
+using cuda_flatten_global_loop =
+    cuda_flatten_indexer_loop<cuda::IndexGlobal<dims,
+                                                named_usage::unspecified,
+                                                named_usage::unspecified>...>;
 
 using cuda_flatten_global_x_loop = cuda_flatten_global_loop<named_dim::x>;
 using cuda_flatten_global_y_loop = cuda_flatten_global_loop<named_dim::y>;
 using cuda_flatten_global_z_loop = cuda_flatten_global_loop<named_dim::z>;
 
-using cuda_flatten_global_xy_loop = cuda_flatten_global_loop<named_dim::x, named_dim::y>;
-using cuda_flatten_global_xz_loop = cuda_flatten_global_loop<named_dim::x, named_dim::z>;
-using cuda_flatten_global_yx_loop = cuda_flatten_global_loop<named_dim::y, named_dim::x>;
-using cuda_flatten_global_yz_loop = cuda_flatten_global_loop<named_dim::y, named_dim::z>;
-using cuda_flatten_global_zx_loop = cuda_flatten_global_loop<named_dim::z, named_dim::x>;
-using cuda_flatten_global_zy_loop = cuda_flatten_global_loop<named_dim::z, named_dim::y>;
-
-using cuda_flatten_global_xyz_loop = cuda_flatten_global_loop<named_dim::x, named_dim::y, named_dim::z>;
-using cuda_flatten_global_xzy_loop = cuda_flatten_global_loop<named_dim::x, named_dim::z, named_dim::y>;
-using cuda_flatten_global_yxz_loop = cuda_flatten_global_loop<named_dim::y, named_dim::x, named_dim::z>;
-using cuda_flatten_global_yzx_loop = cuda_flatten_global_loop<named_dim::y, named_dim::z, named_dim::x>;
-using cuda_flatten_global_zxy_loop = cuda_flatten_global_loop<named_dim::z, named_dim::x, named_dim::y>;
-using cuda_flatten_global_zyx_loop = cuda_flatten_global_loop<named_dim::z, named_dim::y, named_dim::x>;
+using cuda_flatten_global_xy_loop =
+    cuda_flatten_global_loop<named_dim::x, named_dim::y>;
+using cuda_flatten_global_xz_loop =
+    cuda_flatten_global_loop<named_dim::x, named_dim::z>;
+using cuda_flatten_global_yx_loop =
+    cuda_flatten_global_loop<named_dim::y, named_dim::x>;
+using cuda_flatten_global_yz_loop =
+    cuda_flatten_global_loop<named_dim::y, named_dim::z>;
+using cuda_flatten_global_zx_loop =
+    cuda_flatten_global_loop<named_dim::z, named_dim::x>;
+using cuda_flatten_global_zy_loop =
+    cuda_flatten_global_loop<named_dim::z, named_dim::y>;
+
+using cuda_flatten_global_xyz_loop =
+    cuda_flatten_global_loop<named_dim::x, named_dim::y, named_dim::z>;
+using cuda_flatten_global_xzy_loop =
+    cuda_flatten_global_loop<named_dim::x, named_dim::z, named_dim::y>;
+using cuda_flatten_global_yxz_loop =
+    cuda_flatten_global_loop<named_dim::y, named_dim::x, named_dim::z>;
+using cuda_flatten_global_yzx_loop =
+    cuda_flatten_global_loop<named_dim::y, named_dim::z, named_dim::x>;
+using cuda_flatten_global_zxy_loop =
+    cuda_flatten_global_loop<named_dim::z, named_dim::x, named_dim::y>;
+using cuda_flatten_global_zyx_loop =
+    cuda_flatten_global_loop<named_dim::z, named_dim::y, named_dim::x>;
 
 
 /*!
@@ -1959,271 +2271,481 @@ using cuda_flatten_global_zyx_loop = cuda_flatten_global_loop<named_dim::z, name
  * This is the lowest overhead mapping, but requires that there are enough
  * physical threads to fit all of the direct map requests.
  */
-template < int X_BLOCK_SIZE >
-using cuda_thread_size_x_direct = cuda_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE >
-using cuda_thread_size_y_direct = cuda_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE >
-using cuda_thread_size_z_direct = cuda_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using cuda_thread_size_xy_direct = cuda_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using cuda_thread_size_xz_direct = cuda_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE >
-using cuda_thread_size_yx_direct = cuda_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using cuda_thread_size_yz_direct = cuda_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE >
-using cuda_thread_size_zx_direct = cuda_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using cuda_thread_size_zy_direct = cuda_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using cuda_thread_size_xyz_direct = cuda_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using cuda_thread_size_xzy_direct = cuda_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using cuda_thread_size_yxz_direct = cuda_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE >
-using cuda_thread_size_yzx_direct = cuda_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using cuda_thread_size_zxy_direct = cuda_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE >
-using cuda_thread_size_zyx_direct = cuda_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>>;
-
-
-template < int X_GRID_SIZE >
-using cuda_block_size_x_direct = cuda_indexer_direct<cuda::block_x<X_GRID_SIZE>>;
-template < int Y_GRID_SIZE >
-using cuda_block_size_y_direct = cuda_indexer_direct<cuda::block_y<Y_GRID_SIZE>>;
-template < int Z_GRID_SIZE >
-using cuda_block_size_z_direct = cuda_indexer_direct<cuda::block_z<Z_GRID_SIZE>>;
-
-template < int X_GRID_SIZE, int Y_GRID_SIZE >
-using cuda_block_size_xy_direct = cuda_indexer_direct<cuda::block_x<X_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
-template < int X_GRID_SIZE, int Z_GRID_SIZE >
-using cuda_block_size_xz_direct = cuda_indexer_direct<cuda::block_x<X_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int X_GRID_SIZE >
-using cuda_block_size_yx_direct = cuda_indexer_direct<cuda::block_y<Y_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int Z_GRID_SIZE >
-using cuda_block_size_yz_direct = cuda_indexer_direct<cuda::block_y<Y_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int X_GRID_SIZE >
-using cuda_block_size_zx_direct = cuda_indexer_direct<cuda::block_z<Z_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int Y_GRID_SIZE >
-using cuda_block_size_zy_direct = cuda_indexer_direct<cuda::block_z<Z_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
-
-template < int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE >
-using cuda_block_size_xyz_direct = cuda_indexer_direct<cuda::block_x<X_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
-template < int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE >
-using cuda_block_size_xzy_direct = cuda_indexer_direct<cuda::block_x<X_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE >
-using cuda_block_size_yxz_direct = cuda_indexer_direct<cuda::block_y<Y_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE >
-using cuda_block_size_yzx_direct = cuda_indexer_direct<cuda::block_y<Y_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE >
-using cuda_block_size_zxy_direct = cuda_indexer_direct<cuda::block_z<Z_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE >
-using cuda_block_size_zyx_direct = cuda_indexer_direct<cuda::block_z<Z_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
-
-
-template < int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_x_direct = cuda_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_y_direct = cuda_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_z_direct = cuda_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_xy_direct = cuda_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                     cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_xz_direct = cuda_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                     cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_yx_direct = cuda_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                     cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_yz_direct = cuda_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                     cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_zx_direct = cuda_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                     cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_zy_direct = cuda_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                     cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_xyz_direct = cuda_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                      cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                      cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_xzy_direct = cuda_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                      cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                      cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_yxz_direct = cuda_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                      cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                      cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_yzx_direct = cuda_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                      cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                      cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_zxy_direct = cuda_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                      cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                      cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_zyx_direct = cuda_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                      cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                      cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int X_BLOCK_SIZE>
+using cuda_thread_size_x_direct =
+    cuda_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE>
+using cuda_thread_size_y_direct =
+    cuda_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE>
+using cuda_thread_size_z_direct =
+    cuda_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>>;
+
+template<int X_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using cuda_thread_size_xy_direct =
+    cuda_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>,
+                        cuda::thread_y<Y_BLOCK_SIZE>>;
+template<int X_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using cuda_thread_size_xz_direct =
+    cuda_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>,
+                        cuda::thread_z<Z_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE, int X_BLOCK_SIZE>
+using cuda_thread_size_yx_direct =
+    cuda_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>,
+                        cuda::thread_x<X_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using cuda_thread_size_yz_direct =
+    cuda_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>,
+                        cuda::thread_z<Z_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE, int X_BLOCK_SIZE>
+using cuda_thread_size_zx_direct =
+    cuda_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>,
+                        cuda::thread_x<X_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using cuda_thread_size_zy_direct =
+    cuda_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>,
+                        cuda::thread_y<Y_BLOCK_SIZE>>;
+
+template<int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using cuda_thread_size_xyz_direct =
+    cuda_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>,
+                        cuda::thread_y<Y_BLOCK_SIZE>,
+                        cuda::thread_z<Z_BLOCK_SIZE>>;
+template<int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using cuda_thread_size_xzy_direct =
+    cuda_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>,
+                        cuda::thread_z<Z_BLOCK_SIZE>,
+                        cuda::thread_y<Y_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using cuda_thread_size_yxz_direct =
+    cuda_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>,
+                        cuda::thread_x<X_BLOCK_SIZE>,
+                        cuda::thread_z<Z_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE>
+using cuda_thread_size_yzx_direct =
+    cuda_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>,
+                        cuda::thread_z<Z_BLOCK_SIZE>,
+                        cuda::thread_x<X_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using cuda_thread_size_zxy_direct =
+    cuda_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>,
+                        cuda::thread_x<X_BLOCK_SIZE>,
+                        cuda::thread_y<Y_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE>
+using cuda_thread_size_zyx_direct =
+    cuda_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>,
+                        cuda::thread_y<Y_BLOCK_SIZE>,
+                        cuda::thread_x<X_BLOCK_SIZE>>;
+
+
+template<int X_GRID_SIZE>
+using cuda_block_size_x_direct =
+    cuda_indexer_direct<cuda::block_x<X_GRID_SIZE>>;
+template<int Y_GRID_SIZE>
+using cuda_block_size_y_direct =
+    cuda_indexer_direct<cuda::block_y<Y_GRID_SIZE>>;
+template<int Z_GRID_SIZE>
+using cuda_block_size_z_direct =
+    cuda_indexer_direct<cuda::block_z<Z_GRID_SIZE>>;
+
+template<int X_GRID_SIZE, int Y_GRID_SIZE>
+using cuda_block_size_xy_direct =
+    cuda_indexer_direct<cuda::block_x<X_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
+template<int X_GRID_SIZE, int Z_GRID_SIZE>
+using cuda_block_size_xz_direct =
+    cuda_indexer_direct<cuda::block_x<X_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
+template<int Y_GRID_SIZE, int X_GRID_SIZE>
+using cuda_block_size_yx_direct =
+    cuda_indexer_direct<cuda::block_y<Y_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
+template<int Y_GRID_SIZE, int Z_GRID_SIZE>
+using cuda_block_size_yz_direct =
+    cuda_indexer_direct<cuda::block_y<Y_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
+template<int Z_GRID_SIZE, int X_GRID_SIZE>
+using cuda_block_size_zx_direct =
+    cuda_indexer_direct<cuda::block_z<Z_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
+template<int Z_GRID_SIZE, int Y_GRID_SIZE>
+using cuda_block_size_zy_direct =
+    cuda_indexer_direct<cuda::block_z<Z_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
+
+template<int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE>
+using cuda_block_size_xyz_direct =
+    cuda_indexer_direct<cuda::block_x<X_GRID_SIZE>,
+                        cuda::block_y<Y_GRID_SIZE>,
+                        cuda::block_z<Z_GRID_SIZE>>;
+template<int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE>
+using cuda_block_size_xzy_direct =
+    cuda_indexer_direct<cuda::block_x<X_GRID_SIZE>,
+                        cuda::block_z<Z_GRID_SIZE>,
+                        cuda::block_y<Y_GRID_SIZE>>;
+template<int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE>
+using cuda_block_size_yxz_direct =
+    cuda_indexer_direct<cuda::block_y<Y_GRID_SIZE>,
+                        cuda::block_x<X_GRID_SIZE>,
+                        cuda::block_z<Z_GRID_SIZE>>;
+template<int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE>
+using cuda_block_size_yzx_direct =
+    cuda_indexer_direct<cuda::block_y<Y_GRID_SIZE>,
+                        cuda::block_z<Z_GRID_SIZE>,
+                        cuda::block_x<X_GRID_SIZE>>;
+template<int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE>
+using cuda_block_size_zxy_direct =
+    cuda_indexer_direct<cuda::block_z<Z_GRID_SIZE>,
+                        cuda::block_x<X_GRID_SIZE>,
+                        cuda::block_y<Y_GRID_SIZE>>;
+template<int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE>
+using cuda_block_size_zyx_direct =
+    cuda_indexer_direct<cuda::block_z<Z_GRID_SIZE>,
+                        cuda::block_y<Y_GRID_SIZE>,
+                        cuda::block_x<X_GRID_SIZE>>;
+
+
+template<int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_x_direct =
+    cuda_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_y_direct =
+    cuda_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_z_direct =
+    cuda_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+
+template<int X_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_xy_direct =
+    cuda_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                        cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template<int X_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_xz_direct =
+    cuda_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                        cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_yx_direct =
+    cuda_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                        cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_yz_direct =
+    cuda_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                        cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_zx_direct =
+    cuda_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                        cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_zy_direct =
+    cuda_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                        cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+
+template<int X_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_xyz_direct =
+    cuda_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                        cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                        cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template<int X_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_xzy_direct =
+    cuda_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                        cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                        cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_yxz_direct =
+    cuda_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                        cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                        cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_yzx_direct =
+    cuda_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                        cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                        cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_zxy_direct =
+    cuda_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                        cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                        cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_zyx_direct =
+    cuda_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                        cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                        cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
 
 /*!
  * Maps segment indices to CUDA global threads.
  * Uses grid-stride looping to exceed the maximum number of global threads
  */
-template < int X_BLOCK_SIZE >
+template<int X_BLOCK_SIZE>
 using cuda_thread_size_x_loop = cuda_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE >
+template<int Y_BLOCK_SIZE>
 using cuda_thread_size_y_loop = cuda_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE >
+template<int Z_BLOCK_SIZE>
 using cuda_thread_size_z_loop = cuda_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>>;
 
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using cuda_thread_size_xy_loop = cuda_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using cuda_thread_size_xz_loop = cuda_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE >
-using cuda_thread_size_yx_loop = cuda_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using cuda_thread_size_yz_loop = cuda_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE >
-using cuda_thread_size_zx_loop = cuda_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using cuda_thread_size_zy_loop = cuda_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using cuda_thread_size_xyz_loop = cuda_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using cuda_thread_size_xzy_loop = cuda_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using cuda_thread_size_yxz_loop = cuda_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE >
-using cuda_thread_size_yzx_loop = cuda_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using cuda_thread_size_zxy_loop = cuda_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE >
-using cuda_thread_size_zyx_loop = cuda_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>>;
-
-
-template < int X_GRID_SIZE >
+template<int X_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using cuda_thread_size_xy_loop =
+    cuda_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>,
+                      cuda::thread_y<Y_BLOCK_SIZE>>;
+template<int X_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using cuda_thread_size_xz_loop =
+    cuda_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>,
+                      cuda::thread_z<Z_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE, int X_BLOCK_SIZE>
+using cuda_thread_size_yx_loop =
+    cuda_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>,
+                      cuda::thread_x<X_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using cuda_thread_size_yz_loop =
+    cuda_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>,
+                      cuda::thread_z<Z_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE, int X_BLOCK_SIZE>
+using cuda_thread_size_zx_loop =
+    cuda_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>,
+                      cuda::thread_x<X_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using cuda_thread_size_zy_loop =
+    cuda_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>,
+                      cuda::thread_y<Y_BLOCK_SIZE>>;
+
+template<int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using cuda_thread_size_xyz_loop =
+    cuda_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>,
+                      cuda::thread_y<Y_BLOCK_SIZE>,
+                      cuda::thread_z<Z_BLOCK_SIZE>>;
+template<int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using cuda_thread_size_xzy_loop =
+    cuda_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>,
+                      cuda::thread_z<Z_BLOCK_SIZE>,
+                      cuda::thread_y<Y_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using cuda_thread_size_yxz_loop =
+    cuda_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>,
+                      cuda::thread_x<X_BLOCK_SIZE>,
+                      cuda::thread_z<Z_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE>
+using cuda_thread_size_yzx_loop =
+    cuda_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>,
+                      cuda::thread_z<Z_BLOCK_SIZE>,
+                      cuda::thread_x<X_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using cuda_thread_size_zxy_loop =
+    cuda_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>,
+                      cuda::thread_x<X_BLOCK_SIZE>,
+                      cuda::thread_y<Y_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE>
+using cuda_thread_size_zyx_loop =
+    cuda_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>,
+                      cuda::thread_y<Y_BLOCK_SIZE>,
+                      cuda::thread_x<X_BLOCK_SIZE>>;
+
+
+template<int X_GRID_SIZE>
 using cuda_block_size_x_loop = cuda_indexer_loop<cuda::block_x<X_GRID_SIZE>>;
-template < int Y_GRID_SIZE >
+template<int Y_GRID_SIZE>
 using cuda_block_size_y_loop = cuda_indexer_loop<cuda::block_y<Y_GRID_SIZE>>;
-template < int Z_GRID_SIZE >
+template<int Z_GRID_SIZE>
 using cuda_block_size_z_loop = cuda_indexer_loop<cuda::block_z<Z_GRID_SIZE>>;
 
-template < int X_GRID_SIZE, int Y_GRID_SIZE >
-using cuda_block_size_xy_loop = cuda_indexer_loop<cuda::block_x<X_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
-template < int X_GRID_SIZE, int Z_GRID_SIZE >
-using cuda_block_size_xz_loop = cuda_indexer_loop<cuda::block_x<X_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int X_GRID_SIZE >
-using cuda_block_size_yx_loop = cuda_indexer_loop<cuda::block_y<Y_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int Z_GRID_SIZE >
-using cuda_block_size_yz_loop = cuda_indexer_loop<cuda::block_y<Y_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int X_GRID_SIZE >
-using cuda_block_size_zx_loop = cuda_indexer_loop<cuda::block_z<Z_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int Y_GRID_SIZE >
-using cuda_block_size_zy_loop = cuda_indexer_loop<cuda::block_z<Z_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
-
-template < int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE >
-using cuda_block_size_xyz_loop = cuda_indexer_loop<cuda::block_x<X_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
-template < int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE >
-using cuda_block_size_xzy_loop = cuda_indexer_loop<cuda::block_x<X_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE >
-using cuda_block_size_yxz_loop = cuda_indexer_loop<cuda::block_y<Y_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE >
-using cuda_block_size_yzx_loop = cuda_indexer_loop<cuda::block_y<Y_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE >
-using cuda_block_size_zxy_loop = cuda_indexer_loop<cuda::block_z<Z_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE >
-using cuda_block_size_zyx_loop = cuda_indexer_loop<cuda::block_z<Z_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
-
-
-template < int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_x_loop = cuda_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_y_loop = cuda_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_z_loop = cuda_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_xy_loop = cuda_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                     cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_xz_loop = cuda_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                     cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_yx_loop = cuda_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                     cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_yz_loop = cuda_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                     cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_zx_loop = cuda_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                     cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_zy_loop = cuda_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                     cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_xyz_loop = cuda_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                      cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                      cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_xzy_loop = cuda_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                      cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                      cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_yxz_loop = cuda_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                      cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                      cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_yzx_loop = cuda_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                      cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                      cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_zxy_loop = cuda_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                      cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                      cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_global_size_zyx_loop = cuda_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                      cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                      cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int X_GRID_SIZE, int Y_GRID_SIZE>
+using cuda_block_size_xy_loop =
+    cuda_indexer_loop<cuda::block_x<X_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
+template<int X_GRID_SIZE, int Z_GRID_SIZE>
+using cuda_block_size_xz_loop =
+    cuda_indexer_loop<cuda::block_x<X_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
+template<int Y_GRID_SIZE, int X_GRID_SIZE>
+using cuda_block_size_yx_loop =
+    cuda_indexer_loop<cuda::block_y<Y_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
+template<int Y_GRID_SIZE, int Z_GRID_SIZE>
+using cuda_block_size_yz_loop =
+    cuda_indexer_loop<cuda::block_y<Y_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
+template<int Z_GRID_SIZE, int X_GRID_SIZE>
+using cuda_block_size_zx_loop =
+    cuda_indexer_loop<cuda::block_z<Z_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
+template<int Z_GRID_SIZE, int Y_GRID_SIZE>
+using cuda_block_size_zy_loop =
+    cuda_indexer_loop<cuda::block_z<Z_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
+
+template<int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE>
+using cuda_block_size_xyz_loop = cuda_indexer_loop<cuda::block_x<X_GRID_SIZE>,
+                                                   cuda::block_y<Y_GRID_SIZE>,
+                                                   cuda::block_z<Z_GRID_SIZE>>;
+template<int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE>
+using cuda_block_size_xzy_loop = cuda_indexer_loop<cuda::block_x<X_GRID_SIZE>,
+                                                   cuda::block_z<Z_GRID_SIZE>,
+                                                   cuda::block_y<Y_GRID_SIZE>>;
+template<int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE>
+using cuda_block_size_yxz_loop = cuda_indexer_loop<cuda::block_y<Y_GRID_SIZE>,
+                                                   cuda::block_x<X_GRID_SIZE>,
+                                                   cuda::block_z<Z_GRID_SIZE>>;
+template<int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE>
+using cuda_block_size_yzx_loop = cuda_indexer_loop<cuda::block_y<Y_GRID_SIZE>,
+                                                   cuda::block_z<Z_GRID_SIZE>,
+                                                   cuda::block_x<X_GRID_SIZE>>;
+template<int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE>
+using cuda_block_size_zxy_loop = cuda_indexer_loop<cuda::block_z<Z_GRID_SIZE>,
+                                                   cuda::block_x<X_GRID_SIZE>,
+                                                   cuda::block_y<Y_GRID_SIZE>>;
+template<int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE>
+using cuda_block_size_zyx_loop = cuda_indexer_loop<cuda::block_z<Z_GRID_SIZE>,
+                                                   cuda::block_y<Y_GRID_SIZE>,
+                                                   cuda::block_x<X_GRID_SIZE>>;
+
+
+template<int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_x_loop =
+    cuda_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_y_loop =
+    cuda_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_z_loop =
+    cuda_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+
+template<int X_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_xy_loop =
+    cuda_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                      cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template<int X_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_xz_loop =
+    cuda_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                      cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_yx_loop =
+    cuda_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                      cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_yz_loop =
+    cuda_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                      cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_zx_loop =
+    cuda_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                      cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_zy_loop =
+    cuda_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                      cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+
+template<int X_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_xyz_loop =
+    cuda_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                      cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                      cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template<int X_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_xzy_loop =
+    cuda_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                      cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                      cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_yxz_loop =
+    cuda_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                      cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                      cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_yzx_loop =
+    cuda_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                      cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                      cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_zxy_loop =
+    cuda_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                      cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                      cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified>
+using cuda_global_size_zyx_loop =
+    cuda_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                      cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                      cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
 
 /*
  * Maps segment indices to flattened CUDA global threads.
@@ -2231,272 +2753,507 @@ using cuda_global_size_zyx_loop = cuda_indexer_loop<cuda::global_z<Z_BLOCK_SIZE,
  * physical global threads to fit all of the direct map requests.
  * Reshapes multiple physical global threads into a 1D iteration space
  */
-template < int X_BLOCK_SIZE >
-using cuda_flatten_thread_size_x_direct = cuda_flatten_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE >
-using cuda_flatten_thread_size_y_direct = cuda_flatten_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE >
-using cuda_flatten_thread_size_z_direct = cuda_flatten_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using cuda_flatten_thread_size_xy_direct = cuda_flatten_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using cuda_flatten_thread_size_xz_direct = cuda_flatten_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE >
-using cuda_flatten_thread_size_yx_direct = cuda_flatten_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using cuda_flatten_thread_size_yz_direct = cuda_flatten_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE >
-using cuda_flatten_thread_size_zx_direct = cuda_flatten_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using cuda_flatten_thread_size_zy_direct = cuda_flatten_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using cuda_flatten_thread_size_xyz_direct = cuda_flatten_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using cuda_flatten_thread_size_xzy_direct = cuda_flatten_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using cuda_flatten_thread_size_yxz_direct = cuda_flatten_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE >
-using cuda_flatten_thread_size_yzx_direct = cuda_flatten_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using cuda_flatten_thread_size_zxy_direct = cuda_flatten_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE >
-using cuda_flatten_thread_size_zyx_direct = cuda_flatten_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>>;
-
-
-template < int X_GRID_SIZE >
-using cuda_flatten_block_size_x_direct = cuda_flatten_indexer_direct<cuda::block_x<X_GRID_SIZE>>;
-template < int Y_GRID_SIZE >
-using cuda_flatten_block_size_y_direct = cuda_flatten_indexer_direct<cuda::block_y<Y_GRID_SIZE>>;
-template < int Z_GRID_SIZE >
-using cuda_flatten_block_size_z_direct = cuda_flatten_indexer_direct<cuda::block_z<Z_GRID_SIZE>>;
-
-template < int X_GRID_SIZE, int Y_GRID_SIZE >
-using cuda_flatten_block_size_xy_direct = cuda_flatten_indexer_direct<cuda::block_x<X_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
-template < int X_GRID_SIZE, int Z_GRID_SIZE >
-using cuda_flatten_block_size_xz_direct = cuda_flatten_indexer_direct<cuda::block_x<X_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int X_GRID_SIZE >
-using cuda_flatten_block_size_yx_direct = cuda_flatten_indexer_direct<cuda::block_y<Y_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int Z_GRID_SIZE >
-using cuda_flatten_block_size_yz_direct = cuda_flatten_indexer_direct<cuda::block_y<Y_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int X_GRID_SIZE >
-using cuda_flatten_block_size_zx_direct = cuda_flatten_indexer_direct<cuda::block_z<Z_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int Y_GRID_SIZE >
-using cuda_flatten_block_size_zy_direct = cuda_flatten_indexer_direct<cuda::block_z<Z_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
-
-template < int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE >
-using cuda_flatten_block_size_xyz_direct = cuda_flatten_indexer_direct<cuda::block_x<X_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
-template < int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE >
-using cuda_flatten_block_size_xzy_direct = cuda_flatten_indexer_direct<cuda::block_x<X_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE >
-using cuda_flatten_block_size_yxz_direct = cuda_flatten_indexer_direct<cuda::block_y<Y_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE >
-using cuda_flatten_block_size_yzx_direct = cuda_flatten_indexer_direct<cuda::block_y<Y_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE >
-using cuda_flatten_block_size_zxy_direct = cuda_flatten_indexer_direct<cuda::block_z<Z_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE >
-using cuda_flatten_block_size_zyx_direct = cuda_flatten_indexer_direct<cuda::block_z<Z_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
-
-
-template < int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_x_direct = cuda_flatten_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_y_direct = cuda_flatten_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_z_direct = cuda_flatten_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_xy_direct = cuda_flatten_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                     cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_xz_direct = cuda_flatten_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                     cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_yx_direct = cuda_flatten_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                     cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_yz_direct = cuda_flatten_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                     cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_zx_direct = cuda_flatten_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                     cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_zy_direct = cuda_flatten_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                     cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_xyz_direct = cuda_flatten_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                      cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                      cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_xzy_direct = cuda_flatten_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                      cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                      cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_yxz_direct = cuda_flatten_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                      cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                      cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_yzx_direct = cuda_flatten_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                      cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                      cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_zxy_direct = cuda_flatten_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                      cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                      cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_zyx_direct = cuda_flatten_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                      cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                      cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int X_BLOCK_SIZE>
+using cuda_flatten_thread_size_x_direct =
+    cuda_flatten_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE>
+using cuda_flatten_thread_size_y_direct =
+    cuda_flatten_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE>
+using cuda_flatten_thread_size_z_direct =
+    cuda_flatten_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>>;
+
+template<int X_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using cuda_flatten_thread_size_xy_direct =
+    cuda_flatten_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>,
+                                cuda::thread_y<Y_BLOCK_SIZE>>;
+template<int X_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using cuda_flatten_thread_size_xz_direct =
+    cuda_flatten_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>,
+                                cuda::thread_z<Z_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE, int X_BLOCK_SIZE>
+using cuda_flatten_thread_size_yx_direct =
+    cuda_flatten_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>,
+                                cuda::thread_x<X_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using cuda_flatten_thread_size_yz_direct =
+    cuda_flatten_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>,
+                                cuda::thread_z<Z_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE, int X_BLOCK_SIZE>
+using cuda_flatten_thread_size_zx_direct =
+    cuda_flatten_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>,
+                                cuda::thread_x<X_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using cuda_flatten_thread_size_zy_direct =
+    cuda_flatten_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>,
+                                cuda::thread_y<Y_BLOCK_SIZE>>;
+
+template<int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using cuda_flatten_thread_size_xyz_direct =
+    cuda_flatten_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>,
+                                cuda::thread_y<Y_BLOCK_SIZE>,
+                                cuda::thread_z<Z_BLOCK_SIZE>>;
+template<int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using cuda_flatten_thread_size_xzy_direct =
+    cuda_flatten_indexer_direct<cuda::thread_x<X_BLOCK_SIZE>,
+                                cuda::thread_z<Z_BLOCK_SIZE>,
+                                cuda::thread_y<Y_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using cuda_flatten_thread_size_yxz_direct =
+    cuda_flatten_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>,
+                                cuda::thread_x<X_BLOCK_SIZE>,
+                                cuda::thread_z<Z_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE>
+using cuda_flatten_thread_size_yzx_direct =
+    cuda_flatten_indexer_direct<cuda::thread_y<Y_BLOCK_SIZE>,
+                                cuda::thread_z<Z_BLOCK_SIZE>,
+                                cuda::thread_x<X_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using cuda_flatten_thread_size_zxy_direct =
+    cuda_flatten_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>,
+                                cuda::thread_x<X_BLOCK_SIZE>,
+                                cuda::thread_y<Y_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE>
+using cuda_flatten_thread_size_zyx_direct =
+    cuda_flatten_indexer_direct<cuda::thread_z<Z_BLOCK_SIZE>,
+                                cuda::thread_y<Y_BLOCK_SIZE>,
+                                cuda::thread_x<X_BLOCK_SIZE>>;
+
+
+template<int X_GRID_SIZE>
+using cuda_flatten_block_size_x_direct =
+    cuda_flatten_indexer_direct<cuda::block_x<X_GRID_SIZE>>;
+template<int Y_GRID_SIZE>
+using cuda_flatten_block_size_y_direct =
+    cuda_flatten_indexer_direct<cuda::block_y<Y_GRID_SIZE>>;
+template<int Z_GRID_SIZE>
+using cuda_flatten_block_size_z_direct =
+    cuda_flatten_indexer_direct<cuda::block_z<Z_GRID_SIZE>>;
+
+template<int X_GRID_SIZE, int Y_GRID_SIZE>
+using cuda_flatten_block_size_xy_direct =
+    cuda_flatten_indexer_direct<cuda::block_x<X_GRID_SIZE>,
+                                cuda::block_y<Y_GRID_SIZE>>;
+template<int X_GRID_SIZE, int Z_GRID_SIZE>
+using cuda_flatten_block_size_xz_direct =
+    cuda_flatten_indexer_direct<cuda::block_x<X_GRID_SIZE>,
+                                cuda::block_z<Z_GRID_SIZE>>;
+template<int Y_GRID_SIZE, int X_GRID_SIZE>
+using cuda_flatten_block_size_yx_direct =
+    cuda_flatten_indexer_direct<cuda::block_y<Y_GRID_SIZE>,
+                                cuda::block_x<X_GRID_SIZE>>;
+template<int Y_GRID_SIZE, int Z_GRID_SIZE>
+using cuda_flatten_block_size_yz_direct =
+    cuda_flatten_indexer_direct<cuda::block_y<Y_GRID_SIZE>,
+                                cuda::block_z<Z_GRID_SIZE>>;
+template<int Z_GRID_SIZE, int X_GRID_SIZE>
+using cuda_flatten_block_size_zx_direct =
+    cuda_flatten_indexer_direct<cuda::block_z<Z_GRID_SIZE>,
+                                cuda::block_x<X_GRID_SIZE>>;
+template<int Z_GRID_SIZE, int Y_GRID_SIZE>
+using cuda_flatten_block_size_zy_direct =
+    cuda_flatten_indexer_direct<cuda::block_z<Z_GRID_SIZE>,
+                                cuda::block_y<Y_GRID_SIZE>>;
+
+template<int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE>
+using cuda_flatten_block_size_xyz_direct =
+    cuda_flatten_indexer_direct<cuda::block_x<X_GRID_SIZE>,
+                                cuda::block_y<Y_GRID_SIZE>,
+                                cuda::block_z<Z_GRID_SIZE>>;
+template<int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE>
+using cuda_flatten_block_size_xzy_direct =
+    cuda_flatten_indexer_direct<cuda::block_x<X_GRID_SIZE>,
+                                cuda::block_z<Z_GRID_SIZE>,
+                                cuda::block_y<Y_GRID_SIZE>>;
+template<int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE>
+using cuda_flatten_block_size_yxz_direct =
+    cuda_flatten_indexer_direct<cuda::block_y<Y_GRID_SIZE>,
+                                cuda::block_x<X_GRID_SIZE>,
+                                cuda::block_z<Z_GRID_SIZE>>;
+template<int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE>
+using cuda_flatten_block_size_yzx_direct =
+    cuda_flatten_indexer_direct<cuda::block_y<Y_GRID_SIZE>,
+                                cuda::block_z<Z_GRID_SIZE>,
+                                cuda::block_x<X_GRID_SIZE>>;
+template<int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE>
+using cuda_flatten_block_size_zxy_direct =
+    cuda_flatten_indexer_direct<cuda::block_z<Z_GRID_SIZE>,
+                                cuda::block_x<X_GRID_SIZE>,
+                                cuda::block_y<Y_GRID_SIZE>>;
+template<int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE>
+using cuda_flatten_block_size_zyx_direct =
+    cuda_flatten_indexer_direct<cuda::block_z<Z_GRID_SIZE>,
+                                cuda::block_y<Y_GRID_SIZE>,
+                                cuda::block_x<X_GRID_SIZE>>;
+
+
+template<int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_x_direct =
+    cuda_flatten_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_y_direct =
+    cuda_flatten_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_z_direct =
+    cuda_flatten_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+
+template<int X_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_xy_direct =
+    cuda_flatten_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                                cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template<int X_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_xz_direct =
+    cuda_flatten_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                                cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_yx_direct =
+    cuda_flatten_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                                cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_yz_direct =
+    cuda_flatten_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                                cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_zx_direct =
+    cuda_flatten_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                                cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_zy_direct =
+    cuda_flatten_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                                cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+
+template<int X_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_xyz_direct =
+    cuda_flatten_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                                cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                                cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template<int X_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_xzy_direct =
+    cuda_flatten_indexer_direct<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                                cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                                cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_yxz_direct =
+    cuda_flatten_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                                cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                                cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_yzx_direct =
+    cuda_flatten_indexer_direct<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                                cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                                cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_zxy_direct =
+    cuda_flatten_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                                cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                                cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_zyx_direct =
+    cuda_flatten_indexer_direct<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                                cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                                cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
 
 /*
  * Maps segment indices to flattened CUDA global threads.
  * Reshapes multiple physical global threads into a 1D iteration space
- * Uses global thread-stride looping to exceed the maximum number of physical global threads
+ * Uses global thread-stride looping to exceed the maximum number of physical
+ * global threads
  */
-template < int X_BLOCK_SIZE >
-using cuda_flatten_thread_size_x_loop = cuda_flatten_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE >
-using cuda_flatten_thread_size_y_loop = cuda_flatten_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE >
-using cuda_flatten_thread_size_z_loop = cuda_flatten_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using cuda_flatten_thread_size_xy_loop = cuda_flatten_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using cuda_flatten_thread_size_xz_loop = cuda_flatten_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE >
-using cuda_flatten_thread_size_yx_loop = cuda_flatten_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using cuda_flatten_thread_size_yz_loop = cuda_flatten_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE >
-using cuda_flatten_thread_size_zx_loop = cuda_flatten_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using cuda_flatten_thread_size_zy_loop = cuda_flatten_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using cuda_flatten_thread_size_xyz_loop = cuda_flatten_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using cuda_flatten_thread_size_xzy_loop = cuda_flatten_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using cuda_flatten_thread_size_yxz_loop = cuda_flatten_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE >
-using cuda_flatten_thread_size_yzx_loop = cuda_flatten_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using cuda_flatten_thread_size_zxy_loop = cuda_flatten_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE >
-using cuda_flatten_thread_size_zyx_loop = cuda_flatten_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>, cuda::thread_y<Y_BLOCK_SIZE>, cuda::thread_x<X_BLOCK_SIZE>>;
-
-
-template < int X_GRID_SIZE >
-using cuda_flatten_block_size_x_loop = cuda_flatten_indexer_loop<cuda::block_x<X_GRID_SIZE>>;
-template < int Y_GRID_SIZE >
-using cuda_flatten_block_size_y_loop = cuda_flatten_indexer_loop<cuda::block_y<Y_GRID_SIZE>>;
-template < int Z_GRID_SIZE >
-using cuda_flatten_block_size_z_loop = cuda_flatten_indexer_loop<cuda::block_z<Z_GRID_SIZE>>;
-
-template < int X_GRID_SIZE, int Y_GRID_SIZE >
-using cuda_flatten_block_size_xy_loop = cuda_flatten_indexer_loop<cuda::block_x<X_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
-template < int X_GRID_SIZE, int Z_GRID_SIZE >
-using cuda_flatten_block_size_xz_loop = cuda_flatten_indexer_loop<cuda::block_x<X_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int X_GRID_SIZE >
-using cuda_flatten_block_size_yx_loop = cuda_flatten_indexer_loop<cuda::block_y<Y_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int Z_GRID_SIZE >
-using cuda_flatten_block_size_yz_loop = cuda_flatten_indexer_loop<cuda::block_y<Y_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int X_GRID_SIZE >
-using cuda_flatten_block_size_zx_loop = cuda_flatten_indexer_loop<cuda::block_z<Z_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int Y_GRID_SIZE >
-using cuda_flatten_block_size_zy_loop = cuda_flatten_indexer_loop<cuda::block_z<Z_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
-
-template < int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE >
-using cuda_flatten_block_size_xyz_loop = cuda_flatten_indexer_loop<cuda::block_x<X_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
-template < int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE >
-using cuda_flatten_block_size_xzy_loop = cuda_flatten_indexer_loop<cuda::block_x<X_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE >
-using cuda_flatten_block_size_yxz_loop = cuda_flatten_indexer_loop<cuda::block_y<Y_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE >
-using cuda_flatten_block_size_yzx_loop = cuda_flatten_indexer_loop<cuda::block_y<Y_GRID_SIZE>, cuda::block_z<Z_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE >
-using cuda_flatten_block_size_zxy_loop = cuda_flatten_indexer_loop<cuda::block_z<Z_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE >
-using cuda_flatten_block_size_zyx_loop = cuda_flatten_indexer_loop<cuda::block_z<Z_GRID_SIZE>, cuda::block_y<Y_GRID_SIZE>, cuda::block_x<X_GRID_SIZE>>;
-
-
-template < int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_x_loop = cuda_flatten_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_y_loop = cuda_flatten_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_z_loop = cuda_flatten_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_xy_loop = cuda_flatten_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_xz_loop = cuda_flatten_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_yx_loop = cuda_flatten_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_yz_loop = cuda_flatten_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                 cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_zx_loop = cuda_flatten_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                 cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_zy_loop = cuda_flatten_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                 cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_xyz_loop = cuda_flatten_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_xzy_loop = cuda_flatten_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_yxz_loop = cuda_flatten_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_yzx_loop = cuda_flatten_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                  cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_zxy_loop = cuda_flatten_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using cuda_flatten_global_size_zyx_loop = cuda_flatten_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                  cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                  cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int X_BLOCK_SIZE>
+using cuda_flatten_thread_size_x_loop =
+    cuda_flatten_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE>
+using cuda_flatten_thread_size_y_loop =
+    cuda_flatten_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE>
+using cuda_flatten_thread_size_z_loop =
+    cuda_flatten_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>>;
+
+template<int X_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using cuda_flatten_thread_size_xy_loop =
+    cuda_flatten_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>,
+                              cuda::thread_y<Y_BLOCK_SIZE>>;
+template<int X_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using cuda_flatten_thread_size_xz_loop =
+    cuda_flatten_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>,
+                              cuda::thread_z<Z_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE, int X_BLOCK_SIZE>
+using cuda_flatten_thread_size_yx_loop =
+    cuda_flatten_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>,
+                              cuda::thread_x<X_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using cuda_flatten_thread_size_yz_loop =
+    cuda_flatten_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>,
+                              cuda::thread_z<Z_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE, int X_BLOCK_SIZE>
+using cuda_flatten_thread_size_zx_loop =
+    cuda_flatten_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>,
+                              cuda::thread_x<X_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using cuda_flatten_thread_size_zy_loop =
+    cuda_flatten_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>,
+                              cuda::thread_y<Y_BLOCK_SIZE>>;
+
+template<int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using cuda_flatten_thread_size_xyz_loop =
+    cuda_flatten_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>,
+                              cuda::thread_y<Y_BLOCK_SIZE>,
+                              cuda::thread_z<Z_BLOCK_SIZE>>;
+template<int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using cuda_flatten_thread_size_xzy_loop =
+    cuda_flatten_indexer_loop<cuda::thread_x<X_BLOCK_SIZE>,
+                              cuda::thread_z<Z_BLOCK_SIZE>,
+                              cuda::thread_y<Y_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using cuda_flatten_thread_size_yxz_loop =
+    cuda_flatten_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>,
+                              cuda::thread_x<X_BLOCK_SIZE>,
+                              cuda::thread_z<Z_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE>
+using cuda_flatten_thread_size_yzx_loop =
+    cuda_flatten_indexer_loop<cuda::thread_y<Y_BLOCK_SIZE>,
+                              cuda::thread_z<Z_BLOCK_SIZE>,
+                              cuda::thread_x<X_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using cuda_flatten_thread_size_zxy_loop =
+    cuda_flatten_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>,
+                              cuda::thread_x<X_BLOCK_SIZE>,
+                              cuda::thread_y<Y_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE>
+using cuda_flatten_thread_size_zyx_loop =
+    cuda_flatten_indexer_loop<cuda::thread_z<Z_BLOCK_SIZE>,
+                              cuda::thread_y<Y_BLOCK_SIZE>,
+                              cuda::thread_x<X_BLOCK_SIZE>>;
+
+
+template<int X_GRID_SIZE>
+using cuda_flatten_block_size_x_loop =
+    cuda_flatten_indexer_loop<cuda::block_x<X_GRID_SIZE>>;
+template<int Y_GRID_SIZE>
+using cuda_flatten_block_size_y_loop =
+    cuda_flatten_indexer_loop<cuda::block_y<Y_GRID_SIZE>>;
+template<int Z_GRID_SIZE>
+using cuda_flatten_block_size_z_loop =
+    cuda_flatten_indexer_loop<cuda::block_z<Z_GRID_SIZE>>;
+
+template<int X_GRID_SIZE, int Y_GRID_SIZE>
+using cuda_flatten_block_size_xy_loop =
+    cuda_flatten_indexer_loop<cuda::block_x<X_GRID_SIZE>,
+                              cuda::block_y<Y_GRID_SIZE>>;
+template<int X_GRID_SIZE, int Z_GRID_SIZE>
+using cuda_flatten_block_size_xz_loop =
+    cuda_flatten_indexer_loop<cuda::block_x<X_GRID_SIZE>,
+                              cuda::block_z<Z_GRID_SIZE>>;
+template<int Y_GRID_SIZE, int X_GRID_SIZE>
+using cuda_flatten_block_size_yx_loop =
+    cuda_flatten_indexer_loop<cuda::block_y<Y_GRID_SIZE>,
+                              cuda::block_x<X_GRID_SIZE>>;
+template<int Y_GRID_SIZE, int Z_GRID_SIZE>
+using cuda_flatten_block_size_yz_loop =
+    cuda_flatten_indexer_loop<cuda::block_y<Y_GRID_SIZE>,
+                              cuda::block_z<Z_GRID_SIZE>>;
+template<int Z_GRID_SIZE, int X_GRID_SIZE>
+using cuda_flatten_block_size_zx_loop =
+    cuda_flatten_indexer_loop<cuda::block_z<Z_GRID_SIZE>,
+                              cuda::block_x<X_GRID_SIZE>>;
+template<int Z_GRID_SIZE, int Y_GRID_SIZE>
+using cuda_flatten_block_size_zy_loop =
+    cuda_flatten_indexer_loop<cuda::block_z<Z_GRID_SIZE>,
+                              cuda::block_y<Y_GRID_SIZE>>;
+
+template<int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE>
+using cuda_flatten_block_size_xyz_loop =
+    cuda_flatten_indexer_loop<cuda::block_x<X_GRID_SIZE>,
+                              cuda::block_y<Y_GRID_SIZE>,
+                              cuda::block_z<Z_GRID_SIZE>>;
+template<int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE>
+using cuda_flatten_block_size_xzy_loop =
+    cuda_flatten_indexer_loop<cuda::block_x<X_GRID_SIZE>,
+                              cuda::block_z<Z_GRID_SIZE>,
+                              cuda::block_y<Y_GRID_SIZE>>;
+template<int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE>
+using cuda_flatten_block_size_yxz_loop =
+    cuda_flatten_indexer_loop<cuda::block_y<Y_GRID_SIZE>,
+                              cuda::block_x<X_GRID_SIZE>,
+                              cuda::block_z<Z_GRID_SIZE>>;
+template<int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE>
+using cuda_flatten_block_size_yzx_loop =
+    cuda_flatten_indexer_loop<cuda::block_y<Y_GRID_SIZE>,
+                              cuda::block_z<Z_GRID_SIZE>,
+                              cuda::block_x<X_GRID_SIZE>>;
+template<int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE>
+using cuda_flatten_block_size_zxy_loop =
+    cuda_flatten_indexer_loop<cuda::block_z<Z_GRID_SIZE>,
+                              cuda::block_x<X_GRID_SIZE>,
+                              cuda::block_y<Y_GRID_SIZE>>;
+template<int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE>
+using cuda_flatten_block_size_zyx_loop =
+    cuda_flatten_indexer_loop<cuda::block_z<Z_GRID_SIZE>,
+                              cuda::block_y<Y_GRID_SIZE>,
+                              cuda::block_x<X_GRID_SIZE>>;
+
+
+template<int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_x_loop =
+    cuda_flatten_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_y_loop =
+    cuda_flatten_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_z_loop =
+    cuda_flatten_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+
+template<int X_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_xy_loop =
+    cuda_flatten_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                              cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template<int X_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_xz_loop =
+    cuda_flatten_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                              cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_yx_loop =
+    cuda_flatten_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                              cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_yz_loop =
+    cuda_flatten_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                              cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_zx_loop =
+    cuda_flatten_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                              cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_zy_loop =
+    cuda_flatten_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                              cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+
+template<int X_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_xyz_loop =
+    cuda_flatten_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                              cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                              cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template<int X_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_xzy_loop =
+    cuda_flatten_indexer_loop<cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                              cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                              cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_yxz_loop =
+    cuda_flatten_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                              cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                              cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_yzx_loop =
+    cuda_flatten_indexer_loop<cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                              cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                              cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_zxy_loop =
+    cuda_flatten_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                              cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                              cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified>
+using cuda_flatten_global_size_zyx_loop =
+    cuda_flatten_indexer_loop<cuda::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                              cuda::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                              cuda::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
 
 
 /*
diff --git a/include/RAJA/policy/cuda/raja_cudaerrchk.hpp b/include/RAJA/policy/cuda/raja_cudaerrchk.hpp
index 409ec16818..1107f1f78a 100644
--- a/include/RAJA/policy/cuda/raja_cudaerrchk.hpp
+++ b/include/RAJA/policy/cuda/raja_cudaerrchk.hpp
@@ -43,18 +43,20 @@ namespace RAJA
 ///
 ///////////////////////////////////////////////////////////////////////
 ///
-#define cudaErrchk(ans)                            \
-  {                                                \
-    ::RAJA::cudaAssert((ans), __FILE__, __LINE__); \
+#define cudaErrchk(ans)                                                        \
+  {                                                                            \
+    ::RAJA::cudaAssert((ans), __FILE__, __LINE__);                             \
   }
 
 inline void cudaAssert(cudaError_t code,
-                       const char *file,
+                       const char* file,
                        int line,
                        bool abort = true)
 {
-  if (code != cudaSuccess) {
-    if (abort) {
+  if (code != cudaSuccess)
+  {
+    if (abort)
+    {
       std::string msg;
       msg += "CUDAassert: ";
       msg += cudaGetErrorString(code);
@@ -63,9 +65,11 @@ inline void cudaAssert(cudaError_t code,
       msg += ":";
       msg += std::to_string(line);
       throw std::runtime_error(msg);
-    } else {
-      fprintf(stderr, "CUDAassert: %s %s %d\n",
-              cudaGetErrorString(code), file, line);
+    }
+    else
+    {
+      fprintf(stderr, "CUDAassert: %s %s %d\n", cudaGetErrorString(code), file,
+              line);
     }
   }
 }
diff --git a/include/RAJA/policy/cuda/reduce.hpp b/include/RAJA/policy/cuda/reduce.hpp
index 2b13417531..2d320aa376 100644
--- a/include/RAJA/policy/cuda/reduce.hpp
+++ b/include/RAJA/policy/cuda/reduce.hpp
@@ -44,9 +44,9 @@
 #include "RAJA/policy/cuda/intrinsics.hpp"
 
 #if defined(RAJA_ENABLE_DESUL_ATOMICS)
-  #include "RAJA/policy/desul/atomic.hpp"
+#include "RAJA/policy/desul/atomic.hpp"
 #else
-  #include "RAJA/policy/cuda/atomic.hpp"
+#include "RAJA/policy/cuda/atomic.hpp"
 #endif
 
 #include "RAJA/policy/cuda/policy.hpp"
@@ -62,51 +62,57 @@ namespace cuda
 {
 
 //! atomic operator version of Combiner object
-template <typename Combiner>
+template<typename Combiner>
 struct atomic;
 
-template <typename T>
-struct atomic<sum<T>> {
+template<typename T>
+struct atomic<sum<T>>
+{
   RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v)
   {
-    RAJA::atomicAdd(RAJA::cuda_atomic{}, &val, v);
+    RAJA::atomicAdd(RAJA::cuda_atomic {}, &val, v);
   }
 };
 
-template <typename T>
-struct atomic<min<T>> {
+template<typename T>
+struct atomic<min<T>>
+{
   RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v)
   {
-    RAJA::atomicMin(RAJA::cuda_atomic{}, &val, v);
+    RAJA::atomicMin(RAJA::cuda_atomic {}, &val, v);
   }
 };
 
-template <typename T>
-struct atomic<max<T>> {
+template<typename T>
+struct atomic<max<T>>
+{
   RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v)
   {
-    RAJA::atomicMax(RAJA::cuda_atomic{}, &val, v);
+    RAJA::atomicMax(RAJA::cuda_atomic {}, &val, v);
   }
 };
 
-template <typename T>
-struct atomic<and_bit<T>> {
+template<typename T>
+struct atomic<and_bit<T>>
+{
   RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v)
   {
-    RAJA::atomicAnd(RAJA::cuda_atomic{}, &val, v);
+    RAJA::atomicAnd(RAJA::cuda_atomic {}, &val, v);
   }
 };
 
-template <typename T>
-struct atomic<or_bit<T>> {
+template<typename T>
+struct atomic<or_bit<T>>
+{
   RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v)
   {
-    RAJA::atomicOr(RAJA::cuda_atomic{}, &val, v);
+    RAJA::atomicOr(RAJA::cuda_atomic {}, &val, v);
   }
 };
 
-template <typename T>
-struct cuda_atomic_available {
+template<typename T>
+struct cuda_atomic_available
+{
   static constexpr const bool value =
       (std::is_integral<T>::value && (4 == sizeof(T) || 8 == sizeof(T))) ||
       std::is_same<T, float>::value || std::is_same<T, double>::value;
@@ -124,15 +130,19 @@ namespace impl
 
 //! reduce values in grid into thread 0 of last running block
 //  returns true if put reduced value in val
-template <typename Combiner, typename Accessor,
-          int replication, int atomic_stride,
-          typename T, typename TempIterator>
+template<typename Combiner,
+         typename Accessor,
+         int replication,
+         int atomic_stride,
+         typename T,
+         typename TempIterator>
 RAJA_DEVICE RAJA_INLINE int grid_reduce_last_block(T& val,
-                                        T identity,
-                                        TempIterator in_device_mem,
-                                        unsigned int* device_count)
+                                                   T identity,
+                                                   TempIterator in_device_mem,
+                                                   unsigned int* device_count)
 {
-  typename TempIterator::template rebind_accessor<Accessor> device_mem(in_device_mem);
+  typename TempIterator::template rebind_accessor<Accessor> device_mem(
+      in_device_mem);
 
   int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                  (blockDim.x * blockDim.y) * threadIdx.z;
@@ -143,20 +153,22 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_last_block(T& val,
   int numBlocks = gridDim.x * gridDim.y * gridDim.z;
 
   int replicationId = blockId % replication;
-  int slotId = blockId / replication;
+  int slotId        = blockId / replication;
 
-  int maxNumSlots = (numBlocks + replication - 1) / replication;
+  int maxNumSlots       = (numBlocks + replication - 1) / replication;
   unsigned int numSlots = (numBlocks / replication) +
-      ((replicationId < (numBlocks % replication)) ? 1 : 0);
+                          ((replicationId < (numBlocks % replication)) ? 1 : 0);
 
   int atomicOffset = replicationId * atomic_stride;
-  int beginSlots = replicationId * maxNumSlots;
-  int blockSlot = beginSlots + slotId;
+  int beginSlots   = replicationId * maxNumSlots;
+  int blockSlot    = beginSlots + slotId;
 
   T temp = block_reduce<Combiner>(val, identity);
 
-  if (numSlots <= 1u) {
-    if (threadId == 0) {
+  if (numSlots <= 1u)
+  {
+    if (threadId == 0)
+    {
       val = temp;
     }
     return (threadId == 0) ? replicationId : replication;
@@ -164,33 +176,36 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_last_block(T& val,
 
   // one thread per block writes to device_mem
   bool isLastBlock = false;
-  if (threadId == 0) {
+  if (threadId == 0)
+  {
     device_mem.set(blockSlot, temp);
     // ensure write visible to all threadblocks
     Accessor::fence_release();
     // increment counter, (wraps back to zero if old count == (numSlots-1))
-    unsigned int old_count = ::atomicInc(&device_count[atomicOffset], (numSlots-1));
-    isLastBlock = (old_count == (numSlots-1));
+    unsigned int old_count =
+        ::atomicInc(&device_count[atomicOffset], (numSlots - 1));
+    isLastBlock = (old_count == (numSlots - 1));
   }
 
   // returns non-zero value if any thread passes in a non-zero value
   isLastBlock = __syncthreads_or(isLastBlock);
 
   // last block accumulates values from device_mem
-  if (isLastBlock) {
+  if (isLastBlock)
+  {
     temp = identity;
     Accessor::fence_acquire();
 
-    for (unsigned int i = threadId;
-                      i < numSlots;
-                      i += numThreads) {
-      Combiner{}(temp, device_mem.get(beginSlots + i));
+    for (unsigned int i = threadId; i < numSlots; i += numThreads)
+    {
+      Combiner {}(temp, device_mem.get(beginSlots + i));
     }
 
     temp = block_reduce<Combiner>(temp, identity);
 
     // one thread returns value
-    if (threadId == 0) {
+    if (threadId == 0)
+    {
       val = temp;
     }
   }
@@ -198,72 +213,92 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_last_block(T& val,
   return (isLastBlock && threadId == 0) ? replicationId : replication;
 }
 
-namespace expt {
+namespace expt
+{
 
-template <typename ThreadIterationGetter, typename Combiner, typename T>
+template<typename ThreadIterationGetter, typename Combiner, typename T>
 RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity)
 {
   const int numThreads = ThreadIterationGetter::size();
-  const int threadId = ThreadIterationGetter::index();
+  const int threadId   = ThreadIterationGetter::index();
 
-  const int warpId = threadId % RAJA::policy::cuda::device_constants.WARP_SIZE;
+  const int warpId  = threadId % RAJA::policy::cuda::device_constants.WARP_SIZE;
   const int warpNum = threadId / RAJA::policy::cuda::device_constants.WARP_SIZE;
 
   T temp = val;
 
-  if (numThreads % RAJA::policy::cuda::device_constants.WARP_SIZE == 0) {
+  if (numThreads % RAJA::policy::cuda::device_constants.WARP_SIZE == 0)
+  {
 
     // reduce each warp
-    for (int i = 1; i < RAJA::policy::cuda::device_constants.WARP_SIZE; i *= 2) {
+    for (int i = 1; i < RAJA::policy::cuda::device_constants.WARP_SIZE; i *= 2)
+    {
       T rhs = RAJA::cuda::impl::shfl_xor_sync(temp, i);
-      temp = Combiner{}(temp, rhs);
+      temp  = Combiner {}(temp, rhs);
     }
-
-  } else {
+  }
+  else
+  {
 
     // reduce each warp
-    for (int i = 1; i < RAJA::policy::cuda::device_constants.WARP_SIZE; i *= 2) {
+    for (int i = 1; i < RAJA::policy::cuda::device_constants.WARP_SIZE; i *= 2)
+    {
       int srcLane = threadId ^ i;
-      T rhs = RAJA::cuda::impl::shfl_sync(temp, srcLane);
+      T rhs       = RAJA::cuda::impl::shfl_sync(temp, srcLane);
       // only add from threads that exist (don't double count own value)
-      if (srcLane < numThreads) {
-        temp = Combiner{}(temp, rhs);
+      if (srcLane < numThreads)
+      {
+        temp = Combiner {}(temp, rhs);
       }
     }
   }
 
-  static_assert(RAJA::policy::cuda::device_constants.MAX_WARPS <= RAJA::policy::cuda::device_constants.WARP_SIZE,
-               "Max Warps must be less than or equal to Warp Size for this algorithm to work");
+  static_assert(RAJA::policy::cuda::device_constants.MAX_WARPS <=
+                    RAJA::policy::cuda::device_constants.WARP_SIZE,
+                "Max Warps must be less than or equal to Warp Size for this "
+                "algorithm to work");
 
   // reduce per warp values
-  if (numThreads > RAJA::policy::cuda::device_constants.WARP_SIZE) {
+  if (numThreads > RAJA::policy::cuda::device_constants.WARP_SIZE)
+  {
 
     // Need to separate declaration and initialization for clang-cuda
-    __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray<T, RAJA::policy::cuda::device_constants.MAX_WARPS>)];
+    __shared__ unsigned char
+        tmpsd[sizeof(RAJA::detail::SoAArray<
+                     T, RAJA::policy::cuda::device_constants.MAX_WARPS>)];
 
     // Partial placement new: Should call new(tmpsd) here but recasting memory
     // to avoid calling constructor/destructor in shared memory.
-    RAJA::detail::SoAArray<T, RAJA::policy::cuda::device_constants.MAX_WARPS> * sd = reinterpret_cast<RAJA::detail::SoAArray<T, RAJA::policy::cuda::device_constants.MAX_WARPS> *>(tmpsd);
+    RAJA::detail::SoAArray<T, RAJA::policy::cuda::device_constants.MAX_WARPS>*
+        sd = reinterpret_cast<RAJA::detail::SoAArray<
+            T, RAJA::policy::cuda::device_constants.MAX_WARPS>*>(tmpsd);
 
     // write per warp values to shared memory
-    if (warpId == 0) {
+    if (warpId == 0)
+    {
       sd->set(warpNum, temp);
     }
 
     __syncthreads();
 
-    if (warpNum == 0) {
+    if (warpNum == 0)
+    {
 
       // read per warp values
-      if (warpId * RAJA::policy::cuda::device_constants.WARP_SIZE < numThreads) {
+      if (warpId * RAJA::policy::cuda::device_constants.WARP_SIZE < numThreads)
+      {
         temp = sd->get(warpId);
-      } else {
+      }
+      else
+      {
         temp = identity;
       }
 
-      for (int i = 1; i < RAJA::policy::cuda::device_constants.MAX_WARPS; i *= 2) {
+      for (int i = 1; i < RAJA::policy::cuda::device_constants.MAX_WARPS;
+           i *= 2)
+      {
         T rhs = RAJA::cuda::impl::shfl_xor_sync(temp, i);
-        temp = Combiner{}(temp, rhs);
+        temp  = Combiner {}(temp, rhs);
       }
     }
 
@@ -273,68 +308,77 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity)
   return temp;
 }
 
-
-template <typename GlobalIterationGetter, typename OP, typename T>
-RAJA_DEVICE RAJA_INLINE void grid_reduce( T * device_target,
-                                          T val,
-                                          RAJA::detail::SoAPtr<T,RAJA::cuda::device_mempool_type> device_mem,
-                                          unsigned int* device_count)
+template<typename GlobalIterationGetter, typename OP, typename T>
+RAJA_DEVICE RAJA_INLINE void grid_reduce(
+    T* device_target,
+    T val,
+    RAJA::detail::SoAPtr<T, RAJA::cuda::device_mempool_type> device_mem,
+    unsigned int* device_count)
 {
-  using BlockIterationGetter = typename get_index_block<GlobalIterationGetter>::type;
-  using ThreadIterationGetter = typename get_index_thread<GlobalIterationGetter>::type;
+  using BlockIterationGetter =
+      typename get_index_block<GlobalIterationGetter>::type;
+  using ThreadIterationGetter =
+      typename get_index_thread<GlobalIterationGetter>::type;
 
-  const int numBlocks = BlockIterationGetter::size();
-  const int numThreads = ThreadIterationGetter::size();
+  const int numBlocks            = BlockIterationGetter::size();
+  const int numThreads           = ThreadIterationGetter::size();
   const unsigned int wrap_around = numBlocks - 1;
 
-  const int blockId = BlockIterationGetter::index();
+  const int blockId  = BlockIterationGetter::index();
   const int threadId = ThreadIterationGetter::index();
 
   T temp = block_reduce<ThreadIterationGetter, OP>(val, OP::identity());
 
   // one thread per block writes to device_mem
   bool lastBlock = false;
-  if (threadId == 0) {
+  if (threadId == 0)
+  {
     device_mem.set(blockId, temp);
     // ensure write visible to all threadblocks
     __threadfence();
     // increment counter, (wraps back to zero if old count == wrap_around)
     unsigned int old_count = ::atomicInc(device_count, wrap_around);
-    lastBlock = (old_count == wrap_around);
+    lastBlock              = (old_count == wrap_around);
   }
 
   // returns non-zero value if any thread passes in a non-zero value
   lastBlock = __syncthreads_or(lastBlock);
 
   // last block accumulates values from device_mem
-  if (lastBlock) {
+  if (lastBlock)
+  {
     temp = OP::identity();
     __threadfence();
 
-    for (int i = threadId; i < numBlocks; i += numThreads) {
-      temp = OP{}(temp, device_mem.get(i));
+    for (int i = threadId; i < numBlocks; i += numThreads)
+    {
+      temp = OP {}(temp, device_mem.get(i));
     }
 
     temp = block_reduce<ThreadIterationGetter, OP>(temp, OP::identity());
 
     // one thread returns value
-    if (threadId == 0) {
+    if (threadId == 0)
+    {
       *device_target = temp;
     }
   }
 }
 
-} //  namespace expt
-
+}  //  namespace expt
 
 //! reduce values in grid into thread 0 of last running block
 //  returns true if put reduced value in val
-template <typename Combiner, typename Accessor,
-          int replication, int atomic_stride, typename T>
-RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(T& val,
-                                               T identity,
-                                               T* device_mem,
-                                               unsigned int* device_count)
+template<typename Combiner,
+         typename Accessor,
+         int replication,
+         int atomic_stride,
+         typename T>
+RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(
+    T& val,
+    T identity,
+    T* device_mem,
+    unsigned int* device_count)
 {
   int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                  (blockDim.x * blockDim.y) * threadIdx.z;
@@ -343,24 +387,28 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(T& val,
                 (gridDim.x * gridDim.y) * blockIdx.z;
   int numBlocks = gridDim.x * gridDim.y * gridDim.z;
 
-  int replicationId = (blockId%replication);
-  int atomicOffset = replicationId*atomic_stride;
+  int replicationId = (blockId % replication);
+  int atomicOffset  = replicationId * atomic_stride;
 
   unsigned int numSlots = (numBlocks / replication) +
-      ((replicationId < (numBlocks % replication)) ? 1 : 0);
+                          ((replicationId < (numBlocks % replication)) ? 1 : 0);
 
-  if (numSlots <= 1u) {
+  if (numSlots <= 1u)
+  {
     T temp = block_reduce<Combiner>(val, identity);
-    if (threadId == 0) {
+    if (threadId == 0)
+    {
       val = temp;
     }
     return (threadId == 0) ? replicationId : replication;
   }
 
   // the first block of each replication initializes device_mem
-  if (threadId == 0) {
+  if (threadId == 0)
+  {
     unsigned int old_val = ::atomicCAS(&device_count[atomicOffset], 0u, 1u);
-    if (old_val == 0u) {
+    if (old_val == 0u)
+    {
       Accessor::set(device_mem, atomicOffset, identity);
       Accessor::fence_release();
       ::atomicAdd(&device_count[atomicOffset], 1u);
@@ -371,19 +419,22 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(T& val,
 
   // one thread per block performs an atomic on device_mem
   bool isLastBlock = false;
-  if (threadId == 0) {
+  if (threadId == 0)
+  {
     // wait for device_mem to be initialized
     while (::atomicAdd(&device_count[atomicOffset], 0u) < 2u)
       ;
     Accessor::fence_acquire();
-    RAJA::reduce::cuda::atomic<Combiner>{}(device_mem[atomicOffset], temp);
+    RAJA::reduce::cuda::atomic<Combiner> {}(device_mem[atomicOffset], temp);
     Accessor::fence_release();
     // increment counter, (wraps back to zero if old count == (numSlots+1))
-    unsigned int old_count = ::atomicInc(&device_count[atomicOffset], (numSlots+1));
-    isLastBlock = (old_count == (numSlots+1));
+    unsigned int old_count =
+        ::atomicInc(&device_count[atomicOffset], (numSlots + 1));
+    isLastBlock = (old_count == (numSlots + 1));
 
     // the last block for each replication gets the value from device_mem
-    if (isLastBlock) {
+    if (isLastBlock)
+    {
       Accessor::fence_acquire();
       val = Accessor::get(device_mem, atomicOffset);
     }
@@ -393,10 +444,10 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(T& val,
 }
 
 //! reduce values in block into thread 0 and atomically combines into device_mem
-template <typename Combiner, int replication, int atomic_stride, typename T>
+template<typename Combiner, int replication, int atomic_stride, typename T>
 RAJA_DEVICE RAJA_INLINE void grid_reduce_atomic_host_init(T& val,
-                                                            T identity,
-                                                            T* device_mem)
+                                                          T identity,
+                                                          T* device_mem)
 {
   int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                  (blockDim.x * blockDim.y) * threadIdx.z;
@@ -404,14 +455,15 @@ RAJA_DEVICE RAJA_INLINE void grid_reduce_atomic_host_init(T& val,
   int blockId = blockIdx.x + gridDim.x * blockIdx.y +
                 (gridDim.x * gridDim.y) * blockIdx.z;
 
-  int replicationId = (blockId%replication);
-  int atomicOffset = replicationId*atomic_stride;
+  int replicationId = (blockId % replication);
+  int atomicOffset  = replicationId * atomic_stride;
 
   T temp = block_reduce<Combiner>(val, identity);
 
   // one thread per block performs an atomic on device_mem
-  if (threadId == 0 && temp != identity) {
-    RAJA::reduce::cuda::atomic<Combiner>{}(device_mem[atomicOffset], temp);
+  if (threadId == 0 && temp != identity)
+  {
+    RAJA::reduce::cuda::atomic<Combiner> {}(device_mem[atomicOffset], temp);
   }
 }
 
@@ -419,17 +471,20 @@ RAJA_DEVICE RAJA_INLINE void grid_reduce_atomic_host_init(T& val,
 
 //! Object that manages pinned memory buffers for reduction results
 //  use one per reducer object
-template <typename T, size_t num_slots, typename mempool>
+template<typename T, size_t num_slots, typename mempool>
 class PinnedTally
 {
 public:
   //! Object put in Pinned memory with value and pointer to next Node
-  struct Node {
+  struct Node
+  {
     Node* next;
     T values[num_slots];
   };
+
   //! Object per resource to keep track of pinned memory nodes
-  struct ResourceNode {
+  struct ResourceNode
+  {
     ResourceNode* next;
     ::RAJA::resources::Cuda res;
     Node* node_list;
@@ -482,14 +537,19 @@ class PinnedTally
 
     const ResourceNodeIterator& operator++()
     {
-      if (m_n->next) {
+      if (m_n->next)
+      {
         m_n = m_n->next;
-      } else if (m_rn->next) {
+      }
+      else if (m_rn->next)
+      {
         m_rn = m_rn->next;
-        m_n = m_rn->node_list;
-      } else {
+        m_n  = m_rn->node_list;
+      }
+      else
+      {
         m_rn = nullptr;
-        m_n = nullptr;
+        m_n  = nullptr;
       }
       return *this;
     }
@@ -501,7 +561,7 @@ class PinnedTally
       return ret;
     }
 
-    auto operator*() -> T(&)[num_slots] { return m_n->values; }
+    auto operator*() -> T (&)[num_slots] { return m_n->values; }
 
     bool operator==(const ResourceNodeIterator& rhs) const
     {
@@ -538,25 +598,27 @@ class PinnedTally
   ResourceNodeIterator end() { return {nullptr, nullptr}; }
 
   //! get new value for use in resource
-  auto new_value(::RAJA::resources::Cuda res) -> T(&)[num_slots]
+  auto new_value(::RAJA::resources::Cuda res) -> T (&)[num_slots]
   {
 #if defined(RAJA_ENABLE_OPENMP)
     lock_guard<omp::mutex> lock(m_mutex);
 #endif
     ResourceNode* rn = resource_list;
-    while (rn) {
+    while (rn)
+    {
       if (rn->res.get_stream() == res.get_stream()) break;
       rn = rn->next;
     }
-    if (!rn) {
-      rn = (ResourceNode*)malloc(sizeof(ResourceNode));
-      rn->next = resource_list;
-      rn->res = res;
+    if (!rn)
+    {
+      rn            = (ResourceNode*)malloc(sizeof(ResourceNode));
+      rn->next      = resource_list;
+      rn->res       = res;
       rn->node_list = nullptr;
       resource_list = rn;
     }
-    Node* n = mempool::getInstance().template malloc<Node>(1);
-    n->next = rn->node_list;
+    Node* n       = mempool::getInstance().template malloc<Node>(1);
+    n->next       = rn->node_list;
     rn->node_list = n;
     return n->values;
   }
@@ -565,7 +627,8 @@ class PinnedTally
   void synchronize_resources()
   {
     auto end = resourceEnd();
-    for (auto r = resourceBegin(); r != end; ++r) {
+    for (auto r = resourceBegin(); r != end; ++r)
+    {
       ::RAJA::cuda::synchronize(*r);
     }
   }
@@ -573,10 +636,12 @@ class PinnedTally
   //! all values used in all resources
   void free_list()
   {
-    while (resource_list) {
+    while (resource_list)
+    {
       ResourceNode* rn = resource_list;
-      while (rn->node_list) {
-        Node* n = rn->node_list;
+      while (rn->node_list)
+      {
+        Node* n       = rn->node_list;
         rn->node_list = n->next;
         mempool::getInstance().free(n);
       }
@@ -605,12 +670,15 @@ class PinnedTally
 
 //! Reduction data for Cuda Offload -- stores value, host pointer, and device
 //! pointer
-template <typename Combiner, typename Accessor, typename T,
-          size_t replication, size_t atomic_stride>
+template<typename Combiner,
+         typename Accessor,
+         typename T,
+         size_t replication,
+         size_t atomic_stride>
 struct ReduceLastBlock_Data
 {
   using tally_mempool_type = pinned_mempool_type;
-  using data_mempool_type = device_mempool_type;
+  using data_mempool_type  = device_mempool_type;
   using count_mempool_type = device_zeroed_mempool_type;
 
   static constexpr size_t tally_slots = replication;
@@ -621,7 +689,7 @@ struct ReduceLastBlock_Data
   RAJA::detail::SoAPtr<T, data_mempool_type> device;
   bool owns_device_pointer;
 
-  ReduceLastBlock_Data() : ReduceLastBlock_Data(T(), T()){}
+  ReduceLastBlock_Data() : ReduceLastBlock_Data(T(), T()) {}
 
   /*! \brief create from a default value and offload information
    *
@@ -629,31 +697,30 @@ struct ReduceLastBlock_Data
    */
 
   ReduceLastBlock_Data(T initValue, T identity_)
-      : value{initValue},
-        identity{identity_},
-        device_count{nullptr},
-        device{},
-        owns_device_pointer{false}
-  {
-  }
+      : value {initValue},
+        identity {identity_},
+        device_count {nullptr},
+        device {},
+        owns_device_pointer {false}
+  {}
 
   RAJA_HOST_DEVICE
   ReduceLastBlock_Data(const ReduceLastBlock_Data& other)
-      : value{other.identity},
-        identity{other.identity},
-        device_count{other.device_count},
-        device{other.device},
-        owns_device_pointer{false}
-  {
-  }
+      : value {other.identity},
+        identity {other.identity},
+        device_count {other.device_count},
+        device {other.device},
+        owns_device_pointer {false}
+  {}
 
   ReduceLastBlock_Data& operator=(const ReduceLastBlock_Data&) = default;
 
   //! initialize output to identity to ensure never read
   //  uninitialized memory
-  T* init_grid_vals(T(&output)[tally_slots])
+  T* init_grid_vals(T (&output)[tally_slots])
   {
-    for (size_t r = 0; r < tally_slots; ++r) {
+    for (size_t r = 0; r < tally_slots; ++r)
+    {
       output[r] = identity;
     }
     return &output[0];
@@ -665,10 +732,12 @@ struct ReduceLastBlock_Data
   {
     T temp = value;
 
-    size_t replicationId = impl::grid_reduce_last_block<
-        Combiner, Accessor, replication, atomic_stride>(
-          temp, identity, device, device_count);
-    if (replicationId != replication) {
+    size_t replicationId =
+        impl::grid_reduce_last_block<Combiner, Accessor, replication,
+                                     atomic_stride>(temp, identity, device,
+                                                    device_count);
+    if (replicationId != replication)
+    {
       output[replicationId] = temp;
     }
   }
@@ -678,13 +747,15 @@ struct ReduceLastBlock_Data
   bool setupForDevice()
   {
     bool act = !device.allocated() && setupReducers();
-    if (act) {
+    if (act)
+    {
       cuda_dim_t gridDim = currentGridDim();
-      size_t numBlocks = gridDim.x * gridDim.y * gridDim.z;
+      size_t numBlocks   = gridDim.x * gridDim.y * gridDim.z;
       size_t maxNumSlots = (numBlocks + replication - 1) / replication;
-      device.allocate(maxNumSlots*replication);
-      device_count = count_mempool_type::getInstance()
-                         .template malloc<unsigned int>(replication*atomic_stride);
+      device.allocate(maxNumSlots * replication);
+      device_count =
+          count_mempool_type::getInstance().template malloc<unsigned int>(
+              replication * atomic_stride);
       owns_device_pointer = true;
     }
     return act;
@@ -695,10 +766,11 @@ struct ReduceLastBlock_Data
   bool teardownForDevice()
   {
     bool act = owns_device_pointer;
-    if (act) {
+    if (act)
+    {
       device.deallocate();
       count_mempool_type::getInstance().free(device_count);
-      device_count = nullptr;
+      device_count        = nullptr;
       owns_device_pointer = false;
     }
     return act;
@@ -706,8 +778,10 @@ struct ReduceLastBlock_Data
 };
 
 //! Reduction data for Cuda Offload -- stores value, host pointer
-template <typename Combiner, typename T,
-          size_t replication, size_t atomic_stride>
+template<typename Combiner,
+         typename T,
+         size_t replication,
+         size_t atomic_stride>
 struct ReduceAtomicHostInit_Data
 {
   using tally_mempool_type = device_pinned_mempool_type;
@@ -719,32 +793,32 @@ struct ReduceAtomicHostInit_Data
   bool is_setup;
   bool owns_device_pointer;
 
-  ReduceAtomicHostInit_Data() : ReduceAtomicHostInit_Data(T(), T()){};
+  ReduceAtomicHostInit_Data() : ReduceAtomicHostInit_Data(T(), T()) {};
 
   ReduceAtomicHostInit_Data(T initValue, T identity_)
-      : value{initValue},
-        identity{identity_},
-        is_setup{false},
-        owns_device_pointer{false}
-  {
-  }
+      : value {initValue},
+        identity {identity_},
+        is_setup {false},
+        owns_device_pointer {false}
+  {}
 
   RAJA_HOST_DEVICE
   ReduceAtomicHostInit_Data(const ReduceAtomicHostInit_Data& other)
-      : value{other.identity},
-        identity{other.identity},
-        is_setup{other.is_setup},
-        owns_device_pointer{false}
-  {
-  }
+      : value {other.identity},
+        identity {other.identity},
+        is_setup {other.is_setup},
+        owns_device_pointer {false}
+  {}
 
-  ReduceAtomicHostInit_Data& operator=(const ReduceAtomicHostInit_Data&) = default;
+  ReduceAtomicHostInit_Data& operator=(const ReduceAtomicHostInit_Data&) =
+      default;
 
   //! initialize output to identity to ensure never read
   //  uninitialized memory
-  T* init_grid_vals(T(&output)[tally_slots])
+  T* init_grid_vals(T (&output)[tally_slots])
   {
-    for (size_t r = 0; r < tally_slots; ++r) {
+    for (size_t r = 0; r < tally_slots; ++r)
+    {
       output[r] = identity;
     }
     return &output[0];
@@ -756,9 +830,8 @@ struct ReduceAtomicHostInit_Data
   {
     T temp = value;
 
-    impl::grid_reduce_atomic_host_init<Combiner,
-        replication, atomic_stride>(
-            temp, identity, output);
+    impl::grid_reduce_atomic_host_init<Combiner, replication, atomic_stride>(
+        temp, identity, output);
   }
 
   //! check and setup for device
@@ -766,8 +839,9 @@ struct ReduceAtomicHostInit_Data
   bool setupForDevice()
   {
     bool act = !is_setup && setupReducers();
-    if (act) {
-      is_setup = true;
+    if (act)
+    {
+      is_setup            = true;
       owns_device_pointer = true;
     }
     return act;
@@ -778,8 +852,9 @@ struct ReduceAtomicHostInit_Data
   bool teardownForDevice()
   {
     bool act = owns_device_pointer;
-    if (act) {
-      is_setup = false;
+    if (act)
+    {
+      is_setup            = false;
       owns_device_pointer = false;
     }
     return act;
@@ -787,12 +862,15 @@ struct ReduceAtomicHostInit_Data
 };
 
 //! Reduction data for Cuda Offload -- stores value, host pointer
-template <typename Combiner, typename Accessor, typename T,
-          size_t replication, size_t atomic_stride>
+template<typename Combiner,
+         typename Accessor,
+         typename T,
+         size_t replication,
+         size_t atomic_stride>
 struct ReduceAtomicDeviceInit_Data
 {
   using tally_mempool_type = pinned_mempool_type;
-  using data_mempool_type = device_mempool_type;
+  using data_mempool_type  = device_mempool_type;
   using count_mempool_type = device_zeroed_mempool_type;
 
   static constexpr size_t tally_slots = replication;
@@ -803,34 +881,34 @@ struct ReduceAtomicDeviceInit_Data
   T* device;
   bool owns_device_pointer;
 
-  ReduceAtomicDeviceInit_Data() : ReduceAtomicDeviceInit_Data(T(), T()){};
+  ReduceAtomicDeviceInit_Data() : ReduceAtomicDeviceInit_Data(T(), T()) {};
 
   ReduceAtomicDeviceInit_Data(T initValue, T identity_)
-      : value{initValue},
-        identity{identity_},
-        device_count{nullptr},
-        device{nullptr},
-        owns_device_pointer{false}
-  {
-  }
+      : value {initValue},
+        identity {identity_},
+        device_count {nullptr},
+        device {nullptr},
+        owns_device_pointer {false}
+  {}
 
   RAJA_HOST_DEVICE
   ReduceAtomicDeviceInit_Data(const ReduceAtomicDeviceInit_Data& other)
-      : value{other.identity},
-        identity{other.identity},
-        device_count{other.device_count},
-        device{other.device},
-        owns_device_pointer{false}
-  {
-  }
+      : value {other.identity},
+        identity {other.identity},
+        device_count {other.device_count},
+        device {other.device},
+        owns_device_pointer {false}
+  {}
 
-  ReduceAtomicDeviceInit_Data& operator=(const ReduceAtomicDeviceInit_Data&) = default;
+  ReduceAtomicDeviceInit_Data& operator=(const ReduceAtomicDeviceInit_Data&) =
+      default;
 
   //! initialize output to identity to ensure never read
   //  uninitialized memory
-  T* init_grid_vals(T(&output)[tally_slots])
+  T* init_grid_vals(T (&output)[tally_slots])
   {
-    for (size_t r = 0; r < tally_slots; ++r) {
+    for (size_t r = 0; r < tally_slots; ++r)
+    {
       output[r] = identity;
     }
     return &output[0];
@@ -842,10 +920,12 @@ struct ReduceAtomicDeviceInit_Data
   {
     T temp = value;
 
-    size_t replicationId = impl::grid_reduce_atomic_device_init<
-        Combiner, Accessor, replication, atomic_stride>(
-          temp, identity, device, device_count);
-    if (replicationId != replication) {
+    size_t replicationId =
+        impl::grid_reduce_atomic_device_init<Combiner, Accessor, replication,
+                                             atomic_stride>(
+            temp, identity, device, device_count);
+    if (replicationId != replication)
+    {
       output[replicationId] = temp;
     }
   }
@@ -855,10 +935,13 @@ struct ReduceAtomicDeviceInit_Data
   bool setupForDevice()
   {
     bool act = !device && setupReducers();
-    if (act) {
-      device = data_mempool_type::getInstance().template malloc<T>(replication*atomic_stride);
-      device_count = count_mempool_type::getInstance()
-                         .template malloc<unsigned int>(replication*atomic_stride);
+    if (act)
+    {
+      device = data_mempool_type::getInstance().template malloc<T>(
+          replication * atomic_stride);
+      device_count =
+          count_mempool_type::getInstance().template malloc<unsigned int>(
+              replication * atomic_stride);
       owns_device_pointer = true;
     }
     return act;
@@ -869,65 +952,93 @@ struct ReduceAtomicDeviceInit_Data
   bool teardownForDevice()
   {
     bool act = owns_device_pointer;
-    if (act) {
+    if (act)
+    {
       data_mempool_type::getInstance().free(device);
       device = nullptr;
       count_mempool_type::getInstance().free(device_count);
-      device_count = nullptr;
+      device_count        = nullptr;
       owns_device_pointer = false;
     }
     return act;
   }
 };
 
-
 //! Cuda Reduction entity -- generalize on reduction, and type
-template <typename Combiner, typename T, typename tuning>
+template<typename Combiner, typename T, typename tuning>
 class Reduce
 {
-  static constexpr size_t replication = (tuning::replication > 0)
-      ? tuning::replication
-      : 1;
-  static constexpr size_t atomic_stride = (tuning::atomic_stride > 0)
-      ? tuning::atomic_stride
-      : ((policy::cuda::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T))
-        ? RAJA_DIVIDE_CEILING_INT(policy::cuda::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T))
-        : 1);
-
-  using Accessor = std::conditional_t<(tuning::comm_mode == block_communication_mode::block_fence),
+  static constexpr size_t replication =
+      (tuning::replication > 0) ? tuning::replication : 1;
+  static constexpr size_t atomic_stride =
+      (tuning::atomic_stride > 0)
+          ? tuning::atomic_stride
+          : ((policy::cuda::device_constants
+                  .ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T))
+                 ? RAJA_DIVIDE_CEILING_INT(
+                       policy::cuda::device_constants
+                           .ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE,
+                       sizeof(T))
+                 : 1);
+
+  using Accessor = std::conditional_t<
+      (tuning::comm_mode == block_communication_mode::block_fence),
       impl::AccessorDeviceScopeUseBlockFence,
-      std::conditional_t<(tuning::comm_mode == block_communication_mode::device_fence),
-        impl::AccessorDeviceScopeUseDeviceFence,
-        void>>;
+      std::conditional_t<(tuning::comm_mode ==
+                          block_communication_mode::device_fence),
+                         impl::AccessorDeviceScopeUseDeviceFence,
+                         void>>;
 
   static constexpr bool atomic_policy =
-      (tuning::algorithm == reduce_algorithm::init_device_combine_atomic_block) ||
+      (tuning::algorithm ==
+       reduce_algorithm::init_device_combine_atomic_block) ||
       (tuning::algorithm == reduce_algorithm::init_host_combine_atomic_block);
-  static constexpr bool atomic_available = RAJA::reduce::cuda::cuda_atomic_available<T>::value;
+  static constexpr bool atomic_available =
+      RAJA::reduce::cuda::cuda_atomic_available<T>::value;
 
   //! cuda reduction data storage class and folding algorithm
-  using reduce_data_type = std::conditional_t<(tuning::algorithm == reduce_algorithm::combine_last_block) ||
-                                              (atomic_policy && !atomic_available),
-      cuda::ReduceLastBlock_Data<Combiner, Accessor, T, replication, atomic_stride>,
-      std::conditional_t<atomic_available,
-        std::conditional_t<(tuning::algorithm == reduce_algorithm::init_device_combine_atomic_block),
-          cuda::ReduceAtomicDeviceInit_Data<Combiner, Accessor, T, replication, atomic_stride>,
-          std::conditional_t<(tuning::algorithm == reduce_algorithm::init_host_combine_atomic_block),
-            cuda::ReduceAtomicHostInit_Data<Combiner, T, replication, atomic_stride>,
-            void>>,
-        void>>;
+  using reduce_data_type = std::conditional_t<
+      (tuning::algorithm == reduce_algorithm::combine_last_block) ||
+          (atomic_policy && !atomic_available),
+      cuda::ReduceLastBlock_Data<Combiner,
+                                 Accessor,
+                                 T,
+                                 replication,
+                                 atomic_stride>,
+      std::conditional_t<
+          atomic_available,
+          std::conditional_t<
+              (tuning::algorithm ==
+               reduce_algorithm::init_device_combine_atomic_block),
+              cuda::ReduceAtomicDeviceInit_Data<Combiner,
+                                                Accessor,
+                                                T,
+                                                replication,
+                                                atomic_stride>,
+              std::conditional_t<
+                  (tuning::algorithm ==
+                   reduce_algorithm::init_host_combine_atomic_block),
+                  cuda::ReduceAtomicHostInit_Data<Combiner,
+                                                  T,
+                                                  replication,
+                                                  atomic_stride>,
+                  void>>,
+          void>>;
 
   static constexpr size_t tally_slots = reduce_data_type::tally_slots;
 
-  using TallyType = PinnedTally<T, tally_slots, typename reduce_data_type::tally_mempool_type>;
+  using TallyType = PinnedTally<T,
+                                tally_slots,
+                                typename reduce_data_type::tally_mempool_type>;
 
   //! union to hold either pointer to PinnedTally or pointer to value
   //  only use list before setup for device and only use val_ptr after
-  union tally_u {
+  union tally_u
+  {
     TallyType* list;
     T* val_ptr;
-    constexpr tally_u(TallyType* l) : list(l){};
-    constexpr tally_u(T* v_ptr) : val_ptr(v_ptr){};
+    constexpr tally_u(TallyType* l) : list(l) {};
+    constexpr tally_u(T* v_ptr) : val_ptr(v_ptr) {};
   };
 
 public:
@@ -936,11 +1047,10 @@ class Reduce
   //! create a reduce object
   //  the original object's parent is itself
   explicit Reduce(T init_val, T identity_ = Combiner::identity())
-      : parent{this},
-        tally_or_val_ptr{new TallyType},
+      : parent {this},
+        tally_or_val_ptr {new TallyType},
         val(init_val, identity_)
-  {
-  }
+  {}
 
   void reset(T in_val, T identity_ = Combiner::identity())
   {
@@ -954,16 +1064,18 @@ class Reduce
   RAJA_HOST_DEVICE
   Reduce(const Reduce& other)
 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
-      : parent{other.parent},
+      : parent {other.parent},
 #else
-      : parent{&other},
+      : parent {&other},
 #endif
-        tally_or_val_ptr{other.tally_or_val_ptr},
+        tally_or_val_ptr {other.tally_or_val_ptr},
         val(other.val)
   {
 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
-    if (parent) {
-      if (val.setupForDevice()) {
+    if (parent)
+    {
+      if (val.setupForDevice())
+      {
         tally_or_val_ptr.val_ptr = val.init_grid_vals(
             tally_or_val_ptr.list->new_value(currentResource()));
         parent = nullptr;
@@ -978,25 +1090,35 @@ class Reduce
   ~Reduce()
   {
 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
-    if (parent == this) {
+    if (parent == this)
+    {
       delete tally_or_val_ptr.list;
       tally_or_val_ptr.list = nullptr;
-    } else if (parent) {
-      if (val.value != val.identity) {
+    }
+    else if (parent)
+    {
+      if (val.value != val.identity)
+      {
 #if defined(RAJA_ENABLE_OPENMP)
         lock_guard<omp::mutex> lock(tally_or_val_ptr.list->m_mutex);
 #endif
         parent->combine(val.value);
       }
-    } else {
-      if (val.teardownForDevice()) {
+    }
+    else
+    {
+      if (val.teardownForDevice())
+      {
         tally_or_val_ptr.val_ptr = nullptr;
       }
     }
 #else
-    if (!parent->parent) {
+    if (!parent->parent)
+    {
       val.grid_reduce(tally_or_val_ptr.val_ptr);
-    } else {
+    }
+    else
+    {
       parent->combine(val.value);
     }
 #endif
@@ -1005,15 +1127,18 @@ class Reduce
   //! map result value back to host if not done already; return aggregate value
   operator T()
   {
-    auto n = tally_or_val_ptr.list->begin();
+    auto n   = tally_or_val_ptr.list->begin();
     auto end = tally_or_val_ptr.list->end();
-    if (n != end) {
+    if (n != end)
+    {
       tally_or_val_ptr.list->synchronize_resources();
       ::RAJA::detail::HighAccuracyReduce<T, typename Combiner::operator_type>
           reducer(std::move(val.value));
-      for (; n != end; ++n) {
+      for (; n != end; ++n)
+      {
         T(&values)[tally_slots] = *n;
-        for (size_t r = 0; r < tally_slots; ++r) {
+        for (size_t r = 0; r < tally_slots; ++r)
+        {
           reducer.combine(std::move(values[r]));
         }
       }
@@ -1022,12 +1147,13 @@ class Reduce
     }
     return val.value;
   }
+
   //! alias for operator T()
   T get() { return operator T(); }
 
   //! apply reduction (const version) -- still combines internal values
   RAJA_HOST_DEVICE
-  void combine(T other) const { Combiner{}(val.value, other); }
+  void combine(T other) const { Combiner {}(val.value, other); }
 
   /*!
    *  \return reference to the local value
@@ -1045,7 +1171,7 @@ class Reduce
 }  // end namespace cuda
 
 //! specialization of ReduceSum for cuda_reduce
-template <typename tuning, typename T>
+template<typename tuning, typename T>
 class ReduceSum<RAJA::policy::cuda::cuda_reduce_policy<tuning>, T>
     : public cuda::Reduce<RAJA::reduce::sum<T>, T, tuning>
 {
@@ -1053,6 +1179,7 @@ class ReduceSum<RAJA::policy::cuda::cuda_reduce_policy<tuning>, T>
 public:
   using Base = cuda::Reduce<RAJA::reduce::sum<T>, T, tuning>;
   using Base::Base;
+
   //! enable operator+= for ReduceSum -- alias for combine()
   RAJA_HOST_DEVICE
   const ReduceSum& operator+=(T rhs) const
@@ -1063,7 +1190,7 @@ class ReduceSum<RAJA::policy::cuda::cuda_reduce_policy<tuning>, T>
 };
 
 //! specialization of ReduceBitOr for cuda_reduce
-template <typename tuning, typename T>
+template<typename tuning, typename T>
 class ReduceBitOr<RAJA::policy::cuda::cuda_reduce_policy<tuning>, T>
     : public cuda::Reduce<RAJA::reduce::or_bit<T>, T, tuning>
 {
@@ -1071,6 +1198,7 @@ class ReduceBitOr<RAJA::policy::cuda::cuda_reduce_policy<tuning>, T>
 public:
   using Base = cuda::Reduce<RAJA::reduce::or_bit<T>, T, tuning>;
   using Base::Base;
+
   //! enable operator|= for ReduceBitOr -- alias for combine()
   RAJA_HOST_DEVICE
   const ReduceBitOr& operator|=(T rhs) const
@@ -1081,7 +1209,7 @@ class ReduceBitOr<RAJA::policy::cuda::cuda_reduce_policy<tuning>, T>
 };
 
 //! specialization of ReduceBitAnd for cuda_reduce
-template <typename tuning, typename T>
+template<typename tuning, typename T>
 class ReduceBitAnd<RAJA::policy::cuda::cuda_reduce_policy<tuning>, T>
     : public cuda::Reduce<RAJA::reduce::and_bit<T>, T, tuning>
 {
@@ -1089,6 +1217,7 @@ class ReduceBitAnd<RAJA::policy::cuda::cuda_reduce_policy<tuning>, T>
 public:
   using Base = cuda::Reduce<RAJA::reduce::and_bit<T>, T, tuning>;
   using Base::Base;
+
   //! enable operator&= for ReduceBitAnd -- alias for combine()
   RAJA_HOST_DEVICE
   const ReduceBitAnd& operator&=(T rhs) const
@@ -1099,7 +1228,7 @@ class ReduceBitAnd<RAJA::policy::cuda::cuda_reduce_policy<tuning>, T>
 };
 
 //! specialization of ReduceMin for cuda_reduce
-template <typename tuning, typename T>
+template<typename tuning, typename T>
 class ReduceMin<RAJA::policy::cuda::cuda_reduce_policy<tuning>, T>
     : public cuda::Reduce<RAJA::reduce::min<T>, T, tuning>
 {
@@ -1107,6 +1236,7 @@ class ReduceMin<RAJA::policy::cuda::cuda_reduce_policy<tuning>, T>
 public:
   using Base = cuda::Reduce<RAJA::reduce::min<T>, T, tuning>;
   using Base::Base;
+
   //! enable min() for ReduceMin -- alias for combine()
   RAJA_HOST_DEVICE
   const ReduceMin& min(T rhs) const
@@ -1117,7 +1247,7 @@ class ReduceMin<RAJA::policy::cuda::cuda_reduce_policy<tuning>, T>
 };
 
 //! specialization of ReduceMax for cuda_reduce
-template <typename tuning, typename T>
+template<typename tuning, typename T>
 class ReduceMax<RAJA::policy::cuda::cuda_reduce_policy<tuning>, T>
     : public cuda::Reduce<RAJA::reduce::max<T>, T, tuning>
 {
@@ -1125,6 +1255,7 @@ class ReduceMax<RAJA::policy::cuda::cuda_reduce_policy<tuning>, T>
 public:
   using Base = cuda::Reduce<RAJA::reduce::max<T>, T, tuning>;
   using Base::Base;
+
   //! enable max() for ReduceMax -- alias for combine()
   RAJA_HOST_DEVICE
   const ReduceMax& max(T rhs) const
@@ -1135,35 +1266,41 @@ class ReduceMax<RAJA::policy::cuda::cuda_reduce_policy<tuning>, T>
 };
 
 //! specialization of ReduceMinLoc for cuda_reduce
-template <typename tuning, typename T, typename IndexType>
+template<typename tuning, typename T, typename IndexType>
 class ReduceMinLoc<RAJA::policy::cuda::cuda_reduce_policy<tuning>, T, IndexType>
-    : public cuda::Reduce<RAJA::reduce::min<RAJA::reduce::detail::ValueLoc<T, IndexType>>,
-                          RAJA::reduce::detail::ValueLoc<T, IndexType>,
-                          tuning>
+    : public cuda::Reduce<
+          RAJA::reduce::min<RAJA::reduce::detail::ValueLoc<T, IndexType>>,
+          RAJA::reduce::detail::ValueLoc<T, IndexType>,
+          tuning>
 {
 
 public:
-  using value_type = RAJA::reduce::detail::ValueLoc<T, IndexType>;
-  using Combiner = RAJA::reduce::min<value_type>;
+  using value_type     = RAJA::reduce::detail::ValueLoc<T, IndexType>;
+  using Combiner       = RAJA::reduce::min<value_type>;
   using NonLocCombiner = RAJA::reduce::min<T>;
-  using Base = cuda::Reduce<Combiner, value_type, tuning>;
+  using Base           = cuda::Reduce<Combiner, value_type, tuning>;
   using Base::Base;
 
   //! constructor requires a default value for the reducer
-  ReduceMinLoc(T init_val, IndexType init_idx,
+  ReduceMinLoc(T init_val,
+               IndexType init_idx,
                T identity_val = NonLocCombiner::identity(),
-               IndexType identity_idx = RAJA::reduce::detail::DefaultLoc<IndexType>().value())
-      : Base(value_type(init_val, init_idx), value_type(identity_val, identity_idx))
-  {
-  }
+               IndexType identity_idx =
+                   RAJA::reduce::detail::DefaultLoc<IndexType>().value())
+      : Base(value_type(init_val, init_idx),
+             value_type(identity_val, identity_idx))
+  {}
 
   //! reset requires a default value for the reducer
   // this must be here to hide Base::reset
-  void reset(T init_val, IndexType init_idx,
+  void reset(T init_val,
+             IndexType init_idx,
              T identity_val = NonLocCombiner::identity(),
-             IndexType identity_idx = RAJA::reduce::detail::DefaultLoc<IndexType>().value())
+             IndexType identity_idx =
+                 RAJA::reduce::detail::DefaultLoc<IndexType>().value())
   {
-    Base::reset(value_type(init_val, init_idx), value_type(identity_val, identity_idx));
+    Base::reset(value_type(init_val, init_idx),
+                value_type(identity_val, identity_idx));
   }
 
   //! reducer function; updates the current instance's state
@@ -1185,35 +1322,41 @@ class ReduceMinLoc<RAJA::policy::cuda::cuda_reduce_policy<tuning>, T, IndexType>
 };
 
 //! specialization of ReduceMaxLoc for cuda_reduce
-template <typename tuning, typename T, typename IndexType>
+template<typename tuning, typename T, typename IndexType>
 class ReduceMaxLoc<RAJA::policy::cuda::cuda_reduce_policy<tuning>, T, IndexType>
-    : public cuda::
-          Reduce<RAJA::reduce::max<RAJA::reduce::detail::ValueLoc<T, IndexType, false>>,
-                 RAJA::reduce::detail::ValueLoc<T, IndexType, false>,
-                 tuning>
+    : public cuda::Reduce<
+          RAJA::reduce::max<
+              RAJA::reduce::detail::ValueLoc<T, IndexType, false>>,
+          RAJA::reduce::detail::ValueLoc<T, IndexType, false>,
+          tuning>
 {
 public:
-  using value_type = RAJA::reduce::detail::ValueLoc<T, IndexType, false>;
-  using Combiner = RAJA::reduce::max<value_type>;
+  using value_type     = RAJA::reduce::detail::ValueLoc<T, IndexType, false>;
+  using Combiner       = RAJA::reduce::max<value_type>;
   using NonLocCombiner = RAJA::reduce::max<T>;
-  using Base = cuda::Reduce<Combiner, value_type, tuning>;
+  using Base           = cuda::Reduce<Combiner, value_type, tuning>;
   using Base::Base;
 
   //! constructor requires a default value for the reducer
-  ReduceMaxLoc(T init_val, IndexType init_idx,
+  ReduceMaxLoc(T init_val,
+               IndexType init_idx,
                T identity_val = NonLocCombiner::identity(),
-               IndexType identity_idx = RAJA::reduce::detail::DefaultLoc<IndexType>().value())
-      : Base(value_type(init_val, init_idx), value_type(identity_val, identity_idx))
-  {
-  }
+               IndexType identity_idx =
+                   RAJA::reduce::detail::DefaultLoc<IndexType>().value())
+      : Base(value_type(init_val, init_idx),
+             value_type(identity_val, identity_idx))
+  {}
 
   //! reset requires a default value for the reducer
   // this must be here to hide Base::reset
-  void reset(T init_val, IndexType init_idx,
+  void reset(T init_val,
+             IndexType init_idx,
              T identity_val = NonLocCombiner::identity(),
-             IndexType identity_idx = RAJA::reduce::detail::DefaultLoc<IndexType>().value())
+             IndexType identity_idx =
+                 RAJA::reduce::detail::DefaultLoc<IndexType>().value())
   {
-    Base::reset(value_type(init_val, init_idx), value_type(identity_val, identity_idx));
+    Base::reset(value_type(init_val, init_idx),
+                value_type(identity_val, identity_idx));
   }
 
   //! reducer function; updates the current instance's state
diff --git a/include/RAJA/policy/cuda/scan.hpp b/include/RAJA/policy/cuda/scan.hpp
index 0a9b0bf305..1c025deff0 100644
--- a/include/RAJA/policy/cuda/scan.hpp
+++ b/include/RAJA/policy/cuda/scan.hpp
@@ -42,18 +42,20 @@ namespace scan
         \brief explicit inclusive inplace scan given range, function, and
    initial value
 */
-template <typename IterationMapping,
-          typename IterationGetter,
-          typename Concretizer,
-          size_t BLOCKS_PER_SM,
-          bool Async,
-          typename InputIter,
-          typename Function>
-RAJA_INLINE
-resources::EventProxy<resources::Cuda>
-inclusive_inplace(
+template<typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         size_t BLOCKS_PER_SM,
+         bool Async,
+         typename InputIter,
+         typename Function>
+RAJA_INLINE resources::EventProxy<resources::Cuda> inclusive_inplace(
     resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>,
+    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                             IterationGetter,
+                                             Concretizer,
+                                             BLOCKS_PER_SM,
+                                             Async>,
     InputIter begin,
     InputIter end,
     Function binary_op)
@@ -62,27 +64,19 @@ inclusive_inplace(
 
   int len = std::distance(begin, end);
   // Determine temporary device storage requirements
-  void* d_temp_storage = nullptr;
+  void* d_temp_storage      = nullptr;
   size_t temp_storage_bytes = 0;
   cudaErrchk(::cub::DeviceScan::InclusiveScan(d_temp_storage,
-                                              temp_storage_bytes,
-                                              begin,
-                                              begin,
-                                              binary_op,
-                                              len,
-                                              stream));
+                                              temp_storage_bytes, begin, begin,
+                                              binary_op, len, stream));
   // Allocate temporary storage
   d_temp_storage =
       cuda::device_mempool_type::getInstance().malloc<unsigned char>(
           temp_storage_bytes);
   // Run
   cudaErrchk(::cub::DeviceScan::InclusiveScan(d_temp_storage,
-                                              temp_storage_bytes,
-                                              begin,
-                                              begin,
-                                              binary_op,
-                                              len,
-                                              stream));
+                                              temp_storage_bytes, begin, begin,
+                                              binary_op, len, stream));
   // Free temporary storage
   cuda::device_mempool_type::getInstance().free(d_temp_storage);
 
@@ -95,19 +89,21 @@ inclusive_inplace(
         \brief explicit exclusive inplace scan given range, function, and
    initial value
 */
-template <typename IterationMapping,
-          typename IterationGetter,
-          typename Concretizer,
-          size_t BLOCKS_PER_SM,
-          bool Async,
-          typename InputIter,
-          typename Function,
-          typename T>
-RAJA_INLINE
-resources::EventProxy<resources::Cuda>
-exclusive_inplace(
+template<typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         size_t BLOCKS_PER_SM,
+         bool Async,
+         typename InputIter,
+         typename Function,
+         typename T>
+RAJA_INLINE resources::EventProxy<resources::Cuda> exclusive_inplace(
     resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>,
+    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                             IterationGetter,
+                                             Concretizer,
+                                             BLOCKS_PER_SM,
+                                             Async>,
     InputIter begin,
     InputIter end,
     Function binary_op,
@@ -117,29 +113,19 @@ exclusive_inplace(
 
   int len = std::distance(begin, end);
   // Determine temporary device storage requirements
-  void* d_temp_storage = nullptr;
+  void* d_temp_storage      = nullptr;
   size_t temp_storage_bytes = 0;
   cudaErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage,
-                                              temp_storage_bytes,
-                                              begin,
-                                              begin,
-                                              binary_op,
-                                              init,
-                                              len,
-                                              stream));
+                                              temp_storage_bytes, begin, begin,
+                                              binary_op, init, len, stream));
   // Allocate temporary storage
   d_temp_storage =
       cuda::device_mempool_type::getInstance().malloc<unsigned char>(
           temp_storage_bytes);
   // Run
   cudaErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage,
-                                              temp_storage_bytes,
-                                              begin,
-                                              begin,
-                                              binary_op,
-                                              init,
-                                              len,
-                                              stream));
+                                              temp_storage_bytes, begin, begin,
+                                              binary_op, init, len, stream));
   // Free temporary storage
   cuda::device_mempool_type::getInstance().free(d_temp_storage);
 
@@ -152,19 +138,21 @@ exclusive_inplace(
         \brief explicit inclusive scan given input range, output, function, and
    initial value
 */
-template <typename IterationMapping,
-          typename IterationGetter,
-          typename Concretizer,
-          size_t BLOCKS_PER_SM,
-          bool Async,
-          typename InputIter,
-          typename OutputIter,
-          typename Function>
-RAJA_INLINE
-resources::EventProxy<resources::Cuda>
-inclusive(
+template<typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         size_t BLOCKS_PER_SM,
+         bool Async,
+         typename InputIter,
+         typename OutputIter,
+         typename Function>
+RAJA_INLINE resources::EventProxy<resources::Cuda> inclusive(
     resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>,
+    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                             IterationGetter,
+                                             Concretizer,
+                                             BLOCKS_PER_SM,
+                                             Async>,
     InputIter begin,
     InputIter end,
     OutputIter out,
@@ -174,27 +162,17 @@ inclusive(
 
   int len = std::distance(begin, end);
   // Determine temporary device storage requirements
-  void* d_temp_storage = nullptr;
+  void* d_temp_storage      = nullptr;
   size_t temp_storage_bytes = 0;
-  cudaErrchk(::cub::DeviceScan::InclusiveScan(d_temp_storage,
-                                              temp_storage_bytes,
-                                              begin,
-                                              out,
-                                              binary_op,
-                                              len,
-                                              stream));
+  cudaErrchk(::cub::DeviceScan::InclusiveScan(
+      d_temp_storage, temp_storage_bytes, begin, out, binary_op, len, stream));
   // Allocate temporary storage
   d_temp_storage =
       cuda::device_mempool_type::getInstance().malloc<unsigned char>(
           temp_storage_bytes);
   // Run
-  cudaErrchk(::cub::DeviceScan::InclusiveScan(d_temp_storage,
-                                              temp_storage_bytes,
-                                              begin,
-                                              out,
-                                              binary_op,
-                                              len,
-                                              stream));
+  cudaErrchk(::cub::DeviceScan::InclusiveScan(
+      d_temp_storage, temp_storage_bytes, begin, out, binary_op, len, stream));
   // Free temporary storage
   cuda::device_mempool_type::getInstance().free(d_temp_storage);
 
@@ -207,20 +185,22 @@ inclusive(
         \brief explicit exclusive scan given input range, output, function, and
    initial value
 */
-template <typename IterationMapping,
-          typename IterationGetter,
-          typename Concretizer,
-          size_t BLOCKS_PER_SM,
-          bool Async,
-          typename InputIter,
-          typename OutputIter,
-          typename Function,
-          typename T>
-RAJA_INLINE
-resources::EventProxy<resources::Cuda>
-exclusive(
+template<typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         size_t BLOCKS_PER_SM,
+         bool Async,
+         typename InputIter,
+         typename OutputIter,
+         typename Function,
+         typename T>
+RAJA_INLINE resources::EventProxy<resources::Cuda> exclusive(
     resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>,
+    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                             IterationGetter,
+                                             Concretizer,
+                                             BLOCKS_PER_SM,
+                                             Async>,
     InputIter begin,
     InputIter end,
     OutputIter out,
@@ -231,29 +211,19 @@ exclusive(
 
   int len = std::distance(begin, end);
   // Determine temporary device storage requirements
-  void* d_temp_storage = nullptr;
+  void* d_temp_storage      = nullptr;
   size_t temp_storage_bytes = 0;
   cudaErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage,
-                                              temp_storage_bytes,
-                                              begin,
-                                              out,
-                                              binary_op,
-                                              init,
-                                              len,
-                                              stream));
+                                              temp_storage_bytes, begin, out,
+                                              binary_op, init, len, stream));
   // Allocate temporary storage
   d_temp_storage =
       cuda::device_mempool_type::getInstance().malloc<unsigned char>(
           temp_storage_bytes);
   // Run
   cudaErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage,
-                                              temp_storage_bytes,
-                                              begin,
-                                              out,
-                                              binary_op,
-                                              init,
-                                              len,
-                                              stream));
+                                              temp_storage_bytes, begin, out,
+                                              binary_op, init, len, stream));
   // Free temporary storage
   cuda::device_mempool_type::getInstance().free(d_temp_storage);
 
diff --git a/include/RAJA/policy/cuda/sort.hpp b/include/RAJA/policy/cuda/sort.hpp
index c5a353b704..fdfde616d2 100644
--- a/include/RAJA/policy/cuda/sort.hpp
+++ b/include/RAJA/policy/cuda/sort.hpp
@@ -44,32 +44,44 @@ namespace sort
 /*!
         \brief static assert unimplemented stable sort
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BLOCKS_PER_SM, bool Async,
-          typename Iter, typename Compare>
-concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
-                      concepts::negate<concepts::all_of<
-                        type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
-                        std::is_pointer<Iter>,
-                        concepts::any_of<
-                          camp::is_same<Compare, operators::less<RAJA::detail::IterVal<Iter>>>,
-                          camp::is_same<Compare, operators::greater<RAJA::detail::IterVal<Iter>>>>>>>
-stable(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>,
-    Iter,
-    Iter,
-    Compare)
+template<typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         size_t BLOCKS_PER_SM,
+         bool Async,
+         typename Iter,
+         typename Compare>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Cuda>,
+    concepts::negate<concepts::all_of<
+        type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
+        std::is_pointer<Iter>,
+        concepts::any_of<
+            camp::is_same<Compare,
+                          operators::less<RAJA::detail::IterVal<Iter>>>,
+            camp::is_same<Compare,
+                          operators::greater<RAJA::detail::IterVal<Iter>>>>>>>
+stable(resources::Cuda cuda_res,
+       ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                IterationGetter,
+                                                Concretizer,
+                                                BLOCKS_PER_SM,
+                                                Async>,
+       Iter,
+       Iter,
+       Compare)
 {
-  static_assert (std::is_pointer<Iter>::value,
-      "stable_sort<cuda_exec> is only implemented for pointers");
+  static_assert(std::is_pointer<Iter>::value,
+                "stable_sort<cuda_exec> is only implemented for pointers");
   using iterval = RAJA::detail::IterVal<Iter>;
-  static_assert (type_traits::is_arithmetic<iterval>::value,
+  static_assert(
+      type_traits::is_arithmetic<iterval>::value,
       "stable_sort<cuda_exec> is only implemented for arithmetic types");
-  static_assert (concepts::any_of<
-      camp::is_same<Compare, operators::less<iterval>>,
-      camp::is_same<Compare, operators::greater<iterval>>>::value,
-      "stable_sort<cuda_exec> is only implemented for RAJA::operators::less or RAJA::operators::greater");
+  static_assert(concepts::any_of<
+                    camp::is_same<Compare, operators::less<iterval>>,
+                    camp::is_same<Compare, operators::greater<iterval>>>::value,
+                "stable_sort<cuda_exec> is only implemented for "
+                "RAJA::operators::less or RAJA::operators::greater");
 
   return resources::EventProxy<resources::Cuda>(cuda_res);
 }
@@ -77,26 +89,32 @@ stable(
 /*!
         \brief stable sort given range in ascending order
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BLOCKS_PER_SM, bool Async,
-          typename Iter>
+template<typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         size_t BLOCKS_PER_SM,
+         bool Async,
+         typename Iter>
 concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
                       type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
                       std::is_pointer<Iter>>
-stable(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>,
-    Iter begin,
-    Iter end,
-    operators::less<RAJA::detail::IterVal<Iter>>)
+stable(resources::Cuda cuda_res,
+       ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                IterationGetter,
+                                                Concretizer,
+                                                BLOCKS_PER_SM,
+                                                Async>,
+       Iter begin,
+       Iter end,
+       operators::less<RAJA::detail::IterVal<Iter>>)
 {
   cudaStream_t stream = cuda_res.get_stream();
 
   using R = RAJA::detail::IterVal<Iter>;
 
-  int len = std::distance(begin, end);
-  int begin_bit=0;
-  int end_bit=sizeof(R)*CHAR_BIT;
+  int len       = std::distance(begin, end);
+  int begin_bit = 0;
+  int end_bit   = sizeof(R) * CHAR_BIT;
 
   // Allocate temporary storage for the output array
   R* d_out = cuda::device_mempool_type::getInstance().malloc<R>(len);
@@ -106,15 +124,11 @@ stable(
   cub::DoubleBuffer<R> d_keys(begin, d_out);
 
   // Determine temporary device storage requirements
-  void* d_temp_storage = nullptr;
+  void* d_temp_storage      = nullptr;
   size_t temp_storage_bytes = 0;
   cudaErrchk(::cub::DeviceRadixSort::SortKeys(d_temp_storage,
-                                              temp_storage_bytes,
-                                              d_keys,
-                                              len,
-                                              begin_bit,
-                                              end_bit,
-                                              stream));
+                                              temp_storage_bytes, d_keys, len,
+                                              begin_bit, end_bit, stream));
   // Allocate temporary storage
   d_temp_storage =
       cuda::device_mempool_type::getInstance().malloc<unsigned char>(
@@ -122,19 +136,17 @@ stable(
 
   // Run
   cudaErrchk(::cub::DeviceRadixSort::SortKeys(d_temp_storage,
-                                              temp_storage_bytes,
-                                              d_keys,
-                                              len,
-                                              begin_bit,
-                                              end_bit,
-                                              stream));
+                                              temp_storage_bytes, d_keys, len,
+                                              begin_bit, end_bit, stream));
   // Free temporary storage
   cuda::device_mempool_type::getInstance().free(d_temp_storage);
 
-  if (d_keys.Current() == d_out) {
+  if (d_keys.Current() == d_out)
+  {
 
     // copy
-    cudaErrchk(cudaMemcpyAsync(begin, d_out, len*sizeof(R), cudaMemcpyDefault, stream));
+    cudaErrchk(cudaMemcpyAsync(begin, d_out, len * sizeof(R), cudaMemcpyDefault,
+                               stream));
   }
 
   cuda::device_mempool_type::getInstance().free(d_out);
@@ -147,26 +159,32 @@ stable(
 /*!
         \brief stable sort given range in descending order
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BLOCKS_PER_SM, bool Async,
-          typename Iter>
+template<typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         size_t BLOCKS_PER_SM,
+         bool Async,
+         typename Iter>
 concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
                       type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
                       std::is_pointer<Iter>>
-stable(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>,
-    Iter begin,
-    Iter end,
-    operators::greater<RAJA::detail::IterVal<Iter>>)
+stable(resources::Cuda cuda_res,
+       ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                IterationGetter,
+                                                Concretizer,
+                                                BLOCKS_PER_SM,
+                                                Async>,
+       Iter begin,
+       Iter end,
+       operators::greater<RAJA::detail::IterVal<Iter>>)
 {
   cudaStream_t stream = cuda_res.get_stream();
 
   using R = RAJA::detail::IterVal<Iter>;
 
-  int len = std::distance(begin, end);
-  int begin_bit=0;
-  int end_bit=sizeof(R)*CHAR_BIT;
+  int len       = std::distance(begin, end);
+  int begin_bit = 0;
+  int end_bit   = sizeof(R) * CHAR_BIT;
 
   // Allocate temporary storage for the output array
   R* d_out = cuda::device_mempool_type::getInstance().malloc<R>(len);
@@ -176,35 +194,29 @@ stable(
   cub::DoubleBuffer<R> d_keys(begin, d_out);
 
   // Determine temporary device storage requirements
-  void* d_temp_storage = nullptr;
+  void* d_temp_storage      = nullptr;
   size_t temp_storage_bytes = 0;
-  cudaErrchk(::cub::DeviceRadixSort::SortKeysDescending(d_temp_storage,
-                                                        temp_storage_bytes,
-                                                        d_keys,
-                                                        len,
-                                                        begin_bit,
-                                                        end_bit,
-                                                        stream));
+  cudaErrchk(::cub::DeviceRadixSort::SortKeysDescending(
+      d_temp_storage, temp_storage_bytes, d_keys, len, begin_bit, end_bit,
+      stream));
   // Allocate temporary storage
   d_temp_storage =
       cuda::device_mempool_type::getInstance().malloc<unsigned char>(
           temp_storage_bytes);
 
   // Run
-  cudaErrchk(::cub::DeviceRadixSort::SortKeysDescending(d_temp_storage,
-                                                        temp_storage_bytes,
-                                                        d_keys,
-                                                        len,
-                                                        begin_bit,
-                                                        end_bit,
-                                                        stream));
+  cudaErrchk(::cub::DeviceRadixSort::SortKeysDescending(
+      d_temp_storage, temp_storage_bytes, d_keys, len, begin_bit, end_bit,
+      stream));
   // Free temporary storage
   cuda::device_mempool_type::getInstance().free(d_temp_storage);
 
-  if (d_keys.Current() == d_out) {
+  if (d_keys.Current() == d_out)
+  {
 
     // copy
-    cudaErrchk(cudaMemcpyAsync(begin, d_out, len*sizeof(R), cudaMemcpyDefault, stream));
+    cudaErrchk(cudaMemcpyAsync(begin, d_out, len * sizeof(R), cudaMemcpyDefault,
+                               stream));
   }
 
   cuda::device_mempool_type::getInstance().free(d_out);
@@ -214,36 +226,46 @@ stable(
   return resources::EventProxy<resources::Cuda>(cuda_res);
 }
 
-
 /*!
         \brief static assert unimplemented sort
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BLOCKS_PER_SM, bool Async,
-          typename Iter, typename Compare>
-concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
-                      concepts::negate<concepts::all_of<
-                        type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
-                        std::is_pointer<Iter>,
-                        concepts::any_of<
-                          camp::is_same<Compare, operators::less<RAJA::detail::IterVal<Iter>>>,
-                          camp::is_same<Compare, operators::greater<RAJA::detail::IterVal<Iter>>>>>>>
-unstable(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>,
-    Iter,
-    Iter,
-    Compare)
+template<typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         size_t BLOCKS_PER_SM,
+         bool Async,
+         typename Iter,
+         typename Compare>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Cuda>,
+    concepts::negate<concepts::all_of<
+        type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
+        std::is_pointer<Iter>,
+        concepts::any_of<
+            camp::is_same<Compare,
+                          operators::less<RAJA::detail::IterVal<Iter>>>,
+            camp::is_same<Compare,
+                          operators::greater<RAJA::detail::IterVal<Iter>>>>>>>
+unstable(resources::Cuda cuda_res,
+         ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                  IterationGetter,
+                                                  Concretizer,
+                                                  BLOCKS_PER_SM,
+                                                  Async>,
+         Iter,
+         Iter,
+         Compare)
 {
-  static_assert (std::is_pointer<Iter>::value,
-      "sort<cuda_exec> is only implemented for pointers");
+  static_assert(std::is_pointer<Iter>::value,
+                "sort<cuda_exec> is only implemented for pointers");
   using iterval = RAJA::detail::IterVal<Iter>;
-  static_assert (type_traits::is_arithmetic<iterval>::value,
-      "sort<cuda_exec> is only implemented for arithmetic types");
-  static_assert (concepts::any_of<
-      camp::is_same<Compare, operators::less<iterval>>,
-      camp::is_same<Compare, operators::greater<iterval>>>::value,
-      "sort<cuda_exec> is only implemented for RAJA::operators::less or RAJA::operators::greater");
+  static_assert(type_traits::is_arithmetic<iterval>::value,
+                "sort<cuda_exec> is only implemented for arithmetic types");
+  static_assert(concepts::any_of<
+                    camp::is_same<Compare, operators::less<iterval>>,
+                    camp::is_same<Compare, operators::greater<iterval>>>::value,
+                "sort<cuda_exec> is only implemented for RAJA::operators::less "
+                "or RAJA::operators::greater");
 
   return resources::EventProxy<resources::Cuda>(cuda_res);
 }
@@ -251,18 +273,24 @@ unstable(
 /*!
         \brief sort given range in ascending order
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BLOCKS_PER_SM, bool Async,
-          typename Iter>
+template<typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         size_t BLOCKS_PER_SM,
+         bool Async,
+         typename Iter>
 concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
                       type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
                       std::is_pointer<Iter>>
-unstable(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async> p,
-    Iter begin,
-    Iter end,
-    operators::less<RAJA::detail::IterVal<Iter>> comp)
+unstable(resources::Cuda cuda_res,
+         ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                  IterationGetter,
+                                                  Concretizer,
+                                                  BLOCKS_PER_SM,
+                                                  Async> p,
+         Iter begin,
+         Iter end,
+         operators::less<RAJA::detail::IterVal<Iter>> comp)
 {
   return stable(cuda_res, p, begin, end, comp);
 }
@@ -270,56 +298,77 @@ unstable(
 /*!
         \brief sort given range in descending order
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BLOCKS_PER_SM, bool Async,
-          typename Iter>
+template<typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         size_t BLOCKS_PER_SM,
+         bool Async,
+         typename Iter>
 concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
                       type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
                       std::is_pointer<Iter>>
-unstable(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async> p,
-    Iter begin,
-    Iter end,
-    operators::greater<RAJA::detail::IterVal<Iter>> comp)
+unstable(resources::Cuda cuda_res,
+         ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                  IterationGetter,
+                                                  Concretizer,
+                                                  BLOCKS_PER_SM,
+                                                  Async> p,
+         Iter begin,
+         Iter end,
+         operators::greater<RAJA::detail::IterVal<Iter>> comp)
 {
   return stable(cuda_res, p, begin, end, comp);
 }
 
-
 /*!
         \brief static assert unimplemented stable sort pairs
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BLOCKS_PER_SM, bool Async,
-          typename KeyIter, typename ValIter, typename Compare>
-concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
-                      concepts::negate<concepts::all_of<
-                        type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
-                        std::is_pointer<KeyIter>,
-                        std::is_pointer<ValIter>,
-                        concepts::any_of<
-                          camp::is_same<Compare, operators::less<RAJA::detail::IterVal<KeyIter>>>,
-                          camp::is_same<Compare, operators::greater<RAJA::detail::IterVal<KeyIter>>>>>>>
-stable_pairs(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>,
-    KeyIter,
-    KeyIter,
-    ValIter,
-    Compare)
+template<typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         size_t BLOCKS_PER_SM,
+         bool Async,
+         typename KeyIter,
+         typename ValIter,
+         typename Compare>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Cuda>,
+    concepts::negate<concepts::all_of<
+        type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
+        std::is_pointer<KeyIter>,
+        std::is_pointer<ValIter>,
+        concepts::any_of<
+            camp::is_same<Compare,
+                          operators::less<RAJA::detail::IterVal<KeyIter>>>,
+            camp::is_same<
+                Compare,
+                operators::greater<RAJA::detail::IterVal<KeyIter>>>>>>>
+stable_pairs(resources::Cuda cuda_res,
+             ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                      IterationGetter,
+                                                      Concretizer,
+                                                      BLOCKS_PER_SM,
+                                                      Async>,
+             KeyIter,
+             KeyIter,
+             ValIter,
+             Compare)
 {
-  static_assert (std::is_pointer<KeyIter>::value,
+  static_assert(
+      std::is_pointer<KeyIter>::value,
       "stable_sort_pairs<cuda_exec> is only implemented for pointers");
-  static_assert (std::is_pointer<ValIter>::value,
+  static_assert(
+      std::is_pointer<ValIter>::value,
       "stable_sort_pairs<cuda_exec> is only implemented for pointers");
   using K = RAJA::detail::IterVal<KeyIter>;
-  static_assert (type_traits::is_arithmetic<K>::value,
+  static_assert(
+      type_traits::is_arithmetic<K>::value,
       "stable_sort_pairs<cuda_exec> is only implemented for arithmetic types");
-  static_assert (concepts::any_of<
-      camp::is_same<Compare, operators::less<K>>,
-      camp::is_same<Compare, operators::greater<K>>>::value,
-      "stable_sort_pairs<cuda_exec> is only implemented for RAJA::operators::less or RAJA::operators::greater");
+  static_assert(
+      concepts::any_of<camp::is_same<Compare, operators::less<K>>,
+                       camp::is_same<Compare, operators::greater<K>>>::value,
+      "stable_sort_pairs<cuda_exec> is only implemented for "
+      "RAJA::operators::less or RAJA::operators::greater");
 
   return resources::EventProxy<resources::Cuda>(cuda_res);
 }
@@ -327,29 +376,37 @@ stable_pairs(
 /*!
         \brief stable sort given range of pairs in ascending order of keys
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BLOCKS_PER_SM, bool Async,
-          typename KeyIter, typename ValIter>
-concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
-                      type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
-                      std::is_pointer<KeyIter>,
-                      std::is_pointer<ValIter>>
-stable_pairs(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>,
-    KeyIter keys_begin,
-    KeyIter keys_end,
-    ValIter vals_begin,
-    operators::less<RAJA::detail::IterVal<KeyIter>>)
+template<typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         size_t BLOCKS_PER_SM,
+         bool Async,
+         typename KeyIter,
+         typename ValIter>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Cuda>,
+    type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
+    std::is_pointer<KeyIter>,
+    std::is_pointer<ValIter>>
+stable_pairs(resources::Cuda cuda_res,
+             ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                      IterationGetter,
+                                                      Concretizer,
+                                                      BLOCKS_PER_SM,
+                                                      Async>,
+             KeyIter keys_begin,
+             KeyIter keys_end,
+             ValIter vals_begin,
+             operators::less<RAJA::detail::IterVal<KeyIter>>)
 {
   cudaStream_t stream = cuda_res.get_stream();
 
   using K = RAJA::detail::IterVal<KeyIter>;
   using V = RAJA::detail::IterVal<ValIter>;
 
-  int len = std::distance(keys_begin, keys_end);
-  int begin_bit=0;
-  int end_bit=sizeof(K)*CHAR_BIT;
+  int len       = std::distance(keys_begin, keys_end);
+  int begin_bit = 0;
+  int end_bit   = sizeof(K) * CHAR_BIT;
 
   // Allocate temporary storage for the output arrays
   K* d_keys_out = cuda::device_mempool_type::getInstance().malloc<K>(len);
@@ -361,42 +418,36 @@ stable_pairs(
   cub::DoubleBuffer<V> d_vals(vals_begin, d_vals_out);
 
   // Determine temporary device storage requirements
-  void* d_temp_storage = nullptr;
+  void* d_temp_storage      = nullptr;
   size_t temp_storage_bytes = 0;
-  cudaErrchk(::cub::DeviceRadixSort::SortPairs(d_temp_storage,
-                                               temp_storage_bytes,
-                                               d_keys,
-                                               d_vals,
-                                               len,
-                                               begin_bit,
-                                               end_bit,
-                                               stream));
+  cudaErrchk(::cub::DeviceRadixSort::SortPairs(
+      d_temp_storage, temp_storage_bytes, d_keys, d_vals, len, begin_bit,
+      end_bit, stream));
   // Allocate temporary storage
   d_temp_storage =
       cuda::device_mempool_type::getInstance().malloc<unsigned char>(
           temp_storage_bytes);
 
   // Run
-  cudaErrchk(::cub::DeviceRadixSort::SortPairs(d_temp_storage,
-                                               temp_storage_bytes,
-                                               d_keys,
-                                               d_vals,
-                                               len,
-                                               begin_bit,
-                                               end_bit,
-                                               stream));
+  cudaErrchk(::cub::DeviceRadixSort::SortPairs(
+      d_temp_storage, temp_storage_bytes, d_keys, d_vals, len, begin_bit,
+      end_bit, stream));
   // Free temporary storage
   cuda::device_mempool_type::getInstance().free(d_temp_storage);
 
-  if (d_keys.Current() == d_keys_out) {
+  if (d_keys.Current() == d_keys_out)
+  {
 
     // copy keys
-    cudaErrchk(cudaMemcpyAsync(keys_begin, d_keys_out, len*sizeof(K), cudaMemcpyDefault, stream));
+    cudaErrchk(cudaMemcpyAsync(keys_begin, d_keys_out, len * sizeof(K),
+                               cudaMemcpyDefault, stream));
   }
-  if (d_vals.Current() == d_vals_out) {
+  if (d_vals.Current() == d_vals_out)
+  {
 
     // copy vals
-    cudaErrchk(cudaMemcpyAsync(vals_begin, d_vals_out, len*sizeof(V), cudaMemcpyDefault, stream));
+    cudaErrchk(cudaMemcpyAsync(vals_begin, d_vals_out, len * sizeof(V),
+                               cudaMemcpyDefault, stream));
   }
 
   cuda::device_mempool_type::getInstance().free(d_keys_out);
@@ -410,29 +461,37 @@ stable_pairs(
 /*!
         \brief stable sort given range of pairs in descending order of keys
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BLOCKS_PER_SM, bool Async,
-          typename KeyIter, typename ValIter>
-concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
-                      type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
-                      std::is_pointer<KeyIter>,
-                      std::is_pointer<ValIter>>
-stable_pairs(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>,
-    KeyIter keys_begin,
-    KeyIter keys_end,
-    ValIter vals_begin,
-    operators::greater<RAJA::detail::IterVal<KeyIter>>)
+template<typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         size_t BLOCKS_PER_SM,
+         bool Async,
+         typename KeyIter,
+         typename ValIter>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Cuda>,
+    type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
+    std::is_pointer<KeyIter>,
+    std::is_pointer<ValIter>>
+stable_pairs(resources::Cuda cuda_res,
+             ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                      IterationGetter,
+                                                      Concretizer,
+                                                      BLOCKS_PER_SM,
+                                                      Async>,
+             KeyIter keys_begin,
+             KeyIter keys_end,
+             ValIter vals_begin,
+             operators::greater<RAJA::detail::IterVal<KeyIter>>)
 {
   cudaStream_t stream = cuda_res.get_stream();
 
   using K = RAJA::detail::IterVal<KeyIter>;
   using V = RAJA::detail::IterVal<ValIter>;
 
-  int len = std::distance(keys_begin, keys_end);
-  int begin_bit=0;
-  int end_bit=sizeof(K)*CHAR_BIT;
+  int len       = std::distance(keys_begin, keys_end);
+  int begin_bit = 0;
+  int end_bit   = sizeof(K) * CHAR_BIT;
 
   // Allocate temporary storage for the output arrays
   K* d_keys_out = cuda::device_mempool_type::getInstance().malloc<K>(len);
@@ -444,42 +503,36 @@ stable_pairs(
   cub::DoubleBuffer<V> d_vals(vals_begin, d_vals_out);
 
   // Determine temporary device storage requirements
-  void* d_temp_storage = nullptr;
+  void* d_temp_storage      = nullptr;
   size_t temp_storage_bytes = 0;
-  cudaErrchk(::cub::DeviceRadixSort::SortPairsDescending(d_temp_storage,
-                                                         temp_storage_bytes,
-                                                         d_keys,
-                                                         d_vals,
-                                                         len,
-                                                         begin_bit,
-                                                         end_bit,
-                                                         stream));
+  cudaErrchk(::cub::DeviceRadixSort::SortPairsDescending(
+      d_temp_storage, temp_storage_bytes, d_keys, d_vals, len, begin_bit,
+      end_bit, stream));
   // Allocate temporary storage
   d_temp_storage =
       cuda::device_mempool_type::getInstance().malloc<unsigned char>(
           temp_storage_bytes);
 
   // Run
-  cudaErrchk(::cub::DeviceRadixSort::SortPairsDescending(d_temp_storage,
-                                                         temp_storage_bytes,
-                                                         d_keys,
-                                                         d_vals,
-                                                         len,
-                                                         begin_bit,
-                                                         end_bit,
-                                                         stream));
+  cudaErrchk(::cub::DeviceRadixSort::SortPairsDescending(
+      d_temp_storage, temp_storage_bytes, d_keys, d_vals, len, begin_bit,
+      end_bit, stream));
   // Free temporary storage
   cuda::device_mempool_type::getInstance().free(d_temp_storage);
 
-  if (d_keys.Current() == d_keys_out) {
+  if (d_keys.Current() == d_keys_out)
+  {
 
     // copy keys
-    cudaErrchk(cudaMemcpyAsync(keys_begin, d_keys_out, len*sizeof(K), cudaMemcpyDefault, stream));
+    cudaErrchk(cudaMemcpyAsync(keys_begin, d_keys_out, len * sizeof(K),
+                               cudaMemcpyDefault, stream));
   }
-  if (d_vals.Current() == d_vals_out) {
+  if (d_vals.Current() == d_vals_out)
+  {
 
     // copy vals
-    cudaErrchk(cudaMemcpyAsync(vals_begin, d_vals_out, len*sizeof(V), cudaMemcpyDefault, stream));
+    cudaErrchk(cudaMemcpyAsync(vals_begin, d_vals_out, len * sizeof(V),
+                               cudaMemcpyDefault, stream));
   }
 
   cuda::device_mempool_type::getInstance().free(d_keys_out);
@@ -490,40 +543,53 @@ stable_pairs(
   return resources::EventProxy<resources::Cuda>(cuda_res);
 }
 
-
 /*!
         \brief static assert unimplemented sort pairs
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BLOCKS_PER_SM, bool Async,
-          typename KeyIter, typename ValIter, typename Compare>
-concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
-                      concepts::negate<concepts::all_of<
-                        type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
-                        std::is_pointer<KeyIter>,
-                        std::is_pointer<ValIter>,
-                        concepts::any_of<
-                          camp::is_same<Compare, operators::less<RAJA::detail::IterVal<KeyIter>>>,
-                          camp::is_same<Compare, operators::greater<RAJA::detail::IterVal<KeyIter>>>>>>>
-unstable_pairs(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>,
-    KeyIter,
-    KeyIter,
-    ValIter,
-    Compare)
+template<typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         size_t BLOCKS_PER_SM,
+         bool Async,
+         typename KeyIter,
+         typename ValIter,
+         typename Compare>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Cuda>,
+    concepts::negate<concepts::all_of<
+        type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
+        std::is_pointer<KeyIter>,
+        std::is_pointer<ValIter>,
+        concepts::any_of<
+            camp::is_same<Compare,
+                          operators::less<RAJA::detail::IterVal<KeyIter>>>,
+            camp::is_same<
+                Compare,
+                operators::greater<RAJA::detail::IterVal<KeyIter>>>>>>>
+unstable_pairs(resources::Cuda cuda_res,
+               ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                        IterationGetter,
+                                                        Concretizer,
+                                                        BLOCKS_PER_SM,
+                                                        Async>,
+               KeyIter,
+               KeyIter,
+               ValIter,
+               Compare)
 {
-  static_assert (std::is_pointer<KeyIter>::value,
-      "sort_pairs<cuda_exec> is only implemented for pointers");
-  static_assert (std::is_pointer<ValIter>::value,
-      "sort_pairs<cuda_exec> is only implemented for pointers");
+  static_assert(std::is_pointer<KeyIter>::value,
+                "sort_pairs<cuda_exec> is only implemented for pointers");
+  static_assert(std::is_pointer<ValIter>::value,
+                "sort_pairs<cuda_exec> is only implemented for pointers");
   using K = RAJA::detail::IterVal<KeyIter>;
-  static_assert (type_traits::is_arithmetic<K>::value,
+  static_assert(
+      type_traits::is_arithmetic<K>::value,
       "sort_pairs<cuda_exec> is only implemented for arithmetic types");
-  static_assert (concepts::any_of<
-      camp::is_same<Compare, operators::less<K>>,
-      camp::is_same<Compare, operators::greater<K>>>::value,
-      "sort_pairs<cuda_exec> is only implemented for RAJA::operators::less or RAJA::operators::greater");
+  static_assert(
+      concepts::any_of<camp::is_same<Compare, operators::less<K>>,
+                       camp::is_same<Compare, operators::greater<K>>>::value,
+      "sort_pairs<cuda_exec> is only implemented for RAJA::operators::less or "
+      "RAJA::operators::greater");
 
   return resources::EventProxy<resources::Cuda>(cuda_res);
 }
@@ -531,20 +597,28 @@ unstable_pairs(
 /*!
         \brief stable sort given range of pairs in ascending order of keys
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BLOCKS_PER_SM, bool Async,
-          typename KeyIter, typename ValIter>
-concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
-                      type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
-                      std::is_pointer<KeyIter>,
-                      std::is_pointer<ValIter>>
-unstable_pairs(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async> p,
-    KeyIter keys_begin,
-    KeyIter keys_end,
-    ValIter vals_begin,
-    operators::less<RAJA::detail::IterVal<KeyIter>> comp)
+template<typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         size_t BLOCKS_PER_SM,
+         bool Async,
+         typename KeyIter,
+         typename ValIter>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Cuda>,
+    type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
+    std::is_pointer<KeyIter>,
+    std::is_pointer<ValIter>>
+unstable_pairs(resources::Cuda cuda_res,
+               ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                        IterationGetter,
+                                                        Concretizer,
+                                                        BLOCKS_PER_SM,
+                                                        Async> p,
+               KeyIter keys_begin,
+               KeyIter keys_end,
+               ValIter vals_begin,
+               operators::less<RAJA::detail::IterVal<KeyIter>> comp)
 {
   return stable_pairs(cuda_res, p, keys_begin, keys_end, vals_begin, comp);
 }
@@ -552,20 +626,28 @@ unstable_pairs(
 /*!
         \brief stable sort given range of pairs in descending order of keys
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, size_t BLOCKS_PER_SM, bool Async,
-          typename KeyIter, typename ValIter>
-concepts::enable_if_t<resources::EventProxy<resources::Cuda>,
-                      type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
-                      std::is_pointer<KeyIter>,
-                      std::is_pointer<ValIter>>
-unstable_pairs(
-    resources::Cuda cuda_res,
-    ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async> p,
-    KeyIter keys_begin,
-    KeyIter keys_end,
-    ValIter vals_begin,
-    operators::greater<RAJA::detail::IterVal<KeyIter>> comp)
+template<typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         size_t BLOCKS_PER_SM,
+         bool Async,
+         typename KeyIter,
+         typename ValIter>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Cuda>,
+    type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
+    std::is_pointer<KeyIter>,
+    std::is_pointer<ValIter>>
+unstable_pairs(resources::Cuda cuda_res,
+               ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                        IterationGetter,
+                                                        Concretizer,
+                                                        BLOCKS_PER_SM,
+                                                        Async> p,
+               KeyIter keys_begin,
+               KeyIter keys_end,
+               ValIter vals_begin,
+               operators::greater<RAJA::detail::IterVal<KeyIter>> comp)
 {
   return stable_pairs(cuda_res, p, keys_begin, keys_end, vals_begin, comp);
 }
diff --git a/include/RAJA/policy/desul/atomic.hpp b/include/RAJA/policy/desul/atomic.hpp
index 71bf429079..0090920296 100644
--- a/include/RAJA/policy/desul/atomic.hpp
+++ b/include/RAJA/policy/desul/atomic.hpp
@@ -22,182 +22,136 @@
 using raja_default_desul_order = desul::MemoryOrderRelaxed;
 using raja_default_desul_scope = desul::MemoryScopeDevice;
 
-
 namespace RAJA
 {
 
 RAJA_SUPPRESS_HD_WARN
-template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T
-atomicLoad(AtomicPolicy, T *acc)
-{
-  return desul::atomic_load(acc,
-                            raja_default_desul_order{},
-                            raja_default_desul_scope{});
+template<typename AtomicPolicy, typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicLoad(AtomicPolicy, T* acc)
+{
+  return desul::atomic_load(acc, raja_default_desul_order {},
+                            raja_default_desul_scope {});
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE void
-atomicStore(AtomicPolicy, T *acc, T value)
-{
-  desul::atomic_store(acc,
-                      value,
-                      raja_default_desul_order{},
-                      raja_default_desul_scope{});
+template<typename AtomicPolicy, typename T>
+RAJA_HOST_DEVICE RAJA_INLINE void atomicStore(AtomicPolicy, T* acc, T value)
+{
+  desul::atomic_store(acc, value, raja_default_desul_order {},
+                      raja_default_desul_scope {});
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T
-atomicAdd(AtomicPolicy, T *acc, T value)
-{
-  return desul::atomic_fetch_add(acc,
-                                 value,
-                                 raja_default_desul_order{},
-                                 raja_default_desul_scope{});
+template<typename AtomicPolicy, typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicAdd(AtomicPolicy, T* acc, T value)
+{
+  return desul::atomic_fetch_add(acc, value, raja_default_desul_order {},
+                                 raja_default_desul_scope {});
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T
-atomicSub(AtomicPolicy, T *acc, T value)
-{
-  return desul::atomic_fetch_sub(acc,
-                                 value,
-                                 raja_default_desul_order{},
-                                 raja_default_desul_scope{});
+template<typename AtomicPolicy, typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicSub(AtomicPolicy, T* acc, T value)
+{
+  return desul::atomic_fetch_sub(acc, value, raja_default_desul_order {},
+                                 raja_default_desul_scope {});
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicMin(AtomicPolicy, T *acc, T value)
-{
-  return desul::atomic_fetch_min(acc,
-                                 value,
-                                 raja_default_desul_order{},
-                                 raja_default_desul_scope{});
+template<typename AtomicPolicy, typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicMin(AtomicPolicy, T* acc, T value)
+{
+  return desul::atomic_fetch_min(acc, value, raja_default_desul_order {},
+                                 raja_default_desul_scope {});
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicMax(AtomicPolicy, T *acc, T value)
-{
-  return desul::atomic_fetch_max(acc,
-                                 value,
-                                 raja_default_desul_order{},
-                                 raja_default_desul_scope{});
+template<typename AtomicPolicy, typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicMax(AtomicPolicy, T* acc, T value)
+{
+  return desul::atomic_fetch_max(acc, value, raja_default_desul_order {},
+                                 raja_default_desul_scope {});
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicInc(AtomicPolicy, T *acc)
+template<typename AtomicPolicy, typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicInc(AtomicPolicy, T* acc)
 {
-  return desul::atomic_fetch_inc(acc,
-                                 raja_default_desul_order{},
-                                 raja_default_desul_scope{});
+  return desul::atomic_fetch_inc(acc, raja_default_desul_order {},
+                                 raja_default_desul_scope {});
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicInc(AtomicPolicy, T *acc, T val)
+template<typename AtomicPolicy, typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicInc(AtomicPolicy, T* acc, T val)
 {
   // See:
   // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicinc
-  return desul::atomic_fetch_inc_mod(acc,
-                                     val,
-                                     raja_default_desul_order{},
-                                     raja_default_desul_scope{});
+  return desul::atomic_fetch_inc_mod(acc, val, raja_default_desul_order {},
+                                     raja_default_desul_scope {});
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicDec(AtomicPolicy, T *acc)
+template<typename AtomicPolicy, typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicDec(AtomicPolicy, T* acc)
 {
-  return desul::atomic_fetch_dec(acc,
-                                 raja_default_desul_order{},
-                                 raja_default_desul_scope{});
+  return desul::atomic_fetch_dec(acc, raja_default_desul_order {},
+                                 raja_default_desul_scope {});
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicDec(AtomicPolicy, T *acc, T val)
+template<typename AtomicPolicy, typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicDec(AtomicPolicy, T* acc, T val)
 {
   // See:
   // http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomicdec
-  return desul::atomic_fetch_dec_mod(acc,
-                                     val,
-                                     raja_default_desul_order{},
-                                     raja_default_desul_scope{});
+  return desul::atomic_fetch_dec_mod(acc, val, raja_default_desul_order {},
+                                     raja_default_desul_scope {});
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicAnd(AtomicPolicy, T *acc, T value)
-{
-  return desul::atomic_fetch_and(acc,
-                                 value,
-                                 raja_default_desul_order{},
-                                 raja_default_desul_scope{});
+template<typename AtomicPolicy, typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicAnd(AtomicPolicy, T* acc, T value)
+{
+  return desul::atomic_fetch_and(acc, value, raja_default_desul_order {},
+                                 raja_default_desul_scope {});
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicOr(AtomicPolicy, T *acc, T value)
-{
-  return desul::atomic_fetch_or(acc,
-                                value,
-                                raja_default_desul_order{},
-                                raja_default_desul_scope{});
+template<typename AtomicPolicy, typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicOr(AtomicPolicy, T* acc, T value)
+{
+  return desul::atomic_fetch_or(acc, value, raja_default_desul_order {},
+                                raja_default_desul_scope {});
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicXor(AtomicPolicy, T *acc, T value)
-{
-  return desul::atomic_fetch_xor(acc,
-                                 value,
-                                 raja_default_desul_order{},
-                                 raja_default_desul_scope{});
+template<typename AtomicPolicy, typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicXor(AtomicPolicy, T* acc, T value)
+{
+  return desul::atomic_fetch_xor(acc, value, raja_default_desul_order {},
+                                 raja_default_desul_scope {});
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicExchange(AtomicPolicy, T *acc, T value)
-{
-  return desul::atomic_exchange(acc,
-                                value,
-                                raja_default_desul_order{},
-                                raja_default_desul_scope{});
+template<typename AtomicPolicy, typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicExchange(AtomicPolicy, T* acc, T value)
+{
+  return desul::atomic_exchange(acc, value, raja_default_desul_order {},
+                                raja_default_desul_scope {});
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename AtomicPolicy, typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicCAS(AtomicPolicy, T *acc, T compare, T value)
-{
-  return desul::atomic_compare_exchange(acc,
-                                        compare,
-                                        value,
-                                        raja_default_desul_order{},
-                                        raja_default_desul_scope{});
+template<typename AtomicPolicy, typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T
+atomicCAS(AtomicPolicy, T* acc, T compare, T value)
+{
+  return desul::atomic_compare_exchange(acc, compare, value,
+                                        raja_default_desul_order {},
+                                        raja_default_desul_scope {});
 }
 
 }  // namespace RAJA
 
 #endif  // RAJA_ENABLE_DESUL_ATOMICS
-#endif // guard
+#endif  // guard
diff --git a/include/RAJA/policy/hip/MemUtils_HIP.hpp b/include/RAJA/policy/hip/MemUtils_HIP.hpp
index f1f69eab5e..f0dcac7e97 100644
--- a/include/RAJA/policy/hip/MemUtils_HIP.hpp
+++ b/include/RAJA/policy/hip/MemUtils_HIP.hpp
@@ -70,16 +70,16 @@ hipDeviceProp_t& device_prop()
   return prop;
 }
 
-
 //! Allocator for pinned memory for use in basic_mempool
-struct PinnedAllocator {
+struct PinnedAllocator
+{
 
   // returns a valid pointer on success, nullptr on failure
   void* malloc(size_t nbytes)
   {
     void* ptr;
     hipErrchk(hipHostMalloc(&ptr, nbytes,
-        hipHostMallocMapped | hipHostMallocNonCoherent));
+                            hipHostMallocMapped | hipHostMallocNonCoherent));
     return ptr;
   }
 
@@ -92,7 +92,8 @@ struct PinnedAllocator {
 };
 
 //! Allocator for device memory for use in basic_mempool
-struct DeviceAllocator {
+struct DeviceAllocator
+{
 
   // returns a valid pointer on success, nullptr on failure
   void* malloc(size_t nbytes)
@@ -112,7 +113,8 @@ struct DeviceAllocator {
 
 //! Allocator for pre-zeroed device memory for use in basic_mempool
 //  Note: Memory must be zero when returned to mempool
-struct DeviceZeroedAllocator {
+struct DeviceZeroedAllocator
+{
 
   // returns a valid pointer on success, nullptr on failure
   void* malloc(size_t nbytes)
@@ -134,7 +136,8 @@ struct DeviceZeroedAllocator {
 };
 
 //! Allocator for device pinned memory for use in basic_mempool
-struct DevicePinnedAllocator {
+struct DevicePinnedAllocator
+{
 
   // returns a valid pointer on success, nullptr on failure
   void* malloc(size_t nbytes)
@@ -155,22 +158,26 @@ struct DevicePinnedAllocator {
 using device_mempool_type = basic_mempool::MemPool<DeviceAllocator>;
 using device_zeroed_mempool_type =
     basic_mempool::MemPool<DeviceZeroedAllocator>;
-using device_pinned_mempool_type = basic_mempool::MemPool<DevicePinnedAllocator>;
+using device_pinned_mempool_type =
+    basic_mempool::MemPool<DevicePinnedAllocator>;
 using pinned_mempool_type = basic_mempool::MemPool<PinnedAllocator>;
 
 namespace detail
 {
 
 //! struct containing data necessary to coordinate kernel launches with reducers
-struct hipInfo {
+struct hipInfo
+{
   const void* func = nullptr;
-  hip_dim_t gridDim{0, 0, 0};
-  hip_dim_t blockDim{0, 0, 0};
+  hip_dim_t gridDim {0, 0, 0};
+  hip_dim_t blockDim {0, 0, 0};
   size_t* dynamic_smem = nullptr;
-  ::RAJA::resources::Hip res{::RAJA::resources::Hip::HipFromStream(0,0)};
+  ::RAJA::resources::Hip res {::RAJA::resources::Hip::HipFromStream(0, 0)};
   bool setup_reducers = false;
 };
-struct hipStatusInfo : hipInfo {
+
+struct hipStatusInfo : hipInfo
+{
 #if defined(RAJA_ENABLE_OPENMP)
   omp::mutex lock;
 #endif
@@ -187,10 +194,7 @@ extern hipStatusInfo tl_status;
 extern std::unordered_map<hipStream_t, bool> g_stream_info_map;
 
 RAJA_INLINE
-void synchronize_impl(::RAJA::resources::Hip res)
-{
-  res.wait();
-}
+void synchronize_impl(::RAJA::resources::Hip res) { res.wait(); }
 
 }  // namespace detail
 
@@ -202,13 +206,16 @@ void synchronize()
   lock_guard<omp::mutex> lock(detail::g_status.lock);
 #endif
   bool synchronize = false;
-  for (auto& val : detail::g_stream_info_map) {
-    if (!val.second) {
+  for (auto& val : detail::g_stream_info_map)
+  {
+    if (!val.second)
+    {
       synchronize = true;
-      val.second = true;
+      val.second  = true;
     }
   }
-  if (synchronize) {
+  if (synchronize)
+  {
     hipErrchk(hipDeviceSynchronize());
   }
 }
@@ -221,12 +228,16 @@ void synchronize(::RAJA::resources::Hip res)
   lock_guard<omp::mutex> lock(detail::g_status.lock);
 #endif
   auto iter = detail::g_stream_info_map.find(res.get_stream());
-  if (iter != detail::g_stream_info_map.end()) {
-    if (!iter->second) {
+  if (iter != detail::g_stream_info_map.end())
+  {
+    if (!iter->second)
+    {
       iter->second = true;
       detail::synchronize_impl(res);
     }
-  } else {
+  }
+  else
+  {
     RAJA_ABORT_OR_THROW("Cannot synchronize unknown resource.");
   }
 }
@@ -239,30 +250,41 @@ void launch(::RAJA::resources::Hip res, bool async = true)
   lock_guard<omp::mutex> lock(detail::g_status.lock);
 #endif
   auto iter = detail::g_stream_info_map.find(res.get_stream());
-  if (iter != detail::g_stream_info_map.end()) {
+  if (iter != detail::g_stream_info_map.end())
+  {
     iter->second = !async;
-  } else {
+  }
+  else
+  {
     detail::g_stream_info_map.emplace(res.get_stream(), !async);
   }
-  if (!async) {
+  if (!async)
+  {
     detail::synchronize_impl(res);
   }
 }
 
 //! Launch kernel and indicate resource synchronization status
 RAJA_INLINE
-void launch(const void* func, hip_dim_t gridDim, hip_dim_t blockDim, void** args, size_t shmem,
-            ::RAJA::resources::Hip res, bool async = true, const char *name = nullptr)
+void launch(const void* func,
+            hip_dim_t gridDim,
+            hip_dim_t blockDim,
+            void** args,
+            size_t shmem,
+            ::RAJA::resources::Hip res,
+            bool async       = true,
+            const char* name = nullptr)
 {
-  #if defined(RAJA_ENABLE_ROCTX)
-  if(name) roctxRangePush(name);
-  #else
-    RAJA_UNUSED_VAR(name);
-  #endif
-  hipErrchk(hipLaunchKernel(func, gridDim, blockDim, args, shmem, res.get_stream()));
-  #if defined(RAJA_ENABLE_ROCTX)
-  if(name) roctxRangePop();
-  #endif
+#if defined(RAJA_ENABLE_ROCTX)
+  if (name) roctxRangePush(name);
+#else
+  RAJA_UNUSED_VAR(name);
+#endif
+  hipErrchk(
+      hipLaunchKernel(func, gridDim, blockDim, args, shmem, res.get_stream()));
+#if defined(RAJA_ENABLE_ROCTX)
+  if (name) roctxRangePop();
+#endif
   launch(res, async);
 }
 
@@ -280,9 +302,11 @@ hip_dim_t currentGridDim() { return detail::tl_status.gridDim; }
 
 //! get grid size of current launch
 RAJA_INLINE
-hip_dim_member_t currentGridSize() { return detail::tl_status.gridDim.x *
-                                            detail::tl_status.gridDim.y *
-                                            detail::tl_status.gridDim.z; }
+hip_dim_member_t currentGridSize()
+{
+  return detail::tl_status.gridDim.x * detail::tl_status.gridDim.y *
+         detail::tl_status.gridDim.z;
+}
 
 //! get blockDim of current launch
 RAJA_INLINE
@@ -290,9 +314,11 @@ hip_dim_t currentBlockDim() { return detail::tl_status.blockDim; }
 
 //! get block size of current launch
 RAJA_INLINE
-hip_dim_member_t currentBlockSize() { return detail::tl_status.blockDim.x *
-                                             detail::tl_status.blockDim.y *
-                                             detail::tl_status.blockDim.z; }
+hip_dim_member_t currentBlockSize()
+{
+  return detail::tl_status.blockDim.x * detail::tl_status.blockDim.y *
+         detail::tl_status.blockDim.z;
+}
 
 //! get dynamic shared memory usage for current launch
 RAJA_INLINE
@@ -307,7 +333,8 @@ size_t maxDynamicShmem()
   return func_attr.maxDynamicSharedSizeBytes;
 }
 
-constexpr size_t dynamic_smem_allocation_failure = std::numeric_limits<size_t>::max();
+constexpr size_t dynamic_smem_allocation_failure =
+    std::numeric_limits<size_t>::max();
 
 //! Allocate dynamic shared memory for current launch
 //
@@ -319,24 +346,27 @@ constexpr size_t dynamic_smem_allocation_failure = std::numeric_limits<size_t>::
 //  Returns an offset into dynamic shared memory aligned to align on success,
 //  or dynamic_smem_allocation_failure on failure. Note that asking for 0 memory
 //  takes the failure return path.
-template < typename T, typename GetNFromMax >
-RAJA_INLINE
-size_t allocateDynamicShmem(GetNFromMax&& get_n_from_max, size_t align = alignof(T))
+template<typename T, typename GetNFromMax>
+RAJA_INLINE size_t allocateDynamicShmem(GetNFromMax&& get_n_from_max,
+                                        size_t align = alignof(T))
 {
   const size_t unaligned_shmem = *detail::tl_status.dynamic_smem;
-  const size_t align_offset = ((unaligned_shmem % align) != size_t(0))
-      ? align - (unaligned_shmem % align)
-      : size_t(0);
-  const size_t aligned_shmem = unaligned_shmem + align_offset;
+  const size_t align_offset    = ((unaligned_shmem % align) != size_t(0))
+                                     ? align - (unaligned_shmem % align)
+                                     : size_t(0);
+  const size_t aligned_shmem   = unaligned_shmem + align_offset;
 
   const size_t max_shmem_bytes = maxDynamicShmem() - aligned_shmem;
-  const size_t n_bytes = sizeof(T) *
-      std::forward<GetNFromMax>(get_n_from_max)(max_shmem_bytes / sizeof(T));
+  const size_t n_bytes = sizeof(T) * std::forward<GetNFromMax>(get_n_from_max)(
+                                         max_shmem_bytes / sizeof(T));
 
-  if (size_t(0) < n_bytes && n_bytes <= max_shmem_bytes) {
+  if (size_t(0) < n_bytes && n_bytes <= max_shmem_bytes)
+  {
     *detail::tl_status.dynamic_smem = aligned_shmem + n_bytes;
     return aligned_shmem;
-  } else {
+  }
+  else
+  {
     return dynamic_smem_allocation_failure;
   }
 }
@@ -350,7 +380,7 @@ ::RAJA::resources::Hip currentResource() { return detail::tl_status.res; }
 // Note: This is done to setup the Reducer and MultiReducer objects through
 // their copy constructors. Both look at tl_status to setup per kernel launch
 // resources.
-template <typename LOOP_BODY>
+template<typename LOOP_BODY>
 RAJA_INLINE typename std::remove_reference<LOOP_BODY>::type make_launch_body(
     const void* func,
     hip_dim_t gridDim,
@@ -359,14 +389,14 @@ RAJA_INLINE typename std::remove_reference<LOOP_BODY>::type make_launch_body(
     ::RAJA::resources::Hip res,
     LOOP_BODY&& loop_body)
 {
-  ::RAJA::detail::ScopedAssignment<detail::hipInfo> info_sa(detail::tl_status,
-      detail::hipInfo{func, gridDim, blockDim, &dynamic_smem, res, true});
+  ::RAJA::detail::ScopedAssignment<detail::hipInfo> info_sa(
+      detail::tl_status,
+      detail::hipInfo {func, gridDim, blockDim, &dynamic_smem, res, true});
 
   using return_type = typename std::remove_reference<LOOP_BODY>::type;
   return return_type(std::forward<LOOP_BODY>(loop_body));
 }
 
-
 static constexpr int hip_occupancy_uninitialized_int = -1;
 static constexpr size_t hip_occupancy_uninitialized_size_t =
     std::numeric_limits<size_t>::max();
@@ -375,7 +405,8 @@ static constexpr size_t hip_occupancy_uninitialized_size_t =
 struct HipFixedMaxBlocksData
 {
   int device_sm_per_device = hip::device_prop().multiProcessorCount;
-  int device_max_threads_per_sm = hip::device_prop().maxThreadsPerMultiProcessor;
+  int device_max_threads_per_sm =
+      hip::device_prop().maxThreadsPerMultiProcessor;
 };
 
 //! Get the maximum theoretical occupancy of the device
@@ -391,32 +422,33 @@ HipFixedMaxBlocksData hip_max_blocks()
 struct HipOccMaxBlocksThreadsData
 {
   size_t func_dynamic_shmem_per_block = hip_occupancy_uninitialized_size_t;
-  int func_max_blocks_per_device = hip_occupancy_uninitialized_int;
-  int func_max_threads_per_block = hip_occupancy_uninitialized_int;
+  int func_max_blocks_per_device      = hip_occupancy_uninitialized_int;
+  int func_max_threads_per_block      = hip_occupancy_uninitialized_int;
 };
 
 //! Get the maximum occupancy of a kernel with unknown threads per block
-template < typename RAJA_UNUSED_ARG(UniqueMarker) >
-RAJA_INLINE
-HipOccMaxBlocksThreadsData hip_occupancy_max_blocks_threads(const void* func,
-    size_t func_dynamic_shmem_per_block)
+template<typename RAJA_UNUSED_ARG(UniqueMarker)>
+RAJA_INLINE HipOccMaxBlocksThreadsData
+hip_occupancy_max_blocks_threads(const void* func,
+                                 size_t func_dynamic_shmem_per_block)
 {
   static thread_local HipOccMaxBlocksThreadsData data;
 
-  if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block) {
+  if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block)
+  {
 
     data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block;
 
 #ifdef RAJA_ENABLE_HIP_OCCUPANCY_CALCULATOR
     hipErrchk(hipOccupancyMaxPotentialBlockSize(
-        &data.func_max_blocks_per_device, &data.func_max_threads_per_block, func, func_dynamic_shmem_per_block));
+        &data.func_max_blocks_per_device, &data.func_max_threads_per_block,
+        func, func_dynamic_shmem_per_block));
 #else
     RAJA_UNUSED_VAR(func);
-    hipDeviceProp_t& prop = hip::device_prop();
+    hipDeviceProp_t& prop           = hip::device_prop();
     data.func_max_blocks_per_device = prop.multiProcessorCount;
     data.func_max_threads_per_block = 1024;
 #endif
-
   }
 
   return data;
@@ -426,66 +458,75 @@ HipOccMaxBlocksThreadsData hip_occupancy_max_blocks_threads(const void* func,
 struct HipOccMaxBlocksData : HipFixedMaxBlocksData
 {
   size_t func_dynamic_shmem_per_block = hip_occupancy_uninitialized_size_t;
-  int func_threads_per_block = hip_occupancy_uninitialized_int;
-  int func_max_blocks_per_sm = hip_occupancy_uninitialized_int;
+  int func_threads_per_block          = hip_occupancy_uninitialized_int;
+  int func_max_blocks_per_sm          = hip_occupancy_uninitialized_int;
 };
 
 //! Get the maximum occupancy of a kernel with compile time threads per block
-template < typename RAJA_UNUSED_ARG(UniqueMarker), int func_threads_per_block >
-RAJA_INLINE
-HipOccMaxBlocksData hip_occupancy_max_blocks(const void* func,
-    size_t func_dynamic_shmem_per_block)
+template<typename RAJA_UNUSED_ARG(UniqueMarker), int func_threads_per_block>
+RAJA_INLINE HipOccMaxBlocksData
+hip_occupancy_max_blocks(const void* func, size_t func_dynamic_shmem_per_block)
 {
   static thread_local HipOccMaxBlocksData data;
 
-  if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block) {
+  if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block)
+  {
 
     data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block;
-    data.func_threads_per_block = func_threads_per_block;
+    data.func_threads_per_block       = func_threads_per_block;
 
 #ifdef RAJA_ENABLE_HIP_OCCUPANCY_CALCULATOR
     hipErrchk(hipOccupancyMaxActiveBlocksPerMultiprocessor(
-        &data.func_max_blocks_per_sm, func, func_threads_per_block, func_dynamic_shmem_per_block));
+        &data.func_max_blocks_per_sm, func, func_threads_per_block,
+        func_dynamic_shmem_per_block));
 #else
     RAJA_UNUSED_VAR(func);
-    data.func_max_blocks_per_sm = hip::device_prop().maxThreadsPerMultiProcessor/1024;
-    if (data.func_max_blocks_per_sm <= 0) { data.func_max_blocks_per_sm = 1 }
+    data.func_max_blocks_per_sm =
+        hip::device_prop().maxThreadsPerMultiProcessor / 1024;
+    if (data.func_max_blocks_per_sm <= 0)
+    {
+      data.func_max_blocks_per_sm = 1;  // clamp to at least 1
+    }
 #endif
-
   }
 
   return data;
 }
 
 //! Get the maximum occupancy of a kernel with runtime threads per block
-template < typename RAJA_UNUSED_ARG(UniqueMarker) >
-RAJA_INLINE
-HipOccMaxBlocksData hip_occupancy_max_blocks(const void* func,
-    size_t func_dynamic_shmem_per_block, int func_threads_per_block)
+template<typename RAJA_UNUSED_ARG(UniqueMarker)>
+RAJA_INLINE HipOccMaxBlocksData
+hip_occupancy_max_blocks(const void* func,
+                         size_t func_dynamic_shmem_per_block,
+                         int func_threads_per_block)
 {
   static thread_local HipOccMaxBlocksData data;
 
-  if ( data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block ||
-       data.func_threads_per_block != func_threads_per_block ) {
+  if (data.func_dynamic_shmem_per_block != func_dynamic_shmem_per_block ||
+      data.func_threads_per_block != func_threads_per_block)
+  {
 
     data.func_dynamic_shmem_per_block = func_dynamic_shmem_per_block;
-    data.func_threads_per_block = func_threads_per_block;
+    data.func_threads_per_block       = func_threads_per_block;
 
 #ifdef RAJA_ENABLE_HIP_OCCUPANCY_CALCULATOR
     hipErrchk(hipOccupancyMaxActiveBlocksPerMultiprocessor(
-        &data.func_max_blocks_per_sm, func, func_threads_per_block, func_dynamic_shmem_per_block));
+        &data.func_max_blocks_per_sm, func, func_threads_per_block,
+        func_dynamic_shmem_per_block));
 #else
     RAJA_UNUSED_VAR(func);
-    data.func_max_blocks_per_sm = hip::device_prop().maxThreadsPerMultiProcessor/1024;
-    if (data.func_max_blocks_per_sm <= 0) { data.func_max_blocks_per_sm = 1 }
+    data.func_max_blocks_per_sm =
+        hip::device_prop().maxThreadsPerMultiProcessor / 1024;
+    if (data.func_max_blocks_per_sm <= 0)
+    {
+      data.func_max_blocks_per_sm = 1;  // clamp to at least 1
+    }
 #endif
-
   }
 
   return data;
 }
 
-
 /*!
  ******************************************************************************
  *
@@ -512,14 +553,16 @@ HipOccMaxBlocksData hip_occupancy_max_blocks(const void* func,
  *
  ******************************************************************************
  */
-template < typename IdxT, typename Concretizer, typename UniqueMarker>
+template<typename IdxT, typename Concretizer, typename UniqueMarker>
 struct ConcretizerImpl
 {
-  ConcretizerImpl(const void* func, size_t func_dynamic_shmem_per_block, IdxT len)
-    : m_func(func)
-    , m_func_dynamic_shmem_per_block(func_dynamic_shmem_per_block)
-    , m_len(len)
-  { }
+  ConcretizerImpl(const void* func,
+                  size_t func_dynamic_shmem_per_block,
+                  IdxT len)
+      : m_func(func),
+        m_func_dynamic_shmem_per_block(func_dynamic_shmem_per_block),
+        m_len(len)
+  {}
 
   IdxT get_max_block_size() const
   {
@@ -533,10 +576,14 @@ struct ConcretizerImpl
   IdxT get_block_size_to_fit_len(IdxT func_blocks_per_device) const
   {
     IdxT func_max_threads_per_block = this->get_max_block_size();
-    IdxT func_threads_per_block = RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device);
-    if (func_threads_per_block <= func_max_threads_per_block) {
+    IdxT func_threads_per_block =
+        RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device);
+    if (func_threads_per_block <= func_max_threads_per_block)
+    {
       return func_threads_per_block;
-    } else {
+    }
+    else
+    {
       return IdxT(0);
     }
   }
@@ -544,7 +591,8 @@ struct ConcretizerImpl
   //! Get a grid size when block size is specified
   IdxT get_grid_size_to_fit_len(IdxT func_threads_per_block) const
   {
-    IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block);
+    IdxT func_blocks_per_device =
+        RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block);
     return func_blocks_per_device;
   }
 
@@ -552,16 +600,17 @@ struct ConcretizerImpl
   auto get_block_and_grid_size_to_fit_len() const
   {
     IdxT func_max_threads_per_block = this->get_max_block_size();
-    IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_max_threads_per_block);
-    return std::make_pair(func_max_threads_per_block,
-                          func_blocks_per_device);
+    IdxT func_blocks_per_device =
+        RAJA_DIVIDE_CEILING_INT(m_len, func_max_threads_per_block);
+    return std::make_pair(func_max_threads_per_block, func_blocks_per_device);
   }
 
   //! Get a block size when grid size is specified
   IdxT get_block_size_to_fit_device(IdxT func_blocks_per_device) const
   {
     IdxT func_max_threads_per_block = this->get_max_block_size();
-    IdxT func_threads_per_block = RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device);
+    IdxT func_threads_per_block =
+        RAJA_DIVIDE_CEILING_INT(m_len, func_blocks_per_device);
     return std::min(func_threads_per_block, func_max_threads_per_block);
   }
 
@@ -570,8 +619,10 @@ struct ConcretizerImpl
   {
     auto data = hip_occupancy_max_blocks<UniqueMarker>(
         m_func, m_func_dynamic_shmem_per_block, func_threads_per_block);
-    IdxT func_max_blocks_per_device = Concretizer::template get_max_grid_size<IdxT>(data);
-    IdxT func_blocks_per_device = RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block);
+    IdxT func_max_blocks_per_device =
+        Concretizer::template get_max_grid_size<IdxT>(data);
+    IdxT func_blocks_per_device =
+        RAJA_DIVIDE_CEILING_INT(m_len, func_threads_per_block);
     return std::min(func_blocks_per_device, func_max_blocks_per_device);
   }
 
@@ -579,9 +630,9 @@ struct ConcretizerImpl
   auto get_block_and_grid_size_to_fit_device() const
   {
     IdxT func_max_threads_per_block = this->get_max_block_size();
-    IdxT func_blocks_per_device = this->get_grid_size_to_fit_device(func_max_threads_per_block);
-    return std::make_pair(func_max_threads_per_block,
-                          func_blocks_per_device);
+    IdxT func_blocks_per_device =
+        this->get_grid_size_to_fit_device(func_max_threads_per_block);
+    return std::make_pair(func_max_threads_per_block, func_blocks_per_device);
   }
 
 private:
diff --git a/include/RAJA/policy/hip/WorkGroup/Dispatcher.hpp b/include/RAJA/policy/hip/WorkGroup/Dispatcher.hpp
index 975d26b7ff..6f965f1ce0 100644
--- a/include/RAJA/policy/hip/WorkGroup/Dispatcher.hpp
+++ b/include/RAJA/policy/hip/WorkGroup/Dispatcher.hpp
@@ -29,7 +29,6 @@
 #include <thread>
 #include <mutex>
 
-
 namespace RAJA
 {
 
@@ -41,9 +40,9 @@ namespace hip
 
 // global function that creates the value on the device using the
 // factory and writes it into a pinned ptr
-template < typename Factory >
-__global__ void get_value_global(
-    typename Factory::value_type* ptr, Factory factory)
+template<typename Factory>
+__global__ void get_value_global(typename Factory::value_type* ptr,
+                                 Factory factory)
 {
   *ptr = factory();
 }
@@ -52,8 +51,9 @@ __global__ void get_value_global(
 inline void* get_cached_value_ptr(size_t nbytes)
 {
   static size_t cached_nbytes = 0;
-  static void* ptr = nullptr;
-  if (nbytes > cached_nbytes) {
+  static void* ptr            = nullptr;
+  if (nbytes > cached_nbytes)
+  {
     cached_nbytes = 0;
     hipErrchk(hipHostFree(ptr));
     hipErrchk(hipHostMalloc(&ptr, nbytes));
@@ -73,7 +73,7 @@ inline std::mutex& get_value_mutex()
 // get the device function pointer by calling a global function to
 // write it into a pinned ptr, beware different instantiates of this
 // function may run concurrently
-template < typename Factory >
+template<typename Factory>
 inline auto get_value(Factory&& factory)
 {
   using value_type = typename std::decay_t<Factory>::value_type;
@@ -81,8 +81,9 @@ inline auto get_value(Factory&& factory)
 
   auto res = ::camp::resources::Hip::get_default();
   auto ptr = static_cast<value_type*>(get_cached_value_ptr(sizeof(value_type)));
-  auto func = reinterpret_cast<const void*>(&get_value_global<std::decay_t<Factory>>);
-  void *args[] = {(void*)&ptr, (void*)&factory};
+  auto func =
+      reinterpret_cast<const void*>(&get_value_global<std::decay_t<Factory>>);
+  void* args[] = {(void*)&ptr, (void*)&factory};
   hipErrchk(hipLaunchKernel(func, 1, 1, args, 0, res.get_stream()));
   hipErrchk(hipStreamSynchronize(res.get_stream()));
 
@@ -91,7 +92,7 @@ inline auto get_value(Factory&& factory)
 
 // get the device function pointer and store it so it can be used
 // multiple times
-template < typename Factory >
+template<typename Factory>
 inline auto get_cached_value(Factory&& factory)
 {
   static auto value = get_value(std::forward<Factory>(factory));
@@ -101,17 +102,15 @@ inline auto get_cached_value(Factory&& factory)
 }  // namespace hip
 
 /*!
-* Populate and return a Dispatcher object that can be used in device code
-*/
-template < typename T, typename Dispatcher_T, size_t BLOCK_SIZE, bool Async >
+ * Populate and return a Dispatcher object that can be used in device code
+ */
+template<typename T, typename Dispatcher_T, size_t BLOCK_SIZE, bool Async>
 inline const Dispatcher_T* get_Dispatcher(hip_work<BLOCK_SIZE, Async> const&)
 {
-  static Dispatcher_T dispatcher{
-        Dispatcher_T::template makeDispatcher<T>(
-          [](auto&& factory) {
-            return hip::get_cached_value(
-                std::forward<decltype(factory)>(factory));
-          }) };
+  static Dispatcher_T dispatcher {
+      Dispatcher_T::template makeDispatcher<T>([](auto&& factory) {
+        return hip::get_cached_value(std::forward<decltype(factory)>(factory));
+      })};
   return &dispatcher;
 }
 
diff --git a/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp b/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp
index 26d45d7bd9..b01d7ce219 100644
--- a/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp
+++ b/include/RAJA/policy/hip/WorkGroup/WorkRunner.hpp
@@ -25,7 +25,6 @@
 
 #include "RAJA/pattern/WorkGroup/WorkRunner.hpp"
 
-
 namespace RAJA
 {
 
@@ -36,46 +35,45 @@ namespace detail
  * Runs work in a storage container in order
  * and returns any per run resources
  */
-template <size_t BLOCK_SIZE, bool Async,
-          typename DISPATCH_POLICY_T,
-          typename ALLOCATOR_T,
-          typename INDEX_T,
-          typename ... Args>
-struct WorkRunner<
-        RAJA::hip_work<BLOCK_SIZE, Async>,
-        RAJA::ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-    : WorkRunnerForallOrdered<
-        RAJA::hip_exec_async<BLOCK_SIZE>,
-        RAJA::hip_work<BLOCK_SIZE, Async>,
-        RAJA::ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
+template<size_t BLOCK_SIZE,
+         bool Async,
+         typename DISPATCH_POLICY_T,
+         typename ALLOCATOR_T,
+         typename INDEX_T,
+         typename... Args>
+struct WorkRunner<RAJA::hip_work<BLOCK_SIZE, Async>,
+                  RAJA::ordered,
+                  DISPATCH_POLICY_T,
+                  ALLOCATOR_T,
+                  INDEX_T,
+                  Args...>
+    : WorkRunnerForallOrdered<RAJA::hip_exec_async<BLOCK_SIZE>,
+                              RAJA::hip_work<BLOCK_SIZE, Async>,
+                              RAJA::ordered,
+                              DISPATCH_POLICY_T,
+                              ALLOCATOR_T,
+                              INDEX_T,
+                              Args...>
 {
-  using base = WorkRunnerForallOrdered<
-        RAJA::hip_exec_async<BLOCK_SIZE>,
-        RAJA::hip_work<BLOCK_SIZE, Async>,
-        RAJA::ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>;
+  using base = WorkRunnerForallOrdered<RAJA::hip_exec_async<BLOCK_SIZE>,
+                                       RAJA::hip_work<BLOCK_SIZE, Async>,
+                                       RAJA::ordered,
+                                       DISPATCH_POLICY_T,
+                                       ALLOCATOR_T,
+                                       INDEX_T,
+                                       Args...>;
   using base::base;
-  using IndexType = INDEX_T;
+  using IndexType       = INDEX_T;
   using per_run_storage = typename base::per_run_storage;
 
   ///
   /// run the loops in the given work container in order using forall
   /// run all loops asynchronously and synchronize after is necessary
   ///
-  template < typename WorkContainer >
+  template<typename WorkContainer>
   per_run_storage run(WorkContainer const& storage,
-                      typename base::resource_type r, Args... args) const
+                      typename base::resource_type r,
+                      Args... args) const
   {
     per_run_storage run_storage =
         base::run(storage, r, std::forward<Args>(args)...);
@@ -83,8 +81,12 @@ struct WorkRunner<
     IndexType num_loops = std::distance(std::begin(storage), std::end(storage));
 
     // Only synchronize if we had something to iterate over
-    if (num_loops > 0 && BLOCK_SIZE > 0) {
-      if (!Async) { RAJA::hip::synchronize(r); }
+    if (num_loops > 0 && BLOCK_SIZE > 0)
+    {
+      if (!Async)
+      {
+        RAJA::hip::synchronize(r);
+      }
     }
 
     return run_storage;
@@ -95,46 +97,45 @@ struct WorkRunner<
  * Runs work in a storage container in reverse order
  * and returns any per run resources
  */
-template <size_t BLOCK_SIZE, bool Async,
-          typename DISPATCH_POLICY_T,
-          typename ALLOCATOR_T,
-          typename INDEX_T,
-          typename ... Args>
-struct WorkRunner<
-        RAJA::hip_work<BLOCK_SIZE, Async>,
-        RAJA::reverse_ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-    : WorkRunnerForallReverse<
-        RAJA::hip_exec_async<BLOCK_SIZE>,
-        RAJA::hip_work<BLOCK_SIZE, Async>,
-        RAJA::reverse_ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
+template<size_t BLOCK_SIZE,
+         bool Async,
+         typename DISPATCH_POLICY_T,
+         typename ALLOCATOR_T,
+         typename INDEX_T,
+         typename... Args>
+struct WorkRunner<RAJA::hip_work<BLOCK_SIZE, Async>,
+                  RAJA::reverse_ordered,
+                  DISPATCH_POLICY_T,
+                  ALLOCATOR_T,
+                  INDEX_T,
+                  Args...>
+    : WorkRunnerForallReverse<RAJA::hip_exec_async<BLOCK_SIZE>,
+                              RAJA::hip_work<BLOCK_SIZE, Async>,
+                              RAJA::reverse_ordered,
+                              DISPATCH_POLICY_T,
+                              ALLOCATOR_T,
+                              INDEX_T,
+                              Args...>
 {
-  using base = WorkRunnerForallReverse<
-        RAJA::hip_exec_async<BLOCK_SIZE>,
-        RAJA::hip_work<BLOCK_SIZE, Async>,
-        RAJA::reverse_ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>;
+  using base = WorkRunnerForallReverse<RAJA::hip_exec_async<BLOCK_SIZE>,
+                                       RAJA::hip_work<BLOCK_SIZE, Async>,
+                                       RAJA::reverse_ordered,
+                                       DISPATCH_POLICY_T,
+                                       ALLOCATOR_T,
+                                       INDEX_T,
+                                       Args...>;
   using base::base;
-  using IndexType = INDEX_T;
+  using IndexType       = INDEX_T;
   using per_run_storage = typename base::per_run_storage;
 
   ///
   /// run the loops in the given work container in reverse order using forall
   /// run all loops asynchronously and synchronize after is necessary
   ///
-  template < typename WorkContainer >
+  template<typename WorkContainer>
   per_run_storage run(WorkContainer const& storage,
-                      typename base::resource_type r, Args... args) const
+                      typename base::resource_type r,
+                      Args... args) const
   {
     per_run_storage run_storage =
         base::run(storage, r, std::forward<Args>(args)...);
@@ -142,28 +143,33 @@ struct WorkRunner<
     IndexType num_loops = std::distance(std::begin(storage), std::end(storage));
 
     // Only synchronize if we had something to iterate over
-    if (num_loops > 0 && BLOCK_SIZE > 0) {
-      if (!Async) { RAJA::hip::synchronize(r); }
+    if (num_loops > 0 && BLOCK_SIZE > 0)
+    {
+      if (!Async)
+      {
+        RAJA::hip::synchronize(r);
+      }
     }
 
     return run_storage;
   }
 };
 
-
 /*!
  * A body and segment holder for storing loops that will be executed
  * on the device
  */
-template <typename Segment_type, typename LoopBody,
-          typename index_type, typename ... Args>
+template<typename Segment_type,
+         typename LoopBody,
+         typename index_type,
+         typename... Args>
 struct HoldHipDeviceXThreadblockLoop
 {
-  template < typename segment_in, typename body_in >
+  template<typename segment_in, typename body_in>
   HoldHipDeviceXThreadblockLoop(segment_in&& segment, body_in&& body)
-    : m_segment(std::forward<segment_in>(segment))
-    , m_body(std::forward<body_in>(body))
-  { }
+      : m_segment(std::forward<segment_in>(segment)),
+        m_body(std::forward<body_in>(body))
+  {}
 
   RAJA_DEVICE RAJA_INLINE void operator()(Args... args) const
   {
@@ -171,10 +177,11 @@ struct HoldHipDeviceXThreadblockLoop
     // TODO:: decide whether or not to privatize the loop body
     const index_type i_begin = threadIdx.x + blockIdx.x * blockDim.x;
     const index_type stride  = blockDim.x * gridDim.x;
-    const auto begin = m_segment.begin();
-    const auto end   = m_segment.end();
+    const auto begin         = m_segment.begin();
+    const auto end           = m_segment.end();
     const index_type len(end - begin);
-    for ( index_type i = i_begin; i < len; i += stride ) {
+    for (index_type i = i_begin; i < len; i += stride)
+    {
       m_body(begin[i], std::forward<Args>(args)...);
     }
   }
@@ -184,11 +191,11 @@ struct HoldHipDeviceXThreadblockLoop
   LoopBody m_body;
 };
 
-template < size_t BLOCK_SIZE,
-           typename StorageIter,
-           typename value_type,
-           typename index_type,
-           typename ... Args >
+template<size_t BLOCK_SIZE,
+         typename StorageIter,
+         typename value_type,
+         typename index_type,
+         typename... Args>
 __launch_bounds__(BLOCK_SIZE, 1) __global__
     void hip_unordered_y_block_global(StorageIter iter, Args... args)
 {
@@ -198,43 +205,47 @@ __launch_bounds__(BLOCK_SIZE, 1) __global__
   value_type::device_call(&iter[i_loop], args...);
 }
 
-
 /*!
  * Runs work in a storage container out of order with loops mapping to
  * hip blocks in the y direction and iterations mapping to threads in
  * the x direction, with the number of threads in the x dimension determined
  * by the average number of iterates per loop
  */
-template <size_t BLOCK_SIZE, bool Async,
-          typename DISPATCH_POLICY_T,
-          typename ALLOCATOR_T,
-          typename INDEX_T,
-          typename ... Args>
+template<size_t BLOCK_SIZE,
+         bool Async,
+         typename DISPATCH_POLICY_T,
+         typename ALLOCATOR_T,
+         typename INDEX_T,
+         typename... Args>
 struct WorkRunner<
-        RAJA::hip_work<BLOCK_SIZE, Async>,
-        RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
+    RAJA::hip_work<BLOCK_SIZE, Async>,
+    RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average,
+    DISPATCH_POLICY_T,
+    ALLOCATOR_T,
+    INDEX_T,
+    Args...>
 {
   using exec_policy = RAJA::hip_work<BLOCK_SIZE, Async>;
-  using order_policy = RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average;
+  using order_policy =
+      RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average;
   using dispatch_policy = DISPATCH_POLICY_T;
-  using Allocator = ALLOCATOR_T;
-  using index_type = INDEX_T;
-  using resource_type = resources::Hip;
+  using Allocator       = ALLOCATOR_T;
+  using index_type      = INDEX_T;
+  using resource_type   = resources::Hip;
 
   // The type that will hold the segment and loop body in work storage
-  struct holder_type {
-    template < typename T >
+  struct holder_type
+  {
+    template<typename T>
     using type = HoldHipDeviceXThreadblockLoop<
-        typename camp::at<T, camp::num<0>>::type, // ITERABLE
-        typename camp::at<T, camp::num<1>>::type, // LOOP_BODY
-        index_type, Args...>;
+        typename camp::at<T, camp::num<0>>::type,  // ITERABLE
+        typename camp::at<T, camp::num<1>>::type,  // LOOP_BODY
+        index_type,
+        Args...>;
   };
+
   ///
-  template < typename T >
+  template<typename T>
   using holder_type_t = typename holder_type::template type<T>;
 
   // The policy indicating where the call function is invoked
@@ -243,21 +254,25 @@ struct WorkRunner<
 
   // The Dispatcher policy with holder_types used internally to handle the
   // ranges and callables passed in by the user.
-  using dispatcher_holder_policy = dispatcher_transform_types_t<dispatch_policy, holder_type>;
+  using dispatcher_holder_policy =
+      dispatcher_transform_types_t<dispatch_policy, holder_type>;
 
-  using dispatcher_type = Dispatcher<Platform::hip, dispatcher_holder_policy, RAJA::hip_work<BLOCK_SIZE, true>, Args...>;
+  using dispatcher_type = Dispatcher<Platform::hip,
+                                     dispatcher_holder_policy,
+                                     RAJA::hip_work<BLOCK_SIZE, true>,
+                                     Args...>;
 
   WorkRunner() = default;
 
-  WorkRunner(WorkRunner const&) = delete;
+  WorkRunner(WorkRunner const&)            = delete;
   WorkRunner& operator=(WorkRunner const&) = delete;
 
-  WorkRunner(WorkRunner && o)
-    : m_total_iterations(o.m_total_iterations)
+  WorkRunner(WorkRunner&& o) : m_total_iterations(o.m_total_iterations)
   {
     o.m_total_iterations = 0;
   }
-  WorkRunner& operator=(WorkRunner && o)
+
+  WorkRunner& operator=(WorkRunner&& o)
   {
     m_total_iterations = o.m_total_iterations;
 
@@ -267,35 +282,42 @@ struct WorkRunner<
 
   // runner interfaces with storage to enqueue so the runner can get
   // information from the segment and loop at enqueue time
-  template < typename WorkContainer, typename Iterable, typename LoopBody >
-  inline void enqueue(WorkContainer& storage, Iterable&& iter, LoopBody&& loop_body)
+  template<typename WorkContainer, typename Iterable, typename LoopBody>
+  inline void enqueue(WorkContainer& storage,
+                      Iterable&& iter,
+                      LoopBody&& loop_body)
   {
     using Iterator  = camp::decay<decltype(std::begin(iter))>;
     using LOOP_BODY = camp::decay<LoopBody>;
     using ITERABLE  = camp::decay<Iterable>;
-    using IndexType = camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
+    using IndexType =
+        camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
 
     using holder = holder_type_t<camp::list<ITERABLE, LOOP_BODY>>;
 
-    // using true_value_type = typename WorkContainer::template true_value_type<holder>;
+    // using true_value_type = typename WorkContainer::template
+    // true_value_type<holder>;
 
     Iterator begin = std::begin(iter);
-    Iterator end = std::end(iter);
-    IndexType len = std::distance(begin, end);
+    Iterator end   = std::end(iter);
+    IndexType len  = std::distance(begin, end);
 
     // Only launch kernel if we have something to iterate over
-    if (len > 0 && BLOCK_SIZE > 0) {
+    if (len > 0 && BLOCK_SIZE > 0)
+    {
 
       m_total_iterations += len;
 
       //
-      // TODO: Privatize the loop_body, using make_launch_body to setup reductions
+      // TODO: Privatize the loop_body, using make_launch_body to setup
+      // reductions
       //
       // LOOP_BODY body = RAJA::hip::make_launch_body(func,
-      //     gridSize, blockSize, shmem, stream, std::forward<LoopBody>(loop_body));
+      //     gridSize, blockSize, shmem, stream,
+      //     std::forward<LoopBody>(loop_body));
 
       storage.template emplace<holder>(
-          get_Dispatcher<holder, dispatcher_type>(dispatcher_exec_policy{}),
+          get_Dispatcher<holder, dispatcher_type>(dispatcher_exec_policy {}),
           std::forward<Iterable>(iter), std::forward<LoopBody>(loop_body));
     }
   }
@@ -303,37 +325,44 @@ struct WorkRunner<
   // no extra storage required here
   using per_run_storage = int;
 
-  template < typename WorkContainer >
-  per_run_storage run(WorkContainer const& storage, resource_type r, Args... args) const
+  template<typename WorkContainer>
+  per_run_storage run(WorkContainer const& storage,
+                      resource_type r,
+                      Args... args) const
   {
-    using Iterator  = camp::decay<decltype(std::begin(storage))>;
-    using IndexType = camp::decay<decltype(std::distance(std::begin(storage), std::end(storage)))>;
+    using Iterator   = camp::decay<decltype(std::begin(storage))>;
+    using IndexType  = camp::decay<decltype(std::distance(std::begin(storage),
+                                                          std::end(storage)))>;
     using value_type = typename WorkContainer::value_type;
 
-    per_run_storage run_storage{};
+    per_run_storage run_storage {};
 
-    auto func = hip_unordered_y_block_global<BLOCK_SIZE, Iterator, value_type, index_type, Args...>;
+    auto func = hip_unordered_y_block_global<BLOCK_SIZE, Iterator, value_type,
+                                             index_type, Args...>;
 
     //
     // Compute the requested iteration space size
     //
-    Iterator begin = std::begin(storage);
-    Iterator end = std::end(storage);
+    Iterator begin      = std::begin(storage);
+    Iterator end        = std::end(storage);
     IndexType num_loops = std::distance(begin, end);
 
     // Only launch kernel if we have something to iterate over
-    if (num_loops > 0 && BLOCK_SIZE > 0) {
+    if (num_loops > 0 && BLOCK_SIZE > 0)
+    {
 
-      index_type average_iterations = m_total_iterations / static_cast<index_type>(num_loops);
+      index_type average_iterations =
+          m_total_iterations / static_cast<index_type>(num_loops);
 
       //
       // Compute the number of blocks
       //
       constexpr index_type block_size = static_cast<index_type>(BLOCK_SIZE);
-      hip_dim_t blockSize{static_cast<hip_dim_member_t>(block_size), 1, 1};
-      hip_dim_t gridSize{static_cast<hip_dim_member_t>((average_iterations + block_size - 1) / block_size),
-                          static_cast<hip_dim_member_t>(num_loops),
-                          1};
+      hip_dim_t blockSize {static_cast<hip_dim_member_t>(block_size), 1, 1};
+      hip_dim_t gridSize {
+          static_cast<hip_dim_member_t>((average_iterations + block_size - 1) /
+                                        block_size),
+          static_cast<hip_dim_member_t>(num_loops), 1};
 
       RAJA_FT_BEGIN;
 
@@ -346,8 +375,9 @@ struct WorkRunner<
         //
         // Launch the kernel
         //
-        void* func_args[] = { (void*)&begin, (void*)&args... };
-        RAJA::hip::launch((const void*)func, gridSize, blockSize, func_args, shmem, r, Async);
+        void* func_args[] = {(void*)&begin, (void*)&args...};
+        RAJA::hip::launch((const void*)func, gridSize, blockSize, func_args,
+                          shmem, r, Async);
       }
 
       RAJA_FT_END;
@@ -357,10 +387,7 @@ struct WorkRunner<
   }
 
   // clear any state so ready to be destroyed or reused
-  void clear()
-  {
-    m_total_iterations = 0;
-  }
+  void clear() { m_total_iterations = 0; }
 
 private:
   index_type m_total_iterations = 0;
@@ -369,29 +396,31 @@ struct WorkRunner<
 #if !defined(RAJA_ENABLE_HIP_INDIRECT_FUNCTION_CALL)
 
 /// leave unsupported runner types incomplete
-template <size_t BLOCK_SIZE, bool Async,
-          typename ALLOCATOR_T,
-          typename INDEX_T,
-          typename ... Args>
+template<size_t BLOCK_SIZE,
+         bool Async,
+         typename ALLOCATOR_T,
+         typename INDEX_T,
+         typename... Args>
 struct WorkRunner<
-        RAJA::hip_work<BLOCK_SIZE, Async>,
-        RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average,
-        RAJA::indirect_function_call_dispatch,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>;
+    RAJA::hip_work<BLOCK_SIZE, Async>,
+    RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average,
+    RAJA::indirect_function_call_dispatch,
+    ALLOCATOR_T,
+    INDEX_T,
+    Args...>;
 ///
-template <size_t BLOCK_SIZE, bool Async,
-          typename ALLOCATOR_T,
-          typename INDEX_T,
-          typename ... Args>
+template<size_t BLOCK_SIZE,
+         bool Async,
+         typename ALLOCATOR_T,
+         typename INDEX_T,
+         typename... Args>
 struct WorkRunner<
-        RAJA::hip_work<BLOCK_SIZE, Async>,
-        RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average,
-        RAJA::indirect_virtual_function_dispatch,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>;
+    RAJA::hip_work<BLOCK_SIZE, Async>,
+    RAJA::policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average,
+    RAJA::indirect_virtual_function_dispatch,
+    ALLOCATOR_T,
+    INDEX_T,
+    Args...>;
 
 #endif
 
diff --git a/include/RAJA/policy/hip/atomic.hpp b/include/RAJA/policy/hip/atomic.hpp
index b4f0d7faa7..8acdfed19d 100644
--- a/include/RAJA/policy/hip/atomic.hpp
+++ b/include/RAJA/policy/hip/atomic.hpp
@@ -49,11 +49,8 @@ namespace RAJA
 namespace detail
 {
 
-using hip_atomicCommon_builtin_types = ::camp::list<
-  int,
-  unsigned int,
-  unsigned long long
->;
+using hip_atomicCommon_builtin_types =
+    ::camp::list<int, unsigned int, unsigned long long>;
 
 /*!
  * Type trait for determining if atomic operators should be implemented
@@ -61,15 +58,14 @@ using hip_atomicCommon_builtin_types = ::camp::list<
  * operators. More specific type traits are added when needed, such as
  * hip_useBuiltinExchange below.
  */
-template <typename T>
-struct hip_useBuiltinCommon {
-  static constexpr bool value =
-    std::is_same<T, int>::value ||
-    std::is_same<T, unsigned int>::value ||
-    std::is_same<T, unsigned long long>::value;
+template<typename T>
+struct hip_useBuiltinCommon
+{
+  static constexpr bool value = std::is_same<T, int>::value ||
+                                std::is_same<T, unsigned int>::value ||
+                                std::is_same<T, unsigned long long>::value;
 };
 
-
 /*!
  * Type trait for determining if atomic operators should be implemented
  * by reinterpreting inputs to types that the builtin functions support.
@@ -77,26 +73,24 @@ struct hip_useBuiltinCommon {
  * type traits are added when needed, such as hip_useReinterpretExchange
  * below.
  */
-template <typename T>
-struct hip_useReinterpretCommon {
-  static constexpr bool value =
-    !hip_useBuiltinCommon<T>::value &&
-    (sizeof(T) == sizeof(unsigned int) ||
-     sizeof(T) == sizeof(unsigned long long));
+template<typename T>
+struct hip_useReinterpretCommon
+{
+  static constexpr bool value = !hip_useBuiltinCommon<T>::value &&
+                                (sizeof(T) == sizeof(unsigned int) ||
+                                 sizeof(T) == sizeof(unsigned long long));
 
-  using type =
-    std::conditional_t<sizeof(T) == sizeof(unsigned int),
-                       unsigned int, unsigned long long>;
+  using type = std::conditional_t<sizeof(T) == sizeof(unsigned int),
+                                  unsigned int,
+                                  unsigned long long>;
 };
 
-
 /*!
  * Alias for determining the integral type of the same size as the given type
  */
-template <typename T>
+template<typename T>
 using hip_useReinterpretCommon_t = typename hip_useReinterpretCommon<T>::type;
 
-
 /*!
  * Performs an atomic bitwise or using a builtin function. Stores the new value
  * in the given address and returns the old value.
@@ -104,56 +98,56 @@ using hip_useReinterpretCommon_t = typename hip_useReinterpretCommon<T>::type;
  * This overload using builtin functions is used to implement atomic loads
  * under some build configurations.
  */
-template <typename T,
-          std::enable_if_t<hip_useBuiltinCommon<T>::value, bool> = true>
-RAJA_INLINE __device__ T hip_atomicOr(T *acc, T value)
+template<typename T,
+         std::enable_if_t<hip_useBuiltinCommon<T>::value, bool> = true>
+RAJA_INLINE __device__ T hip_atomicOr(T* acc, T value)
 {
   return ::atomicOr(acc, value);
 }
 
-
 /*!
  * Type trait for determining if the exchange operator should be implemented
  * using a builtin
  */
-template <typename T>
-struct hip_useBuiltinExchange {
-  static constexpr bool value =
-    std::is_same<T, int>::value ||
-    std::is_same<T, unsigned int>::value ||
-    std::is_same<T, unsigned long long>::value ||
-    std::is_same<T, float>::value;
+template<typename T>
+struct hip_useBuiltinExchange
+{
+  static constexpr bool value = std::is_same<T, int>::value ||
+                                std::is_same<T, unsigned int>::value ||
+                                std::is_same<T, unsigned long long>::value ||
+                                std::is_same<T, float>::value;
 };
 
 /*!
  * Type trait for determining if the exchange operator should be implemented
  * by reinterpreting inputs to types that the builtin exchange supports
  */
-template <typename T>
-struct hip_useReinterpretExchange {
-  static constexpr bool value =
-    !hip_useBuiltinExchange<T>::value &&
-    (sizeof(T) == sizeof(unsigned int) ||
-     sizeof(T) == sizeof(unsigned long long));
+template<typename T>
+struct hip_useReinterpretExchange
+{
+  static constexpr bool value = !hip_useBuiltinExchange<T>::value &&
+                                (sizeof(T) == sizeof(unsigned int) ||
+                                 sizeof(T) == sizeof(unsigned long long));
 
-  using type =
-    std::conditional_t<sizeof(T) == sizeof(unsigned int),
-                       unsigned int, unsigned long long>;
+  using type = std::conditional_t<sizeof(T) == sizeof(unsigned int),
+                                  unsigned int,
+                                  unsigned long long>;
 };
 
 /*!
  * Alias for determining the integral type of the same size as the given type
  */
-template <typename T>
-using hip_useReinterpretExchange_t = typename hip_useReinterpretExchange<T>::type;
+template<typename T>
+using hip_useReinterpretExchange_t =
+    typename hip_useReinterpretExchange<T>::type;
 
 /*!
  * Performs an atomic exchange using a builtin function. Stores the new value
  * in the given address and returns the old value.
  */
-template <typename T,
-          std::enable_if_t<hip_useBuiltinExchange<T>::value, bool> = true>
-RAJA_INLINE __device__ T hip_atomicExchange(T *acc, T value)
+template<typename T,
+         std::enable_if_t<hip_useBuiltinExchange<T>::value, bool> = true>
+RAJA_INLINE __device__ T hip_atomicExchange(T* acc, T value)
 {
   return ::atomicExch(acc, value);
 }
@@ -162,110 +156,109 @@ RAJA_INLINE __device__ T hip_atomicExchange(T *acc, T value)
  * Performs an atomic exchange using a reinterpret cast. Stores the new value
  * in the given address and returns the old value.
  */
-template <typename T,
-          std::enable_if_t<hip_useReinterpretExchange<T>::value, bool> = true>
-RAJA_INLINE __device__ T hip_atomicExchange(T *acc, T value)
+template<typename T,
+         std::enable_if_t<hip_useReinterpretExchange<T>::value, bool> = true>
+RAJA_INLINE __device__ T hip_atomicExchange(T* acc, T value)
 {
   using R = hip_useReinterpretExchange_t<T>;
 
-  return RAJA::util::reinterp_A_as_B<R, T>(
-    hip_atomicExchange(reinterpret_cast<R*>(acc),
-                       RAJA::util::reinterp_A_as_B<T, R>(value)));
+  return RAJA::util::reinterp_A_as_B<R, T>(hip_atomicExchange(
+      reinterpret_cast<R*>(acc), RAJA::util::reinterp_A_as_B<T, R>(value)));
 }
 
 
-#if defined(__has_builtin) && \
+#if defined(__has_builtin) &&                                                  \
     (__has_builtin(__hip_atomic_load) || __has_builtin(__hip_atomic_store))
 
 /*!
  * Type trait for determining if the operator should be implemented
  * using an intrinsic
  */
-template <typename T>
-struct hip_useBuiltinLoad {
+template<typename T>
+struct hip_useBuiltinLoad
+{
   static constexpr bool value =
-    (std::is_integral<T>::value || std::is_enum<T>::value) &&
-    (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8);
+      (std::is_integral<T>::value || std::is_enum<T>::value) &&
+      (sizeof(T) == 1 || sizeof(T) == 2 || sizeof(T) == 4 || sizeof(T) == 8);
 };
 
-template <typename T>
+template<typename T>
 using hip_useBuiltinStore = hip_useBuiltinLoad<T>;
 
-
 /*!
  * Type trait for determining if the operator should be implemented
  * by reinterpreting inputs to types that intrinsics support
  */
-template <typename T>
-struct hip_useReinterpretLoad {
-  static constexpr bool value =
-    !std::is_integral<T>::value &&
-    !std::is_enum<T>::value &&
-    ((sizeof(T) == 1
+template<typename T>
+struct hip_useReinterpretLoad
+{
+  static constexpr bool value = !std::is_integral<T>::value &&
+                                !std::is_enum<T>::value &&
+                                ((sizeof(T) == 1
 #if !defined(UINT8_MAX)
-      && sizeof(unsigned char) == 1
+                                  && sizeof(unsigned char) == 1
 #endif
-     ) ||
-     (sizeof(T) == 2
+                                  ) ||
+                                 (sizeof(T) == 2
 #if !defined(UINT16_MAX)
-      && sizeof(unsigned short) == 2
+                                  && sizeof(unsigned short) == 2
 #endif
-     ) ||
-     (sizeof(T) == 4
+                                  ) ||
+                                 (sizeof(T) == 4
 #if !defined(UINT32_MAX)
-      && sizeof(unsigned int) == 4
+                                  && sizeof(unsigned int) == 4
 #endif
-     ) ||
-     (sizeof(T) == 8
+                                  ) ||
+                                 (sizeof(T) == 8
 #if !defined(UINT64_MAX)
-      && sizeof(unsigned long long) == 8
+                                  && sizeof(unsigned long long) == 8
 #endif
-     ));
+                                  ));
 
   using type =
-    std::conditional_t<sizeof(T) == 1,
+      std::conditional_t<sizeof(T) == 1,
 #if defined(UINT8_MAX)
-                       uint8_t,
+                         uint8_t,
 #else
-                       unsigned char,
+                         unsigned char,
 #endif
-    std::conditional_t<sizeof(T) == 2,
+                         std::conditional_t<sizeof(T) == 2,
 #if defined(UINT16_MAX)
-                       uint16_t,
+                                            uint16_t,
 #else
-                       unsigned short,
+                                            unsigned short,
 #endif
-    std::conditional_t<sizeof(T) == 4,
+                                            std::conditional_t<sizeof(T) == 4,
 #if defined(UINT32_MAX)
-                       uint32_t,
+                                                               uint32_t,
 #else
-                       unsigned int,
+                                                               unsigned int,
 #endif
 #if defined(UINT64_MAX)
-                       uint64_t>>>;
+                                                               uint64_t>>>;
 #else
-                       unsigned long long>>>;
+                                                               unsigned long long>>>;
 #endif
 };
 
-template <typename T>
+template<typename T>
 using hip_useReinterpretStore = hip_useReinterpretLoad<T>;
 
 #else
 
-template <typename T>
+template<typename T>
 using hip_useBuiltinLoad = hip_useBuiltinCommon<T>;
 
-template <typename T>
+template<typename T>
 using hip_useBuiltinStore = hip_useBuiltinExchange<T>;
 
 /*!
  * Alias for determining the integral type of the same size as the given type
  */
-template <typename T>
+template<typename T>
 using hip_useReinterpretLoad = hip_useReinterpretCommon<T>;
 
-template <typename T>
+template<typename T>
 using hip_useReinterpretStore = hip_useReinterpretExchange<T>;
 
 #endif
@@ -273,19 +266,18 @@ using hip_useReinterpretStore = hip_useReinterpretExchange<T>;
 /*!
  * Alias for determining the integral type of the same size as the given type
  */
-template <typename T>
+template<typename T>
 using hip_useReinterpretLoad_t = typename hip_useReinterpretLoad<T>::type;
 
-template <typename T>
+template<typename T>
 using hip_useReinterpretStore_t = typename hip_useReinterpretStore<T>::type;
 
-
 /*!
  * Atomic load
  */
-template <typename T,
-          std::enable_if_t<hip_useBuiltinLoad<T>::value, bool> = true>
-RAJA_INLINE __device__ T hip_atomicLoad(T *acc)
+template<typename T,
+         std::enable_if_t<hip_useBuiltinLoad<T>::value, bool> = true>
+RAJA_INLINE __device__ T hip_atomicLoad(T* acc)
 {
 #if defined(__has_builtin) && __has_builtin(__hip_atomic_load)
   return __hip_atomic_load(acc, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
@@ -294,23 +286,22 @@ RAJA_INLINE __device__ T hip_atomicLoad(T *acc)
 #endif
 }
 
-template <typename T,
-          std::enable_if_t<hip_useReinterpretLoad<T>::value, bool> = true>
-RAJA_INLINE __device__ T hip_atomicLoad(T *acc)
+template<typename T,
+         std::enable_if_t<hip_useReinterpretLoad<T>::value, bool> = true>
+RAJA_INLINE __device__ T hip_atomicLoad(T* acc)
 {
   using R = hip_useReinterpretLoad_t<T>;
 
   return RAJA::util::reinterp_A_as_B<R, T>(
-    hip_atomicLoad(reinterpret_cast<R*>(acc)));
+      hip_atomicLoad(reinterpret_cast<R*>(acc)));
 }
 
-
 /*!
  * Atomic store
  */
-template <typename T,
-          std::enable_if_t<hip_useBuiltinStore<T>::value, bool> = true>
-RAJA_INLINE __device__ void hip_atomicStore(T *acc, T value)
+template<typename T,
+         std::enable_if_t<hip_useBuiltinStore<T>::value, bool> = true>
+RAJA_INLINE __device__ void hip_atomicStore(T* acc, T value)
 {
 #if defined(__has_builtin) && __has_builtin(__hip_atomic_store)
   __hip_atomic_store(acc, value, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
@@ -319,9 +310,9 @@ RAJA_INLINE __device__ void hip_atomicStore(T *acc, T value)
 #endif
 }
 
-template <typename T,
-          std::enable_if_t<hip_useReinterpretStore<T>::value, bool> = true>
-RAJA_INLINE __device__ void hip_atomicStore(T *acc, T value)
+template<typename T,
+         std::enable_if_t<hip_useReinterpretStore<T>::value, bool> = true>
+RAJA_INLINE __device__ void hip_atomicStore(T* acc, T value)
 {
   using R = hip_useReinterpretStore_t<T>;
 
@@ -329,15 +320,14 @@ RAJA_INLINE __device__ void hip_atomicStore(T *acc, T value)
                   RAJA::util::reinterp_A_as_B<T, R>(value));
 }
 
-
 /*!
  * Hip atomicCAS using builtin function
  *
  * Returns the old value in memory before this operation.
  */
-template <typename T,
-          std::enable_if_t<hip_useBuiltinCommon<T>::value, bool> = true>
-RAJA_INLINE __device__ T hip_atomicCAS(T *acc, T compare, T value)
+template<typename T,
+         std::enable_if_t<hip_useBuiltinCommon<T>::value, bool> = true>
+RAJA_INLINE __device__ T hip_atomicCAS(T* acc, T compare, T value)
 {
   return ::atomicCAS(acc, compare, value);
 }
@@ -347,33 +337,31 @@ RAJA_INLINE __device__ T hip_atomicCAS(T *acc, T compare, T value)
  *
  * Returns the old value in memory before this operation.
  */
-template <typename T,
-          std::enable_if_t<hip_useReinterpretCommon<T>::value, bool> = true>
-RAJA_INLINE __device__ T hip_atomicCAS(T *acc, T compare, T value)
+template<typename T,
+         std::enable_if_t<hip_useReinterpretCommon<T>::value, bool> = true>
+RAJA_INLINE __device__ T hip_atomicCAS(T* acc, T compare, T value)
 {
   using R = hip_useReinterpretCommon_t<T>;
 
-  return RAJA::util::reinterp_A_as_B<R, T>(
-    hip_atomicCAS(reinterpret_cast<R*>(acc),
-                  RAJA::util::reinterp_A_as_B<T, R>(compare),
-                  RAJA::util::reinterp_A_as_B<T, R>(value)));
+  return RAJA::util::reinterp_A_as_B<R, T>(hip_atomicCAS(
+      reinterpret_cast<R*>(acc), RAJA::util::reinterp_A_as_B<T, R>(compare),
+      RAJA::util::reinterp_A_as_B<T, R>(value)));
 }
 
-
 /*!
  * Equality comparison for compare and swap loop. Converts to the underlying
  * integral type to avoid cases where the values will never compare equal
  * (most notably, NaNs).
  */
-template <typename T,
-          std::enable_if_t<hip_useBuiltinCommon<T>::value, bool> = true>
+template<typename T,
+         std::enable_if_t<hip_useBuiltinCommon<T>::value, bool> = true>
 RAJA_INLINE __device__ bool hip_atomicCAS_equal(const T& a, const T& b)
 {
   return a == b;
 }
 
-template <typename T,
-          std::enable_if_t<hip_useReinterpretCommon<T>::value, bool> = true>
+template<typename T,
+         std::enable_if_t<hip_useReinterpretCommon<T>::value, bool> = true>
 RAJA_INLINE __device__ bool hip_atomicCAS_equal(const T& a, const T& b)
 {
   using R = hip_useReinterpretCommon_t<T>;
@@ -382,57 +370,56 @@ RAJA_INLINE __device__ bool hip_atomicCAS_equal(const T& a, const T& b)
                              RAJA::util::reinterp_A_as_B<T, R>(b));
 }
 
-
 /*!
  * Generic impementation of any atomic 32-bit or 64-bit operator.
  * Implementation uses the existing HIP supplied unsigned 32-bit or 64-bit CAS
  * operator. Returns the OLD value that was replaced by the result of this
  * operation.
  */
-template <typename T, typename Oper>
-RAJA_INLINE __device__ T hip_atomicCAS_loop(T *acc,
-                                            Oper&& oper)
+template<typename T, typename Oper>
+RAJA_INLINE __device__ T hip_atomicCAS_loop(T* acc, Oper&& oper)
 {
   T old = hip_atomicLoad(acc);
   T expected;
 
-  do {
+  do
+  {
     expected = old;
-    old = hip_atomicCAS(acc, expected, oper(expected));
+    old      = hip_atomicCAS(acc, expected, oper(expected));
   } while (!hip_atomicCAS_equal(old, expected));
 
   return old;
 }
 
-
 /*!
- * Generic impementation of any atomic 32-bit or 64-bit operator with short-circuiting.
- * Implementation uses the existing HIP supplied unsigned 32-bit or 64-bit CAS
- * operator. Returns the OLD value that was replaced by the result of this
- * operation.
+ * Generic impementation of any atomic 32-bit or 64-bit operator with
+ * short-circuiting. Implementation uses the existing HIP supplied unsigned
+ * 32-bit or 64-bit CAS operator. Returns the OLD value that was replaced by the
+ * result of this operation.
  */
-template <typename T, typename Oper, typename ShortCircuit>
-RAJA_INLINE __device__ T hip_atomicCAS_loop(T *acc,
+template<typename T, typename Oper, typename ShortCircuit>
+RAJA_INLINE __device__ T hip_atomicCAS_loop(T* acc,
                                             Oper&& oper,
                                             ShortCircuit&& sc)
 {
   T old = hip_atomicLoad(acc);
 
-  if (sc(old)) {
+  if (sc(old))
+  {
     return old;
   }
 
   T expected;
 
-  do {
+  do
+  {
     expected = old;
-    old = hip_atomicCAS(acc, expected, oper(expected));
+    old      = hip_atomicCAS(acc, expected, oper(expected));
   } while (!hip_atomicCAS_equal(old, expected) && !sc(old));
 
   return old;
 }
 
-
 /*!
  * Atomic addition
  */
@@ -440,34 +427,34 @@ RAJA_INLINE __device__ T hip_atomicCAS_loop(T *acc,
 /*!
  * List of types where HIP builtin atomics are used to implement atomicAdd.
  */
-using hip_atomicAdd_builtin_types = ::camp::list<
-  int,
-  unsigned int,
-  unsigned long long,
-  float
+using hip_atomicAdd_builtin_types = ::camp::list<int,
+                                                 unsigned int,
+                                                 unsigned long long,
+                                                 float
 #ifdef RAJA_ENABLE_HIP_DOUBLE_ATOMICADD
-  ,
-  double
+                                                 ,
+                                                 double
 #endif
->;
+                                                 >;
 
-template <typename T,
-          RAJA::util::enable_if_is_none_of<T, hip_atomicAdd_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T hip_atomicAdd(T *acc, T value)
+template<
+    typename T,
+    RAJA::util::enable_if_is_none_of<T, hip_atomicAdd_builtin_types>* = nullptr>
+RAJA_INLINE __device__ T hip_atomicAdd(T* acc, T value)
 {
-  return hip_atomicCAS_loop(acc, [value] (T old) {
+  return hip_atomicCAS_loop(acc, [value](T old) {
     return old + value;
   });
 }
 
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, hip_atomicAdd_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T hip_atomicAdd(T *acc, T value)
+template<
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, hip_atomicAdd_builtin_types>* = nullptr>
+RAJA_INLINE __device__ T hip_atomicAdd(T* acc, T value)
 {
   return ::atomicAdd(acc, value);
 }
 
-
 /*!
  * Atomic subtraction
  */
@@ -475,16 +462,15 @@ RAJA_INLINE __device__ T hip_atomicAdd(T *acc, T value)
 /*!
  * List of types where HIP builtin atomics are used to implement atomicSub.
  */
-using hip_atomicSub_builtin_types = ::camp::list<
-  int,
-  unsigned int,
-  unsigned long long,
-  float
+using hip_atomicSub_builtin_types = ::camp::list<int,
+                                                 unsigned int,
+                                                 unsigned long long,
+                                                 float
 #ifdef RAJA_ENABLE_HIP_DOUBLE_ATOMICADD
-  ,
-  double
+                                                 ,
+                                                 double
 #endif
->;
+                                                 >;
 
 /*!
  * List of types where HIP builtin atomicSub is used to implement atomicSub.
@@ -492,10 +478,7 @@ using hip_atomicSub_builtin_types = ::camp::list<
  * Avoid multiple definition errors by including the previous list type here
  * to ensure these lists have different types.
  */
-using hip_atomicSub_via_Sub_builtin_types = ::camp::list<
-  int,
-  unsigned int
->;
+using hip_atomicSub_via_Sub_builtin_types = ::camp::list<int, unsigned int>;
 
 /*!
  * List of types where HIP builtin atomicAdd is used to implement atomicSub.
@@ -503,23 +486,23 @@ using hip_atomicSub_via_Sub_builtin_types = ::camp::list<
  * Avoid multiple definition errors by including the previous list type here
  * to ensure these lists have different types.
  */
-using hip_atomicSub_via_Add_builtin_types = ::camp::list<
-  unsigned long long,
-  float
+using hip_atomicSub_via_Add_builtin_types = ::camp::list<unsigned long long,
+                                                         float
 #ifdef RAJA_ENABLE_HIP_DOUBLE_ATOMICADD
-  ,
-  double
+                                                         ,
+                                                         double
 #endif
->;
+                                                         >;
 
 /*!
  * HIP atomicSub compare and swap loop implementation.
  */
-template <typename T,
-          RAJA::util::enable_if_is_none_of<T, hip_atomicSub_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T hip_atomicSub(T *acc, T value)
+template<
+    typename T,
+    RAJA::util::enable_if_is_none_of<T, hip_atomicSub_builtin_types>* = nullptr>
+RAJA_INLINE __device__ T hip_atomicSub(T* acc, T value)
 {
-  return hip_atomicCAS_loop(acc, [value] (T old) {
+  return hip_atomicCAS_loop(acc, [value](T old) {
     return old - value;
   });
 }
@@ -527,9 +510,11 @@ RAJA_INLINE __device__ T hip_atomicSub(T *acc, T value)
 /*!
  * HIP atomicSub builtin implementation.
  */
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, hip_atomicSub_via_Sub_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T hip_atomicSub(T *acc, T value)
+template<
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, hip_atomicSub_via_Sub_builtin_types>* =
+        nullptr>
+RAJA_INLINE __device__ T hip_atomicSub(T* acc, T value)
 {
   return ::atomicSub(acc, value);
 }
@@ -537,144 +522,146 @@ RAJA_INLINE __device__ T hip_atomicSub(T *acc, T value)
 /*!
  * HIP atomicSub via atomicAdd builtin implementation.
  */
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, hip_atomicSub_via_Add_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T hip_atomicSub(T *acc, T value)
+template<
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, hip_atomicSub_via_Add_builtin_types>* =
+        nullptr>
+RAJA_INLINE __device__ T hip_atomicSub(T* acc, T value)
 {
   return ::atomicAdd(acc, -value);
 }
 
-
 /*!
  * Atomic minimum
  */
 using hip_atomicMin_builtin_types = hip_atomicCommon_builtin_types;
 
-template <typename T,
-          RAJA::util::enable_if_is_none_of<T, hip_atomicMin_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T hip_atomicMin(T *acc, T value)
+template<
+    typename T,
+    RAJA::util::enable_if_is_none_of<T, hip_atomicMin_builtin_types>* = nullptr>
+RAJA_INLINE __device__ T hip_atomicMin(T* acc, T value)
 {
   return hip_atomicCAS_loop(
-    acc,
-    [value] (T old) {
-      return value < old ? value : old;
-    },
-    [value] (T current) {
-      return current <= value;
-    });
+      acc,
+      [value](T old) {
+        return value < old ? value : old;
+      },
+      [value](T current) {
+        return current <= value;
+      });
 }
 
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, hip_atomicMin_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T hip_atomicMin(T *acc, T value)
+template<
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, hip_atomicMin_builtin_types>* = nullptr>
+RAJA_INLINE __device__ T hip_atomicMin(T* acc, T value)
 {
   return ::atomicMin(acc, value);
 }
 
-
 /*!
  * Atomic maximum
  */
 using hip_atomicMax_builtin_types = hip_atomicCommon_builtin_types;
 
-template <typename T,
-          RAJA::util::enable_if_is_none_of<T, hip_atomicMax_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T hip_atomicMax(T *acc, T value)
+template<
+    typename T,
+    RAJA::util::enable_if_is_none_of<T, hip_atomicMax_builtin_types>* = nullptr>
+RAJA_INLINE __device__ T hip_atomicMax(T* acc, T value)
 {
   return hip_atomicCAS_loop(
-    acc,
-    [value] (T old) {
-      return old < value ? value : old;
-    },
-    [value] (T current) {
-      return value <= current;
-    });
+      acc,
+      [value](T old) {
+        return old < value ? value : old;
+      },
+      [value](T current) {
+        return value <= current;
+      });
 }
 
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, hip_atomicMax_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T hip_atomicMax(T *acc, T value)
+template<
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, hip_atomicMax_builtin_types>* = nullptr>
+RAJA_INLINE __device__ T hip_atomicMax(T* acc, T value)
 {
   return ::atomicMax(acc, value);
 }
 
-
 /*!
  * Atomic increment with reset
  */
-template <typename T>
-RAJA_INLINE __device__ T hip_atomicInc(T *acc, T value)
+template<typename T>
+RAJA_INLINE __device__ T hip_atomicInc(T* acc, T value)
 {
-  return hip_atomicCAS_loop(acc, [value] (T old) {
+  return hip_atomicCAS_loop(acc, [value](T old) {
     return value <= old ? static_cast<T>(0) : old + static_cast<T>(1);
   });
 }
 
-
 /*!
  * Atomic increment (implemented in terms of atomic addition)
  */
-template <typename T>
-RAJA_INLINE __device__ T hip_atomicInc(T *acc)
+template<typename T>
+RAJA_INLINE __device__ T hip_atomicInc(T* acc)
 {
   return hip_atomicAdd(acc, static_cast<T>(1));
 }
 
-
 /*!
  * Atomic decrement with reset
  */
-template <typename T>
-RAJA_INLINE __device__ T hip_atomicDec(T *acc, T value)
+template<typename T>
+RAJA_INLINE __device__ T hip_atomicDec(T* acc, T value)
 {
-  return hip_atomicCAS_loop(acc, [value] (T old) {
-    return old == static_cast<T>(0) || value < old ? value : old - static_cast<T>(1);
+  return hip_atomicCAS_loop(acc, [value](T old) {
+    return old == static_cast<T>(0) || value < old ? value
+                                                   : old - static_cast<T>(1);
   });
 }
 
-
 /*!
  * Atomic decrement (implemented in terms of atomic subtraction)
  */
-template <typename T>
-RAJA_INLINE __device__ T hip_atomicDec(T *acc)
+template<typename T>
+RAJA_INLINE __device__ T hip_atomicDec(T* acc)
 {
   return hip_atomicSub(acc, static_cast<T>(1));
 }
 
-
 /*!
  * Atomic and
  */
 using hip_atomicAnd_builtin_types = hip_atomicCommon_builtin_types;
 
-template <typename T,
-          RAJA::util::enable_if_is_none_of<T, hip_atomicAnd_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T hip_atomicAnd(T *acc, T value)
+template<
+    typename T,
+    RAJA::util::enable_if_is_none_of<T, hip_atomicAnd_builtin_types>* = nullptr>
+RAJA_INLINE __device__ T hip_atomicAnd(T* acc, T value)
 {
-  return hip_atomicCAS_loop(acc, [value] (T old) {
+  return hip_atomicCAS_loop(acc, [value](T old) {
     return old & value;
   });
 }
 
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, hip_atomicAnd_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T hip_atomicAnd(T *acc, T value)
+template<
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, hip_atomicAnd_builtin_types>* = nullptr>
+RAJA_INLINE __device__ T hip_atomicAnd(T* acc, T value)
 {
   return ::atomicAnd(acc, value);
 }
 
-
 /*!
  * Atomic or
  */
 using hip_atomicOr_builtin_types = hip_atomicCommon_builtin_types;
 
-template <typename T,
-          RAJA::util::enable_if_is_none_of<T, hip_atomicOr_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T hip_atomicOr(T *acc, T value)
+template<
+    typename T,
+    RAJA::util::enable_if_is_none_of<T, hip_atomicOr_builtin_types>* = nullptr>
+RAJA_INLINE __device__ T hip_atomicOr(T* acc, T value)
 {
-  return hip_atomicCAS_loop(acc, [value] (T old) {
+  return hip_atomicCAS_loop(acc, [value](T old) {
     return old | value;
   });
 }
@@ -690,18 +677,20 @@ RAJA_INLINE __device__ T hip_atomicOr(T *acc, T value)
  */
 using hip_atomicXor_builtin_types = hip_atomicCommon_builtin_types;
 
-template <typename T,
-          RAJA::util::enable_if_is_none_of<T, hip_atomicXor_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T hip_atomicXor(T *acc, T value)
+template<
+    typename T,
+    RAJA::util::enable_if_is_none_of<T, hip_atomicXor_builtin_types>* = nullptr>
+RAJA_INLINE __device__ T hip_atomicXor(T* acc, T value)
 {
-  return hip_atomicCAS_loop(acc, [value] (T old) {
+  return hip_atomicCAS_loop(acc, [value](T old) {
     return old ^ value;
   });
 }
 
-template <typename T,
-          RAJA::util::enable_if_is_any_of<T, hip_atomicXor_builtin_types>* = nullptr>
-RAJA_INLINE __device__ T hip_atomicXor(T *acc, T value)
+template<
+    typename T,
+    RAJA::util::enable_if_is_any_of<T, hip_atomicXor_builtin_types>* = nullptr>
+RAJA_INLINE __device__ T hip_atomicXor(T* acc, T value)
 {
   return ::atomicXor(acc, value);
 }
@@ -709,7 +698,6 @@ RAJA_INLINE __device__ T hip_atomicXor(T *acc, T value)
 
 }  // namespace detail
 
-
 /*!
  * Catch-all policy passes off to HIP's builtin atomics.
  *
@@ -720,182 +708,193 @@ RAJA_INLINE __device__ T hip_atomicXor(T *acc, T value)
  */
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicLoad(hip_atomic_explicit<host_policy>, T *acc)
+template<typename T, typename host_policy>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicLoad(hip_atomic_explicit<host_policy>,
+                                          T* acc)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicLoad(acc);
 #else
-  return RAJA::atomicLoad(host_policy{}, acc);
+  return RAJA::atomicLoad(host_policy {}, acc);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE void
-atomicStore(hip_atomic_explicit<host_policy>, T *acc, T value)
+template<typename T, typename host_policy>
+RAJA_INLINE RAJA_HOST_DEVICE void atomicStore(hip_atomic_explicit<host_policy>,
+                                              T* acc,
+                                              T value)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   detail::hip_atomicStore(acc, value);
 #else
-  RAJA::atomicStore(host_policy{}, acc, value);
+  RAJA::atomicStore(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicAdd(hip_atomic_explicit<host_policy>, T *acc, T value)
+template<typename T, typename host_policy>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicAdd(hip_atomic_explicit<host_policy>,
+                                         T* acc,
+                                         T value)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicAdd(acc, value);
 #else
-  return RAJA::atomicAdd(host_policy{}, acc, value);
+  return RAJA::atomicAdd(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicSub(hip_atomic_explicit<host_policy>, T *acc, T value)
+template<typename T, typename host_policy>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicSub(hip_atomic_explicit<host_policy>,
+                                         T* acc,
+                                         T value)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicSub(acc, value);
 #else
-  return RAJA::atomicSub(host_policy{}, acc, value);
+  return RAJA::atomicSub(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicMin(hip_atomic_explicit<host_policy>, T *acc, T value)
+template<typename T, typename host_policy>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicMin(hip_atomic_explicit<host_policy>,
+                                         T* acc,
+                                         T value)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicMin(acc, value);
 #else
-  return RAJA::atomicMin(host_policy{}, acc, value);
+  return RAJA::atomicMin(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicMax(hip_atomic_explicit<host_policy>, T *acc, T value)
+template<typename T, typename host_policy>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicMax(hip_atomic_explicit<host_policy>,
+                                         T* acc,
+                                         T value)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicMax(acc, value);
 #else
-  return RAJA::atomicMax(host_policy{}, acc, value);
+  return RAJA::atomicMax(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicInc(hip_atomic_explicit<host_policy>, T *acc, T value)
+template<typename T, typename host_policy>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(hip_atomic_explicit<host_policy>,
+                                         T* acc,
+                                         T value)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicInc(acc, value);
 #else
-  return RAJA::atomicInc(host_policy{}, acc, value);
+  return RAJA::atomicInc(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicInc(hip_atomic_explicit<host_policy>, T *acc)
+template<typename T, typename host_policy>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicInc(hip_atomic_explicit<host_policy>,
+                                         T* acc)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicInc(acc);
 #else
-  return RAJA::atomicInc(host_policy{}, acc);
+  return RAJA::atomicInc(host_policy {}, acc);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicDec(hip_atomic_explicit<host_policy>, T *acc, T value)
+template<typename T, typename host_policy>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(hip_atomic_explicit<host_policy>,
+                                         T* acc,
+                                         T value)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicDec(acc, value);
 #else
-  return RAJA::atomicDec(host_policy{}, acc, value);
+  return RAJA::atomicDec(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicDec(hip_atomic_explicit<host_policy>, T *acc)
+template<typename T, typename host_policy>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicDec(hip_atomic_explicit<host_policy>,
+                                         T* acc)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicDec(acc);
 #else
-  return RAJA::atomicDec(host_policy{}, acc);
+  return RAJA::atomicDec(host_policy {}, acc);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicAnd(hip_atomic_explicit<host_policy>, T *acc, T value)
+template<typename T, typename host_policy>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicAnd(hip_atomic_explicit<host_policy>,
+                                         T* acc,
+                                         T value)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicAnd(acc, value);
 #else
-  return RAJA::atomicAnd(host_policy{}, acc, value);
+  return RAJA::atomicAnd(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicOr(hip_atomic_explicit<host_policy>, T *acc, T value)
+template<typename T, typename host_policy>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicOr(hip_atomic_explicit<host_policy>,
+                                        T* acc,
+                                        T value)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicOr(acc, value);
 #else
-  return RAJA::atomicOr(host_policy{}, acc, value);
+  return RAJA::atomicOr(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicXor(hip_atomic_explicit<host_policy>, T *acc, T value)
+template<typename T, typename host_policy>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicXor(hip_atomic_explicit<host_policy>,
+                                         T* acc,
+                                         T value)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicXor(acc, value);
 #else
-  return RAJA::atomicXor(host_policy{}, acc, value);
+  return RAJA::atomicXor(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T, typename host_policy>
-RAJA_INLINE RAJA_HOST_DEVICE T
-atomicExchange(hip_atomic_explicit<host_policy>, T *acc, T value)
+template<typename T, typename host_policy>
+RAJA_INLINE RAJA_HOST_DEVICE T atomicExchange(hip_atomic_explicit<host_policy>,
+                                              T* acc,
+                                              T value)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicExchange(acc, value);
 #else
-  return RAJA::atomicExchange(host_policy{}, acc, value);
+  return RAJA::atomicExchange(host_policy {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T, typename host_policy>
+template<typename T, typename host_policy>
 RAJA_INLINE RAJA_HOST_DEVICE T
-atomicCAS(hip_atomic_explicit<host_policy>, T *acc, T compare, T value)
+atomicCAS(hip_atomic_explicit<host_policy>, T* acc, T compare, T value)
 {
 #if defined(__HIP_DEVICE_COMPILE__)
   return detail::hip_atomicCAS(acc, compare, value);
 #else
-  return RAJA::atomicCAS(host_policy{}, acc, compare, value);
+  return RAJA::atomicCAS(host_policy {}, acc, compare, value);
 #endif
 }
 
diff --git a/include/RAJA/policy/hip/forall.hpp b/include/RAJA/policy/hip/forall.hpp
index a8c4cf53b9..3808387f6e 100644
--- a/include/RAJA/policy/hip/forall.hpp
+++ b/include/RAJA/policy/hip/forall.hpp
@@ -71,61 +71,91 @@ namespace impl
  *
  ******************************************************************************
  */
-template<typename IterationMapping, typename IterationGetter, typename Concretizer, typename UniqueMarker>
+template<typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         typename UniqueMarker>
 struct ForallDimensionCalculator;
 
 // The general cases handle fixed BLOCK_SIZE > 0 and/or GRID_SIZE > 0
 // there are specializations for named_usage::unspecified
 // but named_usage::ignored is not supported so no specializations are provided
 // and static_asserts in the general case catch unsupported values
-template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
-                                 ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>,
-                                 Concretizer,
-                                 UniqueMarker>
+template<named_dim dim,
+         int BLOCK_SIZE,
+         int GRID_SIZE,
+         typename Concretizer,
+         typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::Direct,
+    ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>,
+    Concretizer,
+    UniqueMarker>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall");
-  static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall");
+  static_assert(
+      BLOCK_SIZE > 0,
+      "block size must be > 0 or named_usage::unspecified with forall");
+  static_assert(
+      GRID_SIZE > 0,
+      "grid size must be > 0 or named_usage::unspecified with forall");
 
   using IndexGetter = ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>;
 
-  template < typename IdxT >
-  static void set_dimensions(internal::HipDims& dims, IdxT len,
-                             const void* RAJA_UNUSED_ARG(func), size_t RAJA_UNUSED_ARG(dynamic_shmem_size))
+  template<typename IdxT>
+  static void set_dimensions(internal::HipDims& dims,
+                             IdxT len,
+                             const void* RAJA_UNUSED_ARG(func),
+                             size_t RAJA_UNUSED_ARG(dynamic_shmem_size))
   {
     const IdxT block_size = static_cast<IdxT>(IndexGetter::block_size);
-    const IdxT grid_size = static_cast<IdxT>(IndexGetter::grid_size);
+    const IdxT grid_size  = static_cast<IdxT>(IndexGetter::grid_size);
 
-    if ( len > (block_size * grid_size) ) {
-      RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space");
+    if (len > (block_size * grid_size))
+    {
+      RAJA_ABORT_OR_THROW(
+          "len exceeds the size of the directly mapped index space");
     }
 
-    internal::set_hip_dim<dim>(dims.threads, static_cast<IdxT>(IndexGetter::block_size));
-    internal::set_hip_dim<dim>(dims.blocks, static_cast<IdxT>(IndexGetter::grid_size));
+    internal::set_hip_dim<dim>(dims.threads,
+                               static_cast<IdxT>(IndexGetter::block_size));
+    internal::set_hip_dim<dim>(dims.blocks,
+                               static_cast<IdxT>(IndexGetter::grid_size));
   }
 };
 
-template<named_dim dim, int GRID_SIZE, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
-                                 ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>,
-                                 Concretizer,
-                                 UniqueMarker>
+template<named_dim dim,
+         int GRID_SIZE,
+         typename Concretizer,
+         typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::Direct,
+    ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>,
+    Concretizer,
+    UniqueMarker>
 {
-  static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall");
-
-  using IndexGetter = ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
-
-  template < typename IdxT >
-  static void set_dimensions(internal::HipDims& dims, IdxT len,
-                             const void* func, size_t dynamic_shmem_size)
+  static_assert(
+      GRID_SIZE > 0,
+      "grid size must be > 0 or named_usage::unspecified with forall");
+
+  using IndexGetter =
+      ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
+
+  template<typename IdxT>
+  static void set_dimensions(internal::HipDims& dims,
+                             IdxT len,
+                             const void* func,
+                             size_t dynamic_shmem_size)
   {
-    ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{func, dynamic_shmem_size, len};
+    ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer {
+        func, dynamic_shmem_size, len};
 
-    const IdxT grid_size = static_cast<IdxT>(IndexGetter::grid_size);
+    const IdxT grid_size  = static_cast<IdxT>(IndexGetter::grid_size);
     const IdxT block_size = concretizer.get_block_size_to_fit_len(grid_size);
 
-    if ( block_size == IdxT(0) ) {
-      RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space");
+    if (block_size == IdxT(0))
+    {
+      RAJA_ABORT_OR_THROW(
+          "len exceeds the size of the directly mapped index space");
     }
 
     internal::set_hip_dim<dim>(dims.threads, block_size);
@@ -133,24 +163,34 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
   }
 };
 
-template<named_dim dim, int BLOCK_SIZE, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
-                                 ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>,
-                                 Concretizer,
-                                 UniqueMarker>
+template<named_dim dim,
+         int BLOCK_SIZE,
+         typename Concretizer,
+         typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::Direct,
+    ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>,
+    Concretizer,
+    UniqueMarker>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall");
-
-  using IndexGetter = ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
-
-  template < typename IdxT >
-  static void set_dimensions(internal::HipDims& dims, IdxT len,
-                             const void* func, size_t dynamic_shmem_size)
+  static_assert(
+      BLOCK_SIZE > 0,
+      "block size must be > 0 or named_usage::unspecified with forall");
+
+  using IndexGetter =
+      ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
+
+  template<typename IdxT>
+  static void set_dimensions(internal::HipDims& dims,
+                             IdxT len,
+                             const void* func,
+                             size_t dynamic_shmem_size)
   {
-    ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{func, dynamic_shmem_size, len};
+    ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer {
+        func, dynamic_shmem_size, len};
 
     const IdxT block_size = static_cast<IdxT>(IndexGetter::block_size);
-    const IdxT grid_size = concretizer.get_grid_size_to_fit_len(block_size);
+    const IdxT grid_size  = concretizer.get_grid_size_to_fit_len(block_size);
 
     internal::set_hip_dim<dim>(dims.threads, block_size);
     internal::set_hip_dim<dim>(dims.blocks, grid_size);
@@ -158,18 +198,24 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
 };
 
 template<named_dim dim, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
-                                 ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>,
-                                 Concretizer,
-                                 UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::Direct,
+    ::RAJA::hip::
+        IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>,
+    Concretizer,
+    UniqueMarker>
 {
-  using IndexGetter = ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
-
-  template < typename IdxT >
-  static void set_dimensions(internal::HipDims& dims, IdxT len,
-                             const void* func, size_t dynamic_shmem_size)
+  using IndexGetter = ::RAJA::hip::
+      IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
+
+  template<typename IdxT>
+  static void set_dimensions(internal::HipDims& dims,
+                             IdxT len,
+                             const void* func,
+                             size_t dynamic_shmem_size)
   {
-    ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{func, dynamic_shmem_size, len};
+    ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer {
+        func, dynamic_shmem_size, len};
 
     const auto sizes = concretizer.get_block_and_grid_size_to_fit_len();
 
@@ -178,46 +224,67 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::Direct,
   }
 };
 
-template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                 ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>,
-                                 Concretizer,
-                                 UniqueMarker>
+template<named_dim dim,
+         int BLOCK_SIZE,
+         int GRID_SIZE,
+         typename Concretizer,
+         typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+    ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>,
+    Concretizer,
+    UniqueMarker>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall");
-  static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall");
+  static_assert(
+      BLOCK_SIZE > 0,
+      "block size must be > 0 or named_usage::unspecified with forall");
+  static_assert(
+      GRID_SIZE > 0,
+      "grid size must be > 0 or named_usage::unspecified with forall");
 
   using IndexGetter = ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>;
 
-  template < typename IdxT >
-  static void set_dimensions(internal::HipDims& dims, IdxT RAJA_UNUSED_ARG(len),
-                             const void* RAJA_UNUSED_ARG(func), size_t RAJA_UNUSED_ARG(dynamic_shmem_size))
+  template<typename IdxT>
+  static void set_dimensions(internal::HipDims& dims,
+                             IdxT RAJA_UNUSED_ARG(len),
+                             const void* RAJA_UNUSED_ARG(func),
+                             size_t RAJA_UNUSED_ARG(dynamic_shmem_size))
   {
     const IdxT block_size = static_cast<IdxT>(IndexGetter::block_size);
-    const IdxT grid_size = static_cast<IdxT>(IndexGetter::grid_size);
+    const IdxT grid_size  = static_cast<IdxT>(IndexGetter::grid_size);
 
     internal::set_hip_dim<dim>(dims.threads, block_size);
     internal::set_hip_dim<dim>(dims.blocks, grid_size);
   }
 };
 
-template<named_dim dim, int GRID_SIZE, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                 ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>,
-                                 Concretizer,
-                                 UniqueMarker>
+template<named_dim dim,
+         int GRID_SIZE,
+         typename Concretizer,
+         typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+    ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>,
+    Concretizer,
+    UniqueMarker>
 {
-  static_assert(GRID_SIZE > 0, "grid size must be > 0 or named_usage::unspecified with forall");
-
-  using IndexGetter = ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
-
-  template < typename IdxT >
-  static void set_dimensions(internal::HipDims& dims, IdxT len,
-                             const void* func, size_t dynamic_shmem_size)
+  static_assert(
+      GRID_SIZE > 0,
+      "grid size must be > 0 or named_usage::unspecified with forall");
+
+  using IndexGetter =
+      ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
+
+  template<typename IdxT>
+  static void set_dimensions(internal::HipDims& dims,
+                             IdxT len,
+                             const void* func,
+                             size_t dynamic_shmem_size)
   {
-    ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{func, dynamic_shmem_size, len};
+    ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer {
+        func, dynamic_shmem_size, len};
 
-    const IdxT grid_size = static_cast<IdxT>(IndexGetter::grid_size);
+    const IdxT grid_size  = static_cast<IdxT>(IndexGetter::grid_size);
     const IdxT block_size = concretizer.get_block_size_to_fit_device(grid_size);
 
     internal::set_hip_dim<dim>(dims.threads, block_size);
@@ -225,24 +292,34 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_us
   }
 };
 
-template<named_dim dim, int BLOCK_SIZE, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                 ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>,
-                                 Concretizer,
-                                 UniqueMarker>
+template<named_dim dim,
+         int BLOCK_SIZE,
+         typename Concretizer,
+         typename UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+    ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>,
+    Concretizer,
+    UniqueMarker>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0 or named_usage::unspecified with forall");
-
-  using IndexGetter = ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
-
-  template < typename IdxT >
-  static void set_dimensions(internal::HipDims& dims, IdxT len,
-                             const void* func, size_t dynamic_shmem_size)
+  static_assert(
+      BLOCK_SIZE > 0,
+      "block size must be > 0 or named_usage::unspecified with forall");
+
+  using IndexGetter =
+      ::RAJA::hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
+
+  template<typename IdxT>
+  static void set_dimensions(internal::HipDims& dims,
+                             IdxT len,
+                             const void* func,
+                             size_t dynamic_shmem_size)
   {
-    ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{func, dynamic_shmem_size, len};
+    ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer {
+        func, dynamic_shmem_size, len};
 
     const IdxT block_size = static_cast<IdxT>(IndexGetter::block_size);
-    const IdxT grid_size = concretizer.get_grid_size_to_fit_device(block_size);
+    const IdxT grid_size  = concretizer.get_grid_size_to_fit_device(block_size);
 
     internal::set_hip_dim<dim>(dims.threads, block_size);
     internal::set_hip_dim<dim>(dims.blocks, grid_size);
@@ -250,18 +327,24 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_us
 };
 
 template<named_dim dim, typename Concretizer, typename UniqueMarker>
-struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                 ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>,
-                                 Concretizer,
-                                 UniqueMarker>
+struct ForallDimensionCalculator<
+    ::RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+    ::RAJA::hip::
+        IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>,
+    Concretizer,
+    UniqueMarker>
 {
-  using IndexGetter = ::RAJA::hip::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
-
-  template < typename IdxT >
-  static void set_dimensions(internal::HipDims& dims, IdxT len,
-                             const void* func, size_t dynamic_shmem_size)
+  using IndexGetter = ::RAJA::hip::
+      IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
+
+  template<typename IdxT>
+  static void set_dimensions(internal::HipDims& dims,
+                             IdxT len,
+                             const void* func,
+                             size_t dynamic_shmem_size)
   {
-    ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer{func, dynamic_shmem_size, len};
+    ::RAJA::hip::ConcretizerImpl<IdxT, Concretizer, UniqueMarker> concretizer {
+        func, dynamic_shmem_size, len};
 
     const auto sizes = concretizer.get_block_and_grid_size_to_fit_device();
 
@@ -285,215 +368,227 @@ struct ForallDimensionCalculator<::RAJA::iteration_mapping::StridedLoop<named_us
  *
  ******************************************************************************
  */
-template <typename EXEC_POL,
-          typename Iterator,
-          typename LOOP_BODY,
-          typename IndexType,
-          typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::DirectBase, IterationMapping>::value &&
-                (IterationGetter::block_size > 0),
-              size_t > BlockSize = IterationGetter::block_size>
+template<typename EXEC_POL,
+         typename Iterator,
+         typename LOOP_BODY,
+         typename IndexType,
+         typename IterationMapping = typename EXEC_POL::IterationMapping,
+         typename IterationGetter  = typename EXEC_POL::IterationGetter,
+         std::enable_if_t<std::is_base_of<iteration_mapping::DirectBase,
+                                          IterationMapping>::value &&
+                              (IterationGetter::block_size > 0),
+                          size_t> BlockSize = IterationGetter::block_size>
 __launch_bounds__(BlockSize, 1) __global__
-void forall_hip_kernel(LOOP_BODY loop_body,
-                       const Iterator idx,
-                       IndexType length)
+    void forall_hip_kernel(LOOP_BODY loop_body,
+                           const Iterator idx,
+                           IndexType length)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
-  auto& body = privatizer.get_priv();
-  auto ii = IterationGetter::template index<IndexType>();
-  if (ii < length) {
+  auto& body      = privatizer.get_priv();
+  auto ii         = IterationGetter::template index<IndexType>();
+  if (ii < length)
+  {
     body(idx[ii]);
   }
 }
+
 ///
-template <typename EXEC_POL,
-          typename Iterator,
-          typename LOOP_BODY,
-          typename IndexType,
-          typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::DirectBase, IterationMapping>::value &&
-                (IterationGetter::block_size <= 0),
-              size_t > RAJA_UNUSED_ARG(BlockSize) = 0>
-__global__
-void forall_hip_kernel(LOOP_BODY loop_body,
-                       const Iterator idx,
-                       IndexType length)
+template<typename EXEC_POL,
+         typename Iterator,
+         typename LOOP_BODY,
+         typename IndexType,
+         typename IterationMapping = typename EXEC_POL::IterationMapping,
+         typename IterationGetter  = typename EXEC_POL::IterationGetter,
+         std::enable_if_t<std::is_base_of<iteration_mapping::DirectBase,
+                                          IterationMapping>::value &&
+                              (IterationGetter::block_size <= 0),
+                          size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
+__global__ void forall_hip_kernel(LOOP_BODY loop_body,
+                                  const Iterator idx,
+                                  IndexType length)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
-  auto& body = privatizer.get_priv();
-  auto ii = IterationGetter::template index<IndexType>();
-  if (ii < length) {
+  auto& body      = privatizer.get_priv();
+  auto ii         = IterationGetter::template index<IndexType>();
+  if (ii < length)
+  {
     body(idx[ii]);
   }
 }
 
-template <typename EXEC_POL,
-          typename Iterator,
-          typename LOOP_BODY,
-          typename IndexType,
-          typename ForallParam,
-          typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::DirectBase, IterationMapping>::value &&
-                (IterationGetter::block_size > 0),
-              size_t > BlockSize = IterationGetter::block_size>
+template<typename EXEC_POL,
+         typename Iterator,
+         typename LOOP_BODY,
+         typename IndexType,
+         typename ForallParam,
+         typename IterationMapping = typename EXEC_POL::IterationMapping,
+         typename IterationGetter  = typename EXEC_POL::IterationGetter,
+         std::enable_if_t<std::is_base_of<iteration_mapping::DirectBase,
+                                          IterationMapping>::value &&
+                              (IterationGetter::block_size > 0),
+                          size_t> BlockSize = IterationGetter::block_size>
 __launch_bounds__(BlockSize, 1) __global__
-void forallp_hip_kernel(LOOP_BODY loop_body,
-                        const Iterator idx,
-                        IndexType length,
-                        ForallParam f_params)
+    void forallp_hip_kernel(LOOP_BODY loop_body,
+                            const Iterator idx,
+                            IndexType length,
+                            ForallParam f_params)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
-  auto& body = privatizer.get_priv();
-  auto ii = IterationGetter::template index<IndexType>();
-  if ( ii < length ) {
-    RAJA::expt::invoke_body( f_params, body, idx[ii] );
+  auto& body      = privatizer.get_priv();
+  auto ii         = IterationGetter::template index<IndexType>();
+  if (ii < length)
+  {
+    RAJA::expt::invoke_body(f_params, body, idx[ii]);
   }
   RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(f_params);
 }
+
 ///
-template <typename EXEC_POL,
-          typename Iterator,
-          typename LOOP_BODY,
-          typename IndexType,
-          typename ForallParam,
-          typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::DirectBase, IterationMapping>::value &&
-                (IterationGetter::block_size <= 0),
-              size_t > RAJA_UNUSED_ARG(BlockSize) = 0>
-__global__
-void forallp_hip_kernel(LOOP_BODY loop_body,
-                        const Iterator idx,
-                        IndexType length,
-                        ForallParam f_params)
+template<typename EXEC_POL,
+         typename Iterator,
+         typename LOOP_BODY,
+         typename IndexType,
+         typename ForallParam,
+         typename IterationMapping = typename EXEC_POL::IterationMapping,
+         typename IterationGetter  = typename EXEC_POL::IterationGetter,
+         std::enable_if_t<std::is_base_of<iteration_mapping::DirectBase,
+                                          IterationMapping>::value &&
+                              (IterationGetter::block_size <= 0),
+                          size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
+__global__ void forallp_hip_kernel(LOOP_BODY loop_body,
+                                   const Iterator idx,
+                                   IndexType length,
+                                   ForallParam f_params)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
-  auto& body = privatizer.get_priv();
-  auto ii = IterationGetter::template index<IndexType>();
-  if ( ii < length ) {
-    RAJA::expt::invoke_body( f_params, body, idx[ii] );
+  auto& body      = privatizer.get_priv();
+  auto ii         = IterationGetter::template index<IndexType>();
+  if (ii < length)
+  {
+    RAJA::expt::invoke_body(f_params, body, idx[ii]);
   }
   RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(f_params);
 }
 
-template <typename EXEC_POL,
-          typename Iterator,
-          typename LOOP_BODY,
-          typename IndexType,
-          typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::StridedLoopBase, IterationMapping>::value &&
-                std::is_base_of<iteration_mapping::UnsizedLoopBase, IterationMapping>::value &&
-                (IterationGetter::block_size > 0),
-              size_t > BlockSize = IterationGetter::block_size>
+template<
+    typename EXEC_POL,
+    typename Iterator,
+    typename LOOP_BODY,
+    typename IndexType,
+    typename IterationMapping          = typename EXEC_POL::IterationMapping,
+    typename IterationGetter           = typename EXEC_POL::IterationGetter,
+    std::enable_if_t<std::is_base_of<iteration_mapping::StridedLoopBase,
+                                     IterationMapping>::value &&
+                         std::is_base_of<iteration_mapping::UnsizedLoopBase,
+                                         IterationMapping>::value &&
+                         (IterationGetter::block_size > 0),
+                     size_t> BlockSize = IterationGetter::block_size>
 __launch_bounds__(BlockSize, 1) __global__
-void forall_hip_kernel(LOOP_BODY loop_body,
-                       const Iterator idx,
-                       IndexType length)
+    void forall_hip_kernel(LOOP_BODY loop_body,
+                           const Iterator idx,
+                           IndexType length)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
-  auto& body = privatizer.get_priv();
-  for (auto ii = IterationGetter::template index<IndexType>();
-       ii < length;
-       ii += IterationGetter::template size<IndexType>()) {
+  auto& body      = privatizer.get_priv();
+  for (auto ii = IterationGetter::template index<IndexType>(); ii < length;
+       ii += IterationGetter::template size<IndexType>())
+  {
     body(idx[ii]);
   }
 }
+
 ///
-template <typename EXEC_POL,
-          typename Iterator,
-          typename LOOP_BODY,
-          typename IndexType,
-          typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::StridedLoopBase, IterationMapping>::value &&
-                std::is_base_of<iteration_mapping::UnsizedLoopBase, IterationMapping>::value &&
-                (IterationGetter::block_size <= 0),
-              size_t > RAJA_UNUSED_ARG(BlockSize) = 0>
-__global__
-void forall_hip_kernel(LOOP_BODY loop_body,
-                       const Iterator idx,
-                       IndexType length)
+template<
+    typename EXEC_POL,
+    typename Iterator,
+    typename LOOP_BODY,
+    typename IndexType,
+    typename IterationMapping = typename EXEC_POL::IterationMapping,
+    typename IterationGetter  = typename EXEC_POL::IterationGetter,
+    std::enable_if_t<std::is_base_of<iteration_mapping::StridedLoopBase,
+                                     IterationMapping>::value &&
+                         std::is_base_of<iteration_mapping::UnsizedLoopBase,
+                                         IterationMapping>::value &&
+                         (IterationGetter::block_size <= 0),
+                     size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
+__global__ void forall_hip_kernel(LOOP_BODY loop_body,
+                                  const Iterator idx,
+                                  IndexType length)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
-  auto& body = privatizer.get_priv();
-  for (auto ii = IterationGetter::template index<IndexType>();
-       ii < length;
-       ii += IterationGetter::template size<IndexType>()) {
+  auto& body      = privatizer.get_priv();
+  for (auto ii = IterationGetter::template index<IndexType>(); ii < length;
+       ii += IterationGetter::template size<IndexType>())
+  {
     body(idx[ii]);
   }
 }
 
 ///
-template <typename EXEC_POL,
-          typename Iterator,
-          typename LOOP_BODY,
-          typename IndexType,
-          typename ForallParam,
-          typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::StridedLoopBase, IterationMapping>::value &&
-                std::is_base_of<iteration_mapping::UnsizedLoopBase, IterationMapping>::value &&
-                (IterationGetter::block_size > 0),
-              size_t > BlockSize = IterationGetter::block_size>
+template<
+    typename EXEC_POL,
+    typename Iterator,
+    typename LOOP_BODY,
+    typename IndexType,
+    typename ForallParam,
+    typename IterationMapping          = typename EXEC_POL::IterationMapping,
+    typename IterationGetter           = typename EXEC_POL::IterationGetter,
+    std::enable_if_t<std::is_base_of<iteration_mapping::StridedLoopBase,
+                                     IterationMapping>::value &&
+                         std::is_base_of<iteration_mapping::UnsizedLoopBase,
+                                         IterationMapping>::value &&
+                         (IterationGetter::block_size > 0),
+                     size_t> BlockSize = IterationGetter::block_size>
 __launch_bounds__(BlockSize, 1) __global__
-void forallp_hip_kernel(LOOP_BODY loop_body,
-                        const Iterator idx,
-                        IndexType length,
-                        ForallParam f_params)
+    void forallp_hip_kernel(LOOP_BODY loop_body,
+                            const Iterator idx,
+                            IndexType length,
+                            ForallParam f_params)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
-  auto& body = privatizer.get_priv();
-  for (auto ii = IterationGetter::template index<IndexType>();
-       ii < length;
-       ii += IterationGetter::template size<IndexType>()) {
-    RAJA::expt::invoke_body( f_params, body, idx[ii] );
+  auto& body      = privatizer.get_priv();
+  for (auto ii = IterationGetter::template index<IndexType>(); ii < length;
+       ii += IterationGetter::template size<IndexType>())
+  {
+    RAJA::expt::invoke_body(f_params, body, idx[ii]);
   }
   RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(f_params);
 }
+
 ///
-template <typename EXEC_POL,
-          typename Iterator,
-          typename LOOP_BODY,
-          typename IndexType,
-          typename ForallParam,
-          typename IterationMapping = typename EXEC_POL::IterationMapping,
-          typename IterationGetter = typename EXEC_POL::IterationGetter,
-          std::enable_if_t<
-                std::is_base_of<iteration_mapping::StridedLoopBase, IterationMapping>::value &&
-                std::is_base_of<iteration_mapping::UnsizedLoopBase, IterationMapping>::value &&
-                (IterationGetter::block_size <= 0),
-              size_t > RAJA_UNUSED_ARG(BlockSize) = 0>
-__global__
-void forallp_hip_kernel(LOOP_BODY loop_body,
-                        const Iterator idx,
-                        IndexType length,
-                        ForallParam f_params)
+template<
+    typename EXEC_POL,
+    typename Iterator,
+    typename LOOP_BODY,
+    typename IndexType,
+    typename ForallParam,
+    typename IterationMapping = typename EXEC_POL::IterationMapping,
+    typename IterationGetter  = typename EXEC_POL::IterationGetter,
+    std::enable_if_t<std::is_base_of<iteration_mapping::StridedLoopBase,
+                                     IterationMapping>::value &&
+                         std::is_base_of<iteration_mapping::UnsizedLoopBase,
+                                         IterationMapping>::value &&
+                         (IterationGetter::block_size <= 0),
+                     size_t> RAJA_UNUSED_ARG(BlockSize) = 0>
+__global__ void forallp_hip_kernel(LOOP_BODY loop_body,
+                                   const Iterator idx,
+                                   IndexType length,
+                                   ForallParam f_params)
 {
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(loop_body);
-  auto& body = privatizer.get_priv();
-  for (auto ii = IterationGetter::template index<IndexType>();
-       ii < length;
-       ii += IterationGetter::template size<IndexType>()) {
-    RAJA::expt::invoke_body( f_params, body, idx[ii] );
+  auto& body      = privatizer.get_priv();
+  for (auto ii = IterationGetter::template index<IndexType>(); ii < length;
+       ii += IterationGetter::template size<IndexType>())
+  {
+    RAJA::expt::invoke_body(f_params, body, idx[ii]);
   }
   RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(f_params);
 }
@@ -508,37 +603,48 @@ void forallp_hip_kernel(LOOP_BODY loop_body,
 ////////////////////////////////////////////////////////////////////////
 //
 
-template <typename Iterable, typename LoopBody,
-          typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
-          typename ForallParam>
-RAJA_INLINE 
-concepts::enable_if_t<
-  resources::EventProxy<resources::Hip>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
-forall_impl(resources::Hip hip_res,
-            ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>const&,
-            Iterable&& iter,
-            LoopBody&& loop_body,
-            ForallParam)
+template<typename Iterable,
+         typename LoopBody,
+         typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         bool Async,
+         typename ForallParam>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Hip>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
+forall_impl(
+    resources::Hip hip_res,
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async> const&,
+    Iterable&& iter,
+    LoopBody&& loop_body,
+    ForallParam)
 {
   using Iterator  = camp::decay<decltype(std::begin(iter))>;
   using LOOP_BODY = camp::decay<LoopBody>;
-  using IndexType = camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
-  using EXEC_POL = ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>;
-  using UniqueMarker = ::camp::list<IterationMapping, IterationGetter, LOOP_BODY, Iterator, ForallParam>;
-  using DimensionCalculator = impl::ForallDimensionCalculator<IterationMapping, IterationGetter, Concretizer, UniqueMarker>;
+  using IndexType =
+      camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
+  using EXEC_POL =
+      ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter,
+                                    Concretizer, Async>;
+  using UniqueMarker = ::camp::list<IterationMapping, IterationGetter,
+                                    LOOP_BODY, Iterator, ForallParam>;
+  using DimensionCalculator =
+      impl::ForallDimensionCalculator<IterationMapping, IterationGetter,
+                                      Concretizer, UniqueMarker>;
 
   //
   // Compute the requested iteration space size
   //
   Iterator begin = std::begin(iter);
-  Iterator end = std::end(iter);
-  IndexType len = std::distance(begin, end);
+  Iterator end   = std::end(iter);
+  IndexType len  = std::distance(begin, end);
 
   // Only launch kernel if we have something to iterate over
-  if (len > 0) {
+  if (len > 0)
+  {
 
     auto func = reinterpret_cast<const void*>(
         &impl::forall_hip_kernel<EXEC_POL, Iterator, LOOP_BODY, IndexType>);
@@ -560,14 +666,16 @@ forall_impl(resources::Hip hip_res,
       //
       // Privatize the loop_body, using make_launch_body to setup reductions
       //
-      LOOP_BODY body = RAJA::hip::make_launch_body(func,
-          dims.blocks, dims.threads, shmem, hip_res, std::forward<LoopBody>(loop_body));
+      LOOP_BODY body = RAJA::hip::make_launch_body(
+          func, dims.blocks, dims.threads, shmem, hip_res,
+          std::forward<LoopBody>(loop_body));
 
       //
       // Launch the kernels
       //
-      void *args[] = {(void*)&body, (void*)&begin, (void*)&len};
-      RAJA::hip::launch(func, dims.blocks, dims.threads, args, shmem, hip_res, Async);
+      void* args[] = {(void*)&body, (void*)&begin, (void*)&len};
+      RAJA::hip::launch(func, dims.blocks, dims.threads, args, shmem, hip_res,
+                        Async);
     }
 
     RAJA_FT_END;
@@ -576,38 +684,49 @@ forall_impl(resources::Hip hip_res,
   return resources::EventProxy<resources::Hip>(hip_res);
 }
 
-
-template <typename Iterable, typename LoopBody,
-          typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
-          typename ForallParam>
-RAJA_INLINE 
-concepts::enable_if_t<
-  resources::EventProxy<resources::Hip>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  concepts::negate< RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>> >
-forall_impl(resources::Hip hip_res,
-            ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async> const&,
-            Iterable&& iter,
-            LoopBody&& loop_body,
-            ForallParam f_params)
+template<typename Iterable,
+         typename LoopBody,
+         typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         bool Async,
+         typename ForallParam>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Hip>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    concepts::negate<
+        RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>>
+forall_impl(
+    resources::Hip hip_res,
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async> const&,
+    Iterable&& iter,
+    LoopBody&& loop_body,
+    ForallParam f_params)
 {
   using Iterator  = camp::decay<decltype(std::begin(iter))>;
   using LOOP_BODY = camp::decay<LoopBody>;
-  using IndexType = camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
-  using EXEC_POL = ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>;
-  using UniqueMarker = ::camp::list<IterationMapping, IterationGetter, LOOP_BODY, Iterator, ForallParam>;
-  using DimensionCalculator = impl::ForallDimensionCalculator<IterationMapping, IterationGetter, Concretizer, UniqueMarker>;
+  using IndexType =
+      camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
+  using EXEC_POL =
+      ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter,
+                                    Concretizer, Async>;
+  using UniqueMarker = ::camp::list<IterationMapping, IterationGetter,
+                                    LOOP_BODY, Iterator, ForallParam>;
+  using DimensionCalculator =
+      impl::ForallDimensionCalculator<IterationMapping, IterationGetter,
+                                      Concretizer, UniqueMarker>;
 
   //
   // Compute the requested iteration space size
   //
   Iterator begin = std::begin(iter);
-  Iterator end = std::end(iter);
-  IndexType len = std::distance(begin, end);
+  Iterator end   = std::end(iter);
+  IndexType len  = std::distance(begin, end);
 
   // Only launch kernel if we have something to iterate over
-  if (len > 0) {
+  if (len > 0)
+  {
 
     auto func = reinterpret_cast<const void*>(
         &impl::forallp_hip_kernel<EXEC_POL, Iterator, LOOP_BODY, IndexType,
@@ -627,9 +746,9 @@ forall_impl(resources::Hip hip_res,
     RAJA_FT_BEGIN;
 
     RAJA::hip::detail::hipInfo launch_info;
-    launch_info.gridDim = dims.blocks;
+    launch_info.gridDim  = dims.blocks;
     launch_info.blockDim = dims.threads;
-    launch_info.res = hip_res;
+    launch_info.res      = hip_res;
 
     {
       RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params, launch_info);
@@ -637,14 +756,17 @@ forall_impl(resources::Hip hip_res,
       //
       // Privatize the loop_body, using make_launch_body to setup reductions
       //
-      LOOP_BODY body = RAJA::hip::make_launch_body(func,
-          dims.blocks, dims.threads, shmem, hip_res, std::forward<LoopBody>(loop_body));
+      LOOP_BODY body = RAJA::hip::make_launch_body(
+          func, dims.blocks, dims.threads, shmem, hip_res,
+          std::forward<LoopBody>(loop_body));
 
       //
       // Launch the kernels
       //
-      void *args[] = {(void*)&body, (void*)&begin, (void*)&len, (void*)&f_params};
-      RAJA::hip::launch(func, dims.blocks, dims.threads, args, shmem, hip_res, Async);
+      void* args[] = {(void*)&body, (void*)&begin, (void*)&len,
+                      (void*)&f_params};
+      RAJA::hip::launch(func, dims.blocks, dims.threads, args, shmem, hip_res,
+                        Async);
 
       RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params, launch_info);
     }
@@ -655,7 +777,6 @@ forall_impl(resources::Hip hip_res,
   return resources::EventProxy<resources::Hip>(hip_res);
 }
 
-
 //
 //////////////////////////////////////////////////////////////////////
 //
@@ -674,23 +795,29 @@ forall_impl(resources::Hip hip_res,
  *
  ******************************************************************************
  */
-template <typename LoopBody,
-          typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
-          typename... SegmentTypes>
-RAJA_INLINE resources::EventProxy<resources::Hip>
-forall_impl(resources::Hip r,
-            ExecPolicy<seq_segit, ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>>,
-            const TypedIndexSet<SegmentTypes...>& iset,
-            LoopBody&& loop_body)
+template<typename LoopBody,
+         typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         bool Async,
+         typename... SegmentTypes>
+RAJA_INLINE resources::EventProxy<resources::Hip> forall_impl(
+    resources::Hip r,
+    ExecPolicy<
+        seq_segit,
+        ::RAJA::policy::hip::
+            hip_exec<IterationMapping, IterationGetter, Concretizer, Async>>,
+    const TypedIndexSet<SegmentTypes...>& iset,
+    LoopBody&& loop_body)
 {
   int num_seg = iset.getNumSegments();
-  for (int isi = 0; isi < num_seg; ++isi) {
-    iset.segmentCall(r,
-                     isi,
-                     detail::CallForall(),
-                     ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, true>(),
-                     loop_body);
+  for (int isi = 0; isi < num_seg; ++isi)
+  {
+    iset.segmentCall(
+        r, isi, detail::CallForall(),
+        ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter,
+                                      Concretizer, true>(),
+        loop_body);
   }  // iterate over segments of index set
 
   if (!Async) RAJA::hip::synchronize(r);
diff --git a/include/RAJA/policy/hip/intrinsics.hpp b/include/RAJA/policy/hip/intrinsics.hpp
index c72a0b5c4f..d106329ad0 100644
--- a/include/RAJA/policy/hip/intrinsics.hpp
+++ b/include/RAJA/policy/hip/intrinsics.hpp
@@ -35,7 +35,6 @@
 
 #include "RAJA/policy/hip/policy.hpp"
 
-
 namespace RAJA
 {
 
@@ -59,15 +58,9 @@ namespace impl
  */
 struct AccessorDeviceScopeUseDeviceFence : RAJA::detail::DefaultAccessor
 {
-  static RAJA_DEVICE RAJA_INLINE void fence_acquire()
-  {
-    __threadfence();
-  }
+  static RAJA_DEVICE RAJA_INLINE void fence_acquire() { __threadfence(); }
 
-  static RAJA_DEVICE RAJA_INLINE void fence_release()
-  {
-    __threadfence();
-  }
+  static RAJA_DEVICE RAJA_INLINE void fence_release() { __threadfence(); }
 };
 
 /*!
@@ -96,18 +89,23 @@ struct AccessorDeviceScopeUseBlockFence
   static constexpr size_t min_atomic_int_type_size = sizeof(unsigned int);
   static constexpr size_t max_atomic_int_type_size = sizeof(unsigned long long);
 
-  template < typename T >
+  template<typename T>
   static RAJA_DEVICE RAJA_INLINE T get(T* in_ptr, size_t idx)
   {
-    using ArrayType = RAJA::detail::AsIntegerArray<T, min_atomic_int_type_size, max_atomic_int_type_size>;
+    using ArrayType = RAJA::detail::AsIntegerArray<T, min_atomic_int_type_size,
+                                                   max_atomic_int_type_size>;
     using integer_type = typename ArrayType::integer_type;
 
     ArrayType u;
-    auto ptr = const_cast<integer_type*>(reinterpret_cast<const integer_type*>(in_ptr + idx));
-
-    for (size_t i = 0; i < u.array_size(); ++i) {
-#if defined(RAJA_USE_HIP_INTRINSICS) && RAJA_INTERNAL_CLANG_HAS_BUILTIN(__hip_atomic_load)
-      u.array[i] = __hip_atomic_load(&ptr[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    auto ptr = const_cast<integer_type*>(
+        reinterpret_cast<const integer_type*>(in_ptr + idx));
+
+    for (size_t i = 0; i < u.array_size(); ++i)
+    {
+#if defined(RAJA_USE_HIP_INTRINSICS) &&                                        \
+    RAJA_INTERNAL_CLANG_HAS_BUILTIN(__hip_atomic_load)
+      u.array[i] = __hip_atomic_load(&ptr[i], __ATOMIC_RELAXED,
+                                     __HIP_MEMORY_SCOPE_AGENT);
 #else
       u.array[i] = atomicAdd(&ptr[i], integer_type(0));
 #endif
@@ -116,19 +114,23 @@ struct AccessorDeviceScopeUseBlockFence
     return u.get_value();
   }
 
-  template < typename T >
+  template<typename T>
   static RAJA_DEVICE RAJA_INLINE void set(T* in_ptr, size_t idx, T val)
   {
-    using ArrayType = RAJA::detail::AsIntegerArray<T, min_atomic_int_type_size, max_atomic_int_type_size>;
+    using ArrayType = RAJA::detail::AsIntegerArray<T, min_atomic_int_type_size,
+                                                   max_atomic_int_type_size>;
     using integer_type = typename ArrayType::integer_type;
 
     ArrayType u;
     u.set_value(val);
     auto ptr = reinterpret_cast<integer_type*>(in_ptr + idx);
 
-    for (size_t i = 0; i < u.array_size(); ++i) {
-#if defined(RAJA_USE_HIP_INTRINSICS) && RAJA_INTERNAL_CLANG_HAS_BUILTIN(__hip_atomic_store)
-      __hip_atomic_store(&ptr[i], u.array[i], __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT);
+    for (size_t i = 0; i < u.array_size(); ++i)
+    {
+#if defined(RAJA_USE_HIP_INTRINSICS) &&                                        \
+    RAJA_INTERNAL_CLANG_HAS_BUILTIN(__hip_atomic_store)
+      __hip_atomic_store(&ptr[i], u.array[i], __ATOMIC_RELAXED,
+                         __HIP_MEMORY_SCOPE_AGENT);
 #else
       atomicExch(&ptr[i], u.array[i]);
 #endif
@@ -137,7 +139,8 @@ struct AccessorDeviceScopeUseBlockFence
 
   static RAJA_DEVICE RAJA_INLINE void fence_acquire()
   {
-#if defined(RAJA_USE_HIP_INTRINSICS) && RAJA_INTERNAL_CLANG_HAS_BUILTIN(__builtin_amdgcn_fence)
+#if defined(RAJA_USE_HIP_INTRINSICS) &&                                        \
+    RAJA_INTERNAL_CLANG_HAS_BUILTIN(__builtin_amdgcn_fence)
     __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
 #else
     __threadfence();
@@ -146,18 +149,19 @@ struct AccessorDeviceScopeUseBlockFence
 
   static RAJA_DEVICE RAJA_INLINE void fence_release()
   {
-#if defined(RAJA_USE_HIP_INTRINSICS) && RAJA_INTERNAL_CLANG_HAS_BUILTIN(__builtin_amdgcn_fence) && \
-                                        RAJA_INTERNAL_CLANG_HAS_BUILTIN(__builtin_amdgcn_s_waitcnt)
+#if defined(RAJA_USE_HIP_INTRINSICS) &&                                        \
+    RAJA_INTERNAL_CLANG_HAS_BUILTIN(__builtin_amdgcn_fence) &&                 \
+    RAJA_INTERNAL_CLANG_HAS_BUILTIN(__builtin_amdgcn_s_waitcnt)
     __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
     // Wait until all vmem operations complete (s_waitcnt vmcnt(0))
-    __builtin_amdgcn_s_waitcnt(/*vmcnt*/ 0 | (/*exp_cnt*/ 0x7 << 4) | (/*lgkmcnt*/ 0xf << 8));
+    __builtin_amdgcn_s_waitcnt(/*vmcnt*/ 0 | (/*exp_cnt*/ 0x7 << 4) |
+                               (/*lgkmcnt*/ 0xf << 8));
 #else
     __threadfence();
 #endif
   }
 };
 
-
 // hip only has shfl primitives for 32 bits
 constexpr size_t min_shfl_int_type_size = sizeof(unsigned int);
 constexpr size_t max_shfl_int_type_size = sizeof(unsigned int);
@@ -172,58 +176,62 @@ constexpr size_t max_shfl_int_type_size = sizeof(unsigned int);
  *
  ******************************************************************************
  */
-template <typename T>
+template<typename T>
 RAJA_DEVICE RAJA_INLINE T shfl_xor_sync(T var, int laneMask)
 {
-  RAJA::detail::AsIntegerArray<T, min_shfl_int_type_size, max_shfl_int_type_size> u;
+  RAJA::detail::AsIntegerArray<T, min_shfl_int_type_size,
+                               max_shfl_int_type_size>
+      u;
   u.set_value(var);
 
-  for (size_t i = 0; i < u.array_size(); ++i) {
+  for (size_t i = 0; i < u.array_size(); ++i)
+  {
     u.array[i] = ::__shfl_xor(u.array[i], laneMask);
   }
   return u.get_value();
 }
 
-template <typename T>
+template<typename T>
 RAJA_DEVICE RAJA_INLINE T shfl_sync(T var, int srcLane)
 {
-  RAJA::detail::AsIntegerArray<T, min_shfl_int_type_size, max_shfl_int_type_size> u;
+  RAJA::detail::AsIntegerArray<T, min_shfl_int_type_size,
+                               max_shfl_int_type_size>
+      u;
   u.set_value(var);
 
-  for (size_t i = 0; i < u.array_size(); ++i) {
+  for (size_t i = 0; i < u.array_size(); ++i)
+  {
     u.array[i] = ::__shfl(u.array[i], srcLane);
   }
   return u.get_value();
 }
 
-
-template <>
+template<>
 RAJA_DEVICE RAJA_INLINE int shfl_xor_sync<int>(int var, int laneMask)
 {
   return ::__shfl_xor(var, laneMask);
 }
 
-template <>
+template<>
 RAJA_DEVICE RAJA_INLINE float shfl_xor_sync<float>(float var, int laneMask)
 {
   return ::__shfl_xor(var, laneMask);
 }
 
-template <>
+template<>
 RAJA_DEVICE RAJA_INLINE int shfl_sync<int>(int var, int srcLane)
 {
   return ::__shfl(var, srcLane);
 }
 
-template <>
+template<>
 RAJA_DEVICE RAJA_INLINE float shfl_sync<float>(float var, int srcLane)
 {
   return ::__shfl(var, srcLane);
 }
 
-
 //! reduce values in block into thread 0
-template <typename Combiner, typename T>
+template<typename Combiner, typename T>
 RAJA_DEVICE RAJA_INLINE T warp_reduce(T val, T RAJA_UNUSED_ARG(identity))
 {
   int numThreads = blockDim.x * blockDim.y * blockDim.z;
@@ -233,23 +241,28 @@ RAJA_DEVICE RAJA_INLINE T warp_reduce(T val, T RAJA_UNUSED_ARG(identity))
 
   T temp = val;
 
-  if (numThreads % policy::hip::device_constants.WARP_SIZE == 0) {
+  if (numThreads % policy::hip::device_constants.WARP_SIZE == 0)
+  {
 
     // reduce each warp
-    for (int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2) {
+    for (int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2)
+    {
       T rhs = shfl_xor_sync(temp, i);
-      Combiner{}(temp, rhs);
+      Combiner {}(temp, rhs);
     }
-
-  } else {
+  }
+  else
+  {
 
     // reduce each warp
-    for (int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2) {
+    for (int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2)
+    {
       int srcLane = threadId ^ i;
-      T rhs = shfl_sync(temp, srcLane);
+      T rhs       = shfl_sync(temp, srcLane);
       // only add from threads that exist (don't double count own value)
-      if (srcLane < numThreads) {
-        Combiner{}(temp, rhs);
+      if (srcLane < numThreads)
+      {
+        Combiner {}(temp, rhs);
       }
     }
   }
@@ -264,22 +277,22 @@ RAJA_DEVICE RAJA_INLINE T warp_reduce(T val, T RAJA_UNUSED_ARG(identity))
  * This does a butterfly pattern leaving each lane with the full reduction
  *
  */
-template <typename Combiner, typename T>
+template<typename Combiner, typename T>
 RAJA_DEVICE RAJA_INLINE T warp_allreduce(T val)
 {
   T temp = val;
 
-  for (int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2) {
+  for (int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2)
+  {
     T rhs = shfl_xor_sync(temp, i);
-    Combiner{}(temp, rhs);
+    Combiner {}(temp, rhs);
   }
 
   return temp;
 }
 
-
 //! reduce values in block into thread 0
-template <typename Combiner, typename T>
+template<typename Combiner, typename T>
 RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity)
 {
   int numThreads = blockDim.x * blockDim.y * blockDim.z;
@@ -287,61 +300,77 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity)
   int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                  (blockDim.x * blockDim.y) * threadIdx.z;
 
-  int warpId = threadId % policy::hip::device_constants.WARP_SIZE;
+  int warpId  = threadId % policy::hip::device_constants.WARP_SIZE;
   int warpNum = threadId / policy::hip::device_constants.WARP_SIZE;
 
   T temp = val;
 
-  if (numThreads % policy::hip::device_constants.WARP_SIZE == 0) {
+  if (numThreads % policy::hip::device_constants.WARP_SIZE == 0)
+  {
 
     // reduce each warp
-    for (int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2) {
+    for (int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2)
+    {
       T rhs = shfl_xor_sync(temp, i);
-      Combiner{}(temp, rhs);
+      Combiner {}(temp, rhs);
     }
-
-  } else {
+  }
+  else
+  {
 
     // reduce each warp
-    for (int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2) {
+    for (int i = 1; i < policy::hip::device_constants.WARP_SIZE; i *= 2)
+    {
       int srcLane = threadId ^ i;
-      T rhs = shfl_sync(temp, srcLane);
+      T rhs       = shfl_sync(temp, srcLane);
       // only add from threads that exist (don't double count own value)
-      if (srcLane < numThreads) {
-        Combiner{}(temp, rhs);
+      if (srcLane < numThreads)
+      {
+        Combiner {}(temp, rhs);
       }
     }
   }
 
   // reduce per warp values
-  if (numThreads > policy::hip::device_constants.WARP_SIZE) {
+  if (numThreads > policy::hip::device_constants.WARP_SIZE)
+  {
 
-    static_assert(policy::hip::device_constants.MAX_WARPS <= policy::hip::device_constants.WARP_SIZE,
-        "This algorithms assumes a warp of WARP_SIZE threads can reduce MAX_WARPS values");
+    static_assert(policy::hip::device_constants.MAX_WARPS <=
+                      policy::hip::device_constants.WARP_SIZE,
+                  "This algorithms assumes a warp of WARP_SIZE threads can "
+                  "reduce MAX_WARPS values");
 
-    __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray<T, policy::hip::device_constants.MAX_WARPS>)];
+    __shared__ unsigned char tmpsd[sizeof(
+        RAJA::detail::SoAArray<T, policy::hip::device_constants.MAX_WARPS>)];
     RAJA::detail::SoAArray<T, policy::hip::device_constants.MAX_WARPS>* sd =
-      reinterpret_cast<RAJA::detail::SoAArray<T, policy::hip::device_constants.MAX_WARPS> *>(tmpsd);
+        reinterpret_cast<RAJA::detail::SoAArray<
+            T, policy::hip::device_constants.MAX_WARPS>*>(tmpsd);
 
     // write per warp values to shared memory
-    if (warpId == 0) {
+    if (warpId == 0)
+    {
       sd->set(warpNum, temp);
     }
 
     __syncthreads();
 
-    if (warpNum == 0) {
+    if (warpNum == 0)
+    {
 
       // read per warp values
-      if (warpId * policy::hip::device_constants.WARP_SIZE < numThreads) {
+      if (warpId * policy::hip::device_constants.WARP_SIZE < numThreads)
+      {
         temp = sd->get(warpId);
-      } else {
+      }
+      else
+      {
         temp = identity;
       }
 
-      for (int i = 1; i < policy::hip::device_constants.MAX_WARPS; i *= 2) {
+      for (int i = 1; i < policy::hip::device_constants.MAX_WARPS; i *= 2)
+      {
         T rhs = shfl_xor_sync(temp, i);
-        Combiner{}(temp, rhs);
+        Combiner {}(temp, rhs);
       }
     }
 
diff --git a/include/RAJA/policy/hip/kernel/Conditional.hpp b/include/RAJA/policy/hip/kernel/Conditional.hpp
index 3204845544..976115ccab 100644
--- a/include/RAJA/policy/hip/kernel/Conditional.hpp
+++ b/include/RAJA/policy/hip/kernel/Conditional.hpp
@@ -36,35 +36,29 @@ namespace internal
 {
 
 
-template <typename Data,
-          typename Conditional,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         typename Conditional,
+         typename... EnclosedStmts,
+         typename Types>
 struct HipStatementExecutor<Data,
-                             statement::If<Conditional, EnclosedStmts...>,
-                             Types> {
+                            statement::If<Conditional, EnclosedStmts...>,
+                            Types>
+{
 
-  using stmt_list_t = StatementList<EnclosedStmts...>;
+  using stmt_list_t      = StatementList<EnclosedStmts...>;
   using enclosed_stmts_t = HipStatementListExecutor<Data, stmt_list_t, Types>;
 
-
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
-    if (Conditional::eval(data)) {
+    if (Conditional::eval(data))
+    {
 
       // execute enclosed statements
       enclosed_stmts_t::exec(data, thread_active);
     }
   }
 
-
-
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     return enclosed_stmts_t::calculateDimensions(data);
   }
diff --git a/include/RAJA/policy/hip/kernel/For.hpp b/include/RAJA/policy/hip/kernel/For.hpp
index 39e7104c16..55fc72bf1c 100644
--- a/include/RAJA/policy/hip/kernel/For.hpp
+++ b/include/RAJA/policy/hip/kernel/For.hpp
@@ -23,7 +23,6 @@
 
 #include "RAJA/policy/hip/kernel/internal.hpp"
 
-
 namespace RAJA
 {
 
@@ -36,18 +35,21 @@ namespace internal
  * Assigns the loop index to offset ArgumentId
  * Meets all sync requirements
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename IndexMapper,
-          kernel_sync_requirement sync,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename IndexMapper,
+         kernel_sync_requirement sync,
+         typename... EnclosedStmts,
+         typename Types>
 struct HipStatementExecutor<
     Data,
     statement::For<ArgumentId,
-                   RAJA::policy::hip::hip_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+                   RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
+                                                  sync,
+                                                  IndexMapper>,
                    EnclosedStmts...>,
-    Types> {
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -60,13 +62,13 @@ struct HipStatementExecutor<
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
   using DimensionCalculator = RAJA::internal::KernelDimensionCalculator<
-      RAJA::policy::hip::hip_indexer<iteration_mapping::Direct, sync, IndexMapper>>;
+      RAJA::policy::hip::
+          hip_indexer<iteration_mapping::Direct, sync, IndexMapper>>;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i = IndexMapper::template index<diff_t>();
+    const diff_t i   = IndexMapper::template index<diff_t>();
 
     // execute enclosed statements if any thread will
     // but mask off threads without work
@@ -79,14 +81,13 @@ struct HipStatementExecutor<
     enclosed_stmts_t::exec(data, thread_active && have_work);
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
     HipDims my_dims(0), my_min_dims(0);
     DimensionCalculator::set_dimensions(my_dims, my_min_dims, len);
-    LaunchDims dims{my_dims, my_min_dims};
+    LaunchDims dims {my_dims, my_min_dims};
 
     // combine with enclosed statements
     LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
@@ -100,17 +101,21 @@ struct HipStatementExecutor<
  * Assigns the loop index to offset ArgumentId.
  * Meets all sync requirements
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename IndexMapper,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename IndexMapper,
+         typename... EnclosedStmts,
+         typename Types>
 struct HipStatementExecutor<
     Data,
     statement::For<ArgumentId,
-                   RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
+                   RAJA::policy::hip::hip_indexer<
+                       iteration_mapping::StridedLoop<named_usage::unspecified>,
+                       kernel_sync_requirement::sync,
+                       IndexMapper>,
                    EnclosedStmts...>,
-    Types> {
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -122,21 +127,23 @@ struct HipStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  using DimensionCalculator = RAJA::internal::KernelDimensionCalculator<
-      RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>>;
+  using DimensionCalculator =
+      RAJA::internal::KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+          iteration_mapping::StridedLoop<named_usage::unspecified>,
+          kernel_sync_requirement::sync,
+          IndexMapper>>;
 
-
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // grid stride loop
-    const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i_init = IndexMapper::template index<diff_t>();
+    const diff_t len      = segment_length<ArgumentId>(data);
+    const diff_t i_init   = IndexMapper::template index<diff_t>();
     const diff_t i_stride = IndexMapper::template size<diff_t>();
 
     // Iterate through in chunks
     // threads will have the same numbers of iterations
-    for (diff_t ii = 0; ii < len; ii += i_stride) {
+    for (diff_t ii = 0; ii < len; ii += i_stride)
+    {
       const diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -151,14 +158,13 @@ struct HipStatementExecutor<
     }
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     diff_t len = segment_length<ArgumentId>(data);
 
     HipDims my_dims(0), my_min_dims(0);
-    DimensionCalculator{}.set_dimensions(my_dims, my_min_dims, len);
-    LaunchDims dims{my_dims, my_min_dims};
+    DimensionCalculator {}.set_dimensions(my_dims, my_min_dims, len);
+    LaunchDims dims {my_dims, my_min_dims};
 
     // combine with enclosed statements
     LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
@@ -172,17 +178,21 @@ struct HipStatementExecutor<
  * Assigns the loop index to offset ArgumentId.
  * Meets no sync requirements
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename IndexMapper,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename IndexMapper,
+         typename... EnclosedStmts,
+         typename Types>
 struct HipStatementExecutor<
     Data,
     statement::For<ArgumentId,
-                   RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
+                   RAJA::policy::hip::hip_indexer<
+                       iteration_mapping::StridedLoop<named_usage::unspecified>,
+                       kernel_sync_requirement::none,
+                       IndexMapper>,
                    EnclosedStmts...>,
-    Types> {
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -194,21 +204,23 @@ struct HipStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  using DimensionCalculator = RAJA::internal::KernelDimensionCalculator<
-      RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>>;
+  using DimensionCalculator =
+      RAJA::internal::KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+          iteration_mapping::StridedLoop<named_usage::unspecified>,
+          kernel_sync_requirement::none,
+          IndexMapper>>;
 
-
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // grid stride loop
-    const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i_init = IndexMapper::template index<diff_t>();
+    const diff_t len      = segment_length<ArgumentId>(data);
+    const diff_t i_init   = IndexMapper::template index<diff_t>();
     const diff_t i_stride = IndexMapper::template size<diff_t>();
 
     // Iterate through one at a time
     // threads will have different numbers of iterations
-    for (diff_t i = i_init; i < len; i += i_stride) {
+    for (diff_t i = i_init; i < len; i += i_stride)
+    {
 
       // Assign the index to the argument
       data.template assign_offset<ArgumentId>(i);
@@ -218,14 +230,13 @@ struct HipStatementExecutor<
     }
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
     HipDims my_dims(0), my_min_dims(0);
-    DimensionCalculator{}.set_dimensions(my_dims, my_min_dims, len);
-    LaunchDims dims{my_dims, my_min_dims};
+    DimensionCalculator {}.set_dimensions(my_dims, my_min_dims, len);
+    LaunchDims dims {my_dims, my_min_dims};
 
     // combine with enclosed statements
     LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
@@ -233,63 +244,65 @@ struct HipStatementExecutor<
   }
 };
 
-
 /*
  * Executor for sequential loops inside of a HipKernel.
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename... EnclosedStmts,
+         typename Types>
 struct HipStatementExecutor<
     Data,
     statement::For<ArgumentId, seq_exec, EnclosedStmts...>,
     Types>
-: HipStatementExecutor<Data, statement::For<ArgumentId,
-      RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                     kernel_sync_requirement::none,
-                                     hip::IndexGlobal<named_dim::x, named_usage::ignored, named_usage::ignored>>,
-      EnclosedStmts...>, Types>
-{
-
-};
-
+    : HipStatementExecutor<
+          Data,
+          statement::For<
+              ArgumentId,
+              RAJA::policy::hip::hip_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::none,
+                  hip::IndexGlobal<named_dim::x,
+                                   named_usage::ignored,
+                                   named_usage::ignored>>,
+              EnclosedStmts...>,
+          Types>
+{};
 
 /*
  * Executor for thread work sharing loop inside HipKernel.
  * Mapping directly from a warp lane
  * Assigns the loop index to offset ArgumentId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename Mask,
-          typename ... EnclosedStmts,
-          typename Types>
-struct HipStatementExecutor<
-  Data,
-  statement::For<ArgumentId, RAJA::hip_warp_masked_direct<Mask>,
-                 EnclosedStmts ...>,
-  Types> {
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename Mask,
+         typename... EnclosedStmts,
+         typename Types>
+struct HipStatementExecutor<Data,
+                            statement::For<ArgumentId,
+                                           RAJA::hip_warp_masked_direct<Mask>,
+                                           EnclosedStmts...>,
+                            Types>
+{
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static_assert(mask_t::max_masked_size <= RAJA::policy::hip::device_constants.WARP_SIZE,
+  static_assert(mask_t::max_masked_size <=
+                    RAJA::policy::hip::device_constants.WARP_SIZE,
                 "BitMask is too large for HIP warp size");
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
@@ -299,13 +312,10 @@ struct HipStatementExecutor<
     data.template assign_offset<ArgumentId>(i);
 
     // execute enclosed statements if in bounds
-    enclosed_stmts_t::exec(data, thread_active && (i<len));
+    enclosed_stmts_t::exec(data, thread_active && (i < len));
   }
 
-
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // Get enclosed statements
     LaunchDims dims = enclosed_stmts_t::calculateDimensions(data);
@@ -320,7 +330,7 @@ struct HipStatementExecutor<
     // since we are direct-mapping, we REQUIRE len
     set_hip_dim<named_dim::x>(dims.min_dims.threads, len);
 
-    return(dims);
+    return (dims);
   }
 };
 
@@ -329,44 +339,44 @@ struct HipStatementExecutor<
  * Mapping directly from a warp lane
  * Assigns the loop index to offset ArgumentId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename Mask,
-          typename ... EnclosedStmts,
-          typename Types>
-struct HipStatementExecutor<
-  Data,
-  statement::For<ArgumentId, RAJA::hip_warp_masked_loop<Mask>,
-                 EnclosedStmts ...>,
-  Types> {
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename Mask,
+         typename... EnclosedStmts,
+         typename Types>
+struct HipStatementExecutor<Data,
+                            statement::For<ArgumentId,
+                                           RAJA::hip_warp_masked_loop<Mask>,
+                                           EnclosedStmts...>,
+                            Types>
+{
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static_assert(mask_t::max_masked_size <= RAJA::policy::hip::device_constants.WARP_SIZE,
+  static_assert(mask_t::max_masked_size <=
+                    RAJA::policy::hip::device_constants.WARP_SIZE,
                 "BitMask is too large for HIP warp size");
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // masked size strided loop
-    const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i_init = mask_t::maskValue((diff_t)threadIdx.x);
-    const diff_t i_stride = (diff_t) mask_t::max_masked_size;
+    const diff_t len      = segment_length<ArgumentId>(data);
+    const diff_t i_init   = mask_t::maskValue((diff_t)threadIdx.x);
+    const diff_t i_stride = (diff_t)mask_t::max_masked_size;
 
     // Iterate through grid stride of chunks
-    for (diff_t ii = 0; ii < len; ii += i_stride) {
+    for (diff_t ii = 0; ii < len; ii += i_stride)
+    {
       const diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -381,10 +391,7 @@ struct HipStatementExecutor<
     }
   }
 
-
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // Get enclosed statements
     LaunchDims dims = enclosed_stmts_t::calculateDimensions(data);
@@ -399,7 +406,7 @@ struct HipStatementExecutor<
     // since we are direct-mapping, we REQUIRE len
     set_hip_dim<named_dim::x>(dims.min_dims.threads, len);
 
-    return(dims);
+    return (dims);
   }
 };
 
@@ -408,33 +415,31 @@ struct HipStatementExecutor<
  * Mapping directly from raw threadIdx.x
  * Assigns the loop index to offset ArgumentId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename Mask,
-          typename ... EnclosedStmts,
-          typename Types>
-struct HipStatementExecutor<
-  Data,
-  statement::For<ArgumentId, RAJA::hip_thread_masked_direct<Mask>,
-                 EnclosedStmts ...>,
-  Types> {
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename Mask,
+         typename... EnclosedStmts,
+         typename Types>
+struct HipStatementExecutor<Data,
+                            statement::For<ArgumentId,
+                                           RAJA::hip_thread_masked_direct<Mask>,
+                                           EnclosedStmts...>,
+                            Types>
+{
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
@@ -444,13 +449,10 @@ struct HipStatementExecutor<
     data.template assign_offset<ArgumentId>(i);
 
     // execute enclosed statements if in bounds
-    enclosed_stmts_t::exec(data, thread_active && (i<len));
+    enclosed_stmts_t::exec(data, thread_active && (i < len));
   }
 
-
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // Get enclosed statements
     LaunchDims dims;
@@ -466,7 +468,7 @@ struct HipStatementExecutor<
     set_hip_dim<named_dim::x>(dims.min_dims.threads, len);
 
     LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
-    return(dims.max(enclosed_dims));
+    return (dims.max(enclosed_dims));
   }
 };
 
@@ -475,42 +477,40 @@ struct HipStatementExecutor<
  * Mapping directly from a warp lane
  * Assigns the loop index to offset ArgumentId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename Mask,
-          typename ... EnclosedStmts,
-          typename Types>
-struct HipStatementExecutor<
-  Data,
-  statement::For<ArgumentId, RAJA::hip_thread_masked_loop<Mask>,
-                 EnclosedStmts ...>,
-  Types> {
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename Mask,
+         typename... EnclosedStmts,
+         typename Types>
+struct HipStatementExecutor<Data,
+                            statement::For<ArgumentId,
+                                           RAJA::hip_thread_masked_loop<Mask>,
+                                           EnclosedStmts...>,
+                            Types>
+{
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // masked size strided loop
-    const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i_init = mask_t::maskValue((diff_t)threadIdx.x);
-    const diff_t i_stride = (diff_t) mask_t::max_masked_size;
+    const diff_t len      = segment_length<ArgumentId>(data);
+    const diff_t i_init   = mask_t::maskValue((diff_t)threadIdx.x);
+    const diff_t i_stride = (diff_t)mask_t::max_masked_size;
 
     // Iterate through grid stride of chunks
-    for (diff_t ii = 0; ii < len; ii += i_stride) {
+    for (diff_t ii = 0; ii < len; ii += i_stride)
+    {
       const diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -525,10 +525,7 @@ struct HipStatementExecutor<
     }
   }
 
-
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // Get enclosed statements
     LaunchDims dims;
@@ -544,7 +541,7 @@ struct HipStatementExecutor<
     set_hip_dim<named_dim::x>(dims.min_dims.threads, len);
 
     LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
-    return(dims.max(enclosed_dims));
+    return (dims.max(enclosed_dims));
   }
 };
 
diff --git a/include/RAJA/policy/hip/kernel/ForICount.hpp b/include/RAJA/policy/hip/kernel/ForICount.hpp
index ba6642f248..375536f0dd 100644
--- a/include/RAJA/policy/hip/kernel/ForICount.hpp
+++ b/include/RAJA/policy/hip/kernel/ForICount.hpp
@@ -23,7 +23,6 @@
 
 #include "RAJA/policy/hip/kernel/internal.hpp"
 
-
 namespace RAJA
 {
 
@@ -37,42 +36,49 @@ namespace internal
  * Assigns the loop index to param ParamId
  * Meets all sync requirements
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename ParamId,
-          typename IndexMapper,
-          kernel_sync_requirement sync,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename ParamId,
+         typename IndexMapper,
+         kernel_sync_requirement sync,
+         typename... EnclosedStmts,
+         typename Types>
 struct HipStatementExecutor<
     Data,
-    statement::ForICount<ArgumentId, ParamId,
-                         RAJA::policy::hip::hip_indexer<iteration_mapping::Direct, sync, IndexMapper>,
-                         EnclosedStmts...>,
+    statement::ForICount<
+        ArgumentId,
+        ParamId,
+        RAJA::policy::hip::
+            hip_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+        EnclosedStmts...>,
     Types>
     : HipStatementExecutor<
-        Data,
-        statement::For<ArgumentId,
-                       RAJA::policy::hip::hip_indexer<iteration_mapping::Direct, sync, IndexMapper>,
-                       EnclosedStmts...>,
-        Types> {
+          Data,
+          statement::For<
+              ArgumentId,
+              RAJA::policy::hip::
+                  hip_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+              EnclosedStmts...>,
+          Types>
+{
 
   using Base = HipStatementExecutor<
       Data,
       statement::For<ArgumentId,
-                     RAJA::policy::hip::hip_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+                     RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
+                                                    sync,
+                                                    IndexMapper>,
                      EnclosedStmts...>,
       Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // grid stride loop
     const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i = IndexMapper::template index<diff_t>();
+    const diff_t i   = IndexMapper::template index<diff_t>();
 
     // execute enclosed statements if any thread will
     // but mask off threads without work
@@ -94,46 +100,60 @@ struct HipStatementExecutor<
  * Assigns the loop index to param ParamId
  * Meets all sync requirements
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename ParamId,
-          typename IndexMapper,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename ParamId,
+         typename IndexMapper,
+         typename... EnclosedStmts,
+         typename Types>
 struct HipStatementExecutor<
     Data,
-    statement::ForICount<ArgumentId, ParamId,
-                         RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                         EnclosedStmts...>,
+    statement::ForICount<
+        ArgumentId,
+        ParamId,
+        RAJA::policy::hip::hip_indexer<
+            iteration_mapping::StridedLoop<named_usage::unspecified>,
+            kernel_sync_requirement::sync,
+            IndexMapper>,
+        EnclosedStmts...>,
     Types>
     : public HipStatementExecutor<
-        Data,
-        statement::For<ArgumentId,
-                       RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                       EnclosedStmts...>,
-        Types> {
+          Data,
+          statement::For<
+              ArgumentId,
+              RAJA::policy::hip::hip_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::sync,
+                  IndexMapper>,
+              EnclosedStmts...>,
+          Types>
+{
 
   using Base = HipStatementExecutor<
       Data,
-      statement::For<ArgumentId,
-                     RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                     EnclosedStmts...>,
+      statement::For<
+          ArgumentId,
+          RAJA::policy::hip::hip_indexer<
+              iteration_mapping::StridedLoop<named_usage::unspecified>,
+              kernel_sync_requirement::sync,
+              IndexMapper>,
+          EnclosedStmts...>,
       Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // grid stride loop
-    const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i_init = IndexMapper::template index<diff_t>();
+    const diff_t len      = segment_length<ArgumentId>(data);
+    const diff_t i_init   = IndexMapper::template index<diff_t>();
     const diff_t i_stride = IndexMapper::template size<diff_t>();
 
     // Iterate through in chunks
     // threads will have the same numbers of iterations
-    for (diff_t ii = 0; ii < len; ii += i_stride) {
+    for (diff_t ii = 0; ii < len; ii += i_stride)
+    {
       const diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -157,46 +177,60 @@ struct HipStatementExecutor<
  * Assigns the loop index to param ParamId
  * Meets no sync requirements
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename ParamId,
-          typename IndexMapper,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename ParamId,
+         typename IndexMapper,
+         typename... EnclosedStmts,
+         typename Types>
 struct HipStatementExecutor<
     Data,
-    statement::ForICount<ArgumentId, ParamId,
-                         RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                         EnclosedStmts...>,
+    statement::ForICount<
+        ArgumentId,
+        ParamId,
+        RAJA::policy::hip::hip_indexer<
+            iteration_mapping::StridedLoop<named_usage::unspecified>,
+            kernel_sync_requirement::none,
+            IndexMapper>,
+        EnclosedStmts...>,
     Types>
     : public HipStatementExecutor<
-        Data,
-        statement::For<ArgumentId,
-                       RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                       EnclosedStmts...>,
-        Types> {
+          Data,
+          statement::For<
+              ArgumentId,
+              RAJA::policy::hip::hip_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::none,
+                  IndexMapper>,
+              EnclosedStmts...>,
+          Types>
+{
 
   using Base = HipStatementExecutor<
       Data,
-      statement::For<ArgumentId,
-                     RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                     EnclosedStmts...>,
+      statement::For<
+          ArgumentId,
+          RAJA::policy::hip::hip_indexer<
+              iteration_mapping::StridedLoop<named_usage::unspecified>,
+              kernel_sync_requirement::none,
+              IndexMapper>,
+          EnclosedStmts...>,
       Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // grid stride loop
-    const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i_init = IndexMapper::template index<diff_t>();
+    const diff_t len      = segment_length<ArgumentId>(data);
+    const diff_t i_init   = IndexMapper::template index<diff_t>();
     const diff_t i_stride = IndexMapper::template size<diff_t>();
 
     // Iterate through one at a time
     // threads will have the different numbers of iterations
-    for (diff_t i = i_init; i < len; i += i_stride) {
+    for (diff_t i = i_init; i < len; i += i_stride)
+    {
 
       // Assign the index to the argument and param
       data.template assign_offset<ArgumentId>(i);
@@ -208,7 +242,6 @@ struct HipStatementExecutor<
   }
 };
 
-
 /*
  * Executor for sequential loops inside of a HipKernel.
  *
@@ -216,68 +249,79 @@ struct HipStatementExecutor<
  * Assigns the loop index to offset ArgumentId
  * Assigns the loop index to param ParamId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename ParamId,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename ParamId,
+         typename... EnclosedStmts,
+         typename Types>
 struct HipStatementExecutor<
     Data,
     statement::ForICount<ArgumentId, ParamId, seq_exec, EnclosedStmts...>,
     Types>
-: HipStatementExecutor<Data, statement::ForICount<ArgumentId,
-      RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                     kernel_sync_requirement::none,
-                                     hip::IndexGlobal<named_dim::x, named_usage::ignored, named_usage::ignored>>,
-      EnclosedStmts...>, Types>
-{
-
-};
-
+    : HipStatementExecutor<
+          Data,
+          statement::ForICount<
+              ArgumentId,
+              RAJA::policy::hip::hip_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::none,
+                  hip::IndexGlobal<named_dim::x,
+                                   named_usage::ignored,
+                                   named_usage::ignored>>,
+              EnclosedStmts...>,
+          Types>
+{};
 
 /*
  * Executor for thread work sharing loop inside HipKernel.
  * Mapping directly from a warp lane
  * Assigns the loop index to offset ArgumentId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename ParamId,
-          typename Mask,
-          typename ... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename ParamId,
+         typename Mask,
+         typename... EnclosedStmts,
+         typename Types>
 struct HipStatementExecutor<
-  Data,
-  statement::ForICount<ArgumentId, ParamId,
-                       RAJA::hip_warp_masked_direct<Mask>,
-                       EnclosedStmts ...>, Types >
-  : public HipStatementExecutor<
     Data,
-    statement::For<ArgumentId, RAJA::hip_warp_masked_direct<Mask>,
-                   EnclosedStmts ...>, Types > {
-
-  using Base = HipStatementExecutor<
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::hip_warp_masked_direct<Mask>,
+                         EnclosedStmts...>,
+    Types>
+    : public HipStatementExecutor<
           Data,
-          statement::For<ArgumentId, RAJA::hip_warp_masked_direct<Mask>,
-                         EnclosedStmts ...>, Types >;
+          statement::For<ArgumentId,
+                         RAJA::hip_warp_masked_direct<Mask>,
+                         EnclosedStmts...>,
+          Types>
+{
+
+  using Base =
+      HipStatementExecutor<Data,
+                           statement::For<ArgumentId,
+                                          RAJA::hip_warp_masked_direct<Mask>,
+                                          EnclosedStmts...>,
+                           Types>;
 
   using typename Base::diff_t;
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
-  static_assert(mask_t::max_masked_size <= RAJA::policy::hip::device_constants.WARP_SIZE,
+  static_assert(mask_t::max_masked_size <=
+                    RAJA::policy::hip::device_constants.WARP_SIZE,
                 "BitMask is too large for HIP warp size");
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
@@ -288,63 +332,69 @@ struct HipStatementExecutor<
     data.template assign_param<ParamId>(i);
 
     // execute enclosed statements if in bounds
-    enclosed_stmts_t::exec(data, thread_active && (i<len));
+    enclosed_stmts_t::exec(data, thread_active && (i < len));
   }
-
 };
 
-
 /*
  * Executor for thread work sharing loop inside HipKernel.
  * Mapping directly from a warp lane
  * Assigns the loop index to offset ArgumentId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename ParamId,
-          typename Mask,
-          typename ... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename ParamId,
+         typename Mask,
+         typename... EnclosedStmts,
+         typename Types>
 struct HipStatementExecutor<
-  Data,
-  statement::ForICount<ArgumentId, ParamId,
-                       RAJA::hip_warp_masked_loop<Mask>,
-                       EnclosedStmts ...>, Types >
-  : public HipStatementExecutor<
     Data,
-    statement::For<ArgumentId, RAJA::hip_warp_masked_loop<Mask>,
-                   EnclosedStmts ...>, Types > {
-
-  using Base = HipStatementExecutor<
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::hip_warp_masked_loop<Mask>,
+                         EnclosedStmts...>,
+    Types>
+    : public HipStatementExecutor<
           Data,
-          statement::For<ArgumentId, RAJA::hip_warp_masked_loop<Mask>,
-                         EnclosedStmts ...>, Types >;
+          statement::For<ArgumentId,
+                         RAJA::hip_warp_masked_loop<Mask>,
+                         EnclosedStmts...>,
+          Types>
+{
+
+  using Base =
+      HipStatementExecutor<Data,
+                           statement::For<ArgumentId,
+                                          RAJA::hip_warp_masked_loop<Mask>,
+                                          EnclosedStmts...>,
+                           Types>;
 
   using typename Base::diff_t;
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
-  static_assert(mask_t::max_masked_size <= RAJA::policy::hip::device_constants.WARP_SIZE,
+  static_assert(mask_t::max_masked_size <=
+                    RAJA::policy::hip::device_constants.WARP_SIZE,
                 "BitMask is too large for HIP warp size");
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // masked size strided loop
-    const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i_init = mask_t::maskValue((diff_t)threadIdx.x);
-    const diff_t i_stride = (diff_t) mask_t::max_masked_size;
+    const diff_t len      = segment_length<ArgumentId>(data);
+    const diff_t i_init   = mask_t::maskValue((diff_t)threadIdx.x);
+    const diff_t i_stride = (diff_t)mask_t::max_masked_size;
 
     // Iterate through grid stride of chunks
-    for (diff_t ii = 0; ii < len; ii += i_stride) {
+    for (diff_t ii = 0; ii < len; ii += i_stride)
+    {
       const diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -359,50 +409,54 @@ struct HipStatementExecutor<
       enclosed_stmts_t::exec(data, thread_active && have_work);
     }
   }
-
 };
 
-
 /*
  * Executor for thread work sharing loop inside HipKernel.
  * Mapping directly from a warp lane
  * Assigns the loop index to offset ArgumentId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename ParamId,
-          typename Mask,
-          typename ... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename ParamId,
+         typename Mask,
+         typename... EnclosedStmts,
+         typename Types>
 struct HipStatementExecutor<
-  Data,
-  statement::ForICount<ArgumentId, ParamId,
-                       RAJA::hip_thread_masked_direct<Mask>,
-                       EnclosedStmts ...>, Types >
-  : public HipStatementExecutor<
     Data,
-    statement::For<ArgumentId, RAJA::hip_thread_masked_direct<Mask>,
-                   EnclosedStmts ...>, Types > {
-
-  using Base = HipStatementExecutor<
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::hip_thread_masked_direct<Mask>,
+                         EnclosedStmts...>,
+    Types>
+    : public HipStatementExecutor<
           Data,
-          statement::For<ArgumentId, RAJA::hip_thread_masked_direct<Mask>,
-                         EnclosedStmts ...>, Types >;
+          statement::For<ArgumentId,
+                         RAJA::hip_thread_masked_direct<Mask>,
+                         EnclosedStmts...>,
+          Types>
+{
+
+  using Base =
+      HipStatementExecutor<Data,
+                           statement::For<ArgumentId,
+                                          RAJA::hip_thread_masked_direct<Mask>,
+                                          EnclosedStmts...>,
+                           Types>;
 
   using typename Base::diff_t;
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     const diff_t len = segment_length<ArgumentId>(data);
 
@@ -413,60 +467,65 @@ struct HipStatementExecutor<
     data.template assign_param<ParamId>(i);
 
     // execute enclosed statements if in bounds
-    enclosed_stmts_t::exec(data, thread_active && (i<len));
+    enclosed_stmts_t::exec(data, thread_active && (i < len));
   }
-
 };
 
-
 /*
  * Executor for thread work sharing loop inside HipKernel.
  * Mapping directly from a warp lane
  * Assigns the loop index to offset ArgumentId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename ParamId,
-          typename Mask,
-          typename ... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename ParamId,
+         typename Mask,
+         typename... EnclosedStmts,
+         typename Types>
 struct HipStatementExecutor<
-  Data,
-  statement::ForICount<ArgumentId, ParamId,
-                       RAJA::hip_thread_masked_loop<Mask>,
-                       EnclosedStmts ...>, Types >
-  : public HipStatementExecutor<
     Data,
-    statement::For<ArgumentId, RAJA::hip_thread_masked_loop<Mask>,
-                   EnclosedStmts ...>, Types > {
-
-  using Base = HipStatementExecutor<
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::hip_thread_masked_loop<Mask>,
+                         EnclosedStmts...>,
+    Types>
+    : public HipStatementExecutor<
           Data,
-          statement::For<ArgumentId, RAJA::hip_thread_masked_loop<Mask>,
-                         EnclosedStmts ...>, Types >;
+          statement::For<ArgumentId,
+                         RAJA::hip_thread_masked_loop<Mask>,
+                         EnclosedStmts...>,
+          Types>
+{
+
+  using Base =
+      HipStatementExecutor<Data,
+                           statement::For<ArgumentId,
+                                          RAJA::hip_thread_masked_loop<Mask>,
+                                          EnclosedStmts...>,
+                           Types>;
 
   using typename Base::diff_t;
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // masked size strided loop
-    const diff_t len = segment_length<ArgumentId>(data);
-    const diff_t i_init = mask_t::maskValue((diff_t)threadIdx.x);
-    const diff_t i_stride = (diff_t) mask_t::max_masked_size;
+    const diff_t len      = segment_length<ArgumentId>(data);
+    const diff_t i_init   = mask_t::maskValue((diff_t)threadIdx.x);
+    const diff_t i_stride = (diff_t)mask_t::max_masked_size;
 
     // Iterate through grid stride of chunks
-    for (diff_t ii = 0; ii < len; ii += i_stride) {
+    for (diff_t ii = 0; ii < len; ii += i_stride)
+    {
       const diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -481,7 +540,6 @@ struct HipStatementExecutor<
       enclosed_stmts_t::exec(data, thread_active && have_work);
     }
   }
-
 };
 
 }  // namespace internal
diff --git a/include/RAJA/policy/hip/kernel/HipKernel.hpp b/include/RAJA/policy/hip/kernel/HipKernel.hpp
index 1ed7740008..494362a149 100644
--- a/include/RAJA/policy/hip/kernel/HipKernel.hpp
+++ b/include/RAJA/policy/hip/kernel/HipKernel.hpp
@@ -50,8 +50,9 @@ namespace RAJA
  * Num_blocks is chosen to maximize the number of blocks running concurrently.
  *
  */
-template <bool async0, int num_blocks, int num_threads>
-struct hip_explicit_launch {};
+template<bool async0, int num_blocks, int num_threads>
+struct hip_explicit_launch
+{};
 
 /*!
  * HIP kernel launch policy where the user specifies the number of physical
@@ -66,7 +67,7 @@ struct hip_explicit_launch {};
  * Num_threads is 1024, which may not be appropriate for all kernels.
  *
  */
-template <bool async0, int num_blocks, int num_threads>
+template<bool async0, int num_blocks, int num_threads>
 using hip_launch = hip_explicit_launch<async0, num_blocks, num_threads>;
 
 /*!
@@ -74,7 +75,7 @@ using hip_launch = hip_explicit_launch<async0, num_blocks, num_threads>;
  * are determined by the HIP occupancy calculator.
  * If num_threads is 0 then num_threads is chosen at runtime.
  */
-template <int num_threads0, bool async0>
+template<int num_threads0, bool async0>
 using hip_occ_calc_launch = hip_explicit_launch<async0, 0, num_threads0>;
 
 namespace statement
@@ -85,11 +86,12 @@ namespace statement
  * Note - Statement requires a placeholder hip_exec policy for the sake of
  * object oriented inheritance.
  */
-template <typename LaunchConfig, typename... EnclosedStmts>
+template<typename LaunchConfig, typename... EnclosedStmts>
 struct HipKernelExt
-    : public internal::Statement<::RAJA::policy::hip::hip_exec<LaunchConfig, void, void, true>, EnclosedStmts...> {
-};
-
+    : public internal::Statement<
+          ::RAJA::policy::hip::hip_exec<LaunchConfig, void, void, true>,
+          EnclosedStmts...>
+{};
 
 /*!
  * A RAJA::kernel statement that launches a HIP kernel with the flexibility
@@ -97,9 +99,10 @@ struct HipKernelExt
  * calculator determine the unspecified values.
  * The kernel launch is synchronous.
  */
-template <int num_blocks, int num_threads, typename... EnclosedStmts>
+template<int num_blocks, int num_threads, typename... EnclosedStmts>
 using HipKernelExp =
-    HipKernelExt<hip_explicit_launch<false, num_blocks, num_threads>, EnclosedStmts...>;
+    HipKernelExt<hip_explicit_launch<false, num_blocks, num_threads>,
+                 EnclosedStmts...>;
 
 /*!
  * A RAJA::kernel statement that launches a HIP kernel with the flexibility
@@ -107,16 +110,17 @@ using HipKernelExp =
  * calculator determine the unspecified values.
  * The kernel launch is asynchronous.
  */
-template <int num_blocks, int num_threads, typename... EnclosedStmts>
+template<int num_blocks, int num_threads, typename... EnclosedStmts>
 using HipKernelExpAsync =
-    HipKernelExt<hip_explicit_launch<true, num_blocks, num_threads>, EnclosedStmts...>;
+    HipKernelExt<hip_explicit_launch<true, num_blocks, num_threads>,
+                 EnclosedStmts...>;
 
 /*!
  * A RAJA::kernel statement that launches a HIP kernel using the
  * HIP occupancy calculator to determine the optimal number of threads.
  * The kernel launch is synchronous.
  */
-template <typename... EnclosedStmts>
+template<typename... EnclosedStmts>
 using HipKernelOcc =
     HipKernelExt<hip_occ_calc_launch<1024, false>, EnclosedStmts...>;
 
@@ -125,7 +129,7 @@ using HipKernelOcc =
  * HIP occupancy calculator to determine the optimal number of threads.
  * The kernel launch is asynchronous.
  */
-template <typename... EnclosedStmts>
+template<typename... EnclosedStmts>
 using HipKernelOccAsync =
     HipKernelExt<hip_occ_calc_launch<1024, true>, EnclosedStmts...>;
 
@@ -134,32 +138,33 @@ using HipKernelOccAsync =
  * number of threads (specified by num_threads)
  * The kernel launch is synchronous.
  */
-template <int num_threads, typename... EnclosedStmts>
-using HipKernelFixed =
-    HipKernelExt<hip_explicit_launch<false, operators::limits<int>::max(), num_threads>,
-                  EnclosedStmts...>;
+template<int num_threads, typename... EnclosedStmts>
+using HipKernelFixed = HipKernelExt<
+    hip_explicit_launch<false, operators::limits<int>::max(), num_threads>,
+    EnclosedStmts...>;
 
 /*!
  * A RAJA::kernel statement that launches a HIP kernel with a fixed
  * number of threads (specified by num_threads)
  * The kernel launch is asynchronous.
  */
-template <int num_threads, typename... EnclosedStmts>
-using HipKernelFixedAsync =
-    HipKernelExt<hip_explicit_launch<true, operators::limits<int>::max(), num_threads>, EnclosedStmts...>;
+template<int num_threads, typename... EnclosedStmts>
+using HipKernelFixedAsync = HipKernelExt<
+    hip_explicit_launch<true, operators::limits<int>::max(), num_threads>,
+    EnclosedStmts...>;
 
 /*!
  * A RAJA::kernel statement that launches a HIP kernel with 1024 threads
  * The kernel launch is synchronous.
  */
-template <typename... EnclosedStmts>
+template<typename... EnclosedStmts>
 using HipKernel = HipKernelFixed<1024, EnclosedStmts...>;
 
 /*!
  * A RAJA::kernel statement that launches a HIP kernel with 1024 threads
  * The kernel launch is asynchronous.
  */
-template <typename... EnclosedStmts>
+template<typename... EnclosedStmts>
 using HipKernelAsync = HipKernelFixedAsync<1024, EnclosedStmts...>;
 
 }  // namespace statement
@@ -171,17 +176,16 @@ namespace internal
 /*!
  * HIP global function for launching HipKernel policies
  */
-template <typename Data, typename Exec>
+template<typename Data, typename Exec>
 __global__ void HipKernelLauncher(Data data)
 {
 
-  using data_t = camp::decay<Data>;
+  using data_t        = camp::decay<Data>;
   data_t private_data = data;
 
   Exec::exec(private_data, true);
 }
 
-
 /*!
  * HIP global function for launching HipKernel policies
  * This is annotated to guarantee that device code generated
@@ -189,19 +193,18 @@ __global__ void HipKernelLauncher(Data data)
  *
  * This launcher is used by the HipKerelFixed policies.
  */
-template <int BlockSize, typename Data, typename Exec>
+template<int BlockSize, typename Data, typename Exec>
 __launch_bounds__(BlockSize, 1) __global__
     void HipKernelLauncherFixed(Data data)
 {
 
-  using data_t = camp::decay<Data>;
+  using data_t        = camp::decay<Data>;
   data_t private_data = data;
 
   // execute the the object
   Exec::exec(private_data, true);
 }
 
-
 /*!
  * Helper class that handles getting the correct global function for
  * HipKernel policies. This class is specialized on whether or not BlockSize
@@ -213,7 +216,9 @@ __launch_bounds__(BlockSize, 1) __global__
 template<int BlockSize, typename Data, typename executor_t>
 struct HipKernelLauncherGetter
 {
-  using type = camp::decay<decltype(&internal::HipKernelLauncherFixed<BlockSize, Data, executor_t>)>;
+  using type = camp::decay<
+      decltype(&internal::HipKernelLauncherFixed<BlockSize, Data, executor_t>)>;
+
   static constexpr type get() noexcept
   {
     return &internal::HipKernelLauncherFixed<BlockSize, Data, executor_t>;
@@ -227,7 +232,9 @@ struct HipKernelLauncherGetter
 template<typename Data, typename executor_t>
 struct HipKernelLauncherGetter<0, Data, executor_t>
 {
-  using type = camp::decay<decltype(&internal::HipKernelLauncher<Data, executor_t>)>;
+  using type =
+      camp::decay<decltype(&internal::HipKernelLauncher<Data, executor_t>)>;
+
   static constexpr type get() noexcept
   {
     return &internal::HipKernelLauncher<Data, executor_t>;
@@ -235,30 +242,43 @@ struct HipKernelLauncherGetter<0, Data, executor_t>
 };
 
 
-
 /*!
  * Helper class that handles HIP kernel launching, and computing
  * maximum number of threads/blocks
  */
-template<typename LaunchPolicy, typename StmtList, typename Data, typename Types>
+template<typename LaunchPolicy,
+         typename StmtList,
+         typename Data,
+         typename Types>
 struct HipLaunchHelper;
 
-
 /*!
  * Helper class specialization to determine the number of threads and blocks.
  * The user may specify the number of threads and blocks or let one or both be
  * determined at runtime using the HIP occupancy calculator.
  */
-template<bool async0, int num_blocks, int num_threads, typename StmtList, typename Data, typename Types>
-struct HipLaunchHelper<hip_explicit_launch<async0, num_blocks, num_threads>,StmtList,Data,Types>
+template<bool async0,
+         int num_blocks,
+         int num_threads,
+         typename StmtList,
+         typename Data,
+         typename Types>
+struct HipLaunchHelper<hip_explicit_launch<async0, num_blocks, num_threads>,
+                       StmtList,
+                       Data,
+                       Types>
 {
   using Self = HipLaunchHelper;
 
   static constexpr bool async = async0;
 
-  using executor_t = internal::hip_statement_list_executor_t<StmtList, Data, Types>;
+  using executor_t =
+      internal::hip_statement_list_executor_t<StmtList, Data, Types>;
 
-  using kernelGetter_t = HipKernelLauncherGetter<(num_threads <= 0) ? 0 : num_threads, Data, executor_t>;
+  using kernelGetter_t =
+      HipKernelLauncherGetter<(num_threads <= 0) ? 0 : num_threads,
+                              Data,
+                              executor_t>;
 
   inline static const void* get_func()
   {
@@ -266,13 +286,16 @@ struct HipLaunchHelper<hip_explicit_launch<async0, num_blocks, num_threads>,Stmt
   }
 
   inline static void recommended_blocks_threads(size_t shmem_size,
-      int &recommended_blocks, int &recommended_threads)
+                                                int& recommended_blocks,
+                                                int& recommended_threads)
   {
     auto func = Self::get_func();
 
-    if (num_blocks <= 0) {
+    if (num_blocks <= 0)
+    {
 
-      if (num_threads <= 0) {
+      if (num_threads <= 0)
+      {
 
         //
         // determine blocks at runtime
@@ -280,10 +303,11 @@ struct HipLaunchHelper<hip_explicit_launch<async0, num_blocks, num_threads>,Stmt
         //
         auto data = ::RAJA::hip::hip_occupancy_max_blocks_threads<Self>(
             func, shmem_size);
-        recommended_blocks = data.func_max_blocks_per_device;
+        recommended_blocks  = data.func_max_blocks_per_device;
         recommended_threads = data.func_max_threads_per_block;
-
-      } else {
+      }
+      else
+      {
 
         //
         // determine blocks at runtime
@@ -293,69 +317,74 @@ struct HipLaunchHelper<hip_explicit_launch<async0, num_blocks, num_threads>,Stmt
 
         auto data = ::RAJA::hip::hip_occupancy_max_blocks<Self, num_threads>(
             func, shmem_size);
-        recommended_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device;
-
+        recommended_blocks =
+            data.func_max_blocks_per_sm * data.device_sm_per_device;
       }
+    }
+    else
+    {
 
-    } else {
-
-      if (num_threads <= 0) {
+      if (num_threads <= 0)
+      {
 
         //
         // determine threads at runtime, unsure what use 1024
         // this value may be invalid for kernels with high register pressure
         //
         recommended_threads = 1024;
-
-      } else {
+      }
+      else
+      {
 
         //
         // threads determined at compile-time
         //
         recommended_threads = num_threads;
-
       }
 
       //
       // blocks determined at compile-time
       //
       recommended_blocks = num_blocks;
-
     }
   }
 
-  inline static void max_threads(size_t RAJA_UNUSED_ARG(shmem_size), int &max_threads)
+  inline static void max_threads(size_t RAJA_UNUSED_ARG(shmem_size),
+                                 int& max_threads)
   {
-    if (num_threads <= 0) {
+    if (num_threads <= 0)
+    {
 
       //
       // determine threads at runtime, unsure what use 1024
       // this value may be invalid for kernels with high register pressure
       //
       max_threads = 1024;
-
-    } else {
+    }
+    else
+    {
 
       //
       // threads determined at compile-time
       //
       max_threads = num_threads;
-
     }
   }
 
   inline static void max_blocks(size_t shmem_size,
-      int &max_blocks, int actual_threads)
+                                int& max_blocks,
+                                int actual_threads)
   {
     auto func = Self::get_func();
 
-    if (num_blocks <= 0) {
+    if (num_blocks <= 0)
+    {
 
       //
       // determine blocks at runtime
       //
-      if (num_threads <= 0 ||
-          num_threads != actual_threads) {
+      if (num_threads <= 0 || num_threads != actual_threads)
+      {
 
         //
         // determine blocks when actual_threads != num_threads
@@ -363,8 +392,9 @@ struct HipLaunchHelper<hip_explicit_launch<async0, num_blocks, num_threads>,Stmt
         auto data = ::RAJA::hip::hip_occupancy_max_blocks<Self>(
             func, shmem_size, actual_threads);
         max_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device;
-
-      } else {
+      }
+      else
+      {
 
         //
         // determine blocks when actual_threads == num_threads
@@ -372,16 +402,15 @@ struct HipLaunchHelper<hip_explicit_launch<async0, num_blocks, num_threads>,Stmt
         auto data = ::RAJA::hip::hip_occupancy_max_blocks<Self, num_threads>(
             func, shmem_size);
         max_blocks = data.func_max_blocks_per_sm * data.device_sm_per_device;
-
       }
-
-    } else {
+    }
+    else
+    {
 
       //
       // blocks determined at compile-time
       //
       max_blocks = num_blocks;
-
     }
   }
 };
@@ -395,8 +424,10 @@ struct HipLaunchHelper<hip_explicit_launch<async0, num_blocks, num_threads>,Stmt
  * The algorithm is greedy (and probably could be improved), and favors
  * maximizing the number of threads (or blocks) in x, y, then z.
  */
-inline
-hip_dim_t fitHipDims(hip_dim_member_t limit, hip_dim_t result, hip_dim_t minimum = hip_dim_t()){
+inline hip_dim_t fitHipDims(hip_dim_member_t limit,
+                            hip_dim_t result,
+                            hip_dim_t minimum = hip_dim_t())
+{
 
 
   // clamp things to at least 1
@@ -409,12 +440,13 @@ hip_dim_t fitHipDims(hip_dim_member_t limit, hip_dim_t result, hip_dim_t minimum
   minimum.z = minimum.z ? minimum.z : 1;
 
   // if we are under the limit, we're done
-  if(result.x * result.y * result.z <= limit) return result;
+  if (result.x * result.y * result.z <= limit) return result;
 
   // Can we reduce z to fit?
-  if(result.x * result.y * minimum.z < limit){
+  if (result.x * result.y * minimum.z < limit)
+  {
     // compute a new z
-    result.z = limit / (result.x*result.y);
+    result.z = limit / (result.x * result.y);
     return result;
   }
   // we don't fit, so reduce z to it's minimum and continue on to y
@@ -422,9 +454,10 @@ hip_dim_t fitHipDims(hip_dim_member_t limit, hip_dim_t result, hip_dim_t minimum
 
 
   // Can we reduce y to fit?
-  if(result.x * minimum.y * result.z < limit){
+  if (result.x * minimum.y * result.z < limit)
+  {
     // compute a new y
-    result.y = limit / (result.x*result.z);
+    result.y = limit / (result.x * result.z);
     return result;
   }
   // we don't fit, so reduce y to it's minimum and continue on to x
@@ -432,9 +465,10 @@ hip_dim_t fitHipDims(hip_dim_member_t limit, hip_dim_t result, hip_dim_t minimum
 
 
   // Can we reduce y to fit?
-  if(minimum.x * result.y * result.z < limit){
+  if (minimum.x * result.y * result.z < limit)
+  {
     // compute a new x
-    result.x = limit / (result.y*result.z);
+    result.x = limit / (result.y * result.z);
     return result;
   }
   // we don't fit, so we'll return the smallest possible thing
@@ -443,24 +477,25 @@ hip_dim_t fitHipDims(hip_dim_member_t limit, hip_dim_t result, hip_dim_t minimum
   return result;
 }
 
-
 /*!
  * Specialization that launches HIP kernels for RAJA::kernel from host code
  */
-template <typename LaunchConfig, typename... EnclosedStmts, typename Types>
+template<typename LaunchConfig, typename... EnclosedStmts, typename Types>
 struct StatementExecutor<
-    statement::HipKernelExt<LaunchConfig, EnclosedStmts...>, Types> {
+    statement::HipKernelExt<LaunchConfig, EnclosedStmts...>,
+    Types>
+{
 
-  using stmt_list_t = StatementList<EnclosedStmts...>;
-  using StatementType =
-      statement::HipKernelExt<LaunchConfig, EnclosedStmts...>;
+  using stmt_list_t   = StatementList<EnclosedStmts...>;
+  using StatementType = statement::HipKernelExt<LaunchConfig, EnclosedStmts...>;
 
-  template <typename Data>
-  static inline void exec(Data &&data)
+  template<typename Data>
+  static inline void exec(Data&& data)
   {
 
     using data_t = camp::decay<Data>;
-    using executor_t = hip_statement_list_executor_t<stmt_list_t, data_t, Types>;
+    using executor_t =
+        hip_statement_list_executor_t<stmt_list_t, data_t, Types>;
     using launch_t = HipLaunchHelper<LaunchConfig, stmt_list_t, data_t, Types>;
 
 
@@ -474,9 +509,10 @@ struct StatementExecutor<
 
 
     // Only launch kernel if we have something to iterate over
-    int num_blocks = launch_dims.num_blocks();
+    int num_blocks  = launch_dims.num_blocks();
     int num_threads = launch_dims.num_threads();
-    if (num_blocks > 0 || num_threads > 0) {
+    if (num_blocks > 0 || num_threads > 0)
+    {
 
       //
       // Setup shared memory buffers
@@ -489,8 +525,8 @@ struct StatementExecutor<
       //
       int recommended_blocks;
       int recommended_threads;
-      launch_t::recommended_blocks_threads(
-          shmem, recommended_blocks, recommended_threads);
+      launch_t::recommended_blocks_threads(shmem, recommended_blocks,
+                                           recommended_threads);
 
 
       //
@@ -503,24 +539,24 @@ struct StatementExecutor<
       //
       // Fit the requested threads
       //
-      hip_dim_t fit_threads{0,0,0};
-
-      if ( recommended_threads >= get_size(launch_dims.min_dims.threads) ) {
+      hip_dim_t fit_threads {0, 0, 0};
 
-        fit_threads = fitHipDims(
-            recommended_threads, launch_dims.dims.threads, launch_dims.min_dims.threads);
+      if (recommended_threads >= get_size(launch_dims.min_dims.threads))
+      {
 
+        fit_threads = fitHipDims(recommended_threads, launch_dims.dims.threads,
+                                 launch_dims.min_dims.threads);
       }
 
       //
       // Redo fit with max threads
       //
-      if ( recommended_threads < max_threads &&
-           get_size(fit_threads) != recommended_threads ) {
-
-        fit_threads = fitHipDims(
-            max_threads, launch_dims.dims.threads, launch_dims.min_dims.threads);
+      if (recommended_threads < max_threads &&
+          get_size(fit_threads) != recommended_threads)
+      {
 
+        fit_threads = fitHipDims(max_threads, launch_dims.dims.threads,
+                                 launch_dims.min_dims.threads);
       }
 
       launch_dims.dims.threads = fit_threads;
@@ -534,24 +570,25 @@ struct StatementExecutor<
 
       int use_blocks;
 
-      if ( launch_dims.num_threads() == recommended_threads ) {
+      if (launch_dims.num_threads() == recommended_threads)
+      {
 
         //
         // Fit the requested blocks
         //
         use_blocks = recommended_blocks;
-
-      } else {
+      }
+      else
+      {
 
         //
         // Fit the max blocks
         //
         use_blocks = max_blocks;
-
       }
 
-      launch_dims.dims.blocks = fitHipDims(
-          use_blocks, launch_dims.dims.blocks, launch_dims.min_dims.blocks);
+      launch_dims.dims.blocks = fitHipDims(use_blocks, launch_dims.dims.blocks,
+                                           launch_dims.min_dims.blocks);
 
       //
       // make sure that we fit
@@ -560,7 +597,8 @@ struct StatementExecutor<
       if(launch_dims.num_blocks() > max_blocks){
         RAJA_ABORT_OR_THROW("RAJA::kernel exceeds max num blocks");
       }*/
-      if(launch_dims.num_threads() > max_threads){
+      if (launch_dims.num_threads() > max_threads)
+      {
         RAJA_ABORT_OR_THROW("RAJA::kernel exceeds max num threads");
       }
 
@@ -574,14 +612,17 @@ struct StatementExecutor<
         // of the launch_dims and potential changes to shmem here that is
         // currently an unresolved issue.
         //
-        auto hip_data = RAJA::hip::make_launch_body(func,
-            launch_dims.dims.blocks, launch_dims.dims.threads, shmem, res, data);
+        auto hip_data = RAJA::hip::make_launch_body(
+            func, launch_dims.dims.blocks, launch_dims.dims.threads, shmem, res,
+            data);
 
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&hip_data};
-        RAJA::hip::launch(func, launch_dims.dims.blocks, launch_dims.dims.threads, args, shmem, res, launch_t::async);
+        void* args[] = {(void*)&hip_data};
+        RAJA::hip::launch(func, launch_dims.dims.blocks,
+                          launch_dims.dims.threads, args, shmem, res,
+                          launch_t::async);
       }
     }
   }
diff --git a/include/RAJA/policy/hip/kernel/Hyperplane.hpp b/include/RAJA/policy/hip/kernel/Hyperplane.hpp
index 5c428f03ab..1c594fe2d8 100644
--- a/include/RAJA/policy/hip/kernel/Hyperplane.hpp
+++ b/include/RAJA/policy/hip/kernel/Hyperplane.hpp
@@ -36,38 +36,36 @@ namespace internal
 {
 
 
-template <typename Data,
-          camp::idx_t HpArgumentId,
-          camp::idx_t... Args,
-          typename... EnclosedStmts,
-          typename Types>
-struct HipStatementExecutor<Data,
-                             statement::Hyperplane<HpArgumentId,
-                                                   seq_exec,
-                                                   ArgList<Args...>,
-                                                   EnclosedStmts...>,
-                            Types> {
+template<typename Data,
+         camp::idx_t HpArgumentId,
+         camp::idx_t... Args,
+         typename... EnclosedStmts,
+         typename Types>
+struct HipStatementExecutor<
+    Data,
+    statement::
+        Hyperplane<HpArgumentId, seq_exec, ArgList<Args...>, EnclosedStmts...>,
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, HpArgumentId, Data>;
 
-  using enclosed_stmts_t = HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
+  using enclosed_stmts_t =
+      HipStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // compute Manhattan distance of iteration space to determine
     // as:  hp_len = l0 + l1 + l2 + ...
-    int hp_len = segment_length<HpArgumentId>(data) +
-                 foldl(RAJA::operators::plus<int>(),
-                               segment_length<Args>(data)...);
+    int hp_len =
+        segment_length<HpArgumentId>(data) +
+        foldl(RAJA::operators::plus<int>(), segment_length<Args>(data)...);
 
     int h_args = foldl(RAJA::operators::plus<idx_t>(),
-        camp::get<Args>(data.offset_tuple)...);
+                       camp::get<Args>(data.offset_tuple)...);
 
     // get length of i dimension
     auto i_len = segment_length<HpArgumentId>(data);
@@ -79,7 +77,8 @@ struct HipStatementExecutor<Data,
      * We reject the iterations that lie outside of the specified rectangular
      * region we are sweeping.
      */
-    for (int h = 0; h < hp_len; ++h) {
+    for (int h = 0; h < hp_len; ++h)
+    {
 
       // compute actual iterate for HpArgumentId
       // as:  i0 = h - (i1 + i2 + i3 + ...)
@@ -92,19 +91,13 @@ struct HipStatementExecutor<Data,
     }
   }
 
-
-
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     return enclosed_stmts_t::calculateDimensions(data);
   }
 };
 
 
-
-
 }  // end namespace internal
 
 }  // end namespace RAJA
diff --git a/include/RAJA/policy/hip/kernel/InitLocalMem.hpp b/include/RAJA/policy/hip/kernel/InitLocalMem.hpp
index bbb8d6081b..c2604b34f5 100644
--- a/include/RAJA/policy/hip/kernel/InitLocalMem.hpp
+++ b/include/RAJA/policy/hip/kernel/InitLocalMem.hpp
@@ -39,27 +39,29 @@ struct hip_shared_mem;
 namespace internal
 {
 
-//Intialize thread shared array
-template <typename Data, camp::idx_t... Indices, typename... EnclosedStmts, typename Types>
+// Initialize thread shared array
+template<typename Data,
+         camp::idx_t... Indices,
+         typename... EnclosedStmts,
+         typename Types>
 struct HipStatementExecutor<Data,
-                            statement::InitLocalMem<RAJA::hip_shared_mem, camp::idx_seq<Indices...>,
-                            EnclosedStmts...>,
+                            statement::InitLocalMem<RAJA::hip_shared_mem,
+                                                    camp::idx_seq<Indices...>,
+                                                    EnclosedStmts...>,
                             Types>
 {
 
-  using stmt_list_t = StatementList<EnclosedStmts...>;
+  using stmt_list_t      = StatementList<EnclosedStmts...>;
   using enclosed_stmts_t = HipStatementListExecutor<Data, stmt_list_t, Types>;
 
-
-  //Launch loops
+  // Launch loops
   template<camp::idx_t Pos>
-  static
-  inline
-  RAJA_DEVICE
-  void initMem(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void initMem(Data& data, bool thread_active)
   {
-    using varType = typename camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
-    const camp::idx_t NumElem = camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
+    using varType = typename camp::tuple_element_t<
+        Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
+    const camp::idx_t NumElem = camp::tuple_element_t<
+        Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
 
     __shared__ varType Array[NumElem];
     camp::get<Pos>(data.param_tuple).set_data(&Array[0]);
@@ -67,88 +69,77 @@ struct HipStatementExecutor<Data,
     enclosed_stmts_t::exec(data, thread_active);
   }
 
-  //Intialize local array
-  //Identifies type + number of elements needed
+  // Initialize local array
+  // Identifies type + number of elements needed
   template<camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
-  static
-  inline
-  RAJA_DEVICE
-  void initMem(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void initMem(Data& data, bool thread_active)
   {
-    using varType = typename camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
-    const camp::idx_t NumElem = camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
+    using varType = typename camp::tuple_element_t<
+        Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
+    const camp::idx_t NumElem = camp::tuple_element_t<
+        Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
 
     __shared__ varType Array[NumElem];
     camp::get<Pos>(data.param_tuple).set_data(&Array[0]);
     initMem<other0, others...>(data, thread_active);
   }
 
-  //Set pointer to null base case
+  // Set pointer to null base case
   template<camp::idx_t Pos>
-  static
-  inline
-  RAJA_DEVICE
-  void setPtrToNull(Data &data)
+  static inline RAJA_DEVICE void setPtrToNull(Data& data)
   {
 
     camp::get<Pos>(data.param_tuple).set_data(nullptr);
   }
 
-
-  //Set pointer to null recursive case
+  // Set pointer to null recursive case
   template<camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
-  static
-  inline
-  RAJA_DEVICE
-  void setPtrToNull(Data &data)
+  static inline RAJA_DEVICE void setPtrToNull(Data& data)
   {
 
     camp::get<Pos>(data.param_tuple).set_data(nullptr);
     setPtrToNull<other0, others...>(data);
   }
 
-
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
 
-    //Intialize scoped arrays + launch loops
+    // Initialize scoped arrays + launch loops
     initMem<Indices...>(data, thread_active);
 
-    //set pointers in scoped arrays to null
+    // set pointers in scoped arrays to null
     setPtrToNull<Indices...>(data);
   }
 
-
-  inline
-  static
-  LaunchDims calculateDimensions(Data const &data)
+  inline static LaunchDims calculateDimensions(Data const& data)
   {
     return enclosed_stmts_t::calculateDimensions(data);
   }
-
 };
 
-//Intialize thread private array
-template <typename Data, camp::idx_t... Indices, typename... EnclosedStmts, typename Types>
-struct HipStatementExecutor<Data, statement::InitLocalMem<RAJA::hip_thread_mem, camp::idx_seq<Indices...>, EnclosedStmts...>, Types>
+// Initialize thread private array
+template<typename Data,
+         camp::idx_t... Indices,
+         typename... EnclosedStmts,
+         typename Types>
+struct HipStatementExecutor<Data,
+                            statement::InitLocalMem<RAJA::hip_thread_mem,
+                                                    camp::idx_seq<Indices...>,
+                                                    EnclosedStmts...>,
+                            Types>
 {
 
-  using stmt_list_t = StatementList<EnclosedStmts...>;
+  using stmt_list_t      = StatementList<EnclosedStmts...>;
   using enclosed_stmts_t = HipStatementListExecutor<Data, stmt_list_t, Types>;
 
-
-  //Launch loops
+  // Launch loops
   template<camp::idx_t Pos>
-  static
-  inline
-  RAJA_DEVICE
-  void initMem(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void initMem(Data& data, bool thread_active)
   {
-    using varType = typename camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
-    const camp::idx_t NumElem = camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
+    using varType = typename camp::tuple_element_t<
+        Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
+    const camp::idx_t NumElem = camp::tuple_element_t<
+        Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
 
     varType Array[NumElem];
     camp::get<Pos>(data.param_tuple).set_data(&Array[0]);
@@ -156,72 +147,55 @@ struct HipStatementExecutor<Data, statement::InitLocalMem<RAJA::hip_thread_mem,
     enclosed_stmts_t::exec(data, thread_active);
   }
 
-  //Intialize local array
-  //Identifies type + number of elements needed
+  // Initialize local array
+  // Identifies type + number of elements needed
   template<camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
-  static
-  inline
-  RAJA_DEVICE
-  void initMem(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void initMem(Data& data, bool thread_active)
   {
-    using varType = typename camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
-    const camp::idx_t NumElem = camp::tuple_element_t<Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
+    using varType = typename camp::tuple_element_t<
+        Pos, typename camp::decay<Data>::param_tuple_t>::value_type;
+    const camp::idx_t NumElem = camp::tuple_element_t<
+        Pos, typename camp::decay<Data>::param_tuple_t>::layout_type::s_size;
 
     varType Array[NumElem];
     camp::get<Pos>(data.param_tuple).set_data(&Array[0]);
     initMem<other0, others...>(data, thread_active);
   }
 
-  //Set pointer to null base case
+  // Set pointer to null base case
   template<camp::idx_t Pos>
-  static
-  inline
-  RAJA_DEVICE
-  void setPtrToNull(Data &data)
+  static inline RAJA_DEVICE void setPtrToNull(Data& data)
   {
 
     camp::get<Pos>(data.param_tuple).set_data(nullptr);
   }
 
-
-  //Set pointer to null recursive case
+  // Set pointer to null recursive case
   template<camp::idx_t Pos, camp::idx_t other0, camp::idx_t... others>
-  static
-  inline
-  RAJA_DEVICE
-  void setPtrToNull(Data &data)
+  static inline RAJA_DEVICE void setPtrToNull(Data& data)
   {
 
     camp::get<Pos>(data.param_tuple).set_data(nullptr);
     setPtrToNull<other0, others...>(data);
   }
 
-
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
 
-    //Intialize scoped arrays + launch loops
+    // Initialize scoped arrays + launch loops
     initMem<Indices...>(data, thread_active);
 
-    //set pointers in scoped arrays to null
+    // set pointers in scoped arrays to null
     setPtrToNull<Indices...>(data);
   }
 
-
-  inline
-  static
-  LaunchDims calculateDimensions(Data const &data)
+  inline static LaunchDims calculateDimensions(Data const& data)
   {
     return enclosed_stmts_t::calculateDimensions(data);
   }
-
 };
 
 
-
 }  // namespace internal
 }  // end namespace RAJA
 
diff --git a/include/RAJA/policy/hip/kernel/Lambda.hpp b/include/RAJA/policy/hip/kernel/Lambda.hpp
index d04fb11bf6..c98ae2e55e 100644
--- a/include/RAJA/policy/hip/kernel/Lambda.hpp
+++ b/include/RAJA/policy/hip/kernel/Lambda.hpp
@@ -34,36 +34,38 @@
 #include "RAJA/pattern/kernel.hpp"
 #include "RAJA/pattern/kernel/Lambda.hpp"
 
-
 namespace RAJA
 {
 namespace internal
 {
 
-template <typename Data, camp::idx_t LambdaIndex, typename... Args, typename Types>
-struct HipStatementExecutor<Data, statement::Lambda<LambdaIndex, Args...>, Types> {
+template<typename Data,
+         camp::idx_t LambdaIndex,
+         typename... Args,
+         typename Types>
+struct HipStatementExecutor<Data,
+                            statement::Lambda<LambdaIndex, Args...>,
+                            Types>
+{
 
-  static
-  inline RAJA_DEVICE void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Only execute the lambda if it hasn't been masked off
-    if(thread_active){
-      StatementExecutor<statement::Lambda<LambdaIndex, Args...>, Types>::exec(data);
+    if (thread_active)
+    {
+      StatementExecutor<statement::Lambda<LambdaIndex, Args...>, Types>::exec(
+          data);
     }
   }
 
-
-  static
-  inline
-  LaunchDims calculateDimensions(Data const & RAJA_UNUSED_ARG(data))
+  static inline LaunchDims calculateDimensions(
+      Data const& RAJA_UNUSED_ARG(data))
   {
     return LaunchDims();
   }
 };
 
 
-
-
 }  // namespace internal
 }  // namespace RAJA
 
diff --git a/include/RAJA/policy/hip/kernel/Reduce.hpp b/include/RAJA/policy/hip/kernel/Reduce.hpp
index a518073e7c..3c9eeb3610 100644
--- a/include/RAJA/policy/hip/kernel/Reduce.hpp
+++ b/include/RAJA/policy/hip/kernel/Reduce.hpp
@@ -23,7 +23,6 @@
 
 #include "RAJA/policy/hip/kernel/internal.hpp"
 
-
 namespace RAJA
 {
 
@@ -34,32 +33,34 @@ namespace internal
 //
 // Executor that handles reductions across a single HIP thread block
 //
-template <typename Data,
-          template <typename...> class ReduceOperator,
-          typename ParamId,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         template<typename...>
+         class ReduceOperator,
+         typename ParamId,
+         typename... EnclosedStmts,
+         typename Types>
 struct HipStatementExecutor<Data,
-                             statement::Reduce<RAJA::hip_block_reduce,
-                                               ReduceOperator,
-                                               ParamId,
-                                               EnclosedStmts...>,
-                           Types> {
+                            statement::Reduce<RAJA::hip_block_reduce,
+                                              ReduceOperator,
+                                              ParamId,
+                                              EnclosedStmts...>,
+                            Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
   using enclosed_stmts_t = HipStatementListExecutor<Data, stmt_list_t, Types>;
 
-
-  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // block reduce on the specified parameter
-    auto value = data.template get_param<ParamId>();
+    auto value    = data.template get_param<ParamId>();
     using value_t = decltype(value);
     value_t ident = value_t();
 
     // if this thread isn't active, just set it to the identity
-    if (!thread_active) {
+    if (!thread_active)
+    {
       value = ident;
     }
 
@@ -67,21 +68,20 @@ struct HipStatementExecutor<Data,
     // reduction objects
     using combiner_t =
         RAJA::reduce::detail::op_adapter<value_t, ReduceOperator>;
-    value_t new_value =
-        RAJA::hip::impl::block_reduce<combiner_t>(value, ident);
+    value_t new_value = RAJA::hip::impl::block_reduce<combiner_t>(value, ident);
 
 
     // execute enclosed statements, and mask off everyone but thread 0
     thread_active = threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0;
-    if(thread_active){
+    if (thread_active)
+    {
       // Only update to new value on root thread
       data.template assign_param<ParamId>(new_value);
     }
     enclosed_stmts_t::exec(data, thread_active);
   }
 
-
-  static inline LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // combine with enclosed statements
     LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
@@ -89,57 +89,57 @@ struct HipStatementExecutor<Data,
   }
 };
 
-
 //
 // Executor that handles reductions across a single HIP thread warp
 //
-template <typename Data,
-          template <typename...> class ReduceOperator,
-          typename ParamId,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         template<typename...>
+         class ReduceOperator,
+         typename ParamId,
+         typename... EnclosedStmts,
+         typename Types>
 struct HipStatementExecutor<Data,
-                             statement::Reduce<RAJA::hip_warp_reduce,
-                                               ReduceOperator,
-                                               ParamId,
-                                               EnclosedStmts...>,
-                            Types> {
+                            statement::Reduce<RAJA::hip_warp_reduce,
+                                              ReduceOperator,
+                                              ParamId,
+                                              EnclosedStmts...>,
+                            Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
   using enclosed_stmts_t = HipStatementListExecutor<Data, stmt_list_t, Types>;
 
-
-  static inline RAJA_DEVICE void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // block reduce on the specified parameter
-    auto value = data.template get_param<ParamId>();
+    auto value    = data.template get_param<ParamId>();
     using value_t = decltype(value);
     value_t ident = value_t();
 
     // if this thread isn't active, just set it to the identity
-    if (!thread_active) {
+    if (!thread_active)
+    {
       value = ident;
     }
 
     // Call warp reduction routine
     using combiner_t =
         RAJA::reduce::detail::op_adapter<value_t, ReduceOperator>;
-    value_t new_value =
-        RAJA::hip::impl::warp_reduce<combiner_t>(value, ident);
+    value_t new_value = RAJA::hip::impl::warp_reduce<combiner_t>(value, ident);
     data.template assign_param<ParamId>(new_value);
 
     // execute enclosed statements, and mask off everyone but lane 0
     thread_active = threadIdx.x == 0;
-    if(thread_active){
+    if (thread_active)
+    {
       // Only update to new value on root thread
       data.template assign_param<ParamId>(new_value);
     }
     enclosed_stmts_t::exec(data, thread_active);
   }
 
-
-  static inline LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // combine with enclosed statements
     LaunchDims enclosed_dims = enclosed_stmts_t::calculateDimensions(data);
@@ -148,7 +148,6 @@ struct HipStatementExecutor<Data,
 };
 
 
-
 }  // namespace internal
 }  // end namespace RAJA
 
diff --git a/include/RAJA/policy/hip/kernel/Sync.hpp b/include/RAJA/policy/hip/kernel/Sync.hpp
index d54a5ccf83..c45ec4b186 100644
--- a/include/RAJA/policy/hip/kernel/Sync.hpp
+++ b/include/RAJA/policy/hip/kernel/Sync.hpp
@@ -34,7 +34,6 @@
 #include "RAJA/util/macros.hpp"
 #include "RAJA/util/types.hpp"
 
-
 namespace RAJA
 {
 namespace statement
@@ -43,49 +42,45 @@ namespace statement
 /*!
  * A RAJA::kernel statement that performs a HIP __syncthreads().
  */
-struct HipSyncThreads : public internal::Statement<camp::nil> {
-};
+struct HipSyncThreads : public internal::Statement<camp::nil>
+{};
 
 /*!
  * A RAJA::kernel statement that performs a HIP __syncwarp().
  */
-struct HipSyncWarp : public internal::Statement<camp::nil> {
-};
+struct HipSyncWarp : public internal::Statement<camp::nil>
+{};
 
 }  // namespace statement
 
 namespace internal
 {
 
-template <typename Data, typename Types>
-struct HipStatementExecutor<Data, statement::HipSyncThreads, Types> {
-
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &, bool) { __syncthreads(); }
+template<typename Data, typename Types>
+struct HipStatementExecutor<Data, statement::HipSyncThreads, Types>
+{
 
+  static inline RAJA_DEVICE void exec(Data&, bool) { __syncthreads(); }
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const & RAJA_UNUSED_ARG(data))
+  static inline LaunchDims calculateDimensions(
+      Data const& RAJA_UNUSED_ARG(data))
   {
     return LaunchDims();
   }
 };
 
-template <typename Data, typename Types>
-struct HipStatementExecutor<Data, statement::HipSyncWarp, Types> {
+template<typename Data, typename Types>
+struct HipStatementExecutor<Data, statement::HipSyncWarp, Types>
+{
 
-  static
-  inline
-  RAJA_DEVICE
-  //not currently supported
-  void exec(Data &, bool) {  }
+  static inline RAJA_DEVICE
+      // not currently supported
+      void
+      exec(Data&, bool)
+  {}
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const & RAJA_UNUSED_ARG(data))
+  static inline LaunchDims calculateDimensions(
+      Data const& RAJA_UNUSED_ARG(data))
   {
     return LaunchDims();
   }
diff --git a/include/RAJA/policy/hip/kernel/Tile.hpp b/include/RAJA/policy/hip/kernel/Tile.hpp
index 62dda7f20d..5805370be0 100644
--- a/include/RAJA/policy/hip/kernel/Tile.hpp
+++ b/include/RAJA/policy/hip/kernel/Tile.hpp
@@ -47,21 +47,23 @@ namespace internal
  * Assigns the tile segment to segment ArgumentId
  * Meets all sync requirements
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          camp::idx_t chunk_size,
-          typename IndexMapper,
-          kernel_sync_requirement sync,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         camp::idx_t chunk_size,
+         typename IndexMapper,
+         kernel_sync_requirement sync,
+         typename... EnclosedStmts,
+         typename Types>
 struct HipStatementExecutor<
     Data,
     statement::Tile<ArgumentId,
                     RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::hip::hip_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+                    RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
+                                                   sync,
+                                                   IndexMapper>,
                     EnclosedStmts...>,
-                    Types>
-  {
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -69,19 +71,21 @@ struct HipStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  using DimensionCalculator = KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::Direct, sync, IndexMapper>>;
+  using DimensionCalculator = KernelDimensionCalculator<
+      RAJA::policy::hip::
+          hip_indexer<iteration_mapping::Direct, sync, IndexMapper>>;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     using segment_t = camp::decay<decltype(segment)>;
 
     // compute trip count
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i = IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
+    const diff_t i =
+        IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
 
     // execute enclosed statements if any thread will
     // but mask off threads without work
@@ -100,23 +104,23 @@ struct HipStatementExecutor<
     segment = orig_segment;
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // Compute how many chunks
     const diff_t full_len = segment_length<ArgumentId>(data);
-    const diff_t len = RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
+    const diff_t len =
+        RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
 
     HipDims my_dims(0), my_min_dims(0);
-    DimensionCalculator{}.set_dimensions(my_dims, my_min_dims, len);
-    LaunchDims dims{my_dims, my_min_dims};
+    DimensionCalculator {}.set_dimensions(my_dims, my_min_dims, len);
+    LaunchDims dims {my_dims, my_min_dims};
 
     // privatize data, so we can mess with the segments
-    using data_t = camp::decay<Data>;
+    using data_t        = camp::decay<Data>;
     data_t private_data = data;
 
     // Get original segment
-    auto &segment = camp::get<ArgumentId>(private_data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(private_data.segment_tuple);
 
     // restrict to first tile
     segment = segment.slice(0, static_cast<diff_t>(chunk_size));
@@ -133,19 +137,24 @@ struct HipStatementExecutor<
  * Assigns the tile segment to segment ArgumentId
  * Meets all sync requirements
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          camp::idx_t chunk_size,
-          typename IndexMapper,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         camp::idx_t chunk_size,
+         typename IndexMapper,
+         typename... EnclosedStmts,
+         typename Types>
 struct HipStatementExecutor<
     Data,
-    statement::Tile<ArgumentId,
-                    RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                    EnclosedStmts...>, Types>
-  {
+    statement::Tile<
+        ArgumentId,
+        RAJA::tile_fixed<chunk_size>,
+        RAJA::policy::hip::hip_indexer<
+            iteration_mapping::StridedLoop<named_usage::unspecified>,
+            kernel_sync_requirement::sync,
+            IndexMapper>,
+        EnclosedStmts...>,
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -153,26 +162,32 @@ struct HipStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  using DimensionCalculator = KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>>;
+  using DimensionCalculator =
+      KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+          iteration_mapping::StridedLoop<named_usage::unspecified>,
+          kernel_sync_requirement::sync,
+          IndexMapper>>;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Keep copy of original segment, so we can restore it
-    using segment_t = camp::decay<decltype(segment)>;
+    using segment_t        = camp::decay<decltype(segment)>;
     segment_t orig_segment = segment;
 
     // compute trip count
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i_init = IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
-    const diff_t i_stride = IndexMapper::template size<diff_t>() * static_cast<diff_t>(chunk_size);
+    const diff_t i_init =
+        IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
+    const diff_t i_stride =
+        IndexMapper::template size<diff_t>() * static_cast<diff_t>(chunk_size);
 
     // Iterate through in chunks
     // threads will have the same numbers of iterations
-    for (diff_t ii = 0; ii < len; ii += i_stride) {
+    for (diff_t ii = 0; ii < len; ii += i_stride)
+    {
       const diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -190,23 +205,23 @@ struct HipStatementExecutor<
     segment = orig_segment;
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // Compute how many chunks
     const diff_t full_len = segment_length<ArgumentId>(data);
-    const diff_t len = RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
+    const diff_t len =
+        RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
 
     HipDims my_dims(0), my_min_dims(0);
-    DimensionCalculator{}.set_dimensions(my_dims, my_min_dims, len);
-    LaunchDims dims{my_dims, my_min_dims};
+    DimensionCalculator {}.set_dimensions(my_dims, my_min_dims, len);
+    LaunchDims dims {my_dims, my_min_dims};
 
     // privatize data, so we can mess with the segments
-    using data_t = camp::decay<Data>;
+    using data_t        = camp::decay<Data>;
     data_t private_data = data;
 
     // Get original segment
-    auto &segment = camp::get<ArgumentId>(private_data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(private_data.segment_tuple);
 
     // restrict to first tile
     segment = segment.slice(0, chunk_size);
@@ -223,19 +238,24 @@ struct HipStatementExecutor<
  * Assigns the tile segment to segment ArgumentId
  * Meets no sync requirements
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          camp::idx_t chunk_size,
-          typename IndexMapper,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         camp::idx_t chunk_size,
+         typename IndexMapper,
+         typename... EnclosedStmts,
+         typename Types>
 struct HipStatementExecutor<
     Data,
-    statement::Tile<ArgumentId,
-                    RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                    EnclosedStmts...>, Types>
-  {
+    statement::Tile<
+        ArgumentId,
+        RAJA::tile_fixed<chunk_size>,
+        RAJA::policy::hip::hip_indexer<
+            iteration_mapping::StridedLoop<named_usage::unspecified>,
+            kernel_sync_requirement::none,
+            IndexMapper>,
+        EnclosedStmts...>,
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -243,26 +263,32 @@ struct HipStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  using DimensionCalculator = KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>>;
+  using DimensionCalculator =
+      KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+          iteration_mapping::StridedLoop<named_usage::unspecified>,
+          kernel_sync_requirement::none,
+          IndexMapper>>;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Keep copy of original segment, so we can restore it
-    using segment_t = camp::decay<decltype(segment)>;
+    using segment_t        = camp::decay<decltype(segment)>;
     segment_t orig_segment = segment;
 
     // compute trip count
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i_init = IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
-    const diff_t i_stride = IndexMapper::template size<diff_t>() * static_cast<diff_t>(chunk_size);
+    const diff_t i_init =
+        IndexMapper::template index<diff_t>() * static_cast<diff_t>(chunk_size);
+    const diff_t i_stride =
+        IndexMapper::template size<diff_t>() * static_cast<diff_t>(chunk_size);
 
     // Iterate through one at a time
     // threads will have the different numbers of iterations
-    for (diff_t i = i_init; i < len; i += i_stride) {
+    for (diff_t i = i_init; i < len; i += i_stride)
+    {
 
       // Assign our new tiled segment
       segment = orig_segment.slice(i, static_cast<diff_t>(chunk_size));
@@ -275,23 +301,23 @@ struct HipStatementExecutor<
     segment = orig_segment;
   }
 
-  static inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // Compute how many chunks
     const diff_t full_len = segment_length<ArgumentId>(data);
-    const diff_t len = RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
+    const diff_t len =
+        RAJA_DIVIDE_CEILING_INT(full_len, static_cast<diff_t>(chunk_size));
 
     HipDims my_dims(0), my_min_dims(0);
-    DimensionCalculator{}.set_dimensions(my_dims, my_min_dims, len);
-    LaunchDims dims{my_dims, my_min_dims};
+    DimensionCalculator {}.set_dimensions(my_dims, my_min_dims, len);
+    LaunchDims dims {my_dims, my_min_dims};
 
     // privatize data, so we can mess with the segments
-    using data_t = camp::decay<Data>;
+    using data_t        = camp::decay<Data>;
     data_t private_data = data;
 
     // Get original segment
-    auto &segment = camp::get<ArgumentId>(private_data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(private_data.segment_tuple);
 
     // restrict to first tile
     segment = segment.slice(0, chunk_size);
@@ -303,28 +329,34 @@ struct HipStatementExecutor<
   }
 };
 
-
 /*!
  * A specialized RAJA::kernel hip_impl executor for statement::Tile
  * Assigns the tile segment to segment ArgumentId
  *
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename TPol,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename TPol,
+         typename... EnclosedStmts,
+         typename Types>
 struct HipStatementExecutor<
     Data,
-    statement::Tile<ArgumentId, TPol, seq_exec, EnclosedStmts...>, Types>
-: HipStatementExecutor<Data, statement::Tile<ArgumentId, TPol,
-    RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                   kernel_sync_requirement::none,
-                                   hip::IndexGlobal<named_dim::x, named_usage::ignored, named_usage::ignored>>,
-    EnclosedStmts...>, Types>
-{
-
-};
+    statement::Tile<ArgumentId, TPol, seq_exec, EnclosedStmts...>,
+    Types>
+    : HipStatementExecutor<
+          Data,
+          statement::Tile<
+              ArgumentId,
+              TPol,
+              RAJA::policy::hip::hip_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::none,
+                  hip::IndexGlobal<named_dim::x,
+                                   named_usage::ignored,
+                                   named_usage::ignored>>,
+              EnclosedStmts...>,
+          Types>
+{};
 
 }  // end namespace internal
 }  // end namespace RAJA
diff --git a/include/RAJA/policy/hip/kernel/TileTCount.hpp b/include/RAJA/policy/hip/kernel/TileTCount.hpp
index 07637fbd8f..90e21bb46c 100644
--- a/include/RAJA/policy/hip/kernel/TileTCount.hpp
+++ b/include/RAJA/policy/hip/kernel/TileTCount.hpp
@@ -48,52 +48,59 @@ namespace internal
  * Assigns the tile index to param ParamId
  * Meets all sync requirements
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename ParamId,
-          camp::idx_t chunk_size,
-          typename IndexMapper,
-          kernel_sync_requirement sync,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename ParamId,
+         camp::idx_t chunk_size,
+         typename IndexMapper,
+         kernel_sync_requirement sync,
+         typename... EnclosedStmts,
+         typename Types>
 struct HipStatementExecutor<
     Data,
-    statement::TileTCount<ArgumentId, ParamId,
-                    RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::hip::hip_indexer<iteration_mapping::Direct, sync, IndexMapper>,
-                    EnclosedStmts...>,
-                    Types>
+    statement::TileTCount<
+        ArgumentId,
+        ParamId,
+        RAJA::tile_fixed<chunk_size>,
+        RAJA::policy::hip::
+            hip_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+        EnclosedStmts...>,
+    Types>
     : public HipStatementExecutor<
-        Data,
-        statement::Tile<ArgumentId,
-                        RAJA::tile_fixed<chunk_size>,
-                        RAJA::policy::hip::hip_indexer<iteration_mapping::Direct, sync, IndexMapper>,
-                        EnclosedStmts...>,
-                        Types> {
+          Data,
+          statement::Tile<
+              ArgumentId,
+              RAJA::tile_fixed<chunk_size>,
+              RAJA::policy::hip::
+                  hip_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+              EnclosedStmts...>,
+          Types>
+{
 
   using Base = HipStatementExecutor<
       Data,
       statement::Tile<ArgumentId,
                       RAJA::tile_fixed<chunk_size>,
-                      RAJA::policy::hip::hip_indexer<iteration_mapping::Direct, sync, IndexMapper>,
+                      RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
+                                                     sync,
+                                                     IndexMapper>,
                       EnclosedStmts...>,
-                      Types>;
+      Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     using segment_t = camp::decay<decltype(segment)>;
 
     // compute trip count
     const diff_t len = segment.end() - segment.begin();
-    const diff_t t = IndexMapper::template index<diff_t>();
-    const diff_t i = t * static_cast<diff_t>(chunk_size);
+    const diff_t t   = IndexMapper::template index<diff_t>();
+    const diff_t i   = t * static_cast<diff_t>(chunk_size);
 
     // execute enclosed statements if any thread will
     // but mask off threads without work
@@ -120,59 +127,73 @@ struct HipStatementExecutor<
  * Assigns the tile index to param ParamId
  * Meets all sync requirements
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename ParamId,
-          camp::idx_t chunk_size,
-          typename IndexMapper,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename ParamId,
+         camp::idx_t chunk_size,
+         typename IndexMapper,
+         typename... EnclosedStmts,
+         typename Types>
 struct HipStatementExecutor<
     Data,
-    statement::TileTCount<ArgumentId, ParamId,
-                    RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                    EnclosedStmts...>,
-                    Types>
+    statement::TileTCount<
+        ArgumentId,
+        ParamId,
+        RAJA::tile_fixed<chunk_size>,
+        RAJA::policy::hip::hip_indexer<
+            iteration_mapping::StridedLoop<named_usage::unspecified>,
+            kernel_sync_requirement::sync,
+            IndexMapper>,
+        EnclosedStmts...>,
+    Types>
     : public HipStatementExecutor<
-        Data,
-        statement::Tile<ArgumentId,
-                        RAJA::tile_fixed<chunk_size>,
-                        RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                        EnclosedStmts...>,
-                        Types> {
+          Data,
+          statement::Tile<
+              ArgumentId,
+              RAJA::tile_fixed<chunk_size>,
+              RAJA::policy::hip::hip_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::sync,
+                  IndexMapper>,
+              EnclosedStmts...>,
+          Types>
+{
 
   using Base = HipStatementExecutor<
       Data,
-      statement::Tile<ArgumentId,
-                      RAJA::tile_fixed<chunk_size>,
-                      RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::sync, IndexMapper>,
-                      EnclosedStmts...>,
-                      Types>;
+      statement::Tile<
+          ArgumentId,
+          RAJA::tile_fixed<chunk_size>,
+          RAJA::policy::hip::hip_indexer<
+              iteration_mapping::StridedLoop<named_usage::unspecified>,
+              kernel_sync_requirement::sync,
+              IndexMapper>,
+          EnclosedStmts...>,
+      Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Keep copy of original segment, so we can restore it
-    using segment_t = camp::decay<decltype(segment)>;
+    using segment_t        = camp::decay<decltype(segment)>;
     segment_t orig_segment = segment;
 
     // compute trip count
-    const diff_t len = segment.end() - segment.begin();
-    const diff_t t_init = IndexMapper::template index<diff_t>();
-    const diff_t i_init = t_init * static_cast<diff_t>(chunk_size);
+    const diff_t len      = segment.end() - segment.begin();
+    const diff_t t_init   = IndexMapper::template index<diff_t>();
+    const diff_t i_init   = t_init * static_cast<diff_t>(chunk_size);
     const diff_t t_stride = IndexMapper::template size<diff_t>();
     const diff_t i_stride = t_stride * static_cast<diff_t>(chunk_size);
 
     // Iterate through in chunks
     // threads will have the same numbers of iterations
-    for(diff_t ii = 0, t = t_init; ii < len; ii += i_stride, t += t_stride) {
+    for (diff_t ii = 0, t = t_init; ii < len; ii += i_stride, t += t_stride)
+    {
       const diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -198,59 +219,73 @@ struct HipStatementExecutor<
  * Assigns the tile index to param ParamId
  * Meets no sync requirements
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename ParamId,
-          camp::idx_t chunk_size,
-          typename IndexMapper,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename ParamId,
+         camp::idx_t chunk_size,
+         typename IndexMapper,
+         typename... EnclosedStmts,
+         typename Types>
 struct HipStatementExecutor<
     Data,
-    statement::TileTCount<ArgumentId, ParamId,
-                    RAJA::tile_fixed<chunk_size>,
-                    RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                    EnclosedStmts...>,
-                    Types>
+    statement::TileTCount<
+        ArgumentId,
+        ParamId,
+        RAJA::tile_fixed<chunk_size>,
+        RAJA::policy::hip::hip_indexer<
+            iteration_mapping::StridedLoop<named_usage::unspecified>,
+            kernel_sync_requirement::none,
+            IndexMapper>,
+        EnclosedStmts...>,
+    Types>
     : public HipStatementExecutor<
-        Data,
-        statement::Tile<ArgumentId,
-                        RAJA::tile_fixed<chunk_size>,
-                        RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                        EnclosedStmts...>,
-                        Types> {
+          Data,
+          statement::Tile<
+              ArgumentId,
+              RAJA::tile_fixed<chunk_size>,
+              RAJA::policy::hip::hip_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::none,
+                  IndexMapper>,
+              EnclosedStmts...>,
+          Types>
+{
 
   using Base = HipStatementExecutor<
       Data,
-      statement::Tile<ArgumentId,
-                      RAJA::tile_fixed<chunk_size>,
-                      RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>, kernel_sync_requirement::none, IndexMapper>,
-                      EnclosedStmts...>,
-                      Types>;
+      statement::Tile<
+          ArgumentId,
+          RAJA::tile_fixed<chunk_size>,
+          RAJA::policy::hip::hip_indexer<
+              iteration_mapping::StridedLoop<named_usage::unspecified>,
+              kernel_sync_requirement::none,
+              IndexMapper>,
+          EnclosedStmts...>,
+      Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static inline RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Keep copy of original segment, so we can restore it
-    using segment_t = camp::decay<decltype(segment)>;
+    using segment_t        = camp::decay<decltype(segment)>;
     segment_t orig_segment = segment;
 
     // compute trip count
-    const diff_t len = segment.end() - segment.begin();
-    const diff_t t_init = IndexMapper::template index<diff_t>();
-    const diff_t i_init = t_init * static_cast<diff_t>(chunk_size);
+    const diff_t len      = segment.end() - segment.begin();
+    const diff_t t_init   = IndexMapper::template index<diff_t>();
+    const diff_t i_init   = t_init * static_cast<diff_t>(chunk_size);
     const diff_t t_stride = IndexMapper::template size<diff_t>();
     const diff_t i_stride = t_stride * static_cast<diff_t>(chunk_size);
 
     // Iterate through one at a time
     // threads will have the different numbers of iterations
-    for(diff_t i = i_init, t = t_init; i < len; i += i_stride, t += t_stride) {
+    for (diff_t i = i_init, t = t_init; i < len; i += i_stride, t += t_stride)
+    {
 
       // Assign our new tiled segment
       segment = orig_segment.slice(i, static_cast<diff_t>(chunk_size));
@@ -265,29 +300,37 @@ struct HipStatementExecutor<
   }
 };
 
-
 /*!
  * A specialized RAJA::kernel hip_impl executor for statement::TileTCount
  * Assigns the tile segment to segment ArgumentId
  * Assigns the tile index to param ParamId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename ParamId,
-          typename TPol,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename ParamId,
+         typename TPol,
+         typename... EnclosedStmts,
+         typename Types>
 struct HipStatementExecutor<
     Data,
-    statement::TileTCount<ArgumentId, ParamId, TPol, seq_exec, EnclosedStmts...>, Types>
-: HipStatementExecutor<Data, statement::TileTCount<ArgumentId, ParamId, TPol,
-    RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                   kernel_sync_requirement::none,
-                                   hip::IndexGlobal<named_dim::x, named_usage::ignored, named_usage::ignored>>,
-    EnclosedStmts...>, Types>
-{
-
-};
+    statement::
+        TileTCount<ArgumentId, ParamId, TPol, seq_exec, EnclosedStmts...>,
+    Types>
+    : HipStatementExecutor<
+          Data,
+          statement::TileTCount<
+              ArgumentId,
+              ParamId,
+              TPol,
+              RAJA::policy::hip::hip_indexer<
+                  iteration_mapping::StridedLoop<named_usage::unspecified>,
+                  kernel_sync_requirement::none,
+                  hip::IndexGlobal<named_dim::x,
+                                   named_usage::ignored,
+                                   named_usage::ignored>>,
+              EnclosedStmts...>,
+          Types>
+{};
 
 }  // end namespace internal
 }  // end namespace RAJA
diff --git a/include/RAJA/policy/hip/kernel/internal.hpp b/include/RAJA/policy/hip/kernel/internal.hpp
index aa0610d736..82b8ed1bf1 100644
--- a/include/RAJA/policy/hip/kernel/internal.hpp
+++ b/include/RAJA/policy/hip/kernel/internal.hpp
@@ -37,36 +37,33 @@
 #include "RAJA/policy/hip/MemUtils_HIP.hpp"
 #include "RAJA/policy/hip/policy.hpp"
 
-
 namespace RAJA
 {
 
 namespace internal
 {
 
-struct LaunchDims {
+struct LaunchDims
+{
 
   HipDims dims;
   HipDims min_dims;
 
-  LaunchDims() = default;
-  LaunchDims(LaunchDims const&) = default;
+  LaunchDims()                             = default;
+  LaunchDims(LaunchDims const&)            = default;
   LaunchDims& operator=(LaunchDims const&) = default;
 
   RAJA_INLINE
-  LaunchDims(HipDims _dims)
-    : dims{_dims}
-    , min_dims{}
-  { }
+  LaunchDims(HipDims _dims) : dims {_dims}, min_dims {} {}
 
   RAJA_INLINE
   LaunchDims(HipDims _dims, HipDims _min_dims)
-    : dims{_dims}
-    , min_dims{_min_dims}
-  { }
+      : dims {_dims},
+        min_dims {_min_dims}
+  {}
 
   RAJA_INLINE
-  LaunchDims max(LaunchDims const &c) const
+  LaunchDims max(LaunchDims const& c) const
   {
     LaunchDims result;
 
@@ -82,51 +79,50 @@ struct LaunchDims {
     result.dims.threads.y = std::max(c.dims.threads.y, dims.threads.y);
     result.dims.threads.z = std::max(c.dims.threads.z, dims.threads.z);
 
-    result.min_dims.threads.x = std::max(c.min_dims.threads.x, min_dims.threads.x);
-    result.min_dims.threads.y = std::max(c.min_dims.threads.y, min_dims.threads.y);
-    result.min_dims.threads.z = std::max(c.min_dims.threads.z, min_dims.threads.z);
+    result.min_dims.threads.x =
+        std::max(c.min_dims.threads.x, min_dims.threads.x);
+    result.min_dims.threads.y =
+        std::max(c.min_dims.threads.y, min_dims.threads.y);
+    result.min_dims.threads.z =
+        std::max(c.min_dims.threads.z, min_dims.threads.z);
 
     return result;
   }
 
   RAJA_INLINE
-  int num_blocks() const {
-    return dims.num_blocks();
-  }
+  int num_blocks() const { return dims.num_blocks(); }
 
   RAJA_INLINE
-  int num_threads() const {
-    return dims.num_threads();
-  }
-
+  int num_threads() const { return dims.num_threads(); }
 
   RAJA_INLINE
-  void clamp_to_min_blocks() {
+  void clamp_to_min_blocks()
+  {
     dims.blocks.x = std::max(min_dims.blocks.x, dims.blocks.x);
     dims.blocks.y = std::max(min_dims.blocks.y, dims.blocks.y);
     dims.blocks.z = std::max(min_dims.blocks.z, dims.blocks.z);
   };
 
   RAJA_INLINE
-  void clamp_to_min_threads() {
+  void clamp_to_min_threads()
+  {
     dims.threads.x = std::max(min_dims.threads.x, dims.threads.x);
     dims.threads.y = std::max(min_dims.threads.y, dims.threads.y);
     dims.threads.z = std::max(min_dims.threads.z, dims.threads.z);
   };
-
 };
 
-
-template <camp::idx_t cur_stmt, camp::idx_t num_stmts, typename StmtList>
-struct HipStatementListExecutorHelper {
+template<camp::idx_t cur_stmt, camp::idx_t num_stmts, typename StmtList>
+struct HipStatementListExecutorHelper
+{
 
   using next_helper_t =
       HipStatementListExecutorHelper<cur_stmt + 1, num_stmts, StmtList>;
 
   using cur_stmt_t = camp::at_v<StmtList, cur_stmt>;
 
-  template <typename Data>
-  inline static RAJA_DEVICE void exec(Data &data, bool thread_active)
+  template<typename Data>
+  inline static RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Execute stmt
     cur_stmt_t::exec(data, thread_active);
@@ -135,9 +131,8 @@ struct HipStatementListExecutorHelper {
     next_helper_t::exec(data, thread_active);
   }
 
-
-  template <typename Data>
-  inline static LaunchDims calculateDimensions(Data &data)
+  template<typename Data>
+  inline static LaunchDims calculateDimensions(Data& data)
   {
     // Compute this statements launch dimensions
     LaunchDims statement_dims = cur_stmt_t::calculateDimensions(data);
@@ -150,65 +145,57 @@ struct HipStatementListExecutorHelper {
   }
 };
 
-template <camp::idx_t num_stmts, typename StmtList>
-struct HipStatementListExecutorHelper<num_stmts, num_stmts, StmtList> {
+template<camp::idx_t num_stmts, typename StmtList>
+struct HipStatementListExecutorHelper<num_stmts, num_stmts, StmtList>
+{
 
-  template <typename Data>
-  inline static RAJA_DEVICE void exec(Data &, bool)
+  template<typename Data>
+  inline static RAJA_DEVICE void exec(Data&, bool)
   {
     // nop terminator
   }
 
-  template <typename Data>
-  inline static LaunchDims calculateDimensions(Data &)
+  template<typename Data>
+  inline static LaunchDims calculateDimensions(Data&)
   {
     return LaunchDims();
   }
 };
 
 
-template <typename Data, typename Policy, typename Types>
+template<typename Data, typename Policy, typename Types>
 struct HipStatementExecutor;
 
-template <typename Data, typename StmtList, typename Types>
+template<typename Data, typename StmtList, typename Types>
 struct HipStatementListExecutor;
 
-
-template <typename Data, typename... Stmts, typename Types>
-struct HipStatementListExecutor<Data, StatementList<Stmts...>, Types> {
+template<typename Data, typename... Stmts, typename Types>
+struct HipStatementListExecutor<Data, StatementList<Stmts...>, Types>
+{
 
   using enclosed_stmts_t =
       camp::list<HipStatementExecutor<Data, Stmts, Types>...>;
 
   static constexpr size_t num_stmts = sizeof...(Stmts);
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data, bool thread_active)
   {
     // Execute statements in order with helper class
-    HipStatementListExecutorHelper<0, num_stmts, enclosed_stmts_t>::exec(data, thread_active);
+    HipStatementListExecutorHelper<0, num_stmts, enclosed_stmts_t>::exec(
+        data, thread_active);
   }
 
-
-
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // Compute this statements launch dimensions
-    return HipStatementListExecutorHelper<0, num_stmts, enclosed_stmts_t>::
-        calculateDimensions(data);
+    return HipStatementListExecutorHelper<
+        0, num_stmts, enclosed_stmts_t>::calculateDimensions(data);
   }
 };
 
-
-template <typename StmtList, typename Data, typename Types>
-using hip_statement_list_executor_t = HipStatementListExecutor<
-    Data,
-    StmtList,
-    Types>;
+template<typename StmtList, typename Data, typename Types>
+using hip_statement_list_executor_t =
+    HipStatementListExecutor<Data, StmtList, Types>;
 
 
 // specialization for direct sequential policies
@@ -217,268 +204,356 @@ struct KernelDimensionCalculator;
 
 // specialization for direct sequential policies
 template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>>>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::Direct,
+    sync,
+    hip::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>>>
 {
-  using IndexMapper = hip::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>;
+  using IndexMapper =
+      hip::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>;
 
-  template < typename IdxT >
-  static void set_dimensions(HipDims& RAJA_UNUSED_ARG(dims), HipDims& RAJA_UNUSED_ARG(min_dims), IdxT len)
+  template<typename IdxT>
+  static void set_dimensions(HipDims& RAJA_UNUSED_ARG(dims),
+                             HipDims& RAJA_UNUSED_ARG(min_dims),
+                             IdxT len)
   {
-    if ( len > static_cast<IdxT>(1) ) {
-      RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space");
+    if (len > static_cast<IdxT>(1))
+    {
+      RAJA_ABORT_OR_THROW(
+          "len exceeds the size of the directly mapped index space");
     }
   }
 };
 
 // specialization for direct thread policies
 template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>>>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::Direct,
+    sync,
+    hip::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>>>
 {
-  using IndexMapper = hip::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>;
+  using IndexMapper =
+      hip::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>;
 
-  template < typename IdxT >
+  template<typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
-    // BEWARE: if calculated block_size is too high then the kernel launch will fail
+    // BEWARE: if calculated block_size is too high then the kernel launch will
+    // fail
     set_hip_dim<dim>(dims.threads, static_cast<IdxT>(len));
     set_hip_dim<dim>(min_dims.threads, static_cast<IdxT>(len));
   }
 };
+
 ///
 template<named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>>>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::Direct,
+    sync,
+    hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>>>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>;
 
-  template < typename IdxT >
+  template<typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
-    if ( len > static_cast<IdxT>(IndexMapper::block_size) ) {
-      RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space");
+    if (len > static_cast<IdxT>(IndexMapper::block_size))
+    {
+      RAJA_ABORT_OR_THROW(
+          "len exceeds the size of the directly mapped index space");
     }
     set_hip_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_hip_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
+    set_hip_dim<dim>(min_dims.threads,
+                     static_cast<IdxT>(IndexMapper::block_size));
   }
 };
 
 // specialization for direct block policies
 template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>>>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::Direct,
+    sync,
+    hip::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>>>
 {
-  using IndexMapper = hip::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>;
+  using IndexMapper =
+      hip::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>;
 
-  template < typename IdxT >
+  template<typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
     set_hip_dim<dim>(dims.blocks, static_cast<IdxT>(len));
     set_hip_dim<dim>(min_dims.blocks, static_cast<IdxT>(len));
   }
 };
+
 ///
 template<named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>>>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::Direct,
+    sync,
+    hip::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>>>
 {
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = hip::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>;
 
-  template < typename IdxT >
+  template<typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
-    if ( len > static_cast<IdxT>(IndexMapper::grid_size) ) {
-      RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space");
+    if (len > static_cast<IdxT>(IndexMapper::grid_size))
+    {
+      RAJA_ABORT_OR_THROW(
+          "len exceeds the size of the directly mapped index space");
     }
     set_hip_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
-    set_hip_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_hip_dim<dim>(min_dims.blocks,
+                     static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 
 // specialization for direct global policies
 template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>>>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::Direct,
+    sync,
+    hip::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>>>
 {
-  using IndexMapper = hip::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
+  using IndexMapper =
+      hip::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
 
-  template < typename IdxT >
-  static void set_dimensions(HipDims& RAJA_UNUSED_ARG(dims), HipDims& RAJA_UNUSED_ARG(min_dims), IdxT len)
+  template<typename IdxT>
+  static void set_dimensions(HipDims& RAJA_UNUSED_ARG(dims),
+                             HipDims& RAJA_UNUSED_ARG(min_dims),
+                             IdxT len)
   {
-    if (len > static_cast<IdxT>(0)) {
+    if (len > static_cast<IdxT>(0))
+    {
       RAJA_ABORT_OR_THROW("must know one of block_size or grid_size");
     }
   }
 };
+
 ///
 template<named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>>>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::Direct,
+    sync,
+    hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>>>
 {
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
-  using IndexMapper = hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
+  using IndexMapper =
+      hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
 
-  template < typename IdxT >
+  template<typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
-    // BEWARE: if calculated block_size is too high then the kernel launch will fail
-    set_hip_dim<dim>(dims.threads, RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::grid_size)));
+    // BEWARE: if calculated block_size is too high then the kernel launch will
+    // fail
+    set_hip_dim<dim>(dims.threads,
+                     RAJA_DIVIDE_CEILING_INT(
+                         len, static_cast<IdxT>(IndexMapper::grid_size)));
     set_hip_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
-    set_hip_dim<dim>(min_dims.threads, RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::grid_size)));
-    set_hip_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_hip_dim<dim>(min_dims.threads,
+                     RAJA_DIVIDE_CEILING_INT(
+                         len, static_cast<IdxT>(IndexMapper::grid_size)));
+    set_hip_dim<dim>(min_dims.blocks,
+                     static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
+
 ///
 template<named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>>>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::Direct,
+    sync,
+    hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>>>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
-  using IndexMapper = hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
+  using IndexMapper =
+      hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
 
-  template < typename IdxT >
+  template<typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
     set_hip_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_hip_dim<dim>(dims.blocks, RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::block_size)));
-    set_hip_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_hip_dim<dim>(min_dims.blocks, RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::block_size)));
+    set_hip_dim<dim>(dims.blocks,
+                     RAJA_DIVIDE_CEILING_INT(
+                         len, static_cast<IdxT>(IndexMapper::block_size)));
+    set_hip_dim<dim>(min_dims.threads,
+                     static_cast<IdxT>(IndexMapper::block_size));
+    set_hip_dim<dim>(min_dims.blocks,
+                     RAJA_DIVIDE_CEILING_INT(
+                         len, static_cast<IdxT>(IndexMapper::block_size)));
   }
 };
+
 ///
-template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::Direct,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>>
+template<named_dim dim,
+         int BLOCK_SIZE,
+         int GRID_SIZE,
+         kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::Direct,
+    sync,
+    hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>;
 
-  template < typename IdxT >
+  template<typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
-    if ( len > (static_cast<IdxT>(IndexMapper::block_size) *
-                static_cast<IdxT>(IndexMapper::grid_size)) ) {
-      RAJA_ABORT_OR_THROW("len exceeds the size of the directly mapped index space");
+    if (len > (static_cast<IdxT>(IndexMapper::block_size) *
+               static_cast<IdxT>(IndexMapper::grid_size)))
+    {
+      RAJA_ABORT_OR_THROW(
+          "len exceeds the size of the directly mapped index space");
     }
     set_hip_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
     set_hip_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
-    set_hip_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_hip_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_hip_dim<dim>(min_dims.threads,
+                     static_cast<IdxT>(IndexMapper::block_size));
+    set_hip_dim<dim>(min_dims.blocks,
+                     static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 
-
 // specialization for strided loop sequential policies
 template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>>>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    hip::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>>>
 {
-  using IndexMapper = hip::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>;
-
-  template < typename IdxT >
-  static void set_dimensions(HipDims& RAJA_UNUSED_ARG(dims), HipDims& RAJA_UNUSED_ARG(min_dims), IdxT RAJA_UNUSED_ARG(len))
-  {
-  }
+  using IndexMapper =
+      hip::IndexGlobal<dim, named_usage::ignored, named_usage::ignored>;
+
+  template<typename IdxT>
+  static void set_dimensions(HipDims& RAJA_UNUSED_ARG(dims),
+                             HipDims& RAJA_UNUSED_ARG(min_dims),
+                             IdxT RAJA_UNUSED_ARG(len))
+  {}
 };
 
 // specialization for strided loop thread policies
 template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>>>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    hip::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>>>
 {
-  using IndexMapper = hip::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>;
+  using IndexMapper =
+      hip::IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>;
 
-  template < typename IdxT >
+  template<typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
-    // BEWARE: if calculated block_size is too high then the kernel launch will fail
+    // BEWARE: if calculated block_size is too high then the kernel launch will
+    // fail
     set_hip_dim<dim>(dims.threads, static_cast<IdxT>(len));
     set_hip_dim<dim>(min_dims.threads, static_cast<IdxT>(1));
   }
 };
+
 ///
 template<named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>>>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>>>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>;
 
-  template < typename IdxT >
-  static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT RAJA_UNUSED_ARG(len))
+  template<typename IdxT>
+  static void set_dimensions(HipDims& dims,
+                             HipDims& min_dims,
+                             IdxT RAJA_UNUSED_ARG(len))
   {
     set_hip_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_hip_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
+    set_hip_dim<dim>(min_dims.threads,
+                     static_cast<IdxT>(IndexMapper::block_size));
   }
 };
 
 // specialization for strided loop block policies
 template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>>>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    hip::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>>>
 {
-  using IndexMapper = hip::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>;
+  using IndexMapper =
+      hip::IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>;
 
-  template < typename IdxT >
+  template<typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
     set_hip_dim<dim>(dims.blocks, static_cast<IdxT>(len));
     set_hip_dim<dim>(min_dims.blocks, static_cast<IdxT>(1));
   }
 };
+
 ///
 template<named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>>>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    hip::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>>>
 {
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = hip::IndexGlobal<dim, named_usage::ignored, GRID_SIZE>;
 
-  template < typename IdxT >
-  static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT RAJA_UNUSED_ARG(len))
+  template<typename IdxT>
+  static void set_dimensions(HipDims& dims,
+                             HipDims& min_dims,
+                             IdxT RAJA_UNUSED_ARG(len))
   {
     set_hip_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
-    set_hip_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_hip_dim<dim>(min_dims.blocks,
+                     static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 
 // specialization for strided loop global policies
 template<named_dim dim, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>>>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    hip::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>>>
 {
-  using IndexMapper = hip::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
+  using IndexMapper =
+      hip::IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>;
 
-  template < typename IdxT >
+  template<typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
-    if (len > static_cast<IdxT>(0)) {
+    if (len > static_cast<IdxT>(0))
+    {
       set_hip_dim<dim>(dims.threads, static_cast<IdxT>(1));
       set_hip_dim<dim>(dims.blocks, static_cast<IdxT>(1));
       set_hip_dim<dim>(min_dims.threads, static_cast<IdxT>(1));
@@ -486,63 +561,93 @@ struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mappin
     }
   }
 };
+
 ///
 template<named_dim dim, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>>>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>>>
 {
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
-  using IndexMapper = hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
+  using IndexMapper =
+      hip::IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>;
 
-  template < typename IdxT >
+  template<typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
-    // BEWARE: if calculated block_size is too high then the kernel launch will fail
-    set_hip_dim<dim>(dims.threads, RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::grid_size)));
+    // BEWARE: if calculated block_size is too high then the kernel launch will
+    // fail
+    set_hip_dim<dim>(dims.threads,
+                     RAJA_DIVIDE_CEILING_INT(
+                         len, static_cast<IdxT>(IndexMapper::grid_size)));
     set_hip_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
     set_hip_dim<dim>(min_dims.threads, static_cast<IdxT>(1));
-    set_hip_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_hip_dim<dim>(min_dims.blocks,
+                     static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
+
 ///
 template<named_dim dim, int BLOCK_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>>>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>>>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
-  using IndexMapper = hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
+  using IndexMapper =
+      hip::IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>;
 
-  template < typename IdxT >
+  template<typename IdxT>
   static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT len)
   {
     set_hip_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_hip_dim<dim>(dims.blocks, RAJA_DIVIDE_CEILING_INT(len, static_cast<IdxT>(IndexMapper::block_size)));
-    set_hip_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
+    set_hip_dim<dim>(dims.blocks,
+                     RAJA_DIVIDE_CEILING_INT(
+                         len, static_cast<IdxT>(IndexMapper::block_size)));
+    set_hip_dim<dim>(min_dims.threads,
+                     static_cast<IdxT>(IndexMapper::block_size));
     set_hip_dim<dim>(min_dims.blocks, static_cast<IdxT>(1));
   }
 };
+
 ///
-template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE, kernel_sync_requirement sync>
-struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                    sync,
-                                                    hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>>
+template<named_dim dim,
+         int BLOCK_SIZE,
+         int GRID_SIZE,
+         kernel_sync_requirement sync>
+struct KernelDimensionCalculator<RAJA::policy::hip::hip_indexer<
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    sync,
+    hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>>
 {
-  static_assert(BLOCK_SIZE > 0, "block size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
-  static_assert(GRID_SIZE > 0, "grid size must be > 0, named_usage::unspecified, or named_usage::ignored with kernel");
+  static_assert(BLOCK_SIZE > 0,
+                "block size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
+  static_assert(GRID_SIZE > 0,
+                "grid size must be > 0, named_usage::unspecified, or "
+                "named_usage::ignored with kernel");
 
   using IndexMapper = hip::IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>;
 
-  template < typename IdxT >
-  static void set_dimensions(HipDims& dims, HipDims& min_dims, IdxT RAJA_UNUSED_ARG(len))
+  template<typename IdxT>
+  static void set_dimensions(HipDims& dims,
+                             HipDims& min_dims,
+                             IdxT RAJA_UNUSED_ARG(len))
   {
     set_hip_dim<dim>(dims.threads, static_cast<IdxT>(IndexMapper::block_size));
     set_hip_dim<dim>(dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
-    set_hip_dim<dim>(min_dims.threads, static_cast<IdxT>(IndexMapper::block_size));
-    set_hip_dim<dim>(min_dims.blocks, static_cast<IdxT>(IndexMapper::grid_size));
+    set_hip_dim<dim>(min_dims.threads,
+                     static_cast<IdxT>(IndexMapper::block_size));
+    set_hip_dim<dim>(min_dims.blocks,
+                     static_cast<IdxT>(IndexMapper::grid_size));
   }
 };
 
diff --git a/include/RAJA/policy/hip/launch.hpp b/include/RAJA/policy/hip/launch.hpp
index 6823647b48..9b59eb1f3e 100644
--- a/include/RAJA/policy/hip/launch.hpp
+++ b/include/RAJA/policy/hip/launch.hpp
@@ -28,55 +28,62 @@
 namespace RAJA
 {
 
-template <typename BODY>
+template<typename BODY>
 __global__ void launch_global_fcn(BODY body_in)
 {
   LaunchContext ctx;
 
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
-  auto& body = privatizer.get_priv();
+  auto& body      = privatizer.get_priv();
 
-  //Set pointer to shared memory
+  // Set pointer to shared memory
   extern __shared__ char raja_shmem_ptr[];
   ctx.shared_mem_ptr = raja_shmem_ptr;
 
   body(ctx);
 }
 
-template <typename BODY, typename ReduceParams>
-__global__ void launch_new_reduce_global_fcn(BODY body_in, ReduceParams reduce_params)
+template<typename BODY, typename ReduceParams>
+__global__ void launch_new_reduce_global_fcn(BODY body_in,
+                                             ReduceParams reduce_params)
 {
   LaunchContext ctx;
 
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
-  auto& body = privatizer.get_priv();
+  auto& body      = privatizer.get_priv();
 
-  //Set pointer to shared memory
+  // Set pointer to shared memory
   extern __shared__ char raja_shmem_ptr[];
   ctx.shared_mem_ptr = raja_shmem_ptr;
 
-  RAJA::expt::invoke_body( reduce_params, body, ctx );
+  RAJA::expt::invoke_body(reduce_params, body, ctx);
 
-  //Using a flatten global policy as we may use all dimensions
-  RAJA::expt::ParamMultiplexer::combine<RAJA::hip_flatten_global_xyz_direct>(reduce_params);
+  // Using a flatten global policy as we may use all dimensions
+  RAJA::expt::ParamMultiplexer::combine<RAJA::hip_flatten_global_xyz_direct>(
+      reduce_params);
 }
 
-template <bool async>
-struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, named_usage::unspecified>> {
+template<bool async>
+struct LaunchExecute<
+    RAJA::policy::hip::hip_launch_t<async, named_usage::unspecified>>
+{
 
-  template <typename BODY_IN, typename ReduceParams>
-  static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                               RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                               RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
-  exec(RAJA::resources::Resource res, const LaunchParams &params,
-       const char *kernel_name, BODY_IN &&body_in, ReduceParams &RAJA_UNUSED_ARG(launch_reducers))
+  template<typename BODY_IN, typename ReduceParams>
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
+  exec(RAJA::resources::Resource res,
+       const LaunchParams& params,
+       const char* kernel_name,
+       BODY_IN&& body_in,
+       ReduceParams& RAJA_UNUSED_ARG(launch_reducers))
   {
     using BODY = camp::decay<BODY_IN>;
 
-    auto func = reinterpret_cast<const void*>(
-        &launch_global_fcn<BODY>);
+    auto func = reinterpret_cast<const void*>(&launch_global_fcn<BODY>);
 
     resources::Hip hip_res = res.get<RAJA::resources::Hip>();
 
@@ -84,18 +91,20 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, named_usage::unspeci
     // Compute the number of blocks and threads
     //
 
-    hip_dim_t gridSize{ static_cast<hip_dim_member_t>(params.teams.value[0]),
+    hip_dim_t gridSize {static_cast<hip_dim_member_t>(params.teams.value[0]),
                         static_cast<hip_dim_member_t>(params.teams.value[1]),
-                        static_cast<hip_dim_member_t>(params.teams.value[2]) };
+                        static_cast<hip_dim_member_t>(params.teams.value[2])};
 
-    hip_dim_t blockSize{ static_cast<hip_dim_member_t>(params.threads.value[0]),
-                         static_cast<hip_dim_member_t>(params.threads.value[1]),
-                         static_cast<hip_dim_member_t>(params.threads.value[2]) };
+    hip_dim_t blockSize {
+        static_cast<hip_dim_member_t>(params.threads.value[0]),
+        static_cast<hip_dim_member_t>(params.threads.value[1]),
+        static_cast<hip_dim_member_t>(params.threads.value[2])};
 
     // Only launch kernel if we have something to iterate over
     constexpr hip_dim_member_t zero = 0;
-    if ( gridSize.x  > zero && gridSize.y  > zero && gridSize.z  > zero &&
-         blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) {
+    if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero &&
+        blockSize.x > zero && blockSize.y > zero && blockSize.z > zero)
+    {
 
       RAJA_FT_BEGIN;
 
@@ -105,14 +114,16 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, named_usage::unspeci
         //
         // Privatize the loop_body, using make_launch_body to setup reductions
         //
-        BODY body = RAJA::hip::make_launch_body(func,
-            gridSize, blockSize, shared_mem_size, hip_res, std::forward<BODY_IN>(body_in));
+        BODY body = RAJA::hip::make_launch_body(func, gridSize, blockSize,
+                                                shared_mem_size, hip_res,
+                                                std::forward<BODY_IN>(body_in));
 
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&body};
-        RAJA::hip::launch(func, gridSize, blockSize, args, shared_mem_size, hip_res, async, kernel_name);
+        void* args[] = {(void*)&body};
+        RAJA::hip::launch(func, gridSize, blockSize, args, shared_mem_size,
+                          hip_res, async, kernel_name);
       }
 
       RAJA_FT_END;
@@ -121,14 +132,18 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, named_usage::unspeci
     return resources::EventProxy<resources::Resource>(res);
   }
 
-
- //Version with explicit reduction parameters..
-  template <typename BODY_IN, typename ReduceParams>
-  static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                               RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                               concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
-  exec(RAJA::resources::Resource res, const LaunchParams &launch_params,
-       const char *kernel_name, BODY_IN &&body_in, ReduceParams &launch_reducers)
+  // Version with explicit (non-empty) reduction parameters.
+  template<typename BODY_IN, typename ReduceParams>
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      concepts::negate<
+          RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
+  exec(RAJA::resources::Resource res,
+       const LaunchParams& launch_params,
+       const char* kernel_name,
+       BODY_IN&& body_in,
+       ReduceParams& launch_reducers)
   {
     using BODY = camp::decay<BODY_IN>;
 
@@ -141,45 +156,53 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, named_usage::unspeci
     // Compute the number of blocks and threads
     //
 
-    hip_dim_t gridSize{ static_cast<hip_dim_member_t>(launch_params.teams.value[0]),
-                        static_cast<hip_dim_member_t>(launch_params.teams.value[1]),
-                        static_cast<hip_dim_member_t>(launch_params.teams.value[2]) };
+    hip_dim_t gridSize {
+        static_cast<hip_dim_member_t>(launch_params.teams.value[0]),
+        static_cast<hip_dim_member_t>(launch_params.teams.value[1]),
+        static_cast<hip_dim_member_t>(launch_params.teams.value[2])};
 
-    hip_dim_t blockSize{ static_cast<hip_dim_member_t>(launch_params.threads.value[0]),
-                         static_cast<hip_dim_member_t>(launch_params.threads.value[1]),
-                         static_cast<hip_dim_member_t>(launch_params.threads.value[2]) };
+    hip_dim_t blockSize {
+        static_cast<hip_dim_member_t>(launch_params.threads.value[0]),
+        static_cast<hip_dim_member_t>(launch_params.threads.value[1]),
+        static_cast<hip_dim_member_t>(launch_params.threads.value[2])};
 
     // Only launch kernel if we have something to iterate over
     constexpr hip_dim_member_t zero = 0;
-    if ( gridSize.x  > zero && gridSize.y  > zero && gridSize.z  > zero &&
-         blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) {
+    if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero &&
+        blockSize.x > zero && blockSize.y > zero && blockSize.z > zero)
+    {
 
       RAJA_FT_BEGIN;
 
       size_t shared_mem_size = launch_params.shared_mem_size;
       RAJA::hip::detail::hipInfo launch_info;
-      launch_info.gridDim = gridSize;
-      launch_info.blockDim = blockSize;
+      launch_info.gridDim      = gridSize;
+      launch_info.blockDim     = blockSize;
       launch_info.dynamic_smem = &shared_mem_size;
-      launch_info.res = hip_res;
+      launch_info.res          = hip_res;
 
       {
-        using EXEC_POL = RAJA::policy::hip::hip_launch_t<async, named_usage::unspecified>;
-        RAJA::expt::ParamMultiplexer::init<EXEC_POL>(launch_reducers, launch_info);
+        using EXEC_POL =
+            RAJA::policy::hip::hip_launch_t<async, named_usage::unspecified>;
+        RAJA::expt::ParamMultiplexer::init<EXEC_POL>(launch_reducers,
+                                                     launch_info);
 
         //
         // Privatize the loop_body, using make_launch_body to setup reductions
         //
-        BODY body = RAJA::hip::make_launch_body(func,
-            gridSize, blockSize, shared_mem_size, hip_res, std::forward<BODY_IN>(body_in));
+        BODY body = RAJA::hip::make_launch_body(func, gridSize, blockSize,
+                                                shared_mem_size, hip_res,
+                                                std::forward<BODY_IN>(body_in));
 
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&body, (void*)&launch_reducers};
-        RAJA::hip::launch(func, gridSize, blockSize, args, shared_mem_size, hip_res, async, kernel_name);
+        void* args[] = {(void*)&body, (void*)&launch_reducers};
+        RAJA::hip::launch(func, gridSize, blockSize, args, shared_mem_size,
+                          hip_res, async, kernel_name);
 
-        RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(launch_reducers, launch_info);
+        RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(launch_reducers,
+                                                        launch_info);
       }
 
       RAJA_FT_END;
@@ -187,62 +210,66 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, named_usage::unspeci
 
     return resources::EventProxy<resources::Resource>(res);
   }
-
 };
 
-
-template <typename BODY, int num_threads>
+template<typename BODY, int num_threads>
 __launch_bounds__(num_threads, 1) __global__
-void launch_global_fcn_fixed(BODY body_in)
+    void launch_global_fcn_fixed(BODY body_in)
 {
   LaunchContext ctx;
 
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
-  auto& body = privatizer.get_priv();
+  auto& body      = privatizer.get_priv();
 
-  //Set pointer to shared memory
+  // Set pointer to shared memory
   extern __shared__ char raja_shmem_ptr[];
   ctx.shared_mem_ptr = raja_shmem_ptr;
 
   body(ctx);
 }
 
-template <typename BODY, int num_threads, typename ReduceParams>
+template<typename BODY, int num_threads, typename ReduceParams>
 __launch_bounds__(num_threads, 1) __global__
-void launch_new_reduce_global_fcn_fixed(BODY body_in, ReduceParams reduce_params)
+    void launch_new_reduce_global_fcn_fixed(BODY body_in,
+                                            ReduceParams reduce_params)
 {
   LaunchContext ctx;
 
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
-  auto& body = privatizer.get_priv();
+  auto& body      = privatizer.get_priv();
 
-  //Set pointer to shared memory
+  // Set pointer to shared memory
   extern __shared__ char raja_shmem_ptr[];
   ctx.shared_mem_ptr = raja_shmem_ptr;
 
-  RAJA::expt::invoke_body( reduce_params, body, ctx );
+  RAJA::expt::invoke_body(reduce_params, body, ctx);
 
-  //Using a flatten global policy as we may use all dimensions
-  RAJA::expt::ParamMultiplexer::combine<RAJA::hip_flatten_global_xyz_direct>(reduce_params);
+  // Using a flatten global policy as we may use all dimensions
+  RAJA::expt::ParamMultiplexer::combine<RAJA::hip_flatten_global_xyz_direct>(
+      reduce_params);
 }
 
+template<bool async, int nthreads>
+struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, nthreads>>
+{
 
-template <bool async, int nthreads>
-struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, nthreads>> {
-
-  template <typename BODY_IN, typename ReduceParams>
-  static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                               RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                               RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
-  exec(RAJA::resources::Resource res, const LaunchParams &params,
-       const char *kernel_name, BODY_IN &&body_in, ReduceParams &RAJA_UNUSED_ARG(launch_reducers))
+  template<typename BODY_IN, typename ReduceParams>
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
+  exec(RAJA::resources::Resource res,
+       const LaunchParams& params,
+       const char* kernel_name,
+       BODY_IN&& body_in,
+       ReduceParams& RAJA_UNUSED_ARG(launch_reducers))
   {
     using BODY = camp::decay<BODY_IN>;
 
-    auto func = reinterpret_cast<const void*>(
-        &launch_global_fcn_fixed<BODY, nthreads>);
+    auto func =
+        reinterpret_cast<const void*>(&launch_global_fcn_fixed<BODY, nthreads>);
 
     resources::Hip hip_res = res.get<RAJA::resources::Hip>();
 
@@ -250,18 +277,20 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, nthreads>> {
     // Compute the number of blocks and threads
     //
 
-    hip_dim_t gridSize{ static_cast<hip_dim_member_t>(params.teams.value[0]),
+    hip_dim_t gridSize {static_cast<hip_dim_member_t>(params.teams.value[0]),
                         static_cast<hip_dim_member_t>(params.teams.value[1]),
-                        static_cast<hip_dim_member_t>(params.teams.value[2]) };
+                        static_cast<hip_dim_member_t>(params.teams.value[2])};
 
-    hip_dim_t blockSize{ static_cast<hip_dim_member_t>(params.threads.value[0]),
-                         static_cast<hip_dim_member_t>(params.threads.value[1]),
-                         static_cast<hip_dim_member_t>(params.threads.value[2]) };
+    hip_dim_t blockSize {
+        static_cast<hip_dim_member_t>(params.threads.value[0]),
+        static_cast<hip_dim_member_t>(params.threads.value[1]),
+        static_cast<hip_dim_member_t>(params.threads.value[2])};
 
     // Only launch kernel if we have something to iterate over
     constexpr hip_dim_member_t zero = 0;
-    if ( gridSize.x  > zero && gridSize.y  > zero && gridSize.z  > zero &&
-         blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) {
+    if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero &&
+        blockSize.x > zero && blockSize.y > zero && blockSize.z > zero)
+    {
 
       RAJA_FT_BEGIN;
 
@@ -270,14 +299,16 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, nthreads>> {
         //
         // Privatize the loop_body, using make_launch_body to setup reductions
         //
-        BODY body = RAJA::hip::make_launch_body(func,
-            gridSize, blockSize, shared_mem_size, hip_res, std::forward<BODY_IN>(body_in));
+        BODY body = RAJA::hip::make_launch_body(func, gridSize, blockSize,
+                                                shared_mem_size, hip_res,
+                                                std::forward<BODY_IN>(body_in));
 
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&body};
-        RAJA::hip::launch(func, gridSize, blockSize, args, shared_mem_size, hip_res, async, kernel_name);
+        void* args[] = {(void*)&body};
+        RAJA::hip::launch(func, gridSize, blockSize, args, shared_mem_size,
+                          hip_res, async, kernel_name);
       }
 
       RAJA_FT_END;
@@ -286,18 +317,24 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, nthreads>> {
     return resources::EventProxy<resources::Resource>(res);
   }
 
- //Version with explicit reduction parameters..
-  template <typename BODY_IN, typename ReduceParams>
-  static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                               RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                               concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
-  exec(RAJA::resources::Resource res, const LaunchParams &launch_params,
-       const char *kernel_name, BODY_IN &&body_in, ReduceParams &launch_reducers)
+  // Version with explicit (non-empty) reduction parameters.
+  template<typename BODY_IN, typename ReduceParams>
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      concepts::negate<
+          RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
+  exec(RAJA::resources::Resource res,
+       const LaunchParams& launch_params,
+       const char* kernel_name,
+       BODY_IN&& body_in,
+       ReduceParams& launch_reducers)
   {
     using BODY = camp::decay<BODY_IN>;
 
     auto func = reinterpret_cast<const void*>(
-        &launch_new_reduce_global_fcn_fixed<BODY, nthreads, camp::decay<ReduceParams>>);
+        &launch_new_reduce_global_fcn_fixed<BODY, nthreads,
+                                            camp::decay<ReduceParams>>);
 
     resources::Hip hip_res = res.get<RAJA::resources::Hip>();
 
@@ -305,45 +342,53 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, nthreads>> {
     // Compute the number of blocks and threads
     //
 
-    hip_dim_t gridSize{ static_cast<hip_dim_member_t>(launch_params.teams.value[0]),
-                        static_cast<hip_dim_member_t>(launch_params.teams.value[1]),
-                        static_cast<hip_dim_member_t>(launch_params.teams.value[2]) };
+    hip_dim_t gridSize {
+        static_cast<hip_dim_member_t>(launch_params.teams.value[0]),
+        static_cast<hip_dim_member_t>(launch_params.teams.value[1]),
+        static_cast<hip_dim_member_t>(launch_params.teams.value[2])};
 
-    hip_dim_t blockSize{ static_cast<hip_dim_member_t>(launch_params.threads.value[0]),
-                         static_cast<hip_dim_member_t>(launch_params.threads.value[1]),
-                         static_cast<hip_dim_member_t>(launch_params.threads.value[2]) };
+    hip_dim_t blockSize {
+        static_cast<hip_dim_member_t>(launch_params.threads.value[0]),
+        static_cast<hip_dim_member_t>(launch_params.threads.value[1]),
+        static_cast<hip_dim_member_t>(launch_params.threads.value[2])};
 
     // Only launch kernel if we have something to iterate over
     constexpr hip_dim_member_t zero = 0;
-    if ( gridSize.x  > zero && gridSize.y  > zero && gridSize.z  > zero &&
-         blockSize.x > zero && blockSize.y > zero && blockSize.z > zero ) {
+    if (gridSize.x > zero && gridSize.y > zero && gridSize.z > zero &&
+        blockSize.x > zero && blockSize.y > zero && blockSize.z > zero)
+    {
 
       RAJA_FT_BEGIN;
 
       size_t shared_mem_size = launch_params.shared_mem_size;
       RAJA::hip::detail::hipInfo launch_info;
-      launch_info.gridDim = gridSize;
-      launch_info.blockDim = blockSize;
+      launch_info.gridDim      = gridSize;
+      launch_info.blockDim     = blockSize;
       launch_info.dynamic_smem = &shared_mem_size;
-      launch_info.res = hip_res;
+      launch_info.res          = hip_res;
 
       {
-        using EXEC_POL = RAJA::policy::hip::hip_launch_t<async, named_usage::unspecified>;
-        RAJA::expt::ParamMultiplexer::init<EXEC_POL>(launch_reducers, launch_info);
+        using EXEC_POL =
+            RAJA::policy::hip::hip_launch_t<async, named_usage::unspecified>;
+        RAJA::expt::ParamMultiplexer::init<EXEC_POL>(launch_reducers,
+                                                     launch_info);
 
         //
         // Privatize the loop_body, using make_launch_body to setup reductions
         //
-        BODY body = RAJA::hip::make_launch_body(func,
-            gridSize, blockSize, shared_mem_size, hip_res, std::forward<BODY_IN>(body_in));
+        BODY body = RAJA::hip::make_launch_body(func, gridSize, blockSize,
+                                                shared_mem_size, hip_res,
+                                                std::forward<BODY_IN>(body_in));
 
         //
         // Launch the kernel
         //
-        void *args[] = {(void*)&body, (void*)&launch_reducers};
-        RAJA::hip::launch(func, gridSize, blockSize, args, shared_mem_size, hip_res, async, kernel_name);
+        void* args[] = {(void*)&body, (void*)&launch_reducers};
+        RAJA::hip::launch(func, gridSize, blockSize, args, shared_mem_size,
+                          hip_res, async, kernel_name);
 
-        RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(launch_reducers, launch_info);
+        RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(launch_reducers,
+                                                        launch_info);
       }
 
       RAJA_FT_END;
@@ -351,51 +396,56 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, nthreads>> {
 
     return resources::EventProxy<resources::Resource>(res);
   }
-
 };
 
-
 /*
    HIP generic loop implementations
 */
-template <typename SEGMENT, typename IndexMapper>
-struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper>,
-                   SEGMENT> {
-
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
-
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE
-  void exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
-            SEGMENT const &segment,
-            BODY const &body)
+template<typename SEGMENT, typename IndexMapper>
+struct LoopExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper>,
+    SEGMENT>
+{
+
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
+
+  template<typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      SEGMENT const& segment,
+      BODY const& body)
   {
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i = IndexMapper::template index<diff_t>();
+    const diff_t i   = IndexMapper::template index<diff_t>();
 
-    if (i < len) {
+    if (i < len)
+    {
       body(*(segment.begin() + i));
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1>,
-                   SEGMENT> {
+template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
+struct LoopExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper0,
+                                   IndexMapper1>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+      SEGMENT const& segment0,
+      SEGMENT const& segment1,
+      BODY const& body)
   {
     const int len0 = segment0.end() - segment0.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -403,29 +453,36 @@ struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direc
     const diff_t i0 = IndexMapper0::template index<diff_t>();
     const diff_t i1 = IndexMapper1::template index<diff_t>();
 
-    if (i0 < len0 && i1 < len1) {
+    if (i0 < len0 && i1 < len1)
+    {
       body(*(segment0.begin() + i0), *(segment1.begin() + i1));
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1,
-                                                  IndexMapper2>,
-                   SEGMENT> {
+template<typename SEGMENT,
+         typename IndexMapper0,
+         typename IndexMapper1,
+         typename IndexMapper2>
+struct LoopExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper0,
+                                   IndexMapper1,
+                                   IndexMapper2>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+      SEGMENT const& segment0,
+      SEGMENT const& segment1,
+      SEGMENT const& segment2,
+      BODY const& body)
   {
     const int len0 = segment0.end() - segment0.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -435,53 +492,62 @@ struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direc
     const diff_t i1 = IndexMapper1::template index<diff_t>();
     const diff_t i2 = IndexMapper2::template index<diff_t>();
 
-    if (i0 < len0 && i1 < len1 && i2 < len2) {
-      body(*(segment0.begin() + i0),
-           *(segment1.begin() + i1),
+    if (i0 < len0 && i1 < len1 && i2 < len2)
+    {
+      body(*(segment0.begin() + i0), *(segment1.begin() + i1),
            *(segment2.begin() + i2));
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper>
-struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper>,
-                   SEGMENT> {
+template<typename SEGMENT, typename IndexMapper>
+struct LoopExecute<
+    RAJA::policy::hip::hip_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE
-  void exec(LaunchContext const RAJA_UNUSED_ARG(&ctx),
-            SEGMENT const &segment,
-            BODY const &body)
+  template<typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(
+      LaunchContext const RAJA_UNUSED_ARG(&ctx),
+      SEGMENT const& segment,
+      BODY const& body)
   {
-    const diff_t len = segment.end() - segment.begin();
-    const diff_t i_init = IndexMapper::template index<diff_t>();
+    const diff_t len      = segment.end() - segment.begin();
+    const diff_t i_init   = IndexMapper::template index<diff_t>();
     const diff_t i_stride = IndexMapper::template size<diff_t>();
 
-    for (diff_t i = i_init; i < len; i += i_stride) {
+    for (diff_t i = i_init; i < len; i += i_stride)
+    {
       body(*(segment.begin() + i));
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1>,
-                   SEGMENT> {
+template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
+struct LoopExecute<
+    RAJA::policy::hip::hip_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+      SEGMENT const& segment0,
+      SEGMENT const& segment1,
+      BODY const& body)
   {
     const int len0 = segment0.end() - segment0.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -492,34 +558,42 @@ struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Strid
     const diff_t i0_stride = IndexMapper0::template size<diff_t>();
     const diff_t i1_stride = IndexMapper1::template size<diff_t>();
 
-    for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride) {
+    for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride)
+    {
 
-      for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride) {
+      for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride)
+      {
 
-        body(*(segment0.begin() + i0),
-             *(segment1.begin() + i1));
+        body(*(segment0.begin() + i0), *(segment1.begin() + i1));
       }
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper0,
-                                                  IndexMapper1,
-                                                  IndexMapper2>,
-                   SEGMENT> {
+template<typename SEGMENT,
+         typename IndexMapper0,
+         typename IndexMapper1,
+         typename IndexMapper2>
+struct LoopExecute<
+    RAJA::policy::hip::hip_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1,
+        IndexMapper2>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+      SEGMENT const& segment0,
+      SEGMENT const& segment1,
+      SEGMENT const& segment2,
+      BODY const& body)
   {
     const int len0 = segment0.end() - segment0.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -533,14 +607,16 @@ struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Strid
     const diff_t i1_stride = IndexMapper1::template size<diff_t>();
     const diff_t i2_stride = IndexMapper2::template size<diff_t>();
 
-    for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride) {
+    for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride)
+    {
 
-      for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride) {
+      for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride)
+      {
 
-        for (diff_t i2 = i2_init; i2 < len2; i2 += i2_stride) {
+        for (diff_t i2 = i2_init; i2 < len2; i2 += i2_stride)
+        {
 
-          body(*(segment0.begin() + i0),
-               *(segment1.begin() + i1),
+          body(*(segment0.begin() + i0), *(segment1.begin() + i1),
                *(segment2.begin() + i2));
         }
       }
@@ -548,43 +624,51 @@ struct LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Strid
   }
 };
 
-template <typename SEGMENT, typename IndexMapper>
-struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper>,
-                         SEGMENT> {
+template<typename SEGMENT, typename IndexMapper>
+struct LoopICountExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+      SEGMENT const& segment,
+      BODY const& body)
   {
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i = IndexMapper::template index<diff_t>();
+    const diff_t i   = IndexMapper::template index<diff_t>();
 
-    if (i < len) {
+    if (i < len)
+    {
       body(*(segment.begin() + i), i);
     }
   }
 };
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1>,
-                         SEGMENT> {
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
+struct LoopICountExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper0,
+                                   IndexMapper1>,
+    SEGMENT>
+{
 
-  template <typename BODY>
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
+
+  template<typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+      SEGMENT const& segment0,
+      SEGMENT const& segment1,
+      BODY const& body)
   {
     const int len0 = segment0.end() - segment0.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -592,31 +676,36 @@ struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping:
     const diff_t i0 = IndexMapper0::template index<diff_t>();
     const diff_t i1 = IndexMapper1::template index<diff_t>();
 
-    if (i0 < len0 && i1 < len1) {
-      body(*(segment0.begin() + i0),
-           *(segment1.begin() + i1),
-           i0, i1);
+    if (i0 < len0 && i1 < len1)
+    {
+      body(*(segment0.begin() + i0), *(segment1.begin() + i1), i0, i1);
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1,
-                                                        IndexMapper2>,
-                         SEGMENT> {
+template<typename SEGMENT,
+         typename IndexMapper0,
+         typename IndexMapper1,
+         typename IndexMapper2>
+struct LoopICountExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper0,
+                                   IndexMapper1,
+                                   IndexMapper2>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+      SEGMENT const& segment0,
+      SEGMENT const& segment1,
+      SEGMENT const& segment2,
+      BODY const& body)
   {
     const int len0 = segment0.end() - segment0.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -626,54 +715,62 @@ struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping:
     const diff_t i1 = IndexMapper1::template index<diff_t>();
     const diff_t i2 = IndexMapper2::template index<diff_t>();
 
-    if (i0 < len0 && i1 < len1 && i2 < len2) {
-      body(*(segment0.begin() + i0),
-           *(segment1.begin() + i1),
-           *(segment2.begin() + i2),
-           i0, i1, i2);
+    if (i0 < len0 && i1 < len1 && i2 < len2)
+    {
+      body(*(segment0.begin() + i0), *(segment1.begin() + i1),
+           *(segment2.begin() + i2), i0, i1, i2);
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper>
-struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper>,
-                         SEGMENT> {
+template<typename SEGMENT, typename IndexMapper>
+struct LoopICountExecute<
+    RAJA::policy::hip::hip_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+      SEGMENT const& segment,
+      BODY const& body)
   {
-    const diff_t len = segment.end() - segment.begin();
-    const diff_t i_init = IndexMapper::template index<diff_t>();
+    const diff_t len      = segment.end() - segment.begin();
+    const diff_t i_init   = IndexMapper::template index<diff_t>();
     const diff_t i_stride = IndexMapper::template size<diff_t>();
 
-    for (diff_t i = i_init; i < len; i += i_stride) {
+    for (diff_t i = i_init; i < len; i += i_stride)
+    {
       body(*(segment.begin() + i), i);
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1>,
-                         SEGMENT> {
+template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
+struct LoopICountExecute<
+    RAJA::policy::hip::hip_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+      SEGMENT const& segment0,
+      SEGMENT const& segment1,
+      BODY const& body)
   {
     const int len0 = segment0.end() - segment0.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -684,35 +781,42 @@ struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping:
     const diff_t i0_stride = IndexMapper0::template size<diff_t>();
     const diff_t i1_stride = IndexMapper1::template size<diff_t>();
 
-    for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride) {
+    for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride)
+    {
 
-      for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride) {
+      for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride)
+      {
 
-        body(*(segment0.begin() + i0),
-             *(segment1.begin() + i1),
-             i0, i1);
+        body(*(segment0.begin() + i0), *(segment1.begin() + i1), i0, i1);
       }
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper0,
-                                                        IndexMapper1,
-                                                        IndexMapper2>,
-                         SEGMENT> {
+template<typename SEGMENT,
+         typename IndexMapper0,
+         typename IndexMapper1,
+         typename IndexMapper2>
+struct LoopICountExecute<
+    RAJA::policy::hip::hip_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1,
+        IndexMapper2>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+      SEGMENT const& segment0,
+      SEGMENT const& segment1,
+      SEGMENT const& segment2,
+      BODY const& body)
   {
     const int len0 = segment0.end() - segment0.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -726,51 +830,54 @@ struct LoopICountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping:
     const diff_t i1_stride = IndexMapper1::template size<diff_t>();
     const diff_t i2_stride = IndexMapper2::template size<diff_t>();
 
-    for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride) {
+    for (diff_t i0 = i0_init; i0 < len0; i0 += i0_stride)
+    {
 
-      for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride) {
+      for (diff_t i1 = i1_init; i1 < len1; i1 += i1_stride)
+      {
 
-        for (diff_t i2 = i2_init; i2 < len2; i2 += i2_stride) {
+        for (diff_t i2 = i2_init; i2 < len2; i2 += i2_stride)
+        {
 
-          body(*(segment0.begin() + i0),
-               *(segment1.begin() + i1),
-               *(segment2.begin() + i2),
-               i0, i1, i2);
+          body(*(segment0.begin() + i0), *(segment1.begin() + i1),
+               *(segment2.begin() + i2), i0, i1, i2);
         }
       }
     }
   }
 };
 
-
 /*
    HIP generic flattened loop implementations
 */
 template<typename SEGMENT, kernel_sync_requirement sync, typename IndexMapper0>
-struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mapping::Direct,
-                                                          sync,
-                                                          IndexMapper0>,
-                   SEGMENT>
-    :  LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
-                                                  sync,
-                                                  IndexMapper0>,
-                   SEGMENT>
+struct LoopExecute<
+    RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mapping::Direct,
+                                           sync,
+                                           IndexMapper0>,
+    SEGMENT>
+    : LoopExecute<
+          RAJA::policy::hip::
+              hip_indexer<RAJA::iteration_mapping::Direct, sync, IndexMapper0>,
+          SEGMENT>
 {};
 
 template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mapping::Direct,
-                                                          kernel_sync_requirement::none,
-                                                          IndexMapper0,
-                                                          IndexMapper1>,
-                   SEGMENT>
+struct LoopExecute<
+    RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mapping::Direct,
+                                           kernel_sync_requirement::none,
+                                           IndexMapper0,
+                                           IndexMapper1>,
+    SEGMENT>
 {
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template<typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+      SEGMENT const& segment,
+      BODY const& body)
   {
     const int len = segment.end() - segment.begin();
 
@@ -779,29 +886,35 @@ struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mappin
 
     const diff_t i0_stride = IndexMapper0::template size<diff_t>();
 
-    const int i = i0 + i0_stride*i1;
+    const int i = i0 + i0_stride * i1;
 
-    if (i < len) {
+    if (i < len)
+    {
       body(*(segment.begin() + i));
     }
   }
 };
 
-template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mapping::Direct,
-                                                          kernel_sync_requirement::none,
-                                                          IndexMapper0,
-                                                          IndexMapper1,
-                                                          IndexMapper2>,
-                   SEGMENT>
+template<typename SEGMENT,
+         typename IndexMapper0,
+         typename IndexMapper1,
+         typename IndexMapper2>
+struct LoopExecute<
+    RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mapping::Direct,
+                                           kernel_sync_requirement::none,
+                                           IndexMapper0,
+                                           IndexMapper1,
+                                           IndexMapper2>,
+    SEGMENT>
 {
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template<typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+      SEGMENT const& segment,
+      BODY const& body)
   {
     const int len = segment.end() - segment.begin();
 
@@ -812,39 +925,47 @@ struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mappin
     const diff_t i0_stride = IndexMapper0::template size<diff_t>();
     const diff_t i1_stride = IndexMapper1::template size<diff_t>();
 
-    const int i = i0 + i0_stride*(i1 + i1_stride*i2);
+    const int i = i0 + i0_stride * (i1 + i1_stride * i2);
 
-    if (i < len) {
+    if (i < len)
+    {
       body(*(segment.begin() + i));
     }
   }
 };
 
 template<typename SEGMENT, kernel_sync_requirement sync, typename IndexMapper0>
-struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                          sync,
-                                                          IndexMapper0>,
-                   SEGMENT>
-    :  LoopExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                  sync,
-                                                  IndexMapper0>,
-                   SEGMENT>
+struct LoopExecute<
+    RAJA::policy::hip::hip_flatten_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        sync,
+        IndexMapper0>,
+    SEGMENT>
+    : LoopExecute<
+          RAJA::policy::hip::hip_indexer<
+              RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+              sync,
+              IndexMapper0>,
+          SEGMENT>
 {};
 
 template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1>
-struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                          kernel_sync_requirement::none,
-                                                          IndexMapper0,
-                                                          IndexMapper1>,
-                   SEGMENT>
+struct LoopExecute<
+    RAJA::policy::hip::hip_flatten_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1>,
+    SEGMENT>
 {
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template<typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+      SEGMENT const& segment,
+      BODY const& body)
   {
     const int len = segment.end() - segment.begin();
 
@@ -854,29 +975,34 @@ struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mappin
     const int i0_stride = IndexMapper0::template size<diff_t>();
     const int i1_stride = IndexMapper1::template size<diff_t>();
 
-    for (int i = i0 + i0_stride*i1;
-         i < len;
-         i += i0_stride*i1_stride) {
+    for (int i = i0 + i0_stride * i1; i < len; i += i0_stride * i1_stride)
+    {
       body(*(segment.begin() + i));
     }
   }
 };
 
-template<typename SEGMENT, typename IndexMapper0, typename IndexMapper1, typename IndexMapper2>
-struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                          kernel_sync_requirement::none,
-                                                          IndexMapper0,
-                                                          IndexMapper1,
-                                                          IndexMapper2>,
-                   SEGMENT>
+template<typename SEGMENT,
+         typename IndexMapper0,
+         typename IndexMapper1,
+         typename IndexMapper2>
+struct LoopExecute<
+    RAJA::policy::hip::hip_flatten_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper0,
+        IndexMapper1,
+        IndexMapper2>,
+    SEGMENT>
 {
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
   template<typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+      SEGMENT const& segment,
+      BODY const& body)
   {
     const int len = segment.end() - segment.begin();
 
@@ -888,114 +1014,134 @@ struct LoopExecute<RAJA::policy::hip::hip_flatten_indexer<RAJA::iteration_mappin
     const int i1_stride = IndexMapper1::template size<diff_t>();
     const int i2_stride = IndexMapper2::template size<diff_t>();
 
-    for (int i = i0 + i0_stride*(i1 + i1_stride*i2);
-         i < len;
-         i += i0_stride*i1_stride*i2_stride) {
+    for (int i = i0 + i0_stride * (i1 + i1_stride * i2); i < len;
+         i += i0_stride * i1_stride * i2_stride)
+    {
       body(*(segment.begin() + i));
     }
   }
 };
 
-
 /*
    HIP generic tile implementations
 */
-template <typename SEGMENT, typename IndexMapper>
-struct TileExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper>,
-                   SEGMENT> {
+template<typename SEGMENT, typename IndexMapper>
+struct TileExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template <typename TILE_T, typename BODY>
+  template<typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
       TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+      SEGMENT const& segment,
+      BODY const& body)
   {
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i = IndexMapper::template index<diff_t>() * static_cast<diff_t>(tile_size);
+    const diff_t i =
+        IndexMapper::template index<diff_t>() * static_cast<diff_t>(tile_size);
 
-    if (i < len) {
+    if (i < len)
+    {
       body(segment.slice(i, static_cast<diff_t>(tile_size)));
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper>
-struct TileExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                  kernel_sync_requirement::none,
-                                                  IndexMapper>,
-                   SEGMENT> {
+template<typename SEGMENT, typename IndexMapper>
+struct TileExecute<
+    RAJA::policy::hip::hip_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template <typename TILE_T, typename BODY>
+  template<typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
       TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+      SEGMENT const& segment,
+      BODY const& body)
   {
     const diff_t len = segment.end() - segment.begin();
-    const diff_t i_init = IndexMapper::template index<diff_t>() * static_cast<diff_t>(tile_size);
-    const diff_t i_stride = IndexMapper::template size<diff_t>() * static_cast<diff_t>(tile_size);
+    const diff_t i_init =
+        IndexMapper::template index<diff_t>() * static_cast<diff_t>(tile_size);
+    const diff_t i_stride =
+        IndexMapper::template size<diff_t>() * static_cast<diff_t>(tile_size);
 
-    for (diff_t i = i_init; i < len; i += i_stride) {
+    for (diff_t i = i_init; i < len; i += i_stride)
+    {
       body(segment.slice(i, static_cast<diff_t>(tile_size)));
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper>
-struct TileTCountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper>,
-                         SEGMENT> {
+template<typename SEGMENT, typename IndexMapper>
+struct TileTCountExecute<
+    RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::Direct,
+                                   kernel_sync_requirement::none,
+                                   IndexMapper>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template <typename TILE_T, typename BODY>
+  template<typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
       TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+      SEGMENT const& segment,
+      BODY const& body)
   {
     const diff_t len = segment.end() - segment.begin();
-    const diff_t t = IndexMapper::template index<diff_t>();
-    const diff_t i = t * static_cast<diff_t>(tile_size);
+    const diff_t t   = IndexMapper::template index<diff_t>();
+    const diff_t i   = t * static_cast<diff_t>(tile_size);
 
-    if (i < len) {
+    if (i < len)
+    {
       body(segment.slice(i, static_cast<diff_t>(tile_size)), t);
     }
   }
 };
 
-template <typename SEGMENT, typename IndexMapper>
-struct TileTCountExecute<RAJA::policy::hip::hip_indexer<RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
-                                                        kernel_sync_requirement::none,
-                                                        IndexMapper>,
-                         SEGMENT> {
+template<typename SEGMENT, typename IndexMapper>
+struct TileTCountExecute<
+    RAJA::policy::hip::hip_indexer<
+        RAJA::iteration_mapping::StridedLoop<named_usage::unspecified>,
+        kernel_sync_requirement::none,
+        IndexMapper>,
+    SEGMENT>
+{
 
-  using diff_t = typename std::iterator_traits<typename SEGMENT::iterator>::difference_type;
+  using diff_t = typename std::iterator_traits<
+      typename SEGMENT::iterator>::difference_type;
 
-  template <typename TILE_T, typename BODY>
+  template<typename TILE_T, typename BODY>
   static RAJA_INLINE RAJA_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
       TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+      SEGMENT const& segment,
+      BODY const& body)
   {
-    const diff_t len = segment.end() - segment.begin();
-    const diff_t t_init = IndexMapper::template index<diff_t>();
-    const diff_t i_init = t_init * static_cast<diff_t>(tile_size);
+    const diff_t len      = segment.end() - segment.begin();
+    const diff_t t_init   = IndexMapper::template index<diff_t>();
+    const diff_t i_init   = t_init * static_cast<diff_t>(tile_size);
     const diff_t t_stride = IndexMapper::template size<diff_t>();
     const diff_t i_stride = t_stride * static_cast<diff_t>(tile_size);
 
-    for (diff_t i = i_init, t = t_init; i < len; i += i_stride, t += t_stride) {
+    for (diff_t i = i_init, t = t_init; i < len; i += i_stride, t += t_stride)
+    {
       body(segment.slice(i, static_cast<diff_t>(tile_size)), t);
     }
   }
diff --git a/include/RAJA/policy/hip/multi_reduce.hpp b/include/RAJA/policy/hip/multi_reduce.hpp
index 0d9d3899d8..b2b1507a95 100644
--- a/include/RAJA/policy/hip/multi_reduce.hpp
+++ b/include/RAJA/policy/hip/multi_reduce.hpp
@@ -46,9 +46,9 @@
 #include "RAJA/policy/hip/intrinsics.hpp"
 
 #if defined(RAJA_ENABLE_DESUL_ATOMICS)
-  #include "RAJA/policy/desul/atomic.hpp"
+#include "RAJA/policy/desul/atomic.hpp"
 #else
-  #include "RAJA/policy/hip/atomic.hpp"
+#include "RAJA/policy/hip/atomic.hpp"
 #endif
 
 #include "RAJA/policy/hip/policy.hpp"
@@ -73,100 +73,123 @@ namespace impl
 //
 
 //! combine value into global memory
-template <typename Combiner, typename GetTallyIndex,
-          typename T, typename GetTallyOffset>
-RAJA_DEVICE RAJA_INLINE void block_multi_reduce_combine_global_atomic(int RAJA_UNUSED_ARG(num_bins),
-                                                                      T identity,
-                                                                      int bin,
-                                                                      T value,
-                                                                      T* tally_mem,
-                                                                      GetTallyOffset get_tally_offset,
-                                                                      int tally_replication,
-                                                                      int tally_bins)
+template<typename Combiner,
+         typename GetTallyIndex,
+         typename T,
+         typename GetTallyOffset>
+RAJA_DEVICE RAJA_INLINE void block_multi_reduce_combine_global_atomic(
+    int RAJA_UNUSED_ARG(num_bins),
+    T identity,
+    int bin,
+    T value,
+    T* tally_mem,
+    GetTallyOffset get_tally_offset,
+    int tally_replication,
+    int tally_bins)
 {
-  if (value == identity) { return; }
+  if (value == identity)
+  {
+    return;
+  }
 
-  int tally_index = GetTallyIndex::template index<int>(); // globalWarpId by default
+  int tally_index =
+      GetTallyIndex::template index<int>();  // globalWarpId by default
   int tally_rep = ::RAJA::power_of_2_mod(tally_index, tally_replication);
-  int tally_offset = get_tally_offset(bin, tally_bins, tally_rep, tally_replication);
-  RAJA::reduce::hip::atomic<Combiner>{}(tally_mem[tally_offset], value);
+  int tally_offset =
+      get_tally_offset(bin, tally_bins, tally_rep, tally_replication);
+  RAJA::reduce::hip::atomic<Combiner> {}(tally_mem[tally_offset], value);
 }
 
-
 //! initialize shared memory
-template <typename T>
-RAJA_DEVICE RAJA_INLINE void block_multi_reduce_init_shmem(int num_bins,
-                                                           T identity,
-                                                           T* shared_mem,
-                                                           int shared_replication)
+template<typename T>
+RAJA_DEVICE RAJA_INLINE void block_multi_reduce_init_shmem(
+    int num_bins,
+    T identity,
+    T* shared_mem,
+    int shared_replication)
 {
   int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                  (blockDim.x * blockDim.y) * threadIdx.z;
   int numThreads = blockDim.x * blockDim.y * blockDim.z;
 
   for (int shmem_offset = threadId;
-       shmem_offset < shared_replication * num_bins;
-       shmem_offset += numThreads) {
+       shmem_offset < shared_replication * num_bins; shmem_offset += numThreads)
+  {
     shared_mem[shmem_offset] = identity;
   }
   __syncthreads();
 }
 
 //! combine value into shared memory
-template <typename Combiner, typename GetSharedIndex,
-          typename T, typename GetSharedOffset>
-RAJA_DEVICE RAJA_INLINE void block_multi_reduce_combine_shmem_atomic(int num_bins,
-                                                                     T identity,
-                                                                     int bin,
-                                                                     T value,
-                                                                     T* shared_mem,
-                                                                     GetSharedOffset get_shared_offset,
-                                                                     int shared_replication)
+template<typename Combiner,
+         typename GetSharedIndex,
+         typename T,
+         typename GetSharedOffset>
+RAJA_DEVICE RAJA_INLINE void block_multi_reduce_combine_shmem_atomic(
+    int num_bins,
+    T identity,
+    int bin,
+    T value,
+    T* shared_mem,
+    GetSharedOffset get_shared_offset,
+    int shared_replication)
 {
-  if (value == identity) { return; }
+  if (value == identity)
+  {
+    return;
+  }
 
-  int shared_index = GetSharedIndex::template index<int>(); // threadId by default
+  int shared_index =
+      GetSharedIndex::template index<int>();  // threadId by default
   int shared_rep = ::RAJA::power_of_2_mod(shared_index, shared_replication);
-  int shmem_offset = get_shared_offset(bin, num_bins, shared_rep, shared_replication);
+  int shmem_offset =
+      get_shared_offset(bin, num_bins, shared_rep, shared_replication);
 
-  RAJA::reduce::hip::atomic<Combiner>{}(shared_mem[shmem_offset], value);
+  RAJA::reduce::hip::atomic<Combiner> {}(shared_mem[shmem_offset], value);
 }
 
 //! combine value into shared memory
-template <typename Combiner,
-          typename T, typename GetSharedOffset, typename GetTallyOffset>
-RAJA_DEVICE RAJA_INLINE void grid_multi_reduce_shmem_to_global_atomic(int num_bins,
-                                                                      T identity,
-                                                                      T* shared_mem,
-                                                                      GetSharedOffset get_shared_offset,
-                                                                      int shared_replication,
-                                                                      T* tally_mem,
-                                                                      GetTallyOffset get_tally_offset,
-                                                                      int tally_replication,
-                                                                      int tally_bins)
+template<typename Combiner,
+         typename T,
+         typename GetSharedOffset,
+         typename GetTallyOffset>
+RAJA_DEVICE RAJA_INLINE void grid_multi_reduce_shmem_to_global_atomic(
+    int num_bins,
+    T identity,
+    T* shared_mem,
+    GetSharedOffset get_shared_offset,
+    int shared_replication,
+    T* tally_mem,
+    GetTallyOffset get_tally_offset,
+    int tally_replication,
+    int tally_bins)
 {
   int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                  (blockDim.x * blockDim.y) * threadIdx.z;
   int numThreads = blockDim.x * blockDim.y * blockDim.z;
 
   int blockId = blockIdx.x + gridDim.x * blockIdx.y +
-                 (gridDim.x * gridDim.y) * blockIdx.z;
+                (gridDim.x * gridDim.y) * blockIdx.z;
 
   __syncthreads();
-  for (int bin = threadId; bin < num_bins; bin += numThreads) {
+  for (int bin = threadId; bin < num_bins; bin += numThreads)
+  {
 
     T value = identity;
-    for (int shared_rep = 0; shared_rep < shared_replication; ++shared_rep) {
-      int shmem_offset = get_shared_offset(bin, num_bins, shared_rep, shared_replication);
-      Combiner{}(value, shared_mem[shmem_offset]);
+    for (int shared_rep = 0; shared_rep < shared_replication; ++shared_rep)
+    {
+      int shmem_offset =
+          get_shared_offset(bin, num_bins, shared_rep, shared_replication);
+      Combiner {}(value, shared_mem[shmem_offset]);
     }
 
-    if (value != identity) {
+    if (value != identity)
+    {
       int tally_rep = ::RAJA::power_of_2_mod(blockId, tally_replication);
-      int tally_offset = get_tally_offset(bin, tally_bins, tally_rep, tally_replication);
-      RAJA::reduce::hip::atomic<Combiner>{}(tally_mem[tally_offset], value);
+      int tally_offset =
+          get_tally_offset(bin, tally_bins, tally_rep, tally_replication);
+      RAJA::reduce::hip::atomic<Combiner> {}(tally_mem[tally_offset], value);
     }
-
   }
 }
 
@@ -181,52 +204,66 @@ RAJA_DEVICE RAJA_INLINE void grid_multi_reduce_shmem_to_global_atomic(int num_bi
 //
 
 //! MultiReduction data for Hip Offload -- stores value, host pointer
-template <typename Combiner, typename T, typename tuning>
+template<typename Combiner, typename T, typename tuning>
 struct MultiReduceGridAtomicHostInit_TallyData
 {
   //! setup permanent settings, allocate and initialize tally memory
-  template < typename Container >
-  MultiReduceGridAtomicHostInit_TallyData(Container const& container, T const& identity)
-      : m_tally_mem(nullptr)
-      , m_identity(identity)
-      , m_num_bins(container.size())
-      , m_tally_bins(get_tally_bins(m_num_bins))
-      , m_tally_replication(get_tally_replication())
+  template<typename Container>
+  MultiReduceGridAtomicHostInit_TallyData(Container const& container,
+                                          T const& identity)
+      : m_tally_mem(nullptr),
+        m_identity(identity),
+        m_num_bins(container.size()),
+        m_tally_bins(get_tally_bins(m_num_bins)),
+        m_tally_replication(get_tally_replication())
   {
-    m_tally_mem = create_tally(container, identity, m_num_bins, m_tally_bins, m_tally_replication);
+    m_tally_mem = create_tally(container, identity, m_num_bins, m_tally_bins,
+                               m_tally_replication);
   }
 
   MultiReduceGridAtomicHostInit_TallyData() = delete;
-  MultiReduceGridAtomicHostInit_TallyData(MultiReduceGridAtomicHostInit_TallyData const&) = default;
-  MultiReduceGridAtomicHostInit_TallyData(MultiReduceGridAtomicHostInit_TallyData &&) = delete;
-  MultiReduceGridAtomicHostInit_TallyData& operator=(MultiReduceGridAtomicHostInit_TallyData const&) = default;
-  MultiReduceGridAtomicHostInit_TallyData& operator=(MultiReduceGridAtomicHostInit_TallyData &&) = delete;
-  ~MultiReduceGridAtomicHostInit_TallyData() = default;
-
+  MultiReduceGridAtomicHostInit_TallyData(
+      MultiReduceGridAtomicHostInit_TallyData const&) = default;
+  MultiReduceGridAtomicHostInit_TallyData(
+      MultiReduceGridAtomicHostInit_TallyData&&) = delete;
+  MultiReduceGridAtomicHostInit_TallyData& operator=(
+      MultiReduceGridAtomicHostInit_TallyData const&) = default;
+  MultiReduceGridAtomicHostInit_TallyData& operator=(
+      MultiReduceGridAtomicHostInit_TallyData&&) = delete;
+  ~MultiReduceGridAtomicHostInit_TallyData()     = default;
 
   //! reset permanent settings, reallocate and reset tally memory
-  template < typename Container >
+  template<typename Container>
   void reset_permanent(Container const& container, T const& identity)
   {
     int new_num_bins = container.size();
-    if (new_num_bins != m_num_bins) {
+    if (new_num_bins != m_num_bins)
+    {
       teardown_permanent();
-      m_num_bins = new_num_bins;
-      m_tally_bins = get_tally_bins(m_num_bins);
+      m_num_bins          = new_num_bins;
+      m_tally_bins        = get_tally_bins(m_num_bins);
       m_tally_replication = get_tally_replication();
-      m_tally_mem = create_tally(container, identity, m_num_bins, m_tally_bins, m_tally_replication);
-    } else {
+      m_tally_mem = create_tally(container, identity, m_num_bins, m_tally_bins,
+                                 m_tally_replication);
+    }
+    else
+    {
       {
         int tally_rep = 0;
-        int bin = 0;
-        for (auto const& value : container) {
-          m_tally_mem[GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication)] = value;
+        int bin       = 0;
+        for (auto const& value : container)
+        {
+          m_tally_mem[GetTallyOffset {}(bin, m_tally_bins, tally_rep,
+                                        m_tally_replication)] = value;
           ++bin;
         }
       }
-      for (int tally_rep = 1; tally_rep < m_tally_replication; ++tally_rep) {
-        for (int bin = 0; bin < m_num_bins; ++bin) {
-          m_tally_mem[GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication)] = identity;
+      for (int tally_rep = 1; tally_rep < m_tally_replication; ++tally_rep)
+      {
+        for (int bin = 0; bin < m_num_bins; ++bin)
+        {
+          m_tally_mem[GetTallyOffset {}(bin, m_tally_bins, tally_rep,
+                                        m_tally_replication)] = identity;
         }
       }
     }
@@ -239,39 +276,45 @@ struct MultiReduceGridAtomicHostInit_TallyData
     destroy_tally(m_tally_mem, m_num_bins, m_tally_bins, m_tally_replication);
   }
 
-
   //! get value for bin, assumes synchronization occurred elsewhere
   T get(int bin) const
   {
     ::RAJA::detail::HighAccuracyReduce<T, typename Combiner::operator_type>
-          reducer(m_identity);
-    for (int tally_rep = 0; tally_rep < m_tally_replication; ++tally_rep) {
-      int tally_offset = GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication);
+        reducer(m_identity);
+    for (int tally_rep = 0; tally_rep < m_tally_replication; ++tally_rep)
+    {
+      int tally_offset =
+          GetTallyOffset {}(bin, m_tally_bins, tally_rep, m_tally_replication);
       reducer.combine(m_tally_mem[tally_offset]);
     }
     return reducer.get_and_clear();
   }
 
-
   int num_bins() const { return m_num_bins; }
 
   T identity() const { return m_identity; }
 
 private:
-  static constexpr size_t s_tally_alignment = std::max(size_t(policy::hip::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE),
-                                                       size_t(RAJA::DATA_ALIGN));
-  static constexpr size_t s_tally_bunch_size = RAJA_DIVIDE_CEILING_INT(s_tally_alignment, sizeof(T));
+  static constexpr size_t s_tally_alignment = std::max(
+      size_t(
+          policy::hip::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE),
+      size_t(RAJA::DATA_ALIGN));
+  static constexpr size_t s_tally_bunch_size =
+      RAJA_DIVIDE_CEILING_INT(s_tally_alignment, sizeof(T));
 
   using tally_mempool_type = device_pinned_mempool_type;
-  using tally_tuning = typename tuning::GlobalAtomicReplicationTuning;
-  using TallyAtomicReplicationConcretizer = typename tally_tuning::AtomicReplicationConcretizer;
+  using tally_tuning       = typename tuning::GlobalAtomicReplicationTuning;
+  using TallyAtomicReplicationConcretizer =
+      typename tally_tuning::AtomicReplicationConcretizer;
   using GetTallyOffset_rebind_rebunch = typename tally_tuning::OffsetCalculator;
-  using GetTallyOffset_rebind = typename GetTallyOffset_rebind_rebunch::template rebunch<s_tally_bunch_size>;
-
+  using GetTallyOffset_rebind =
+      typename GetTallyOffset_rebind_rebunch::template rebunch<
+          s_tally_bunch_size>;
 
   static int get_tally_bins(int num_bins)
   {
-    return RAJA_DIVIDE_CEILING_INT(num_bins, s_tally_bunch_size) * s_tally_bunch_size;
+    return RAJA_DIVIDE_CEILING_INT(num_bins, s_tally_bunch_size) *
+           s_tally_bunch_size;
   }
 
   static int get_tally_replication()
@@ -281,39 +324,50 @@ struct MultiReduceGridAtomicHostInit_TallyData
     min_tally_replication = omp_get_max_threads();
 #endif
 
-    struct {
+    struct
+    {
       int func_min_global_replication;
-    } func_data{min_tally_replication};
+    } func_data {min_tally_replication};
 
-    return TallyAtomicReplicationConcretizer{}.template
-        get_global_replication<int>(func_data);
+    return TallyAtomicReplicationConcretizer {}
+        .template get_global_replication<int>(func_data);
   }
 
-  template < typename Container >
-  static T* create_tally(Container const& container, T const& identity,
-                         int num_bins, int tally_bins, int tally_replication)
+  template<typename Container>
+  static T* create_tally(Container const& container,
+                         T const& identity,
+                         int num_bins,
+                         int tally_bins,
+                         int tally_replication)
   {
-    if (num_bins == size_t(0)) {
+    if (num_bins == size_t(0))
+    {
       return nullptr;
     }
 
     T* tally_mem = tally_mempool_type::getInstance().template malloc<T>(
-        tally_replication*tally_bins, s_tally_alignment);
+        tally_replication * tally_bins, s_tally_alignment);
 
-    if (tally_replication > 0) {
+    if (tally_replication > 0)
+    {
       {
         int tally_rep = 0;
-        int bin = 0;
-        for (auto const& value : container) {
-          int tally_offset = GetTallyOffset{}(bin, tally_bins, tally_rep, tally_replication);
-          new(&tally_mem[tally_offset]) T(value);
+        int bin       = 0;
+        for (auto const& value : container)
+        {
+          int tally_offset =
+              GetTallyOffset {}(bin, tally_bins, tally_rep, tally_replication);
+          new (&tally_mem[tally_offset]) T(value);
           ++bin;
         }
       }
-      for (int tally_rep = 1; tally_rep < tally_replication; ++tally_rep) {
-        for (int bin = 0; bin < num_bins; ++bin) {
-          int tally_offset = GetTallyOffset{}(bin, tally_bins, tally_rep, tally_replication);
-          new(&tally_mem[tally_offset]) T(identity);
+      for (int tally_rep = 1; tally_rep < tally_replication; ++tally_rep)
+      {
+        for (int bin = 0; bin < num_bins; ++bin)
+        {
+          int tally_offset =
+              GetTallyOffset {}(bin, tally_bins, tally_rep, tally_replication);
+          new (&tally_mem[tally_offset]) T(identity);
         }
       }
     }
@@ -321,15 +375,21 @@ struct MultiReduceGridAtomicHostInit_TallyData
   }
 
   static void destroy_tally(T*& tally_mem,
-                            int num_bins, int tally_bins, int tally_replication)
+                            int num_bins,
+                            int tally_bins,
+                            int tally_replication)
   {
-    if (num_bins == size_t(0)) {
+    if (num_bins == size_t(0))
+    {
       return;
     }
 
-    for (int tally_rep = tally_replication+1; tally_rep > 0; --tally_rep) {
-      for (int bin = num_bins; bin > 0; --bin) {
-        int tally_offset = GetTallyOffset{}(bin-1, tally_bins, tally_rep-1, tally_replication);
+    for (int tally_rep = tally_replication + 1; tally_rep > 0; --tally_rep)
+    {
+      for (int bin = num_bins; bin > 0; --bin)
+      {
+        int tally_offset = GetTallyOffset {}(bin - 1, tally_bins, tally_rep - 1,
+                                             tally_replication);
         tally_mem[tally_offset].~T();
       }
     }
@@ -338,60 +398,54 @@ struct MultiReduceGridAtomicHostInit_TallyData
   }
 
 protected:
-  using GetTallyIndex = typename tally_tuning::ReplicationIndexer;
+  using GetTallyIndex  = typename tally_tuning::ReplicationIndexer;
   using GetTallyOffset = typename GetTallyOffset_rebind::template rebind<int>;
 
   T* m_tally_mem;
   T m_identity;
   int m_num_bins;
   int m_tally_bins;
-  int m_tally_replication; // power of 2, at least the max number of omp threads
+  int m_tally_replication;  // power of 2, at least the max number of omp
+                            // threads
 };
 
-
 //! MultiReduction data for Hip Offload -- stores value, host pointer
-template <typename Combiner, typename T, typename tuning>
+template<typename Combiner, typename T, typename tuning>
 struct MultiReduceGridAtomicHostInit_Data
     : MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>
 {
-  using TallyData = MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>;
+  using TallyData =
+      MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>;
 
   //! defer to tally data for some functions
-  using TallyData::TallyData;
-  using TallyData::reset_permanent;
-  using TallyData::teardown_permanent;
   using TallyData::get;
-  using TallyData::num_bins;
   using TallyData::identity;
+  using TallyData::num_bins;
+  using TallyData::reset_permanent;
+  using TallyData::TallyData;
+  using TallyData::teardown_permanent;
 
   //! setup per launch, do nothing
-  void setup_launch(size_t RAJA_UNUSED_ARG(block_size))
-  { }
+  void setup_launch(size_t RAJA_UNUSED_ARG(block_size)) {}
 
   //! teardown per launch, do nothing
-  void teardown_launch()
-  { }
-
+  void teardown_launch() {}
 
   //! setup on device, do nothing
   RAJA_DEVICE
-  void setup_device()
-  { }
+  void setup_device() {}
 
   //! finalize on device, do nothing
   RAJA_DEVICE
-  void finalize_device()
-  { }
-
+  void finalize_device() {}
 
   //! combine value on device, combine a value into the tally atomically
   RAJA_DEVICE
   void combine_device(int bin, T value)
   {
     impl::block_multi_reduce_combine_global_atomic<Combiner, GetTallyIndex>(
-        m_num_bins, m_identity,
-        bin, value,
-        m_tally_mem, GetTallyOffset{}, m_tally_replication, m_tally_bins);
+        m_num_bins, m_identity, bin, value, m_tally_mem, GetTallyOffset {},
+        m_tally_replication, m_tally_bins);
   }
 
   //! combine value on host, combine a value into the tally
@@ -401,78 +455,89 @@ struct MultiReduceGridAtomicHostInit_Data
 #if defined(RAJA_ENABLE_OPENMP)
     tally_rep = omp_get_thread_num();
 #endif
-    int tally_offset = GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication);
-    Combiner{}(m_tally_mem[tally_offset], value);
+    int tally_offset =
+        GetTallyOffset {}(bin, m_tally_bins, tally_rep, m_tally_replication);
+    Combiner {}(m_tally_mem[tally_offset], value);
   }
 
 private:
   using typename TallyData::GetTallyIndex;
   using typename TallyData::GetTallyOffset;
 
-  using TallyData::m_tally_mem;
   using TallyData::m_identity;
   using TallyData::m_num_bins;
   using TallyData::m_tally_bins;
+  using TallyData::m_tally_mem;
   using TallyData::m_tally_replication;
 };
 
-
 //! MultiReduction data for Hip Offload -- stores value, host pointer
-template <typename Combiner, typename T, typename tuning>
+template<typename Combiner, typename T, typename tuning>
 struct MultiReduceBlockThenGridAtomicHostInit_Data
     : MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>
 {
-  using TallyData = MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>;
+  using TallyData =
+      MultiReduceGridAtomicHostInit_TallyData<Combiner, T, tuning>;
 
   //! setup permanent settings, defer to tally data
-  template < typename Container >
-  MultiReduceBlockThenGridAtomicHostInit_Data(Container const& container, T const& identity)
-      : TallyData(container, identity)
-      , m_shared_offset(s_shared_offset_unknown)
-      , m_shared_replication(0)
-  { }
+  template<typename Container>
+  MultiReduceBlockThenGridAtomicHostInit_Data(Container const& container,
+                                              T const& identity)
+      : TallyData(container, identity),
+        m_shared_offset(s_shared_offset_unknown),
+        m_shared_replication(0)
+  {}
 
   MultiReduceBlockThenGridAtomicHostInit_Data() = delete;
-  MultiReduceBlockThenGridAtomicHostInit_Data(MultiReduceBlockThenGridAtomicHostInit_Data const&) = default;
-  MultiReduceBlockThenGridAtomicHostInit_Data(MultiReduceBlockThenGridAtomicHostInit_Data &&) = delete;
-  MultiReduceBlockThenGridAtomicHostInit_Data& operator=(MultiReduceBlockThenGridAtomicHostInit_Data const&) = default;
-  MultiReduceBlockThenGridAtomicHostInit_Data& operator=(MultiReduceBlockThenGridAtomicHostInit_Data &&) = delete;
-  ~MultiReduceBlockThenGridAtomicHostInit_Data() = default;
+  MultiReduceBlockThenGridAtomicHostInit_Data(
+      MultiReduceBlockThenGridAtomicHostInit_Data const&) = default;
+  MultiReduceBlockThenGridAtomicHostInit_Data(
+      MultiReduceBlockThenGridAtomicHostInit_Data&&) = delete;
+  MultiReduceBlockThenGridAtomicHostInit_Data& operator=(
+      MultiReduceBlockThenGridAtomicHostInit_Data const&) = default;
+  MultiReduceBlockThenGridAtomicHostInit_Data& operator=(
+      MultiReduceBlockThenGridAtomicHostInit_Data&&) = delete;
+  ~MultiReduceBlockThenGridAtomicHostInit_Data()     = default;
 
 
   //! defer to tally data for some functions
-  using TallyData::reset_permanent;
-  using TallyData::teardown_permanent;
   using TallyData::get;
-  using TallyData::num_bins;
   using TallyData::identity;
+  using TallyData::num_bins;
+  using TallyData::reset_permanent;
+  using TallyData::teardown_permanent;
 
   //! setup per launch, setup shared memory parameters
   void setup_launch(size_t block_size)
   {
-    if (m_num_bins == size_t(0)) {
+    if (m_num_bins == size_t(0))
+    {
       m_shared_offset = s_shared_offset_invalid;
       return;
     }
 
     size_t shared_replication = 0;
-    const size_t shared_offset = allocateDynamicShmem<T>(
-        [&](size_t max_shmem_size) {
-
-      struct {
-        size_t func_threads_per_block;
-        size_t func_max_shared_replication_per_block;
-      } func_data{block_size, max_shmem_size / m_num_bins};
-
-      shared_replication = SharedAtomicReplicationConcretizer{}.template
-          get_shared_replication<size_t>(func_data);
-      return m_num_bins * shared_replication;
-    });
-
-    if (shared_offset != dynamic_smem_allocation_failure) {
+    const size_t shared_offset =
+        allocateDynamicShmem<T>([&](size_t max_shmem_size) {
+          struct
+          {
+            size_t func_threads_per_block;
+            size_t func_max_shared_replication_per_block;
+          } func_data {block_size, max_shmem_size / m_num_bins};
+
+          shared_replication =
+              SharedAtomicReplicationConcretizer {}
+                  .template get_shared_replication<size_t>(func_data);
+          return m_num_bins * shared_replication;
+        });
+
+    if (shared_offset != dynamic_smem_allocation_failure)
+    {
       m_shared_replication = static_cast<int>(shared_replication);
-      m_shared_offset = static_cast<int>(shared_offset);
-    } else {
+      m_shared_offset      = static_cast<int>(shared_offset);
+    }
+    else
+    {
       m_shared_offset = s_shared_offset_invalid;
     }
   }
@@ -481,19 +546,18 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
   void teardown_launch()
   {
     m_shared_replication = 0;
-    m_shared_offset = s_shared_offset_unknown;
+    m_shared_offset      = s_shared_offset_unknown;
   }
 
-
   //! setup on device, initialize shared memory
   RAJA_DEVICE
   void setup_device()
   {
     T* shared_mem = get_shared_mem();
-    if (shared_mem != nullptr) {
-      impl::block_multi_reduce_init_shmem(
-          m_num_bins, m_identity,
-          shared_mem, m_shared_replication);
+    if (shared_mem != nullptr)
+    {
+      impl::block_multi_reduce_init_shmem(m_num_bins, m_identity, shared_mem,
+                                          m_shared_replication);
     }
   }
 
@@ -502,30 +566,31 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
   void finalize_device()
   {
     T* shared_mem = get_shared_mem();
-    if (shared_mem != nullptr) {
+    if (shared_mem != nullptr)
+    {
       impl::grid_multi_reduce_shmem_to_global_atomic<Combiner>(
-          m_num_bins, m_identity,
-          shared_mem, GetSharedOffset{}, m_shared_replication,
-          m_tally_mem, GetTallyOffset{}, m_tally_replication, m_tally_bins);
+          m_num_bins, m_identity, shared_mem, GetSharedOffset {},
+          m_shared_replication, m_tally_mem, GetTallyOffset {},
+          m_tally_replication, m_tally_bins);
     }
   }
 
-
   //! combine value on device, combine a value into shared memory
   RAJA_DEVICE
   void combine_device(int bin, T value)
   {
     T* shared_mem = get_shared_mem();
-    if (shared_mem != nullptr) {
+    if (shared_mem != nullptr)
+    {
       impl::block_multi_reduce_combine_shmem_atomic<Combiner, GetSharedIndex>(
-          m_num_bins, m_identity,
-          bin, value,
-          shared_mem, GetSharedOffset{}, m_shared_replication);
-    } else {
+          m_num_bins, m_identity, bin, value, shared_mem, GetSharedOffset {},
+          m_shared_replication);
+    }
+    else
+    {
       impl::block_multi_reduce_combine_global_atomic<Combiner, GetTallyIndex>(
-          m_num_bins, m_identity,
-          bin, value,
-          m_tally_mem, GetTallyOffset{}, m_tally_replication, m_tally_bins);
+          m_num_bins, m_identity, bin, value, m_tally_mem, GetTallyOffset {},
+          m_tally_replication, m_tally_bins);
     }
   }
 
@@ -536,14 +601,16 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
 #if defined(RAJA_ENABLE_OPENMP)
     tally_rep = omp_get_thread_num();
 #endif
-    int tally_offset = GetTallyOffset{}(bin, m_tally_bins, tally_rep, m_tally_replication);
-    Combiner{}(m_tally_mem[tally_offset], value);
+    int tally_offset =
+        GetTallyOffset {}(bin, m_tally_bins, tally_rep, m_tally_replication);
+    Combiner {}(m_tally_mem[tally_offset], value);
   }
 
 private:
   using shared_tuning = typename tuning::SharedAtomicReplicationTuning;
-  using SharedAtomicReplicationConcretizer = typename shared_tuning::AtomicReplicationConcretizer;
-  using GetSharedIndex = typename shared_tuning::ReplicationIndexer;
+  using SharedAtomicReplicationConcretizer =
+      typename shared_tuning::AtomicReplicationConcretizer;
+  using GetSharedIndex         = typename shared_tuning::ReplicationIndexer;
   using GetSharedOffset_rebind = typename shared_tuning::OffsetCalculator;
   using GetSharedOffset = typename GetSharedOffset_rebind::template rebind<int>;
 
@@ -551,24 +618,26 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
   using typename TallyData::GetTallyOffset;
 
 
-  static constexpr int s_shared_offset_unknown = std::numeric_limits<int>::max();
-  static constexpr int s_shared_offset_invalid = std::numeric_limits<int>::max() - 1;
+  static constexpr int s_shared_offset_unknown =
+      std::numeric_limits<int>::max();
+  static constexpr int s_shared_offset_invalid =
+      std::numeric_limits<int>::max() - 1;
 
 
-  using TallyData::m_tally_mem;
   using TallyData::m_identity;
   using TallyData::m_num_bins;
   using TallyData::m_tally_bins;
+  using TallyData::m_tally_mem;
   using TallyData::m_tally_replication;
 
-  int m_shared_offset; // in bytes
-  int m_shared_replication; // power of 2
-
+  int m_shared_offset;       // in bytes
+  int m_shared_replication;  // power of 2
 
   RAJA_DEVICE
   T* get_shared_mem() const
   {
-    if (m_shared_offset == s_shared_offset_invalid) {
+    if (m_shared_offset == s_shared_offset_invalid)
+    {
       return nullptr;
     }
     extern __shared__ char shared_mem[];
@@ -576,7 +645,6 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
   }
 };
 
-
 /*!
  **************************************************************************
  *
@@ -595,39 +663,49 @@ struct MultiReduceBlockThenGridAtomicHostInit_Data
  *
  **************************************************************************
  */
-template < typename T, typename t_MultiReduceOp, typename tuning >
+template<typename T, typename t_MultiReduceOp, typename tuning>
 struct MultiReduceDataHip
 {
-  static constexpr bool atomic_available = RAJA::reduce::hip::hip_atomic_available<T>::value;
+  static constexpr bool atomic_available =
+      RAJA::reduce::hip::hip_atomic_available<T>::value;
 
   //! hip reduction data storage class and folding algorithm
-  using reduce_data_type =
-      std::conditional_t<(atomic_available),
-        std::conditional_t<(tuning::algorithm == multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic),
-          hip::MultiReduceBlockThenGridAtomicHostInit_Data<t_MultiReduceOp, T, tuning>,
-          std::conditional_t<(tuning::algorithm == multi_reduce_algorithm::init_host_combine_global_atomic),
-            hip::MultiReduceGridAtomicHostInit_Data<t_MultiReduceOp, T, tuning>,
-            void>>,
+  using reduce_data_type = std::conditional_t<
+      (atomic_available),
+      std::conditional_t<
+          (tuning::algorithm ==
+           multi_reduce_algorithm::
+               init_host_combine_block_atomic_then_grid_atomic),
+          hip::MultiReduceBlockThenGridAtomicHostInit_Data<t_MultiReduceOp,
+                                                           T,
+                                                           tuning>,
+          std::conditional_t<
+              (tuning::algorithm ==
+               multi_reduce_algorithm::init_host_combine_global_atomic),
+              hip::MultiReduceGridAtomicHostInit_Data<t_MultiReduceOp,
+                                                      T,
+                                                      tuning>,
+              void>>,
       void>;
 
 
   using SyncList = std::vector<resources::Hip>;
 
 public:
-  using value_type = T;
+  using value_type    = T;
   using MultiReduceOp = t_MultiReduceOp;
 
   MultiReduceDataHip() = delete;
 
-  template < typename Container,
-             std::enable_if_t<!std::is_same<Container, MultiReduceDataHip>::value>* = nullptr >
+  template<typename Container,
+           std::enable_if_t<
+               !std::is_same<Container, MultiReduceDataHip>::value>* = nullptr>
   MultiReduceDataHip(Container const& container, T identity)
-      : m_parent(this)
-      , m_sync_list(new SyncList)
-      , m_data(container, identity)
-      , m_own_launch_data(false)
-  {
-  }
+      : m_parent(this),
+        m_sync_list(new SyncList),
+        m_data(container, identity),
+        m_own_launch_data(false)
+  {}
 
   //! copy and on host attempt to setup for device
   //  init val_ptr to avoid uninitialized read caused by host copy of
@@ -639,31 +717,35 @@ struct MultiReduceDataHip
 #else
       : m_parent(&other)
 #endif
-      , m_sync_list(other.m_sync_list)
-      , m_data(other.m_data)
-      , m_own_launch_data(false)
+        ,
+        m_sync_list(other.m_sync_list),
+        m_data(other.m_data),
+        m_own_launch_data(false)
   {
 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
-    if (m_parent) {
-      if (setupReducers()) {
+    if (m_parent)
+    {
+      if (setupReducers())
+      {
         // the copy made in make_launch_body does this setup
         add_resource_to_synchronization_list(currentResource());
         m_data.setup_launch(currentBlockSize());
         m_own_launch_data = true;
-        m_parent = nullptr;
+        m_parent          = nullptr;
       }
     }
 #else
-    if (!m_parent->m_parent) {
+    if (!m_parent->m_parent)
+    {
       // the first copy on device enters this branch
       m_data.setup_device();
     }
 #endif
   }
 
-  MultiReduceDataHip(MultiReduceDataHip &&) = delete;
+  MultiReduceDataHip(MultiReduceDataHip&&)                 = delete;
   MultiReduceDataHip& operator=(MultiReduceDataHip const&) = delete;
-  MultiReduceDataHip& operator=(MultiReduceDataHip &&) = delete;
+  MultiReduceDataHip& operator=(MultiReduceDataHip&&)      = delete;
 
   //! cleanup resources owned by this copy
   //  on device store in pinned buffer on host
@@ -671,38 +753,43 @@ struct MultiReduceDataHip
   ~MultiReduceDataHip()
   {
 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
-    if (m_parent == this) {
+    if (m_parent == this)
+    {
       // the original object, owns permanent storage
       synchronize_resources_and_clear_list();
       delete m_sync_list;
       m_sync_list = nullptr;
       m_data.teardown_permanent();
-    } else if (m_parent) {
+    }
+    else if (m_parent)
+    {
       // do nothing
-    } else {
-      if (m_own_launch_data) {
+    }
+    else
+    {
+      if (m_own_launch_data)
+      {
         // the copy made in make_launch_body, owns launch data
         m_data.teardown_launch();
         m_own_launch_data = false;
       }
     }
 #else
-    if (!m_parent->m_parent) {
+    if (!m_parent->m_parent)
+    {
       // the first copy on device, does finalization on the device
       m_data.finalize_device();
     }
 #endif
   }
 
-
-  template < typename Container >
+  template<typename Container>
   void reset(Container const& container, T identity)
   {
     synchronize_resources_and_clear_list();
     m_data.reset_permanent(container, identity);
   }
 
-
   //! apply reduction (const version) -- still combines internal values
   RAJA_HOST_DEVICE
   void combine(int bin, T const& value)
@@ -714,7 +801,6 @@ struct MultiReduceDataHip
 #endif
   }
 
-
   //! map result value back to host if not done already; return aggregate value
   T get(int bin)
   {
@@ -722,22 +808,23 @@ struct MultiReduceDataHip
     return m_data.get(bin);
   }
 
-
   size_t num_bins() const { return m_data.num_bins(); }
 
   T identity() const { return m_data.identity(); }
 
 
 private:
-  MultiReduceDataHip const *m_parent;
+  MultiReduceDataHip const* m_parent;
   SyncList* m_sync_list;
   reduce_data_type m_data;
   bool m_own_launch_data;
 
   void add_resource_to_synchronization_list(resources::Hip res)
   {
-    for (resources::Hip& list_res : *m_sync_list) {
-      if (list_res.get_stream() == res.get_stream()) {
+    for (resources::Hip& list_res : *m_sync_list)
+    {
+      if (list_res.get_stream() == res.get_stream())
+      {
         return;
       }
     }
@@ -746,7 +833,8 @@ struct MultiReduceDataHip
 
   void synchronize_resources_and_clear_list()
   {
-    for (resources::Hip& list_res : *m_sync_list) {
+    for (resources::Hip& list_res : *m_sync_list)
+    {
       ::RAJA::hip::synchronize(list_res);
     }
     m_sync_list->clear();
@@ -755,7 +843,8 @@ struct MultiReduceDataHip
 
 }  // end namespace hip
 
-RAJA_DECLARE_ALL_MULTI_REDUCERS(policy::hip::hip_multi_reduce_policy, hip::MultiReduceDataHip)
+RAJA_DECLARE_ALL_MULTI_REDUCERS(policy::hip::hip_multi_reduce_policy,
+                                hip::MultiReduceDataHip)
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/policy/hip/params/kernel_name.hpp b/include/RAJA/policy/hip/params/kernel_name.hpp
index 30269f8406..13fac86478 100644
--- a/include/RAJA/policy/hip/params/kernel_name.hpp
+++ b/include/RAJA/policy/hip/params/kernel_name.hpp
@@ -11,42 +11,47 @@
 #include "roctx.h"
 #endif
 
-namespace RAJA {
-namespace expt {
-namespace detail {
-
-  // Init
-  template<typename EXEC_POL>
-  camp::concepts::enable_if< type_traits::is_hip_policy<EXEC_POL> >
-  init(KernelName& kn, const RAJA::hip::detail::hipInfo &)
-  {
+namespace RAJA
+{
+namespace expt
+{
+namespace detail
+{
+
+// Init
+template<typename EXEC_POL>
+camp::concepts::enable_if<type_traits::is_hip_policy<EXEC_POL>> init(
+    KernelName& kn,
+    const RAJA::hip::detail::hipInfo&)
+{
 #if defined(RAJA_ENABLE_ROCTX)
-    roctxRangePush(kn.name);
+  roctxRangePush(kn.name);
 #else
-    RAJA_UNUSED_VAR(kn);
+  RAJA_UNUSED_VAR(kn);
 #endif
-  }
-
-  // Combine
-  template<typename EXEC_POL>
-  RAJA_HOST_DEVICE
-  camp::concepts::enable_if< type_traits::is_hip_policy<EXEC_POL> >
-  combine(KernelName&) {}
-
-  // Resolve
-  template<typename EXEC_POL>
-  camp::concepts::enable_if< type_traits::is_hip_policy<EXEC_POL> >
-  resolve(KernelName&, const RAJA::hip::detail::hipInfo &)
-  {
+}
+
+// Combine
+template<typename EXEC_POL>
+RAJA_HOST_DEVICE camp::concepts::enable_if<type_traits::is_hip_policy<EXEC_POL>>
+combine(KernelName&)
+{}
+
+// Resolve
+template<typename EXEC_POL>
+camp::concepts::enable_if<type_traits::is_hip_policy<EXEC_POL>> resolve(
+    KernelName&,
+    const RAJA::hip::detail::hipInfo&)
+{
 #if defined(RAJA_ENABLE_ROCTX)
-    roctxRangePop();
+  roctxRangePop();
 #endif
-  }
+}
 
-} //  namespace detail
-} //  namespace expt
-} //  namespace RAJA
+}  //  namespace detail
+}  //  namespace expt
+}  //  namespace RAJA
 
 #endif
 
-#endif //  NEW_REDUCE_HIP_REDUCE_HPP
+#endif  //  NEW_REDUCE_HIP_REDUCE_HPP
diff --git a/include/RAJA/policy/hip/params/reduce.hpp b/include/RAJA/policy/hip/params/reduce.hpp
index a3da07ee2c..d50e07a912 100644
--- a/include/RAJA/policy/hip/params/reduce.hpp
+++ b/include/RAJA/policy/hip/params/reduce.hpp
@@ -8,54 +8,58 @@
 #include "RAJA/policy/hip/reduce.hpp"
 #include "RAJA/pattern/params/reducer.hpp"
 
-namespace RAJA {
-namespace expt {
-namespace detail {
-
-  // Init
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_hip_policy<EXEC_POL> >
-  init(Reducer<OP, T, VOp>& red, RAJA::hip::detail::hipInfo& hi)
-  {
-    red.devicetarget = RAJA::hip::pinned_mempool_type::getInstance().template malloc<T>(1);
-    red.device_mem.allocate(hi.gridDim.x * hi.gridDim.y * hi.gridDim.z);
-    red.device_count = RAJA::hip::device_zeroed_mempool_type::getInstance().template malloc<unsigned int>(1);
-  }
-
-  // Combine
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  RAJA_HOST_DEVICE
-  camp::concepts::enable_if< type_traits::is_hip_policy<EXEC_POL> >
-  combine(Reducer<OP, T, VOp>& red)
-  {
-    RAJA::hip::impl::expt::grid_reduce<typename EXEC_POL::IterationGetter,OP>( red.devicetarget,
-                                                                            red.getVal(),
-                                                                            red.device_mem,
-                                                                            red.device_count);
-  }
-
-  // Resolve
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_hip_policy<EXEC_POL> >
-  resolve(Reducer<OP, T, VOp>& red, RAJA::hip::detail::hipInfo& hi)
-  {
-    // complete reduction
-    hi.res.wait();
-
-    red.combineTarget(*red.devicetarget);
-
-    // free memory
-    RAJA::hip::device_zeroed_mempool_type::getInstance().free(red.device_count);
-    red.device_count = nullptr;
-    red.device_mem.deallocate();
-    RAJA::hip::pinned_mempool_type::getInstance().free(red.devicetarget);
-    red.devicetarget = nullptr;
-  }
-
-} //  namespace detail
-} //  namespace expt
-} //  namespace RAJA
+namespace RAJA
+{
+namespace expt
+{
+namespace detail
+{
+
+// Init
+template<typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_hip_policy<EXEC_POL>> init(
+    Reducer<OP, T, VOp>& red,
+    RAJA::hip::detail::hipInfo& hi)
+{
+  red.devicetarget =
+      RAJA::hip::pinned_mempool_type::getInstance().template malloc<T>(1);
+  red.device_mem.allocate(hi.gridDim.x * hi.gridDim.y * hi.gridDim.z);
+  red.device_count = RAJA::hip::device_zeroed_mempool_type::getInstance()
+                         .template malloc<unsigned int>(1);
+}
+
+// Combine
+template<typename EXEC_POL, typename OP, typename T, typename VOp>
+RAJA_HOST_DEVICE camp::concepts::enable_if<type_traits::is_hip_policy<EXEC_POL>>
+combine(Reducer<OP, T, VOp>& red)
+{
+  RAJA::hip::impl::expt::grid_reduce<typename EXEC_POL::IterationGetter, OP>(
+      red.devicetarget, red.getVal(), red.device_mem, red.device_count);
+}
+
+// Resolve
+template<typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_hip_policy<EXEC_POL>> resolve(
+    Reducer<OP, T, VOp>& red,
+    RAJA::hip::detail::hipInfo& hi)
+{
+  // complete reduction
+  hi.res.wait();
+
+  red.combineTarget(*red.devicetarget);
+
+  // free memory
+  RAJA::hip::device_zeroed_mempool_type::getInstance().free(red.device_count);
+  red.device_count = nullptr;
+  red.device_mem.deallocate();
+  RAJA::hip::pinned_mempool_type::getInstance().free(red.devicetarget);
+  red.devicetarget = nullptr;
+}
+
+}  //  namespace detail
+}  //  namespace expt
+}  //  namespace RAJA
 
 #endif
 
-#endif //  NEW_REDUCE_HIP_REDUCE_HPP
+#endif  //  NEW_REDUCE_HIP_REDUCE_HPP
diff --git a/include/RAJA/policy/hip/policy.hpp b/include/RAJA/policy/hip/policy.hpp
index a9f9027675..0b63cbf6c0 100644
--- a/include/RAJA/policy/hip/policy.hpp
+++ b/include/RAJA/policy/hip/policy.hpp
@@ -38,7 +38,7 @@
 namespace RAJA
 {
 
-using hip_dim_t = dim3;
+using hip_dim_t        = dim3;
 using hip_dim_member_t = camp::decay<decltype(std::declval<hip_dim_t>().x)>;
 
 //
@@ -55,13 +55,15 @@ using hip_dim_member_t = camp::decay<decltype(std::declval<hip_dim_t>().x)>;
 
 namespace detail
 {
-template <bool Async>
-struct get_launch {
+template<bool Async>
+struct get_launch
+{
   static constexpr RAJA::Launch value = RAJA::Launch::async;
 };
 
-template <>
-struct get_launch<false> {
+template<>
+struct get_launch<false>
+{
   static constexpr RAJA::Launch value = RAJA::Launch::sync;
 };
 }  // end namespace detail
@@ -73,7 +75,7 @@ namespace hip
 template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
 struct IndexGlobal;
 
-template<typename ...indexers>
+template<typename... indexers>
 struct IndexFlatten;
 
 template<size_t divisor, typename index>
@@ -82,7 +84,6 @@ struct IndexDivide;
 template<size_t divisor, typename index>
 struct IndexModulo;
 
-
 /*!
  * Use the max occupancy of a kernel on the current device when launch
  * parameters are not fully determined.
@@ -91,13 +92,14 @@ struct IndexModulo;
  */
 struct MaxOccupancyConcretizer
 {
-  template < typename IdxT, typename Data >
+  template<typename IdxT, typename Data>
   static IdxT get_max_grid_size(Data const& data)
   {
-    IdxT device_sm_per_device = data.device_sm_per_device;
+    IdxT device_sm_per_device   = data.device_sm_per_device;
     IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm;
 
-    IdxT func_max_blocks_per_device = func_max_blocks_per_sm * device_sm_per_device;
+    IdxT func_max_blocks_per_device =
+        func_max_blocks_per_sm * device_sm_per_device;
 
     return func_max_blocks_per_device;
   }
@@ -110,26 +112,31 @@ struct MaxOccupancyConcretizer
  * maximum grid size:
  * (Fraction * kernel_max_blocks_per_sm + BLOCKS_PER_SM_OFFSET) * device_sm
  */
-template < typename t_Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET >
+template<typename t_Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET>
 struct FractionOffsetOccupancyConcretizer
 {
-  template < typename IdxT, typename Data >
+  template<typename IdxT, typename Data>
   static IdxT get_max_grid_size(Data const& data)
   {
     using Fraction = typename t_Fraction::template rebind<IdxT>;
 
-    IdxT device_sm_per_device = data.device_sm_per_device;
+    IdxT device_sm_per_device   = data.device_sm_per_device;
     IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm;
 
-    if (Fraction::multiply(func_max_blocks_per_sm) > IdxT(0)) {
+    if (Fraction::multiply(func_max_blocks_per_sm) > IdxT(0))
+    {
       func_max_blocks_per_sm = Fraction::multiply(func_max_blocks_per_sm);
     }
 
-    if (IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET) > IdxT(0)) {
-      func_max_blocks_per_sm = IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET);
+    if (IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET) >
+        IdxT(0))
+    {
+      func_max_blocks_per_sm =
+          IdxT(std::ptrdiff_t(func_max_blocks_per_sm) + BLOCKS_PER_SM_OFFSET);
     }
 
-    IdxT func_max_blocks_per_device = func_max_blocks_per_sm * device_sm_per_device;
+    IdxT func_max_blocks_per_device =
+        func_max_blocks_per_sm * device_sm_per_device;
 
     return func_max_blocks_per_device;
   }
@@ -143,34 +150,38 @@ struct FractionOffsetOccupancyConcretizer
  * Otherwise use the given AvoidMaxOccupancyCalculator to determine the
  * maximum grid size.
  */
-template < typename AvoidMaxOccupancyConcretizer >
+template<typename AvoidMaxOccupancyConcretizer>
 struct AvoidDeviceMaxThreadOccupancyConcretizer
 {
-  template < typename IdxT, typename Data >
+  template<typename IdxT, typename Data>
   static IdxT get_max_grid_size(Data const& data)
   {
     IdxT device_max_threads_per_sm = data.device_max_threads_per_sm;
-    IdxT func_max_blocks_per_sm = data.func_max_blocks_per_sm;
-    IdxT func_threads_per_block = data.func_threads_per_block;
+    IdxT func_max_blocks_per_sm    = data.func_max_blocks_per_sm;
+    IdxT func_threads_per_block    = data.func_threads_per_block;
 
-    IdxT func_max_threads_per_sm = func_threads_per_block * func_max_blocks_per_sm;
+    IdxT func_max_threads_per_sm =
+        func_threads_per_block * func_max_blocks_per_sm;
 
-    if (func_max_threads_per_sm < device_max_threads_per_sm) {
+    if (func_max_threads_per_sm < device_max_threads_per_sm)
+    {
       return MaxOccupancyConcretizer::template get_max_grid_size<IdxT>(data);
-    } else {
-      return AvoidMaxOccupancyConcretizer::template get_max_grid_size<IdxT>(data);
+    }
+    else
+    {
+      return AvoidMaxOccupancyConcretizer::template get_max_grid_size<IdxT>(
+          data);
     }
   }
 };
 
-
 /*!
  * Get an amount of replication that is preferred_replication.
  */
-template < size_t preferred_replication >
+template<size_t preferred_replication>
 struct ConstantPreferredReplicationConcretizer
 {
-  template < typename IdxT, typename Data >
+  template<typename IdxT, typename Data>
   static IdxT get_preferred_replication(Data const& RAJA_UNUSED_ARG(data))
   {
     return IdxT(preferred_replication);
@@ -182,19 +193,23 @@ struct ConstantPreferredReplicationConcretizer
  * data.func_threads_per_block is less than t_cutoff or
  * preferred_replication_after_cutoff otherwise.
  */
-template < size_t t_cutoff, size_t preferred_replication_before_cutoff,
-                            size_t preferred_replication_after_cutoff >
+template<size_t t_cutoff,
+         size_t preferred_replication_before_cutoff,
+         size_t preferred_replication_after_cutoff>
 struct ThreadsPerBlockCutoffPreferredReplicationConcretizer
 {
-  template < typename IdxT, typename Data >
+  template<typename IdxT, typename Data>
   static IdxT get_preferred_replication(Data const& data)
   {
-    IdxT cutoff = t_cutoff;
+    IdxT cutoff                 = t_cutoff;
     IdxT func_threads_per_block = data.func_threads_per_block;
 
-    if (func_threads_per_block < cutoff) {
+    if (func_threads_per_block < cutoff)
+    {
       return IdxT(preferred_replication_before_cutoff);
-    } else {
+    }
+    else
+    {
       return IdxT(preferred_replication_after_cutoff);
     }
   }
@@ -205,19 +220,21 @@ struct ThreadsPerBlockCutoffPreferredReplicationConcretizer
  * most the amount given by data.func_max_shared_replication_per_block or the
  * amount given by GetPreferredReplication.
  */
-template < typename GetPreferredReplication >
+template<typename GetPreferredReplication>
 struct SharedAtomicReplicationMaxPow2Concretizer
 {
-  template < typename IdxT, typename Data >
+  template<typename IdxT, typename Data>
   static IdxT get_shared_replication(Data const& data)
   {
-    IdxT func_max_shared_replication_per_block = data.func_max_shared_replication_per_block;
+    IdxT func_max_shared_replication_per_block =
+        data.func_max_shared_replication_per_block;
 
-    IdxT preferred_replication = GetPreferredReplication{}.template
-        get_preferred_replication<IdxT>(data);
+    IdxT preferred_replication =
+        GetPreferredReplication {}.template get_preferred_replication<IdxT>(
+            data);
 
-    return prev_pow2(std::min(preferred_replication,
-                              func_max_shared_replication_per_block));
+    return prev_pow2(
+        std::min(preferred_replication, func_max_shared_replication_per_block));
   }
 };
 
@@ -226,18 +243,20 @@ struct SharedAtomicReplicationMaxPow2Concretizer
  * least the amount given by data.func_min_global_replication or the
  * amount given by GetPreferredReplication.
  */
-template < typename GetPreferredReplication >
+template<typename GetPreferredReplication>
 struct GlobalAtomicReplicationMinPow2Concretizer
 {
-  template < typename IdxT, typename Data >
+  template<typename IdxT, typename Data>
   static IdxT get_global_replication(Data const& data)
   {
     IdxT func_min_global_replication = data.func_min_global_replication;
 
-    IdxT preferred_replication = GetPreferredReplication{}.template
-        get_preferred_replication<IdxT>(data);
+    IdxT preferred_replication =
+        GetPreferredReplication {}.template get_preferred_replication<IdxT>(
+            data);
 
-    return next_pow2(std::max(preferred_replication, func_min_global_replication));
+    return next_pow2(
+        std::max(preferred_replication, func_min_global_replication));
   }
 };
 
@@ -255,14 +274,16 @@ enum struct block_communication_mode : int
   block_fence
 };
 
-template < reduce_algorithm t_algorithm, block_communication_mode t_comm_mode,
-           size_t t_replication, size_t t_atomic_stride >
+template<reduce_algorithm t_algorithm,
+         block_communication_mode t_comm_mode,
+         size_t t_replication,
+         size_t t_atomic_stride>
 struct ReduceTuning
 {
-  static constexpr reduce_algorithm algorithm = t_algorithm;
+  static constexpr reduce_algorithm algorithm         = t_algorithm;
   static constexpr block_communication_mode comm_mode = t_comm_mode;
-  static constexpr size_t replication = t_replication;
-  static constexpr size_t atomic_stride = t_atomic_stride;
+  static constexpr size_t replication                 = t_replication;
+  static constexpr size_t atomic_stride               = t_atomic_stride;
   static constexpr bool consistent =
       (algorithm == reduce_algorithm::combine_last_block);
 };
@@ -274,25 +295,25 @@ enum struct multi_reduce_algorithm : int
   init_host_combine_global_atomic
 };
 
-template < typename t_AtomicReplicationConcretizer,
-           typename t_ReplicationIndexer,
-           typename t_OffsetCalculator >
+template<typename t_AtomicReplicationConcretizer,
+         typename t_ReplicationIndexer,
+         typename t_OffsetCalculator>
 struct AtomicReplicationTuning
 {
   using AtomicReplicationConcretizer = t_AtomicReplicationConcretizer;
-  using ReplicationIndexer = t_ReplicationIndexer;
-  using OffsetCalculator = t_OffsetCalculator;
+  using ReplicationIndexer           = t_ReplicationIndexer;
+  using OffsetCalculator             = t_OffsetCalculator;
 };
 
-template < multi_reduce_algorithm t_algorithm,
-           typename t_SharedAtomicReplicationTuning,
-           typename t_GlobalAtomicReplicationTuning >
+template<multi_reduce_algorithm t_algorithm,
+         typename t_SharedAtomicReplicationTuning,
+         typename t_GlobalAtomicReplicationTuning>
 struct MultiReduceTuning
 {
   static constexpr multi_reduce_algorithm algorithm = t_algorithm;
   using SharedAtomicReplicationTuning = t_SharedAtomicReplicationTuning;
   using GlobalAtomicReplicationTuning = t_GlobalAtomicReplicationTuning;
-  static constexpr bool consistent = false;
+  static constexpr bool consistent    = false;
 };
 
 }  // namespace hip
@@ -307,16 +328,19 @@ struct DeviceConstants
   RAJA::Index_type WARP_SIZE;
   RAJA::Index_type MAX_BLOCK_SIZE;
   RAJA::Index_type MAX_WARPS;
-  RAJA::Index_type ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE; // basically the cache line size of the cache level that handles atomics
+  RAJA::Index_type
+      ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE;  // basically the cache line size of
+                                             // the cache level that handles
+                                             // atomics
 
   constexpr DeviceConstants(RAJA::Index_type warp_size,
                             RAJA::Index_type max_block_size,
                             RAJA::Index_type atomic_cache_line_bytes) noexcept
-    : WARP_SIZE(warp_size)
-    , MAX_BLOCK_SIZE(max_block_size)
-    , MAX_WARPS(max_block_size / warp_size)
-    , ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE(atomic_cache_line_bytes)
-  { }
+      : WARP_SIZE(warp_size),
+        MAX_BLOCK_SIZE(max_block_size),
+        MAX_WARPS(max_block_size / warp_size),
+        ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE(atomic_cache_line_bytes)
+  {}
 };
 
 //
@@ -324,50 +348,58 @@ struct DeviceConstants
 // values for HIP warp size and max block size.
 //
 #if defined(__HIP_PLATFORM_AMD__)
-constexpr DeviceConstants device_constants(64, 1024, 64); // MI300A
+constexpr DeviceConstants device_constants(64, 1024, 64);  // MI300A
 // constexpr DeviceConstants device_constants(64, 1024, 128); // MI250X
 #elif defined(__HIP_PLATFORM_NVIDIA__)
-constexpr DeviceConstants device_constants(32, 1024, 32); // V100
+constexpr DeviceConstants device_constants(32, 1024, 32);  // V100
 #endif
 static_assert(device_constants.WARP_SIZE >= device_constants.MAX_WARPS,
-              "RAJA Assumption Broken: device_constants.WARP_SIZE < device_constants.MAX_WARPS");
+              "RAJA Assumption Broken: device_constants.WARP_SIZE < "
+              "device_constants.MAX_WARPS");
 static_assert(device_constants.MAX_BLOCK_SIZE % device_constants.WARP_SIZE == 0,
               "RAJA Assumption Broken: device_constants.MAX_BLOCK_SIZE not "
               "a multiple of device_constants.WARP_SIZE");
 
+template<typename _IterationMapping,
+         kernel_sync_requirement sync,
+         typename... _IterationGetters>
+struct hip_indexer
+{};
 
-template <typename _IterationMapping, kernel_sync_requirement sync, typename ... _IterationGetters>
-struct hip_indexer {};
-
-template <typename _IterationMapping, kernel_sync_requirement sync, typename ... _IterationGetters>
+template<typename _IterationMapping,
+         kernel_sync_requirement sync,
+         typename... _IterationGetters>
 struct hip_flatten_indexer : public RAJA::make_policy_pattern_launch_platform_t<
-  RAJA::Policy::hip,
-  RAJA::Pattern::region,
-  detail::get_launch<true /*async */>::value,
-  RAJA::Platform::hip> {
+                                 RAJA::Policy::hip,
+                                 RAJA::Pattern::region,
+                                 detail::get_launch<true /*async */>::value,
+                                 RAJA::Platform::hip>
+{
   using IterationGetter = RAJA::hip::IndexFlatten<_IterationGetters...>;
 };
 
-template <typename _IterationMapping, typename _IterationGetter, typename _LaunchConcretizer,
-          bool Async = false>
+template<typename _IterationMapping,
+         typename _IterationGetter,
+         typename _LaunchConcretizer,
+         bool Async = false>
 struct hip_exec : public RAJA::make_policy_pattern_launch_platform_t<
-                       RAJA::Policy::hip,
-                       RAJA::Pattern::forall,
-                       detail::get_launch<Async>::value,
-                       RAJA::Platform::hip> {
-  using IterationMapping = _IterationMapping;
-  using IterationGetter = _IterationGetter;
+                      RAJA::Policy::hip,
+                      RAJA::Pattern::forall,
+                      detail::get_launch<Async>::value,
+                      RAJA::Platform::hip>
+{
+  using IterationMapping  = _IterationMapping;
+  using IterationGetter   = _IterationGetter;
   using LaunchConcretizer = _LaunchConcretizer;
 };
 
-template <bool Async, int num_threads = named_usage::unspecified>
+template<bool Async, int num_threads = named_usage::unspecified>
 struct hip_launch_t : public RAJA::make_policy_pattern_launch_platform_t<
-                       RAJA::Policy::hip,
-                       RAJA::Pattern::region,
-                       detail::get_launch<Async>::value,
-                       RAJA::Platform::hip> {
-};
-
+                          RAJA::Policy::hip,
+                          RAJA::Pattern::region,
+                          detail::get_launch<Async>::value,
+                          RAJA::Platform::hip>
+{};
 
 //
 // NOTE: There is no Index set segment iteration policy for HIP
@@ -376,13 +408,13 @@ struct hip_launch_t : public RAJA::make_policy_pattern_launch_platform_t<
 ///
 /// WorkGroup execution policies
 ///
-template <size_t BLOCK_SIZE, bool Async = false>
+template<size_t BLOCK_SIZE, bool Async = false>
 struct hip_work : public RAJA::make_policy_pattern_launch_platform_t<
-                       RAJA::Policy::hip,
-                       RAJA::Pattern::workgroup_exec,
-                       detail::get_launch<Async>::value,
-                       RAJA::Platform::hip> {
-};
+                      RAJA::Policy::hip,
+                      RAJA::Pattern::workgroup_exec,
+                      detail::get_launch<Async>::value,
+                      RAJA::Platform::hip>
+{};
 
 /// execute the enqueued loops in an unordered fashion by mapping loops to
 /// blocks in the y direction and loop iterations to threads in the x direction
@@ -390,11 +422,10 @@ struct hip_work : public RAJA::make_policy_pattern_launch_platform_t<
 /// of all the loops
 struct unordered_hip_loop_y_block_iter_x_threadblock_average
     : public RAJA::make_policy_pattern_platform_t<
-                       RAJA::Policy::hip,
-                       RAJA::Pattern::workgroup_order,
-                       RAJA::Platform::hip> {
-};
-
+          RAJA::Policy::hip,
+          RAJA::Pattern::workgroup_order,
+          RAJA::Platform::hip>
+{};
 
 ///
 ///////////////////////////////////////////////////////////////////////
@@ -405,36 +436,36 @@ struct unordered_hip_loop_y_block_iter_x_threadblock_average
 ///
 
 
-template < typename tuning >
-struct hip_reduce_policy
-    : public RAJA::
-          make_policy_pattern_launch_platform_t<RAJA::Policy::hip,
-                                                RAJA::Pattern::reduce,
-                                                detail::get_launch<false>::value,
-                                                RAJA::Platform::hip,
-                                                std::conditional_t<tuning::consistent,
-                                                                   reduce::ordered,
-                                                                   reduce::unordered>> {
-};
+template<typename tuning>
+struct hip_reduce_policy : public RAJA::make_policy_pattern_launch_platform_t<
+                               RAJA::Policy::hip,
+                               RAJA::Pattern::reduce,
+                               detail::get_launch<false>::value,
+                               RAJA::Platform::hip,
+                               std::conditional_t<tuning::consistent,
+                                                  reduce::ordered,
+                                                  reduce::unordered>>
+{};
 
-template < typename tuning >
+template<typename tuning>
 struct hip_multi_reduce_policy
-    : public RAJA::
-          make_policy_pattern_launch_platform_t<RAJA::Policy::hip,
-                                                RAJA::Pattern::multi_reduce,
-                                                detail::get_launch<false>::value,
-                                                RAJA::Platform::hip,
-                                                std::conditional_t<tuning::consistent,
-                                                                   reduce::ordered,
-                                                                   reduce::unordered>> {
-};
+    : public RAJA::make_policy_pattern_launch_platform_t<
+          RAJA::Policy::hip,
+          RAJA::Pattern::multi_reduce,
+          detail::get_launch<false>::value,
+          RAJA::Platform::hip,
+          std::conditional_t<tuning::consistent,
+                             reduce::ordered,
+                             reduce::unordered>>
+{};
 
 /*!
  * Hip atomic policy for using hip atomics on the device and
  * the provided policy on the host
  */
 template<typename host_policy>
-struct hip_atomic_explicit{};
+struct hip_atomic_explicit
+{};
 
 /*!
  * Default hip atomic policy uses hip atomics on the device and non-atomics
@@ -442,14 +473,15 @@ struct hip_atomic_explicit{};
  */
 using hip_atomic = hip_atomic_explicit<seq_atomic>;
 
-
 // Policy for RAJA::statement::Reduce that reduces threads in a block
 // down to threadIdx 0
-struct hip_block_reduce{};
+struct hip_block_reduce
+{};
 
 // Policy for RAJA::statement::Reduce that reduces threads in a warp
 // down to the first lane of the warp
-struct hip_warp_reduce{};
+struct hip_warp_reduce
+{};
 
 // Policy to map work directly to threads within a warp
 // Maximum iteration count is WARP_SIZE
@@ -463,7 +495,6 @@ struct hip_warp_reduce{};
 // struct hip_warp_loop{};
 
 
-
 // Policy to map work to threads within a warp using a bit mask
 // Cannot be used in conjunction with hip_thread_x_*
 // Multiple warps have to be created by using hip_thread_{yz}_*
@@ -471,7 +502,8 @@ struct hip_warp_reduce{};
 // hip_warp_masked
 // can be used to create complex thread interleaving patterns
 template<typename Mask>
-struct hip_warp_masked_direct {};
+struct hip_warp_masked_direct
+{};
 
 // Policy to map work to threads within a warp using a bit mask
 // Cannot be used in conjunction with hip_thread_x_*
@@ -480,82 +512,84 @@ struct hip_warp_masked_direct {};
 // hip_warp_masked
 // can be used to create complex thread interleaving patterns
 template<typename Mask>
-struct hip_warp_masked_loop {};
-
+struct hip_warp_masked_loop
+{};
 
 template<typename Mask>
-struct hip_thread_masked_direct {};
+struct hip_thread_masked_direct
+{};
 
 template<typename Mask>
-struct hip_thread_masked_loop {};
-
+struct hip_thread_masked_loop
+{};
 
 struct hip_synchronize : make_policy_pattern_launch_t<Policy::hip,
-                                                       Pattern::synchronize,
-                                                       Launch::sync> {
-};
+                                                      Pattern::synchronize,
+                                                      Launch::sync>
+{};
 
 }  // end namespace hip
 }  // end namespace policy
 
-
 namespace internal
 {
 
 RAJA_INLINE
 int get_size(hip_dim_t dims)
 {
-  if(dims.x == 0 && dims.y == 0 && dims.z == 0){
+  if (dims.x == 0 && dims.y == 0 && dims.z == 0)
+  {
     return 0;
   }
-  return (dims.x ? dims.x : 1) *
-         (dims.y ? dims.y : 1) *
-         (dims.z ? dims.z : 1);
+  return (dims.x ? dims.x : 1) * (dims.y ? dims.y : 1) * (dims.z ? dims.z : 1);
 }
 
-struct HipDims {
+struct HipDims
+{
 
-  hip_dim_t blocks{0,0,0};
-  hip_dim_t threads{0,0,0};
+  hip_dim_t blocks {0, 0, 0};
+  hip_dim_t threads {0, 0, 0};
 
-  HipDims() = default;
-  HipDims(HipDims const&) = default;
+  HipDims()                          = default;
+  HipDims(HipDims const&)            = default;
   HipDims& operator=(HipDims const&) = default;
 
   RAJA_INLINE
   HipDims(hip_dim_member_t default_val)
-    : blocks{default_val, default_val, default_val}
-    , threads{default_val, default_val, default_val}
-  { }
+      : blocks {default_val, default_val, default_val},
+        threads {default_val, default_val, default_val}
+  {}
 
   RAJA_INLINE
-  int num_blocks() const {
-    return get_size(blocks);
-  }
+  int num_blocks() const { return get_size(blocks); }
 
   RAJA_INLINE
-  int num_threads() const {
-    return get_size(threads);
-  }
+  int num_threads() const { return get_size(threads); }
 
   RAJA_INLINE
-  hip_dim_t get_blocks() const {
-    if (num_blocks() != 0) {
-      return {(blocks.x ? blocks.x : 1),
-              (blocks.y ? blocks.y : 1),
+  hip_dim_t get_blocks() const
+  {
+    if (num_blocks() != 0)
+    {
+      return {(blocks.x ? blocks.x : 1), (blocks.y ? blocks.y : 1),
               (blocks.z ? blocks.z : 1)};
-    } else {
+    }
+    else
+    {
       return blocks;
     }
   }
 
   RAJA_INLINE
-  hip_dim_t get_threads() const {
-    if (num_threads() != 0) {
-      return {(threads.x ? threads.x : 1),
-              (threads.y ? threads.y : 1),
+  hip_dim_t get_threads() const
+  {
+    if (num_threads() != 0)
+    {
+      return {(threads.x ? threads.x : 1), (threads.y ? threads.y : 1),
               (threads.z ? threads.z : 1)};
-    } else {
+    }
+    else
+    {
       return threads;
     }
   }
@@ -565,81 +599,69 @@ template<named_dim dim>
 struct HipDimHelper;
 
 template<>
-struct HipDimHelper<named_dim::x>{
+struct HipDimHelper<named_dim::x>
+{
 
   template<typename dim_t>
-  RAJA_HOST_DEVICE
-  inline static constexpr
-  hip_dim_member_t get(dim_t const &d)
+  RAJA_HOST_DEVICE inline static constexpr hip_dim_member_t get(dim_t const& d)
   {
     return d.x;
   }
 
   template<typename dim_t>
-  RAJA_HOST_DEVICE
-  inline static
-  void set(dim_t &d, hip_dim_member_t value)
+  RAJA_HOST_DEVICE inline static void set(dim_t& d, hip_dim_member_t value)
   {
     d.x = value;
   }
 };
 
 template<>
-struct HipDimHelper<named_dim::y>{
+struct HipDimHelper<named_dim::y>
+{
 
   template<typename dim_t>
-  RAJA_HOST_DEVICE
-  inline static constexpr
-  hip_dim_member_t get(dim_t const &d)
+  RAJA_HOST_DEVICE inline static constexpr hip_dim_member_t get(dim_t const& d)
   {
     return d.y;
   }
 
   template<typename dim_t>
-  RAJA_HOST_DEVICE
-  inline static
-  void set(dim_t &d, hip_dim_member_t value)
+  RAJA_HOST_DEVICE inline static void set(dim_t& d, hip_dim_member_t value)
   {
     d.y = value;
   }
 };
 
 template<>
-struct HipDimHelper<named_dim::z>{
+struct HipDimHelper<named_dim::z>
+{
 
   template<typename dim_t>
-  RAJA_HOST_DEVICE
-  inline static constexpr
-  hip_dim_member_t get(dim_t const &d)
+  RAJA_HOST_DEVICE inline static constexpr hip_dim_member_t get(dim_t const& d)
   {
     return d.z;
   }
 
   template<typename dim_t>
-  RAJA_HOST_DEVICE
-  inline static
-  void set(dim_t &d, hip_dim_member_t value)
+  RAJA_HOST_DEVICE inline static void set(dim_t& d, hip_dim_member_t value)
   {
     d.z = value;
   }
 };
 
 template<named_dim dim, typename dim_t>
-RAJA_HOST_DEVICE
-constexpr
-hip_dim_member_t get_hip_dim(dim_t const &d)
+RAJA_HOST_DEVICE constexpr hip_dim_member_t get_hip_dim(dim_t const& d)
 {
   return HipDimHelper<dim>::get(d);
 }
 
 template<named_dim dim, typename dim_t>
-RAJA_HOST_DEVICE
-void set_hip_dim(dim_t &d, hip_dim_member_t value)
+RAJA_HOST_DEVICE void set_hip_dim(dim_t& d, hip_dim_member_t value)
 {
   return HipDimHelper<dim>::set(d, value);
 }
 
-} // namespace internal
+}  // namespace internal
 
 namespace hip
 {
@@ -648,14 +670,14 @@ namespace hip
 struct IndexSize
 {
   hip_dim_member_t block_size = named_usage::unspecified;
-  hip_dim_member_t grid_size = named_usage::unspecified;
-
-  RAJA_HOST_DEVICE constexpr
-  IndexSize(hip_dim_member_t _block_size = named_usage::unspecified,
-            hip_dim_member_t _grid_size = named_usage::unspecified)
-    : block_size(_block_size)
-    , grid_size(_grid_size)
-  { }
+  hip_dim_member_t grid_size  = named_usage::unspecified;
+
+  RAJA_HOST_DEVICE constexpr IndexSize(
+      hip_dim_member_t _block_size = named_usage::unspecified,
+      hip_dim_member_t _grid_size  = named_usage::unspecified)
+      : block_size(_block_size),
+        grid_size(_grid_size)
+  {}
 };
 
 /// Type representing thread indexing within a grid
@@ -670,23 +692,25 @@ struct IndexGlobal
   static_assert(GRID_SIZE > 0, "grid size must not be negative");
 
   static constexpr int block_size = BLOCK_SIZE;
-  static constexpr int grid_size = GRID_SIZE;
+  static constexpr int grid_size  = GRID_SIZE;
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(threadIdx)) +
+    return static_cast<IdxT>(
+               ::RAJA::internal::HipDimHelper<dim>::get(threadIdx)) +
            static_cast<IdxT>(block_size) *
-           static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockIdx)) ;
+               static_cast<IdxT>(
+                   ::RAJA::internal::HipDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static constexpr IdxT size()
   {
-    return static_cast<IdxT>(block_size) *
-           static_cast<IdxT>(grid_size) ;
+    return static_cast<IdxT>(block_size) * static_cast<IdxT>(grid_size);
   }
 };
+
 /// with fixed block size of 1 and fixed grid size
 template<named_dim dim, int GRID_SIZE>
 struct IndexGlobal<dim, 1, GRID_SIZE>
@@ -694,20 +718,22 @@ struct IndexGlobal<dim, 1, GRID_SIZE>
   static_assert(GRID_SIZE > 0, "grid size must not be negative");
 
   static constexpr int block_size = 1;
-  static constexpr int grid_size = GRID_SIZE;
+  static constexpr int grid_size  = GRID_SIZE;
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::HipDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static constexpr IdxT size()
   {
-    return static_cast<IdxT>(grid_size) ;
+    return static_cast<IdxT>(grid_size);
   }
 };
+
 /// with fixed block size and fixed grid size of 1
 template<named_dim dim, int BLOCK_SIZE>
 struct IndexGlobal<dim, BLOCK_SIZE, 1>
@@ -715,37 +741,39 @@ struct IndexGlobal<dim, BLOCK_SIZE, 1>
   static_assert(BLOCK_SIZE > 0, "block size must not be negative");
 
   static constexpr int block_size = BLOCK_SIZE;
-  static constexpr int grid_size = 1;
+  static constexpr int grid_size  = 1;
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(threadIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::HipDimHelper<dim>::get(threadIdx));
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static constexpr IdxT size()
   {
-    return static_cast<IdxT>(block_size) ;
+    return static_cast<IdxT>(block_size);
   }
 };
+
 /// with fixed block size and fixed grid size of 1
 template<named_dim dim>
 struct IndexGlobal<dim, 1, 1>
 {
   static constexpr int block_size = 1;
-  static constexpr int grid_size = 1;
+  static constexpr int grid_size  = 1;
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(0) ;
+    return static_cast<IdxT>(0);
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(1) ;
+    return static_cast<IdxT>(1);
   }
 };
 
@@ -756,40 +784,47 @@ struct IndexGlobal<dim, named_usage::unspecified, GRID_SIZE>
   static_assert(GRID_SIZE > 0, "grid size must not be negative");
 
   static constexpr int block_size = named_usage::unspecified;
-  static constexpr int grid_size = GRID_SIZE;
+  static constexpr int grid_size  = GRID_SIZE;
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(threadIdx)) +
-           static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockDim)) *
-           static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockIdx)) ;
+    return static_cast<IdxT>(
+               ::RAJA::internal::HipDimHelper<dim>::get(threadIdx)) +
+           static_cast<IdxT>(
+               ::RAJA::internal::HipDimHelper<dim>::get(blockDim)) *
+               static_cast<IdxT>(
+                   ::RAJA::internal::HipDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockDim)) *
-           static_cast<IdxT>(grid_size) ;
+    return static_cast<IdxT>(
+               ::RAJA::internal::HipDimHelper<dim>::get(blockDim)) *
+           static_cast<IdxT>(grid_size);
   }
 };
+
 /// with dynamic block size and fixed grid size of 1
 template<named_dim dim>
 struct IndexGlobal<dim, named_usage::unspecified, 1>
 {
   static constexpr int block_size = named_usage::unspecified;
-  static constexpr int grid_size = 1;
+  static constexpr int grid_size  = 1;
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(threadIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::HipDimHelper<dim>::get(threadIdx));
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockDim)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::HipDimHelper<dim>::get(blockDim));
   }
 };
 
@@ -800,40 +835,44 @@ struct IndexGlobal<dim, BLOCK_SIZE, named_usage::unspecified>
   static_assert(BLOCK_SIZE > 0, "block size must not be negative");
 
   static constexpr int block_size = BLOCK_SIZE;
-  static constexpr int grid_size = named_usage::unspecified;
+  static constexpr int grid_size  = named_usage::unspecified;
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(threadIdx)) +
+    return static_cast<IdxT>(
+               ::RAJA::internal::HipDimHelper<dim>::get(threadIdx)) +
            static_cast<IdxT>(block_size) *
-           static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockIdx)) ;
+               static_cast<IdxT>(
+                   ::RAJA::internal::HipDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
     return static_cast<IdxT>(block_size) *
-           static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(gridDim)) ;
+           static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(gridDim));
   }
 };
+
 /// with fixed block size of 1 and dynamic grid size
 template<named_dim dim>
 struct IndexGlobal<dim, 1, named_usage::unspecified>
 {
   static constexpr int block_size = 1;
-  static constexpr int grid_size = named_usage::unspecified;
+  static constexpr int grid_size  = named_usage::unspecified;
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::HipDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(gridDim)) ;
+    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(gridDim));
   }
 };
 
@@ -842,21 +881,25 @@ template<named_dim dim>
 struct IndexGlobal<dim, named_usage::unspecified, named_usage::unspecified>
 {
   static constexpr int block_size = named_usage::unspecified;
-  static constexpr int grid_size = named_usage::unspecified;
+  static constexpr int grid_size  = named_usage::unspecified;
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(threadIdx)) +
-           static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockDim)) *
-           static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockIdx)) ;
+    return static_cast<IdxT>(
+               ::RAJA::internal::HipDimHelper<dim>::get(threadIdx)) +
+           static_cast<IdxT>(
+               ::RAJA::internal::HipDimHelper<dim>::get(blockDim)) *
+               static_cast<IdxT>(
+                   ::RAJA::internal::HipDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockDim)) *
-           static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(gridDim)) ;
+    return static_cast<IdxT>(
+               ::RAJA::internal::HipDimHelper<dim>::get(blockDim)) *
+           static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(gridDim));
   }
 };
 
@@ -868,56 +911,60 @@ struct IndexGlobal<dim, named_usage::ignored, GRID_SIZE>
   static_assert(GRID_SIZE > 0, "grid size must not be negative");
 
   static constexpr int block_size = named_usage::ignored;
-  static constexpr int grid_size = GRID_SIZE;
+  static constexpr int grid_size  = GRID_SIZE;
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::HipDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static constexpr IdxT size()
   {
-    return static_cast<IdxT>(grid_size) ;
+    return static_cast<IdxT>(grid_size);
   }
 };
+
 /// with fixed grid sized of 1
 template<named_dim dim>
 struct IndexGlobal<dim, named_usage::ignored, 1>
 {
   static constexpr int block_size = named_usage::ignored;
-  static constexpr int grid_size = 1;
+  static constexpr int grid_size  = 1;
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(0) ;
+    return static_cast<IdxT>(0);
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(1) ;
+    return static_cast<IdxT>(1);
   }
 };
+
 /// with dynamic grid size
 template<named_dim dim>
 struct IndexGlobal<dim, named_usage::ignored, named_usage::unspecified>
 {
   static constexpr int block_size = named_usage::ignored;
-  static constexpr int grid_size = named_usage::unspecified;
+  static constexpr int grid_size  = named_usage::unspecified;
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::HipDimHelper<dim>::get(blockIdx));
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(gridDim)) ;
+    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(gridDim));
   }
 };
 
@@ -929,56 +976,61 @@ struct IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>
   static_assert(BLOCK_SIZE > 0, "block size must not be negative");
 
   static constexpr int block_size = BLOCK_SIZE;
-  static constexpr int grid_size = named_usage::ignored;
+  static constexpr int grid_size  = named_usage::ignored;
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(threadIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::HipDimHelper<dim>::get(threadIdx));
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static constexpr IdxT size()
   {
-    return static_cast<IdxT>(block_size) ;
+    return static_cast<IdxT>(block_size);
   }
 };
+
 /// with fixed block size of 1
 template<named_dim dim>
 struct IndexGlobal<dim, 1, named_usage::ignored>
 {
   static constexpr int block_size = 1;
-  static constexpr int grid_size = named_usage::ignored;
+  static constexpr int grid_size  = named_usage::ignored;
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(0) ;
+    return static_cast<IdxT>(0);
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(1) ;
+    return static_cast<IdxT>(1);
   }
 };
+
 /// with dynamic block size
 template<named_dim dim>
 struct IndexGlobal<dim, named_usage::unspecified, named_usage::ignored>
 {
   static constexpr int block_size = named_usage::unspecified;
-  static constexpr int grid_size = named_usage::ignored;
+  static constexpr int grid_size  = named_usage::ignored;
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(threadIdx)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::HipDimHelper<dim>::get(threadIdx));
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(::RAJA::internal::HipDimHelper<dim>::get(blockDim)) ;
+    return static_cast<IdxT>(
+        ::RAJA::internal::HipDimHelper<dim>::get(blockDim));
   }
 };
 
@@ -988,18 +1040,18 @@ template<named_dim dim>
 struct IndexGlobal<dim, named_usage::ignored, named_usage::ignored>
 {
   static constexpr int block_size = named_usage::ignored;
-  static constexpr int grid_size = named_usage::ignored;
+  static constexpr int grid_size  = named_usage::ignored;
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
-    return static_cast<IdxT>(0) ;
+    return static_cast<IdxT>(0);
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return static_cast<IdxT>(1) ;
+    return static_cast<IdxT>(1);
   }
 };
 
@@ -1008,19 +1060,18 @@ template<typename x_index>
 struct IndexFlatten<x_index>
 {
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
 
     return x_index::template index<IdxT>();
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return  x_index::template size<IdxT>();
+    return x_index::template size<IdxT>();
   }
-
 };
 
 // useful for flatten global index (includes x,y)
@@ -1028,21 +1079,19 @@ template<typename x_index, typename y_index>
 struct IndexFlatten<x_index, y_index>
 {
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
 
     return x_index::template index<IdxT>() +
-      x_index::template size<IdxT>() * ( y_index::template index<IdxT>());
-
+           x_index::template size<IdxT>() * (y_index::template index<IdxT>());
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return  x_index::template size<IdxT>() * y_index::template size<IdxT> ();
+    return x_index::template size<IdxT>() * y_index::template size<IdxT>();
   }
-
 };
 
 // useful for flatten global index (includes x,y,z)
@@ -1050,49 +1099,52 @@ template<typename x_index, typename y_index, typename z_index>
 struct IndexFlatten<x_index, y_index, z_index>
 {
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
 
     return x_index::template index<IdxT>() +
-      x_index::template size<IdxT>() * ( y_index::template index<IdxT>() +
-                                         y_index::template size<IdxT>() * z_index::template index<IdxT>());
+           x_index::template size<IdxT>() *
+               (y_index::template index<IdxT>() +
+                y_index::template size<IdxT>() *
+                    z_index::template index<IdxT>());
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return  x_index::template size<IdxT>() * y_index::template size<IdxT> () * z_index::template size<IdxT> ();
+    return x_index::template size<IdxT>() * y_index::template size<IdxT>() *
+           z_index::template size<IdxT>();
   }
-
 };
 
 template<size_t divisor, typename indexer>
 struct IndexDivide
 {
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
     return indexer::template index<IdxT>() / static_cast<IdxT>(divisor);
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
-    return RAJA_DIVIDE_CEILING_INT(indexer::template size<IdxT>(), static_cast<IdxT>(divisor));
+    return RAJA_DIVIDE_CEILING_INT(indexer::template size<IdxT>(),
+                                   static_cast<IdxT>(divisor));
   }
 };
 
 template<size_t divisor, typename indexer>
 struct IndexModulo
 {
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT index()
   {
     return indexer::template index<IdxT>() % static_cast<IdxT>(divisor);
   }
 
-  template < typename IdxT = hip_dim_member_t >
+  template<typename IdxT = hip_dim_member_t>
   RAJA_DEVICE static inline IdxT size()
   {
     return static_cast<IdxT>(divisor);
@@ -1101,16 +1153,18 @@ struct IndexModulo
 
 
 // helper to get just the thread indexing part of IndexGlobal
-template < typename index_global >
+template<typename index_global>
 struct get_index_thread;
+
 ///
-template < named_dim dim, int BLOCK_SIZE, int GRID_SIZE >
+template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
 struct get_index_thread<IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>
 {
   using type = IndexGlobal<dim, BLOCK_SIZE, named_usage::ignored>;
 };
+
 ///
-template <typename x_index, typename y_index, typename z_index>
+template<typename x_index, typename y_index, typename z_index>
 struct get_index_thread<IndexFlatten<x_index, y_index, z_index>>
 {
   using type = IndexFlatten<typename get_index_thread<x_index>::type,
@@ -1119,16 +1173,18 @@ struct get_index_thread<IndexFlatten<x_index, y_index, z_index>>
 };
 
 // helper to get just the block indexing part of IndexGlobal
-template < typename index_global >
+template<typename index_global>
 struct get_index_block;
+
 ///
-template < named_dim dim, int BLOCK_SIZE, int GRID_SIZE >
+template<named_dim dim, int BLOCK_SIZE, int GRID_SIZE>
 struct get_index_block<IndexGlobal<dim, BLOCK_SIZE, GRID_SIZE>>
 {
   using type = IndexGlobal<dim, named_usage::ignored, GRID_SIZE>;
 };
+
 ///
-template <typename x_index, typename y_index, typename z_index>
+template<typename x_index, typename y_index, typename z_index>
 struct get_index_block<IndexFlatten<x_index, y_index, z_index>>
 {
   using type = IndexFlatten<typename get_index_block<x_index>::type,
@@ -1136,179 +1192,205 @@ struct get_index_block<IndexFlatten<x_index, y_index, z_index>>
                             typename get_index_block<z_index>::type>;
 };
 
-
-template <size_t BLOCK_SIZE=named_usage::unspecified>
+template<size_t BLOCK_SIZE = named_usage::unspecified>
 using thread_x = IndexGlobal<named_dim::x, BLOCK_SIZE, named_usage::ignored>;
-template <size_t BLOCK_SIZE=named_usage::unspecified>
+template<size_t BLOCK_SIZE = named_usage::unspecified>
 using thread_y = IndexGlobal<named_dim::y, BLOCK_SIZE, named_usage::ignored>;
-template <size_t BLOCK_SIZE=named_usage::unspecified>
+template<size_t BLOCK_SIZE = named_usage::unspecified>
 using thread_z = IndexGlobal<named_dim::z, BLOCK_SIZE, named_usage::ignored>;
 
-template <size_t BLOCK_SIZE_X=named_usage::unspecified,
-          size_t BLOCK_SIZE_Y=named_usage::unspecified,
-          size_t BLOCK_SIZE_Z=named_usage::unspecified>
+template<size_t BLOCK_SIZE_X = named_usage::unspecified,
+         size_t BLOCK_SIZE_Y = named_usage::unspecified,
+         size_t BLOCK_SIZE_Z = named_usage::unspecified>
 using thread_xyz = IndexFlatten<thread_x<BLOCK_SIZE_X>,
                                 thread_y<BLOCK_SIZE_Y>,
                                 thread_z<BLOCK_SIZE_Z>>;
 
-template <size_t GRID_SIZE=named_usage::unspecified>
+template<size_t GRID_SIZE = named_usage::unspecified>
 using block_x = IndexGlobal<named_dim::x, named_usage::ignored, GRID_SIZE>;
-template <size_t GRID_SIZE=named_usage::unspecified>
+template<size_t GRID_SIZE = named_usage::unspecified>
 using block_y = IndexGlobal<named_dim::y, named_usage::ignored, GRID_SIZE>;
-template <size_t GRID_SIZE=named_usage::unspecified>
+template<size_t GRID_SIZE = named_usage::unspecified>
 using block_z = IndexGlobal<named_dim::z, named_usage::ignored, GRID_SIZE>;
 
-template <size_t GRID_SIZE_X=named_usage::unspecified,
-          size_t GRID_SIZE_Y=named_usage::unspecified,
-          size_t GRID_SIZE_Z=named_usage::unspecified>
+template<size_t GRID_SIZE_X = named_usage::unspecified,
+         size_t GRID_SIZE_Y = named_usage::unspecified,
+         size_t GRID_SIZE_Z = named_usage::unspecified>
 using block_xyz = IndexFlatten<block_x<GRID_SIZE_X>,
                                block_y<GRID_SIZE_Y>,
                                block_z<GRID_SIZE_Z>>;
 
-template <size_t BLOCK_SIZE, size_t GRID_SIZE=named_usage::unspecified>
+template<size_t BLOCK_SIZE, size_t GRID_SIZE = named_usage::unspecified>
 using global_x = IndexGlobal<named_dim::x, BLOCK_SIZE, GRID_SIZE>;
-template <size_t BLOCK_SIZE, size_t GRID_SIZE=named_usage::unspecified>
+template<size_t BLOCK_SIZE, size_t GRID_SIZE = named_usage::unspecified>
 using global_y = IndexGlobal<named_dim::y, BLOCK_SIZE, GRID_SIZE>;
-template <size_t BLOCK_SIZE, size_t GRID_SIZE=named_usage::unspecified>
+template<size_t BLOCK_SIZE, size_t GRID_SIZE = named_usage::unspecified>
 using global_z = IndexGlobal<named_dim::z, BLOCK_SIZE, GRID_SIZE>;
 
 
-template <size_t BLOCK_SIZE_X,
-          size_t BLOCK_SIZE_Y,
-          size_t BLOCK_SIZE_Z,
-          size_t GRID_SIZE_X=named_usage::unspecified,
-          size_t GRID_SIZE_Y=named_usage::unspecified,
-          size_t GRID_SIZE_Z=named_usage::unspecified>
+template<size_t BLOCK_SIZE_X,
+         size_t BLOCK_SIZE_Y,
+         size_t BLOCK_SIZE_Z,
+         size_t GRID_SIZE_X = named_usage::unspecified,
+         size_t GRID_SIZE_Y = named_usage::unspecified,
+         size_t GRID_SIZE_Z = named_usage::unspecified>
 using global_xyz = IndexFlatten<global_x<BLOCK_SIZE_X, GRID_SIZE_X>,
                                 global_y<BLOCK_SIZE_Y, GRID_SIZE_Y>,
                                 global_z<BLOCK_SIZE_Z, GRID_SIZE_Z>>;
 
 
-template <size_t WARP_SIZE=RAJA::policy::hip::device_constants.WARP_SIZE,
-          size_t BLOCK_SIZE_X=named_usage::unspecified,
-          size_t BLOCK_SIZE_Y=named_usage::unspecified,
-          size_t BLOCK_SIZE_Z=named_usage::unspecified>
-using warp_xyz = IndexDivide<WARP_SIZE,
-                             thread_xyz<BLOCK_SIZE_X,
-                                        BLOCK_SIZE_Y,
-                                        BLOCK_SIZE_Z>>;
-
-template <size_t WARP_SIZE=RAJA::policy::hip::device_constants.WARP_SIZE,
-          size_t BLOCK_SIZE_X=named_usage::unspecified,
-          size_t BLOCK_SIZE_Y=named_usage::unspecified,
-          size_t BLOCK_SIZE_Z=named_usage::unspecified,
-          size_t GRID_SIZE_X=named_usage::unspecified,
-          size_t GRID_SIZE_Y=named_usage::unspecified,
-          size_t GRID_SIZE_Z=named_usage::unspecified>
-using warp_global_xyz = IndexFlatten<warp_xyz<WARP_SIZE,
-                                              BLOCK_SIZE_X,
-                                              BLOCK_SIZE_Y,
-                                              BLOCK_SIZE_Z>,
-                                     block_xyz<GRID_SIZE_X,
-                                               GRID_SIZE_Y,
-                                               GRID_SIZE_Z>>;
-
-} // namespace hip
+template<size_t WARP_SIZE    = RAJA::policy::hip::device_constants.WARP_SIZE,
+         size_t BLOCK_SIZE_X = named_usage::unspecified,
+         size_t BLOCK_SIZE_Y = named_usage::unspecified,
+         size_t BLOCK_SIZE_Z = named_usage::unspecified>
+using warp_xyz =
+    IndexDivide<WARP_SIZE,
+                thread_xyz<BLOCK_SIZE_X, BLOCK_SIZE_Y, BLOCK_SIZE_Z>>;
+
+template<size_t WARP_SIZE    = RAJA::policy::hip::device_constants.WARP_SIZE,
+         size_t BLOCK_SIZE_X = named_usage::unspecified,
+         size_t BLOCK_SIZE_Y = named_usage::unspecified,
+         size_t BLOCK_SIZE_Z = named_usage::unspecified,
+         size_t GRID_SIZE_X  = named_usage::unspecified,
+         size_t GRID_SIZE_Y  = named_usage::unspecified,
+         size_t GRID_SIZE_Z  = named_usage::unspecified>
+using warp_global_xyz =
+    IndexFlatten<warp_xyz<WARP_SIZE, BLOCK_SIZE_X, BLOCK_SIZE_Y, BLOCK_SIZE_Z>,
+                 block_xyz<GRID_SIZE_X, GRID_SIZE_Y, GRID_SIZE_Z>>;
+
+}  // namespace hip
 
-// contretizers used in forall, scan, and sort policies
+// concretizers used in forall, scan, and sort policies
 
-using HipAvoidDeviceMaxThreadOccupancyConcretizer = hip::AvoidDeviceMaxThreadOccupancyConcretizer<hip::FractionOffsetOccupancyConcretizer<Fraction<size_t, 1, 1>, -1>>;
+using HipAvoidDeviceMaxThreadOccupancyConcretizer =
+    hip::AvoidDeviceMaxThreadOccupancyConcretizer<
+        hip::FractionOffsetOccupancyConcretizer<Fraction<size_t, 1, 1>, -1>>;
 
-template < typename Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET >
-using HipFractionOffsetOccupancyConcretizer = hip::FractionOffsetOccupancyConcretizer<Fraction, BLOCKS_PER_SM_OFFSET>;
+template<typename Fraction, std::ptrdiff_t BLOCKS_PER_SM_OFFSET>
+using HipFractionOffsetOccupancyConcretizer =
+    hip::FractionOffsetOccupancyConcretizer<Fraction, BLOCKS_PER_SM_OFFSET>;
 
 using HipMaxOccupancyConcretizer = hip::MaxOccupancyConcretizer;
 
-using HipReduceDefaultConcretizer = HipFractionOffsetOccupancyConcretizer<Fraction<size_t, 1, 2>, 0>;
+using HipReduceDefaultConcretizer =
+    HipFractionOffsetOccupancyConcretizer<Fraction<size_t, 1, 2>, 0>;
 
 using HipDefaultConcretizer = HipAvoidDeviceMaxThreadOccupancyConcretizer;
 
 // policies usable with forall, scan, and sort
 
-template <size_t BLOCK_SIZE, size_t GRID_SIZE, bool Async = false>
+template<size_t BLOCK_SIZE, size_t GRID_SIZE, bool Async = false>
 using hip_exec_grid = policy::hip::hip_exec<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, hip::global_x<BLOCK_SIZE, GRID_SIZE>,
-    HipDefaultConcretizer, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    hip::global_x<BLOCK_SIZE, GRID_SIZE>,
+    HipDefaultConcretizer,
+    Async>;
 
-template <size_t BLOCK_SIZE, size_t GRID_SIZE>
+template<size_t BLOCK_SIZE, size_t GRID_SIZE>
 using hip_exec_grid_async = policy::hip::hip_exec<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, hip::global_x<BLOCK_SIZE, GRID_SIZE>,
-    HipDefaultConcretizer, true>;
-
-template <size_t BLOCK_SIZE, bool Async = false>
-using hip_exec = policy::hip::hip_exec<
-    iteration_mapping::Direct, hip::global_x<BLOCK_SIZE>,
-    HipDefaultConcretizer, Async>;
-
-template <size_t BLOCK_SIZE>
-using hip_exec_async = policy::hip::hip_exec<
-    iteration_mapping::Direct, hip::global_x<BLOCK_SIZE>,
-    HipDefaultConcretizer, true>;
-
-template <size_t BLOCK_SIZE, bool Async = false>
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    hip::global_x<BLOCK_SIZE, GRID_SIZE>,
+    HipDefaultConcretizer,
+    true>;
+
+template<size_t BLOCK_SIZE, bool Async = false>
+using hip_exec = policy::hip::hip_exec<iteration_mapping::Direct,
+                                       hip::global_x<BLOCK_SIZE>,
+                                       HipDefaultConcretizer,
+                                       Async>;
+
+template<size_t BLOCK_SIZE>
+using hip_exec_async = policy::hip::hip_exec<iteration_mapping::Direct,
+                                             hip::global_x<BLOCK_SIZE>,
+                                             HipDefaultConcretizer,
+                                             true>;
+
+template<size_t BLOCK_SIZE, bool Async = false>
 using hip_exec_occ_calc = policy::hip::hip_exec<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, hip::global_x<BLOCK_SIZE>,
-    HipDefaultConcretizer, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    hip::global_x<BLOCK_SIZE>,
+    HipDefaultConcretizer,
+    Async>;
 
-template <size_t BLOCK_SIZE>
+template<size_t BLOCK_SIZE>
 using hip_exec_occ_calc_async = policy::hip::hip_exec<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, hip::global_x<BLOCK_SIZE>,
-    HipDefaultConcretizer, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    hip::global_x<BLOCK_SIZE>,
+    HipDefaultConcretizer,
+    true>;
 
-template <size_t BLOCK_SIZE, bool Async = false>
+template<size_t BLOCK_SIZE, bool Async = false>
 using hip_exec_occ_max = policy::hip::hip_exec<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, hip::global_x<BLOCK_SIZE>,
-    HipMaxOccupancyConcretizer, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    hip::global_x<BLOCK_SIZE>,
+    HipMaxOccupancyConcretizer,
+    Async>;
 
-template <size_t BLOCK_SIZE>
+template<size_t BLOCK_SIZE>
 using hip_exec_occ_max_async = policy::hip::hip_exec<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, hip::global_x<BLOCK_SIZE>,
-    HipMaxOccupancyConcretizer, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    hip::global_x<BLOCK_SIZE>,
+    HipMaxOccupancyConcretizer,
+    true>;
 
-template <size_t BLOCK_SIZE, typename Fraction, bool Async = false>
+template<size_t BLOCK_SIZE, typename Fraction, bool Async = false>
 using hip_exec_occ_fraction = policy::hip::hip_exec<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, hip::global_x<BLOCK_SIZE>,
-    HipFractionOffsetOccupancyConcretizer<Fraction, 0>, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    hip::global_x<BLOCK_SIZE>,
+    HipFractionOffsetOccupancyConcretizer<Fraction, 0>,
+    Async>;
 
-template <size_t BLOCK_SIZE, typename Fraction>
+template<size_t BLOCK_SIZE, typename Fraction>
 using hip_exec_occ_fraction_async = policy::hip::hip_exec<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, hip::global_x<BLOCK_SIZE>,
-    HipFractionOffsetOccupancyConcretizer<Fraction, 0>, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    hip::global_x<BLOCK_SIZE>,
+    HipFractionOffsetOccupancyConcretizer<Fraction, 0>,
+    true>;
 
-template <size_t BLOCK_SIZE, typename Concretizer, bool Async = false>
+template<size_t BLOCK_SIZE, typename Concretizer, bool Async = false>
 using hip_exec_occ_custom = policy::hip::hip_exec<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, hip::global_x<BLOCK_SIZE>,
-    Concretizer, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    hip::global_x<BLOCK_SIZE>,
+    Concretizer,
+    Async>;
 
-template <size_t BLOCK_SIZE, typename Concretizer>
+template<size_t BLOCK_SIZE, typename Concretizer>
 using hip_exec_occ_custom_async = policy::hip::hip_exec<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, hip::global_x<BLOCK_SIZE>,
-    Concretizer, true>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    hip::global_x<BLOCK_SIZE>,
+    Concretizer,
+    true>;
 
-template <size_t BLOCK_SIZE, bool Async = false>
+template<size_t BLOCK_SIZE, bool Async = false>
 using hip_exec_with_reduce = policy::hip::hip_exec<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, hip::global_x<BLOCK_SIZE>,
-    HipReduceDefaultConcretizer, Async>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    hip::global_x<BLOCK_SIZE>,
+    HipReduceDefaultConcretizer,
+    Async>;
 
-template <size_t BLOCK_SIZE>
+template<size_t BLOCK_SIZE>
 using hip_exec_with_reduce_async = policy::hip::hip_exec<
-    iteration_mapping::StridedLoop<named_usage::unspecified>, hip::global_x<BLOCK_SIZE>,
-    HipReduceDefaultConcretizer, true>;
-
-template <bool with_reduce, size_t BLOCK_SIZE, bool Async = false>
-using hip_exec_base = std::conditional_t<with_reduce,
-    hip_exec_with_reduce<BLOCK_SIZE, Async>,
-    hip_exec<BLOCK_SIZE, Async>>;
-
-template <bool with_reduce, size_t BLOCK_SIZE>
-using hip_exec_base_async = std::conditional_t<with_reduce,
-    hip_exec_with_reduce_async<BLOCK_SIZE>,
-    hip_exec_async<BLOCK_SIZE>>;
+    iteration_mapping::StridedLoop<named_usage::unspecified>,
+    hip::global_x<BLOCK_SIZE>,
+    HipReduceDefaultConcretizer,
+    true>;
+
+template<bool with_reduce, size_t BLOCK_SIZE, bool Async = false>
+using hip_exec_base =
+    std::conditional_t<with_reduce,
+                       hip_exec_with_reduce<BLOCK_SIZE, Async>,
+                       hip_exec<BLOCK_SIZE, Async>>;
+
+template<bool with_reduce, size_t BLOCK_SIZE>
+using hip_exec_base_async =
+    std::conditional_t<with_reduce,
+                       hip_exec_with_reduce_async<BLOCK_SIZE>,
+                       hip_exec_async<BLOCK_SIZE>>;
 
 // policies usable with WorkGroup
 using policy::hip::hip_work;
 
-template <size_t BLOCK_SIZE>
+template<size_t BLOCK_SIZE>
 using hip_work_async = policy::hip::hip_work<BLOCK_SIZE, true>;
 
 using policy::hip::unordered_hip_loop_y_block_iter_x_threadblock_average;
@@ -1319,10 +1401,10 @@ using policy::hip::hip_atomic_explicit;
 
 
 // policies usable with reducers
-template < hip::reduce_algorithm algorithm,
-           hip::block_communication_mode comm_mode,
-           size_t replication = named_usage::unspecified,
-           size_t atomic_stride = named_usage::unspecified >
+template<hip::reduce_algorithm algorithm,
+         hip::block_communication_mode comm_mode,
+         size_t replication   = named_usage::unspecified,
+         size_t atomic_stride = named_usage::unspecified>
 using hip_reduce_tuning = policy::hip::hip_reduce_policy<
     hip::ReduceTuning<algorithm, comm_mode, replication, atomic_stride>>;
 
@@ -1345,35 +1427,41 @@ using hip_reduce_tuning = policy::hip::hip_reduce_policy<
 //                 a cache shared by the whole device to avoid having to use
 //                 device scope fences. This improves performance on some HW but
 //                 is more difficult to code correctly.
-using hip_reduce_device_fence = hip_reduce_tuning<
-    hip::reduce_algorithm::combine_last_block,
-    hip::block_communication_mode::device_fence,
-    named_usage::unspecified, named_usage::unspecified>;
+using hip_reduce_device_fence =
+    hip_reduce_tuning<hip::reduce_algorithm::combine_last_block,
+                      hip::block_communication_mode::device_fence,
+                      named_usage::unspecified,
+                      named_usage::unspecified>;
 ///
-using hip_reduce_block_fence = hip_reduce_tuning<
-    hip::reduce_algorithm::combine_last_block,
-    hip::block_communication_mode::block_fence,
-    named_usage::unspecified, named_usage::unspecified>;
+using hip_reduce_block_fence =
+    hip_reduce_tuning<hip::reduce_algorithm::combine_last_block,
+                      hip::block_communication_mode::block_fence,
+                      named_usage::unspecified,
+                      named_usage::unspecified>;
 ///
-using hip_reduce_atomic_device_init_device_fence = hip_reduce_tuning<
-    hip::reduce_algorithm::init_device_combine_atomic_block,
-    hip::block_communication_mode::device_fence,
-    named_usage::unspecified, named_usage::unspecified>;
+using hip_reduce_atomic_device_init_device_fence =
+    hip_reduce_tuning<hip::reduce_algorithm::init_device_combine_atomic_block,
+                      hip::block_communication_mode::device_fence,
+                      named_usage::unspecified,
+                      named_usage::unspecified>;
 ///
-using hip_reduce_atomic_device_init_block_fence = hip_reduce_tuning<
-    hip::reduce_algorithm::init_device_combine_atomic_block,
-    hip::block_communication_mode::block_fence,
-    named_usage::unspecified, named_usage::unspecified>;
+using hip_reduce_atomic_device_init_block_fence =
+    hip_reduce_tuning<hip::reduce_algorithm::init_device_combine_atomic_block,
+                      hip::block_communication_mode::block_fence,
+                      named_usage::unspecified,
+                      named_usage::unspecified>;
 ///
-using hip_reduce_atomic_host_init_device_fence = hip_reduce_tuning<
-    hip::reduce_algorithm::init_host_combine_atomic_block,
-    hip::block_communication_mode::device_fence,
-    named_usage::unspecified, named_usage::unspecified>;
+using hip_reduce_atomic_host_init_device_fence =
+    hip_reduce_tuning<hip::reduce_algorithm::init_host_combine_atomic_block,
+                      hip::block_communication_mode::device_fence,
+                      named_usage::unspecified,
+                      named_usage::unspecified>;
 ///
-using hip_reduce_atomic_host_init_block_fence = hip_reduce_tuning<
-    hip::reduce_algorithm::init_host_combine_atomic_block,
-    hip::block_communication_mode::block_fence,
-    named_usage::unspecified, named_usage::unspecified>;
+using hip_reduce_atomic_host_init_block_fence =
+    hip_reduce_tuning<hip::reduce_algorithm::init_host_combine_atomic_block,
+                      hip::block_communication_mode::block_fence,
+                      named_usage::unspecified,
+                      named_usage::unspecified>;
 
 // Policy for RAJA::Reduce* objects that gives the same answer every time when
 // used in the same way
@@ -1385,25 +1473,26 @@ using hip_reduce_atomic = hip_reduce_atomic_host_init_block_fence;
 
 // Policy for RAJA::Reduce* objects that lets you select the default atomic or
 // non-atomic policy with a bool
-template < bool with_atomic >
-using hip_reduce_base = std::conditional_t<with_atomic, hip_reduce_atomic, hip_reduce>;
+template<bool with_atomic>
+using hip_reduce_base =
+    std::conditional_t<with_atomic, hip_reduce_atomic, hip_reduce>;
 
 
 // policies usable with multi_reducers
-template < hip::multi_reduce_algorithm algorithm,
-           typename SharedAtomicReplicationConcretizer,
-           typename SharedAtomicReplicationIndexer,
-           typename GlobalAtomicReplicationConcretizer,
-           typename GlobalAtomicReplicationIndexer >
-using hip_multi_reduce_tuning = policy::hip::hip_multi_reduce_policy<
-    hip::MultiReduceTuning<
-      algorithm,
-      hip::AtomicReplicationTuning<SharedAtomicReplicationConcretizer,
-                                    SharedAtomicReplicationIndexer,
-                                    GetOffsetRight<int>>,
-      hip::AtomicReplicationTuning<GlobalAtomicReplicationConcretizer,
-                                    GlobalAtomicReplicationIndexer,
-                                    GetOffsetLeft<int>>>>;
+template<hip::multi_reduce_algorithm algorithm,
+         typename SharedAtomicReplicationConcretizer,
+         typename SharedAtomicReplicationIndexer,
+         typename GlobalAtomicReplicationConcretizer,
+         typename GlobalAtomicReplicationIndexer>
+using hip_multi_reduce_tuning =
+    policy::hip::hip_multi_reduce_policy<hip::MultiReduceTuning<
+        algorithm,
+        hip::AtomicReplicationTuning<SharedAtomicReplicationConcretizer,
+                                     SharedAtomicReplicationIndexer,
+                                     GetOffsetRight<int>>,
+        hip::AtomicReplicationTuning<GlobalAtomicReplicationConcretizer,
+                                     GlobalAtomicReplicationIndexer,
+                                     GetOffsetLeft<int>>>>;
 
 // Policies for RAJA::MultiReduce* objects with specific behaviors.
 // - *atomic* policies may use atomics to combine partial results. The
@@ -1416,44 +1505,51 @@ using hip_multi_reduce_tuning = policy::hip::hip_multi_reduce_policy<
 // - *host_init* policies initialize memory used with atomics on the host.
 //   This is faster overall than other policies on HW with direct host access
 //   to device memory such as the AMD MI300A El Capitan/Tuolumne systems.
-using hip_multi_reduce_atomic_block_then_atomic_grid_host_init = hip_multi_reduce_tuning<
-    hip::multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic,
-    hip::SharedAtomicReplicationMaxPow2Concretizer<
-        hip::ConstantPreferredReplicationConcretizer<4>>,
-    hip::thread_xyz<>,
-    hip::GlobalAtomicReplicationMinPow2Concretizer<
-        hip::ConstantPreferredReplicationConcretizer<32>>,
-    hip::warp_global_xyz<>>;
+using hip_multi_reduce_atomic_block_then_atomic_grid_host_init =
+    hip_multi_reduce_tuning<
+        hip::multi_reduce_algorithm::
+            init_host_combine_block_atomic_then_grid_atomic,
+        hip::SharedAtomicReplicationMaxPow2Concretizer<
+            hip::ConstantPreferredReplicationConcretizer<4>>,
+        hip::thread_xyz<>,
+        hip::GlobalAtomicReplicationMinPow2Concretizer<
+            hip::ConstantPreferredReplicationConcretizer<32>>,
+        hip::warp_global_xyz<>>;
 // special policy to test that multi-reducers work if there is not enough shmem
-using hip_multi_reduce_atomic_block_then_atomic_grid_host_init_fallback_testing = hip_multi_reduce_tuning<
-    hip::multi_reduce_algorithm::init_host_combine_block_atomic_then_grid_atomic,
-    hip::SharedAtomicReplicationMaxPow2Concretizer<
-        hip::ConstantPreferredReplicationConcretizer<0>>,
-    hip::thread_xyz<>,
-    hip::GlobalAtomicReplicationMinPow2Concretizer<
-        hip::ConstantPreferredReplicationConcretizer<32>>,
-    hip::warp_global_xyz<>>;
+using hip_multi_reduce_atomic_block_then_atomic_grid_host_init_fallback_testing =
+    hip_multi_reduce_tuning<
+        hip::multi_reduce_algorithm::
+            init_host_combine_block_atomic_then_grid_atomic,
+        hip::SharedAtomicReplicationMaxPow2Concretizer<
+            hip::ConstantPreferredReplicationConcretizer<0>>,
+        hip::thread_xyz<>,
+        hip::GlobalAtomicReplicationMinPow2Concretizer<
+            hip::ConstantPreferredReplicationConcretizer<32>>,
+        hip::warp_global_xyz<>>;
 //
 using hip_multi_reduce_atomic_global_host_init = hip_multi_reduce_tuning<
     hip::multi_reduce_algorithm::init_host_combine_global_atomic,
-    void, // unused with this algorithm
-    void, // unused with this algorithm
+    void,  // unused with this algorithm
+    void,  // unused with this algorithm
     hip::GlobalAtomicReplicationMinPow2Concretizer<
         hip::ConstantPreferredReplicationConcretizer<32>>,
     hip::warp_global_xyz<>>;
 //
-using hip_multi_reduce_atomic_global_no_replication_host_init = hip_multi_reduce_tuning<
-    hip::multi_reduce_algorithm::init_host_combine_global_atomic,
-    void, // unused with this algorithm
-    void, // unused with this algorithm
-    hip::GlobalAtomicReplicationMinPow2Concretizer<
-        hip::ConstantPreferredReplicationConcretizer<1>>,
-    hip::block_xyz<>>;
-
-// Policy for RAJA::MultiReduce* objects that may use atomics and may not give the
-// same answer every time when used in the same way
-using hip_multi_reduce_atomic = hip_multi_reduce_atomic_block_then_atomic_grid_host_init;
-// Similar to above but optimized for low overhead in cases where it is rarely used
+using hip_multi_reduce_atomic_global_no_replication_host_init =
+    hip_multi_reduce_tuning<
+        hip::multi_reduce_algorithm::init_host_combine_global_atomic,
+        void,  // unused with this algorithm
+        void,  // unused with this algorithm
+        hip::GlobalAtomicReplicationMinPow2Concretizer<
+            hip::ConstantPreferredReplicationConcretizer<1>>,
+        hip::block_xyz<>>;
+
+// Policy for RAJA::MultiReduce* objects that may use atomics and may not give
+// the same answer every time when used in the same way
+using hip_multi_reduce_atomic =
+    hip_multi_reduce_atomic_block_then_atomic_grid_host_init;
+// Similar to above but optimized for low overhead in cases where it is rarely
+// used
 using hip_multi_reduce_atomic_low_performance_low_overhead =
     hip_multi_reduce_atomic_global_no_replication_host_init;
 
@@ -1485,31 +1581,31 @@ using policy::hip::hip_launch_t;
 
 
 // policies usable with kernel and launch
-template < typename ... indexers >
-using hip_indexer_direct = policy::hip::hip_indexer<
-    iteration_mapping::Direct,
-    kernel_sync_requirement::none,
-    indexers...>;
+template<typename... indexers>
+using hip_indexer_direct =
+    policy::hip::hip_indexer<iteration_mapping::Direct,
+                             kernel_sync_requirement::none,
+                             indexers...>;
 
-template < typename ... indexers >
+template<typename... indexers>
 using hip_indexer_loop = policy::hip::hip_indexer<
     iteration_mapping::StridedLoop<named_usage::unspecified>,
     kernel_sync_requirement::none,
     indexers...>;
 
-template < typename ... indexers >
+template<typename... indexers>
 using hip_indexer_syncable_loop = policy::hip::hip_indexer<
     iteration_mapping::StridedLoop<named_usage::unspecified>,
     kernel_sync_requirement::sync,
     indexers...>;
 
-template < typename ... indexers >
-using hip_flatten_indexer_direct = policy::hip::hip_flatten_indexer<
-    iteration_mapping::Direct,
-    kernel_sync_requirement::none,
-    indexers...>;
+template<typename... indexers>
+using hip_flatten_indexer_direct =
+    policy::hip::hip_flatten_indexer<iteration_mapping::Direct,
+                                     kernel_sync_requirement::none,
+                                     indexers...>;
 
-template < typename ... indexers >
+template<typename... indexers>
 using hip_flatten_indexer_loop = policy::hip::hip_flatten_indexer<
     iteration_mapping::StridedLoop<named_usage::unspecified>,
     kernel_sync_requirement::none,
@@ -1522,7 +1618,7 @@ using hip_flatten_indexer_loop = policy::hip::hip_flatten_indexer<
  * For example, a segment of size 2000 will not fit, and trigger a runtime
  * error.
  */
-template < named_dim ... dims >
+template<named_dim... dims>
 using hip_thread_direct = hip_indexer_direct<
     hip::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
 
@@ -1537,22 +1633,28 @@ using hip_thread_yz_direct = hip_thread_direct<named_dim::y, named_dim::z>;
 using hip_thread_zx_direct = hip_thread_direct<named_dim::z, named_dim::x>;
 using hip_thread_zy_direct = hip_thread_direct<named_dim::z, named_dim::y>;
 
-using hip_thread_xyz_direct = hip_thread_direct<named_dim::x, named_dim::y, named_dim::z>;
-using hip_thread_xzy_direct = hip_thread_direct<named_dim::x, named_dim::z, named_dim::y>;
-using hip_thread_yxz_direct = hip_thread_direct<named_dim::y, named_dim::x, named_dim::z>;
-using hip_thread_yzx_direct = hip_thread_direct<named_dim::y, named_dim::z, named_dim::x>;
-using hip_thread_zxy_direct = hip_thread_direct<named_dim::z, named_dim::x, named_dim::y>;
-using hip_thread_zyx_direct = hip_thread_direct<named_dim::z, named_dim::y, named_dim::x>;
+using hip_thread_xyz_direct =
+    hip_thread_direct<named_dim::x, named_dim::y, named_dim::z>;
+using hip_thread_xzy_direct =
+    hip_thread_direct<named_dim::x, named_dim::z, named_dim::y>;
+using hip_thread_yxz_direct =
+    hip_thread_direct<named_dim::y, named_dim::x, named_dim::z>;
+using hip_thread_yzx_direct =
+    hip_thread_direct<named_dim::y, named_dim::z, named_dim::x>;
+using hip_thread_zxy_direct =
+    hip_thread_direct<named_dim::z, named_dim::x, named_dim::y>;
+using hip_thread_zyx_direct =
+    hip_thread_direct<named_dim::z, named_dim::y, named_dim::x>;
 
 /*!
  * Maps segment indices to HIP threads.
  * Uses block-stride looping to exceed the maximum number of physical threads
  */
-template < named_dim ... dims >
+template<named_dim... dims>
 using hip_thread_loop = hip_indexer_loop<
     hip::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
 
-template < named_dim ... dims >
+template<named_dim... dims>
 using hip_thread_syncable_loop = hip_indexer_syncable_loop<
     hip::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
 
@@ -1567,12 +1669,18 @@ using hip_thread_yz_loop = hip_thread_loop<named_dim::y, named_dim::z>;
 using hip_thread_zx_loop = hip_thread_loop<named_dim::z, named_dim::x>;
 using hip_thread_zy_loop = hip_thread_loop<named_dim::z, named_dim::y>;
 
-using hip_thread_xyz_loop = hip_thread_loop<named_dim::x, named_dim::y, named_dim::z>;
-using hip_thread_xzy_loop = hip_thread_loop<named_dim::x, named_dim::z, named_dim::y>;
-using hip_thread_yxz_loop = hip_thread_loop<named_dim::y, named_dim::x, named_dim::z>;
-using hip_thread_yzx_loop = hip_thread_loop<named_dim::y, named_dim::z, named_dim::x>;
-using hip_thread_zxy_loop = hip_thread_loop<named_dim::z, named_dim::x, named_dim::y>;
-using hip_thread_zyx_loop = hip_thread_loop<named_dim::z, named_dim::y, named_dim::x>;
+using hip_thread_xyz_loop =
+    hip_thread_loop<named_dim::x, named_dim::y, named_dim::z>;
+using hip_thread_xzy_loop =
+    hip_thread_loop<named_dim::x, named_dim::z, named_dim::y>;
+using hip_thread_yxz_loop =
+    hip_thread_loop<named_dim::y, named_dim::x, named_dim::z>;
+using hip_thread_yzx_loop =
+    hip_thread_loop<named_dim::y, named_dim::z, named_dim::x>;
+using hip_thread_zxy_loop =
+    hip_thread_loop<named_dim::z, named_dim::x, named_dim::y>;
+using hip_thread_zyx_loop =
+    hip_thread_loop<named_dim::z, named_dim::y, named_dim::x>;
 
 /*
  * Maps segment indices to flattened HIP threads.
@@ -1580,7 +1688,7 @@ using hip_thread_zyx_loop = hip_thread_loop<named_dim::z, named_dim::y, named_di
  * physical threads to fit all of the direct map requests.
  * Reshapes multiple physical threads into a 1D iteration space
  */
-template < named_dim ... dims >
+template<named_dim... dims>
 using hip_flatten_thread_direct = hip_flatten_indexer_direct<
     hip::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
 
@@ -1588,26 +1696,38 @@ using hip_flatten_thread_x_direct = hip_flatten_thread_direct<named_dim::x>;
 using hip_flatten_thread_y_direct = hip_flatten_thread_direct<named_dim::y>;
 using hip_flatten_thread_z_direct = hip_flatten_thread_direct<named_dim::z>;
 
-using hip_flatten_thread_xy_direct = hip_flatten_thread_direct<named_dim::x, named_dim::y>;
-using hip_flatten_thread_xz_direct = hip_flatten_thread_direct<named_dim::x, named_dim::z>;
-using hip_flatten_thread_yx_direct = hip_flatten_thread_direct<named_dim::y, named_dim::x>;
-using hip_flatten_thread_yz_direct = hip_flatten_thread_direct<named_dim::y, named_dim::z>;
-using hip_flatten_thread_zx_direct = hip_flatten_thread_direct<named_dim::z, named_dim::x>;
-using hip_flatten_thread_zy_direct = hip_flatten_thread_direct<named_dim::z, named_dim::y>;
-
-using hip_flatten_thread_xyz_direct = hip_flatten_thread_direct<named_dim::x, named_dim::y, named_dim::z>;
-using hip_flatten_thread_xzy_direct = hip_flatten_thread_direct<named_dim::x, named_dim::z, named_dim::y>;
-using hip_flatten_thread_yxz_direct = hip_flatten_thread_direct<named_dim::y, named_dim::x, named_dim::z>;
-using hip_flatten_thread_yzx_direct = hip_flatten_thread_direct<named_dim::y, named_dim::z, named_dim::x>;
-using hip_flatten_thread_zxy_direct = hip_flatten_thread_direct<named_dim::z, named_dim::x, named_dim::y>;
-using hip_flatten_thread_zyx_direct = hip_flatten_thread_direct<named_dim::z, named_dim::y, named_dim::x>;
+using hip_flatten_thread_xy_direct =
+    hip_flatten_thread_direct<named_dim::x, named_dim::y>;
+using hip_flatten_thread_xz_direct =
+    hip_flatten_thread_direct<named_dim::x, named_dim::z>;
+using hip_flatten_thread_yx_direct =
+    hip_flatten_thread_direct<named_dim::y, named_dim::x>;
+using hip_flatten_thread_yz_direct =
+    hip_flatten_thread_direct<named_dim::y, named_dim::z>;
+using hip_flatten_thread_zx_direct =
+    hip_flatten_thread_direct<named_dim::z, named_dim::x>;
+using hip_flatten_thread_zy_direct =
+    hip_flatten_thread_direct<named_dim::z, named_dim::y>;
+
+using hip_flatten_thread_xyz_direct =
+    hip_flatten_thread_direct<named_dim::x, named_dim::y, named_dim::z>;
+using hip_flatten_thread_xzy_direct =
+    hip_flatten_thread_direct<named_dim::x, named_dim::z, named_dim::y>;
+using hip_flatten_thread_yxz_direct =
+    hip_flatten_thread_direct<named_dim::y, named_dim::x, named_dim::z>;
+using hip_flatten_thread_yzx_direct =
+    hip_flatten_thread_direct<named_dim::y, named_dim::z, named_dim::x>;
+using hip_flatten_thread_zxy_direct =
+    hip_flatten_thread_direct<named_dim::z, named_dim::x, named_dim::y>;
+using hip_flatten_thread_zyx_direct =
+    hip_flatten_thread_direct<named_dim::z, named_dim::y, named_dim::x>;
 
 /*
  * Maps segment indices to flattened HIP threads.
  * Reshapes multiple physical threads into a 1D iteration space
  * Uses block-stride looping to exceed the maximum number of physical threads
  */
-template < named_dim ... dims >
+template<named_dim... dims>
 using hip_flatten_thread_loop = hip_flatten_indexer_loop<
     hip::IndexGlobal<dims, named_usage::unspecified, named_usage::ignored>...>;
 
@@ -1615,19 +1735,31 @@ using hip_flatten_thread_x_loop = hip_flatten_thread_loop<named_dim::x>;
 using hip_flatten_thread_y_loop = hip_flatten_thread_loop<named_dim::y>;
 using hip_flatten_thread_z_loop = hip_flatten_thread_loop<named_dim::z>;
 
-using hip_flatten_thread_xy_loop = hip_flatten_thread_loop<named_dim::x, named_dim::y>;
-using hip_flatten_thread_xz_loop = hip_flatten_thread_loop<named_dim::x, named_dim::z>;
-using hip_flatten_thread_yx_loop = hip_flatten_thread_loop<named_dim::y, named_dim::x>;
-using hip_flatten_thread_yz_loop = hip_flatten_thread_loop<named_dim::y, named_dim::z>;
-using hip_flatten_thread_zx_loop = hip_flatten_thread_loop<named_dim::z, named_dim::x>;
-using hip_flatten_thread_zy_loop = hip_flatten_thread_loop<named_dim::z, named_dim::y>;
-
-using hip_flatten_thread_xyz_loop = hip_flatten_thread_loop<named_dim::x, named_dim::y, named_dim::z>;
-using hip_flatten_thread_xzy_loop = hip_flatten_thread_loop<named_dim::x, named_dim::z, named_dim::y>;
-using hip_flatten_thread_yxz_loop = hip_flatten_thread_loop<named_dim::y, named_dim::x, named_dim::z>;
-using hip_flatten_thread_yzx_loop = hip_flatten_thread_loop<named_dim::y, named_dim::z, named_dim::x>;
-using hip_flatten_thread_zxy_loop = hip_flatten_thread_loop<named_dim::z, named_dim::x, named_dim::y>;
-using hip_flatten_thread_zyx_loop = hip_flatten_thread_loop<named_dim::z, named_dim::y, named_dim::x>;
+using hip_flatten_thread_xy_loop =
+    hip_flatten_thread_loop<named_dim::x, named_dim::y>;
+using hip_flatten_thread_xz_loop =
+    hip_flatten_thread_loop<named_dim::x, named_dim::z>;
+using hip_flatten_thread_yx_loop =
+    hip_flatten_thread_loop<named_dim::y, named_dim::x>;
+using hip_flatten_thread_yz_loop =
+    hip_flatten_thread_loop<named_dim::y, named_dim::z>;
+using hip_flatten_thread_zx_loop =
+    hip_flatten_thread_loop<named_dim::z, named_dim::x>;
+using hip_flatten_thread_zy_loop =
+    hip_flatten_thread_loop<named_dim::z, named_dim::y>;
+
+using hip_flatten_thread_xyz_loop =
+    hip_flatten_thread_loop<named_dim::x, named_dim::y, named_dim::z>;
+using hip_flatten_thread_xzy_loop =
+    hip_flatten_thread_loop<named_dim::x, named_dim::z, named_dim::y>;
+using hip_flatten_thread_yxz_loop =
+    hip_flatten_thread_loop<named_dim::y, named_dim::x, named_dim::z>;
+using hip_flatten_thread_yzx_loop =
+    hip_flatten_thread_loop<named_dim::y, named_dim::z, named_dim::x>;
+using hip_flatten_thread_zxy_loop =
+    hip_flatten_thread_loop<named_dim::z, named_dim::x, named_dim::y>;
+using hip_flatten_thread_zyx_loop =
+    hip_flatten_thread_loop<named_dim::z, named_dim::y, named_dim::x>;
 
 
 /*!
@@ -1635,7 +1767,7 @@ using hip_flatten_thread_zyx_loop = hip_flatten_thread_loop<named_dim::z, named_
  * This is the lowest overhead mapping, but requires that there are enough
  * physical blocks to fit all of the direct map requests.
  */
-template < named_dim ... dims >
+template<named_dim... dims>
 using hip_block_direct = hip_indexer_direct<
     hip::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
 
@@ -1650,22 +1782,28 @@ using hip_block_yz_direct = hip_block_direct<named_dim::y, named_dim::z>;
 using hip_block_zx_direct = hip_block_direct<named_dim::z, named_dim::x>;
 using hip_block_zy_direct = hip_block_direct<named_dim::z, named_dim::y>;
 
-using hip_block_xyz_direct = hip_block_direct<named_dim::x, named_dim::y, named_dim::z>;
-using hip_block_xzy_direct = hip_block_direct<named_dim::x, named_dim::z, named_dim::y>;
-using hip_block_yxz_direct = hip_block_direct<named_dim::y, named_dim::x, named_dim::z>;
-using hip_block_yzx_direct = hip_block_direct<named_dim::y, named_dim::z, named_dim::x>;
-using hip_block_zxy_direct = hip_block_direct<named_dim::z, named_dim::x, named_dim::y>;
-using hip_block_zyx_direct = hip_block_direct<named_dim::z, named_dim::y, named_dim::x>;
+using hip_block_xyz_direct =
+    hip_block_direct<named_dim::x, named_dim::y, named_dim::z>;
+using hip_block_xzy_direct =
+    hip_block_direct<named_dim::x, named_dim::z, named_dim::y>;
+using hip_block_yxz_direct =
+    hip_block_direct<named_dim::y, named_dim::x, named_dim::z>;
+using hip_block_yzx_direct =
+    hip_block_direct<named_dim::y, named_dim::z, named_dim::x>;
+using hip_block_zxy_direct =
+    hip_block_direct<named_dim::z, named_dim::x, named_dim::y>;
+using hip_block_zyx_direct =
+    hip_block_direct<named_dim::z, named_dim::y, named_dim::x>;
 
 /*!
  * Maps segment indices to HIP blocks.
  * Uses grid-stride looping to exceed the maximum number of blocks
  */
-template < named_dim ... dims >
+template<named_dim... dims>
 using hip_block_loop = hip_indexer_loop<
     hip::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
 
-template < named_dim ... dims >
+template<named_dim... dims>
 using hip_block_syncable_loop = hip_indexer_syncable_loop<
     hip::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
 
@@ -1680,12 +1818,18 @@ using hip_block_yz_loop = hip_block_loop<named_dim::y, named_dim::z>;
 using hip_block_zx_loop = hip_block_loop<named_dim::z, named_dim::x>;
 using hip_block_zy_loop = hip_block_loop<named_dim::z, named_dim::y>;
 
-using hip_block_xyz_loop = hip_block_loop<named_dim::x, named_dim::y, named_dim::z>;
-using hip_block_xzy_loop = hip_block_loop<named_dim::x, named_dim::z, named_dim::y>;
-using hip_block_yxz_loop = hip_block_loop<named_dim::y, named_dim::x, named_dim::z>;
-using hip_block_yzx_loop = hip_block_loop<named_dim::y, named_dim::z, named_dim::x>;
-using hip_block_zxy_loop = hip_block_loop<named_dim::z, named_dim::x, named_dim::y>;
-using hip_block_zyx_loop = hip_block_loop<named_dim::z, named_dim::y, named_dim::x>;
+using hip_block_xyz_loop =
+    hip_block_loop<named_dim::x, named_dim::y, named_dim::z>;
+using hip_block_xzy_loop =
+    hip_block_loop<named_dim::x, named_dim::z, named_dim::y>;
+using hip_block_yxz_loop =
+    hip_block_loop<named_dim::y, named_dim::x, named_dim::z>;
+using hip_block_yzx_loop =
+    hip_block_loop<named_dim::y, named_dim::z, named_dim::x>;
+using hip_block_zxy_loop =
+    hip_block_loop<named_dim::z, named_dim::x, named_dim::y>;
+using hip_block_zyx_loop =
+    hip_block_loop<named_dim::z, named_dim::y, named_dim::x>;
 
 /*
  * Maps segment indices to flattened HIP blocks.
@@ -1693,7 +1837,7 @@ using hip_block_zyx_loop = hip_block_loop<named_dim::z, named_dim::y, named_dim:
  * physical blocks to fit all of the direct map requests.
  * Reshapes multiple physical blocks into a 1D iteration space
  */
-template < named_dim ... dims >
+template<named_dim... dims>
 using hip_flatten_block_direct = hip_flatten_indexer_direct<
     hip::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
 
@@ -1701,26 +1845,38 @@ using hip_flatten_block_x_direct = hip_flatten_block_direct<named_dim::x>;
 using hip_flatten_block_y_direct = hip_flatten_block_direct<named_dim::y>;
 using hip_flatten_block_z_direct = hip_flatten_block_direct<named_dim::z>;
 
-using hip_flatten_block_xy_direct = hip_flatten_block_direct<named_dim::x, named_dim::y>;
-using hip_flatten_block_xz_direct = hip_flatten_block_direct<named_dim::x, named_dim::z>;
-using hip_flatten_block_yx_direct = hip_flatten_block_direct<named_dim::y, named_dim::x>;
-using hip_flatten_block_yz_direct = hip_flatten_block_direct<named_dim::y, named_dim::z>;
-using hip_flatten_block_zx_direct = hip_flatten_block_direct<named_dim::z, named_dim::x>;
-using hip_flatten_block_zy_direct = hip_flatten_block_direct<named_dim::z, named_dim::y>;
-
-using hip_flatten_block_xyz_direct = hip_flatten_block_direct<named_dim::x, named_dim::y, named_dim::z>;
-using hip_flatten_block_xzy_direct = hip_flatten_block_direct<named_dim::x, named_dim::z, named_dim::y>;
-using hip_flatten_block_yxz_direct = hip_flatten_block_direct<named_dim::y, named_dim::x, named_dim::z>;
-using hip_flatten_block_yzx_direct = hip_flatten_block_direct<named_dim::y, named_dim::z, named_dim::x>;
-using hip_flatten_block_zxy_direct = hip_flatten_block_direct<named_dim::z, named_dim::x, named_dim::y>;
-using hip_flatten_block_zyx_direct = hip_flatten_block_direct<named_dim::z, named_dim::y, named_dim::x>;
+using hip_flatten_block_xy_direct =
+    hip_flatten_block_direct<named_dim::x, named_dim::y>;
+using hip_flatten_block_xz_direct =
+    hip_flatten_block_direct<named_dim::x, named_dim::z>;
+using hip_flatten_block_yx_direct =
+    hip_flatten_block_direct<named_dim::y, named_dim::x>;
+using hip_flatten_block_yz_direct =
+    hip_flatten_block_direct<named_dim::y, named_dim::z>;
+using hip_flatten_block_zx_direct =
+    hip_flatten_block_direct<named_dim::z, named_dim::x>;
+using hip_flatten_block_zy_direct =
+    hip_flatten_block_direct<named_dim::z, named_dim::y>;
+
+using hip_flatten_block_xyz_direct =
+    hip_flatten_block_direct<named_dim::x, named_dim::y, named_dim::z>;
+using hip_flatten_block_xzy_direct =
+    hip_flatten_block_direct<named_dim::x, named_dim::z, named_dim::y>;
+using hip_flatten_block_yxz_direct =
+    hip_flatten_block_direct<named_dim::y, named_dim::x, named_dim::z>;
+using hip_flatten_block_yzx_direct =
+    hip_flatten_block_direct<named_dim::y, named_dim::z, named_dim::x>;
+using hip_flatten_block_zxy_direct =
+    hip_flatten_block_direct<named_dim::z, named_dim::x, named_dim::y>;
+using hip_flatten_block_zyx_direct =
+    hip_flatten_block_direct<named_dim::z, named_dim::y, named_dim::x>;
 
 /*
  * Maps segment indices to flattened HIP blocks.
  * Reshapes multiple physical blocks into a 1D iteration space
  * Uses block-stride looping to exceed the maximum number of physical blocks
  */
-template < named_dim ... dims >
+template<named_dim... dims>
 using hip_flatten_block_loop = hip_flatten_indexer_loop<
     hip::IndexGlobal<dims, named_usage::ignored, named_usage::unspecified>...>;
 
@@ -1728,19 +1884,31 @@ using hip_flatten_block_x_loop = hip_flatten_block_loop<named_dim::x>;
 using hip_flatten_block_y_loop = hip_flatten_block_loop<named_dim::y>;
 using hip_flatten_block_z_loop = hip_flatten_block_loop<named_dim::z>;
 
-using hip_flatten_block_xy_loop = hip_flatten_block_loop<named_dim::x, named_dim::y>;
-using hip_flatten_block_xz_loop = hip_flatten_block_loop<named_dim::x, named_dim::z>;
-using hip_flatten_block_yx_loop = hip_flatten_block_loop<named_dim::y, named_dim::x>;
-using hip_flatten_block_yz_loop = hip_flatten_block_loop<named_dim::y, named_dim::z>;
-using hip_flatten_block_zx_loop = hip_flatten_block_loop<named_dim::z, named_dim::x>;
-using hip_flatten_block_zy_loop = hip_flatten_block_loop<named_dim::z, named_dim::y>;
-
-using hip_flatten_block_xyz_loop = hip_flatten_block_loop<named_dim::x, named_dim::y, named_dim::z>;
-using hip_flatten_block_xzy_loop = hip_flatten_block_loop<named_dim::x, named_dim::z, named_dim::y>;
-using hip_flatten_block_yxz_loop = hip_flatten_block_loop<named_dim::y, named_dim::x, named_dim::z>;
-using hip_flatten_block_yzx_loop = hip_flatten_block_loop<named_dim::y, named_dim::z, named_dim::x>;
-using hip_flatten_block_zxy_loop = hip_flatten_block_loop<named_dim::z, named_dim::x, named_dim::y>;
-using hip_flatten_block_zyx_loop = hip_flatten_block_loop<named_dim::z, named_dim::y, named_dim::x>;
+using hip_flatten_block_xy_loop =
+    hip_flatten_block_loop<named_dim::x, named_dim::y>;
+using hip_flatten_block_xz_loop =
+    hip_flatten_block_loop<named_dim::x, named_dim::z>;
+using hip_flatten_block_yx_loop =
+    hip_flatten_block_loop<named_dim::y, named_dim::x>;
+using hip_flatten_block_yz_loop =
+    hip_flatten_block_loop<named_dim::y, named_dim::z>;
+using hip_flatten_block_zx_loop =
+    hip_flatten_block_loop<named_dim::z, named_dim::x>;
+using hip_flatten_block_zy_loop =
+    hip_flatten_block_loop<named_dim::z, named_dim::y>;
+
+using hip_flatten_block_xyz_loop =
+    hip_flatten_block_loop<named_dim::x, named_dim::y, named_dim::z>;
+using hip_flatten_block_xzy_loop =
+    hip_flatten_block_loop<named_dim::x, named_dim::z, named_dim::y>;
+using hip_flatten_block_yxz_loop =
+    hip_flatten_block_loop<named_dim::y, named_dim::x, named_dim::z>;
+using hip_flatten_block_yzx_loop =
+    hip_flatten_block_loop<named_dim::y, named_dim::z, named_dim::x>;
+using hip_flatten_block_zxy_loop =
+    hip_flatten_block_loop<named_dim::z, named_dim::x, named_dim::y>;
+using hip_flatten_block_zyx_loop =
+    hip_flatten_block_loop<named_dim::z, named_dim::y, named_dim::x>;
 
 
 /*!
@@ -1748,9 +1916,11 @@ using hip_flatten_block_zyx_loop = hip_flatten_block_loop<named_dim::z, named_di
  * This is the lowest overhead mapping, but requires that there are enough
  * physical threads to fit all of the direct map requests.
  */
-template < named_dim ... dims >
-using hip_global_direct = hip_indexer_direct<
-    hip::IndexGlobal<dims, named_usage::unspecified, named_usage::unspecified>...>;
+template<named_dim... dims>
+using hip_global_direct =
+    hip_indexer_direct<hip::IndexGlobal<dims,
+                                        named_usage::unspecified,
+                                        named_usage::unspecified>...>;
 
 using hip_global_x_direct = hip_global_direct<named_dim::x>;
 using hip_global_y_direct = hip_global_direct<named_dim::y>;
@@ -1763,24 +1933,34 @@ using hip_global_yz_direct = hip_global_direct<named_dim::y, named_dim::z>;
 using hip_global_zx_direct = hip_global_direct<named_dim::z, named_dim::x>;
 using hip_global_zy_direct = hip_global_direct<named_dim::z, named_dim::y>;
 
-using hip_global_xyz_direct = hip_global_direct<named_dim::x, named_dim::y, named_dim::z>;
-using hip_global_xzy_direct = hip_global_direct<named_dim::x, named_dim::z, named_dim::y>;
-using hip_global_yxz_direct = hip_global_direct<named_dim::y, named_dim::x, named_dim::z>;
-using hip_global_yzx_direct = hip_global_direct<named_dim::y, named_dim::z, named_dim::x>;
-using hip_global_zxy_direct = hip_global_direct<named_dim::z, named_dim::x, named_dim::y>;
-using hip_global_zyx_direct = hip_global_direct<named_dim::z, named_dim::y, named_dim::x>;
+using hip_global_xyz_direct =
+    hip_global_direct<named_dim::x, named_dim::y, named_dim::z>;
+using hip_global_xzy_direct =
+    hip_global_direct<named_dim::x, named_dim::z, named_dim::y>;
+using hip_global_yxz_direct =
+    hip_global_direct<named_dim::y, named_dim::x, named_dim::z>;
+using hip_global_yzx_direct =
+    hip_global_direct<named_dim::y, named_dim::z, named_dim::x>;
+using hip_global_zxy_direct =
+    hip_global_direct<named_dim::z, named_dim::x, named_dim::y>;
+using hip_global_zyx_direct =
+    hip_global_direct<named_dim::z, named_dim::y, named_dim::x>;
 
 /*!
  * Maps segment indices to HIP global threads.
  * Uses grid-stride looping to exceed the maximum number of global threads
  */
-template < named_dim ... dims >
-using hip_global_loop = hip_indexer_loop<
-    hip::IndexGlobal<dims, named_usage::unspecified, named_usage::unspecified>...>;
-
-template < named_dim ... dims >
-using hip_global_syncable_loop = hip_indexer_syncable_loop<
-    hip::IndexGlobal<dims, named_usage::unspecified, named_usage::unspecified>...>;
+template<named_dim... dims>
+using hip_global_loop =
+    hip_indexer_loop<hip::IndexGlobal<dims,
+                                      named_usage::unspecified,
+                                      named_usage::unspecified>...>;
+
+template<named_dim... dims>
+using hip_global_syncable_loop =
+    hip_indexer_syncable_loop<hip::IndexGlobal<dims,
+                                               named_usage::unspecified,
+                                               named_usage::unspecified>...>;
 
 using hip_global_x_loop = hip_global_loop<named_dim::x>;
 using hip_global_y_loop = hip_global_loop<named_dim::y>;
@@ -1793,12 +1973,18 @@ using hip_global_yz_loop = hip_global_loop<named_dim::y, named_dim::z>;
 using hip_global_zx_loop = hip_global_loop<named_dim::z, named_dim::x>;
 using hip_global_zy_loop = hip_global_loop<named_dim::z, named_dim::y>;
 
-using hip_global_xyz_loop = hip_global_loop<named_dim::x, named_dim::y, named_dim::z>;
-using hip_global_xzy_loop = hip_global_loop<named_dim::x, named_dim::z, named_dim::y>;
-using hip_global_yxz_loop = hip_global_loop<named_dim::y, named_dim::x, named_dim::z>;
-using hip_global_yzx_loop = hip_global_loop<named_dim::y, named_dim::z, named_dim::x>;
-using hip_global_zxy_loop = hip_global_loop<named_dim::z, named_dim::x, named_dim::y>;
-using hip_global_zyx_loop = hip_global_loop<named_dim::z, named_dim::y, named_dim::x>;
+using hip_global_xyz_loop =
+    hip_global_loop<named_dim::x, named_dim::y, named_dim::z>;
+using hip_global_xzy_loop =
+    hip_global_loop<named_dim::x, named_dim::z, named_dim::y>;
+using hip_global_yxz_loop =
+    hip_global_loop<named_dim::y, named_dim::x, named_dim::z>;
+using hip_global_yzx_loop =
+    hip_global_loop<named_dim::y, named_dim::z, named_dim::x>;
+using hip_global_zxy_loop =
+    hip_global_loop<named_dim::z, named_dim::x, named_dim::y>;
+using hip_global_zyx_loop =
+    hip_global_loop<named_dim::z, named_dim::y, named_dim::x>;
 
 /*
  * Maps segment indices to flattened HIP global threads.
@@ -1806,54 +1992,83 @@ using hip_global_zyx_loop = hip_global_loop<named_dim::z, named_dim::y, named_di
  * physical global threads to fit all of the direct map requests.
  * Reshapes multiple physical global threads into a 1D iteration space
  */
-template < named_dim ... dims >
-using hip_flatten_global_direct = hip_flatten_indexer_direct<
-    hip::IndexGlobal<dims, named_usage::unspecified, named_usage::unspecified>...>;
+template<named_dim... dims>
+using hip_flatten_global_direct =
+    hip_flatten_indexer_direct<hip::IndexGlobal<dims,
+                                                named_usage::unspecified,
+                                                named_usage::unspecified>...>;
 
 using hip_flatten_global_x_direct = hip_flatten_global_direct<named_dim::x>;
 using hip_flatten_global_y_direct = hip_flatten_global_direct<named_dim::y>;
 using hip_flatten_global_z_direct = hip_flatten_global_direct<named_dim::z>;
 
-using hip_flatten_global_xy_direct = hip_flatten_global_direct<named_dim::x, named_dim::y>;
-using hip_flatten_global_xz_direct = hip_flatten_global_direct<named_dim::x, named_dim::z>;
-using hip_flatten_global_yx_direct = hip_flatten_global_direct<named_dim::y, named_dim::x>;
-using hip_flatten_global_yz_direct = hip_flatten_global_direct<named_dim::y, named_dim::z>;
-using hip_flatten_global_zx_direct = hip_flatten_global_direct<named_dim::z, named_dim::x>;
-using hip_flatten_global_zy_direct = hip_flatten_global_direct<named_dim::z, named_dim::y>;
-
-using hip_flatten_global_xyz_direct = hip_flatten_global_direct<named_dim::x, named_dim::y, named_dim::z>;
-using hip_flatten_global_xzy_direct = hip_flatten_global_direct<named_dim::x, named_dim::z, named_dim::y>;
-using hip_flatten_global_yxz_direct = hip_flatten_global_direct<named_dim::y, named_dim::x, named_dim::z>;
-using hip_flatten_global_yzx_direct = hip_flatten_global_direct<named_dim::y, named_dim::z, named_dim::x>;
-using hip_flatten_global_zxy_direct = hip_flatten_global_direct<named_dim::z, named_dim::x, named_dim::y>;
-using hip_flatten_global_zyx_direct = hip_flatten_global_direct<named_dim::z, named_dim::y, named_dim::x>;
+using hip_flatten_global_xy_direct =
+    hip_flatten_global_direct<named_dim::x, named_dim::y>;
+using hip_flatten_global_xz_direct =
+    hip_flatten_global_direct<named_dim::x, named_dim::z>;
+using hip_flatten_global_yx_direct =
+    hip_flatten_global_direct<named_dim::y, named_dim::x>;
+using hip_flatten_global_yz_direct =
+    hip_flatten_global_direct<named_dim::y, named_dim::z>;
+using hip_flatten_global_zx_direct =
+    hip_flatten_global_direct<named_dim::z, named_dim::x>;
+using hip_flatten_global_zy_direct =
+    hip_flatten_global_direct<named_dim::z, named_dim::y>;
+
+using hip_flatten_global_xyz_direct =
+    hip_flatten_global_direct<named_dim::x, named_dim::y, named_dim::z>;
+using hip_flatten_global_xzy_direct =
+    hip_flatten_global_direct<named_dim::x, named_dim::z, named_dim::y>;
+using hip_flatten_global_yxz_direct =
+    hip_flatten_global_direct<named_dim::y, named_dim::x, named_dim::z>;
+using hip_flatten_global_yzx_direct =
+    hip_flatten_global_direct<named_dim::y, named_dim::z, named_dim::x>;
+using hip_flatten_global_zxy_direct =
+    hip_flatten_global_direct<named_dim::z, named_dim::x, named_dim::y>;
+using hip_flatten_global_zyx_direct =
+    hip_flatten_global_direct<named_dim::z, named_dim::y, named_dim::x>;
 
 /*
  * Maps segment indices to flattened HIP global threads.
  * Reshapes multiple physical global threads into a 1D iteration space
- * Uses global thread-stride looping to exceed the maximum number of physical global threads
+ * Uses global thread-stride looping to exceed the maximum number of physical
+ * global threads
  */
-template < named_dim ... dims >
-using hip_flatten_global_loop = hip_flatten_indexer_loop<
-    hip::IndexGlobal<dims, named_usage::unspecified, named_usage::unspecified>...>;
+template<named_dim... dims>
+using hip_flatten_global_loop =
+    hip_flatten_indexer_loop<hip::IndexGlobal<dims,
+                                              named_usage::unspecified,
+                                              named_usage::unspecified>...>;
 
 using hip_flatten_global_x_loop = hip_flatten_global_loop<named_dim::x>;
 using hip_flatten_global_y_loop = hip_flatten_global_loop<named_dim::y>;
 using hip_flatten_global_z_loop = hip_flatten_global_loop<named_dim::z>;
 
-using hip_flatten_global_xy_loop = hip_flatten_global_loop<named_dim::x, named_dim::y>;
-using hip_flatten_global_xz_loop = hip_flatten_global_loop<named_dim::x, named_dim::z>;
-using hip_flatten_global_yx_loop = hip_flatten_global_loop<named_dim::y, named_dim::x>;
-using hip_flatten_global_yz_loop = hip_flatten_global_loop<named_dim::y, named_dim::z>;
-using hip_flatten_global_zx_loop = hip_flatten_global_loop<named_dim::z, named_dim::x>;
-using hip_flatten_global_zy_loop = hip_flatten_global_loop<named_dim::z, named_dim::y>;
-
-using hip_flatten_global_xyz_loop = hip_flatten_global_loop<named_dim::x, named_dim::y, named_dim::z>;
-using hip_flatten_global_xzy_loop = hip_flatten_global_loop<named_dim::x, named_dim::z, named_dim::y>;
-using hip_flatten_global_yxz_loop = hip_flatten_global_loop<named_dim::y, named_dim::x, named_dim::z>;
-using hip_flatten_global_yzx_loop = hip_flatten_global_loop<named_dim::y, named_dim::z, named_dim::x>;
-using hip_flatten_global_zxy_loop = hip_flatten_global_loop<named_dim::z, named_dim::x, named_dim::y>;
-using hip_flatten_global_zyx_loop = hip_flatten_global_loop<named_dim::z, named_dim::y, named_dim::x>;
+using hip_flatten_global_xy_loop =
+    hip_flatten_global_loop<named_dim::x, named_dim::y>;
+using hip_flatten_global_xz_loop =
+    hip_flatten_global_loop<named_dim::x, named_dim::z>;
+using hip_flatten_global_yx_loop =
+    hip_flatten_global_loop<named_dim::y, named_dim::x>;
+using hip_flatten_global_yz_loop =
+    hip_flatten_global_loop<named_dim::y, named_dim::z>;
+using hip_flatten_global_zx_loop =
+    hip_flatten_global_loop<named_dim::z, named_dim::x>;
+using hip_flatten_global_zy_loop =
+    hip_flatten_global_loop<named_dim::z, named_dim::y>;
+
+using hip_flatten_global_xyz_loop =
+    hip_flatten_global_loop<named_dim::x, named_dim::y, named_dim::z>;
+using hip_flatten_global_xzy_loop =
+    hip_flatten_global_loop<named_dim::x, named_dim::z, named_dim::y>;
+using hip_flatten_global_yxz_loop =
+    hip_flatten_global_loop<named_dim::y, named_dim::x, named_dim::z>;
+using hip_flatten_global_yzx_loop =
+    hip_flatten_global_loop<named_dim::y, named_dim::z, named_dim::x>;
+using hip_flatten_global_zxy_loop =
+    hip_flatten_global_loop<named_dim::z, named_dim::x, named_dim::y>;
+using hip_flatten_global_zyx_loop =
+    hip_flatten_global_loop<named_dim::z, named_dim::y, named_dim::x>;
 
 
 /*!
@@ -1861,271 +2076,460 @@ using hip_flatten_global_zyx_loop = hip_flatten_global_loop<named_dim::z, named_
  * This is the lowest overhead mapping, but requires that there are enough
  * physical threads to fit all of the direct map requests.
  */
-template < int X_BLOCK_SIZE >
-using hip_thread_size_x_direct = hip_indexer_direct<hip::thread_x<X_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE >
-using hip_thread_size_y_direct = hip_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE >
-using hip_thread_size_z_direct = hip_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using hip_thread_size_xy_direct = hip_indexer_direct<hip::thread_x<X_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using hip_thread_size_xz_direct = hip_indexer_direct<hip::thread_x<X_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE >
-using hip_thread_size_yx_direct = hip_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using hip_thread_size_yz_direct = hip_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE >
-using hip_thread_size_zx_direct = hip_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using hip_thread_size_zy_direct = hip_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using hip_thread_size_xyz_direct = hip_indexer_direct<hip::thread_x<X_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using hip_thread_size_xzy_direct = hip_indexer_direct<hip::thread_x<X_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using hip_thread_size_yxz_direct = hip_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE >
-using hip_thread_size_yzx_direct = hip_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using hip_thread_size_zxy_direct = hip_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE >
-using hip_thread_size_zyx_direct = hip_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>>;
-
-
-template < int X_GRID_SIZE >
+template<int X_BLOCK_SIZE>
+using hip_thread_size_x_direct =
+    hip_indexer_direct<hip::thread_x<X_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE>
+using hip_thread_size_y_direct =
+    hip_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE>
+using hip_thread_size_z_direct =
+    hip_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>>;
+
+template<int X_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using hip_thread_size_xy_direct =
+    hip_indexer_direct<hip::thread_x<X_BLOCK_SIZE>,
+                       hip::thread_y<Y_BLOCK_SIZE>>;
+template<int X_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using hip_thread_size_xz_direct =
+    hip_indexer_direct<hip::thread_x<X_BLOCK_SIZE>,
+                       hip::thread_z<Z_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE, int X_BLOCK_SIZE>
+using hip_thread_size_yx_direct =
+    hip_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>,
+                       hip::thread_x<X_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using hip_thread_size_yz_direct =
+    hip_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>,
+                       hip::thread_z<Z_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE, int X_BLOCK_SIZE>
+using hip_thread_size_zx_direct =
+    hip_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>,
+                       hip::thread_x<X_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using hip_thread_size_zy_direct =
+    hip_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>,
+                       hip::thread_y<Y_BLOCK_SIZE>>;
+
+template<int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using hip_thread_size_xyz_direct =
+    hip_indexer_direct<hip::thread_x<X_BLOCK_SIZE>,
+                       hip::thread_y<Y_BLOCK_SIZE>,
+                       hip::thread_z<Z_BLOCK_SIZE>>;
+template<int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using hip_thread_size_xzy_direct =
+    hip_indexer_direct<hip::thread_x<X_BLOCK_SIZE>,
+                       hip::thread_z<Z_BLOCK_SIZE>,
+                       hip::thread_y<Y_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using hip_thread_size_yxz_direct =
+    hip_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>,
+                       hip::thread_x<X_BLOCK_SIZE>,
+                       hip::thread_z<Z_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE>
+using hip_thread_size_yzx_direct =
+    hip_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>,
+                       hip::thread_z<Z_BLOCK_SIZE>,
+                       hip::thread_x<X_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using hip_thread_size_zxy_direct =
+    hip_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>,
+                       hip::thread_x<X_BLOCK_SIZE>,
+                       hip::thread_y<Y_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE>
+using hip_thread_size_zyx_direct =
+    hip_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>,
+                       hip::thread_y<Y_BLOCK_SIZE>,
+                       hip::thread_x<X_BLOCK_SIZE>>;
+
+
+template<int X_GRID_SIZE>
 using hip_block_size_x_direct = hip_indexer_direct<hip::block_x<X_GRID_SIZE>>;
-template < int Y_GRID_SIZE >
+template<int Y_GRID_SIZE>
 using hip_block_size_y_direct = hip_indexer_direct<hip::block_y<Y_GRID_SIZE>>;
-template < int Z_GRID_SIZE >
+template<int Z_GRID_SIZE>
 using hip_block_size_z_direct = hip_indexer_direct<hip::block_z<Z_GRID_SIZE>>;
 
-template < int X_GRID_SIZE, int Y_GRID_SIZE >
-using hip_block_size_xy_direct = hip_indexer_direct<hip::block_x<X_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
-template < int X_GRID_SIZE, int Z_GRID_SIZE >
-using hip_block_size_xz_direct = hip_indexer_direct<hip::block_x<X_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int X_GRID_SIZE >
-using hip_block_size_yx_direct = hip_indexer_direct<hip::block_y<Y_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int Z_GRID_SIZE >
-using hip_block_size_yz_direct = hip_indexer_direct<hip::block_y<Y_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int X_GRID_SIZE >
-using hip_block_size_zx_direct = hip_indexer_direct<hip::block_z<Z_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int Y_GRID_SIZE >
-using hip_block_size_zy_direct = hip_indexer_direct<hip::block_z<Z_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
-
-template < int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE >
-using hip_block_size_xyz_direct = hip_indexer_direct<hip::block_x<X_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
-template < int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE >
-using hip_block_size_xzy_direct = hip_indexer_direct<hip::block_x<X_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE >
-using hip_block_size_yxz_direct = hip_indexer_direct<hip::block_y<Y_GRID_SIZE>, hip::block_x<X_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE >
-using hip_block_size_yzx_direct = hip_indexer_direct<hip::block_y<Y_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE >
-using hip_block_size_zxy_direct = hip_indexer_direct<hip::block_z<Z_GRID_SIZE>, hip::block_x<X_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE >
-using hip_block_size_zyx_direct = hip_indexer_direct<hip::block_z<Z_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
-
-
-template < int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_x_direct = hip_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_y_direct = hip_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_z_direct = hip_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_xy_direct = hip_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                     hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_xz_direct = hip_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                     hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_yx_direct = hip_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                     hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_yz_direct = hip_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                     hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_zx_direct = hip_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                     hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_zy_direct = hip_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                     hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_xyz_direct = hip_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                      hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                      hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_xzy_direct = hip_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                      hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                      hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_yxz_direct = hip_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                      hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                      hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_yzx_direct = hip_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                      hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                      hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_zxy_direct = hip_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                      hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                      hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_zyx_direct = hip_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                      hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                      hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int X_GRID_SIZE, int Y_GRID_SIZE>
+using hip_block_size_xy_direct =
+    hip_indexer_direct<hip::block_x<X_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
+template<int X_GRID_SIZE, int Z_GRID_SIZE>
+using hip_block_size_xz_direct =
+    hip_indexer_direct<hip::block_x<X_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
+template<int Y_GRID_SIZE, int X_GRID_SIZE>
+using hip_block_size_yx_direct =
+    hip_indexer_direct<hip::block_y<Y_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
+template<int Y_GRID_SIZE, int Z_GRID_SIZE>
+using hip_block_size_yz_direct =
+    hip_indexer_direct<hip::block_y<Y_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
+template<int Z_GRID_SIZE, int X_GRID_SIZE>
+using hip_block_size_zx_direct =
+    hip_indexer_direct<hip::block_z<Z_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
+template<int Z_GRID_SIZE, int Y_GRID_SIZE>
+using hip_block_size_zy_direct =
+    hip_indexer_direct<hip::block_z<Z_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
+
+template<int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE>
+using hip_block_size_xyz_direct = hip_indexer_direct<hip::block_x<X_GRID_SIZE>,
+                                                     hip::block_y<Y_GRID_SIZE>,
+                                                     hip::block_z<Z_GRID_SIZE>>;
+template<int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE>
+using hip_block_size_xzy_direct = hip_indexer_direct<hip::block_x<X_GRID_SIZE>,
+                                                     hip::block_z<Z_GRID_SIZE>,
+                                                     hip::block_y<Y_GRID_SIZE>>;
+template<int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE>
+using hip_block_size_yxz_direct = hip_indexer_direct<hip::block_y<Y_GRID_SIZE>,
+                                                     hip::block_x<X_GRID_SIZE>,
+                                                     hip::block_z<Z_GRID_SIZE>>;
+template<int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE>
+using hip_block_size_yzx_direct = hip_indexer_direct<hip::block_y<Y_GRID_SIZE>,
+                                                     hip::block_z<Z_GRID_SIZE>,
+                                                     hip::block_x<X_GRID_SIZE>>;
+template<int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE>
+using hip_block_size_zxy_direct = hip_indexer_direct<hip::block_z<Z_GRID_SIZE>,
+                                                     hip::block_x<X_GRID_SIZE>,
+                                                     hip::block_y<Y_GRID_SIZE>>;
+template<int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE>
+using hip_block_size_zyx_direct = hip_indexer_direct<hip::block_z<Z_GRID_SIZE>,
+                                                     hip::block_y<Y_GRID_SIZE>,
+                                                     hip::block_x<X_GRID_SIZE>>;
+
+
+template<int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_x_direct =
+    hip_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_y_direct =
+    hip_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_z_direct =
+    hip_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+
+template<int X_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_xy_direct =
+    hip_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                       hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template<int X_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_xz_direct =
+    hip_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                       hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_yx_direct =
+    hip_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                       hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_yz_direct =
+    hip_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                       hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_zx_direct =
+    hip_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                       hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_zy_direct =
+    hip_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                       hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+
+template<int X_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_xyz_direct =
+    hip_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                       hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                       hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template<int X_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_xzy_direct =
+    hip_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                       hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                       hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_yxz_direct =
+    hip_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                       hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                       hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_yzx_direct =
+    hip_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                       hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                       hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_zxy_direct =
+    hip_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                       hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                       hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_zyx_direct =
+    hip_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                       hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                       hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
 
 /*!
  * Maps segment indices to HIP global threads.
  * Uses grid-stride looping to exceed the maximum number of global threads
  */
-template < int X_BLOCK_SIZE >
+template<int X_BLOCK_SIZE>
 using hip_thread_size_x_loop = hip_indexer_loop<hip::thread_x<X_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE >
+template<int Y_BLOCK_SIZE>
 using hip_thread_size_y_loop = hip_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE >
+template<int Z_BLOCK_SIZE>
 using hip_thread_size_z_loop = hip_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>>;
 
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using hip_thread_size_xy_loop = hip_indexer_loop<hip::thread_x<X_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using hip_thread_size_xz_loop = hip_indexer_loop<hip::thread_x<X_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE >
-using hip_thread_size_yx_loop = hip_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using hip_thread_size_yz_loop = hip_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE >
-using hip_thread_size_zx_loop = hip_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using hip_thread_size_zy_loop = hip_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using hip_thread_size_xyz_loop = hip_indexer_loop<hip::thread_x<X_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using hip_thread_size_xzy_loop = hip_indexer_loop<hip::thread_x<X_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using hip_thread_size_yxz_loop = hip_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE >
-using hip_thread_size_yzx_loop = hip_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using hip_thread_size_zxy_loop = hip_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE >
-using hip_thread_size_zyx_loop = hip_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>>;
-
-
-template < int X_GRID_SIZE >
+template<int X_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using hip_thread_size_xy_loop =
+    hip_indexer_loop<hip::thread_x<X_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>>;
+template<int X_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using hip_thread_size_xz_loop =
+    hip_indexer_loop<hip::thread_x<X_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE, int X_BLOCK_SIZE>
+using hip_thread_size_yx_loop =
+    hip_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using hip_thread_size_yz_loop =
+    hip_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE, int X_BLOCK_SIZE>
+using hip_thread_size_zx_loop =
+    hip_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using hip_thread_size_zy_loop =
+    hip_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>>;
+
+template<int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using hip_thread_size_xyz_loop = hip_indexer_loop<hip::thread_x<X_BLOCK_SIZE>,
+                                                  hip::thread_y<Y_BLOCK_SIZE>,
+                                                  hip::thread_z<Z_BLOCK_SIZE>>;
+template<int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using hip_thread_size_xzy_loop = hip_indexer_loop<hip::thread_x<X_BLOCK_SIZE>,
+                                                  hip::thread_z<Z_BLOCK_SIZE>,
+                                                  hip::thread_y<Y_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using hip_thread_size_yxz_loop = hip_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>,
+                                                  hip::thread_x<X_BLOCK_SIZE>,
+                                                  hip::thread_z<Z_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE>
+using hip_thread_size_yzx_loop = hip_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>,
+                                                  hip::thread_z<Z_BLOCK_SIZE>,
+                                                  hip::thread_x<X_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using hip_thread_size_zxy_loop = hip_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>,
+                                                  hip::thread_x<X_BLOCK_SIZE>,
+                                                  hip::thread_y<Y_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE>
+using hip_thread_size_zyx_loop = hip_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>,
+                                                  hip::thread_y<Y_BLOCK_SIZE>,
+                                                  hip::thread_x<X_BLOCK_SIZE>>;
+
+
+template<int X_GRID_SIZE>
 using hip_block_size_x_loop = hip_indexer_loop<hip::block_x<X_GRID_SIZE>>;
-template < int Y_GRID_SIZE >
+template<int Y_GRID_SIZE>
 using hip_block_size_y_loop = hip_indexer_loop<hip::block_y<Y_GRID_SIZE>>;
-template < int Z_GRID_SIZE >
+template<int Z_GRID_SIZE>
 using hip_block_size_z_loop = hip_indexer_loop<hip::block_z<Z_GRID_SIZE>>;
 
-template < int X_GRID_SIZE, int Y_GRID_SIZE >
-using hip_block_size_xy_loop = hip_indexer_loop<hip::block_x<X_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
-template < int X_GRID_SIZE, int Z_GRID_SIZE >
-using hip_block_size_xz_loop = hip_indexer_loop<hip::block_x<X_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int X_GRID_SIZE >
-using hip_block_size_yx_loop = hip_indexer_loop<hip::block_y<Y_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int Z_GRID_SIZE >
-using hip_block_size_yz_loop = hip_indexer_loop<hip::block_y<Y_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int X_GRID_SIZE >
-using hip_block_size_zx_loop = hip_indexer_loop<hip::block_z<Z_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int Y_GRID_SIZE >
-using hip_block_size_zy_loop = hip_indexer_loop<hip::block_z<Z_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
-
-template < int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE >
-using hip_block_size_xyz_loop = hip_indexer_loop<hip::block_x<X_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
-template < int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE >
-using hip_block_size_xzy_loop = hip_indexer_loop<hip::block_x<X_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE >
-using hip_block_size_yxz_loop = hip_indexer_loop<hip::block_y<Y_GRID_SIZE>, hip::block_x<X_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE >
-using hip_block_size_yzx_loop = hip_indexer_loop<hip::block_y<Y_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE >
-using hip_block_size_zxy_loop = hip_indexer_loop<hip::block_z<Z_GRID_SIZE>, hip::block_x<X_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE >
-using hip_block_size_zyx_loop = hip_indexer_loop<hip::block_z<Z_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
-
-
-template < int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_x_loop = hip_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_y_loop = hip_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_z_loop = hip_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_xy_loop = hip_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                     hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_xz_loop = hip_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                     hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_yx_loop = hip_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                     hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_yz_loop = hip_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                     hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_zx_loop = hip_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                     hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_zy_loop = hip_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                     hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_xyz_loop = hip_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                      hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                      hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_xzy_loop = hip_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                      hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                      hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_yxz_loop = hip_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                      hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                      hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_yzx_loop = hip_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                      hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                      hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_zxy_loop = hip_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                      hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                      hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using hip_global_size_zyx_loop = hip_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                      hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                      hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int X_GRID_SIZE, int Y_GRID_SIZE>
+using hip_block_size_xy_loop =
+    hip_indexer_loop<hip::block_x<X_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
+template<int X_GRID_SIZE, int Z_GRID_SIZE>
+using hip_block_size_xz_loop =
+    hip_indexer_loop<hip::block_x<X_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
+template<int Y_GRID_SIZE, int X_GRID_SIZE>
+using hip_block_size_yx_loop =
+    hip_indexer_loop<hip::block_y<Y_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
+template<int Y_GRID_SIZE, int Z_GRID_SIZE>
+using hip_block_size_yz_loop =
+    hip_indexer_loop<hip::block_y<Y_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
+template<int Z_GRID_SIZE, int X_GRID_SIZE>
+using hip_block_size_zx_loop =
+    hip_indexer_loop<hip::block_z<Z_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
+template<int Z_GRID_SIZE, int Y_GRID_SIZE>
+using hip_block_size_zy_loop =
+    hip_indexer_loop<hip::block_z<Z_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
+
+template<int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE>
+using hip_block_size_xyz_loop = hip_indexer_loop<hip::block_x<X_GRID_SIZE>,
+                                                 hip::block_y<Y_GRID_SIZE>,
+                                                 hip::block_z<Z_GRID_SIZE>>;
+template<int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE>
+using hip_block_size_xzy_loop = hip_indexer_loop<hip::block_x<X_GRID_SIZE>,
+                                                 hip::block_z<Z_GRID_SIZE>,
+                                                 hip::block_y<Y_GRID_SIZE>>;
+template<int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE>
+using hip_block_size_yxz_loop = hip_indexer_loop<hip::block_y<Y_GRID_SIZE>,
+                                                 hip::block_x<X_GRID_SIZE>,
+                                                 hip::block_z<Z_GRID_SIZE>>;
+template<int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE>
+using hip_block_size_yzx_loop = hip_indexer_loop<hip::block_y<Y_GRID_SIZE>,
+                                                 hip::block_z<Z_GRID_SIZE>,
+                                                 hip::block_x<X_GRID_SIZE>>;
+template<int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE>
+using hip_block_size_zxy_loop = hip_indexer_loop<hip::block_z<Z_GRID_SIZE>,
+                                                 hip::block_x<X_GRID_SIZE>,
+                                                 hip::block_y<Y_GRID_SIZE>>;
+template<int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE>
+using hip_block_size_zyx_loop = hip_indexer_loop<hip::block_z<Z_GRID_SIZE>,
+                                                 hip::block_y<Y_GRID_SIZE>,
+                                                 hip::block_x<X_GRID_SIZE>>;
+
+
+template<int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_x_loop =
+    hip_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_y_loop =
+    hip_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_z_loop =
+    hip_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+
+template<int X_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_xy_loop =
+    hip_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                     hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template<int X_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_xz_loop =
+    hip_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                     hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_yx_loop =
+    hip_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                     hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_yz_loop =
+    hip_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                     hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_zx_loop =
+    hip_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                     hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_zy_loop =
+    hip_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                     hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+
+template<int X_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_xyz_loop =
+    hip_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                     hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                     hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template<int X_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_xzy_loop =
+    hip_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                     hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                     hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_yxz_loop =
+    hip_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                     hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                     hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_yzx_loop =
+    hip_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                     hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                     hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_zxy_loop =
+    hip_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                     hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                     hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified>
+using hip_global_size_zyx_loop =
+    hip_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                     hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                     hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
 
 /*
  * Maps segment indices to flattened HIP global threads.
@@ -2133,272 +2537,507 @@ using hip_global_size_zyx_loop = hip_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_
  * physical global threads to fit all of the direct map requests.
  * Reshapes multiple physical global threads into a 1D iteration space
  */
-template < int X_BLOCK_SIZE >
-using hip_flatten_thread_size_x_direct = hip_flatten_indexer_direct<hip::thread_x<X_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE >
-using hip_flatten_thread_size_y_direct = hip_flatten_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE >
-using hip_flatten_thread_size_z_direct = hip_flatten_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using hip_flatten_thread_size_xy_direct = hip_flatten_indexer_direct<hip::thread_x<X_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using hip_flatten_thread_size_xz_direct = hip_flatten_indexer_direct<hip::thread_x<X_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE >
-using hip_flatten_thread_size_yx_direct = hip_flatten_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using hip_flatten_thread_size_yz_direct = hip_flatten_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE >
-using hip_flatten_thread_size_zx_direct = hip_flatten_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using hip_flatten_thread_size_zy_direct = hip_flatten_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using hip_flatten_thread_size_xyz_direct = hip_flatten_indexer_direct<hip::thread_x<X_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using hip_flatten_thread_size_xzy_direct = hip_flatten_indexer_direct<hip::thread_x<X_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using hip_flatten_thread_size_yxz_direct = hip_flatten_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE >
-using hip_flatten_thread_size_yzx_direct = hip_flatten_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using hip_flatten_thread_size_zxy_direct = hip_flatten_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE >
-using hip_flatten_thread_size_zyx_direct = hip_flatten_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>>;
-
-
-template < int X_GRID_SIZE >
-using hip_flatten_block_size_x_direct = hip_flatten_indexer_direct<hip::block_x<X_GRID_SIZE>>;
-template < int Y_GRID_SIZE >
-using hip_flatten_block_size_y_direct = hip_flatten_indexer_direct<hip::block_y<Y_GRID_SIZE>>;
-template < int Z_GRID_SIZE >
-using hip_flatten_block_size_z_direct = hip_flatten_indexer_direct<hip::block_z<Z_GRID_SIZE>>;
-
-template < int X_GRID_SIZE, int Y_GRID_SIZE >
-using hip_flatten_block_size_xy_direct = hip_flatten_indexer_direct<hip::block_x<X_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
-template < int X_GRID_SIZE, int Z_GRID_SIZE >
-using hip_flatten_block_size_xz_direct = hip_flatten_indexer_direct<hip::block_x<X_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int X_GRID_SIZE >
-using hip_flatten_block_size_yx_direct = hip_flatten_indexer_direct<hip::block_y<Y_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int Z_GRID_SIZE >
-using hip_flatten_block_size_yz_direct = hip_flatten_indexer_direct<hip::block_y<Y_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int X_GRID_SIZE >
-using hip_flatten_block_size_zx_direct = hip_flatten_indexer_direct<hip::block_z<Z_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int Y_GRID_SIZE >
-using hip_flatten_block_size_zy_direct = hip_flatten_indexer_direct<hip::block_z<Z_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
-
-template < int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE >
-using hip_flatten_block_size_xyz_direct = hip_flatten_indexer_direct<hip::block_x<X_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
-template < int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE >
-using hip_flatten_block_size_xzy_direct = hip_flatten_indexer_direct<hip::block_x<X_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE >
-using hip_flatten_block_size_yxz_direct = hip_flatten_indexer_direct<hip::block_y<Y_GRID_SIZE>, hip::block_x<X_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE >
-using hip_flatten_block_size_yzx_direct = hip_flatten_indexer_direct<hip::block_y<Y_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE >
-using hip_flatten_block_size_zxy_direct = hip_flatten_indexer_direct<hip::block_z<Z_GRID_SIZE>, hip::block_x<X_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE >
-using hip_flatten_block_size_zyx_direct = hip_flatten_indexer_direct<hip::block_z<Z_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
-
-
-template < int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_x_direct = hip_flatten_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_y_direct = hip_flatten_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_z_direct = hip_flatten_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_xy_direct = hip_flatten_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                     hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_xz_direct = hip_flatten_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                     hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_yx_direct = hip_flatten_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                     hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_yz_direct = hip_flatten_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                     hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_zx_direct = hip_flatten_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                     hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_zy_direct = hip_flatten_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                     hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_xyz_direct = hip_flatten_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                      hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                      hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_xzy_direct = hip_flatten_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                      hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                      hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_yxz_direct = hip_flatten_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                      hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                      hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_yzx_direct = hip_flatten_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                      hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                      hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_zxy_direct = hip_flatten_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                      hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                      hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_zyx_direct = hip_flatten_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                      hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                      hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int X_BLOCK_SIZE>
+using hip_flatten_thread_size_x_direct =
+    hip_flatten_indexer_direct<hip::thread_x<X_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE>
+using hip_flatten_thread_size_y_direct =
+    hip_flatten_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE>
+using hip_flatten_thread_size_z_direct =
+    hip_flatten_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>>;
+
+template<int X_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using hip_flatten_thread_size_xy_direct =
+    hip_flatten_indexer_direct<hip::thread_x<X_BLOCK_SIZE>,
+                               hip::thread_y<Y_BLOCK_SIZE>>;
+template<int X_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using hip_flatten_thread_size_xz_direct =
+    hip_flatten_indexer_direct<hip::thread_x<X_BLOCK_SIZE>,
+                               hip::thread_z<Z_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE, int X_BLOCK_SIZE>
+using hip_flatten_thread_size_yx_direct =
+    hip_flatten_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>,
+                               hip::thread_x<X_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using hip_flatten_thread_size_yz_direct =
+    hip_flatten_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>,
+                               hip::thread_z<Z_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE, int X_BLOCK_SIZE>
+using hip_flatten_thread_size_zx_direct =
+    hip_flatten_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>,
+                               hip::thread_x<X_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using hip_flatten_thread_size_zy_direct =
+    hip_flatten_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>,
+                               hip::thread_y<Y_BLOCK_SIZE>>;
+
+template<int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using hip_flatten_thread_size_xyz_direct =
+    hip_flatten_indexer_direct<hip::thread_x<X_BLOCK_SIZE>,
+                               hip::thread_y<Y_BLOCK_SIZE>,
+                               hip::thread_z<Z_BLOCK_SIZE>>;
+template<int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using hip_flatten_thread_size_xzy_direct =
+    hip_flatten_indexer_direct<hip::thread_x<X_BLOCK_SIZE>,
+                               hip::thread_z<Z_BLOCK_SIZE>,
+                               hip::thread_y<Y_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using hip_flatten_thread_size_yxz_direct =
+    hip_flatten_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>,
+                               hip::thread_x<X_BLOCK_SIZE>,
+                               hip::thread_z<Z_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE>
+using hip_flatten_thread_size_yzx_direct =
+    hip_flatten_indexer_direct<hip::thread_y<Y_BLOCK_SIZE>,
+                               hip::thread_z<Z_BLOCK_SIZE>,
+                               hip::thread_x<X_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using hip_flatten_thread_size_zxy_direct =
+    hip_flatten_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>,
+                               hip::thread_x<X_BLOCK_SIZE>,
+                               hip::thread_y<Y_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE>
+using hip_flatten_thread_size_zyx_direct =
+    hip_flatten_indexer_direct<hip::thread_z<Z_BLOCK_SIZE>,
+                               hip::thread_y<Y_BLOCK_SIZE>,
+                               hip::thread_x<X_BLOCK_SIZE>>;
+
+
+template<int X_GRID_SIZE>
+using hip_flatten_block_size_x_direct =
+    hip_flatten_indexer_direct<hip::block_x<X_GRID_SIZE>>;
+template<int Y_GRID_SIZE>
+using hip_flatten_block_size_y_direct =
+    hip_flatten_indexer_direct<hip::block_y<Y_GRID_SIZE>>;
+template<int Z_GRID_SIZE>
+using hip_flatten_block_size_z_direct =
+    hip_flatten_indexer_direct<hip::block_z<Z_GRID_SIZE>>;
+
+template<int X_GRID_SIZE, int Y_GRID_SIZE>
+using hip_flatten_block_size_xy_direct =
+    hip_flatten_indexer_direct<hip::block_x<X_GRID_SIZE>,
+                               hip::block_y<Y_GRID_SIZE>>;
+template<int X_GRID_SIZE, int Z_GRID_SIZE>
+using hip_flatten_block_size_xz_direct =
+    hip_flatten_indexer_direct<hip::block_x<X_GRID_SIZE>,
+                               hip::block_z<Z_GRID_SIZE>>;
+template<int Y_GRID_SIZE, int X_GRID_SIZE>
+using hip_flatten_block_size_yx_direct =
+    hip_flatten_indexer_direct<hip::block_y<Y_GRID_SIZE>,
+                               hip::block_x<X_GRID_SIZE>>;
+template<int Y_GRID_SIZE, int Z_GRID_SIZE>
+using hip_flatten_block_size_yz_direct =
+    hip_flatten_indexer_direct<hip::block_y<Y_GRID_SIZE>,
+                               hip::block_z<Z_GRID_SIZE>>;
+template<int Z_GRID_SIZE, int X_GRID_SIZE>
+using hip_flatten_block_size_zx_direct =
+    hip_flatten_indexer_direct<hip::block_z<Z_GRID_SIZE>,
+                               hip::block_x<X_GRID_SIZE>>;
+template<int Z_GRID_SIZE, int Y_GRID_SIZE>
+using hip_flatten_block_size_zy_direct =
+    hip_flatten_indexer_direct<hip::block_z<Z_GRID_SIZE>,
+                               hip::block_y<Y_GRID_SIZE>>;
+
+template<int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE>
+using hip_flatten_block_size_xyz_direct =
+    hip_flatten_indexer_direct<hip::block_x<X_GRID_SIZE>,
+                               hip::block_y<Y_GRID_SIZE>,
+                               hip::block_z<Z_GRID_SIZE>>;
+template<int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE>
+using hip_flatten_block_size_xzy_direct =
+    hip_flatten_indexer_direct<hip::block_x<X_GRID_SIZE>,
+                               hip::block_z<Z_GRID_SIZE>,
+                               hip::block_y<Y_GRID_SIZE>>;
+template<int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE>
+using hip_flatten_block_size_yxz_direct =
+    hip_flatten_indexer_direct<hip::block_y<Y_GRID_SIZE>,
+                               hip::block_x<X_GRID_SIZE>,
+                               hip::block_z<Z_GRID_SIZE>>;
+template<int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE>
+using hip_flatten_block_size_yzx_direct =
+    hip_flatten_indexer_direct<hip::block_y<Y_GRID_SIZE>,
+                               hip::block_z<Z_GRID_SIZE>,
+                               hip::block_x<X_GRID_SIZE>>;
+template<int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE>
+using hip_flatten_block_size_zxy_direct =
+    hip_flatten_indexer_direct<hip::block_z<Z_GRID_SIZE>,
+                               hip::block_x<X_GRID_SIZE>,
+                               hip::block_y<Y_GRID_SIZE>>;
+template<int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE>
+using hip_flatten_block_size_zyx_direct =
+    hip_flatten_indexer_direct<hip::block_z<Z_GRID_SIZE>,
+                               hip::block_y<Y_GRID_SIZE>,
+                               hip::block_x<X_GRID_SIZE>>;
+
+
+template<int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_x_direct =
+    hip_flatten_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_y_direct =
+    hip_flatten_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_z_direct =
+    hip_flatten_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+
+template<int X_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_xy_direct =
+    hip_flatten_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                               hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template<int X_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_xz_direct =
+    hip_flatten_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                               hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_yx_direct =
+    hip_flatten_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                               hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_yz_direct =
+    hip_flatten_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                               hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_zx_direct =
+    hip_flatten_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                               hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_zy_direct =
+    hip_flatten_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                               hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+
+template<int X_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_xyz_direct =
+    hip_flatten_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                               hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                               hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template<int X_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_xzy_direct =
+    hip_flatten_indexer_direct<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                               hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                               hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_yxz_direct =
+    hip_flatten_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                               hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                               hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_yzx_direct =
+    hip_flatten_indexer_direct<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                               hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                               hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_zxy_direct =
+    hip_flatten_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                               hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                               hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_zyx_direct =
+    hip_flatten_indexer_direct<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                               hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                               hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
 
 /*
  * Maps segment indices to flattened HIP global threads.
  * Reshapes multiple physical global threads into a 1D iteration space
- * Uses global thread-stride looping to exceed the maximum number of physical global threads
+ * Uses global thread-stride looping to exceed the maximum number of physical
+ * global threads
  */
-template < int X_BLOCK_SIZE >
-using hip_flatten_thread_size_x_loop = hip_flatten_indexer_loop<hip::thread_x<X_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE >
-using hip_flatten_thread_size_y_loop = hip_flatten_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE >
-using hip_flatten_thread_size_z_loop = hip_flatten_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using hip_flatten_thread_size_xy_loop = hip_flatten_indexer_loop<hip::thread_x<X_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using hip_flatten_thread_size_xz_loop = hip_flatten_indexer_loop<hip::thread_x<X_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE >
-using hip_flatten_thread_size_yx_loop = hip_flatten_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using hip_flatten_thread_size_yz_loop = hip_flatten_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE >
-using hip_flatten_thread_size_zx_loop = hip_flatten_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using hip_flatten_thread_size_zy_loop = hip_flatten_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using hip_flatten_thread_size_xyz_loop = hip_flatten_indexer_loop<hip::thread_x<X_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using hip_flatten_thread_size_xzy_loop = hip_flatten_indexer_loop<hip::thread_x<X_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE >
-using hip_flatten_thread_size_yxz_loop = hip_flatten_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE >
-using hip_flatten_thread_size_yzx_loop = hip_flatten_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>, hip::thread_z<Z_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE >
-using hip_flatten_thread_size_zxy_loop = hip_flatten_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE >
-using hip_flatten_thread_size_zyx_loop = hip_flatten_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>, hip::thread_y<Y_BLOCK_SIZE>, hip::thread_x<X_BLOCK_SIZE>>;
-
-
-template < int X_GRID_SIZE >
-using hip_flatten_block_size_x_loop = hip_flatten_indexer_loop<hip::block_x<X_GRID_SIZE>>;
-template < int Y_GRID_SIZE >
-using hip_flatten_block_size_y_loop = hip_flatten_indexer_loop<hip::block_y<Y_GRID_SIZE>>;
-template < int Z_GRID_SIZE >
-using hip_flatten_block_size_z_loop = hip_flatten_indexer_loop<hip::block_z<Z_GRID_SIZE>>;
-
-template < int X_GRID_SIZE, int Y_GRID_SIZE >
-using hip_flatten_block_size_xy_loop = hip_flatten_indexer_loop<hip::block_x<X_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
-template < int X_GRID_SIZE, int Z_GRID_SIZE >
-using hip_flatten_block_size_xz_loop = hip_flatten_indexer_loop<hip::block_x<X_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int X_GRID_SIZE >
-using hip_flatten_block_size_yx_loop = hip_flatten_indexer_loop<hip::block_y<Y_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int Z_GRID_SIZE >
-using hip_flatten_block_size_yz_loop = hip_flatten_indexer_loop<hip::block_y<Y_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int X_GRID_SIZE >
-using hip_flatten_block_size_zx_loop = hip_flatten_indexer_loop<hip::block_z<Z_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int Y_GRID_SIZE >
-using hip_flatten_block_size_zy_loop = hip_flatten_indexer_loop<hip::block_z<Z_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
-
-template < int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE >
-using hip_flatten_block_size_xyz_loop = hip_flatten_indexer_loop<hip::block_x<X_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
-template < int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE >
-using hip_flatten_block_size_xzy_loop = hip_flatten_indexer_loop<hip::block_x<X_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE >
-using hip_flatten_block_size_yxz_loop = hip_flatten_indexer_loop<hip::block_y<Y_GRID_SIZE>, hip::block_x<X_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>>;
-template < int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE >
-using hip_flatten_block_size_yzx_loop = hip_flatten_indexer_loop<hip::block_y<Y_GRID_SIZE>, hip::block_z<Z_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE >
-using hip_flatten_block_size_zxy_loop = hip_flatten_indexer_loop<hip::block_z<Z_GRID_SIZE>, hip::block_x<X_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>>;
-template < int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE >
-using hip_flatten_block_size_zyx_loop = hip_flatten_indexer_loop<hip::block_z<Z_GRID_SIZE>, hip::block_y<Y_GRID_SIZE>, hip::block_x<X_GRID_SIZE>>;
-
-
-template < int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_x_loop = hip_flatten_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_y_loop = hip_flatten_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_z_loop = hip_flatten_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_xy_loop = hip_flatten_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                 hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_xz_loop = hip_flatten_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                 hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_yx_loop = hip_flatten_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                 hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_yz_loop = hip_flatten_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                 hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_zx_loop = hip_flatten_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                 hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_zy_loop = hip_flatten_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                 hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-
-template < int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_xyz_loop = hip_flatten_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_xzy_loop = hip_flatten_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_yxz_loop = hip_flatten_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
-template < int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Y_GRID_SIZE = named_usage::unspecified, int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_yzx_loop = hip_flatten_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                  hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_zxy_loop = hip_flatten_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
-                                                                  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
-template < int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE,
-           int Z_GRID_SIZE = named_usage::unspecified, int Y_GRID_SIZE = named_usage::unspecified, int X_GRID_SIZE = named_usage::unspecified >
-using hip_flatten_global_size_zyx_loop = hip_flatten_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
-                                                                  hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
-                                                                  hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int X_BLOCK_SIZE>
+using hip_flatten_thread_size_x_loop =
+    hip_flatten_indexer_loop<hip::thread_x<X_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE>
+using hip_flatten_thread_size_y_loop =
+    hip_flatten_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE>
+using hip_flatten_thread_size_z_loop =
+    hip_flatten_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>>;
+
+template<int X_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using hip_flatten_thread_size_xy_loop =
+    hip_flatten_indexer_loop<hip::thread_x<X_BLOCK_SIZE>,
+                             hip::thread_y<Y_BLOCK_SIZE>>;
+template<int X_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using hip_flatten_thread_size_xz_loop =
+    hip_flatten_indexer_loop<hip::thread_x<X_BLOCK_SIZE>,
+                             hip::thread_z<Z_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE, int X_BLOCK_SIZE>
+using hip_flatten_thread_size_yx_loop =
+    hip_flatten_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>,
+                             hip::thread_x<X_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using hip_flatten_thread_size_yz_loop =
+    hip_flatten_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>,
+                             hip::thread_z<Z_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE, int X_BLOCK_SIZE>
+using hip_flatten_thread_size_zx_loop =
+    hip_flatten_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>,
+                             hip::thread_x<X_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using hip_flatten_thread_size_zy_loop =
+    hip_flatten_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>,
+                             hip::thread_y<Y_BLOCK_SIZE>>;
+
+template<int X_BLOCK_SIZE, int Y_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using hip_flatten_thread_size_xyz_loop =
+    hip_flatten_indexer_loop<hip::thread_x<X_BLOCK_SIZE>,
+                             hip::thread_y<Y_BLOCK_SIZE>,
+                             hip::thread_z<Z_BLOCK_SIZE>>;
+template<int X_BLOCK_SIZE, int Z_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using hip_flatten_thread_size_xzy_loop =
+    hip_flatten_indexer_loop<hip::thread_x<X_BLOCK_SIZE>,
+                             hip::thread_z<Z_BLOCK_SIZE>,
+                             hip::thread_y<Y_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE, int X_BLOCK_SIZE, int Z_BLOCK_SIZE>
+using hip_flatten_thread_size_yxz_loop =
+    hip_flatten_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>,
+                             hip::thread_x<X_BLOCK_SIZE>,
+                             hip::thread_z<Z_BLOCK_SIZE>>;
+template<int Y_BLOCK_SIZE, int Z_BLOCK_SIZE, int X_BLOCK_SIZE>
+using hip_flatten_thread_size_yzx_loop =
+    hip_flatten_indexer_loop<hip::thread_y<Y_BLOCK_SIZE>,
+                             hip::thread_z<Z_BLOCK_SIZE>,
+                             hip::thread_x<X_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE, int X_BLOCK_SIZE, int Y_BLOCK_SIZE>
+using hip_flatten_thread_size_zxy_loop =
+    hip_flatten_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>,
+                             hip::thread_x<X_BLOCK_SIZE>,
+                             hip::thread_y<Y_BLOCK_SIZE>>;
+template<int Z_BLOCK_SIZE, int Y_BLOCK_SIZE, int X_BLOCK_SIZE>
+using hip_flatten_thread_size_zyx_loop =
+    hip_flatten_indexer_loop<hip::thread_z<Z_BLOCK_SIZE>,
+                             hip::thread_y<Y_BLOCK_SIZE>,
+                             hip::thread_x<X_BLOCK_SIZE>>;
+
+
+template<int X_GRID_SIZE>
+using hip_flatten_block_size_x_loop =
+    hip_flatten_indexer_loop<hip::block_x<X_GRID_SIZE>>;
+template<int Y_GRID_SIZE>
+using hip_flatten_block_size_y_loop =
+    hip_flatten_indexer_loop<hip::block_y<Y_GRID_SIZE>>;
+template<int Z_GRID_SIZE>
+using hip_flatten_block_size_z_loop =
+    hip_flatten_indexer_loop<hip::block_z<Z_GRID_SIZE>>;
+
+template<int X_GRID_SIZE, int Y_GRID_SIZE>
+using hip_flatten_block_size_xy_loop =
+    hip_flatten_indexer_loop<hip::block_x<X_GRID_SIZE>,
+                             hip::block_y<Y_GRID_SIZE>>;
+template<int X_GRID_SIZE, int Z_GRID_SIZE>
+using hip_flatten_block_size_xz_loop =
+    hip_flatten_indexer_loop<hip::block_x<X_GRID_SIZE>,
+                             hip::block_z<Z_GRID_SIZE>>;
+template<int Y_GRID_SIZE, int X_GRID_SIZE>
+using hip_flatten_block_size_yx_loop =
+    hip_flatten_indexer_loop<hip::block_y<Y_GRID_SIZE>,
+                             hip::block_x<X_GRID_SIZE>>;
+template<int Y_GRID_SIZE, int Z_GRID_SIZE>
+using hip_flatten_block_size_yz_loop =
+    hip_flatten_indexer_loop<hip::block_y<Y_GRID_SIZE>,
+                             hip::block_z<Z_GRID_SIZE>>;
+template<int Z_GRID_SIZE, int X_GRID_SIZE>
+using hip_flatten_block_size_zx_loop =
+    hip_flatten_indexer_loop<hip::block_z<Z_GRID_SIZE>,
+                             hip::block_x<X_GRID_SIZE>>;
+template<int Z_GRID_SIZE, int Y_GRID_SIZE>
+using hip_flatten_block_size_zy_loop =
+    hip_flatten_indexer_loop<hip::block_z<Z_GRID_SIZE>,
+                             hip::block_y<Y_GRID_SIZE>>;
+
+template<int X_GRID_SIZE, int Y_GRID_SIZE, int Z_GRID_SIZE>
+using hip_flatten_block_size_xyz_loop =
+    hip_flatten_indexer_loop<hip::block_x<X_GRID_SIZE>,
+                             hip::block_y<Y_GRID_SIZE>,
+                             hip::block_z<Z_GRID_SIZE>>;
+template<int X_GRID_SIZE, int Z_GRID_SIZE, int Y_GRID_SIZE>
+using hip_flatten_block_size_xzy_loop =
+    hip_flatten_indexer_loop<hip::block_x<X_GRID_SIZE>,
+                             hip::block_z<Z_GRID_SIZE>,
+                             hip::block_y<Y_GRID_SIZE>>;
+template<int Y_GRID_SIZE, int X_GRID_SIZE, int Z_GRID_SIZE>
+using hip_flatten_block_size_yxz_loop =
+    hip_flatten_indexer_loop<hip::block_y<Y_GRID_SIZE>,
+                             hip::block_x<X_GRID_SIZE>,
+                             hip::block_z<Z_GRID_SIZE>>;
+template<int Y_GRID_SIZE, int Z_GRID_SIZE, int X_GRID_SIZE>
+using hip_flatten_block_size_yzx_loop =
+    hip_flatten_indexer_loop<hip::block_y<Y_GRID_SIZE>,
+                             hip::block_z<Z_GRID_SIZE>,
+                             hip::block_x<X_GRID_SIZE>>;
+template<int Z_GRID_SIZE, int X_GRID_SIZE, int Y_GRID_SIZE>
+using hip_flatten_block_size_zxy_loop =
+    hip_flatten_indexer_loop<hip::block_z<Z_GRID_SIZE>,
+                             hip::block_x<X_GRID_SIZE>,
+                             hip::block_y<Y_GRID_SIZE>>;
+template<int Z_GRID_SIZE, int Y_GRID_SIZE, int X_GRID_SIZE>
+using hip_flatten_block_size_zyx_loop =
+    hip_flatten_indexer_loop<hip::block_z<Z_GRID_SIZE>,
+                             hip::block_y<Y_GRID_SIZE>,
+                             hip::block_x<X_GRID_SIZE>>;
+
+
+template<int X_BLOCK_SIZE, int X_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_x_loop =
+    hip_flatten_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE, int Y_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_y_loop =
+    hip_flatten_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE, int Z_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_z_loop =
+    hip_flatten_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+
+template<int X_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_xy_loop =
+    hip_flatten_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                             hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template<int X_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_xz_loop =
+    hip_flatten_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                             hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_yx_loop =
+    hip_flatten_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                             hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_yz_loop =
+    hip_flatten_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                             hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_zx_loop =
+    hip_flatten_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                             hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_zy_loop =
+    hip_flatten_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                             hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+
+template<int X_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_xyz_loop =
+    hip_flatten_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                             hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                             hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template<int X_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_xzy_loop =
+    hip_flatten_indexer_loop<hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                             hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                             hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_yxz_loop =
+    hip_flatten_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                             hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                             hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>>;
+template<int Y_BLOCK_SIZE,
+         int Z_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_yzx_loop =
+    hip_flatten_indexer_loop<hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                             hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                             hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_zxy_loop =
+    hip_flatten_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                             hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>,
+                             hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>>;
+template<int Z_BLOCK_SIZE,
+         int Y_BLOCK_SIZE,
+         int X_BLOCK_SIZE,
+         int Z_GRID_SIZE = named_usage::unspecified,
+         int Y_GRID_SIZE = named_usage::unspecified,
+         int X_GRID_SIZE = named_usage::unspecified>
+using hip_flatten_global_size_zyx_loop =
+    hip_flatten_indexer_loop<hip::global_z<Z_BLOCK_SIZE, Z_GRID_SIZE>,
+                             hip::global_y<Y_BLOCK_SIZE, Y_GRID_SIZE>,
+                             hip::global_x<X_BLOCK_SIZE, X_GRID_SIZE>>;
 
 
 /*
diff --git a/include/RAJA/policy/hip/raja_hiperrchk.hpp b/include/RAJA/policy/hip/raja_hiperrchk.hpp
index 5e3a02fb2c..da7b6738f3 100644
--- a/include/RAJA/policy/hip/raja_hiperrchk.hpp
+++ b/include/RAJA/policy/hip/raja_hiperrchk.hpp
@@ -42,18 +42,20 @@ namespace RAJA
 ///
 ///////////////////////////////////////////////////////////////////////
 ///
-#define hipErrchk(ans)                            \
-  {                                                \
-    ::RAJA::hipAssert((ans), __FILE__, __LINE__); \
+#define hipErrchk(ans)                                                         \
+  {                                                                            \
+    ::RAJA::hipAssert((ans), __FILE__, __LINE__);                              \
   }
 
 inline void hipAssert(hipError_t code,
-                       const char *file,
-                       int line,
-                       bool abort = true)
+                      const char* file,
+                      int line,
+                      bool abort = true)
 {
-  if (code != hipSuccess) {
-    if (abort) {
+  if (code != hipSuccess)
+  {
+    if (abort)
+    {
       std::string msg;
       msg += "HIPassert: ";
       msg += hipGetErrorString(code);
@@ -62,9 +64,11 @@ inline void hipAssert(hipError_t code,
       msg += ":";
       msg += std::to_string(line);
       throw std::runtime_error(msg);
-    } else {
-      fprintf(stderr, "HIPassert: %s %s %d\n",
-              hipGetErrorString(code), file, line);
+    }
+    else
+    {
+      fprintf(stderr, "HIPassert: %s %s %d\n", hipGetErrorString(code), file,
+              line);
     }
   }
 }
diff --git a/include/RAJA/policy/hip/reduce.hpp b/include/RAJA/policy/hip/reduce.hpp
index c81adf8e24..30d0dc10b2 100644
--- a/include/RAJA/policy/hip/reduce.hpp
+++ b/include/RAJA/policy/hip/reduce.hpp
@@ -56,51 +56,57 @@ namespace hip
 {
 
 //! atomic operator version of Combiner object
-template <typename Combiner>
+template<typename Combiner>
 struct atomic;
 
-template <typename T>
-struct atomic<sum<T>> {
+template<typename T>
+struct atomic<sum<T>>
+{
   RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v)
   {
-    RAJA::atomicAdd(RAJA::hip_atomic{}, &val, v);
+    RAJA::atomicAdd(RAJA::hip_atomic {}, &val, v);
   }
 };
 
-template <typename T>
-struct atomic<min<T>> {
+template<typename T>
+struct atomic<min<T>>
+{
   RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v)
   {
-    RAJA::atomicMin(RAJA::hip_atomic{}, &val, v);
+    RAJA::atomicMin(RAJA::hip_atomic {}, &val, v);
   }
 };
 
-template <typename T>
-struct atomic<max<T>> {
+template<typename T>
+struct atomic<max<T>>
+{
   RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v)
   {
-    RAJA::atomicMax(RAJA::hip_atomic{}, &val, v);
+    RAJA::atomicMax(RAJA::hip_atomic {}, &val, v);
   }
 };
 
-template <typename T>
-struct atomic<and_bit<T>> {
+template<typename T>
+struct atomic<and_bit<T>>
+{
   RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v)
   {
-    RAJA::atomicAnd(RAJA::hip_atomic{}, &val, v);
+    RAJA::atomicAnd(RAJA::hip_atomic {}, &val, v);
   }
 };
 
-template <typename T>
-struct atomic<or_bit<T>> {
+template<typename T>
+struct atomic<or_bit<T>>
+{
   RAJA_DEVICE RAJA_INLINE void operator()(T& val, const T v)
   {
-    RAJA::atomicOr(RAJA::hip_atomic{}, &val, v);
+    RAJA::atomicOr(RAJA::hip_atomic {}, &val, v);
   }
 };
 
-template <typename T>
-struct hip_atomic_available {
+template<typename T>
+struct hip_atomic_available
+{
   static constexpr const bool value =
       (std::is_integral<T>::value && (4 == sizeof(T) || 8 == sizeof(T))) ||
       std::is_same<T, float>::value || std::is_same<T, double>::value;
@@ -118,15 +124,19 @@ namespace impl
 
 //! reduce values in grid into thread 0 of last running block
 //  returns true if put reduced value in val
-template <typename Combiner, typename Accessor,
-          int replication, int atomic_stride,
-          typename T, typename TempIterator>
+template<typename Combiner,
+         typename Accessor,
+         int replication,
+         int atomic_stride,
+         typename T,
+         typename TempIterator>
 RAJA_DEVICE RAJA_INLINE int grid_reduce_last_block(T& val,
-                                        T identity,
-                                        TempIterator in_device_mem,
-                                        unsigned int* device_count)
+                                                   T identity,
+                                                   TempIterator in_device_mem,
+                                                   unsigned int* device_count)
 {
-  typename TempIterator::template rebind_accessor<Accessor> device_mem(in_device_mem);
+  typename TempIterator::template rebind_accessor<Accessor> device_mem(
+      in_device_mem);
 
   int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                  (blockDim.x * blockDim.y) * threadIdx.z;
@@ -137,20 +147,22 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_last_block(T& val,
   int numBlocks = gridDim.x * gridDim.y * gridDim.z;
 
   int replicationId = blockId % replication;
-  int slotId = blockId / replication;
+  int slotId        = blockId / replication;
 
-  int maxNumSlots = (numBlocks + replication - 1) / replication;
+  int maxNumSlots       = (numBlocks + replication - 1) / replication;
   unsigned int numSlots = (numBlocks / replication) +
-      ((replicationId < (numBlocks % replication)) ? 1 : 0);
+                          ((replicationId < (numBlocks % replication)) ? 1 : 0);
 
   int atomicOffset = replicationId * atomic_stride;
-  int beginSlots = replicationId * maxNumSlots;
-  int blockSlot = beginSlots + slotId;
+  int beginSlots   = replicationId * maxNumSlots;
+  int blockSlot    = beginSlots + slotId;
 
   T temp = block_reduce<Combiner>(val, identity);
 
-  if (numSlots <= 1u) {
-    if (threadId == 0) {
+  if (numSlots <= 1u)
+  {
+    if (threadId == 0)
+    {
       val = temp;
     }
     return (threadId == 0) ? replicationId : replication;
@@ -158,33 +170,36 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_last_block(T& val,
 
   // one thread per block writes to device_mem
   __shared__ bool isLastBlock;
-  if (threadId == 0) {
+  if (threadId == 0)
+  {
     device_mem.set(blockSlot, temp);
     // ensure write visible to all threadblocks
     Accessor::fence_release();
     // increment counter, (wraps back to zero if old count == (numSlots-1))
-    unsigned int old_count = ::atomicInc(&device_count[atomicOffset], (numSlots-1));
-    isLastBlock = (old_count == (numSlots-1));
+    unsigned int old_count =
+        ::atomicInc(&device_count[atomicOffset], (numSlots - 1));
+    isLastBlock = (old_count == (numSlots - 1));
   }
 
   // returns non-zero value if any thread passes in a non-zero value
   __syncthreads();
 
   // last block accumulates values from device_mem
-  if (isLastBlock) {
+  if (isLastBlock)
+  {
     temp = identity;
     Accessor::fence_acquire();
 
-    for (unsigned int i = threadId;
-                      i < numSlots;
-                      i += numThreads) {
-      Combiner{}(temp, device_mem.get(beginSlots + i));
+    for (unsigned int i = threadId; i < numSlots; i += numThreads)
+    {
+      Combiner {}(temp, device_mem.get(beginSlots + i));
     }
 
     temp = block_reduce<Combiner>(temp, identity);
 
     // one thread returns value
-    if (threadId == 0) {
+    if (threadId == 0)
+    {
       val = temp;
     }
   }
@@ -192,72 +207,91 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_last_block(T& val,
   return (isLastBlock && threadId == 0) ? replicationId : replication;
 }
 
-namespace expt {
+namespace expt
+{
 
-template <typename ThreadIterationGetter, typename Combiner, typename T>
+template<typename ThreadIterationGetter, typename Combiner, typename T>
 RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity)
 {
   const int numThreads = ThreadIterationGetter::size();
-  const int threadId = ThreadIterationGetter::index();
+  const int threadId   = ThreadIterationGetter::index();
 
-  const int warpId = threadId % RAJA::policy::hip::device_constants.WARP_SIZE;
+  const int warpId  = threadId % RAJA::policy::hip::device_constants.WARP_SIZE;
   const int warpNum = threadId / RAJA::policy::hip::device_constants.WARP_SIZE;
 
   T temp = val;
 
-  if (numThreads % RAJA::policy::hip::device_constants.WARP_SIZE == 0) {
+  if (numThreads % RAJA::policy::hip::device_constants.WARP_SIZE == 0)
+  {
 
     // reduce each warp
-    for (int i = 1; i < RAJA::policy::hip::device_constants.WARP_SIZE; i *= 2) {
+    for (int i = 1; i < RAJA::policy::hip::device_constants.WARP_SIZE; i *= 2)
+    {
       T rhs = RAJA::hip::impl::shfl_xor_sync(temp, i);
-      temp = Combiner{}(temp, rhs);
+      temp  = Combiner {}(temp, rhs);
     }
-
-  } else {
+  }
+  else
+  {
 
     // reduce each warp
-    for (int i = 1; i < RAJA::policy::hip::device_constants.WARP_SIZE; i *= 2) {
+    for (int i = 1; i < RAJA::policy::hip::device_constants.WARP_SIZE; i *= 2)
+    {
       int srcLane = threadId ^ i;
-      T rhs = RAJA::hip::impl::shfl_sync(temp, srcLane);
+      T rhs       = RAJA::hip::impl::shfl_sync(temp, srcLane);
       // only add from threads that exist (don't double count own value)
-      if (srcLane < numThreads) {
-        temp = Combiner{}(temp, rhs);
+      if (srcLane < numThreads)
+      {
+        temp = Combiner {}(temp, rhs);
       }
     }
   }
 
-  static_assert(RAJA::policy::hip::device_constants.MAX_WARPS <= RAJA::policy::hip::device_constants.WARP_SIZE,
-               "Max Warps must be less than or equal to Warp Size for this algorithm to work");
+  static_assert(RAJA::policy::hip::device_constants.MAX_WARPS <=
+                    RAJA::policy::hip::device_constants.WARP_SIZE,
+                "Max Warps must be less than or equal to Warp Size for this "
+                "algorithm to work");
 
   // reduce per warp values
-  if (numThreads > RAJA::policy::hip::device_constants.WARP_SIZE) {
+  if (numThreads > RAJA::policy::hip::device_constants.WARP_SIZE)
+  {
 
     // Need to separate declaration and initialization for clang-hip
-    __shared__ unsigned char tmpsd[sizeof(RAJA::detail::SoAArray<T, RAJA::policy::hip::device_constants.MAX_WARPS>)];
+    __shared__ unsigned char tmpsd[sizeof(
+        RAJA::detail::SoAArray<T,
+                               RAJA::policy::hip::device_constants.MAX_WARPS>)];
 
     // Partial placement new: Should call new(tmpsd) here but recasting memory
     // to avoid calling constructor/destructor in shared memory.
-    RAJA::detail::SoAArray<T, RAJA::policy::hip::device_constants.MAX_WARPS> * sd = reinterpret_cast<RAJA::detail::SoAArray<T, RAJA::policy::hip::device_constants.MAX_WARPS> *>(tmpsd);
+    RAJA::detail::SoAArray<T, RAJA::policy::hip::device_constants.MAX_WARPS>*
+        sd = reinterpret_cast<RAJA::detail::SoAArray<
+            T, RAJA::policy::hip::device_constants.MAX_WARPS>*>(tmpsd);
 
     // write per warp values to shared memory
-    if (warpId == 0) {
+    if (warpId == 0)
+    {
       sd->set(warpNum, temp);
     }
 
     __syncthreads();
 
-    if (warpNum == 0) {
+    if (warpNum == 0)
+    {
 
       // read per warp values
-      if (warpId * RAJA::policy::hip::device_constants.WARP_SIZE < numThreads) {
+      if (warpId * RAJA::policy::hip::device_constants.WARP_SIZE < numThreads)
+      {
         temp = sd->get(warpId);
-      } else {
+      }
+      else
+      {
         temp = identity;
       }
 
-      for (int i = 1; i < RAJA::policy::hip::device_constants.MAX_WARPS; i *= 2) {
+      for (int i = 1; i < RAJA::policy::hip::device_constants.MAX_WARPS; i *= 2)
+      {
         T rhs = RAJA::hip::impl::shfl_xor_sync(temp, i);
-        temp = Combiner{}(temp, rhs);
+        temp  = Combiner {}(temp, rhs);
       }
     }
 
@@ -267,69 +301,77 @@ RAJA_DEVICE RAJA_INLINE T block_reduce(T val, T identity)
   return temp;
 }
 
-
-template <typename GlobalIterationGetter, typename OP, typename T>
-RAJA_DEVICE RAJA_INLINE void grid_reduce( T * device_target,
-                                          T val,
-                                          RAJA::detail::SoAPtr<T,RAJA::hip::device_mempool_type> device_mem,
-                                          unsigned int * device_count)
+template<typename GlobalIterationGetter, typename OP, typename T>
+RAJA_DEVICE RAJA_INLINE void grid_reduce(
+    T* device_target,
+    T val,
+    RAJA::detail::SoAPtr<T, RAJA::hip::device_mempool_type> device_mem,
+    unsigned int* device_count)
 {
-  using BlockIterationGetter = typename get_index_block<GlobalIterationGetter>::type;
-  using ThreadIterationGetter = typename get_index_thread<GlobalIterationGetter>::type;
+  using BlockIterationGetter =
+      typename get_index_block<GlobalIterationGetter>::type;
+  using ThreadIterationGetter =
+      typename get_index_thread<GlobalIterationGetter>::type;
 
-  const int numBlocks = BlockIterationGetter::size();
-  const int numThreads = ThreadIterationGetter::size();
+  const int numBlocks            = BlockIterationGetter::size();
+  const int numThreads           = ThreadIterationGetter::size();
   const unsigned int wrap_around = numBlocks - 1;
 
-  const int blockId = BlockIterationGetter::index();
+  const int blockId  = BlockIterationGetter::index();
   const int threadId = ThreadIterationGetter::index();
 
   T temp = block_reduce<ThreadIterationGetter, OP>(val, OP::identity());
 
   // one thread per block writes to device_mem
   bool lastBlock = false;
-  if (threadId == 0) {
+  if (threadId == 0)
+  {
     device_mem.set(blockId, temp);
     // ensure write visible to all threadblocks
     __threadfence();
     // increment counter, (wraps back to zero if old count == wrap_around)
     unsigned int old_count = ::atomicInc(device_count, wrap_around);
-    lastBlock = (old_count == wrap_around);
+    lastBlock              = (old_count == wrap_around);
   }
 
   // returns non-zero value if any thread passes in a non-zero value
   lastBlock = __syncthreads_or(lastBlock);
 
   // last block accumulates values from device_mem
-  if (lastBlock) {
+  if (lastBlock)
+  {
     temp = OP::identity();
     __threadfence();
 
-    for (int i = threadId; i < numBlocks; i += numThreads) {
-      temp = OP{}(temp, device_mem.get(i));
+    for (int i = threadId; i < numBlocks; i += numThreads)
+    {
+      temp = OP {}(temp, device_mem.get(i));
     }
 
     temp = block_reduce<ThreadIterationGetter, OP>(temp, OP::identity());
 
     // one thread returns value
-    if (threadId == 0) {
+    if (threadId == 0)
+    {
       *device_target = temp;
     }
   }
 }
 
-} //  namespace expt
-
+}  //  namespace expt
 
 //! reduce values in grid into thread 0 of last running block
 //  returns true if put reduced value in val
-template <typename Combiner, typename Accessor,
-          int replication, int atomic_stride,
-          typename T>
-RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(T& val,
-                                               T identity,
-                                               T* device_mem,
-                                               unsigned int* device_count)
+template<typename Combiner,
+         typename Accessor,
+         int replication,
+         int atomic_stride,
+         typename T>
+RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(
+    T& val,
+    T identity,
+    T* device_mem,
+    unsigned int* device_count)
 {
   int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                  (blockDim.x * blockDim.y) * threadIdx.z;
@@ -338,24 +380,28 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(T& val,
                 (gridDim.x * gridDim.y) * blockIdx.z;
   int numBlocks = gridDim.x * gridDim.y * gridDim.z;
 
-  int replicationId = (blockId%replication);
-  int atomicOffset = replicationId*atomic_stride;
+  int replicationId = (blockId % replication);
+  int atomicOffset  = replicationId * atomic_stride;
 
   unsigned int numSlots = (numBlocks / replication) +
-      ((replicationId < (numBlocks % replication)) ? 1 : 0);
+                          ((replicationId < (numBlocks % replication)) ? 1 : 0);
 
-  if (numSlots <= 1u) {
+  if (numSlots <= 1u)
+  {
     T temp = block_reduce<Combiner>(val, identity);
-    if (threadId == 0) {
+    if (threadId == 0)
+    {
       val = temp;
     }
     return (threadId == 0) ? replicationId : replication;
   }
 
   // the first block of each replication initializes device_mem
-  if (threadId == 0) {
+  if (threadId == 0)
+  {
     unsigned int old_val = ::atomicCAS(&device_count[atomicOffset], 0u, 1u);
-    if (old_val == 0u) {
+    if (old_val == 0u)
+    {
       Accessor::set(device_mem, atomicOffset, identity);
       Accessor::fence_release();
       ::atomicAdd(&device_count[atomicOffset], 1u);
@@ -366,19 +412,22 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(T& val,
 
   // one thread per block performs an atomic on device_mem
   bool isLastBlock = false;
-  if (threadId == 0) {
+  if (threadId == 0)
+  {
     // wait for device_mem to be initialized
     while (::atomicAdd(&device_count[atomicOffset], 0u) < 2u)
       ;
     Accessor::fence_acquire();
-    RAJA::reduce::hip::atomic<Combiner>{}(device_mem[atomicOffset], temp);
+    RAJA::reduce::hip::atomic<Combiner> {}(device_mem[atomicOffset], temp);
     Accessor::fence_release();
     // increment counter, (wraps back to zero if old count == (numSlots+1))
-    unsigned int old_count = ::atomicInc(&device_count[atomicOffset], (numSlots+1));
-    isLastBlock = (old_count == (numSlots+1));
+    unsigned int old_count =
+        ::atomicInc(&device_count[atomicOffset], (numSlots + 1));
+    isLastBlock = (old_count == (numSlots + 1));
 
     // the last block for each replication gets the value from device_mem
-    if (isLastBlock) {
+    if (isLastBlock)
+    {
       Accessor::fence_acquire();
       val = Accessor::get(device_mem, atomicOffset);
     }
@@ -388,10 +437,10 @@ RAJA_DEVICE RAJA_INLINE int grid_reduce_atomic_device_init(T& val,
 }
 
 //! reduce values in block into thread 0 and atomically combines into device_mem
-template <typename Combiner, int replication, int atomic_stride, typename T>
+template<typename Combiner, int replication, int atomic_stride, typename T>
 RAJA_DEVICE RAJA_INLINE void grid_reduce_atomic_host_init(T& val,
-                                                            T identity,
-                                                            T* device_mem)
+                                                          T identity,
+                                                          T* device_mem)
 {
   int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                  (blockDim.x * blockDim.y) * threadIdx.z;
@@ -399,33 +448,36 @@ RAJA_DEVICE RAJA_INLINE void grid_reduce_atomic_host_init(T& val,
   int blockId = blockIdx.x + gridDim.x * blockIdx.y +
                 (gridDim.x * gridDim.y) * blockIdx.z;
 
-  int replicationId = (blockId%replication);
-  int atomicOffset = replicationId*atomic_stride;
+  int replicationId = (blockId % replication);
+  int atomicOffset  = replicationId * atomic_stride;
 
   T temp = block_reduce<Combiner>(val, identity);
 
   // one thread per block performs an atomic on device_mem
-  if (threadId == 0 && temp != identity) {
-    RAJA::reduce::hip::atomic<Combiner>{}(device_mem[atomicOffset], temp);
+  if (threadId == 0 && temp != identity)
+  {
+    RAJA::reduce::hip::atomic<Combiner> {}(device_mem[atomicOffset], temp);
   }
-
 }
 
 }  // namespace impl
 
 //! Object that manages pinned memory buffers for reduction results
 //  use one per reducer object
-template <typename T, size_t num_slots, typename mempool>
+template<typename T, size_t num_slots, typename mempool>
 class PinnedTally
 {
 public:
   //! Object put in Pinned memory with value and pointer to next Node
-  struct Node {
+  struct Node
+  {
     Node* next;
     T values[num_slots];
   };
+
   //! Object per resource to keep track of pinned memory nodes
-  struct ResourceNode {
+  struct ResourceNode
+  {
     ResourceNode* next;
     ::RAJA::resources::Hip res;
     Node* node_list;
@@ -478,14 +530,19 @@ class PinnedTally
 
     const ResourceNodeIterator& operator++()
     {
-      if (m_n->next) {
+      if (m_n->next)
+      {
         m_n = m_n->next;
-      } else if (m_rn->next) {
+      }
+      else if (m_rn->next)
+      {
         m_rn = m_rn->next;
-        m_n = m_rn->node_list;
-      } else {
+        m_n  = m_rn->node_list;
+      }
+      else
+      {
         m_rn = nullptr;
-        m_n = nullptr;
+        m_n  = nullptr;
       }
       return *this;
     }
@@ -497,7 +554,7 @@ class PinnedTally
       return ret;
     }
 
-    auto operator*() -> T(&)[num_slots] { return m_n->values; }
+    auto operator*() -> T (&)[num_slots] { return m_n->values; }
 
     bool operator==(const ResourceNodeIterator& rhs) const
     {
@@ -534,25 +591,27 @@ class PinnedTally
   ResourceNodeIterator end() { return {nullptr, nullptr}; }
 
   //! get new value for use in resource
-  auto new_value(::RAJA::resources::Hip res) -> T(&)[num_slots]
+  auto new_value(::RAJA::resources::Hip res) -> T (&)[num_slots]
   {
 #if defined(RAJA_ENABLE_OPENMP)
     lock_guard<omp::mutex> lock(m_mutex);
 #endif
     ResourceNode* rn = resource_list;
-    while (rn) {
+    while (rn)
+    {
       if (rn->res.get_stream() == res.get_stream()) break;
       rn = rn->next;
     }
-    if (!rn) {
-      rn = (ResourceNode*)malloc(sizeof(ResourceNode));
-      rn->next = resource_list;
-      rn->res = res;
+    if (!rn)
+    {
+      rn            = (ResourceNode*)malloc(sizeof(ResourceNode));
+      rn->next      = resource_list;
+      rn->res       = res;
       rn->node_list = nullptr;
       resource_list = rn;
     }
-    Node* n = mempool::getInstance().template malloc<Node>(1);
-    n->next = rn->node_list;
+    Node* n       = mempool::getInstance().template malloc<Node>(1);
+    n->next       = rn->node_list;
     rn->node_list = n;
     return n->values;
   }
@@ -561,7 +620,8 @@ class PinnedTally
   void synchronize_resources()
   {
     auto end = resourceEnd();
-    for (auto r = resourceBegin(); r != end; ++r) {
+    for (auto r = resourceBegin(); r != end; ++r)
+    {
       ::RAJA::hip::synchronize(*r);
     }
   }
@@ -569,10 +629,12 @@ class PinnedTally
   //! all values used in all resources
   void free_list()
   {
-    while (resource_list) {
+    while (resource_list)
+    {
       ResourceNode* rn = resource_list;
-      while (rn->node_list) {
-        Node* n = rn->node_list;
+      while (rn->node_list)
+      {
+        Node* n       = rn->node_list;
         rn->node_list = n->next;
         mempool::getInstance().free(n);
       }
@@ -601,12 +663,15 @@ class PinnedTally
 
 //! Reduction data for Hip Offload -- stores value, host pointer, and device
 //! pointer
-template <typename Combiner, typename Accessor, typename T,
-          size_t replication, size_t atomic_stride>
+template<typename Combiner,
+         typename Accessor,
+         typename T,
+         size_t replication,
+         size_t atomic_stride>
 struct ReduceLastBlock_Data
 {
   using tally_mempool_type = pinned_mempool_type;
-  using data_mempool_type = device_mempool_type;
+  using data_mempool_type  = device_mempool_type;
   using count_mempool_type = device_zeroed_mempool_type;
 
   static constexpr size_t tally_slots = replication;
@@ -617,7 +682,7 @@ struct ReduceLastBlock_Data
   RAJA::detail::SoAPtr<T, data_mempool_type> device;
   bool own_device_ptr;
 
-  ReduceLastBlock_Data() : ReduceLastBlock_Data(T(), T()){};
+  ReduceLastBlock_Data() : ReduceLastBlock_Data(T(), T()) {};
 
   /*! \brief create from a default value and offload information
    *
@@ -625,31 +690,30 @@ struct ReduceLastBlock_Data
    */
 
   ReduceLastBlock_Data(T initValue, T identity_)
-      : value{initValue},
-        identity{identity_},
-        device_count{nullptr},
-        device{},
-        own_device_ptr{false}
-  {
-  }
+      : value {initValue},
+        identity {identity_},
+        device_count {nullptr},
+        device {},
+        own_device_ptr {false}
+  {}
 
   RAJA_HOST_DEVICE
   ReduceLastBlock_Data(const ReduceLastBlock_Data& other)
-      : value{other.identity},
-        identity{other.identity},
-        device_count{other.device_count},
-        device{other.device},
-        own_device_ptr{false}
-  {
-  }
+      : value {other.identity},
+        identity {other.identity},
+        device_count {other.device_count},
+        device {other.device},
+        own_device_ptr {false}
+  {}
 
   ReduceLastBlock_Data& operator=(const ReduceLastBlock_Data&) = default;
 
   //! initialize output to identity to ensure never read
   //  uninitialized memory
-  T* init_grid_vals(T(&output)[tally_slots])
+  T* init_grid_vals(T (&output)[tally_slots])
   {
-    for (size_t r = 0; r < tally_slots; ++r) {
+    for (size_t r = 0; r < tally_slots; ++r)
+    {
       output[r] = identity;
     }
     return &output[0];
@@ -660,10 +724,12 @@ struct ReduceLastBlock_Data
   void grid_reduce(T* output)
   {
     T temp = value;
-    size_t replicationId = impl::grid_reduce_last_block<
-        Combiner, Accessor, replication, atomic_stride>(
-          temp, identity, device, device_count);
-    if (replicationId != replication) {
+    size_t replicationId =
+        impl::grid_reduce_last_block<Combiner, Accessor, replication,
+                                     atomic_stride>(temp, identity, device,
+                                                    device_count);
+    if (replicationId != replication)
+    {
       output[replicationId] = temp;
     }
   }
@@ -673,13 +739,15 @@ struct ReduceLastBlock_Data
   bool setupForDevice()
   {
     bool act = !device.allocated() && setupReducers();
-    if (act) {
-      hip_dim_t gridDim = currentGridDim();
-      size_t numBlocks = gridDim.x * gridDim.y * gridDim.z;
+    if (act)
+    {
+      hip_dim_t gridDim  = currentGridDim();
+      size_t numBlocks   = gridDim.x * gridDim.y * gridDim.z;
       size_t maxNumSlots = (numBlocks + replication - 1) / replication;
-      device.allocate(maxNumSlots*replication);
-      device_count = count_mempool_type::getInstance()
-                         .template malloc<unsigned int>(replication*atomic_stride);
+      device.allocate(maxNumSlots * replication);
+      device_count =
+          count_mempool_type::getInstance().template malloc<unsigned int>(
+              replication * atomic_stride);
       own_device_ptr = true;
     }
     return act;
@@ -690,20 +758,22 @@ struct ReduceLastBlock_Data
   bool teardownForDevice()
   {
     bool act = own_device_ptr;
-    if (act) {
+    if (act)
+    {
       device.deallocate();
       count_mempool_type::getInstance().free(device_count);
-      device_count = nullptr;
+      device_count   = nullptr;
       own_device_ptr = false;
     }
     return act;
   }
 };
 
-
 //! Reduction data for Hip Offload -- stores value, host pointer
-template <typename Combiner, typename T,
-          size_t replication, size_t atomic_stride>
+template<typename Combiner,
+         typename T,
+         size_t replication,
+         size_t atomic_stride>
 struct ReduceAtomicHostInit_Data
 {
   using tally_mempool_type = device_pinned_mempool_type;
@@ -715,32 +785,32 @@ struct ReduceAtomicHostInit_Data
   bool is_setup;
   bool own_device_ptr;
 
-  ReduceAtomicHostInit_Data() : ReduceAtomicHostInit_Data(T(), T()){}
+  ReduceAtomicHostInit_Data() : ReduceAtomicHostInit_Data(T(), T()) {}
 
   ReduceAtomicHostInit_Data(T initValue, T identity_)
-      : value{initValue},
-        identity{identity_},
-        is_setup{false},
-        own_device_ptr{false}
-  {
-  }
+      : value {initValue},
+        identity {identity_},
+        is_setup {false},
+        own_device_ptr {false}
+  {}
 
   RAJA_HOST_DEVICE
   ReduceAtomicHostInit_Data(const ReduceAtomicHostInit_Data& other)
-      : value{other.identity},
-        identity{other.identity},
-        is_setup{other.is_setup},
-        own_device_ptr{false}
-  {
-  }
+      : value {other.identity},
+        identity {other.identity},
+        is_setup {other.is_setup},
+        own_device_ptr {false}
+  {}
 
-  ReduceAtomicHostInit_Data& operator=(const ReduceAtomicHostInit_Data&) = default;
+  ReduceAtomicHostInit_Data& operator=(const ReduceAtomicHostInit_Data&) =
+      default;
 
   //! initialize output to identity to ensure never read
   //  uninitialized memory
-  T* init_grid_vals(T(&output)[tally_slots])
+  T* init_grid_vals(T (&output)[tally_slots])
   {
-    for (size_t r = 0; r < tally_slots; ++r) {
+    for (size_t r = 0; r < tally_slots; ++r)
+    {
       output[r] = identity;
     }
     return &output[0];
@@ -753,7 +823,7 @@ struct ReduceAtomicHostInit_Data
     T temp = value;
 
     impl::grid_reduce_atomic_host_init<Combiner, replication, atomic_stride>(
-            temp, identity, output);
+        temp, identity, output);
   }
 
   //! check and setup for device
@@ -761,8 +831,9 @@ struct ReduceAtomicHostInit_Data
   bool setupForDevice()
   {
     bool act = !is_setup && setupReducers();
-    if (act) {
-      is_setup = true;
+    if (act)
+    {
+      is_setup       = true;
       own_device_ptr = true;
     }
     return act;
@@ -773,8 +844,9 @@ struct ReduceAtomicHostInit_Data
   bool teardownForDevice()
   {
     bool act = own_device_ptr;
-    if (act) {
-      is_setup = false;
+    if (act)
+    {
+      is_setup       = false;
       own_device_ptr = false;
     }
     return act;
@@ -782,12 +854,15 @@ struct ReduceAtomicHostInit_Data
 };
 
 //! Reduction data for Hip Offload -- stores value, host pointer
-template <typename Combiner, typename Accessor, typename T,
-          size_t replication, size_t atomic_stride>
+template<typename Combiner,
+         typename Accessor,
+         typename T,
+         size_t replication,
+         size_t atomic_stride>
 struct ReduceAtomicDeviceInit_Data
 {
   using tally_mempool_type = pinned_mempool_type;
-  using data_mempool_type = device_mempool_type;
+  using data_mempool_type  = device_mempool_type;
   using count_mempool_type = device_zeroed_mempool_type;
 
   static constexpr size_t tally_slots = replication;
@@ -798,34 +873,34 @@ struct ReduceAtomicDeviceInit_Data
   T* device;
   bool own_device_ptr;
 
-  ReduceAtomicDeviceInit_Data() : ReduceAtomicDeviceInit_Data(T(), T()){}
+  ReduceAtomicDeviceInit_Data() : ReduceAtomicDeviceInit_Data(T(), T()) {}
 
   ReduceAtomicDeviceInit_Data(T initValue, T identity_)
-      : value{initValue},
-        identity{identity_},
-        device_count{nullptr},
-        device{nullptr},
-        own_device_ptr{false}
-  {
-  }
+      : value {initValue},
+        identity {identity_},
+        device_count {nullptr},
+        device {nullptr},
+        own_device_ptr {false}
+  {}
 
   RAJA_HOST_DEVICE
   ReduceAtomicDeviceInit_Data(const ReduceAtomicDeviceInit_Data& other)
-      : value{other.identity},
-        identity{other.identity},
-        device_count{other.device_count},
-        device{other.device},
-        own_device_ptr{false}
-  {
-  }
+      : value {other.identity},
+        identity {other.identity},
+        device_count {other.device_count},
+        device {other.device},
+        own_device_ptr {false}
+  {}
 
-  ReduceAtomicDeviceInit_Data& operator=(const ReduceAtomicDeviceInit_Data&) = default;
+  ReduceAtomicDeviceInit_Data& operator=(const ReduceAtomicDeviceInit_Data&) =
+      default;
 
   //! initialize output to identity to ensure never read
   //  uninitialized memory
-  T* init_grid_vals(T(&output)[tally_slots])
+  T* init_grid_vals(T (&output)[tally_slots])
   {
-    for (size_t r = 0; r < tally_slots; ++r) {
+    for (size_t r = 0; r < tally_slots; ++r)
+    {
       output[r] = identity;
     }
     return &output[0];
@@ -837,10 +912,12 @@ struct ReduceAtomicDeviceInit_Data
   {
     T temp = value;
 
-    size_t replicationId = impl::grid_reduce_atomic_device_init<
-        Combiner, Accessor, replication, atomic_stride>(
-          temp, identity, device, device_count);
-    if (replicationId != replication) {
+    size_t replicationId =
+        impl::grid_reduce_atomic_device_init<Combiner, Accessor, replication,
+                                             atomic_stride>(
+            temp, identity, device, device_count);
+    if (replicationId != replication)
+    {
       output[replicationId] = temp;
     }
   }
@@ -850,10 +927,13 @@ struct ReduceAtomicDeviceInit_Data
   bool setupForDevice()
   {
     bool act = !device && setupReducers();
-    if (act) {
-      device = data_mempool_type::getInstance().template malloc<T>(replication*atomic_stride);
-      device_count = count_mempool_type::getInstance()
-                         .template malloc<unsigned int>(replication*atomic_stride);
+    if (act)
+    {
+      device = data_mempool_type::getInstance().template malloc<T>(
+          replication * atomic_stride);
+      device_count =
+          count_mempool_type::getInstance().template malloc<unsigned int>(
+              replication * atomic_stride);
       own_device_ptr = true;
     }
     return act;
@@ -864,65 +944,93 @@ struct ReduceAtomicDeviceInit_Data
   bool teardownForDevice()
   {
     bool act = own_device_ptr;
-    if (act) {
+    if (act)
+    {
       data_mempool_type::getInstance().free(device);
       device = nullptr;
       count_mempool_type::getInstance().free(device_count);
-      device_count = nullptr;
+      device_count   = nullptr;
       own_device_ptr = false;
     }
     return act;
   }
 };
 
-
 //! Hip Reduction entity -- generalize on reduction, and type
-template <typename Combiner, typename T, typename tuning>
+template<typename Combiner, typename T, typename tuning>
 class Reduce
 {
-  static constexpr size_t replication = (tuning::replication > 0)
-      ? tuning::replication
-      : 32;
-  static constexpr size_t atomic_stride = (tuning::atomic_stride > 0)
-      ? tuning::atomic_stride
-      : ((policy::hip::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T))
-        ? RAJA_DIVIDE_CEILING_INT(policy::hip::device_constants.ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE, sizeof(T))
-        : 1);
-
-  using Accessor = std::conditional_t<(tuning::comm_mode == block_communication_mode::block_fence),
+  static constexpr size_t replication =
+      (tuning::replication > 0) ? tuning::replication : 32;
+  static constexpr size_t atomic_stride =
+      (tuning::atomic_stride > 0)
+          ? tuning::atomic_stride
+          : ((policy::hip::device_constants
+                  .ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE > sizeof(T))
+                 ? RAJA_DIVIDE_CEILING_INT(
+                       policy::hip::device_constants
+                           .ATOMIC_DESTRUCTIVE_INTERFERENCE_SIZE,
+                       sizeof(T))
+                 : 1);
+
+  using Accessor = std::conditional_t<
+      (tuning::comm_mode == block_communication_mode::block_fence),
       impl::AccessorDeviceScopeUseBlockFence,
-      std::conditional_t<(tuning::comm_mode == block_communication_mode::device_fence),
-        impl::AccessorDeviceScopeUseDeviceFence,
-        void>>;
+      std::conditional_t<(tuning::comm_mode ==
+                          block_communication_mode::device_fence),
+                         impl::AccessorDeviceScopeUseDeviceFence,
+                         void>>;
 
   static constexpr bool atomic_policy =
-      (tuning::algorithm == reduce_algorithm::init_device_combine_atomic_block) ||
+      (tuning::algorithm ==
+       reduce_algorithm::init_device_combine_atomic_block) ||
       (tuning::algorithm == reduce_algorithm::init_host_combine_atomic_block);
-  static constexpr bool atomic_available = RAJA::reduce::hip::hip_atomic_available<T>::value;
+  static constexpr bool atomic_available =
+      RAJA::reduce::hip::hip_atomic_available<T>::value;
 
   //! hip reduction data storage class and folding algorithm
-  using reduce_data_type = std::conditional_t<(tuning::algorithm == reduce_algorithm::combine_last_block) ||
-                                              (atomic_policy && !atomic_available),
-      hip::ReduceLastBlock_Data<Combiner, Accessor, T, replication, atomic_stride>,
-      std::conditional_t<atomic_available,
-        std::conditional_t<(tuning::algorithm == reduce_algorithm::init_device_combine_atomic_block),
-          hip::ReduceAtomicDeviceInit_Data<Combiner, Accessor, T, replication, atomic_stride>,
-          std::conditional_t<(tuning::algorithm == reduce_algorithm::init_host_combine_atomic_block),
-            hip::ReduceAtomicHostInit_Data<Combiner, T, replication, atomic_stride>,
-            void>>,
-        void>>;
+  using reduce_data_type = std::conditional_t<
+      (tuning::algorithm == reduce_algorithm::combine_last_block) ||
+          (atomic_policy && !atomic_available),
+      hip::ReduceLastBlock_Data<Combiner,
+                                Accessor,
+                                T,
+                                replication,
+                                atomic_stride>,
+      std::conditional_t<
+          atomic_available,
+          std::conditional_t<
+              (tuning::algorithm ==
+               reduce_algorithm::init_device_combine_atomic_block),
+              hip::ReduceAtomicDeviceInit_Data<Combiner,
+                                               Accessor,
+                                               T,
+                                               replication,
+                                               atomic_stride>,
+              std::conditional_t<
+                  (tuning::algorithm ==
+                   reduce_algorithm::init_host_combine_atomic_block),
+                  hip::ReduceAtomicHostInit_Data<Combiner,
+                                                 T,
+                                                 replication,
+                                                 atomic_stride>,
+                  void>>,
+          void>>;
 
   static constexpr size_t tally_slots = reduce_data_type::tally_slots;
 
-  using TallyType = PinnedTally<T, tally_slots, typename reduce_data_type::tally_mempool_type>;
+  using TallyType = PinnedTally<T,
+                                tally_slots,
+                                typename reduce_data_type::tally_mempool_type>;
 
   //! union to hold either pointer to PinnedTally or pointer to value
   //  only use list before setup for device and only use val_ptr after
-  union tally_u {
+  union tally_u
+  {
     TallyType* list;
     T* val_ptr;
-    constexpr tally_u(TallyType* l) : list(l){};
-    constexpr tally_u(T* v_ptr) : val_ptr(v_ptr){};
+    constexpr tally_u(TallyType* l) : list(l) {};
+    constexpr tally_u(T* v_ptr) : val_ptr(v_ptr) {};
   };
 
 public:
@@ -931,11 +1039,10 @@ class Reduce
   //! create a reduce object
   //  the original object's parent is itself
   explicit Reduce(T init_val, T identity_ = Combiner::identity())
-      : parent{this},
-        tally_or_val_ptr{new TallyType},
+      : parent {this},
+        tally_or_val_ptr {new TallyType},
         val(init_val, identity_)
-  {
-  }
+  {}
 
   void reset(T in_val, T identity_ = Combiner::identity())
   {
@@ -949,16 +1056,18 @@ class Reduce
   RAJA_HOST_DEVICE
   Reduce(const Reduce& other)
 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
-      : parent{other.parent},
+      : parent {other.parent},
 #else
-      : parent{&other},
+      : parent {&other},
 #endif
-        tally_or_val_ptr{other.tally_or_val_ptr},
+        tally_or_val_ptr {other.tally_or_val_ptr},
         val(other.val)
   {
 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
-    if (parent) {
-      if (val.setupForDevice()) {
+    if (parent)
+    {
+      if (val.setupForDevice())
+      {
         tally_or_val_ptr.val_ptr = val.init_grid_vals(
             tally_or_val_ptr.list->new_value(currentResource()));
         parent = nullptr;
@@ -973,25 +1082,35 @@ class Reduce
   ~Reduce()
   {
 #if !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
-    if (parent == this) {
+    if (parent == this)
+    {
       delete tally_or_val_ptr.list;
       tally_or_val_ptr.list = nullptr;
-    } else if (parent) {
-      if (val.value != val.identity) {
+    }
+    else if (parent)
+    {
+      if (val.value != val.identity)
+      {
 #if defined(RAJA_ENABLE_OPENMP)
         lock_guard<omp::mutex> lock(tally_or_val_ptr.list->m_mutex);
 #endif
         parent->combine(val.value);
       }
-    } else {
-      if (val.teardownForDevice()) {
+    }
+    else
+    {
+      if (val.teardownForDevice())
+      {
         tally_or_val_ptr.val_ptr = nullptr;
       }
     }
 #else
-    if (!parent->parent) {
+    if (!parent->parent)
+    {
       val.grid_reduce(tally_or_val_ptr.val_ptr);
-    } else {
+    }
+    else
+    {
       parent->combine(val.value);
     }
 #endif
@@ -1000,15 +1119,18 @@ class Reduce
   //! map result value back to host if not done already; return aggregate value
   operator T()
   {
-    auto n = tally_or_val_ptr.list->begin();
+    auto n   = tally_or_val_ptr.list->begin();
     auto end = tally_or_val_ptr.list->end();
-    if (n != end) {
+    if (n != end)
+    {
       tally_or_val_ptr.list->synchronize_resources();
       ::RAJA::detail::HighAccuracyReduce<T, typename Combiner::operator_type>
           reducer(std::move(val.value));
-      for (; n != end; ++n) {
+      for (; n != end; ++n)
+      {
         T(&values)[tally_slots] = *n;
-        for (size_t r = 0; r < tally_slots; ++r) {
+        for (size_t r = 0; r < tally_slots; ++r)
+        {
           reducer.combine(std::move(values[r]));
         }
       }
@@ -1017,12 +1139,13 @@ class Reduce
     }
     return val.value;
   }
+
   //! alias for operator T()
   T get() { return operator T(); }
 
   //! apply reduction (const version) -- still combines internal values
   RAJA_HOST_DEVICE
-  void combine(T other) const { Combiner{}(val.value, other); }
+  void combine(T other) const { Combiner {}(val.value, other); }
 
   /*!
    *  \return reference to the local value
@@ -1040,7 +1163,7 @@ class Reduce
 }  // end namespace hip
 
 //! specialization of ReduceSum for hip_reduce
-template <typename tuning, typename T>
+template<typename tuning, typename T>
 class ReduceSum<RAJA::policy::hip::hip_reduce_policy<tuning>, T>
     : public hip::Reduce<RAJA::reduce::sum<T>, T, tuning>
 {
@@ -1048,6 +1171,7 @@ class ReduceSum<RAJA::policy::hip::hip_reduce_policy<tuning>, T>
 public:
   using Base = hip::Reduce<RAJA::reduce::sum<T>, T, tuning>;
   using Base::Base;
+
   //! enable operator+= for ReduceSum -- alias for combine()
   RAJA_HOST_DEVICE
   const ReduceSum& operator+=(T rhs) const
@@ -1058,7 +1182,7 @@ class ReduceSum<RAJA::policy::hip::hip_reduce_policy<tuning>, T>
 };
 
 //! specialization of ReduceBitOr for hip_reduce
-template <typename tuning, typename T>
+template<typename tuning, typename T>
 class ReduceBitOr<RAJA::policy::hip::hip_reduce_policy<tuning>, T>
     : public hip::Reduce<RAJA::reduce::or_bit<T>, T, tuning>
 {
@@ -1066,6 +1190,7 @@ class ReduceBitOr<RAJA::policy::hip::hip_reduce_policy<tuning>, T>
 public:
   using Base = hip::Reduce<RAJA::reduce::or_bit<T>, T, tuning>;
   using Base::Base;
+
   //! enable operator|= for ReduceBitOr -- alias for combine()
   RAJA_HOST_DEVICE
   const ReduceBitOr& operator|=(T rhs) const
@@ -1076,7 +1201,7 @@ class ReduceBitOr<RAJA::policy::hip::hip_reduce_policy<tuning>, T>
 };
 
 //! specialization of ReduceBitAnd for hip_reduce
-template <typename tuning, typename T>
+template<typename tuning, typename T>
 class ReduceBitAnd<RAJA::policy::hip::hip_reduce_policy<tuning>, T>
     : public hip::Reduce<RAJA::reduce::and_bit<T>, T, tuning>
 {
@@ -1084,6 +1209,7 @@ class ReduceBitAnd<RAJA::policy::hip::hip_reduce_policy<tuning>, T>
 public:
   using Base = hip::Reduce<RAJA::reduce::and_bit<T>, T, tuning>;
   using Base::Base;
+
   //! enable operator&= for ReduceBitAnd -- alias for combine()
   RAJA_HOST_DEVICE
   const ReduceBitAnd& operator&=(T rhs) const
@@ -1094,7 +1220,7 @@ class ReduceBitAnd<RAJA::policy::hip::hip_reduce_policy<tuning>, T>
 };
 
 //! specialization of ReduceMin for hip_reduce
-template <typename tuning, typename T>
+template<typename tuning, typename T>
 class ReduceMin<RAJA::policy::hip::hip_reduce_policy<tuning>, T>
     : public hip::Reduce<RAJA::reduce::min<T>, T, tuning>
 {
@@ -1102,6 +1228,7 @@ class ReduceMin<RAJA::policy::hip::hip_reduce_policy<tuning>, T>
 public:
   using Base = hip::Reduce<RAJA::reduce::min<T>, T, tuning>;
   using Base::Base;
+
   //! enable min() for ReduceMin -- alias for combine()
   RAJA_HOST_DEVICE
   const ReduceMin& min(T rhs) const
@@ -1112,7 +1239,7 @@ class ReduceMin<RAJA::policy::hip::hip_reduce_policy<tuning>, T>
 };
 
 //! specialization of ReduceMax for hip_reduce
-template <typename tuning, typename T>
+template<typename tuning, typename T>
 class ReduceMax<RAJA::policy::hip::hip_reduce_policy<tuning>, T>
     : public hip::Reduce<RAJA::reduce::max<T>, T, tuning>
 {
@@ -1120,6 +1247,7 @@ class ReduceMax<RAJA::policy::hip::hip_reduce_policy<tuning>, T>
 public:
   using Base = hip::Reduce<RAJA::reduce::max<T>, T, tuning>;
   using Base::Base;
+
   //! enable max() for ReduceMax -- alias for combine()
   RAJA_HOST_DEVICE
   const ReduceMax& max(T rhs) const
@@ -1130,35 +1258,41 @@ class ReduceMax<RAJA::policy::hip::hip_reduce_policy<tuning>, T>
 };
 
 //! specialization of ReduceMinLoc for hip_reduce
-template <typename tuning, typename T, typename IndexType>
+template<typename tuning, typename T, typename IndexType>
 class ReduceMinLoc<RAJA::policy::hip::hip_reduce_policy<tuning>, T, IndexType>
-    : public hip::Reduce<RAJA::reduce::min<RAJA::reduce::detail::ValueLoc<T, IndexType>>,
-                          RAJA::reduce::detail::ValueLoc<T, IndexType>,
-                          tuning>
+    : public hip::Reduce<
+          RAJA::reduce::min<RAJA::reduce::detail::ValueLoc<T, IndexType>>,
+          RAJA::reduce::detail::ValueLoc<T, IndexType>,
+          tuning>
 {
 
 public:
-  using value_type = RAJA::reduce::detail::ValueLoc<T, IndexType>;
-  using Combiner = RAJA::reduce::min<value_type>;
+  using value_type     = RAJA::reduce::detail::ValueLoc<T, IndexType>;
+  using Combiner       = RAJA::reduce::min<value_type>;
   using NonLocCombiner = RAJA::reduce::min<T>;
-  using Base = hip::Reduce<Combiner, value_type, tuning>;
+  using Base           = hip::Reduce<Combiner, value_type, tuning>;
   using Base::Base;
 
   //! constructor requires a default value for the reducer
-  ReduceMinLoc(T init_val, IndexType init_idx,
+  ReduceMinLoc(T init_val,
+               IndexType init_idx,
                T identity_val = NonLocCombiner::identity(),
-               IndexType identity_idx = RAJA::reduce::detail::DefaultLoc<IndexType>().value())
-      : Base(value_type(init_val, init_idx), value_type(identity_val, identity_idx))
-  {
-  }
+               IndexType identity_idx =
+                   RAJA::reduce::detail::DefaultLoc<IndexType>().value())
+      : Base(value_type(init_val, init_idx),
+             value_type(identity_val, identity_idx))
+  {}
 
   //! reset requires a default value for the reducer
   // this must be here to hide Base::reset
-  void reset(T init_val, IndexType init_idx,
+  void reset(T init_val,
+             IndexType init_idx,
              T identity_val = NonLocCombiner::identity(),
-             IndexType identity_idx = RAJA::reduce::detail::DefaultLoc<IndexType>().value())
+             IndexType identity_idx =
+                 RAJA::reduce::detail::DefaultLoc<IndexType>().value())
   {
-    Base::reset(value_type(init_val, init_idx), value_type(identity_val, identity_idx));
+    Base::reset(value_type(init_val, init_idx),
+                value_type(identity_val, identity_idx));
   }
 
   //! reducer function; updates the current instance's state
@@ -1180,35 +1314,41 @@ class ReduceMinLoc<RAJA::policy::hip::hip_reduce_policy<tuning>, T, IndexType>
 };
 
 //! specialization of ReduceMaxLoc for hip_reduce
-template <typename tuning, typename T, typename IndexType>
+template<typename tuning, typename T, typename IndexType>
 class ReduceMaxLoc<RAJA::policy::hip::hip_reduce_policy<tuning>, T, IndexType>
-    : public hip::
-          Reduce<RAJA::reduce::max<RAJA::reduce::detail::ValueLoc<T, IndexType, false>>,
-                 RAJA::reduce::detail::ValueLoc<T, IndexType, false>,
-                 tuning>
+    : public hip::Reduce<
+          RAJA::reduce::max<
+              RAJA::reduce::detail::ValueLoc<T, IndexType, false>>,
+          RAJA::reduce::detail::ValueLoc<T, IndexType, false>,
+          tuning>
 {
 public:
-  using value_type = RAJA::reduce::detail::ValueLoc<T, IndexType, false>;
-  using Combiner = RAJA::reduce::max<value_type>;
+  using value_type     = RAJA::reduce::detail::ValueLoc<T, IndexType, false>;
+  using Combiner       = RAJA::reduce::max<value_type>;
   using NonLocCombiner = RAJA::reduce::max<T>;
-  using Base = hip::Reduce<Combiner, value_type, tuning>;
+  using Base           = hip::Reduce<Combiner, value_type, tuning>;
   using Base::Base;
 
   //! constructor requires a default value for the reducer
-  ReduceMaxLoc(T init_val, IndexType init_idx,
+  ReduceMaxLoc(T init_val,
+               IndexType init_idx,
                T identity_val = NonLocCombiner::identity(),
-               IndexType identity_idx = RAJA::reduce::detail::DefaultLoc<IndexType>().value())
-      : Base(value_type(init_val, init_idx), value_type(identity_val, identity_idx))
-  {
-  }
+               IndexType identity_idx =
+                   RAJA::reduce::detail::DefaultLoc<IndexType>().value())
+      : Base(value_type(init_val, init_idx),
+             value_type(identity_val, identity_idx))
+  {}
 
   //! reset requires a default value for the reducer
   // this must be here to hide Base::reset
-  void reset(T init_val, IndexType init_idx,
+  void reset(T init_val,
+             IndexType init_idx,
              T identity_val = NonLocCombiner::identity(),
-             IndexType identity_idx = RAJA::reduce::detail::DefaultLoc<IndexType>().value())
+             IndexType identity_idx =
+                 RAJA::reduce::detail::DefaultLoc<IndexType>().value())
   {
-    Base::reset(value_type(init_val, init_idx), value_type(identity_val, identity_idx));
+    Base::reset(value_type(init_val, init_idx),
+                value_type(identity_val, identity_idx));
   }
 
   //! reducer function; updates the current instance's state
diff --git a/include/RAJA/policy/hip/scan.hpp b/include/RAJA/policy/hip/scan.hpp
index cdf0a9b82d..da6bc263c0 100644
--- a/include/RAJA/policy/hip/scan.hpp
+++ b/include/RAJA/policy/hip/scan.hpp
@@ -47,17 +47,16 @@ namespace scan
         \brief explicit inclusive inplace scan given range, function, and
    initial value
 */
-template <typename IterationMapping,
-          typename IterationGetter,
-          typename Concretizer,
-          bool Async,
-          typename InputIter,
-          typename Function>
-RAJA_INLINE
-resources::EventProxy<resources::Hip>
-inclusive_inplace(
+template<typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         bool Async,
+         typename InputIter,
+         typename Function>
+RAJA_INLINE resources::EventProxy<resources::Hip> inclusive_inplace(
     resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
     InputIter begin,
     InputIter end,
     Function binary_op)
@@ -66,23 +65,14 @@ inclusive_inplace(
 
   int len = std::distance(begin, end);
   // Determine temporary device storage requirements
-  void* d_temp_storage = nullptr;
+  void* d_temp_storage      = nullptr;
   size_t temp_storage_bytes = 0;
 #if defined(__HIPCC__)
-  hipErrchk(::rocprim::inclusive_scan(d_temp_storage,
-                                      temp_storage_bytes,
-                                      begin,
-                                      begin,
-                                      len,
-                                      binary_op,
-                                      stream));
+  hipErrchk(::rocprim::inclusive_scan(d_temp_storage, temp_storage_bytes, begin,
+                                      begin, len, binary_op, stream));
 #elif defined(__CUDACC__)
-  hipErrchk(::cub::DeviceScan::InclusiveScan(d_temp_storage,
-                                             temp_storage_bytes,
-                                             begin,
-                                             begin,
-                                             binary_op,
-                                             len,
+  hipErrchk(::cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes,
+                                             begin, begin, binary_op, len,
                                              stream));
 #endif
 
@@ -92,20 +82,11 @@ inclusive_inplace(
           temp_storage_bytes);
   // Run
 #if defined(__HIPCC__)
-  hipErrchk(::rocprim::inclusive_scan(d_temp_storage,
-                                      temp_storage_bytes,
-                                      begin,
-                                      begin,
-                                      len,
-                                      binary_op,
-                                      stream));
+  hipErrchk(::rocprim::inclusive_scan(d_temp_storage, temp_storage_bytes, begin,
+                                      begin, len, binary_op, stream));
 #elif defined(__CUDACC__)
-  hipErrchk(::cub::DeviceScan::InclusiveScan(d_temp_storage,
-                                             temp_storage_bytes,
-                                             begin,
-                                             begin,
-                                             binary_op,
-                                             len,
+  hipErrchk(::cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes,
+                                             begin, begin, binary_op, len,
                                              stream));
 #endif
   // Free temporary storage
@@ -120,18 +101,17 @@ inclusive_inplace(
         \brief explicit exclusive inplace scan given range, function, and
    initial value
 */
-template <typename IterationMapping,
-          typename IterationGetter,
-          typename Concretizer,
-          bool Async,
-          typename InputIter,
-          typename Function,
-          typename T>
-RAJA_INLINE
-resources::EventProxy<resources::Hip>
-exclusive_inplace(
+template<typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         bool Async,
+         typename InputIter,
+         typename Function,
+         typename T>
+RAJA_INLINE resources::EventProxy<resources::Hip> exclusive_inplace(
     resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
     InputIter begin,
     InputIter end,
     Function binary_op,
@@ -141,25 +121,14 @@ exclusive_inplace(
 
   int len = std::distance(begin, end);
   // Determine temporary device storage requirements
-  void* d_temp_storage = nullptr;
+  void* d_temp_storage      = nullptr;
   size_t temp_storage_bytes = 0;
 #if defined(__HIPCC__)
-  hipErrchk(::rocprim::exclusive_scan(d_temp_storage,
-                                      temp_storage_bytes,
-                                      begin,
-                                      begin,
-                                      init,
-                                      len,
-                                      binary_op,
-                                      stream));
+  hipErrchk(::rocprim::exclusive_scan(d_temp_storage, temp_storage_bytes, begin,
+                                      begin, init, len, binary_op, stream));
 #elif defined(__CUDACC__)
-  hipErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage,
-                                             temp_storage_bytes,
-                                             begin,
-                                             begin,
-                                             binary_op,
-                                             init,
-                                             len,
+  hipErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes,
+                                             begin, begin, binary_op, init, len,
                                              stream));
 #endif
   // Allocate temporary storage
@@ -168,22 +137,11 @@ exclusive_inplace(
           temp_storage_bytes);
   // Run
 #if defined(__HIPCC__)
-  hipErrchk(::rocprim::exclusive_scan(d_temp_storage,
-                                              temp_storage_bytes,
-                                              begin,
-                                              begin,
-                                              init,
-                                              len,
-                                              binary_op,
-                                              stream));
+  hipErrchk(::rocprim::exclusive_scan(d_temp_storage, temp_storage_bytes, begin,
+                                      begin, init, len, binary_op, stream));
 #elif defined(__CUDACC__)
-  hipErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage,
-                                             temp_storage_bytes,
-                                             begin,
-                                             begin,
-                                             binary_op,
-                                             init,
-                                             len,
+  hipErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes,
+                                             begin, begin, binary_op, init, len,
                                              stream));
 #endif
   // Free temporary storage
@@ -198,18 +156,17 @@ exclusive_inplace(
         \brief explicit inclusive scan given input range, output, function, and
    initial value
 */
-template <typename IterationMapping,
-          typename IterationGetter,
-          typename Concretizer,
-          bool Async,
-          typename InputIter,
-          typename OutputIter,
-          typename Function>
-RAJA_INLINE
-resources::EventProxy<resources::Hip>
-inclusive(
+template<typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         bool Async,
+         typename InputIter,
+         typename OutputIter,
+         typename Function>
+RAJA_INLINE resources::EventProxy<resources::Hip> inclusive(
     resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
     InputIter begin,
     InputIter end,
     OutputIter out,
@@ -219,24 +176,14 @@ inclusive(
 
   int len = std::distance(begin, end);
   // Determine temporary device storage requirements
-  void* d_temp_storage = nullptr;
+  void* d_temp_storage      = nullptr;
   size_t temp_storage_bytes = 0;
 #if defined(__HIPCC__)
-  hipErrchk(::rocprim::inclusive_scan(d_temp_storage,
-                                      temp_storage_bytes,
-                                      begin,
-                                      out,
-                                      len,
-                                      binary_op,
-                                      stream));
+  hipErrchk(::rocprim::inclusive_scan(d_temp_storage, temp_storage_bytes, begin,
+                                      out, len, binary_op, stream));
 #elif defined(__CUDACC__)
-  hipErrchk(::cub::DeviceScan::InclusiveScan(d_temp_storage,
-                                             temp_storage_bytes,
-                                             begin,
-                                             out,
-                                             binary_op,
-                                             len,
-                                             stream));
+  hipErrchk(::cub::DeviceScan::InclusiveScan(
+      d_temp_storage, temp_storage_bytes, begin, out, binary_op, len, stream));
 #endif
   // Allocate temporary storage
   d_temp_storage =
@@ -244,21 +191,11 @@ inclusive(
           temp_storage_bytes);
   // Run
 #if defined(__HIPCC__)
-  hipErrchk(::rocprim::inclusive_scan(d_temp_storage,
-                                      temp_storage_bytes,
-                                      begin,
-                                      out,
-                                      len,
-                                      binary_op,
-                                      stream));
+  hipErrchk(::rocprim::inclusive_scan(d_temp_storage, temp_storage_bytes, begin,
+                                      out, len, binary_op, stream));
 #elif defined(__CUDACC__)
-  hipErrchk(::cub::DeviceScan::InclusiveScan(d_temp_storage,
-                                             temp_storage_bytes,
-                                             begin,
-                                             out,
-                                             binary_op,
-                                             len,
-                                             stream));
+  hipErrchk(::cub::DeviceScan::InclusiveScan(
+      d_temp_storage, temp_storage_bytes, begin, out, binary_op, len, stream));
 #endif
   // Free temporary storage
   hip::device_mempool_type::getInstance().free(d_temp_storage);
@@ -272,19 +209,18 @@ inclusive(
         \brief explicit exclusive scan given input range, output, function, and
    initial value
 */
-template <typename IterationMapping,
-          typename IterationGetter,
-          typename Concretizer,
-          bool Async,
-          typename InputIter,
-          typename OutputIter,
-          typename Function,
-          typename T>
-RAJA_INLINE
-resources::EventProxy<resources::Hip>
-exclusive(
+template<typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         bool Async,
+         typename InputIter,
+         typename OutputIter,
+         typename Function,
+         typename T>
+RAJA_INLINE resources::EventProxy<resources::Hip> exclusive(
     resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
     InputIter begin,
     InputIter end,
     OutputIter out,
@@ -295,25 +231,14 @@ exclusive(
 
   int len = std::distance(begin, end);
   // Determine temporary device storage requirements
-  void* d_temp_storage = nullptr;
+  void* d_temp_storage      = nullptr;
   size_t temp_storage_bytes = 0;
 #if defined(__HIPCC__)
-  hipErrchk(::rocprim::exclusive_scan(d_temp_storage,
-                                      temp_storage_bytes,
-                                      begin,
-                                      out,
-                                      init,
-                                      len,
-                                      binary_op,
-                                      stream));
+  hipErrchk(::rocprim::exclusive_scan(d_temp_storage, temp_storage_bytes, begin,
+                                      out, init, len, binary_op, stream));
 #elif defined(__CUDACC__)
-  hipErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage,
-                                             temp_storage_bytes,
-                                             begin,
-                                             out,
-                                             binary_op,
-                                             init,
-                                             len,
+  hipErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes,
+                                             begin, out, binary_op, init, len,
                                              stream));
 #endif
   // Allocate temporary storage
@@ -322,22 +247,11 @@ exclusive(
           temp_storage_bytes);
   // Run
 #if defined(__HIPCC__)
-  hipErrchk(::rocprim::exclusive_scan(d_temp_storage,
-                                      temp_storage_bytes,
-                                      begin,
-                                      out,
-                                      init,
-                                      len,
-                                      binary_op,
-                                      stream));
+  hipErrchk(::rocprim::exclusive_scan(d_temp_storage, temp_storage_bytes, begin,
+                                      out, init, len, binary_op, stream));
 #elif defined(__CUDACC__)
-  hipErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage,
-                                             temp_storage_bytes,
-                                             begin,
-                                             out,
-                                             binary_op,
-                                             init,
-                                             len,
+  hipErrchk(::cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes,
+                                             begin, out, binary_op, init, len,
                                              stream));
 #endif
   // Free temporary storage
diff --git a/include/RAJA/policy/hip/sort.hpp b/include/RAJA/policy/hip/sort.hpp
index eb16246623..e1781572d6 100644
--- a/include/RAJA/policy/hip/sort.hpp
+++ b/include/RAJA/policy/hip/sort.hpp
@@ -51,52 +51,63 @@ namespace detail
 {
 
 #if defined(__HIPCC__)
-  template < typename R >
-  using double_buffer = ::rocprim::double_buffer<R>;
+template<typename R>
+using double_buffer = ::rocprim::double_buffer<R>;
 #elif defined(__CUDACC__)
-  template < typename R >
-  using double_buffer = ::cub::DoubleBuffer<R>;
+template<typename R>
+using double_buffer = ::cub::DoubleBuffer<R>;
 #endif
 
-  template < typename R >
-  R* get_current(double_buffer<R>& d_bufs)
-  {
+template<typename R>
+R* get_current(double_buffer<R>& d_bufs)
+{
 #if defined(__HIPCC__)
-    return d_bufs.current();
+  return d_bufs.current();
 #elif defined(__CUDACC__)
-    return d_bufs.Current();
+  return d_bufs.Current();
 #endif
-  }
-
 }
 
+}  // namespace detail
+
 /*!
         \brief static assert unimplemented stable sort
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
-          typename Iter, typename Compare>
-concepts::enable_if_t<resources::EventProxy<resources::Hip>,
-                      concepts::negate<concepts::all_of<
-                        type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
-                        std::is_pointer<Iter>,
-                        concepts::any_of<
-                          camp::is_same<Compare, operators::less<RAJA::detail::IterVal<Iter>>>,
-                          camp::is_same<Compare, operators::greater<RAJA::detail::IterVal<Iter>>>>>>>
-stable(
-    resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
-    Iter,
-    Iter,
-    Compare)
+template<typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         bool Async,
+         typename Iter,
+         typename Compare>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Hip>,
+    concepts::negate<concepts::all_of<
+        type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
+        std::is_pointer<Iter>,
+        concepts::any_of<
+            camp::is_same<Compare,
+                          operators::less<RAJA::detail::IterVal<Iter>>>,
+            camp::is_same<Compare,
+                          operators::greater<RAJA::detail::IterVal<Iter>>>>>>>
+stable(resources::Hip hip_res,
+       ::RAJA::policy::hip::
+           hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
+       Iter,
+       Iter,
+       Compare)
 {
-  static_assert(concepts::all_of<
-                  type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
-                  std::is_pointer<Iter>,
-                  concepts::any_of<
-                    camp::is_same<Compare, operators::less<RAJA::detail::IterVal<Iter>>>,
-                    camp::is_same<Compare, operators::greater<RAJA::detail::IterVal<Iter>>>>>::value,
-                "RAJA stable_sort<hip_exec> is only implemented for pointers to arithmetic types and RAJA::operators::less and RAJA::operators::greater.");
+  static_assert(
+      concepts::all_of<
+          type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
+          std::is_pointer<Iter>,
+          concepts::any_of<
+              camp::is_same<Compare,
+                            operators::less<RAJA::detail::IterVal<Iter>>>,
+              camp::is_same<Compare, operators::greater<
+                                         RAJA::detail::IterVal<Iter>>>>>::value,
+      "RAJA stable_sort<hip_exec> is only implemented for pointers to "
+      "arithmetic types and RAJA::operators::less and "
+      "RAJA::operators::greater.");
 
   return resources::EventProxy<resources::Hip>(hip_res);
 }
@@ -104,26 +115,28 @@ stable(
 /*!
         \brief stable sort given range in ascending order
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
-          typename Iter>
+template<typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         bool Async,
+         typename Iter>
 concepts::enable_if_t<resources::EventProxy<resources::Hip>,
                       type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
                       std::is_pointer<Iter>>
-stable(
-    resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
-    Iter begin,
-    Iter end,
-    operators::less<RAJA::detail::IterVal<Iter>>)
+stable(resources::Hip hip_res,
+       ::RAJA::policy::hip::
+           hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
+       Iter begin,
+       Iter end,
+       operators::less<RAJA::detail::IterVal<Iter>>)
 {
   hipStream_t stream = hip_res.get_stream();
 
   using R = RAJA::detail::IterVal<Iter>;
 
-  int len = std::distance(begin, end);
-  int begin_bit=0;
-  int end_bit=sizeof(R)*CHAR_BIT;
+  int len       = std::distance(begin, end);
+  int begin_bit = 0;
+  int end_bit   = sizeof(R) * CHAR_BIT;
 
   // Allocate temporary storage for the output array
   R* d_out = hip::device_mempool_type::getInstance().malloc<R>(len);
@@ -133,24 +146,16 @@ stable(
   detail::double_buffer<R> d_keys(begin, d_out);
 
   // Determine temporary device storage requirements
-  void* d_temp_storage = nullptr;
+  void* d_temp_storage      = nullptr;
   size_t temp_storage_bytes = 0;
 #if defined(__HIPCC__)
-  hipErrchk(::rocprim::radix_sort_keys(d_temp_storage,
-                                       temp_storage_bytes,
-                                       d_keys,
-                                       len,
-                                       begin_bit,
-                                       end_bit,
+  hipErrchk(::rocprim::radix_sort_keys(d_temp_storage, temp_storage_bytes,
+                                       d_keys, len, begin_bit, end_bit,
                                        stream));
 #elif defined(__CUDACC__)
   cudaErrchk(::cub::DeviceRadixSort::SortKeys(d_temp_storage,
-                                              temp_storage_bytes,
-                                              d_keys,
-                                              len,
-                                              begin_bit,
-                                              end_bit,
-                                              stream));
+                                              temp_storage_bytes, d_keys, len,
+                                              begin_bit, end_bit, stream));
 #endif
   // Allocate temporary storage
   d_temp_storage =
@@ -159,29 +164,23 @@ stable(
 
   // Run
 #if defined(__HIPCC__)
-  hipErrchk(::rocprim::radix_sort_keys(d_temp_storage,
-                                       temp_storage_bytes,
-                                       d_keys,
-                                       len,
-                                       begin_bit,
-                                       end_bit,
+  hipErrchk(::rocprim::radix_sort_keys(d_temp_storage, temp_storage_bytes,
+                                       d_keys, len, begin_bit, end_bit,
                                        stream));
 #elif defined(__CUDACC__)
   cudaErrchk(::cub::DeviceRadixSort::SortKeys(d_temp_storage,
-                                              temp_storage_bytes,
-                                              d_keys,
-                                              len,
-                                              begin_bit,
-                                              end_bit,
-                                              stream));
+                                              temp_storage_bytes, d_keys, len,
+                                              begin_bit, end_bit, stream));
 #endif
   // Free temporary storage
   hip::device_mempool_type::getInstance().free(d_temp_storage);
 
-  if (detail::get_current(d_keys) == d_out) {
+  if (detail::get_current(d_keys) == d_out)
+  {
 
     // copy
-    hipErrchk(hipMemcpyAsync(begin, d_out, len*sizeof(R), hipMemcpyDefault, stream));
+    hipErrchk(hipMemcpyAsync(begin, d_out, len * sizeof(R), hipMemcpyDefault,
+                             stream));
   }
 
   hip::device_mempool_type::getInstance().free(d_out);
@@ -194,26 +193,28 @@ stable(
 /*!
         \brief stable sort given range in descending order
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
-          typename Iter>
+template<typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         bool Async,
+         typename Iter>
 concepts::enable_if_t<resources::EventProxy<resources::Hip>,
                       type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
                       std::is_pointer<Iter>>
-stable(
-    resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
-    Iter begin,
-    Iter end,
-    operators::greater<RAJA::detail::IterVal<Iter>>)
+stable(resources::Hip hip_res,
+       ::RAJA::policy::hip::
+           hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
+       Iter begin,
+       Iter end,
+       operators::greater<RAJA::detail::IterVal<Iter>>)
 {
   hipStream_t stream = hip_res.get_stream();
 
   using R = RAJA::detail::IterVal<Iter>;
 
-  int len = std::distance(begin, end);
-  int begin_bit=0;
-  int end_bit=sizeof(R)*CHAR_BIT;
+  int len       = std::distance(begin, end);
+  int begin_bit = 0;
+  int end_bit   = sizeof(R) * CHAR_BIT;
 
   // Allocate temporary storage for the output array
   R* d_out = hip::device_mempool_type::getInstance().malloc<R>(len);
@@ -223,24 +224,16 @@ stable(
   detail::double_buffer<R> d_keys(begin, d_out);
 
   // Determine temporary device storage requirements
-  void* d_temp_storage = nullptr;
+  void* d_temp_storage      = nullptr;
   size_t temp_storage_bytes = 0;
 #if defined(__HIPCC__)
-  hipErrchk(::rocprim::radix_sort_keys_desc(d_temp_storage,
-                                            temp_storage_bytes,
-                                            d_keys,
-                                            len,
-                                            begin_bit,
-                                            end_bit,
+  hipErrchk(::rocprim::radix_sort_keys_desc(d_temp_storage, temp_storage_bytes,
+                                            d_keys, len, begin_bit, end_bit,
                                             stream));
 #elif defined(__CUDACC__)
-  cudaErrchk(::cub::DeviceRadixSort::SortKeysDescending(d_temp_storage,
-                                                        temp_storage_bytes,
-                                                        d_keys,
-                                                        len,
-                                                        begin_bit,
-                                                        end_bit,
-                                                        stream));
+  cudaErrchk(::cub::DeviceRadixSort::SortKeysDescending(
+      d_temp_storage, temp_storage_bytes, d_keys, len, begin_bit, end_bit,
+      stream));
 #endif
   // Allocate temporary storage
   d_temp_storage =
@@ -249,29 +242,23 @@ stable(
 
   // Run
 #if defined(__HIPCC__)
-  hipErrchk(::rocprim::radix_sort_keys_desc(d_temp_storage,
-                                            temp_storage_bytes,
-                                            d_keys,
-                                            len,
-                                            begin_bit,
-                                            end_bit,
+  hipErrchk(::rocprim::radix_sort_keys_desc(d_temp_storage, temp_storage_bytes,
+                                            d_keys, len, begin_bit, end_bit,
                                             stream));
 #elif defined(__CUDACC__)
-  cudaErrchk(::cub::DeviceRadixSort::SortKeysDescending(d_temp_storage,
-                                                        temp_storage_bytes,
-                                                        d_keys,
-                                                        len,
-                                                        begin_bit,
-                                                        end_bit,
-                                                        stream));
+  cudaErrchk(::cub::DeviceRadixSort::SortKeysDescending(
+      d_temp_storage, temp_storage_bytes, d_keys, len, begin_bit, end_bit,
+      stream));
 #endif
   // Free temporary storage
   hip::device_mempool_type::getInstance().free(d_temp_storage);
 
-  if (detail::get_current(d_keys) == d_out) {
+  if (detail::get_current(d_keys) == d_out)
+  {
 
     // copy
-    hipErrchk(hipMemcpyAsync(begin, d_out, len*sizeof(R), hipMemcpyDefault, stream));
+    hipErrchk(hipMemcpyAsync(begin, d_out, len * sizeof(R), hipMemcpyDefault,
+                             stream));
   }
 
   hip::device_mempool_type::getInstance().free(d_out);
@@ -281,34 +268,43 @@ stable(
   return resources::EventProxy<resources::Hip>(hip_res);
 }
 
-
 /*!
         \brief static assert unimplemented sort
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
-          typename Iter, typename Compare>
-concepts::enable_if_t<resources::EventProxy<resources::Hip>,
-                      concepts::negate<concepts::all_of<
-                        type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
-                        std::is_pointer<Iter>,
-                        concepts::any_of<
-                          camp::is_same<Compare, operators::less<RAJA::detail::IterVal<Iter>>>,
-                          camp::is_same<Compare, operators::greater<RAJA::detail::IterVal<Iter>>>>>>>
-unstable(
-    resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
-    Iter,
-    Iter,
-    Compare)
+template<typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         bool Async,
+         typename Iter,
+         typename Compare>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Hip>,
+    concepts::negate<concepts::all_of<
+        type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
+        std::is_pointer<Iter>,
+        concepts::any_of<
+            camp::is_same<Compare,
+                          operators::less<RAJA::detail::IterVal<Iter>>>,
+            camp::is_same<Compare,
+                          operators::greater<RAJA::detail::IterVal<Iter>>>>>>>
+unstable(resources::Hip hip_res,
+         ::RAJA::policy::hip::
+             hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
+         Iter,
+         Iter,
+         Compare)
 {
-  static_assert(concepts::all_of<
-                  type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
-                  std::is_pointer<Iter>,
-                  concepts::any_of<
-                    camp::is_same<Compare, operators::less<RAJA::detail::IterVal<Iter>>>,
-                    camp::is_same<Compare, operators::greater<RAJA::detail::IterVal<Iter>>>>>::value,
-                "RAJA sort<hip_exec> is only implemented for pointers to arithmetic types and RAJA::operators::less and RAJA::operators::greater.");
+  static_assert(
+      concepts::all_of<
+          type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
+          std::is_pointer<Iter>,
+          concepts::any_of<
+              camp::is_same<Compare,
+                            operators::less<RAJA::detail::IterVal<Iter>>>,
+              camp::is_same<Compare, operators::greater<
+                                         RAJA::detail::IterVal<Iter>>>>>::value,
+      "RAJA sort<hip_exec> is only implemented for pointers to arithmetic "
+      "types and RAJA::operators::less and RAJA::operators::greater.");
 
   return resources::EventProxy<resources::Hip>(hip_res);
 }
@@ -316,18 +312,20 @@ unstable(
 /*!
         \brief sort given range in ascending order
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
-          typename Iter>
+template<typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         bool Async,
+         typename Iter>
 concepts::enable_if_t<resources::EventProxy<resources::Hip>,
                       type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
                       std::is_pointer<Iter>>
-unstable(
-    resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async> p,
-    Iter begin,
-    Iter end,
-    operators::less<RAJA::detail::IterVal<Iter>> comp)
+unstable(resources::Hip hip_res,
+         ::RAJA::policy::hip::
+             hip_exec<IterationMapping, IterationGetter, Concretizer, Async> p,
+         Iter begin,
+         Iter end,
+         operators::less<RAJA::detail::IterVal<Iter>> comp)
 {
   return stable(hip_res, p, begin, end, comp);
 }
@@ -335,56 +333,68 @@ unstable(
 /*!
         \brief sort given range in descending order
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
-          typename Iter>
+template<typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         bool Async,
+         typename Iter>
 concepts::enable_if_t<resources::EventProxy<resources::Hip>,
                       type_traits::is_arithmetic<RAJA::detail::IterVal<Iter>>,
                       std::is_pointer<Iter>>
-unstable(
-    resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async> p,
-    Iter begin,
-    Iter end,
-    operators::greater<RAJA::detail::IterVal<Iter>> comp)
+unstable(resources::Hip hip_res,
+         ::RAJA::policy::hip::
+             hip_exec<IterationMapping, IterationGetter, Concretizer, Async> p,
+         Iter begin,
+         Iter end,
+         operators::greater<RAJA::detail::IterVal<Iter>> comp)
 {
   return stable(hip_res, p, begin, end, comp);
 }
 
-
 /*!
         \brief static assert unimplemented stable sort pairs
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
-          typename KeyIter, typename ValIter, typename Compare>
-concepts::enable_if_t<resources::EventProxy<resources::Hip>,
-                      concepts::negate<concepts::all_of<
-                        type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
-                        std::is_pointer<KeyIter>,
-                        std::is_pointer<ValIter>,
-                        concepts::any_of<
-                          camp::is_same<Compare, operators::less<RAJA::detail::IterVal<KeyIter>>>,
-                          camp::is_same<Compare, operators::greater<RAJA::detail::IterVal<KeyIter>>>>>>>
+template<typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         bool Async,
+         typename KeyIter,
+         typename ValIter,
+         typename Compare>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Hip>,
+    concepts::negate<concepts::all_of<
+        type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
+        std::is_pointer<KeyIter>,
+        std::is_pointer<ValIter>,
+        concepts::any_of<
+            camp::is_same<Compare,
+                          operators::less<RAJA::detail::IterVal<KeyIter>>>,
+            camp::is_same<
+                Compare,
+                operators::greater<RAJA::detail::IterVal<KeyIter>>>>>>>
 stable_pairs(
     resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
     KeyIter,
     KeyIter,
     ValIter,
     Compare)
 {
-  static_assert (std::is_pointer<KeyIter>::value,
-      "stable_sort_pairs<hip_exec> is only implemented for pointers");
-  static_assert (std::is_pointer<ValIter>::value,
-      "stable_sort_pairs<hip_exec> is only implemented for pointers");
+  static_assert(std::is_pointer<KeyIter>::value,
+                "stable_sort_pairs<hip_exec> is only implemented for pointers");
+  static_assert(std::is_pointer<ValIter>::value,
+                "stable_sort_pairs<hip_exec> is only implemented for pointers");
   using K = RAJA::detail::IterVal<KeyIter>;
-  static_assert (type_traits::is_arithmetic<K>::value,
+  static_assert(
+      type_traits::is_arithmetic<K>::value,
       "stable_sort_pairs<hip_exec> is only implemented for arithmetic types");
-  static_assert (concepts::any_of<
-      camp::is_same<Compare, operators::less<K>>,
-      camp::is_same<Compare, operators::greater<K>>>::value,
-      "stable_sort_pairs<hip_exec> is only implemented for RAJA::operators::less or RAJA::operators::greater");
+  static_assert(
+      concepts::any_of<camp::is_same<Compare, operators::less<K>>,
+                       camp::is_same<Compare, operators::greater<K>>>::value,
+      "stable_sort_pairs<hip_exec> is only implemented for "
+      "RAJA::operators::less or RAJA::operators::greater");
 
   return resources::EventProxy<resources::Hip>(hip_res);
 }
@@ -392,16 +402,21 @@ stable_pairs(
 /*!
         \brief stable sort given range of pairs in ascending order of keys
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
-          typename KeyIter, typename ValIter>
-concepts::enable_if_t<resources::EventProxy<resources::Hip>,
-                      type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
-                      std::is_pointer<KeyIter>,
-                      std::is_pointer<ValIter>>
+template<typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         bool Async,
+         typename KeyIter,
+         typename ValIter>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Hip>,
+    type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
+    std::is_pointer<KeyIter>,
+    std::is_pointer<ValIter>>
 stable_pairs(
     resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
     KeyIter keys_begin,
     KeyIter keys_end,
     ValIter vals_begin,
@@ -412,9 +427,9 @@ stable_pairs(
   using K = RAJA::detail::IterVal<KeyIter>;
   using V = RAJA::detail::IterVal<ValIter>;
 
-  int len = std::distance(keys_begin, keys_end);
-  int begin_bit=0;
-  int end_bit=sizeof(K)*CHAR_BIT;
+  int len       = std::distance(keys_begin, keys_end);
+  int begin_bit = 0;
+  int end_bit   = sizeof(K) * CHAR_BIT;
 
   // Allocate temporary storage for the output arrays
   K* d_keys_out = hip::device_mempool_type::getInstance().malloc<K>(len);
@@ -426,26 +441,16 @@ stable_pairs(
   detail::double_buffer<V> d_vals(vals_begin, d_vals_out);
 
   // Determine temporary device storage requirements
-  void* d_temp_storage = nullptr;
+  void* d_temp_storage      = nullptr;
   size_t temp_storage_bytes = 0;
 #if defined(__HIPCC__)
-  hipErrchk(::rocprim::radix_sort_pairs(d_temp_storage,
-                                        temp_storage_bytes,
-                                        d_keys,
-                                        d_vals,
-                                        len,
-                                        begin_bit,
-                                        end_bit,
+  hipErrchk(::rocprim::radix_sort_pairs(d_temp_storage, temp_storage_bytes,
+                                        d_keys, d_vals, len, begin_bit, end_bit,
                                         stream));
 #elif defined(__CUDACC__)
-  cudaErrchk(::cub::DeviceRadixSort::SortPairs(d_temp_storage,
-                                               temp_storage_bytes,
-                                               d_keys,
-                                               d_vals,
-                                               len,
-                                               begin_bit,
-                                               end_bit,
-                                               stream));
+  cudaErrchk(::cub::DeviceRadixSort::SortPairs(
+      d_temp_storage, temp_storage_bytes, d_keys, d_vals, len, begin_bit,
+      end_bit, stream));
 #endif
   // Allocate temporary storage
   d_temp_storage =
@@ -454,36 +459,30 @@ stable_pairs(
 
   // Run
 #if defined(__HIPCC__)
-  hipErrchk(::rocprim::radix_sort_pairs(d_temp_storage,
-                                        temp_storage_bytes,
-                                        d_keys,
-                                        d_vals,
-                                        len,
-                                        begin_bit,
-                                        end_bit,
+  hipErrchk(::rocprim::radix_sort_pairs(d_temp_storage, temp_storage_bytes,
+                                        d_keys, d_vals, len, begin_bit, end_bit,
                                         stream));
 #elif defined(__CUDACC__)
-  cudaErrchk(::cub::DeviceRadixSort::SortPairs(d_temp_storage,
-                                               temp_storage_bytes,
-                                               d_keys,
-                                               d_vals,
-                                               len,
-                                               begin_bit,
-                                               end_bit,
-                                               stream));
+  cudaErrchk(::cub::DeviceRadixSort::SortPairs(
+      d_temp_storage, temp_storage_bytes, d_keys, d_vals, len, begin_bit,
+      end_bit, stream));
 #endif
   // Free temporary storage
   hip::device_mempool_type::getInstance().free(d_temp_storage);
 
-  if (detail::get_current(d_keys) == d_keys_out) {
+  if (detail::get_current(d_keys) == d_keys_out)
+  {
 
     // copy keys
-    hipErrchk(hipMemcpyAsync(keys_begin, d_keys_out, len*sizeof(K), hipMemcpyDefault, stream));
+    hipErrchk(hipMemcpyAsync(keys_begin, d_keys_out, len * sizeof(K),
+                             hipMemcpyDefault, stream));
   }
-  if (detail::get_current(d_vals) == d_vals_out) {
+  if (detail::get_current(d_vals) == d_vals_out)
+  {
 
     // copy vals
-    hipErrchk(hipMemcpyAsync(vals_begin, d_vals_out, len*sizeof(V), hipMemcpyDefault, stream));
+    hipErrchk(hipMemcpyAsync(vals_begin, d_vals_out, len * sizeof(V),
+                             hipMemcpyDefault, stream));
   }
 
   hip::device_mempool_type::getInstance().free(d_keys_out);
@@ -497,16 +496,21 @@ stable_pairs(
 /*!
         \brief stable sort given range of pairs in descending order of keys
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
-          typename KeyIter, typename ValIter>
-concepts::enable_if_t<resources::EventProxy<resources::Hip>,
-                      type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
-                      std::is_pointer<KeyIter>,
-                      std::is_pointer<ValIter>>
+template<typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         bool Async,
+         typename KeyIter,
+         typename ValIter>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Hip>,
+    type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
+    std::is_pointer<KeyIter>,
+    std::is_pointer<ValIter>>
 stable_pairs(
     resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
     KeyIter keys_begin,
     KeyIter keys_end,
     ValIter vals_begin,
@@ -517,9 +521,9 @@ stable_pairs(
   using K = RAJA::detail::IterVal<KeyIter>;
   using V = RAJA::detail::IterVal<ValIter>;
 
-  int len = std::distance(keys_begin, keys_end);
-  int begin_bit=0;
-  int end_bit=sizeof(K)*CHAR_BIT;
+  int len       = std::distance(keys_begin, keys_end);
+  int begin_bit = 0;
+  int end_bit   = sizeof(K) * CHAR_BIT;
 
   // Allocate temporary storage for the output arrays
   K* d_keys_out = hip::device_mempool_type::getInstance().malloc<K>(len);
@@ -531,26 +535,16 @@ stable_pairs(
   detail::double_buffer<V> d_vals(vals_begin, d_vals_out);
 
   // Determine temporary device storage requirements
-  void* d_temp_storage = nullptr;
+  void* d_temp_storage      = nullptr;
   size_t temp_storage_bytes = 0;
 #if defined(__HIPCC__)
-  hipErrchk(::rocprim::radix_sort_pairs_desc(d_temp_storage,
-                                             temp_storage_bytes,
-                                             d_keys,
-                                             d_vals,
-                                             len,
-                                             begin_bit,
-                                             end_bit,
-                                             stream));
+  hipErrchk(::rocprim::radix_sort_pairs_desc(d_temp_storage, temp_storage_bytes,
+                                             d_keys, d_vals, len, begin_bit,
+                                             end_bit, stream));
 #elif defined(__CUDACC__)
-  cudaErrchk(::cub::DeviceRadixSort::SortPairsDescending(d_temp_storage,
-                                                         temp_storage_bytes,
-                                                         d_keys,
-                                                         d_vals,
-                                                         len,
-                                                         begin_bit,
-                                                         end_bit,
-                                                         stream));
+  cudaErrchk(::cub::DeviceRadixSort::SortPairsDescending(
+      d_temp_storage, temp_storage_bytes, d_keys, d_vals, len, begin_bit,
+      end_bit, stream));
 #endif
   // Allocate temporary storage
   d_temp_storage =
@@ -559,36 +553,30 @@ stable_pairs(
 
   // Run
 #if defined(__HIPCC__)
-  hipErrchk(::rocprim::radix_sort_pairs_desc(d_temp_storage,
-                                             temp_storage_bytes,
-                                             d_keys,
-                                             d_vals,
-                                             len,
-                                             begin_bit,
-                                             end_bit,
-                                             stream));
+  hipErrchk(::rocprim::radix_sort_pairs_desc(d_temp_storage, temp_storage_bytes,
+                                             d_keys, d_vals, len, begin_bit,
+                                             end_bit, stream));
 #elif defined(__CUDACC__)
-  cudaErrchk(::cub::DeviceRadixSort::SortPairsDescending(d_temp_storage,
-                                                         temp_storage_bytes,
-                                                         d_keys,
-                                                         d_vals,
-                                                         len,
-                                                         begin_bit,
-                                                         end_bit,
-                                                         stream));
+  cudaErrchk(::cub::DeviceRadixSort::SortPairsDescending(
+      d_temp_storage, temp_storage_bytes, d_keys, d_vals, len, begin_bit,
+      end_bit, stream));
 #endif
   // Free temporary storage
   hip::device_mempool_type::getInstance().free(d_temp_storage);
 
-  if (detail::get_current(d_keys) == d_keys_out) {
+  if (detail::get_current(d_keys) == d_keys_out)
+  {
 
     // copy keys
-    hipErrchk(hipMemcpyAsync(keys_begin, d_keys_out, len*sizeof(K), hipMemcpyDefault, stream));
+    hipErrchk(hipMemcpyAsync(keys_begin, d_keys_out, len * sizeof(K),
+                             hipMemcpyDefault, stream));
   }
-  if (detail::get_current(d_vals) == d_vals_out) {
+  if (detail::get_current(d_vals) == d_vals_out)
+  {
 
     // copy vals
-    hipErrchk(hipMemcpyAsync(vals_begin, d_vals_out, len*sizeof(V), hipMemcpyDefault, stream));
+    hipErrchk(hipMemcpyAsync(vals_begin, d_vals_out, len * sizeof(V),
+                             hipMemcpyDefault, stream));
   }
 
   hip::device_mempool_type::getInstance().free(d_keys_out);
@@ -599,40 +587,50 @@ stable_pairs(
   return resources::EventProxy<resources::Hip>(hip_res);
 }
 
-
 /*!
         \brief static assert unimplemented sort pairs
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
-          typename KeyIter, typename ValIter, typename Compare>
-concepts::enable_if_t<resources::EventProxy<resources::Hip>,
-                      concepts::negate<concepts::all_of<
-                        type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
-                        std::is_pointer<KeyIter>,
-                        std::is_pointer<ValIter>,
-                        concepts::any_of<
-                          camp::is_same<Compare, operators::less<RAJA::detail::IterVal<KeyIter>>>,
-                          camp::is_same<Compare, operators::greater<RAJA::detail::IterVal<KeyIter>>>>>>>
+template<typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         bool Async,
+         typename KeyIter,
+         typename ValIter,
+         typename Compare>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Hip>,
+    concepts::negate<concepts::all_of<
+        type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
+        std::is_pointer<KeyIter>,
+        std::is_pointer<ValIter>,
+        concepts::any_of<
+            camp::is_same<Compare,
+                          operators::less<RAJA::detail::IterVal<KeyIter>>>,
+            camp::is_same<
+                Compare,
+                operators::greater<RAJA::detail::IterVal<KeyIter>>>>>>>
 unstable_pairs(
     resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async>,
     KeyIter,
     KeyIter,
     ValIter,
     Compare)
 {
-  static_assert (std::is_pointer<KeyIter>::value,
-      "sort_pairs<hip_exec> is only implemented for pointers");
-  static_assert (std::is_pointer<ValIter>::value,
-      "sort_pairs<hip_exec> is only implemented for pointers");
+  static_assert(std::is_pointer<KeyIter>::value,
+                "sort_pairs<hip_exec> is only implemented for pointers");
+  static_assert(std::is_pointer<ValIter>::value,
+                "sort_pairs<hip_exec> is only implemented for pointers");
   using K = RAJA::detail::IterVal<KeyIter>;
-  static_assert (type_traits::is_arithmetic<K>::value,
+  static_assert(
+      type_traits::is_arithmetic<K>::value,
       "sort_pairs<hip_exec> is only implemented for arithmetic types");
-  static_assert (concepts::any_of<
-      camp::is_same<Compare, operators::less<K>>,
-      camp::is_same<Compare, operators::greater<K>>>::value,
-      "sort_pairs<hip_exec> is only implemented for RAJA::operators::less or RAJA::operators::greater");
+  static_assert(
+      concepts::any_of<camp::is_same<Compare, operators::less<K>>,
+                       camp::is_same<Compare, operators::greater<K>>>::value,
+      "sort_pairs<hip_exec> is only implemented for RAJA::operators::less or "
+      "RAJA::operators::greater");
 
   return resources::EventProxy<resources::Hip>(hip_res);
 }
@@ -640,16 +638,21 @@ unstable_pairs(
 /*!
         \brief stable sort given range of pairs in ascending order of keys
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
-          typename KeyIter, typename ValIter>
-concepts::enable_if_t<resources::EventProxy<resources::Hip>,
-                      type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
-                      std::is_pointer<KeyIter>,
-                      std::is_pointer<ValIter>>
+template<typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         bool Async,
+         typename KeyIter,
+         typename ValIter>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Hip>,
+    type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
+    std::is_pointer<KeyIter>,
+    std::is_pointer<ValIter>>
 unstable_pairs(
     resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async> p,
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async> p,
     KeyIter keys_begin,
     KeyIter keys_end,
     ValIter vals_begin,
@@ -661,16 +664,21 @@ unstable_pairs(
 /*!
         \brief stable sort given range of pairs in descending order of keys
 */
-template <typename IterationMapping, typename IterationGetter,
-          typename Concretizer, bool Async,
-          typename KeyIter, typename ValIter>
-concepts::enable_if_t<resources::EventProxy<resources::Hip>,
-                      type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
-                      std::is_pointer<KeyIter>,
-                      std::is_pointer<ValIter>>
+template<typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         bool Async,
+         typename KeyIter,
+         typename ValIter>
+concepts::enable_if_t<
+    resources::EventProxy<resources::Hip>,
+    type_traits::is_arithmetic<RAJA::detail::IterVal<KeyIter>>,
+    std::is_pointer<KeyIter>,
+    std::is_pointer<ValIter>>
 unstable_pairs(
     resources::Hip hip_res,
-    ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async> p,
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async> p,
     KeyIter keys_begin,
     KeyIter keys_end,
     ValIter vals_begin,
diff --git a/include/RAJA/policy/openmp.hpp b/include/RAJA/policy/openmp.hpp
index fc29dabcbf..89a7997b31 100644
--- a/include/RAJA/policy/openmp.hpp
+++ b/include/RAJA/policy/openmp.hpp
@@ -30,7 +30,7 @@
 #include <thread>
 
 #if !defined(RAJA_ENABLE_DESUL_ATOMICS)
-    #include "RAJA/policy/openmp/atomic.hpp"
+#include "RAJA/policy/openmp/atomic.hpp"
 #endif
 
 #include "RAJA/policy/openmp/forall.hpp"
diff --git a/include/RAJA/policy/openmp/WorkGroup/Dispatcher.hpp b/include/RAJA/policy/openmp/WorkGroup/Dispatcher.hpp
index 09861941ab..a477710cb2 100644
--- a/include/RAJA/policy/openmp/WorkGroup/Dispatcher.hpp
+++ b/include/RAJA/policy/openmp/WorkGroup/Dispatcher.hpp
@@ -24,7 +24,6 @@
 
 #include "RAJA/policy/sequential/WorkGroup/Dispatcher.hpp"
 
-
 namespace RAJA
 {
 
@@ -32,12 +31,12 @@ namespace detail
 {
 
 /*!
-* Populate and return a Dispatcher object
-*/
-template < typename T, typename Dispatcher_T >
+ * Populate and return a Dispatcher object
+ */
+template<typename T, typename Dispatcher_T>
 inline const Dispatcher_T* get_Dispatcher(omp_work const&)
 {
-  return get_Dispatcher<T, Dispatcher_T>(seq_work{});
+  return get_Dispatcher<T, Dispatcher_T>(seq_work {});
 }
 
 }  // namespace detail
diff --git a/include/RAJA/policy/openmp/WorkGroup/WorkRunner.hpp b/include/RAJA/policy/openmp/WorkGroup/WorkRunner.hpp
index c889273a0f..e3116b1b82 100644
--- a/include/RAJA/policy/openmp/WorkGroup/WorkRunner.hpp
+++ b/include/RAJA/policy/openmp/WorkGroup/WorkRunner.hpp
@@ -24,7 +24,6 @@
 
 #include "RAJA/pattern/WorkGroup/WorkRunner.hpp"
 
-
 namespace RAJA
 {
 
@@ -35,51 +34,47 @@ namespace detail
  * Runs work in a storage container in order
  * and returns any per run resources
  */
-template <typename DISPATCH_POLICY_T,
-          typename ALLOCATOR_T,
-          typename INDEX_T,
-          typename ... Args>
-struct WorkRunner<
-        RAJA::omp_work,
-        RAJA::ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-    : WorkRunnerForallOrdered<
-        RAJA::omp_parallel_for_exec,
-        RAJA::omp_work,
-        RAJA::ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-{ };
+template<typename DISPATCH_POLICY_T,
+         typename ALLOCATOR_T,
+         typename INDEX_T,
+         typename... Args>
+struct WorkRunner<RAJA::omp_work,
+                  RAJA::ordered,
+                  DISPATCH_POLICY_T,
+                  ALLOCATOR_T,
+                  INDEX_T,
+                  Args...>
+    : WorkRunnerForallOrdered<RAJA::omp_parallel_for_exec,
+                              RAJA::omp_work,
+                              RAJA::ordered,
+                              DISPATCH_POLICY_T,
+                              ALLOCATOR_T,
+                              INDEX_T,
+                              Args...>
+{};
 
 /*!
  * Runs work in a storage container in reverse order
  * and returns any per run resources
  */
-template <typename DISPATCH_POLICY_T,
-          typename ALLOCATOR_T,
-          typename INDEX_T,
-          typename ... Args>
-struct WorkRunner<
-        RAJA::omp_work,
-        RAJA::reverse_ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-    : WorkRunnerForallReverse<
-        RAJA::omp_parallel_for_exec,
-        RAJA::omp_work,
-        RAJA::reverse_ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-{ };
+template<typename DISPATCH_POLICY_T,
+         typename ALLOCATOR_T,
+         typename INDEX_T,
+         typename... Args>
+struct WorkRunner<RAJA::omp_work,
+                  RAJA::reverse_ordered,
+                  DISPATCH_POLICY_T,
+                  ALLOCATOR_T,
+                  INDEX_T,
+                  Args...>
+    : WorkRunnerForallReverse<RAJA::omp_parallel_for_exec,
+                              RAJA::omp_work,
+                              RAJA::reverse_ordered,
+                              DISPATCH_POLICY_T,
+                              ALLOCATOR_T,
+                              INDEX_T,
+                              Args...>
+{};
 
 }  // namespace detail
 
diff --git a/include/RAJA/policy/openmp/atomic.hpp b/include/RAJA/policy/openmp/atomic.hpp
index 2dc047dd95..8dd439f964 100644
--- a/include/RAJA/policy/openmp/atomic.hpp
+++ b/include/RAJA/policy/openmp/atomic.hpp
@@ -26,7 +26,6 @@
 
 #include "RAJA/util/macros.hpp"
 
-
 namespace RAJA
 {
 
@@ -34,9 +33,8 @@ namespace RAJA
 #if !defined(RAJA_COMPILER_MSVC)
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicLoad(omp_atomic, T *acc)
+template<typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicLoad(omp_atomic, T* acc)
 {
   T ret;
 #pragma omp atomic capture
@@ -48,23 +46,21 @@ RAJA_INLINE T atomicLoad(omp_atomic, T *acc)
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE void atomicStore(omp_atomic, T *acc, T value)
+template<typename T>
+RAJA_HOST_DEVICE RAJA_INLINE void atomicStore(omp_atomic, T* acc, T value)
 {
   T ret;
 #pragma omp atomic capture
   {
-    ret = *acc;
+    ret  = *acc;
     *acc = value;
   }
   RAJA_UNUSED_VAR(ret);
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicAdd(omp_atomic, T *acc, T value)
+template<typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicAdd(omp_atomic, T* acc, T value)
 {
   T old;
 #pragma omp atomic capture
@@ -75,11 +71,9 @@ RAJA_INLINE T atomicAdd(omp_atomic, T *acc, T value)
   return old;
 }
 
-
 RAJA_SUPPRESS_HD_WARN
-template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicSub(omp_atomic, T *acc, T value)
+template<typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicSub(omp_atomic, T* acc, T value)
 {
   T old;
 #pragma omp atomic capture
@@ -90,18 +84,16 @@ RAJA_INLINE T atomicSub(omp_atomic, T *acc, T value)
   return old;
 }
 
-
 RAJA_SUPPRESS_HD_WARN
-template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicMin(omp_atomic, T *acc, T value)
+template<typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicMin(omp_atomic, T* acc, T value)
 {
 #if _OPENMP >= 202011
   T old;
-  #pragma omp atomic capture compare
+#pragma omp atomic capture compare
   {
     old = *acc;
-    if ( value < *acc )
+    if (value < *acc)
     {
       *acc = value;
     }
@@ -109,21 +101,20 @@ RAJA_INLINE T atomicMin(omp_atomic, T *acc, T value)
   return old;
 #else
   // OpenMP doesn't define atomic ternary operators so use builtin atomics
-  return atomicMin(builtin_atomic{}, acc, value);
+  return atomicMin(builtin_atomic {}, acc, value);
 #endif
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicMax(omp_atomic, T *acc, T value)
+template<typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicMax(omp_atomic, T* acc, T value)
 {
 #if _OPENMP >= 202011
   T old;
-  #pragma omp atomic capture compare
+#pragma omp atomic capture compare
   {
     old = *acc;
-    if ( value > *acc )
+    if (value > *acc)
     {
       *acc = value;
     }
@@ -131,15 +122,13 @@ RAJA_INLINE T atomicMax(omp_atomic, T *acc, T value)
   return old;
 #else
   // OpenMP doesn't define atomic ternary operators so use builtin atomics
-  return atomicMax(builtin_atomic{}, acc, value);
+  return atomicMax(builtin_atomic {}, acc, value);
 #endif
 }
 
-
 RAJA_SUPPRESS_HD_WARN
-template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicInc(omp_atomic, T *acc)
+template<typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicInc(omp_atomic, T* acc)
 {
   T old;
 #pragma omp atomic capture
@@ -150,21 +139,17 @@ RAJA_INLINE T atomicInc(omp_atomic, T *acc)
   return old;
 }
 
-
 RAJA_SUPPRESS_HD_WARN
-template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicInc(omp_atomic, T *acc, T value)
+template<typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicInc(omp_atomic, T* acc, T value)
 {
   // OpenMP doesn't define needed operations, so use builtin atomics
-  return RAJA::atomicInc(builtin_atomic{}, acc, value);
+  return RAJA::atomicInc(builtin_atomic {}, acc, value);
 }
 
-
 RAJA_SUPPRESS_HD_WARN
-template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicDec(omp_atomic, T *acc)
+template<typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicDec(omp_atomic, T* acc)
 {
   T old;
 #pragma omp atomic capture
@@ -175,20 +160,17 @@ RAJA_INLINE T atomicDec(omp_atomic, T *acc)
   return old;
 }
 
-
 RAJA_SUPPRESS_HD_WARN
-template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicDec(omp_atomic, T *acc, T value)
+template<typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicDec(omp_atomic, T* acc, T value)
 {
   // OpenMP doesn't define needed operations, so use builtin atomics
-  return RAJA::atomicDec(builtin_atomic{}, acc, value);
+  return RAJA::atomicDec(builtin_atomic {}, acc, value);
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicAnd(omp_atomic, T *acc, T value)
+template<typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicAnd(omp_atomic, T* acc, T value)
 {
   T old;
 #pragma omp atomic capture
@@ -200,9 +182,8 @@ RAJA_INLINE T atomicAnd(omp_atomic, T *acc, T value)
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicOr(omp_atomic, T *acc, T value)
+template<typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicOr(omp_atomic, T* acc, T value)
 {
   T old;
 #pragma omp atomic capture
@@ -214,9 +195,8 @@ RAJA_INLINE T atomicOr(omp_atomic, T *acc, T value)
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicXor(omp_atomic, T *acc, T value)
+template<typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicXor(omp_atomic, T* acc, T value)
 {
   T old;
 #pragma omp atomic capture
@@ -228,29 +208,27 @@ RAJA_INLINE T atomicXor(omp_atomic, T *acc, T value)
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicExchange(omp_atomic, T *acc, T value)
+template<typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicExchange(omp_atomic, T* acc, T value)
 {
   T old;
 #pragma omp atomic capture
   {
-    old = *acc;  // capture old for return value
+    old  = *acc;  // capture old for return value
     *acc = value;
   }
   return old;
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicCAS(omp_atomic, T *acc, T compare, T value)
+template<typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicCAS(omp_atomic, T* acc, T compare, T value)
 {
   // OpenMP doesn't define atomic ternary operators so use builtin atomics
-  return RAJA::atomicCAS(builtin_atomic{}, acc, compare, value);
+  return RAJA::atomicCAS(builtin_atomic {}, acc, compare, value);
 }
 
-#endif // not defined RAJA_COMPILER_MSVC
+#endif  // not defined RAJA_COMPILER_MSVC
 
 
 }  // namespace RAJA
diff --git a/include/RAJA/policy/openmp/forall.hpp b/include/RAJA/policy/openmp/forall.hpp
index 815168ae98..2f0d46e736 100644
--- a/include/RAJA/policy/openmp/forall.hpp
+++ b/include/RAJA/policy/openmp/forall.hpp
@@ -55,12 +55,14 @@ namespace policy
 namespace omp
 {
 
-template <typename Iterable, typename Func, typename InnerPolicy, typename ForallParam>
-RAJA_INLINE
-concepts::enable_if_t<
-  resources::EventProxy<resources::Host>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
+template<typename Iterable,
+         typename Func,
+         typename InnerPolicy,
+         typename ForallParam>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Host>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
 forall_impl(resources::Host host_res,
             const omp_parallel_exec<InnerPolicy>&,
             Iterable&& iter,
@@ -70,12 +72,11 @@ forall_impl(resources::Host host_res,
   RAJA::region<RAJA::omp_parallel_region>([&]() {
     using RAJA::internal::thread_privatize;
     auto body = thread_privatize(loop_body);
-    forall_impl(host_res, InnerPolicy{}, iter, body.get_priv(), f_params);
+    forall_impl(host_res, InnerPolicy {}, iter, body.get_priv(), f_params);
   });
   return resources::EventProxy<resources::Host>(host_res);
 }
 
-
 ///
 /// OpenMP parallel for schedule policy implementation
 ///
@@ -83,249 +84,285 @@ forall_impl(resources::Host host_res,
 namespace internal
 {
 
-  /// Tag dispatch for omp forall
-
-  //
-  // omp for (Auto)
-  //
-  template <typename Iterable, typename Func>
-  RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Auto&,
-                               Iterable&& iter,
-                               Func&& loop_body)
-  {
-    RAJA_EXTRACT_BED_IT(iter);
-    #pragma omp for
-    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-      loop_body(begin_it[i]);
-    }
-  }
+/// Tag dispatch for omp forall
 
-  //
-  // omp for schedule(static)
-  //
-  template <typename Iterable, typename Func, int ChunkSize,
-    typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
-  RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Static<ChunkSize>&,
-                               Iterable&& iter,
-                               Func&& loop_body)
+//
+// omp for (Auto)
+//
+template<typename Iterable, typename Func>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Auto&,
+                             Iterable&& iter,
+                             Func&& loop_body)
+{
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp for
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
   {
-    RAJA_EXTRACT_BED_IT(iter);
-    #pragma omp for schedule(static)
-    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-      loop_body(begin_it[i]);
-    }
+    loop_body(begin_it[i]);
   }
+}
 
-  //
-  // omp for schedule(static, ChunkSize)
-  //
-  template <typename Iterable, typename Func, int ChunkSize,
-    typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
-  RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Static<ChunkSize>&,
-                               Iterable&& iter,
-                               Func&& loop_body)
+//
+// omp for schedule(static)
+//
+template<typename Iterable,
+         typename Func,
+         int ChunkSize,
+         typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Static<ChunkSize>&,
+                             Iterable&& iter,
+                             Func&& loop_body)
+{
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp for schedule(static)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
   {
-    RAJA_EXTRACT_BED_IT(iter);
-    #pragma omp for schedule(static, ChunkSize)
-    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-      loop_body(begin_it[i]);
-    }
+    loop_body(begin_it[i]);
   }
+}
 
-  //
-  // omp for schedule(dynamic)
-  //
-  template <typename Iterable, typename Func, int ChunkSize,
-    typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
-  RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Dynamic<ChunkSize>&,
-                               Iterable&& iter,
-                               Func&& loop_body)
+//
+// omp for schedule(static, ChunkSize)
+//
+template<typename Iterable,
+         typename Func,
+         int ChunkSize,
+         typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Static<ChunkSize>&,
+                             Iterable&& iter,
+                             Func&& loop_body)
+{
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp for schedule(static, ChunkSize)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
   {
-    RAJA_EXTRACT_BED_IT(iter);
-    #pragma omp for schedule(dynamic)
-    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-      loop_body(begin_it[i]);
-    }
+    loop_body(begin_it[i]);
   }
+}
 
-  //
-  // omp for schedule(dynamic, ChunkSize)
-  //
-  template <typename Iterable, typename Func, int ChunkSize,
-    typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
-  RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Dynamic<ChunkSize>&,
-                               Iterable&& iter,
-                               Func&& loop_body)
+//
+// omp for schedule(dynamic)
+//
+template<typename Iterable,
+         typename Func,
+         int ChunkSize,
+         typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Dynamic<ChunkSize>&,
+                             Iterable&& iter,
+                             Func&& loop_body)
+{
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp for schedule(dynamic)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
   {
-    RAJA_EXTRACT_BED_IT(iter);
-    #pragma omp for schedule(dynamic, ChunkSize)
-    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-      loop_body(begin_it[i]);
-    }
+    loop_body(begin_it[i]);
   }
+}
 
-  //
-  // omp for schedule(guided)
-  //
-  template <typename Iterable, typename Func, int ChunkSize,
-    typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
-  RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Guided<ChunkSize>&,
-                               Iterable&& iter,
-                               Func&& loop_body)
+//
+// omp for schedule(dynamic, ChunkSize)
+//
+template<typename Iterable,
+         typename Func,
+         int ChunkSize,
+         typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Dynamic<ChunkSize>&,
+                             Iterable&& iter,
+                             Func&& loop_body)
+{
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp for schedule(dynamic, ChunkSize)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
   {
-    RAJA_EXTRACT_BED_IT(iter);
-    #pragma omp for schedule(guided)
-    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-      loop_body(begin_it[i]);
-    }
+    loop_body(begin_it[i]);
   }
+}
 
-  //
-  // omp for schedule(guided, ChunkSize)
-  //
-  template <typename Iterable, typename Func, int ChunkSize,
-    typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
-  RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Guided<ChunkSize>&,
-                               Iterable&& iter,
-                               Func&& loop_body)
+//
+// omp for schedule(guided)
+//
+template<typename Iterable,
+         typename Func,
+         int ChunkSize,
+         typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Guided<ChunkSize>&,
+                             Iterable&& iter,
+                             Func&& loop_body)
+{
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp for schedule(guided)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
   {
-    RAJA_EXTRACT_BED_IT(iter);
-    #pragma omp for schedule(guided, ChunkSize)
-    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-      loop_body(begin_it[i]);
-    }
+    loop_body(begin_it[i]);
   }
+}
 
-  //
-  // omp for schedule(runtime)
-  //
-  template <typename Iterable, typename Func>
-  RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Runtime&,
-                               Iterable&& iter,
-                               Func&& loop_body)
+//
+// omp for schedule(guided, ChunkSize)
+//
+template<typename Iterable,
+         typename Func,
+         int ChunkSize,
+         typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Guided<ChunkSize>&,
+                             Iterable&& iter,
+                             Func&& loop_body)
+{
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp for schedule(guided, ChunkSize)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
   {
-    RAJA_EXTRACT_BED_IT(iter);
-    #pragma omp for schedule(runtime)
-    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-      loop_body(begin_it[i]);
-    }
+    loop_body(begin_it[i]);
   }
+}
 
-  // TODO :: not implemented in forall param interface ...
-  #if !defined(RAJA_COMPILER_MSVC)
-  // dynamic & guided
-  template <typename Policy, typename Iterable, typename Func>
-  RAJA_INLINE void forall_impl(const Policy&,
-                               Iterable&& iter,
-                               Func&& loop_body)
+//
+// omp for schedule(runtime)
+//
+template<typename Iterable, typename Func>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Runtime&,
+                             Iterable&& iter,
+                             Func&& loop_body)
+{
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp for schedule(runtime)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
   {
-    omp_sched_t prev_sched;
-    int prev_chunk;
-    omp_get_schedule(&prev_sched, &prev_chunk);
-    omp_set_schedule(Policy::schedule, Policy::chunk_size);
-    forall_impl(::RAJA::policy::omp::Runtime{}, std::forward<Iterable>(iter), std::forward<Func>(loop_body));
-    omp_set_schedule(prev_sched, prev_chunk);
+    loop_body(begin_it[i]);
   }
-  #endif
+}
 
+// TODO :: not implemented in forall param interface ...
+#if !defined(RAJA_COMPILER_MSVC)
+// dynamic & guided
+template<typename Policy, typename Iterable, typename Func>
+RAJA_INLINE void forall_impl(const Policy&, Iterable&& iter, Func&& loop_body)
+{
+  omp_sched_t prev_sched;
+  int prev_chunk;
+  omp_get_schedule(&prev_sched, &prev_chunk);
+  omp_set_schedule(Policy::schedule, Policy::chunk_size);
+  forall_impl(::RAJA::policy::omp::Runtime {}, std::forward<Iterable>(iter),
+              std::forward<Func>(loop_body));
+  omp_set_schedule(prev_sched, prev_chunk);
+}
+#endif
 
-  /// Tag dispatch for omp forall with nowait
 
-  //
-  // omp for nowait (Auto)
-  //
-  template <typename Iterable, typename Func>
-  RAJA_INLINE void forall_impl_nowait(const ::RAJA::policy::omp::Auto&,
-                               Iterable&& iter,
-                               Func&& loop_body)
-  {
-    RAJA_EXTRACT_BED_IT(iter);
-    #pragma omp for nowait
-    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-      loop_body(begin_it[i]);
-    }
-  }
+/// Tag dispatch for omp forall with nowait
 
-  //
-  // omp for schedule(static) nowait
-  //
-  template <typename Iterable, typename Func, int ChunkSize,
-    typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
-  RAJA_INLINE void forall_impl_nowait(const ::RAJA::policy::omp::Static<ChunkSize>&,
-                               Iterable&& iter,
-                               Func&& loop_body)
+//
+// omp for nowait (Auto)
+//
+template<typename Iterable, typename Func>
+RAJA_INLINE void forall_impl_nowait(const ::RAJA::policy::omp::Auto&,
+                                    Iterable&& iter,
+                                    Func&& loop_body)
+{
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp for nowait
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
   {
-    RAJA_EXTRACT_BED_IT(iter);
-    #pragma omp for schedule(static) nowait
-    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-      loop_body(begin_it[i]);
-    }
+    loop_body(begin_it[i]);
   }
+}
 
-  //
-  // omp for schedule(static, ChunkSize) nowait
-  //
-  template <typename Iterable, typename Func, int ChunkSize,
-    typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
-  RAJA_INLINE void forall_impl_nowait(const ::RAJA::policy::omp::Static<ChunkSize>&,
-                               Iterable&& iter,
-                               Func&& loop_body)
+//
+// omp for schedule(static) nowait
+//
+template<typename Iterable,
+         typename Func,
+         int ChunkSize,
+         typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl_nowait(
+    const ::RAJA::policy::omp::Static<ChunkSize>&,
+    Iterable&& iter,
+    Func&& loop_body)
+{
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp for schedule(static) nowait
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
   {
-    RAJA_EXTRACT_BED_IT(iter);
-    #pragma omp for schedule(static, ChunkSize) nowait
-    for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-      loop_body(begin_it[i]);
-    }
+    loop_body(begin_it[i]);
   }
+}
 
-  //TODO :: not implemented in param interface...
-  #if !defined(RAJA_COMPILER_MSVC)
-  // dynamic & guided
-  template <typename Policy, typename Iterable, typename Func>
-  RAJA_INLINE void forall_impl_nowait(const Policy&,
-                               Iterable&& iter,
-                               Func&& loop_body)
+//
+// omp for schedule(static, ChunkSize) nowait
+//
+template<typename Iterable,
+         typename Func,
+         int ChunkSize,
+         typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl_nowait(
+    const ::RAJA::policy::omp::Static<ChunkSize>&,
+    Iterable&& iter,
+    Func&& loop_body)
+{
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp for schedule(static, ChunkSize) nowait
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
   {
-    omp_sched_t prev_sched;
-    int prev_chunk;
-    omp_get_schedule(&prev_sched, &prev_chunk);
-    omp_set_schedule(Policy::schedule, Policy::chunk_size);
-    forall_impl_nowait(::RAJA::policy::omp::Runtime{}, std::forward<Iterable>(iter), std::forward<Func>(loop_body));
-    omp_set_schedule(prev_sched, prev_chunk);
+    loop_body(begin_it[i]);
   }
-  #endif
-
-} // end namespace internal
+}
 
-template <typename Schedule, typename Iterable, typename Func, typename ForallParam>
-RAJA_INLINE
-concepts::enable_if_t<
-  resources::EventProxy<resources::Host>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
+// TODO :: not implemented in param interface...
+#if !defined(RAJA_COMPILER_MSVC)
+// dynamic & guided
+template<typename Policy, typename Iterable, typename Func>
+RAJA_INLINE void forall_impl_nowait(const Policy&,
+                                    Iterable&& iter,
+                                    Func&& loop_body)
+{
+  omp_sched_t prev_sched;
+  int prev_chunk;
+  omp_get_schedule(&prev_sched, &prev_chunk);
+  omp_set_schedule(Policy::schedule, Policy::chunk_size);
+  forall_impl_nowait(::RAJA::policy::omp::Runtime {},
+                     std::forward<Iterable>(iter),
+                     std::forward<Func>(loop_body));
+  omp_set_schedule(prev_sched, prev_chunk);
+}
+#endif
+
+}  // end namespace internal
+
+template<typename Schedule,
+         typename Iterable,
+         typename Func,
+         typename ForallParam>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Host>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
 forall_impl(resources::Host host_res,
             const omp_for_schedule_exec<Schedule>&,
             Iterable&& iter,
             Func&& loop_body,
             ForallParam)
 {
-  internal::forall_impl(Schedule{}, std::forward<Iterable>(iter), std::forward<Func>(loop_body));
+  internal::forall_impl(Schedule {}, std::forward<Iterable>(iter),
+                        std::forward<Func>(loop_body));
   return resources::EventProxy<resources::Host>(host_res);
 }
 
-template <typename Schedule, typename Iterable, typename Func, typename ForallParam>
-RAJA_INLINE
-concepts::enable_if_t<
-  resources::EventProxy<resources::Host>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
+template<typename Schedule,
+         typename Iterable,
+         typename Func,
+         typename ForallParam>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Host>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
 forall_impl(resources::Host host_res,
             const omp_for_nowait_schedule_exec<Schedule>&,
             Iterable&& iter,
             Func&& loop_body,
             ForallParam)
 {
-  internal::forall_impl_nowait(Schedule{}, std::forward<Iterable>(iter), std::forward<Func>(loop_body));
+  internal::forall_impl_nowait(Schedule {}, std::forward<Iterable>(iter),
+                               std::forward<Func>(loop_body));
   return resources::EventProxy<resources::Host>(host_res);
 }
 
diff --git a/include/RAJA/policy/openmp/kernel/Collapse.hpp b/include/RAJA/policy/openmp/kernel/Collapse.hpp
index ba71ac2fbf..db37f58552 100644
--- a/include/RAJA/policy/openmp/kernel/Collapse.hpp
+++ b/include/RAJA/policy/openmp/kernel/Collapse.hpp
@@ -38,8 +38,8 @@ namespace RAJA
 struct omp_parallel_collapse_exec
     : make_policy_pattern_t<RAJA::Policy::openmp,
                             RAJA::Pattern::forall,
-                            RAJA::policy::omp::For> {
-};
+                            RAJA::policy::omp::For>
+{};
 
 namespace internal
 {
@@ -48,13 +48,18 @@ namespace internal
 // Collapsing two loops
 /////////
 
-template <camp::idx_t Arg0, camp::idx_t Arg1, typename... EnclosedStmts, typename Types>
+template<camp::idx_t Arg0,
+         camp::idx_t Arg1,
+         typename... EnclosedStmts,
+         typename Types>
 struct StatementExecutor<statement::Collapse<omp_parallel_collapse_exec,
                                              ArgList<Arg0, Arg1>,
-                                             EnclosedStmts...>, Types> {
+                                             EnclosedStmts...>,
+                         Types>
+{
 
 
-  template <typename Data>
+  template<typename Data>
   static RAJA_INLINE void exec(Data&& data)
   {
     const auto l0 = segment_length<Arg0>(data);
@@ -71,39 +76,43 @@ struct StatementExecutor<statement::Collapse<omp_parallel_collapse_exec,
 
     using RAJA::internal::thread_privatize;
     auto privatizer = thread_privatize(data);
-#pragma omp parallel for private(i0, i1) firstprivate(privatizer) \
+#pragma omp parallel for private(i0, i1) firstprivate(privatizer)              \
     RAJA_COLLAPSE(2)
-    for (i0 = 0; i0 < l0; ++i0) {
-      for (i1 = 0; i1 < l1; ++i1) {
+    for (i0 = 0; i0 < l0; ++i0)
+    {
+      for (i1 = 0; i1 < l1; ++i1)
+      {
         auto& private_data = privatizer.get_priv();
         private_data.template assign_offset<Arg0>(i0);
         private_data.template assign_offset<Arg1>(i1);
-        execute_statement_list<camp::list<EnclosedStmts...>, NewTypes1>(private_data);
+        execute_statement_list<camp::list<EnclosedStmts...>, NewTypes1>(
+            private_data);
       }
     }
   }
 };
 
-
-template <camp::idx_t Arg0,
-          camp::idx_t Arg1,
-          camp::idx_t Arg2,
-          typename... EnclosedStmts,
-          typename Types>
+template<camp::idx_t Arg0,
+         camp::idx_t Arg1,
+         camp::idx_t Arg2,
+         typename... EnclosedStmts,
+         typename Types>
 struct StatementExecutor<statement::Collapse<omp_parallel_collapse_exec,
                                              ArgList<Arg0, Arg1, Arg2>,
-                                             EnclosedStmts...>, Types> {
+                                             EnclosedStmts...>,
+                         Types>
+{
 
 
-  template <typename Data>
+  template<typename Data>
   static RAJA_INLINE void exec(Data&& data)
   {
     const auto l0 = segment_length<Arg0>(data);
     const auto l1 = segment_length<Arg1>(data);
     const auto l2 = segment_length<Arg2>(data);
-    auto i0 = l0;
-    auto i1 = l1;
-    auto i2 = l2;
+    auto i0       = l0;
+    auto i1       = l1;
+    auto i2       = l2;
 
     // Set the argument types for this loop
     using NewTypes0 = setSegmentTypeFromData<Types, Arg0, Data>;
@@ -112,16 +121,20 @@ struct StatementExecutor<statement::Collapse<omp_parallel_collapse_exec,
 
     using RAJA::internal::thread_privatize;
     auto privatizer = thread_privatize(data);
-#pragma omp parallel for private(i0, i1, i2) firstprivate(privatizer) \
+#pragma omp parallel for private(i0, i1, i2) firstprivate(privatizer)          \
     RAJA_COLLAPSE(3)
-    for (i0 = 0; i0 < l0; ++i0) {
-      for (i1 = 0; i1 < l1; ++i1) {
-        for (i2 = 0; i2 < l2; ++i2) {
+    for (i0 = 0; i0 < l0; ++i0)
+    {
+      for (i1 = 0; i1 < l1; ++i1)
+      {
+        for (i2 = 0; i2 < l2; ++i2)
+        {
           auto& private_data = privatizer.get_priv();
           private_data.template assign_offset<Arg0>(i0);
           private_data.template assign_offset<Arg1>(i1);
           private_data.template assign_offset<Arg2>(i2);
-          execute_statement_list<camp::list<EnclosedStmts...>, NewTypes2>(private_data);
+          execute_statement_list<camp::list<EnclosedStmts...>, NewTypes2>(
+              private_data);
         }
       }
     }
@@ -129,9 +142,6 @@ struct StatementExecutor<statement::Collapse<omp_parallel_collapse_exec,
 };
 
 
-
-
-
 }  // namespace internal
 }  // namespace RAJA
 
diff --git a/include/RAJA/policy/openmp/kernel/OmpSyncThreads.hpp b/include/RAJA/policy/openmp/kernel/OmpSyncThreads.hpp
index 65f56010bc..eaeb84cbf0 100644
--- a/include/RAJA/policy/openmp/kernel/OmpSyncThreads.hpp
+++ b/include/RAJA/policy/openmp/kernel/OmpSyncThreads.hpp
@@ -29,39 +29,33 @@
 
 #include "RAJA/policy/openmp/policy.hpp"
 
-
-
 namespace RAJA
 {
 
 namespace statement
 {
-struct OmpSyncThreads : public internal::Statement<camp::nil> {
-};
+struct OmpSyncThreads : public internal::Statement<camp::nil>
+{};
 
-} // namespace statement
+}  // namespace statement
 
 namespace internal
 {
 
 
-
-//Statement executor to synchronize omp threads inside a kernel region
+// Statement executor that synchronizes OpenMP threads inside a kernel region
 template<typename Types>
-struct StatementExecutor<statement::OmpSyncThreads, Types> {
-
-template<typename Data>
-static RAJA_INLINE void exec(Data &&)
+struct StatementExecutor<statement::OmpSyncThreads, Types>
 {
-  #pragma omp barrier
-}
 
+  template<typename Data>
+  static RAJA_INLINE void exec(Data&&)
+  {
+#pragma omp barrier
+  }
 };
 
 
-
-
-
 }  // namespace internal
 }  // namespace RAJA
 
diff --git a/include/RAJA/policy/openmp/launch.hpp b/include/RAJA/policy/openmp/launch.hpp
index 7856bd6fda..976a9b8e06 100644
--- a/include/RAJA/policy/openmp/launch.hpp
+++ b/include/RAJA/policy/openmp/launch.hpp
@@ -24,49 +24,59 @@
 namespace RAJA
 {
 
-template <>
-struct LaunchExecute<RAJA::omp_launch_t> {
-
-  template <typename BODY, typename ReduceParams>
-  static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                               RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                               RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
-  exec(RAJA::resources::Resource res, LaunchParams const &params, const char *, BODY const &body, ReduceParams &RAJA_UNUSED_ARG(launch_reducers))
+template<>
+struct LaunchExecute<RAJA::omp_launch_t>
+{
+
+  template<typename BODY, typename ReduceParams>
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
+  exec(RAJA::resources::Resource res,
+       LaunchParams const& params,
+       const char*,
+       BODY const& body,
+       ReduceParams& RAJA_UNUSED_ARG(launch_reducers))
   {
     RAJA::region<RAJA::omp_parallel_region>([&]() {
+      LaunchContext ctx;
 
-        LaunchContext ctx;
-
-        using RAJA::internal::thread_privatize;
-        auto loop_body = thread_privatize(body);
+      using RAJA::internal::thread_privatize;
+      auto loop_body = thread_privatize(body);
 
-        ctx.shared_mem_ptr = (char*) malloc(params.shared_mem_size);
+      ctx.shared_mem_ptr = (char*)malloc(params.shared_mem_size);
 
-        loop_body.get_priv()(ctx);
+      loop_body.get_priv()(ctx);
 
-        free(ctx.shared_mem_ptr);
-        ctx.shared_mem_ptr = nullptr;
+      free(ctx.shared_mem_ptr);
+      ctx.shared_mem_ptr = nullptr;
     });
 
     return resources::EventProxy<resources::Resource>(res);
   }
 
   template<typename ReduceParams, typename BODY>
-    static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                                 RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                                 concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
-  exec(RAJA::resources::Resource res, LaunchParams const &launch_params,
-       const char *RAJA_UNUSED_ARG(kernel_name),  BODY const &body, ReduceParams &f_params)
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      concepts::negate<
+          RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
+  exec(RAJA::resources::Resource res,
+       LaunchParams const& launch_params,
+       const char* RAJA_UNUSED_ARG(kernel_name),
+       BODY const& body,
+       ReduceParams& f_params)
   {
 
     using EXEC_POL = RAJA::omp_launch_t;
 
     expt::ParamMultiplexer::init<EXEC_POL>(f_params);
 
-    //reducer object must be named f_params as expected by macro below
+    // Reducer object must be named f_params, as expected by the macro below.
     RAJA_OMP_DECLARE_REDUCTION_COMBINE;
 
-   #pragma omp parallel reduction(combine : f_params)
+#pragma omp parallel reduction(combine : f_params)
     {
 
       LaunchContext ctx;
@@ -74,7 +84,7 @@ struct LaunchExecute<RAJA::omp_launch_t> {
       using RAJA::internal::thread_privatize;
       auto loop_body = thread_privatize(body);
 
-      ctx.shared_mem_ptr = (char*) malloc(launch_params.shared_mem_size);
+      ctx.shared_mem_ptr = (char*)malloc(launch_params.shared_mem_size);
 
       expt::invoke_body(f_params, loop_body.get_priv(), ctx);
 
@@ -86,18 +96,17 @@ struct LaunchExecute<RAJA::omp_launch_t> {
 
     return resources::EventProxy<resources::Resource>(res);
   }
-
 };
 
+template<typename SEGMENT>
+struct LoopExecute<omp_parallel_for_exec, SEGMENT>
+{
 
-template <typename SEGMENT>
-struct LoopExecute<omp_parallel_for_exec, SEGMENT> {
-
-  template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_HOST_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+      SEGMENT const& segment,
+      BODY const& body)
   {
 
     int len = segment.end() - segment.begin();
@@ -105,18 +114,20 @@ struct LoopExecute<omp_parallel_for_exec, SEGMENT> {
       using RAJA::internal::thread_privatize;
       auto loop_body = thread_privatize(body);
 #pragma omp for
-      for (int i = 0; i < len; i++) {
+      for (int i = 0; i < len; i++)
+      {
 
         loop_body.get_priv()(*(segment.begin() + i));
       }
     });
   }
-  template <typename BODY>
+
+  template<typename BODY>
   static RAJA_INLINE RAJA_HOST_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+      SEGMENT const& segment0,
+      SEGMENT const& segment1,
+      BODY const& body)
   {
 
     const int len1 = segment1.end() - segment1.begin();
@@ -127,8 +138,10 @@ struct LoopExecute<omp_parallel_for_exec, SEGMENT> {
       auto loop_body = thread_privatize(body);
 
 #pragma omp for
-      for (int j = 0; j < len1; j++) {
-        for (int i = 0; i < len0; i++) {
+      for (int j = 0; j < len1; j++)
+      {
+        for (int i = 0; i < len0; i++)
+        {
 
           loop_body.get_priv()(*(segment0.begin() + i),
                                *(segment1.begin() + j));
@@ -137,13 +150,13 @@ struct LoopExecute<omp_parallel_for_exec, SEGMENT> {
     });
   }
 
-  template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_HOST_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+      SEGMENT const& segment0,
+      SEGMENT const& segment1,
+      SEGMENT const& segment2,
+      BODY const& body)
   {
 
     const int len2 = segment2.end() - segment2.begin();
@@ -155,9 +168,12 @@ struct LoopExecute<omp_parallel_for_exec, SEGMENT> {
       auto loop_body = thread_privatize(body);
 
 #pragma omp for
-      for (int k = 0; k < len2; k++) {
-        for (int j = 0; j < len1; j++) {
-          for (int i = 0; i < len0; i++) {
+      for (int k = 0; k < len2; k++)
+      {
+        for (int j = 0; j < len1; j++)
+        {
+          for (int i = 0; i < len0; i++)
+          {
             loop_body.get_priv()(*(segment0.begin() + i),
                                  *(segment1.begin() + j),
                                  *(segment2.begin() + k));
@@ -168,51 +184,55 @@ struct LoopExecute<omp_parallel_for_exec, SEGMENT> {
   }
 };
 
-template <typename SEGMENT>
-struct LoopExecute<omp_for_exec, SEGMENT> {
+template<typename SEGMENT>
+struct LoopExecute<omp_for_exec, SEGMENT>
+{
 
-  template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_HOST_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+      SEGMENT const& segment,
+      BODY const& body)
   {
 
     int len = segment.end() - segment.begin();
 #pragma omp for
-    for (int i = 0; i < len; i++) {
+    for (int i = 0; i < len; i++)
+    {
 
       body(*(segment.begin() + i));
     }
   }
 
-  template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_HOST_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+      SEGMENT const& segment0,
+      SEGMENT const& segment1,
+      BODY const& body)
   {
 
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
 
 #pragma omp for
-    for (int j = 0; j < len1; j++) {
-      for (int i = 0; i < len0; i++) {
+    for (int j = 0; j < len1; j++)
+    {
+      for (int i = 0; i < len0; i++)
+      {
 
         body(*(segment0.begin() + i), *(segment1.begin() + j));
       }
     }
   }
 
-  template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_HOST_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+      SEGMENT const& segment0,
+      SEGMENT const& segment1,
+      SEGMENT const& segment2,
+      BODY const& body)
   {
 
     const int len2 = segment2.end() - segment2.begin();
@@ -220,11 +240,13 @@ struct LoopExecute<omp_for_exec, SEGMENT> {
     const int len0 = segment0.end() - segment0.begin();
 
 #pragma omp for
-    for (int k = 0; k < len2; k++) {
-      for (int j = 0; j < len1; j++) {
-        for (int i = 0; i < len0; i++) {
-          body(*(segment0.begin() + i),
-               *(segment1.begin() + j),
+    for (int k = 0; k < len2; k++)
+    {
+      for (int j = 0; j < len1; j++)
+      {
+        for (int i = 0; i < len0; i++)
+        {
+          body(*(segment0.begin() + i), *(segment1.begin() + j),
                *(segment2.begin() + k));
         }
       }
@@ -235,54 +257,55 @@ struct LoopExecute<omp_for_exec, SEGMENT> {
 //
 // Return local index
 //
-template <typename SEGMENT>
-struct LoopICountExecute<omp_for_exec, SEGMENT> {
+template<typename SEGMENT>
+struct LoopICountExecute<omp_for_exec, SEGMENT>
+{
 
-  template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_HOST_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+      SEGMENT const& segment,
+      BODY const& body)
   {
 
     int len = segment.end() - segment.begin();
 
 #pragma omp for
-      for (int i = 0; i < len; i++) {
-        body(*(segment.begin() + i), i);
-      }
+    for (int i = 0; i < len; i++)
+    {
+      body(*(segment.begin() + i), i);
+    }
   }
 
-  template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_HOST_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+      SEGMENT const& segment0,
+      SEGMENT const& segment1,
+      BODY const& body)
   {
 
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
 
 #pragma omp for
-      for (int j = 0; j < len1; j++) {
-        for (int i = 0; i < len0; i++) {
+    for (int j = 0; j < len1; j++)
+    {
+      for (int i = 0; i < len0; i++)
+      {
 
-          body(*(segment0.begin() + i),
-               *(segment1.begin() + j),
-               i,
-               j);
-        }
+        body(*(segment0.begin() + i), *(segment1.begin() + j), i, j);
       }
+    }
   }
 
-  template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_HOST_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+      SEGMENT const& segment0,
+      SEGMENT const& segment1,
+      SEGMENT const& segment2,
+      BODY const& body)
   {
 
     const int len2 = segment2.end() - segment2.begin();
@@ -290,33 +313,33 @@ struct LoopICountExecute<omp_for_exec, SEGMENT> {
     const int len0 = segment0.end() - segment0.begin();
 
 #pragma omp for
-      for (int k = 0; k < len2; k++) {
-        for (int j = 0; j < len1; j++) {
-          for (int i = 0; i < len0; i++) {
-            body(*(segment0.begin() + i),
-                 *(segment1.begin() + j),
-                 *(segment2.begin() + k),
-                 i,
-                 j,
-                 k);
-          }
+    for (int k = 0; k < len2; k++)
+    {
+      for (int j = 0; j < len1; j++)
+      {
+        for (int i = 0; i < len0; i++)
+        {
+          body(*(segment0.begin() + i), *(segment1.begin() + j),
+               *(segment2.begin() + k), i, j, k);
         }
       }
+    }
   }
 };
 
 // policy for perfectly nested loops
 struct omp_parallel_nested_for_exec;
 
-template <typename SEGMENT>
-struct LoopExecute<omp_parallel_nested_for_exec, SEGMENT> {
+template<typename SEGMENT>
+struct LoopExecute<omp_parallel_nested_for_exec, SEGMENT>
+{
 
-  template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_HOST_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+      SEGMENT const& segment0,
+      SEGMENT const& segment1,
+      BODY const& body)
   {
 
     const int len1 = segment1.end() - segment1.begin();
@@ -327,8 +350,10 @@ struct LoopExecute<omp_parallel_nested_for_exec, SEGMENT> {
       auto loop_body = thread_privatize(body);
 
 #pragma omp for RAJA_COLLAPSE(2)
-      for (int j = 0; j < len1; j++) {
-        for (int i = 0; i < len0; i++) {
+      for (int j = 0; j < len1; j++)
+      {
+        for (int i = 0; i < len0; i++)
+        {
 
           loop_body.get_priv()(*(segment0.begin() + i),
                                *(segment1.begin() + j));
@@ -337,13 +362,13 @@ struct LoopExecute<omp_parallel_nested_for_exec, SEGMENT> {
     });
   }
 
-  template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_HOST_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+      SEGMENT const& segment0,
+      SEGMENT const& segment1,
+      SEGMENT const& segment2,
+      BODY const& body)
   {
 
     const int len2 = segment2.end() - segment2.begin();
@@ -355,9 +380,12 @@ struct LoopExecute<omp_parallel_nested_for_exec, SEGMENT> {
       auto loop_body = thread_privatize(body);
 
 #pragma omp for RAJA_COLLAPSE(3)
-      for (int k = 0; k < len2; k++) {
-        for (int j = 0; j < len1; j++) {
-          for (int i = 0; i < len0; i++) {
+      for (int k = 0; k < len2; k++)
+      {
+        for (int j = 0; j < len1; j++)
+        {
+          for (int i = 0; i < len0; i++)
+          {
             loop_body.get_priv()(*(segment0.begin() + i),
                                  *(segment1.begin() + j),
                                  *(segment2.begin() + k));
@@ -369,15 +397,16 @@ struct LoopExecute<omp_parallel_nested_for_exec, SEGMENT> {
 };
 
 // Return local index
-template <typename SEGMENT>
-struct LoopICountExecute<omp_parallel_nested_for_exec, SEGMENT> {
+template<typename SEGMENT>
+struct LoopICountExecute<omp_parallel_nested_for_exec, SEGMENT>
+{
 
-  template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_HOST_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+      SEGMENT const& segment0,
+      SEGMENT const& segment1,
+      BODY const& body)
   {
 
     const int len1 = segment1.end() - segment1.begin();
@@ -388,25 +417,25 @@ struct LoopICountExecute<omp_parallel_nested_for_exec, SEGMENT> {
       auto loop_body = thread_privatize(body);
 
 #pragma omp for RAJA_COLLAPSE(2)
-      for (int j = 0; j < len1; j++) {
-        for (int i = 0; i < len0; i++) {
+      for (int j = 0; j < len1; j++)
+      {
+        for (int i = 0; i < len0; i++)
+        {
 
-          loop_body.get_priv()(*(segment0.begin() + i),
-                               *(segment1.begin() + j),
-                               i,
-                               j);
+          loop_body.get_priv()(*(segment0.begin() + i), *(segment1.begin() + j),
+                               i, j);
         }
       }
     });
   }
 
-  template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_HOST_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+      SEGMENT const& segment0,
+      SEGMENT const& segment1,
+      SEGMENT const& segment2,
+      BODY const& body)
   {
 
     const int len2 = segment2.end() - segment2.begin();
@@ -418,15 +447,15 @@ struct LoopICountExecute<omp_parallel_nested_for_exec, SEGMENT> {
       auto loop_body = thread_privatize(body);
 
 #pragma omp for RAJA_COLLAPSE(3)
-      for (int k = 0; k < len2; k++) {
-        for (int j = 0; j < len1; j++) {
-          for (int i = 0; i < len0; i++) {
+      for (int k = 0; k < len2; k++)
+      {
+        for (int j = 0; j < len1; j++)
+        {
+          for (int i = 0; i < len0; i++)
+          {
             loop_body.get_priv()(*(segment0.begin() + i),
                                  *(segment1.begin() + j),
-                                 *(segment2.begin() + k),
-                                 i,
-                                 j,
-                                 k);
+                                 *(segment2.begin() + k), i, j, k);
           }
         }
       }
@@ -434,16 +463,16 @@ struct LoopICountExecute<omp_parallel_nested_for_exec, SEGMENT> {
   }
 };
 
+template<typename SEGMENT>
+struct TileExecute<omp_parallel_for_exec, SEGMENT>
+{
 
-template <typename SEGMENT>
-struct TileExecute<omp_parallel_for_exec, SEGMENT> {
-
-  template <typename BODY, typename TILE_T>
+  template<typename BODY, typename TILE_T>
   static RAJA_INLINE RAJA_HOST_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
       TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+      SEGMENT const& segment,
+      BODY const& body)
   {
 
     int len = segment.end() - segment.begin();
@@ -453,25 +482,27 @@ struct TileExecute<omp_parallel_for_exec, SEGMENT> {
       auto loop_body = thread_privatize(body);
 
 #pragma omp for
-      for (int i = 0; i < len; i += tile_size) {
+      for (int i = 0; i < len; i += tile_size)
+      {
         loop_body.get_priv()(segment.slice(i, tile_size));
       }
     });
   }
 };
 
-template <typename SEGMENT>
-struct TileTCountExecute<omp_parallel_for_exec, SEGMENT> {
+template<typename SEGMENT>
+struct TileTCountExecute<omp_parallel_for_exec, SEGMENT>
+{
 
-  template <typename BODY, typename TILE_T>
+  template<typename BODY, typename TILE_T>
   static RAJA_INLINE RAJA_HOST_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
       TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+      SEGMENT const& segment,
+      BODY const& body)
   {
 
-    const int len = segment.end() - segment.begin();
+    const int len      = segment.end() - segment.begin();
     const int numTiles = (len - 1) / tile_size + 1;
 
     RAJA::region<RAJA::omp_parallel_region>([&]() {
@@ -479,7 +510,8 @@ struct TileTCountExecute<omp_parallel_for_exec, SEGMENT> {
       auto loop_body = thread_privatize(body);
 
 #pragma omp parallel for
-      for (int i = 0; i < numTiles; i++) {
+      for (int i = 0; i < numTiles; i++)
+      {
         const int i_tile_size = i * tile_size;
         loop_body.get_priv()(segment.slice(i_tile_size, tile_size), i);
       }
@@ -487,41 +519,45 @@ struct TileTCountExecute<omp_parallel_for_exec, SEGMENT> {
   }
 };
 
-template <typename SEGMENT>
-struct TileExecute<omp_for_exec, SEGMENT> {
+template<typename SEGMENT>
+struct TileExecute<omp_for_exec, SEGMENT>
+{
 
-  template <typename BODY, typename TILE_T>
+  template<typename BODY, typename TILE_T>
   static RAJA_INLINE RAJA_HOST_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
       TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+      SEGMENT const& segment,
+      BODY const& body)
   {
 
     int len = segment.end() - segment.begin();
 #pragma omp for
-    for (int i = 0; i < len; i += tile_size) {
+    for (int i = 0; i < len; i += tile_size)
+    {
       body(segment.slice(i, tile_size));
     }
   }
 };
 
-template <typename SEGMENT>
-struct TileTCountExecute<omp_for_exec, SEGMENT> {
+template<typename SEGMENT>
+struct TileTCountExecute<omp_for_exec, SEGMENT>
+{
 
-  template <typename BODY, typename TILE_T>
+  template<typename BODY, typename TILE_T>
   static RAJA_INLINE RAJA_HOST_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
       TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+      SEGMENT const& segment,
+      BODY const& body)
   {
 
-    const int len = segment.end() - segment.begin();
+    const int len      = segment.end() - segment.begin();
     const int numTiles = (len - 1) / tile_size + 1;
 
 #pragma omp for
-    for (int i = 0; i < numTiles; i++) {
+    for (int i = 0; i < numTiles; i++)
+    {
       const int i_tile_size = i * tile_size;
       body(segment.slice(i_tile_size, tile_size), i);
     }
diff --git a/include/RAJA/policy/openmp/multi_reduce.hpp b/include/RAJA/policy/openmp/multi_reduce.hpp
index 22b09a7722..2dd803b71b 100644
--- a/include/RAJA/policy/openmp/multi_reduce.hpp
+++ b/include/RAJA/policy/openmp/multi_reduce.hpp
@@ -56,7 +56,7 @@ namespace detail
  *
  **************************************************************************
  */
-template < typename T, typename t_MultiReduceOp, typename tuning >
+template<typename T, typename t_MultiReduceOp, typename tuning>
 struct MultiReduceDataOMP;
 
 /*!
@@ -68,47 +68,56 @@ struct MultiReduceDataOMP;
  *
  **************************************************************************
  */
-template < typename T, typename t_MultiReduceOp >
-struct MultiReduceDataOMP<T, t_MultiReduceOp,
-    RAJA::omp::MultiReduceTuning<RAJA::omp::multi_reduce_algorithm::combine_on_destruction>>
+template<typename T, typename t_MultiReduceOp>
+struct MultiReduceDataOMP<
+    T,
+    t_MultiReduceOp,
+    RAJA::omp::MultiReduceTuning<
+        RAJA::omp::multi_reduce_algorithm::combine_on_destruction>>
 {
-  using value_type = T;
+  using value_type    = T;
   using MultiReduceOp = t_MultiReduceOp;
 
   MultiReduceDataOMP() = delete;
 
-  template < typename Container,
-             std::enable_if_t<!std::is_same<Container, MultiReduceDataOMP>::value>* = nullptr >
+  template<typename Container,
+           std::enable_if_t<
+               !std::is_same<Container, MultiReduceDataOMP>::value>* = nullptr>
   MultiReduceDataOMP(Container const& container, T identity)
-      : m_parent(nullptr)
-      , m_num_bins(container.size())
-      , m_identity(identity)
-      , m_data(nullptr)
+      : m_parent(nullptr),
+        m_num_bins(container.size()),
+        m_identity(identity),
+        m_data(nullptr)
   {
     m_data = create_data(container, m_num_bins);
   }
 
-  MultiReduceDataOMP(MultiReduceDataOMP const &other)
-      : m_parent(other.m_parent ? other.m_parent : &other)
-      , m_num_bins(other.m_num_bins)
-      , m_identity(other.m_identity)
-      , m_data(nullptr)
+  MultiReduceDataOMP(MultiReduceDataOMP const& other)
+      : m_parent(other.m_parent ? other.m_parent : &other),
+        m_num_bins(other.m_num_bins),
+        m_identity(other.m_identity),
+        m_data(nullptr)
   {
-    m_data = create_data(RepeatView<value_type>(other.m_identity, other.m_num_bins), other.m_num_bins);
+    m_data =
+        create_data(RepeatView<value_type>(other.m_identity, other.m_num_bins),
+                    other.m_num_bins);
   }
 
-  MultiReduceDataOMP(MultiReduceDataOMP &&) = delete;
+  MultiReduceDataOMP(MultiReduceDataOMP&&)                 = delete;
   MultiReduceDataOMP& operator=(MultiReduceDataOMP const&) = delete;
-  MultiReduceDataOMP& operator=(MultiReduceDataOMP &&) = delete;
+  MultiReduceDataOMP& operator=(MultiReduceDataOMP&&)      = delete;
 
   ~MultiReduceDataOMP()
   {
-    if (m_data) {
-      if (m_parent && (m_num_bins != size_t(0))) {
+    if (m_data)
+    {
+      if (m_parent && (m_num_bins != size_t(0)))
+      {
 #pragma omp critical(ompMultiReduceCritical)
         {
-          for (size_t bin = 0; bin < m_num_bins; ++bin) {
-            MultiReduceOp{}(m_parent->m_data[bin], m_data[bin]);
+          for (size_t bin = 0; bin < m_num_bins; ++bin)
+          {
+            MultiReduceOp {}(m_parent->m_data[bin], m_data[bin]);
           }
         }
       }
@@ -116,18 +125,22 @@ struct MultiReduceDataOMP<T, t_MultiReduceOp,
     }
   }
 
-  template < typename Container >
+  template<typename Container>
   void reset(Container const& container, T identity)
   {
-    m_identity = identity;
+    m_identity          = identity;
     size_t new_num_bins = container.size();
-    if (new_num_bins != m_num_bins) {
+    if (new_num_bins != m_num_bins)
+    {
       destroy_data(m_data, m_num_bins);
       m_num_bins = new_num_bins;
-      m_data = create_data(container, m_num_bins);
-    } else {
+      m_data     = create_data(container, m_num_bins);
+    }
+    else
+    {
       size_t bin = 0;
-      for (auto const& value : container) {
+      for (auto const& value : container)
+      {
         m_data[bin] = value;
         ++bin;
       }
@@ -138,26 +151,29 @@ struct MultiReduceDataOMP<T, t_MultiReduceOp,
 
   T identity() const { return m_identity; }
 
-  void combine(size_t bin, T const &val) { MultiReduceOp{}(m_data[bin], val); }
+  void combine(size_t bin, T const& val) { MultiReduceOp {}(m_data[bin], val); }
 
   T get(size_t bin) const { return m_data[bin]; }
 
 private:
-  MultiReduceDataOMP const *m_parent;
+  MultiReduceDataOMP const* m_parent;
   size_t m_num_bins;
   T m_identity;
   T* m_data;
 
-  template < typename Container >
+  template<typename Container>
   static T* create_data(Container const& container, size_t num_bins)
   {
-    if (num_bins == size_t(0)) {
+    if (num_bins == size_t(0))
+    {
       return nullptr;
     }
-    auto data = RAJA::allocate_aligned_type<T>( RAJA::DATA_ALIGN, num_bins * sizeof(T) );
+    auto data =
+        RAJA::allocate_aligned_type<T>(RAJA::DATA_ALIGN, num_bins * sizeof(T));
     size_t bin = 0;
-    for (auto const& value : container) {
-      new(&data[bin]) T(value);
+    for (auto const& value : container)
+    {
+      new (&data[bin]) T(value);
       ++bin;
     }
     return data;
@@ -165,11 +181,13 @@ struct MultiReduceDataOMP<T, t_MultiReduceOp,
 
   static void destroy_data(T*& data, size_t num_bins)
   {
-    if (num_bins == size_t(0)) {
+    if (num_bins == size_t(0))
+    {
       return;
     }
-    for (size_t bin = num_bins; bin > 0; --bin) {
-      data[bin-1].~T();
+    for (size_t bin = num_bins; bin > 0; --bin)
+    {
+      data[bin - 1].~T();
     }
     RAJA::free_aligned(data);
     data = nullptr;
@@ -185,74 +203,93 @@ struct MultiReduceDataOMP<T, t_MultiReduceOp,
  *
  **************************************************************************
  */
-template < typename T, typename t_MultiReduceOp >
-struct MultiReduceDataOMP<T, t_MultiReduceOp,
-    RAJA::omp::MultiReduceTuning<RAJA::omp::multi_reduce_algorithm::combine_on_get>>
+template<typename T, typename t_MultiReduceOp>
+struct MultiReduceDataOMP<
+    T,
+    t_MultiReduceOp,
+    RAJA::omp::MultiReduceTuning<
+        RAJA::omp::multi_reduce_algorithm::combine_on_get>>
 {
-  using value_type = T;
+  using value_type    = T;
   using MultiReduceOp = t_MultiReduceOp;
 
   MultiReduceDataOMP() = delete;
 
-  template < typename Container,
-             std::enable_if_t<!std::is_same<Container, MultiReduceDataOMP>::value>* = nullptr >
+  template<typename Container,
+           std::enable_if_t<
+               !std::is_same<Container, MultiReduceDataOMP>::value>* = nullptr>
   MultiReduceDataOMP(Container const& container, T identity)
-      : m_parent(nullptr)
-      , m_max_threads(omp_get_max_threads())
-      , m_num_bins(container.size())
-      , m_padded_threads(pad_threads(m_max_threads))
-      , m_padded_bins(pad_bins(m_num_bins))
-      , m_identity(identity)
-      , m_data(nullptr)
+      : m_parent(nullptr),
+        m_max_threads(omp_get_max_threads()),
+        m_num_bins(container.size()),
+        m_padded_threads(pad_threads(m_max_threads)),
+        m_padded_bins(pad_bins(m_num_bins)),
+        m_identity(identity),
+        m_data(nullptr)
   {
-    m_data = create_data(container, identity, m_num_bins, m_max_threads, m_padded_bins, m_padded_threads);
+    m_data = create_data(container, identity, m_num_bins, m_max_threads,
+                         m_padded_bins, m_padded_threads);
   }
 
-  MultiReduceDataOMP(MultiReduceDataOMP const &other)
-      : m_parent(other.m_parent ? other.m_parent : &other)
-      , m_num_bins(other.m_num_bins)
-      , m_padded_threads(other.m_padded_threads)
-      , m_padded_bins(other.m_padded_bins)
-      , m_identity(other.m_identity)
-      , m_data(other.m_data)
-  { }
+  MultiReduceDataOMP(MultiReduceDataOMP const& other)
+      : m_parent(other.m_parent ? other.m_parent : &other),
+        m_num_bins(other.m_num_bins),
+        m_padded_threads(other.m_padded_threads),
+        m_padded_bins(other.m_padded_bins),
+        m_identity(other.m_identity),
+        m_data(other.m_data)
+  {}
 
-  MultiReduceDataOMP(MultiReduceDataOMP &&) = delete;
+  MultiReduceDataOMP(MultiReduceDataOMP&&)                 = delete;
   MultiReduceDataOMP& operator=(MultiReduceDataOMP const&) = delete;
-  MultiReduceDataOMP& operator=(MultiReduceDataOMP &&) = delete;
+  MultiReduceDataOMP& operator=(MultiReduceDataOMP&&)      = delete;
 
   ~MultiReduceDataOMP()
   {
-    if (m_data) {
-      if (!m_parent) {
-        destroy_data(m_data, m_num_bins, m_max_threads, m_padded_bins, m_padded_threads);
+    if (m_data)
+    {
+      if (!m_parent)
+      {
+        destroy_data(m_data, m_num_bins, m_max_threads, m_padded_bins,
+                     m_padded_threads);
       }
     }
   }
 
-  template < typename Container >
+  template<typename Container>
   void reset(Container const& container, T identity)
   {
-    m_identity = identity;
+    m_identity          = identity;
     size_t new_num_bins = container.size();
-    if (new_num_bins != m_num_bins) {
-      destroy_data(m_data, m_num_bins, m_max_threads, m_padded_bins, m_padded_threads);
-      m_num_bins = new_num_bins;
+    if (new_num_bins != m_num_bins)
+    {
+      destroy_data(m_data, m_num_bins, m_max_threads, m_padded_bins,
+                   m_padded_threads);
+      m_num_bins    = new_num_bins;
       m_padded_bins = pad_bins(m_num_bins);
-      m_data = create_data(container, identity, m_num_bins, m_max_threads, m_padded_bins, m_padded_threads);
-    } else {
-      if (m_max_threads > 0) {
+      m_data = create_data(container, identity, m_num_bins, m_max_threads,
+                           m_padded_bins, m_padded_threads);
+    }
+    else
+    {
+      if (m_max_threads > 0)
+      {
         {
           size_t thread_idx = 0;
-          size_t bin = 0;
-          for (auto const& value : container) {
-            m_data[index_data(bin, thread_idx, m_padded_bins, m_padded_threads)] = value;
+          size_t bin        = 0;
+          for (auto const& value : container)
+          {
+            m_data[index_data(bin, thread_idx, m_padded_bins,
+                              m_padded_threads)] = value;
             ++bin;
           }
         }
-        for (size_t thread_idx = 1; thread_idx < m_max_threads; ++thread_idx) {
-          for (size_t bin = 0; bin < m_num_bins; ++bin) {
-            m_data[index_data(bin, thread_idx, m_padded_bins, m_padded_threads)] = identity;
+        for (size_t thread_idx = 1; thread_idx < m_max_threads; ++thread_idx)
+        {
+          for (size_t bin = 0; bin < m_num_bins; ++bin)
+          {
+            m_data[index_data(bin, thread_idx, m_padded_bins,
+                              m_padded_threads)] = identity;
           }
         }
       }
@@ -263,24 +300,28 @@ struct MultiReduceDataOMP<T, t_MultiReduceOp,
 
   T identity() const { return m_identity; }
 
-  void combine(size_t bin, T const &val)
+  void combine(size_t bin, T const& val)
   {
     size_t thread_idx = omp_get_thread_num();
-    MultiReduceOp{}(m_data[index_data(bin, thread_idx, m_padded_bins, m_padded_threads)], val);
+    MultiReduceOp {}(
+        m_data[index_data(bin, thread_idx, m_padded_bins, m_padded_threads)],
+        val);
   }
 
   T get(size_t bin) const
   {
     ::RAJA::detail::HighAccuracyReduce<T, typename MultiReduceOp::operator_type>
         reducer(m_identity);
-    for (size_t thread_idx = 0; thread_idx < m_max_threads; ++thread_idx) {
-      reducer.combine(m_data[index_data(bin, thread_idx, m_padded_bins, m_padded_threads)]);
+    for (size_t thread_idx = 0; thread_idx < m_max_threads; ++thread_idx)
+    {
+      reducer.combine(
+          m_data[index_data(bin, thread_idx, m_padded_bins, m_padded_threads)]);
     }
     return reducer.get_and_clear();
   }
 
 private:
-  MultiReduceDataOMP const *m_parent;
+  MultiReduceDataOMP const* m_parent;
   size_t m_max_threads;
   size_t m_num_bins;
   size_t m_padded_threads;
@@ -290,8 +331,10 @@ struct MultiReduceDataOMP<T, t_MultiReduceOp,
 
   static constexpr size_t pad_bins(size_t num_bins)
   {
-    size_t num_cache_lines = RAJA_DIVIDE_CEILING_INT(num_bins*sizeof(T), RAJA::DATA_ALIGN);
-    return RAJA_DIVIDE_CEILING_INT(num_cache_lines * RAJA::DATA_ALIGN, sizeof(T));
+    size_t num_cache_lines =
+        RAJA_DIVIDE_CEILING_INT(num_bins * sizeof(T), RAJA::DATA_ALIGN);
+    return RAJA_DIVIDE_CEILING_INT(num_cache_lines * RAJA::DATA_ALIGN,
+                                   sizeof(T));
   }
 
   static constexpr size_t pad_threads(size_t max_threads)
@@ -299,33 +342,46 @@ struct MultiReduceDataOMP<T, t_MultiReduceOp,
     return max_threads;
   }
 
-  static constexpr size_t index_data(size_t bin, size_t thread_idx,
-                                     size_t padded_bins, size_t RAJA_UNUSED_ARG(padded_threads))
+  static constexpr size_t index_data(size_t bin,
+                                     size_t thread_idx,
+                                     size_t padded_bins,
+                                     size_t RAJA_UNUSED_ARG(padded_threads))
   {
     return bin + thread_idx * padded_bins;
   }
 
-  template < typename Container >
-  static T* create_data(Container const& container, T identity,
-                        size_t num_bins, size_t max_threads,
-                        size_t padded_bins, size_t padded_threads)
+  template<typename Container>
+  static T* create_data(Container const& container,
+                        T identity,
+                        size_t num_bins,
+                        size_t max_threads,
+                        size_t padded_bins,
+                        size_t padded_threads)
   {
-    if (num_bins == size_t(0)) {
+    if (num_bins == size_t(0))
+    {
       return nullptr;
     }
-    auto data = RAJA::allocate_aligned_type<T>( RAJA::DATA_ALIGN, padded_threads*padded_bins*sizeof(T) );
-    if (max_threads > 0) {
+    auto data = RAJA::allocate_aligned_type<T>(
+        RAJA::DATA_ALIGN, padded_threads * padded_bins * sizeof(T));
+    if (max_threads > 0)
+    {
       {
         size_t thread_idx = 0;
-        size_t bin = 0;
-        for (auto const& value : container) {
-          new(&data[index_data(bin, thread_idx, padded_bins, padded_threads)]) T(value);
+        size_t bin        = 0;
+        for (auto const& value : container)
+        {
+          new (&data[index_data(bin, thread_idx, padded_bins, padded_threads)])
+              T(value);
           ++bin;
         }
       }
-      for (size_t thread_idx = 1; thread_idx < max_threads; ++thread_idx) {
-        for (size_t bin = 0; bin < num_bins; ++bin) {
-          new(&data[index_data(bin, thread_idx, padded_bins, padded_threads)]) T(identity);
+      for (size_t thread_idx = 1; thread_idx < max_threads; ++thread_idx)
+      {
+        for (size_t bin = 0; bin < num_bins; ++bin)
+        {
+          new (&data[index_data(bin, thread_idx, padded_bins, padded_threads)])
+              T(identity);
         }
       }
     }
@@ -333,15 +389,21 @@ struct MultiReduceDataOMP<T, t_MultiReduceOp,
   }
 
   static void destroy_data(T*& data,
-                           size_t num_bins, size_t max_threads,
-                           size_t padded_bins, size_t padded_threads)
+                           size_t num_bins,
+                           size_t max_threads,
+                           size_t padded_bins,
+                           size_t padded_threads)
   {
-    if (num_bins == size_t(0)) {
+    if (num_bins == size_t(0))
+    {
       return;
     }
-    for (size_t thread_idx = max_threads; thread_idx > 0; --thread_idx) {
-      for (size_t bin = num_bins; bin > 0; --bin) {
-        data[index_data(bin-1, thread_idx-1, padded_bins, padded_threads)].~T();
+    for (size_t thread_idx = max_threads; thread_idx > 0; --thread_idx)
+    {
+      for (size_t bin = num_bins; bin > 0; --bin)
+      {
+        data[index_data(bin - 1, thread_idx - 1, padded_bins, padded_threads)]
+            .~T();
       }
     }
     RAJA::free_aligned(data);
@@ -351,7 +413,8 @@ struct MultiReduceDataOMP<T, t_MultiReduceOp,
 
 }  // namespace detail
 
-RAJA_DECLARE_ALL_MULTI_REDUCERS(policy::omp::omp_multi_reduce_policy, detail::MultiReduceDataOMP)
+RAJA_DECLARE_ALL_MULTI_REDUCERS(policy::omp::omp_multi_reduce_policy,
+                                detail::MultiReduceDataOMP)
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/policy/openmp/params/forall.hpp b/include/RAJA/policy/openmp/params/forall.hpp
index d9bea5d0d8..25b28b085d 100644
--- a/include/RAJA/policy/openmp/params/forall.hpp
+++ b/include/RAJA/policy/openmp/params/forall.hpp
@@ -18,303 +18,356 @@ namespace omp
 namespace expt
 {
 
-  namespace internal
-  {
-    //
-    // omp for (Auto)
-    //
-    template <typename ExecPol, typename Iterable, typename Func, typename ForallParam>
-    RAJA_INLINE 
-    concepts::enable_if< std::is_same<ExecPol, RAJA::policy::omp::Auto> >
-    forall_impl(const ExecPol& p,
-                Iterable&& iter,
-                Func&& loop_body,
-                ForallParam&& f_params)
-    {
-      using EXEC_POL = typename std::decay<decltype(p)>::type;
-      RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
-      RAJA_OMP_DECLARE_REDUCTION_COMBINE;
-
-      RAJA_EXTRACT_BED_IT(iter);
-      #pragma omp parallel for reduction(combine : f_params)
-      for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-        RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
-      }
+namespace internal
+{
+//
+// omp for (Auto)
+//
+template<typename ExecPol,
+         typename Iterable,
+         typename Func,
+         typename ForallParam>
+RAJA_INLINE concepts::enable_if<std::is_same<ExecPol, RAJA::policy::omp::Auto>>
+forall_impl(const ExecPol& p,
+            Iterable&& iter,
+            Func&& loop_body,
+            ForallParam&& f_params)
+{
+  using EXEC_POL = typename std::decay<decltype(p)>::type;
+  RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
+  RAJA_OMP_DECLARE_REDUCTION_COMBINE;
 
-      RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
-    }
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp parallel for reduction(combine : f_params)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
+  {
+    RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
+  }
 
-    //
-    // omp for schedule(static)
-    //
-    template <template<int> class ExecPol, typename Iterable, typename Func, int ChunkSize, typename ForallParam>
-    RAJA_INLINE 
-    concepts::enable_if< std::is_same<ExecPol<ChunkSize>, RAJA::policy::omp::Static<ChunkSize>>,
-                         std::integral_constant<bool,(ChunkSize <= 0)> >
-    forall_impl(const ExecPol<ChunkSize>& p,
-                                 Iterable&& iter,
-                                 Func&& loop_body,
-                                 ForallParam&& f_params)
-    {
-      using EXEC_POL = typename std::decay<decltype(p)>::type;
-      RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
-      RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+  RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+}
 
-      RAJA_EXTRACT_BED_IT(iter);
-      #pragma omp parallel for schedule(static) reduction(combine : f_params)
-      for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-        RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
-      }
+//
+// omp for schedule(static)
+//
+template<template<int> class ExecPol,
+         typename Iterable,
+         typename Func,
+         int ChunkSize,
+         typename ForallParam>
+RAJA_INLINE concepts::enable_if<
+    std::is_same<ExecPol<ChunkSize>, RAJA::policy::omp::Static<ChunkSize>>,
+    std::integral_constant<bool, (ChunkSize <= 0)>>
+forall_impl(const ExecPol<ChunkSize>& p,
+            Iterable&& iter,
+            Func&& loop_body,
+            ForallParam&& f_params)
+{
+  using EXEC_POL = typename std::decay<decltype(p)>::type;
+  RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
+  RAJA_OMP_DECLARE_REDUCTION_COMBINE;
 
-      RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
-    }
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp parallel for schedule(static) reduction(combine : f_params)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
+  {
+    RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
+  }
 
-    //
-    // omp for schedule(static, ChunkSize)
-    //
-    template <template<int> class ExecPol, typename Iterable, typename Func, int ChunkSize, typename ForallParam>
-    RAJA_INLINE 
-    concepts::enable_if< std::is_same<ExecPol<ChunkSize>, RAJA::policy::omp::Static<ChunkSize>>,
-                         std::integral_constant<bool,(ChunkSize > 0)> >
-    forall_impl(const ExecPol<ChunkSize>& p,
-                                 Iterable&& iter,
-                                 Func&& loop_body,
-                                 ForallParam&& f_params)
-    {
-      using EXEC_POL = typename std::decay<decltype(p)>::type;
-      RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
-      RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+  RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+}
 
-      RAJA_EXTRACT_BED_IT(iter);
-      #pragma omp parallel for schedule(static, ChunkSize) reduction(combine : f_params)
-      for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-        RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
-      }
+//
+// omp for schedule(static, ChunkSize)
+//
+template<template<int> class ExecPol,
+         typename Iterable,
+         typename Func,
+         int ChunkSize,
+         typename ForallParam>
+RAJA_INLINE concepts::enable_if<
+    std::is_same<ExecPol<ChunkSize>, RAJA::policy::omp::Static<ChunkSize>>,
+    std::integral_constant<bool, (ChunkSize > 0)>>
+forall_impl(const ExecPol<ChunkSize>& p,
+            Iterable&& iter,
+            Func&& loop_body,
+            ForallParam&& f_params)
+{
+  using EXEC_POL = typename std::decay<decltype(p)>::type;
+  RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
+  RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp parallel for schedule(static, ChunkSize)                           \
+    reduction(combine : f_params)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
+  {
+    RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
+  }
 
-      RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
-    }
+  RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+}
 
-    //
-    // omp for schedule(runtime)
-    //
-    template <typename Iterable, typename Func, typename ForallParam>
-    RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Runtime& p,
-                                 Iterable&& iter,
-                                 Func&& loop_body,
-                                 ForallParam&& f_params)
-    {
-      using EXEC_POL = typename std::decay<decltype(p)>::type;
-      RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
-      RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+//
+// omp for schedule(runtime)
+//
+template<typename Iterable, typename Func, typename ForallParam>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Runtime& p,
+                             Iterable&& iter,
+                             Func&& loop_body,
+                             ForallParam&& f_params)
+{
+  using EXEC_POL = typename std::decay<decltype(p)>::type;
+  RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
+  RAJA_OMP_DECLARE_REDUCTION_COMBINE;
 
-      RAJA_EXTRACT_BED_IT(iter);
-      #pragma omp parallel for schedule(runtime) reduction(combine : f_params)
-      for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-        RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
-      }
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp parallel for schedule(runtime) reduction(combine : f_params)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
+  {
+    RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
+  }
 
-      RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
-    }
+  RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+}
 
-    //
-    // omp for nowait (Auto)
-    //
-    template <typename Iterable, typename Func, typename ForallParam>
-    RAJA_INLINE void forall_impl_nowait(const ::RAJA::policy::omp::Auto& p,
-                                 Iterable&& iter,
-                                 Func&& loop_body,
-                                 ForallParam&& f_params)
-    {
-      using EXEC_POL = typename std::decay<decltype(p)>::type;
-      RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
-      RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+//
+// omp for nowait (Auto)
+//
+template<typename Iterable, typename Func, typename ForallParam>
+RAJA_INLINE void forall_impl_nowait(const ::RAJA::policy::omp::Auto& p,
+                                    Iterable&& iter,
+                                    Func&& loop_body,
+                                    ForallParam&& f_params)
+{
+  using EXEC_POL = typename std::decay<decltype(p)>::type;
+  RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
+  RAJA_OMP_DECLARE_REDUCTION_COMBINE;
 
-      RAJA_EXTRACT_BED_IT(iter);
+  RAJA_EXTRACT_BED_IT(iter);
 #pragma omp parallel
-      {
-      #pragma omp for nowait reduction(combine : f_params)
-      for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-        RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
-      }
-      }
-
-      RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
-    }
-
-    //
-    // omp for schedule(dynamic)
-    //
-    template <typename Iterable, typename Func, int ChunkSize, typename ForallParam,
-      typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
-    RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Dynamic<ChunkSize>& p,
-                                 Iterable&& iter,
-                                 Func&& loop_body,
-                                 ForallParam&& f_params)
+  {
+#pragma omp for nowait reduction(combine : f_params)
+    for (decltype(distance_it) i = 0; i < distance_it; ++i)
     {
-      using EXEC_POL = typename std::decay<decltype(p)>::type;
-      RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
-      RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+      RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
+    }
+  }
 
-      RAJA_EXTRACT_BED_IT(iter);
-      #pragma omp parallel for schedule(dynamic) reduction(combine : f_params)
-      for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-        RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
-      }
+  RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+}
 
-      RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
-    }
+//
+// omp for schedule(dynamic)
+//
+template<typename Iterable,
+         typename Func,
+         int ChunkSize,
+         typename ForallParam,
+         typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Dynamic<ChunkSize>& p,
+                             Iterable&& iter,
+                             Func&& loop_body,
+                             ForallParam&& f_params)
+{
+  using EXEC_POL = typename std::decay<decltype(p)>::type;
+  RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
+  RAJA_OMP_DECLARE_REDUCTION_COMBINE;
 
-    //
-    // omp for schedule(dynamic, ChunkSize)
-    //
-    template <typename Iterable, typename Func, int ChunkSize, typename ForallParam,
-      typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
-    RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Dynamic<ChunkSize>& p,
-                                 Iterable&& iter,
-                                 Func&& loop_body,
-                                 ForallParam&& f_params)
-    {
-      using EXEC_POL = typename std::decay<decltype(p)>::type;
-      RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
-      RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp parallel for schedule(dynamic) reduction(combine : f_params)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
+  {
+    RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
+  }
 
-      RAJA_EXTRACT_BED_IT(iter);
-      #pragma omp parallel for schedule(dynamic, ChunkSize) reduction(combine : f_params)
-      for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-        RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
-      }
+  RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+}
 
-      RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
-    }
+//
+// omp for schedule(dynamic, ChunkSize)
+//
+template<typename Iterable,
+         typename Func,
+         int ChunkSize,
+         typename ForallParam,
+         typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Dynamic<ChunkSize>& p,
+                             Iterable&& iter,
+                             Func&& loop_body,
+                             ForallParam&& f_params)
+{
+  using EXEC_POL = typename std::decay<decltype(p)>::type;
+  RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
+  RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp parallel for schedule(dynamic, ChunkSize)                          \
+    reduction(combine : f_params)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
+  {
+    RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
+  }
 
-    //
-    // omp for schedule(guided)
-    //
-    template <typename Iterable, typename Func, int ChunkSize, typename ForallParam,
-      typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
-    RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Guided<ChunkSize>& p,
-                                 Iterable&& iter,
-                                 Func&& loop_body,
-                                 ForallParam&& f_params)
-    {
-      using EXEC_POL = typename std::decay<decltype(p)>::type;
-      RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
-      RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+  RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+}
 
-      RAJA_EXTRACT_BED_IT(iter);
-      #pragma omp parallel for schedule(guided) reduction(combine : f_params)
-      for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-        RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
-      }
+//
+// omp for schedule(guided)
+//
+template<typename Iterable,
+         typename Func,
+         int ChunkSize,
+         typename ForallParam,
+         typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Guided<ChunkSize>& p,
+                             Iterable&& iter,
+                             Func&& loop_body,
+                             ForallParam&& f_params)
+{
+  using EXEC_POL = typename std::decay<decltype(p)>::type;
+  RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
+  RAJA_OMP_DECLARE_REDUCTION_COMBINE;
 
-      RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
-    }
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp parallel for schedule(guided) reduction(combine : f_params)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
+  {
+    RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
+  }
 
-    //
-    // omp for schedule(guided, ChunkSize)
-    //
-    template <typename Iterable, typename Func, int ChunkSize, typename ForallParam,
-      typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
-    RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Guided<ChunkSize>& p,
-                                 Iterable&& iter,
-                                 Func&& loop_body,
-                                 ForallParam&& f_params)
-    {
-      using EXEC_POL = typename std::decay<decltype(p)>::type;
-      RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
-      RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+  RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+}
 
-      RAJA_EXTRACT_BED_IT(iter);
-      #pragma omp parallel for schedule(guided, ChunkSize) reduction(combine : f_params)
-      for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-        RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
-      }
+//
+// omp for schedule(guided, ChunkSize)
+//
+template<typename Iterable,
+         typename Func,
+         int ChunkSize,
+         typename ForallParam,
+         typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl(const ::RAJA::policy::omp::Guided<ChunkSize>& p,
+                             Iterable&& iter,
+                             Func&& loop_body,
+                             ForallParam&& f_params)
+{
+  using EXEC_POL = typename std::decay<decltype(p)>::type;
+  RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
+  RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+
+  RAJA_EXTRACT_BED_IT(iter);
+#pragma omp parallel for schedule(guided, ChunkSize)                           \
+    reduction(combine : f_params)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
+  {
+    RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
+  }
 
-      RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
-    }
+  RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+}
 
-    //
-    // omp for schedule(static) nowait
-    //
-    template <typename Iterable, typename Func, int ChunkSize, typename ForallParam,
-      typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
-    RAJA_INLINE void forall_impl_nowait(const ::RAJA::policy::omp::Static<ChunkSize>& p,
-                                 Iterable&& iter,
-                                 Func&& loop_body,
-                                 ForallParam&& f_params)
-    {
-      using EXEC_POL = typename std::decay<decltype(p)>::type;
-      RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
-      RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+//
+// omp for schedule(static) nowait
+//
+template<typename Iterable,
+         typename Func,
+         int ChunkSize,
+         typename ForallParam,
+         typename std::enable_if<(ChunkSize <= 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl_nowait(
+    const ::RAJA::policy::omp::Static<ChunkSize>& p,
+    Iterable&& iter,
+    Func&& loop_body,
+    ForallParam&& f_params)
+{
+  using EXEC_POL = typename std::decay<decltype(p)>::type;
+  RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
+  RAJA_OMP_DECLARE_REDUCTION_COMBINE;
 
-      RAJA_EXTRACT_BED_IT(iter);
+  RAJA_EXTRACT_BED_IT(iter);
 #pragma omp parallel
-      {
-      #pragma omp for schedule(static) nowait reduction(combine : f_params)
-      for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-        RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
-      }
-      }
-
-      RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+  {
+#pragma omp for schedule(static) nowait reduction(combine : f_params)
+    for (decltype(distance_it) i = 0; i < distance_it; ++i)
+    {
+      RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
     }
+  }
 
-    //
-    // omp for schedule(static, ChunkSize) nowait
-    //
-    template <typename Iterable, typename Func, int ChunkSize, typename ForallParam,
-      typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
-    RAJA_INLINE void forall_impl_nowait(const ::RAJA::policy::omp::Static<ChunkSize>& p,
-                                 Iterable&& iter,
-                                 Func&& loop_body,
-                                 ForallParam&& f_params)
-    {
-      using EXEC_POL = typename std::decay<decltype(p)>::type;
-      RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
-      RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+  RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+}
 
-      RAJA_EXTRACT_BED_IT(iter);
+//
+// omp for schedule(static, ChunkSize) nowait
+//
+template<typename Iterable,
+         typename Func,
+         int ChunkSize,
+         typename ForallParam,
+         typename std::enable_if<(ChunkSize > 0)>::type* = nullptr>
+RAJA_INLINE void forall_impl_nowait(
+    const ::RAJA::policy::omp::Static<ChunkSize>& p,
+    Iterable&& iter,
+    Func&& loop_body,
+    ForallParam&& f_params)
+{
+  using EXEC_POL = typename std::decay<decltype(p)>::type;
+  RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
+  RAJA_OMP_DECLARE_REDUCTION_COMBINE;
+
+  RAJA_EXTRACT_BED_IT(iter);
 #pragma omp parallel
-      {
-      #pragma omp for schedule(static, ChunkSize) nowait reduction(combine : f_params)
-      for (decltype(distance_it) i = 0; i < distance_it; ++i) {
-        RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
-      }
-      }
-
-      RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+  {
+#pragma omp for schedule(static, ChunkSize) nowait reduction(combine : f_params)
+    for (decltype(distance_it) i = 0; i < distance_it; ++i)
+    {
+      RAJA::expt::invoke_body(f_params, loop_body, begin_it[i]);
     }
+  }
 
-  } //  namespace internal
+  RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
+}
 
-  template <typename Schedule, typename Iterable, typename Func, typename ForallParam>
-  RAJA_INLINE resources::EventProxy<resources::Host> forall_impl(resources::Host host_res,
-                                                                 const omp_for_schedule_exec<Schedule>&,
-                                                                 Iterable&& iter,
-                                                                 Func&& loop_body,
-                                                                 ForallParam f_params)
-  {
-    expt::internal::forall_impl(Schedule{}, std::forward<Iterable>(iter), std::forward<Func>(loop_body), std::forward<ForallParam>(f_params));
-    return resources::EventProxy<resources::Host>(host_res);
-  }
-} //  namespace expt
+}  //  namespace internal
+
+template<typename Schedule,
+         typename Iterable,
+         typename Func,
+         typename ForallParam>
+RAJA_INLINE resources::EventProxy<resources::Host> forall_impl(
+    resources::Host host_res,
+    const omp_for_schedule_exec<Schedule>&,
+    Iterable&& iter,
+    Func&& loop_body,
+    ForallParam f_params)
+{
+  expt::internal::forall_impl(Schedule {}, std::forward<Iterable>(iter),
+                              std::forward<Func>(loop_body),
+                              std::forward<ForallParam>(f_params));
+  return resources::EventProxy<resources::Host>(host_res);
+}
+}  //  namespace expt
 
 ///
 /// OpenMP parallel policy implementation
 ///
-template <typename Iterable, typename Func, typename InnerPolicy, typename ForallParam>
-RAJA_INLINE
-concepts::enable_if_t<
-  resources::EventProxy<resources::Host>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>>
+template<typename Iterable,
+         typename Func,
+         typename InnerPolicy,
+         typename ForallParam>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Host>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    concepts::negate<
+        RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>>
 forall_impl(resources::Host host_res,
             const omp_parallel_exec<InnerPolicy>&,
             Iterable&& iter,
             Func&& loop_body,
             ForallParam f_params)
 {
-  expt::forall_impl(host_res, InnerPolicy{}, iter, loop_body, f_params);
+  expt::forall_impl(host_res, InnerPolicy {}, iter, loop_body, f_params);
   return resources::EventProxy<resources::Host>(host_res);
 }
 
diff --git a/include/RAJA/policy/openmp/params/kernel_name.hpp b/include/RAJA/policy/openmp/params/kernel_name.hpp
index 65a5f7a329..bf465ff0f1 100644
--- a/include/RAJA/policy/openmp/params/kernel_name.hpp
+++ b/include/RAJA/policy/openmp/params/kernel_name.hpp
@@ -3,38 +3,43 @@
 
 #include "RAJA/pattern/params/kernel_name.hpp"
 
-namespace RAJA {
-namespace expt {
-namespace detail {
+namespace RAJA
+{
+namespace expt
+{
+namespace detail
+{
 
 #if defined(RAJA_ENABLE_OPENMP)
 
-  // Init
-  template<typename EXEC_POL>
-  camp::concepts::enable_if< type_traits::is_openmp_policy<EXEC_POL> >
-  init(KernelName&)
-  {
-    //TODO: Define kernel naming
-  }
-
-  // Combine
-  template<typename EXEC_POL, typename T>
-  camp::concepts::enable_if< type_traits::is_openmp_policy<EXEC_POL> >
-  combine(KernelName&, T& /*place holder argument*/) {}
-
-  // Resolve
-  template<typename EXEC_POL>
-  camp::concepts::enable_if< type_traits::is_openmp_policy<EXEC_POL> >
-  resolve(KernelName&)
-  {
-    //TODO: Define kernel naming
-  }
+// Init
+template<typename EXEC_POL>
+camp::concepts::enable_if<type_traits::is_openmp_policy<EXEC_POL>> init(
+    KernelName&)
+{
+  // TODO: Define kernel naming
+}
+
+// Combine
+template<typename EXEC_POL, typename T>
+camp::concepts::enable_if<type_traits::is_openmp_policy<EXEC_POL>> combine(
+    KernelName&,
+    T& /*place holder argument*/)
+{}
+
+// Resolve
+template<typename EXEC_POL>
+camp::concepts::enable_if<type_traits::is_openmp_policy<EXEC_POL>> resolve(
+    KernelName&)
+{
+  // TODO: Define kernel naming
+}
 
 #endif
 
-} //  namespace detail
-} //  namespace expt
-} //  namespace RAJA
+}  //  namespace detail
+}  //  namespace expt
+}  //  namespace RAJA
 
 
-#endif //  NEW_REDUCE_SEQ_REDUCE_HPP
+#endif  //  NEW_REDUCE_SEQ_REDUCE_HPP
diff --git a/include/RAJA/policy/openmp/params/reduce.hpp b/include/RAJA/policy/openmp/params/reduce.hpp
index f71efc255a..68ca653f28 100644
--- a/include/RAJA/policy/openmp/params/reduce.hpp
+++ b/include/RAJA/policy/openmp/params/reduce.hpp
@@ -3,37 +3,44 @@
 
 #include "RAJA/pattern/params/reducer.hpp"
 
-namespace RAJA {
-namespace expt {
-namespace detail {
+namespace RAJA
+{
+namespace expt
+{
+namespace detail
+{
 
 #if defined(RAJA_ENABLE_OPENMP)
 
-  // Init
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_openmp_policy<EXEC_POL> >
-  init(Reducer<OP, T, VOp>& red) {
-    red.m_valop.val = OP::identity();
-  }
-
-  // Combine
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_openmp_policy<EXEC_POL> >
-  combine(Reducer<OP, T, VOp>& out, const Reducer<OP, T, VOp>& in) {
-    out.m_valop.val = OP{}(out.m_valop.val, in.m_valop.val);
-  }
-
-  // Resolve
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_openmp_policy<EXEC_POL> >
-  resolve(Reducer<OP, T, VOp>& red) {
-    red.combineTarget(red.m_valop.val);
-  }
+// Init
+template<typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_openmp_policy<EXEC_POL>> init(
+    Reducer<OP, T, VOp>& red)
+{
+  red.m_valop.val = OP::identity();
+}
+
+// Combine
+template<typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_openmp_policy<EXEC_POL>> combine(
+    Reducer<OP, T, VOp>& out,
+    const Reducer<OP, T, VOp>& in)
+{
+  out.m_valop.val = OP {}(out.m_valop.val, in.m_valop.val);
+}
+
+// Resolve
+template<typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_openmp_policy<EXEC_POL>> resolve(
+    Reducer<OP, T, VOp>& red)
+{
+  red.combineTarget(red.m_valop.val);
+}
 
 #endif
 
-} //  namespace detail
-} //  namespace expt
-} //  namespace RAJA
+}  //  namespace detail
+}  //  namespace expt
+}  //  namespace RAJA
 
-#endif //  NEW_REDUCE_OMP_REDUCE_HPP
+#endif  //  NEW_REDUCE_OMP_REDUCE_HPP
diff --git a/include/RAJA/policy/openmp/policy.hpp b/include/RAJA/policy/openmp/policy.hpp
index aff2567474..8eb7ca5135 100644
--- a/include/RAJA/policy/openmp/policy.hpp
+++ b/include/RAJA/policy/openmp/policy.hpp
@@ -26,15 +26,16 @@
 #include "RAJA/policy/atomic_builtin.hpp"
 
 #if defined(RAJA_COMPILER_MSVC)
-typedef enum omp_sched_t { 
-    // schedule kinds 
-    omp_sched_static = 0x1, 
-    omp_sched_dynamic = 0x2, 
-    omp_sched_guided = 0x3, 
-    omp_sched_auto = 0x4, 
-    
-    // schedule modifier 
-    omp_sched_monotonic = 0x80000000u 
+typedef enum omp_sched_t
+{
+  // schedule kinds
+  omp_sched_static  = 0x1,
+  omp_sched_dynamic = 0x2,
+  omp_sched_guided  = 0x3,
+  omp_sched_auto    = 0x4,
+
+  // schedule modifier
+  omp_sched_monotonic = 0x80000000u
 } omp_sched_t;
 #else
 #include <omp.h>
@@ -51,7 +52,7 @@ enum struct multi_reduce_algorithm : int
   combine_on_get
 };
 
-template < multi_reduce_algorithm t_algorithm >
+template<multi_reduce_algorithm t_algorithm>
 struct MultiReduceTuning
 {
   static constexpr multi_reduce_algorithm algorithm = t_algorithm;
@@ -59,7 +60,7 @@ struct MultiReduceTuning
       (algorithm == multi_reduce_algorithm::combine_on_get);
 };
 
-} // namspace omp
+}  // namespace omp
 
 namespace policy
 {
@@ -68,14 +69,16 @@ namespace omp
 
 namespace internal
 {
-    struct ScheduleTag {};
-
-    template <omp_sched_t Sched, int Chunk>
-    struct Schedule : public ScheduleTag {
-        constexpr static omp_sched_t schedule = Sched;
-        constexpr static int chunk_size = Chunk;
-        constexpr static Policy policy = Policy::openmp;
-    };
+struct ScheduleTag
+{};
+
+template<omp_sched_t Sched, int Chunk>
+struct Schedule : public ScheduleTag
+{
+  constexpr static omp_sched_t schedule = Sched;
+  constexpr static int chunk_size       = Chunk;
+  constexpr static Policy policy        = Policy::openmp;
+};
 }  // namespace internal
 
 //
@@ -86,32 +89,33 @@ namespace internal
 //////////////////////////////////////////////////////////////////////
 //
 
-struct Parallel {
-};
+struct Parallel
+{};
 
-struct For {
-};
+struct For
+{};
 
-struct NoWait {
-};
+struct NoWait
+{};
 
 static constexpr int default_chunk_size = -1;
 
-struct Auto : public internal::Schedule<omp_sched_auto, default_chunk_size>{
-};
+struct Auto : public internal::Schedule<omp_sched_auto, default_chunk_size>
+{};
 
-template <int ChunkSize = default_chunk_size>
-struct Static : public internal::Schedule<omp_sched_static, ChunkSize> {
-};
+template<int ChunkSize = default_chunk_size>
+struct Static : public internal::Schedule<omp_sched_static, ChunkSize>
+{};
 
-template <int ChunkSize = default_chunk_size>
+template<int ChunkSize = default_chunk_size>
 using Dynamic = internal::Schedule<omp_sched_dynamic, ChunkSize>;
 
-template <int ChunkSize = default_chunk_size>
+template<int ChunkSize = default_chunk_size>
 using Guided = internal::Schedule<omp_sched_guided, ChunkSize>;
 
-struct Runtime : private internal::Schedule<static_cast<omp_sched_t>(-1), default_chunk_size> {
-};
+struct Runtime : private internal::Schedule<static_cast<omp_sched_t>(-1),
+                                            default_chunk_size>
+{};
 
 //
 //////////////////////////////////////////////////////////////////////
@@ -122,54 +126,57 @@ struct Runtime : private internal::Schedule<static_cast<omp_sched_t>(-1), defaul
 //
 
 ///
-///  Struct supporting OpenMP parallel region. 
+///  Struct supporting OpenMP parallel region.
 ///
 struct omp_parallel_region
     : make_policy_pattern_launch_platform_t<Policy::openmp,
                                             Pattern::region,
                                             Launch::undefined,
-                                            Platform::host> {
-};
+                                            Platform::host>
+{};
 
 ///
 ///  Struct supporting OpenMP parallel region for Teams
 ///
-struct omp_launch_t
-    : make_policy_pattern_launch_platform_t<Policy::openmp,
-                                            Pattern::region,
-                                            Launch::undefined,
-                                            Platform::host> {
-};
-
+struct omp_launch_t : make_policy_pattern_launch_platform_t<Policy::openmp,
+                                                            Pattern::region,
+                                                            Launch::undefined,
+                                                            Platform::host>
+{};
 
 ///
 ///  Struct supporting OpenMP 'for nowait schedule( )'
 ///
-template <typename Sched>
-struct omp_for_nowait_schedule_exec : make_policy_pattern_launch_platform_t<Policy::openmp,
-                                                              Pattern::forall,
-                                                              Launch::undefined,
-                                                              Platform::host,
-                                                              omp::For,
-                                                              omp::NoWait,
-                                                              Sched> {
-    static_assert(std::is_base_of<::RAJA::policy::omp::internal::ScheduleTag, Sched>::value,
-        "Schedule type must be one of: Auto|Runtime|Static|Dynamic|Guided");
+template<typename Sched>
+struct omp_for_nowait_schedule_exec
+    : make_policy_pattern_launch_platform_t<Policy::openmp,
+                                            Pattern::forall,
+                                            Launch::undefined,
+                                            Platform::host,
+                                            omp::For,
+                                            omp::NoWait,
+                                            Sched>
+{
+  static_assert(
+      std::is_base_of<::RAJA::policy::omp::internal::ScheduleTag, Sched>::value,
+      "Schedule type must be one of: Auto|Runtime|Static|Dynamic|Guided");
 };
 
-
 ///
 ///  Struct supporting OpenMP 'for schedule( )'
 ///
-template <typename Sched>
-struct omp_for_schedule_exec : make_policy_pattern_launch_platform_t<Policy::openmp,
-                                                              Pattern::forall,
-                                                              Launch::undefined,
-                                                              Platform::host,
-                                                              omp::For,
-                                                              Sched> {
-    static_assert(std::is_base_of<::RAJA::policy::omp::internal::ScheduleTag, Sched>::value,
-        "Schedule type must be one of: Auto|Runtime|Static|Dynamic|Guided");
+template<typename Sched>
+struct omp_for_schedule_exec
+    : make_policy_pattern_launch_platform_t<Policy::openmp,
+                                            Pattern::forall,
+                                            Launch::undefined,
+                                            Platform::host,
+                                            omp::For,
+                                            Sched>
+{
+  static_assert(
+      std::is_base_of<::RAJA::policy::omp::internal::ScheduleTag, Sched>::value,
+      "Schedule type must be one of: Auto|Runtime|Static|Dynamic|Guided");
 };
 
 ///
@@ -179,15 +186,15 @@ struct omp_for_schedule_exec : make_policy_pattern_launch_platform_t<Policy::ope
 using omp_for_exec = omp_for_schedule_exec<Auto>;
 
 ///
-template <int ChunkSize = default_chunk_size>
+template<int ChunkSize = default_chunk_size>
 using omp_for_static_exec = omp_for_schedule_exec<omp::Static<ChunkSize>>;
 
 ///
-template <int ChunkSize = default_chunk_size>
+template<int ChunkSize = default_chunk_size>
 using omp_for_dynamic_exec = omp_for_schedule_exec<omp::Dynamic<ChunkSize>>;
 
 ///
-template <int ChunkSize = default_chunk_size>
+template<int ChunkSize = default_chunk_size>
 using omp_for_guided_exec = omp_for_schedule_exec<omp::Guided<ChunkSize>>;
 
 ///
@@ -196,52 +203,58 @@ using omp_for_runtime_exec = omp_for_schedule_exec<omp::Runtime>;
 
 ///
 ///  Internal type aliases supporting 'omp for schedule( ) nowait' for specific
-///  schedule types. 
+///  schedule types.
 ///
 ///  IMPORTANT: We only provide a nowait policy option for static scheduling
 ///             since that is the only scheduling case that can be used with
-///             nowait and be correct in general. Paraphrasing the OpenMP 
+///             nowait and be correct in general. Paraphrasing the OpenMP
 ///             standard:
-///             
-///             Programs that depend on which thread executes a particular 
+///
+///             Programs that depend on which thread executes a particular
 ///             iteration under any circumstance other than static schedule
 ///             are non-conforming.
 ///
-template <int ChunkSize = default_chunk_size>
-using omp_for_nowait_static_exec = omp_for_nowait_schedule_exec<omp::Static<ChunkSize>>;
+template<int ChunkSize = default_chunk_size>
+using omp_for_nowait_static_exec =
+    omp_for_nowait_schedule_exec<omp::Static<ChunkSize>>;
 
 ///
 ///  Struct supporting OpenMP 'parallel' region containing an inner loop
 ///  execution construct.
 ///
-template <typename InnerPolicy>
-using omp_parallel_exec = make_policy_pattern_launch_platform_t<Policy::openmp,
-                                            Pattern::forall,
-                                            Launch::undefined,
-                                            Platform::host,
-                                            omp::Parallel,
-                                            wrapper<InnerPolicy>>;
+template<typename InnerPolicy>
+using omp_parallel_exec =
+    make_policy_pattern_launch_platform_t<Policy::openmp,
+                                          Pattern::forall,
+                                          Launch::undefined,
+                                          Platform::host,
+                                          omp::Parallel,
+                                          wrapper<InnerPolicy>>;
 
 ///
-///  Internal type aliases supporting 'omp parallel for schedule( )' for 
+///  Internal type aliases supporting 'omp parallel for schedule( )' for
 ///  specific schedule types.
 ///
 using omp_parallel_for_exec = omp_parallel_exec<omp_for_exec>;
 
 ///
-template <int ChunkSize = default_chunk_size>
-using omp_parallel_for_static_exec = omp_parallel_exec<omp_for_schedule_exec<omp::Static<ChunkSize>> >;
+template<int ChunkSize = default_chunk_size>
+using omp_parallel_for_static_exec =
+    omp_parallel_exec<omp_for_schedule_exec<omp::Static<ChunkSize>>>;
 
 ///
-template <int ChunkSize = default_chunk_size>
-using omp_parallel_for_dynamic_exec = omp_parallel_exec<omp_for_schedule_exec<omp::Dynamic<ChunkSize>> >;
+template<int ChunkSize = default_chunk_size>
+using omp_parallel_for_dynamic_exec =
+    omp_parallel_exec<omp_for_schedule_exec<omp::Dynamic<ChunkSize>>>;
 
 ///
-template <int ChunkSize = default_chunk_size>
-using omp_parallel_for_guided_exec = omp_parallel_exec<omp_for_schedule_exec<omp::Guided<ChunkSize>> >;
+template<int ChunkSize = default_chunk_size>
+using omp_parallel_for_guided_exec =
+    omp_parallel_exec<omp_for_schedule_exec<omp::Guided<ChunkSize>>>;
 
 ///
-using omp_parallel_for_runtime_exec = omp_parallel_exec<omp_for_schedule_exec<omp::Runtime>>;
+using omp_parallel_for_runtime_exec =
+    omp_parallel_exec<omp_for_schedule_exec<omp::Runtime>>;
 
 
 ///
@@ -256,7 +269,6 @@ using omp_parallel_for_segit = omp_parallel_for_exec;
 ///
 using omp_parallel_segit = omp_parallel_for_segit;
 
-
 ///
 ///////////////////////////////////////////////////////////////////////
 ///
@@ -265,14 +277,13 @@ using omp_parallel_segit = omp_parallel_for_segit;
 ///////////////////////////////////////////////////////////////////////
 ///
 struct omp_taskgraph_segit
-    : make_policy_pattern_t<Policy::openmp, Pattern::taskgraph, omp::Parallel> {
-};
+    : make_policy_pattern_t<Policy::openmp, Pattern::taskgraph, omp::Parallel>
+{};
 
 ///
 struct omp_taskgraph_interval_segit
-    : make_policy_pattern_t<Policy::openmp, Pattern::taskgraph, omp::Parallel> {
-};
-
+    : make_policy_pattern_t<Policy::openmp, Pattern::taskgraph, omp::Parallel>
+{};
 
 ///
 ///////////////////////////////////////////////////////////////////////
@@ -284,8 +295,8 @@ struct omp_taskgraph_interval_segit
 struct omp_work : make_policy_pattern_launch_platform_t<Policy::openmp,
                                                         Pattern::workgroup_exec,
                                                         Launch::sync,
-                                                        Platform::host> {
-};
+                                                        Platform::host>
+{};
 
 ///
 ///////////////////////////////////////////////////////////////////////
@@ -294,31 +305,31 @@ struct omp_work : make_policy_pattern_launch_platform_t<Policy::openmp,
 ///
 ///////////////////////////////////////////////////////////////////////
 ///
-struct omp_reduce : make_policy_pattern_t<Policy::openmp, Pattern::reduce> {
-};
+struct omp_reduce : make_policy_pattern_t<Policy::openmp, Pattern::reduce>
+{};
 
 ///
 struct omp_reduce_ordered
-    : make_policy_pattern_t<Policy::openmp, Pattern::reduce, reduce::ordered> {
-};
+    : make_policy_pattern_t<Policy::openmp, Pattern::reduce, reduce::ordered>
+{};
 
 ///
-template < typename tuning >
-struct omp_multi_reduce_policy
-    : make_policy_pattern_launch_platform_t<Policy::openmp,
-                                            Pattern::multi_reduce,
-                                            Launch::undefined,
-                                            Platform::host,
-                                            std::conditional_t<tuning::consistent,
-                                                               reduce::ordered,
-                                                               reduce::unordered>> {
-};
+template<typename tuning>
+struct omp_multi_reduce_policy : make_policy_pattern_launch_platform_t<
+                                     Policy::openmp,
+                                     Pattern::multi_reduce,
+                                     Launch::undefined,
+                                     Platform::host,
+                                     std::conditional_t<tuning::consistent,
+                                                        reduce::ordered,
+                                                        reduce::unordered>>
+{};
 
 ///
 struct omp_synchronize : make_policy_pattern_launch_t<Policy::openmp,
                                                       Pattern::synchronize,
-                                                      Launch::sync> {
-};
+                                                      Launch::sync>
+{};
 
 #if defined(RAJA_COMPILER_MSVC)
 
@@ -327,14 +338,15 @@ using omp_atomic = builtin_atomic;
 
 #else  // RAJA_COMPILER_MSVC not defined
 
-struct omp_atomic {};
+struct omp_atomic
+{};
 
 #endif
 
 
-template < RAJA::omp::multi_reduce_algorithm algorithm >
-using omp_multi_reduce_tuning = omp_multi_reduce_policy<
-    RAJA::omp::MultiReduceTuning<algorithm> >;
+template<RAJA::omp::multi_reduce_algorithm algorithm>
+using omp_multi_reduce_tuning =
+    omp_multi_reduce_policy<RAJA::omp::MultiReduceTuning<algorithm>>;
 
 // Policies for RAJA::MultiReduce* objects with specific behaviors.
 // - combine_on_destruction policies combine new values into a single value for
@@ -344,8 +356,8 @@ using omp_multi_reduce_combine_on_destruction = omp_multi_reduce_tuning<
     RAJA::omp::multi_reduce_algorithm::combine_on_destruction>;
 // - combine_on_get policies combine new values into a single value for
 //   each thread then when get is called those values are combined.
-using omp_multi_reduce_combine_on_get = omp_multi_reduce_tuning<
-    RAJA::omp::multi_reduce_algorithm::combine_on_get>;
+using omp_multi_reduce_combine_on_get =
+    omp_multi_reduce_tuning<RAJA::omp::multi_reduce_algorithm::combine_on_get>;
 
 // Policy for RAJA::MultiReduce* objects that gives the
 // same answer every time when used in the same way
@@ -360,7 +372,6 @@ using omp_multi_reduce = omp_multi_reduce_unordered;
 }  // namespace omp
 }  // namespace policy
 
-
 ///
 ///////////////////////////////////////////////////////////////////////
 ///
@@ -395,18 +406,19 @@ using policy::omp::omp_parallel_for_segit;
 using policy::omp::omp_parallel_segit;
 
 ///
-/// Type alias for omp parallel region containing an inner 'omp for' loop 
+/// Type alias for omp parallel region containing an inner 'omp for' loop
 /// execution policy. Inner policy types follow.
 ///
 using policy::omp::omp_parallel_exec;
 
 ///
-/// Type alias for 'omp for' loop execution within an omp_parallel_exec construct
+/// Type alias for 'omp for' loop execution within an omp_parallel_exec
+/// construct
 ///
 using policy::omp::omp_for_exec;
 
 ///
-/// Type aliases for 'omp for' and 'omp for nowait' loop execution with a 
+/// Type aliases for 'omp for' and 'omp for nowait' loop execution with a
 /// scheduling policy within an omp_parallel_exec construct
 /// Scheduling policies are near the top of this file and include:
 /// RAJA::policy::omp::{Auto, Static, Dynamic, Guided, Runtime}
@@ -421,7 +433,7 @@ using policy::omp::omp_for_schedule_exec;
 using policy::omp::omp_for_nowait_schedule_exec;
 
 ///
-/// Type aliases for 'omp for' and 'omp for nowait' loop execution with a 
+/// Type aliases for 'omp for' and 'omp for nowait' loop execution with a
 /// static scheduling policy within an omp_parallel_exec construct
 ///
 using policy::omp::omp_for_static_exec;
@@ -437,8 +449,8 @@ using policy::omp::omp_for_runtime_exec;
 ///
 /// Type aliases for omp parallel region
 ///
-using policy::omp::omp_parallel_region;
 using policy::omp::omp_launch_t;
+using policy::omp::omp_parallel_region;
 
 ///
 /// Type aliases for omp reductions
diff --git a/include/RAJA/policy/openmp/reduce.hpp b/include/RAJA/policy/openmp/reduce.hpp
index 7ccc68c3a1..9fd8af9486 100644
--- a/include/RAJA/policy/openmp/reduce.hpp
+++ b/include/RAJA/policy/openmp/reduce.hpp
@@ -42,7 +42,7 @@ namespace RAJA
 
 namespace detail
 {
-template <typename T, typename Reduce>
+template<typename T, typename Reduce>
 class ReduceOMP
     : public reduce::detail::BaseCombinable<T, Reduce, ReduceOMP<T, Reduce>>
 {
@@ -55,7 +55,8 @@ class ReduceOMP
 
   ~ReduceOMP()
   {
-    if (Base::parent) {
+    if (Base::parent)
+    {
 #pragma omp critical(ompReduceCritical)
       Reduce()(Base::parent->local(), Base::my_data);
       Base::my_data = Base::identity;
@@ -75,7 +76,7 @@ RAJA_DECLARE_ALL_REDUCERS(omp_reduce, detail::ReduceOMP)
 
 namespace detail
 {
-template <typename T, typename Reduce>
+template<typename T, typename Reduce>
 class ReduceOMPOrdered
     : public reduce::detail::
           BaseCombinable<T, Reduce, ReduceOMPOrdered<T, Reduce>>
@@ -101,20 +102,22 @@ class ReduceOMPOrdered
 
   ~ReduceOMPOrdered()
   {
-    Reduce{}((*data)[omp_get_thread_num()], Base::my_data);
+    Reduce {}((*data)[omp_get_thread_num()], Base::my_data);
     Base::my_data = Base::identity;
   }
 
   T get_combined() const
   {
-    if (Base::my_data != Base::identity) {
-      Reduce{}((*data)[omp_get_thread_num()], Base::my_data);
+    if (Base::my_data != Base::identity)
+    {
+      Reduce {}((*data)[omp_get_thread_num()], Base::my_data);
       Base::my_data = Base::identity;
     }
 
     T res = Base::identity;
-    for (size_t i = 0; i < data->size(); ++i) {
-      Reduce{}(res, (*data)[i]);
+    for (size_t i = 0; i < data->size(); ++i)
+    {
+      Reduce {}(res, (*data)[i]);
     }
     return res;
   }
diff --git a/include/RAJA/policy/openmp/region.hpp b/include/RAJA/policy/openmp/region.hpp
index 88f0519abf..708f2fc9d8 100644
--- a/include/RAJA/policy/openmp/region.hpp
+++ b/include/RAJA/policy/openmp/region.hpp
@@ -34,16 +34,16 @@ namespace omp
  *
  */
 
-template <typename Func>
-RAJA_INLINE void region_impl(const omp_parallel_region &, Func &&body)
+template<typename Func>
+RAJA_INLINE void region_impl(const omp_parallel_region&, Func&& body)
 {
 
 #pragma omp parallel
-    { // curly brackets to ensure body() is encapsulated in omp parallel region
-      //thread private copy of body
-      auto loopbody = body;
-      loopbody();
-    }
+  {  // curly brackets to ensure body() is encapsulated in omp parallel region
+    // thread private copy of body
+    auto loopbody = body;
+    loopbody();
+  }
 }
 
 }  // namespace omp
diff --git a/include/RAJA/policy/openmp/scan.hpp b/include/RAJA/policy/openmp/scan.hpp
index 97cd7a8ab8..b284127aac 100644
--- a/include/RAJA/policy/openmp/scan.hpp
+++ b/include/RAJA/policy/openmp/scan.hpp
@@ -43,40 +43,40 @@ namespace scan
         \brief explicit inclusive inplace scan given range, function, and
    initial value
 */
-template <typename Policy, typename Iter, typename BinFn>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<resources::Host>,
-                      type_traits::is_openmp_policy<Policy>>
-inclusive_inplace(
-    resources::Host host_res,
-    const Policy&,
-    Iter begin,
-    Iter end,
-    BinFn f)
+template<typename Policy, typename Iter, typename BinFn>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<resources::Host>,
+                                  type_traits::is_openmp_policy<Policy>>
+inclusive_inplace(resources::Host host_res,
+                  const Policy&,
+                  Iter begin,
+                  Iter end,
+                  BinFn f)
 {
-  using std::distance;
   using RAJA::detail::firstIndex;
-  using Value = typename ::std::iterator_traits<Iter>::value_type;
-  const auto n = distance(begin, end);
+  using std::distance;
+  using Value     = typename ::std::iterator_traits<Iter>::value_type;
+  const auto n    = distance(begin, end);
   using DistanceT = typename std::remove_const<decltype(n)>::type;
-  const int p0 = std::min(n, static_cast<DistanceT>(omp_get_max_threads()));
+  const int p0    = std::min(n, static_cast<DistanceT>(omp_get_max_threads()));
   ::std::vector<Value> sums(p0, Value());
 #pragma omp parallel num_threads(p0)
   {
-    const int p = omp_get_num_threads();
-    const int pid = omp_get_thread_num();
+    const int p               = omp_get_num_threads();
+    const int pid             = omp_get_thread_num();
     const DistanceT idx_begin = firstIndex(n, p, pid);
-    const DistanceT idx_end = firstIndex(n, p, pid + 1);
-    if (idx_begin != idx_end) {
-      inclusive_inplace(host_res, ::RAJA::seq_exec{},
-                        begin + idx_begin, begin + idx_end, f);
+    const DistanceT idx_end   = firstIndex(n, p, pid + 1);
+    if (idx_begin != idx_end)
+    {
+      inclusive_inplace(host_res, ::RAJA::seq_exec {}, begin + idx_begin,
+                        begin + idx_end, f);
       sums[pid] = begin[idx_end - 1];
     }
 #pragma omp barrier
 #pragma omp single
-    exclusive_inplace(host_res, ::RAJA::seq_exec{},
-                      sums.data(), sums.data() + p, f, BinFn::identity());
-    for (auto i = idx_begin; i < idx_end; ++i) {
+    exclusive_inplace(host_res, ::RAJA::seq_exec {}, sums.data(),
+                      sums.data() + p, f, BinFn::identity());
+    for (auto i = idx_begin; i < idx_end; ++i)
+    {
       begin[i] = f(begin[i], sums[pid]);
     }
   }
@@ -88,43 +88,43 @@ inclusive_inplace(
         \brief explicit exclusive inplace scan given range, function, and
    initial value
 */
-template <typename Policy, typename Iter, typename BinFn, typename ValueT>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<resources::Host>,
-                      type_traits::is_openmp_policy<Policy>>
-exclusive_inplace(
-    resources::Host host_res,
-    const Policy&,
-    Iter begin,
-    Iter end,
-    BinFn f,
-    ValueT v)
+template<typename Policy, typename Iter, typename BinFn, typename ValueT>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<resources::Host>,
+                                  type_traits::is_openmp_policy<Policy>>
+exclusive_inplace(resources::Host host_res,
+                  const Policy&,
+                  Iter begin,
+                  Iter end,
+                  BinFn f,
+                  ValueT v)
 {
-  using std::distance;
   using RAJA::detail::firstIndex;
-  using Value = typename ::std::iterator_traits<Iter>::value_type;
-  const auto n = distance(begin, end);
+  using std::distance;
+  using Value     = typename ::std::iterator_traits<Iter>::value_type;
+  const auto n    = distance(begin, end);
   using DistanceT = typename std::remove_const<decltype(n)>::type;
-  const int p0 = std::min(n, static_cast<DistanceT>(omp_get_max_threads()));
+  const int p0    = std::min(n, static_cast<DistanceT>(omp_get_max_threads()));
   ::std::vector<Value> sums(p0, v);
 #pragma omp parallel num_threads(p0)
   {
-    const int p = omp_get_num_threads();
-    const int pid = omp_get_thread_num();
+    const int p               = omp_get_num_threads();
+    const int pid             = omp_get_thread_num();
     const DistanceT idx_begin = firstIndex(n, p, pid);
-    const DistanceT idx_end = firstIndex(n, p, pid + 1);
-    const Value init = ((pid == 0) ? v : *(begin + idx_begin - 1));
+    const DistanceT idx_end   = firstIndex(n, p, pid + 1);
+    const Value init          = ((pid == 0) ? v : *(begin + idx_begin - 1));
 #pragma omp barrier
-    if (idx_begin != idx_end) {
-      exclusive_inplace(host_res, seq_exec{},
-                        begin + idx_begin, begin + idx_end, f, init);
+    if (idx_begin != idx_end)
+    {
+      exclusive_inplace(host_res, seq_exec {}, begin + idx_begin,
+                        begin + idx_end, f, init);
       sums[pid] = begin[idx_end - 1];
     }
 #pragma omp barrier
 #pragma omp single
-    exclusive_inplace(host_res, seq_exec{},
-                      sums.data(), sums.data() + p, f, BinFn::identity());
-    for (auto i = idx_begin; i < idx_end; ++i) {
+    exclusive_inplace(host_res, seq_exec {}, sums.data(), sums.data() + p, f,
+                      BinFn::identity());
+    for (auto i = idx_begin; i < idx_end; ++i)
+    {
       begin[i] = f(begin[i], sums[pid]);
     }
   }
@@ -136,17 +136,15 @@ exclusive_inplace(
         \brief explicit inclusive scan given input range, output, function, and
    initial value
 */
-template <typename Policy, typename Iter, typename OutIter, typename BinFn>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<resources::Host>,
-                      type_traits::is_openmp_policy<Policy>>
-inclusive(
-    resources::Host host_res,
-    const Policy& exec,
-    Iter begin,
-    Iter end,
-    OutIter out,
-    BinFn f)
+template<typename Policy, typename Iter, typename OutIter, typename BinFn>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<resources::Host>,
+                                  type_traits::is_openmp_policy<Policy>>
+inclusive(resources::Host host_res,
+          const Policy& exec,
+          Iter begin,
+          Iter end,
+          OutIter out,
+          BinFn f)
 {
   using std::distance;
   ::std::copy(begin, end, out);
@@ -157,26 +155,25 @@ inclusive(
         \brief explicit exclusive scan given input range, output, function, and
    initial value
 */
-template <typename Policy,
-          typename Iter,
-          typename OutIter,
-          typename BinFn,
-          typename ValueT>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<resources::Host>,
-                      type_traits::is_openmp_policy<Policy>>
-exclusive(
-    resources::Host host_res,
-    const Policy& exec,
-    Iter begin,
-    Iter end,
-    OutIter out,
-    BinFn f,
-    ValueT v)
+template<typename Policy,
+         typename Iter,
+         typename OutIter,
+         typename BinFn,
+         typename ValueT>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<resources::Host>,
+                                  type_traits::is_openmp_policy<Policy>>
+exclusive(resources::Host host_res,
+          const Policy& exec,
+          Iter begin,
+          Iter end,
+          OutIter out,
+          BinFn f,
+          ValueT v)
 {
   using std::distance;
   ::std::copy(begin, end, out);
-  return exclusive_inplace(host_res, exec, out, out + distance(begin, end), f, v);
+  return exclusive_inplace(host_res, exec, out, out + distance(begin, end), f,
+                           v);
 }
 
 }  // namespace scan
diff --git a/include/RAJA/policy/openmp/sort.hpp b/include/RAJA/policy/openmp/sort.hpp
index 9e4474d692..23980d821a 100644
--- a/include/RAJA/policy/openmp/sort.hpp
+++ b/include/RAJA/policy/openmp/sort.hpp
@@ -54,7 +54,7 @@ constexpr int get_min_iterates_per_task() { return 128; }
         \brief sort given range using sorter and comparison function
                by spawning tasks
 */
-template <typename Sorter, typename Iter, typename Compare>
+template<typename Sorter, typename Iter, typename Compare>
 inline void sort_task(Sorter sorter,
                       Iter begin,
                       RAJA::detail::IterDiff<Iter> i_begin,
@@ -62,16 +62,18 @@ inline void sort_task(Sorter sorter,
                       RAJA::detail::IterDiff<Iter> iterates_per_task,
                       Compare comp)
 {
-  using diff_type = RAJA::detail::IterDiff<Iter>;
+  using diff_type   = RAJA::detail::IterDiff<Iter>;
   const diff_type n = i_end - i_begin;
 
-  if (n <= iterates_per_task) {
-
-    sorter(begin+i_begin, begin+i_end, comp);
+  if (n <= iterates_per_task)
+  {
 
-  } else {
+    sorter(begin + i_begin, begin + i_end, comp);
+  }
+  else
+  {
 
-    const diff_type i_middle = i_begin + n/2;
+    const diff_type i_middle = i_begin + n / 2;
 
 #pragma omp task
     sort_task(sorter, begin, i_begin, i_middle, iterates_per_task, comp);
@@ -81,8 +83,10 @@ inline void sort_task(Sorter sorter,
 
 #pragma omp taskwait
 
-    //std::inplace_merge(begin + i_begin, begin + i_middle, begin + i_end, comp);
-    RAJA::detail::inplace_merge(begin + i_begin, begin + i_middle, begin + i_end, comp);
+    // std::inplace_merge(begin + i_begin, begin + i_middle, begin + i_end,
+    // comp);
+    RAJA::detail::inplace_merge(begin + i_begin, begin + i_middle,
+                                begin + i_end, comp);
   }
 }
 
@@ -92,7 +96,7 @@ inline void sort_task(Sorter sorter,
         \brief sort given range using sorter and comparison function
                by manually assigning work to threads
 */
-template <typename Sorter, typename Iter, typename Compare>
+template<typename Sorter, typename Iter, typename Compare>
 inline void sort_parallel_region(Sorter sorter,
                                  Iter begin,
                                  RAJA::detail::IterDiff<Iter> n,
@@ -114,20 +118,27 @@ inline void sort_parallel_region(Sorter sorter,
   }
 
   // hierarchically merge ranges
-  for (diff_type middle_offset = 1; middle_offset < num_threads; middle_offset *= 2) {
+  for (diff_type middle_offset = 1; middle_offset < num_threads;
+       middle_offset *= 2)
+  {
 
-    diff_type end_offset = 2*middle_offset;
+    diff_type end_offset = 2 * middle_offset;
 
-    const diff_type i_middle = firstIndex(n, num_threads, std::min(thread_id + middle_offset, num_threads));
-    const diff_type i_end    = firstIndex(n, num_threads, std::min(thread_id + end_offset,    num_threads));
+    const diff_type i_middle = firstIndex(
+        n, num_threads, std::min(thread_id + middle_offset, num_threads));
+    const diff_type i_end = firstIndex(
+        n, num_threads, std::min(thread_id + end_offset, num_threads));
 
 #pragma omp barrier
 
-    if (thread_id % end_offset == 0) {
+    if (thread_id % end_offset == 0)
+    {
 
       // this thread merges ranges [i_begin, i_middle) and [i_middle, i_end)
-      //std::inplace_merge(begin + i_begin, begin + i_middle, begin + i_end, comp);
-      RAJA::detail::inplace_merge(begin + i_begin, begin + i_middle, begin + i_end, comp);
+      // std::inplace_merge(begin + i_begin, begin + i_middle, begin + i_end,
+      // comp);
+      RAJA::detail::inplace_merge(begin + i_begin, begin + i_middle,
+                                  begin + i_end, comp);
     }
   }
 }
@@ -138,12 +149,8 @@ inline void sort_parallel_region(Sorter sorter,
 /*!
         \brief sort given range using sorter and comparison function
 */
-template <typename Sorter, typename Iter, typename Compare>
-inline
-void sort(Sorter sorter,
-          Iter begin,
-          Iter end,
-          Compare comp)
+template<typename Sorter, typename Iter, typename Compare>
+inline void sort(Sorter sorter, Iter begin, Iter end, Compare comp)
 {
   using diff_type = RAJA::detail::IterDiff<Iter>;
 
@@ -151,20 +158,24 @@ void sort(Sorter sorter,
 
   const diff_type n = end - begin;
 
-  if (n <= min_iterates_per_task) {
+  if (n <= min_iterates_per_task)
+  {
 
     sorter(begin, end, comp);
-
-  } else {
+  }
+  else
+  {
 
     const diff_type max_threads = omp_get_max_threads();
 
 #if defined(RAJA_ENABLE_OPENMP_TASK_INTERNAL)
 
-    const diff_type iterates_per_task = std::max(n/(2*max_threads), min_iterates_per_task);
+    const diff_type iterates_per_task =
+        std::max(n / (2 * max_threads), min_iterates_per_task);
 
-    const diff_type requested_num_threads = std::min((n+iterates_per_task-1)/iterates_per_task, max_threads);
-    RAJA_UNUSED_VAR(requested_num_threads); // avoid warning in hip device code
+    const diff_type requested_num_threads =
+        std::min((n + iterates_per_task - 1) / iterates_per_task, max_threads);
+    RAJA_UNUSED_VAR(requested_num_threads);  // avoid warning in hip device code
 
 #pragma omp parallel num_threads(static_cast<int>(requested_num_threads))
 #pragma omp master
@@ -174,8 +185,9 @@ void sort(Sorter sorter,
 
 #else
 
-    const diff_type requested_num_threads = std::min((n+min_iterates_per_task-1)/min_iterates_per_task, max_threads);
-    RAJA_UNUSED_VAR(requested_num_threads); // avoid warning in hip device code
+    const diff_type requested_num_threads = std::min(
+        (n + min_iterates_per_task - 1) / min_iterates_per_task, max_threads);
+    RAJA_UNUSED_VAR(requested_num_threads);  // avoid warning in hip device code
 
 #pragma omp parallel num_threads(static_cast<int>(requested_num_threads))
     {
@@ -186,24 +198,23 @@ void sort(Sorter sorter,
   }
 }
 
-} // namespace openmp
+}  // namespace openmp
 
-} // namespace detail
+}  // namespace detail
 
 /*!
         \brief sort given range using comparison function
 */
-template <typename ExecPolicy, typename Iter, typename Compare>
+template<typename ExecPolicy, typename Iter, typename Compare>
 concepts::enable_if_t<resources::EventProxy<resources::Host>,
                       type_traits::is_openmp_policy<ExecPolicy>>
-unstable(
-    resources::Host host_res,
-    const ExecPolicy&,
-    Iter begin,
-    Iter end,
-    Compare comp)
+unstable(resources::Host host_res,
+         const ExecPolicy&,
+         Iter begin,
+         Iter end,
+         Compare comp)
 {
-  detail::openmp::sort(detail::UnstableSorter{}, begin, end, comp);
+  detail::openmp::sort(detail::UnstableSorter {}, begin, end, comp);
 
   return resources::EventProxy<resources::Host>(host_res);
 }
@@ -211,17 +222,16 @@ unstable(
 /*!
         \brief stable sort given range using comparison function
 */
-template <typename ExecPolicy, typename Iter, typename Compare>
+template<typename ExecPolicy, typename Iter, typename Compare>
 concepts::enable_if_t<resources::EventProxy<resources::Host>,
                       type_traits::is_openmp_policy<ExecPolicy>>
-stable(
-    resources::Host host_res,
-    const ExecPolicy&,
-    Iter begin,
-    Iter end,
-    Compare comp)
+stable(resources::Host host_res,
+       const ExecPolicy&,
+       Iter begin,
+       Iter end,
+       Compare comp)
 {
-  detail::openmp::sort(detail::StableSorter{}, begin, end, comp);
+  detail::openmp::sort(detail::StableSorter {}, begin, end, comp);
 
   return resources::EventProxy<resources::Host>(host_res);
 }
@@ -229,43 +239,50 @@ stable(
 /*!
         \brief sort given range of pairs using comparison function on keys
 */
-template <typename ExecPolicy, typename KeyIter, typename ValIter, typename Compare>
+template<typename ExecPolicy,
+         typename KeyIter,
+         typename ValIter,
+         typename Compare>
 concepts::enable_if_t<resources::EventProxy<resources::Host>,
                       type_traits::is_openmp_policy<ExecPolicy>>
-unstable_pairs(
-    resources::Host host_res,
-    const ExecPolicy&,
-    KeyIter keys_begin,
-    KeyIter keys_end,
-    ValIter vals_begin,
-    Compare comp)
+unstable_pairs(resources::Host host_res,
+               const ExecPolicy&,
+               KeyIter keys_begin,
+               KeyIter keys_end,
+               ValIter vals_begin,
+               Compare comp)
 {
-  auto begin  = RAJA::zip(keys_begin, vals_begin);
-  auto end    = RAJA::zip(keys_end, vals_begin+(keys_end-keys_begin));
+  auto begin    = RAJA::zip(keys_begin, vals_begin);
+  auto end      = RAJA::zip(keys_end, vals_begin + (keys_end - keys_begin));
   using zip_ref = RAJA::detail::IterRef<camp::decay<decltype(begin)>>;
-  detail::openmp::sort(detail::UnstableSorter{}, begin, end, RAJA::compare_first<zip_ref>(comp));
+  detail::openmp::sort(detail::UnstableSorter {}, begin, end,
+                       RAJA::compare_first<zip_ref>(comp));
 
   return resources::EventProxy<resources::Host>(host_res);
 }
 
 /*!
-        \brief stable sort given range of pairs using comparison function on keys
+        \brief stable sort given range of pairs using comparison function on
+   keys
 */
-template <typename ExecPolicy, typename KeyIter, typename ValIter, typename Compare>
+template<typename ExecPolicy,
+         typename KeyIter,
+         typename ValIter,
+         typename Compare>
 concepts::enable_if_t<resources::EventProxy<resources::Host>,
                       type_traits::is_openmp_policy<ExecPolicy>>
-stable_pairs(
-    resources::Host host_res,
-    const ExecPolicy&,
-    KeyIter keys_begin,
-    KeyIter keys_end,
-    ValIter vals_begin,
-    Compare comp)
+stable_pairs(resources::Host host_res,
+             const ExecPolicy&,
+             KeyIter keys_begin,
+             KeyIter keys_end,
+             ValIter vals_begin,
+             Compare comp)
 {
-  auto begin  = RAJA::zip(keys_begin, vals_begin);
-  auto end    = RAJA::zip(keys_end, vals_begin+(keys_end-keys_begin));
+  auto begin    = RAJA::zip(keys_begin, vals_begin);
+  auto end      = RAJA::zip(keys_end, vals_begin + (keys_end - keys_begin));
   using zip_ref = RAJA::detail::IterRef<camp::decay<decltype(begin)>>;
-  detail::openmp::sort(detail::StableSorter{}, begin, end, RAJA::compare_first<zip_ref>(comp));
+  detail::openmp::sort(detail::StableSorter {}, begin, end,
+                       RAJA::compare_first<zip_ref>(comp));
 
   return resources::EventProxy<resources::Host>(host_res);
 }
diff --git a/include/RAJA/policy/openmp_target.hpp b/include/RAJA/policy/openmp_target.hpp
index af88127636..018b3878d8 100644
--- a/include/RAJA/policy/openmp_target.hpp
+++ b/include/RAJA/policy/openmp_target.hpp
@@ -30,10 +30,11 @@
 #include "RAJA/policy/openmp_target/kernel.hpp"
 #include "RAJA/policy/openmp_target/forall.hpp"
 #include "RAJA/policy/openmp_target/reduce.hpp"
-//#include "RAJA/policy/openmp_target/multi_reduce.hpp"
+// #include "RAJA/policy/openmp_target/multi_reduce.hpp"
 #include "RAJA/policy/openmp_target/WorkGroup.hpp"
 
 
-#endif  // closing endif for if defined(RAJA_ENABLE_OPENMP) && defined(RAJA_ENABLE_TARGET_OPENMP)
+#endif  // closing endif for if defined(RAJA_ENABLE_OPENMP) &&
+        // defined(RAJA_ENABLE_TARGET_OPENMP)
 
 #endif  // closing endif for header file include guard
diff --git a/include/RAJA/policy/openmp_target/WorkGroup/Dispatcher.hpp b/include/RAJA/policy/openmp_target/WorkGroup/Dispatcher.hpp
index a4a4a62903..ca638351b2 100644
--- a/include/RAJA/policy/openmp_target/WorkGroup/Dispatcher.hpp
+++ b/include/RAJA/policy/openmp_target/WorkGroup/Dispatcher.hpp
@@ -24,7 +24,6 @@
 
 #include "RAJA/pattern/WorkGroup/Dispatcher.hpp"
 
-
 namespace RAJA
 {
 
@@ -36,12 +35,12 @@ namespace omp_target
 
 // create the value in a target region using the factory, map the value
 // back, and return the value created in the target region
-template < typename Factory >
+template<typename Factory>
 inline auto get_value(Factory factory)
 {
   typename std::decay_t<Factory>::value_type value;
 
-  #pragma omp target map(tofrom : value) map(to : factory)
+#pragma omp target map(tofrom : value) map(to : factory)
   {
     value = factory();
   }
@@ -51,7 +50,7 @@ inline auto get_value(Factory factory)
 
 // get the device value and store it so it can be used
 // multiple times
-template < typename Factory >
+template<typename Factory>
 inline auto get_cached_value(Factory&& factory)
 {
   static auto value = get_value(std::forward<Factory>(factory));
@@ -61,17 +60,17 @@ inline auto get_cached_value(Factory&& factory)
 }  // namespace omp_target
 
 /*!
-* Populate and return a Dispatcher object that can be used in omp target regions
-*/
-template < typename T, typename Dispatcher_T >
+ * Populate and return a Dispatcher object that can be used in omp target
+ * regions
+ */
+template<typename T, typename Dispatcher_T>
 inline const Dispatcher_T* get_Dispatcher(omp_target_work const&)
 {
-  static Dispatcher_T dispatcher{
-        Dispatcher_T::template makeDispatcher<T>(
-          [](auto&& factory) {
-            return omp_target::get_cached_value(
-                std::forward<decltype(factory)>(factory));
-          }) };
+  static Dispatcher_T dispatcher {
+      Dispatcher_T::template makeDispatcher<T>([](auto&& factory) {
+        return omp_target::get_cached_value(
+            std::forward<decltype(factory)>(factory));
+      })};
   return &dispatcher;
 }
 
diff --git a/include/RAJA/policy/openmp_target/WorkGroup/WorkRunner.hpp b/include/RAJA/policy/openmp_target/WorkGroup/WorkRunner.hpp
index b373d09c61..b8acaf2bab 100644
--- a/include/RAJA/policy/openmp_target/WorkGroup/WorkRunner.hpp
+++ b/include/RAJA/policy/openmp_target/WorkGroup/WorkRunner.hpp
@@ -24,7 +24,6 @@
 
 #include "RAJA/pattern/WorkGroup/WorkRunner.hpp"
 
-
 namespace RAJA
 {
 
@@ -35,51 +34,47 @@ namespace detail
  * Runs work in a storage container in order
  * and returns any per run resources
  */
-template <typename DISPATCH_POLICY_T,
-          typename ALLOCATOR_T,
-          typename INDEX_T,
-          typename ... Args>
-struct WorkRunner<
-        RAJA::omp_target_work,
-        RAJA::ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-    : WorkRunnerForallOrdered<
-        RAJA::omp_target_parallel_for_exec_nt,
-        RAJA::omp_target_work,
-        RAJA::ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-{ };
+template<typename DISPATCH_POLICY_T,
+         typename ALLOCATOR_T,
+         typename INDEX_T,
+         typename... Args>
+struct WorkRunner<RAJA::omp_target_work,
+                  RAJA::ordered,
+                  DISPATCH_POLICY_T,
+                  ALLOCATOR_T,
+                  INDEX_T,
+                  Args...>
+    : WorkRunnerForallOrdered<RAJA::omp_target_parallel_for_exec_nt,
+                              RAJA::omp_target_work,
+                              RAJA::ordered,
+                              DISPATCH_POLICY_T,
+                              ALLOCATOR_T,
+                              INDEX_T,
+                              Args...>
+{};
 
 /*!
  * Runs work in a storage container in reverse order
  * and returns any per run resources
  */
-template <typename DISPATCH_POLICY_T,
-          typename ALLOCATOR_T,
-          typename INDEX_T,
-          typename ... Args>
-struct WorkRunner<
-        RAJA::omp_target_work,
-        RAJA::reverse_ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-    : WorkRunnerForallReverse<
-        RAJA::omp_target_parallel_for_exec_nt,
-        RAJA::omp_target_work,
-        RAJA::reverse_ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-{ };
+template<typename DISPATCH_POLICY_T,
+         typename ALLOCATOR_T,
+         typename INDEX_T,
+         typename... Args>
+struct WorkRunner<RAJA::omp_target_work,
+                  RAJA::reverse_ordered,
+                  DISPATCH_POLICY_T,
+                  ALLOCATOR_T,
+                  INDEX_T,
+                  Args...>
+    : WorkRunnerForallReverse<RAJA::omp_target_parallel_for_exec_nt,
+                              RAJA::omp_target_work,
+                              RAJA::reverse_ordered,
+                              DISPATCH_POLICY_T,
+                              ALLOCATOR_T,
+                              INDEX_T,
+                              Args...>
+{};
 
 }  // namespace detail
 
diff --git a/include/RAJA/policy/openmp_target/forall.hpp b/include/RAJA/policy/openmp_target/forall.hpp
index 061481cbc1..51629413e5 100644
--- a/include/RAJA/policy/openmp_target/forall.hpp
+++ b/include/RAJA/policy/openmp_target/forall.hpp
@@ -33,13 +33,15 @@ namespace omp
 /// OpenMP target parallel for policy implementation
 ///
 
-template <size_t ThreadsPerTeam, typename Iterable, typename Func, typename ForallParam>
-RAJA_INLINE 
-concepts::enable_if_t<
-  resources::EventProxy<resources::Omp>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
-  >
+template<size_t ThreadsPerTeam,
+         typename Iterable,
+         typename Func,
+         typename ForallParam>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Omp>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    concepts::negate<
+        RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>>
 forall_impl(resources::Omp omp_res,
             const omp_target_parallel_for_exec<ThreadsPerTeam>& p,
             Iterable&& iter,
@@ -51,33 +53,35 @@ forall_impl(resources::Omp omp_res,
   RAJA_OMP_DECLARE_REDUCTION_COMBINE;
 
   using Body = typename std::remove_reference<decltype(loop_body)>::type;
-  Body body = loop_body;
+  Body body  = loop_body;
 
   RAJA_EXTRACT_BED_IT(iter);
 
   // Reset if exceed CUDA threads per block limit.
   int tperteam = ThreadsPerTeam;
-  if ( tperteam > omp::MAXNUMTHREADS )
+  if (tperteam > omp::MAXNUMTHREADS)
   {
     tperteam = omp::MAXNUMTHREADS;
   }
 
   // calculate number of teams based on user defined threads per team
   // datasize is distance between begin() and end() of iterable
-  auto numteams = RAJA_DIVIDE_CEILING_INT( distance_it, tperteam );
-  if ( numteams > tperteam )
+  auto numteams = RAJA_DIVIDE_CEILING_INT(distance_it, tperteam);
+  if (numteams > tperteam)
   {
     // Omp target reducers will write team # results, into Threads-sized array.
     // Need to insure NumTeams <= Threads to prevent array out of bounds access.
     numteams = tperteam;
   }
 
-// thread_limit(tperteam) unused due to XL seg fault (when tperteam != distance)
+  // thread_limit(tperteam) unused due to XL seg fault (when tperteam !=
+  // distance)
   auto i = distance_it;
 
-#pragma omp target teams distribute parallel for num_teams(numteams) \
-    schedule(static, 1) map(to : body,begin_it) reduction(combine: f_params)
-  for (i = 0; i < distance_it; ++i) {
+#pragma omp target teams distribute parallel for num_teams(numteams)           \
+    schedule(static, 1) map(to : body, begin_it) reduction(combine : f_params)
+  for (i = 0; i < distance_it; ++i)
+  {
     Body ib = body;
     RAJA::expt::invoke_body(f_params, ib, begin_it[i]);
   }
@@ -86,13 +90,14 @@ forall_impl(resources::Omp omp_res,
   return resources::EventProxy<resources::Omp>(omp_res);
 }
 
-template <size_t ThreadsPerTeam, typename Iterable, typename Func, typename ForallParam>
-RAJA_INLINE 
-concepts::enable_if_t<
-  resources::EventProxy<resources::Omp>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>
-  >
+template<size_t ThreadsPerTeam,
+         typename Iterable,
+         typename Func,
+         typename ForallParam>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Omp>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
 forall_impl(resources::Omp omp_res,
             const omp_target_parallel_for_exec<ThreadsPerTeam>&,
             Iterable&& iter,
@@ -100,33 +105,35 @@ forall_impl(resources::Omp omp_res,
             ForallParam)
 {
   using Body = typename std::remove_reference<decltype(loop_body)>::type;
-  Body body = loop_body;
+  Body body  = loop_body;
 
   RAJA_EXTRACT_BED_IT(iter);
 
   // Reset if exceed CUDA threads per block limit.
   int tperteam = ThreadsPerTeam;
-  if ( tperteam > omp::MAXNUMTHREADS )
+  if (tperteam > omp::MAXNUMTHREADS)
   {
     tperteam = omp::MAXNUMTHREADS;
   }
 
   // calculate number of teams based on user defined threads per team
   // datasize is distance between begin() and end() of iterable
-  auto numteams = RAJA_DIVIDE_CEILING_INT( distance_it, tperteam );
-  if ( numteams > tperteam )
+  auto numteams = RAJA_DIVIDE_CEILING_INT(distance_it, tperteam);
+  if (numteams > tperteam)
   {
     // Omp target reducers will write team # results, into Threads-sized array.
     // Need to insure NumTeams <= Threads to prevent array out of bounds access.
     numteams = tperteam;
   }
 
-// thread_limit(tperteam) unused due to XL seg fault (when tperteam != distance)
+  // thread_limit(tperteam) unused due to XL seg fault (when tperteam !=
+  // distance)
   auto i = distance_it;
 
-#pragma omp target teams distribute parallel for num_teams(numteams) \
-    schedule(static, 1) map(to : body,begin_it)
-  for (i = 0; i < distance_it; ++i) {
+#pragma omp target teams distribute parallel for num_teams(numteams)           \
+    schedule(static, 1) map(to : body, begin_it)
+  for (i = 0; i < distance_it; ++i)
+  {
     Body ib = body;
     ib(begin_it[i]);
   }
@@ -134,17 +141,12 @@ forall_impl(resources::Omp omp_res,
   return resources::EventProxy<resources::Omp>(omp_res);
 }
 
-
-
-
-
-template <typename Iterable, typename Func, typename ForallParam>
-RAJA_INLINE
-concepts::enable_if_t<
-  resources::EventProxy<resources::Omp>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
-  >
+template<typename Iterable, typename Func, typename ForallParam>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Omp>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    concepts::negate<
+        RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>>
 forall_impl(resources::Omp omp_res,
             const omp_target_parallel_for_exec_nt& p,
             Iterable&& iter,
@@ -156,13 +158,14 @@ forall_impl(resources::Omp omp_res,
   RAJA_OMP_DECLARE_REDUCTION_COMBINE;
 
   using Body = typename std::remove_reference<decltype(loop_body)>::type;
-  Body body = loop_body;
+  Body body  = loop_body;
 
   RAJA_EXTRACT_BED_IT(iter);
 
-#pragma omp target teams distribute parallel for schedule(static, 1) \
-    firstprivate(body,begin_it) reduction(combine: f_params)
-  for (decltype(distance_it) i = 0; i < distance_it; ++i) {
+#pragma omp target teams distribute parallel for schedule(static, 1)           \
+    firstprivate(body, begin_it) reduction(combine : f_params)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
+  {
     Body ib = body;
     RAJA::expt::invoke_body(f_params, ib, begin_it[i]);
   }
@@ -171,13 +174,11 @@ forall_impl(resources::Omp omp_res,
   return resources::EventProxy<resources::Omp>(omp_res);
 }
 
-template <typename Iterable, typename Func, typename ForallParam>
-RAJA_INLINE
-concepts::enable_if_t<
-  resources::EventProxy<resources::Omp>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>
-  >
+template<typename Iterable, typename Func, typename ForallParam>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Omp>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
 forall_impl(resources::Omp omp_res,
             const omp_target_parallel_for_exec_nt&,
             Iterable&& iter,
@@ -185,13 +186,14 @@ forall_impl(resources::Omp omp_res,
             ForallParam)
 {
   using Body = typename std::remove_reference<decltype(loop_body)>::type;
-  Body body = loop_body;
+  Body body  = loop_body;
 
   RAJA_EXTRACT_BED_IT(iter);
 
-#pragma omp target teams distribute parallel for schedule(static, 1) \
-    firstprivate(body,begin_it)
-  for (decltype(distance_it) i = 0; i < distance_it; ++i) {
+#pragma omp target teams distribute parallel for schedule(static, 1)           \
+    firstprivate(body, begin_it)
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
+  {
     Body ib = body;
     ib(begin_it[i]);
   }
diff --git a/include/RAJA/policy/openmp_target/kernel/Collapse.hpp b/include/RAJA/policy/openmp_target/kernel/Collapse.hpp
index b72147151c..dd7dd4695c 100644
--- a/include/RAJA/policy/openmp_target/kernel/Collapse.hpp
+++ b/include/RAJA/policy/openmp_target/kernel/Collapse.hpp
@@ -10,15 +10,21 @@
 
 #include "RAJA/pattern/kernel/internal.hpp"
 
-namespace RAJA {
-namespace internal {
+namespace RAJA
+{
+namespace internal
+{
 
-template <camp::idx_t Arg0, camp::idx_t Arg1, typename... EnclosedStmts, typename Types>
+template<camp::idx_t Arg0,
+         camp::idx_t Arg1,
+         typename... EnclosedStmts,
+         typename Types>
 struct StatementExecutor<statement::Collapse<omp_target_parallel_collapse_exec,
                                              ArgList<Arg0, Arg1>,
-                                             EnclosedStmts...>, Types>
+                                             EnclosedStmts...>,
+                         Types>
 {
-  template <typename Data>
+  template<typename Data>
   static RAJA_INLINE void exec(Data&& data)
   {
     auto l0 = segment_length<Arg0>(data);
@@ -30,29 +36,33 @@ struct StatementExecutor<statement::Collapse<omp_target_parallel_collapse_exec,
 
     using RAJA::internal::thread_privatize;
     auto privatizer = thread_privatize(data);
-#pragma omp target teams distribute parallel for schedule(static, 1) \
+#pragma omp target teams distribute parallel for schedule(static, 1)           \
     firstprivate(privatizer) collapse(2)
-      for (auto i0 = (decltype(l0))0; i0 < l0; ++i0) {
-        for (auto i1 = (decltype(l1))0; i1 < l1; ++i1) {
-          auto& private_data = privatizer.get_priv();
-          private_data.template assign_offset<Arg0>(i0);
-          private_data.template assign_offset<Arg1>(i1);
-          execute_statement_list<camp::list<EnclosedStmts...>, NewTypes1>(private_data);
-        }
+    for (auto i0 = (decltype(l0))0; i0 < l0; ++i0)
+    {
+      for (auto i1 = (decltype(l1))0; i1 < l1; ++i1)
+      {
+        auto& private_data = privatizer.get_priv();
+        private_data.template assign_offset<Arg0>(i0);
+        private_data.template assign_offset<Arg1>(i1);
+        execute_statement_list<camp::list<EnclosedStmts...>, NewTypes1>(
+            private_data);
       }
     }
+  }
 };
 
-template <camp::idx_t Arg0,
-          camp::idx_t Arg1,
-          camp::idx_t Arg2,
-          typename... EnclosedStmts,
-          typename Types>
+template<camp::idx_t Arg0,
+         camp::idx_t Arg1,
+         camp::idx_t Arg2,
+         typename... EnclosedStmts,
+         typename Types>
 struct StatementExecutor<statement::Collapse<omp_target_parallel_collapse_exec,
                                              ArgList<Arg0, Arg1, Arg2>,
-                                             EnclosedStmts...>, Types>
+                                             EnclosedStmts...>,
+                         Types>
 {
-  template <typename Data>
+  template<typename Data>
   static RAJA_INLINE void exec(Data&& data)
   {
     auto l0 = segment_length<Arg0>(data);
@@ -66,33 +76,38 @@ struct StatementExecutor<statement::Collapse<omp_target_parallel_collapse_exec,
 
     using RAJA::internal::thread_privatize;
     auto privatizer = thread_privatize(data);
-#pragma omp target teams distribute parallel for schedule(static, 1) \
+#pragma omp target teams distribute parallel for schedule(static, 1)           \
     firstprivate(privatizer) collapse(3)
-      for (auto i0 = (decltype(l0))0; i0 < l0; ++i0) {
-        for (auto i1 = (decltype(l1))0; i1 < l1; ++i1) {
-          for (auto i2 = (decltype(l2))0; i2 < l2; ++i2) {
-            auto& private_data = privatizer.get_priv();
-            private_data.template assign_offset<Arg0>(i0);
-            private_data.template assign_offset<Arg1>(i1);
-            private_data.template assign_offset<Arg2>(i2);
-            execute_statement_list<camp::list<EnclosedStmts...>, NewTypes2>(private_data);
-          }
+    for (auto i0 = (decltype(l0))0; i0 < l0; ++i0)
+    {
+      for (auto i1 = (decltype(l1))0; i1 < l1; ++i1)
+      {
+        for (auto i2 = (decltype(l2))0; i2 < l2; ++i2)
+        {
+          auto& private_data = privatizer.get_priv();
+          private_data.template assign_offset<Arg0>(i0);
+          private_data.template assign_offset<Arg1>(i1);
+          private_data.template assign_offset<Arg2>(i2);
+          execute_statement_list<camp::list<EnclosedStmts...>, NewTypes2>(
+              private_data);
         }
       }
     }
+  }
 };
 
-template <camp::idx_t Arg0,
-          camp::idx_t Arg1,
-          camp::idx_t Arg2,
-          camp::idx_t Arg3,
-          typename... EnclosedStmts,
-          typename Types>
+template<camp::idx_t Arg0,
+         camp::idx_t Arg1,
+         camp::idx_t Arg2,
+         camp::idx_t Arg3,
+         typename... EnclosedStmts,
+         typename Types>
 struct StatementExecutor<statement::Collapse<omp_target_parallel_collapse_exec,
                                              ArgList<Arg0, Arg1, Arg2, Arg3>,
-                                             EnclosedStmts...>, Types>
+                                             EnclosedStmts...>,
+                         Types>
 {
-  template <typename Data>
+  template<typename Data>
   static RAJA_INLINE void exec(Data&& data)
   {
     auto l0 = segment_length<Arg0>(data);
@@ -108,26 +123,31 @@ struct StatementExecutor<statement::Collapse<omp_target_parallel_collapse_exec,
 
     using RAJA::internal::thread_privatize;
     auto privatizer = thread_privatize(data);
-#pragma omp target teams distribute parallel for schedule(static, 1) \
+#pragma omp target teams distribute parallel for schedule(static, 1)           \
     firstprivate(privatizer) collapse(4)
-      for (auto i0 = (decltype(l0))0; i0 < l0; ++i0) {
-        for (auto i1 = (decltype(l1))0; i1 < l1; ++i1) {
-          for (auto i2 = (decltype(l2))0; i2 < l2; ++i2) {
-            for (auto i3 = (decltype(l3))0; i3 < l3; ++i3) {
-              auto& private_data = privatizer.get_priv();
-              private_data.template assign_offset<Arg0>(i0);
-              private_data.template assign_offset<Arg1>(i1);
-              private_data.template assign_offset<Arg2>(i2);
-              private_data.template assign_offset<Arg3>(i2);
-              execute_statement_list<camp::list<EnclosedStmts...>, NewTypes3>(private_data);
-            }
+    for (auto i0 = (decltype(l0))0; i0 < l0; ++i0)
+    {
+      for (auto i1 = (decltype(l1))0; i1 < l1; ++i1)
+      {
+        for (auto i2 = (decltype(l2))0; i2 < l2; ++i2)
+        {
+          for (auto i3 = (decltype(l3))0; i3 < l3; ++i3)
+          {
+            auto& private_data = privatizer.get_priv();
+            private_data.template assign_offset<Arg0>(i0);
+            private_data.template assign_offset<Arg1>(i1);
+            private_data.template assign_offset<Arg2>(i2);
+            private_data.template assign_offset<Arg3>(i3);  // innermost index
+            execute_statement_list<camp::list<EnclosedStmts...>, NewTypes3>(
+                private_data);
           }
         }
       }
     }
+  }
 };
 
-}
-}
+}  // namespace internal
+}  // namespace RAJA
 
-#endif // RAJA_policy_openmp_target_kernel_Collapse_HPP
+#endif  // RAJA_policy_openmp_target_kernel_Collapse_HPP
diff --git a/include/RAJA/policy/openmp_target/kernel/For.hpp b/include/RAJA/policy/openmp_target/kernel/For.hpp
index 173230b9e2..882240969a 100644
--- a/include/RAJA/policy/openmp_target/kernel/For.hpp
+++ b/include/RAJA/policy/openmp_target/kernel/For.hpp
@@ -10,27 +10,34 @@
 
 #include "RAJA/pattern/kernel/internal.hpp"
 
-namespace RAJA {
-namespace internal {
+namespace RAJA
+{
+namespace internal
+{
 
-template <camp::idx_t ArgumentId, typename Data, typename Types, typename... EnclosedStmts>
-struct OpenMPTargetForWrapper : public GenericWrapperBase 
+template<camp::idx_t ArgumentId,
+         typename Data,
+         typename Types,
+         typename... EnclosedStmts>
+struct OpenMPTargetForWrapper : public GenericWrapperBase
 {
   using data_t = camp::decay<Data>;
 
   data_t data;
 
-  /*! 
+  /*!
    * \brief Deferences data so that it can be mapped to the device
    */
   RAJA_INLINE
-  constexpr explicit OpenMPTargetForWrapper(data_t &d) : 
-    data{d}  {}
+  constexpr explicit OpenMPTargetForWrapper(data_t& d) : data {d} {}
 
   RAJA_INLINE
-  void exec() { execute_statement_list<camp::list<EnclosedStmts...>, Types>(data); }
+  void exec()
+  {
+    execute_statement_list<camp::list<EnclosedStmts...>, Types>(data);
+  }
 
-  template <typename InIndexType>
+  template<typename InIndexType>
   RAJA_INLINE void operator()(InIndexType i)
   {
     data.template assign_offset<ArgumentId>(i);
@@ -38,32 +45,37 @@ struct OpenMPTargetForWrapper : public GenericWrapperBase
   }
 };
 
-template <camp::idx_t ArgumentId,
-          int N,
-          typename... EnclosedStmts,
-          typename Types>
-struct StatementExecutor<statement::For<ArgumentId, omp_target_parallel_for_exec<N>, EnclosedStmts...>, Types>
+template<camp::idx_t ArgumentId,
+         int N,
+         typename... EnclosedStmts,
+         typename Types>
+struct StatementExecutor<statement::For<ArgumentId,
+                                        omp_target_parallel_for_exec<N>,
+                                        EnclosedStmts...>,
+                         Types>
 {
 
-  template <typename Data>
-  static RAJA_INLINE void exec(Data &&data)
+  template<typename Data>
+  static RAJA_INLINE void exec(Data&& data)
   {
     // Set the argument type for this loop
     using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
-    OpenMPTargetForWrapper<ArgumentId, Data, NewTypes, EnclosedStmts...> for_wrapper(data);
+    OpenMPTargetForWrapper<ArgumentId, Data, NewTypes, EnclosedStmts...>
+        for_wrapper(data);
 
-    auto len = segment_length<ArgumentId>(data);
+    auto len    = segment_length<ArgumentId>(data);
     using len_t = decltype(len);
 
     auto r = resources::Omp::get_default();
-    forall_impl(r, omp_target_parallel_for_exec<N>{}, TypedRangeSegment<len_t>(0, len), for_wrapper, RAJA::expt::get_empty_forall_param_pack());
+    forall_impl(r, omp_target_parallel_for_exec<N> {},
+                TypedRangeSegment<len_t>(0, len), for_wrapper,
+                RAJA::expt::get_empty_forall_param_pack());
   }
 };
 
 
+}  // namespace internal
+}  // namespace RAJA
 
-}
-}
-
-#endif // RAJA_policy_openmp_kernel_For_HPP
+#endif  // RAJA_policy_openmp_kernel_For_HPP
diff --git a/include/RAJA/policy/openmp_target/params/kernel_name.hpp b/include/RAJA/policy/openmp_target/params/kernel_name.hpp
index 5e9edb4b6c..e5dcdb0dcf 100644
--- a/include/RAJA/policy/openmp_target/params/kernel_name.hpp
+++ b/include/RAJA/policy/openmp_target/params/kernel_name.hpp
@@ -3,38 +3,42 @@
 
 #include "RAJA/pattern/params/kernel_name.hpp"
 
-namespace RAJA {
-namespace expt {
-namespace detail {
+namespace RAJA
+{
+namespace expt
+{
+namespace detail
+{
 
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
 
-  // Init
-  template<typename EXEC_POL>
-  camp::concepts::enable_if< type_traits::is_target_openmp_policy<EXEC_POL> >
-  init(KernelName&)
-  {
-    //TODO: Define kernel naming
-  }
-
-  // Combine
-  template<typename EXEC_POL, typename T>
-  camp::concepts::enable_if< type_traits::is_target_openmp_policy<EXEC_POL> >
-  combine(KernelName&, T& /*place holder argument*/) {}
-
-  // Resolve
-  template<typename EXEC_POL>
-  camp::concepts::enable_if< type_traits::is_target_openmp_policy<EXEC_POL> >
-  resolve(KernelName&)
-  {
-    //TODO: Define kernel naming
-  }
+// Init: currently a no-op for target OpenMP policies (see TODO in body).
+template<typename EXEC_POL>
+camp::concepts::enable_if<type_traits::is_target_openmp_policy<EXEC_POL>> init(
+    KernelName&)
+{
+  // TODO: Define kernel naming
+}
+
+// Combine: empty body for target OpenMP policies; both arguments are unused.
+template<typename EXEC_POL, typename T>
+camp::concepts::enable_if<type_traits::is_target_openmp_policy<EXEC_POL>>
+combine(KernelName&, T& /*place holder argument*/)
+{}
+
+// Resolve: currently a no-op for target OpenMP policies (see TODO in body).
+template<typename EXEC_POL>
+camp::concepts::enable_if<type_traits::is_target_openmp_policy<EXEC_POL>>
+resolve(KernelName&)
+{
+  // TODO: Define kernel naming
+}
 
 #endif
 
-} //  namespace detail
-} //  namespace expt
-} //  namespace RAJA
+}  //  namespace detail
+}  //  namespace expt
+}  //  namespace RAJA
 
 
-#endif //  NEW_REDUCE_SEQ_REDUCE_HPP
+#endif  //  NEW_REDUCE_SEQ_REDUCE_HPP
diff --git a/include/RAJA/policy/openmp_target/params/reduce.hpp b/include/RAJA/policy/openmp_target/params/reduce.hpp
index 34c23fb5db..41c046b846 100644
--- a/include/RAJA/policy/openmp_target/params/reduce.hpp
+++ b/include/RAJA/policy/openmp_target/params/reduce.hpp
@@ -3,37 +3,43 @@
 
 #include "RAJA/pattern/params/reducer.hpp"
 
-namespace RAJA {
-namespace expt {
-namespace detail {
+namespace RAJA
+{
+namespace expt
+{
+namespace detail
+{
 
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
 
-  // Init
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_target_openmp_policy<EXEC_POL> >
-  init(Reducer<OP, T, VOp>& red) {
-    red.m_valop.val = OP::identity();
-  }
-
-  // Combine
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_target_openmp_policy<EXEC_POL> >
-  combine(Reducer<OP, T, VOp>& out, const Reducer<OP, T, VOp>& in) {
-    out.m_valop.val = OP{}(out.m_valop.val, in.m_valop.val);
-  }
-
-  // Resolve
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_target_openmp_policy<EXEC_POL> >
-  resolve(Reducer<OP, T, VOp>& red) {
-    red.combineTarget(red.m_valop.val);
-  }
+// Init: set the reducer's working value (m_valop.val) to OP::identity().
+template<typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_target_openmp_policy<EXEC_POL>> init(
+    Reducer<OP, T, VOp>& red)
+{
+  red.m_valop.val = OP::identity();
+}
+
+// Combine: fold in's value into out's value by applying OP to the pair.
+template<typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_target_openmp_policy<EXEC_POL>>
+combine(Reducer<OP, T, VOp>& out, const Reducer<OP, T, VOp>& in)
+{
+  out.m_valop.val = OP {}(out.m_valop.val, in.m_valop.val);
+}
+
+// Resolve: hand the reducer's value (m_valop.val) to its combineTarget().
+template<typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_target_openmp_policy<EXEC_POL>>
+resolve(Reducer<OP, T, VOp>& red)
+{
+  red.combineTarget(red.m_valop.val);
+}
 
 #endif
 
-} //  namespace detail
-} //  namespace expt
-} //  namespace RAJA
+}  //  namespace detail
+}  //  namespace expt
+}  //  namespace RAJA
 
-#endif //  NEW_REDUCE_OMP_REDUCE_HPP
+#endif  //  NEW_REDUCE_OMP_REDUCE_HPP
diff --git a/include/RAJA/policy/openmp_target/policy.hpp b/include/RAJA/policy/openmp_target/policy.hpp
index 520f5afc55..e759552cf9 100644
--- a/include/RAJA/policy/openmp_target/policy.hpp
+++ b/include/RAJA/policy/openmp_target/policy.hpp
@@ -10,58 +10,62 @@
 
 #include "RAJA/policy/PolicyBase.hpp"
 
-namespace RAJA {
+namespace RAJA
+{
 
-namespace policy {
-namespace omp {
+namespace policy
+{
+namespace omp
+{
 
 // Max number of CUDA reduction threads per block possible.
 // Required for allocating omp target data before execution policy.
 // Used in target_parallel_for, aliased in target_reduce.
 static constexpr int MAXNUMTHREADS = 1024;
 
-template <unsigned int TeamSize>
-struct Teams : std::integral_constant<unsigned int, TeamSize> {
-};
+template<unsigned int TeamSize>
+struct Teams : std::integral_constant<unsigned int, TeamSize>
+{};
 
-struct Target {
-};
+struct Target
+{};
 
-struct Distribute {
-};
+struct Distribute
+{};
 
-struct Collapse {
-};
+struct Collapse
+{};
 
-template <size_t ThreadsPerTeam>
+template<size_t ThreadsPerTeam>
 struct omp_target_parallel_for_exec
     : make_policy_pattern_platform_t<Policy::target_openmp,
-                            Pattern::forall,
-                            Platform::omp_target,
-                            omp::Target,
-                            omp::Teams<ThreadsPerTeam>,
-                            omp::Distribute> {
-};
+                                     Pattern::forall,
+                                     Platform::omp_target,
+                                     omp::Target,
+                                     omp::Teams<ThreadsPerTeam>,
+                                     omp::Distribute>
+{};
 
 struct omp_target_parallel_for_exec_nt
     : make_policy_pattern_platform_t<Policy::target_openmp,
-                            Pattern::forall,
-                            Platform::omp_target,
-                            omp::Target,
-                            omp::Distribute> {
-};
+                                     Pattern::forall,
+                                     Platform::omp_target,
+                                     omp::Target,
+                                     omp::Distribute>
+{};
 
 struct omp_target_parallel_collapse_exec
     : make_policy_pattern_platform_t<Policy::target_openmp,
-                            Pattern::forall,
-                            Platform::omp_target,
-                            omp::Target,
-                            omp::Collapse> {
-};
+                                     Pattern::forall,
+                                     Platform::omp_target,
+                                     omp::Target,
+                                     omp::Collapse>
+{};
 
-struct omp_target_reduce
-    : make_policy_pattern_platform_t<Policy::target_openmp, Pattern::reduce, Platform::omp_target> {
-};
+struct omp_target_reduce : make_policy_pattern_platform_t<Policy::target_openmp,
+                                                          Pattern::reduce,
+                                                          Platform::omp_target>
+{};
 
 ///
 /// WorkGroup execution policies
@@ -70,21 +74,21 @@ struct omp_target_work
     : make_policy_pattern_launch_platform_t<Policy::target_openmp,
                                             Pattern::workgroup_exec,
                                             Launch::sync,
-                                            Platform::omp_target> {
-};
+                                            Platform::omp_target>
+{};
 
 
-}  // closing brace for omp namespace
-}  // closing brace for policy namespace
+}  // namespace omp
+}  // namespace policy
 
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
+using policy::omp::omp_target_parallel_collapse_exec;
 using policy::omp::omp_target_parallel_for_exec;
 using policy::omp::omp_target_parallel_for_exec_nt;
 using policy::omp::omp_target_reduce;
-using policy::omp::omp_target_parallel_collapse_exec;
 using policy::omp::omp_target_work;
 #endif
 
-} // closing brace for RAJA namespace
+}  // namespace RAJA
 
-#endif // RAJA_policy_openmp_target_HPP
+#endif  // RAJA_policy_openmp_target_HPP
diff --git a/include/RAJA/policy/openmp_target/reduce.hpp b/include/RAJA/policy/openmp_target/reduce.hpp
index 6691729bbe..8dc4104473 100644
--- a/include/RAJA/policy/openmp_target/reduce.hpp
+++ b/include/RAJA/policy/openmp_target/reduce.hpp
@@ -12,7 +12,7 @@
 
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
 
-//#include <cassert>  // Leaving out until XL is fixed 2/25/2019.
+// #include <cassert>  // Leaving out until XL is fixed 2/25/2019.
 
 #include <algorithm>
 
@@ -24,7 +24,6 @@
 
 #include "RAJA/policy/openmp/policy.hpp"
 
-
 namespace RAJA
 {
 
@@ -32,37 +31,42 @@ namespace omp
 {
 #pragma omp declare target
 
-template <typename T, typename I>
-struct minloc 
+template<typename T, typename I>
+struct minloc
 {
   static constexpr T identity = T(::RAJA::operators::limits<T>::max());
-  RAJA_HOST_DEVICE RAJA_INLINE void operator()(T &val,
-                                               I &loc,
+
+  RAJA_HOST_DEVICE RAJA_INLINE void operator()(T& val,
+                                               I& loc,
                                                const T v,
                                                const I l)
   {
-    if (v < val) {
+    if (v < val)
+    {
       loc = l;
       val = v;
     }
   }
 };
 
-template <typename T, typename I>
-struct maxloc 
+template<typename T, typename I>
+struct maxloc
 {
   static constexpr T identity = T(::RAJA::operators::limits<T>::min());
-  RAJA_HOST_DEVICE RAJA_INLINE void operator()(T &val,
-                                               I &loc,
+
+  RAJA_HOST_DEVICE RAJA_INLINE void operator()(T& val,
+                                               I& loc,
                                                const T v,
                                                const I l)
   {
-    if (v > val) {
+    if (v > val)
+    {
       loc = l;
       val = v;
     }
   }
 };
+
 #pragma omp end declare target
 
 // Alias for clarity. Reduction size operates on number of omp teams.
@@ -70,28 +74,29 @@ struct maxloc
 static constexpr int MaxNumTeams = policy::omp::MAXNUMTHREADS;
 
 //! Information necessary for OpenMP offload to be considered
-struct Offload_Info 
+struct Offload_Info
 {
-  int hostID{omp_get_initial_device()};
-  int deviceID{omp_get_default_device()};
-  bool isMapped{false};
+  int hostID {omp_get_initial_device()};
+  int deviceID {omp_get_default_device()};
+  bool isMapped {false};
 
   Offload_Info() = default;
 
-  Offload_Info(const Offload_Info &other)
-      : hostID{other.hostID}, deviceID{other.deviceID}, isMapped{other.isMapped}
-  {
-  }
+  Offload_Info(const Offload_Info& other)
+      : hostID {other.hostID},
+        deviceID {other.deviceID},
+        isMapped {other.isMapped}
+  {}
 };
 
 //! Reduction data for OpenMP Offload -- stores value, host pointer, and device
 //! pointer
-template <typename T>
+template<typename T>
 struct Reduce_Data
 {
   mutable T value;
-  T *device;
-  T *host;
+  T* device;
+  T* host;
 
   //! disallow default constructor
   Reduce_Data() = delete;
@@ -100,17 +105,19 @@ struct Reduce_Data
    *
    *  allocates data on the host and device and initializes values to default
    */
-  Reduce_Data(T initValue, T identityValue, Offload_Info &info)
-     : value(initValue),
-        device{reinterpret_cast<T *>(
+  Reduce_Data(T initValue, T identityValue, Offload_Info& info)
+      : value(initValue),
+        device {reinterpret_cast<T*>(
             omp_target_alloc(omp::MaxNumTeams * sizeof(T), info.deviceID))},
-        host{new T[omp::MaxNumTeams]}
+        host {new T[omp::MaxNumTeams]}
   {
-    if (!host) {
+    if (!host)
+    {
       printf("Unable to allocate space on host\n");
       exit(1);
     }
-    if (!device) {
+    if (!device)
+    {
       printf("Unable to allocate space on device\n");
       exit(1);
     }
@@ -118,55 +125,49 @@ struct Reduce_Data
     hostToDevice(info);
   }
 
-  void reset(T initValue)
-  {
-    value = initValue;
-  }
-
+  void reset(T initValue) { value = initValue; }
 
   //! default copy constructor for POD
-  Reduce_Data(const Reduce_Data &) = default;
+  Reduce_Data(const Reduce_Data&) = default;
 
   //! transfers from the host to the device -- exit() is called upon failure
-  RAJA_INLINE void hostToDevice(Offload_Info &info)
+  RAJA_INLINE void hostToDevice(Offload_Info& info)
   {
     // precondition: host and device are valid pointers
-    if (omp_target_memcpy(reinterpret_cast<void *>(device),
-                          reinterpret_cast<void *>(host),
-                          omp::MaxNumTeams * sizeof(T),
-                          0,
-                          0,
-                          info.deviceID,
-                          info.hostID) != 0) {
+    if (omp_target_memcpy(reinterpret_cast<void*>(device),
+                          reinterpret_cast<void*>(host),
+                          omp::MaxNumTeams * sizeof(T), 0, 0, info.deviceID,
+                          info.hostID) != 0)
+    {
       printf("Unable to copy memory from host to device\n");
       exit(1);
     }
   }
 
   //! transfers from the device to the host -- exit() is called upon failure
-  RAJA_INLINE void deviceToHost(Offload_Info &info)
+  RAJA_INLINE void deviceToHost(Offload_Info& info)
   {
     // precondition: host and device are valid pointers
-    if (omp_target_memcpy(reinterpret_cast<void *>(host),
-                          reinterpret_cast<void *>(device),
-                          omp::MaxNumTeams * sizeof(T),
-                          0,
-                          0,
-                          info.hostID,
-                          info.deviceID) != 0) {
+    if (omp_target_memcpy(reinterpret_cast<void*>(host),
+                          reinterpret_cast<void*>(device),
+                          omp::MaxNumTeams * sizeof(T), 0, 0, info.hostID,
+                          info.deviceID) != 0)
+    {
       printf("Unable to copy memory from device to host\n");
       exit(1);
     }
   }
 
   //! frees all data from the offload information passed
-  RAJA_INLINE void cleanup(Offload_Info &info)
+  RAJA_INLINE void cleanup(Offload_Info& info)
   {
-    if (device) {
-      omp_target_free(reinterpret_cast<void *>(device), info.deviceID);
+    if (device)
+    {
+      omp_target_free(reinterpret_cast<void*>(device), info.deviceID);
       device = nullptr;
     }
-    if (host) {
+    if (host)
+    {
       delete[] host;
       host = nullptr;
     }
@@ -177,78 +178,82 @@ struct Reduce_Data
 
 //! OpenMP Target Reduction entity -- generalize on # of teams, reduction, and
 //! type
-template <typename Reducer, typename T>
-struct TargetReduce 
+template<typename Reducer, typename T>
+struct TargetReduce
 {
-  TargetReduce() = delete;
-  TargetReduce(const TargetReduce &) = default;
+  TargetReduce()                    = delete;
+  TargetReduce(const TargetReduce&) = default;
 
   explicit TargetReduce(T init_val_, T identity_ = Reducer::identity())
       : info(),
         val(identity_, identity_, info),
         initVal(init_val_),
         finalVal(identity_)
-  {
-  }
+  {}
 
   void reset(T init_val_, T identity_ = Reducer::identity())
   {
     operator T();
     val.reset(identity_);
-    initVal = init_val_;
+    initVal  = init_val_;
     finalVal = identity_;
   }
 
-#ifdef __ibmxl__ // TODO: implicit declare target doesn't pick this up
+#ifdef __ibmxl__  // TODO: implicit declare target doesn't pick this up
 #pragma omp declare target
 #endif
   //! apply reduction on device upon destruction
   ~TargetReduce()
   {
-    //assert ( omp_get_num_teams() <= omp::MaxNumTeams );  // Leaving out until XL is fixed 2/25/2019.
-    if (!omp_is_initial_device()) {
+    // Leaving out until XL is fixed 2/25/2019:
+    // assert ( omp_get_num_teams() <= omp::MaxNumTeams );
+    if (!omp_is_initial_device())
+    {
 #pragma omp critical
       {
         int tid = omp_get_team_num();
-        Reducer{}(val.device[tid], val.value);
+        Reducer {}(val.device[tid], val.value);
       }
     }
   }
-#ifdef __ibmxl__ // TODO: implicit declare target doesn't pick this up
+#ifdef __ibmxl__  // TODO: implicit declare target doesn't pick this up
 #pragma omp end declare target
 #endif
 
   //! map result value back to host if not done already; return aggregate value
   operator T()
   {
-    if (!info.isMapped) {
+    if (!info.isMapped)
+    {
       val.deviceToHost(info);
 
-      for (int i = 0; i < omp::MaxNumTeams; ++i) {
-        Reducer{}(val.value, val.host[i]);
+      for (int i = 0; i < omp::MaxNumTeams; ++i)
+      {
+        Reducer {}(val.value, val.host[i]);
       }
       val.cleanup(info);
       info.isMapped = true;
     }
     finalVal = Reducer::identity();
-    Reducer{}(finalVal, initVal);
-    Reducer{}(finalVal, val.value);
+    Reducer {}(finalVal, initVal);
+    Reducer {}(finalVal, val.value);
     return finalVal;
   }
+
   //! alias for operator T()
   T get() { return operator T(); }
 
   //! apply reduction
-  TargetReduce &reduce(T rhsVal)
+  TargetReduce& reduce(T rhsVal)
   {
-    Reducer{}(val.value, rhsVal);
+    Reducer {}(val.value, rhsVal);
     return *this;
   }
 
   //! apply reduction (const version) -- still reduces internal values
-  const TargetReduce &reduce(T rhsVal) const
+  const TargetReduce& reduce(T rhsVal) const
   {
-    Reducer{}(val.value, rhsVal);
+    Reducer {}(val.value, rhsVal);
     return *this;
   }
 
@@ -263,14 +268,18 @@ struct TargetReduce
 
 //! OpenMP Target Reduction Location entity -- generalize on # of teams,
 //! reduction, and type
-template <typename Reducer, typename T, typename IndexType>
-struct TargetReduceLoc 
+template<typename Reducer, typename T, typename IndexType>
+struct TargetReduceLoc
 {
-  TargetReduceLoc() = delete;
-  TargetReduceLoc(const TargetReduceLoc &) = default;
-  explicit TargetReduceLoc(T init_val_, IndexType init_loc,
-                           T identity_val_ = Reducer::identity,
-                           IndexType identity_loc_ = RAJA::reduce::detail::DefaultLoc<IndexType>().value())
+  TargetReduceLoc()                       = delete;
+  TargetReduceLoc(const TargetReduceLoc&) = default;
+
+  explicit TargetReduceLoc(
+      T init_val_,
+      IndexType init_loc,
+      T identity_val_ = Reducer::identity,
+      IndexType identity_loc_ =
+          RAJA::reduce::detail::DefaultLoc<IndexType>().value())
       : info(),
         val(identity_val_, identity_val_, info),
         loc(identity_loc_, identity_loc_, info),
@@ -278,31 +287,34 @@ struct TargetReduceLoc
         finalVal(identity_val_),
         initLoc(init_loc),
         finalLoc(identity_loc_)
-  {
-  }
+  {}
 
-  void reset(T init_val_, IndexType init_loc_,
+  void reset(T init_val_,
+             IndexType init_loc_,
              T identity_val_ = Reducer::identity,
-             IndexType identity_loc_ = RAJA::reduce::detail::DefaultLoc<IndexType>().value())
+             IndexType identity_loc_ =
+                 RAJA::reduce::detail::DefaultLoc<IndexType>().value())
   {
     operator T();
     val.reset(identity_val_);
     loc.reset(identity_loc_);
-    initVal = init_val_;
+    initVal  = init_val_;
     finalVal = identity_val_;
-    initLoc = init_loc_;
+    initLoc  = init_loc_;
     finalLoc = identity_loc_;
   }
 
   //! apply reduction on device upon destruction
   ~TargetReduceLoc()
   {
-    //assert ( omp_get_num_teams() <= omp::MaxNumTeams );  // Leaving out until XL is fixed 2/25/2019.
-    if (!omp_is_initial_device()) {
+    // Leaving out until XL is fixed 2/25/2019:
+    // assert ( omp_get_num_teams() <= omp::MaxNumTeams );
+    if (!omp_is_initial_device())
+    {
 #pragma omp critical
       {
         int tid = omp_get_team_num();
-        Reducer{}(val.device[tid], loc.device[tid], val.value, loc.value);
+        Reducer {}(val.device[tid], loc.device[tid], val.value, loc.value);
       }
     }
   }
@@ -310,11 +322,13 @@ struct TargetReduceLoc
   //! map result value back to host if not done already; return aggregate value
   operator T()
   {
-    if (!info.isMapped) {
+    if (!info.isMapped)
+    {
       val.deviceToHost(info);
       loc.deviceToHost(info);
-      for (int i = 0; i < omp::MaxNumTeams; ++i) {
-        Reducer{}(val.value, loc.value, val.host[i], loc.host[i]);
+      for (int i = 0; i < omp::MaxNumTeams; ++i)
+      {
+        Reducer {}(val.value, loc.value, val.host[i], loc.host[i]);
       }
       val.cleanup(info);
       loc.cleanup(info);
@@ -322,10 +336,11 @@ struct TargetReduceLoc
     }
     finalVal = Reducer::identity;
     finalLoc = IndexType(RAJA::reduce::detail::DefaultLoc<IndexType>().value());
-    Reducer{}(finalVal, finalLoc, initVal, initLoc);
-    Reducer{}(finalVal, finalLoc, val.value, loc.value);
+    Reducer {}(finalVal, finalLoc, initVal, initLoc);
+    Reducer {}(finalVal, finalLoc, val.value, loc.value);
     return finalVal;
   }
+
   //! alias for operator T()
   T get() { return operator T(); }
 
@@ -339,16 +354,16 @@ struct TargetReduceLoc
   }
 
   //! apply reduction
-  TargetReduceLoc &reduce(T rhsVal, IndexType rhsLoc)
+  TargetReduceLoc& reduce(T rhsVal, IndexType rhsLoc)
   {
-    Reducer{}(val.value, loc.value, rhsVal, rhsLoc);
+    Reducer {}(val.value, loc.value, rhsVal, rhsLoc);
     return *this;
   }
 
   //! apply reduction (const version) -- still reduces internal values
-  const TargetReduceLoc &reduce(T rhsVal, IndexType rhsLoc) const
+  const TargetReduceLoc& reduce(T rhsVal, IndexType rhsLoc) const
   {
-    Reducer{}(val.value, loc.value, rhsVal, rhsLoc);
+    Reducer {}(val.value, loc.value, rhsVal, rhsLoc);
     return *this;
   }
 
@@ -365,27 +380,25 @@ struct TargetReduceLoc
   IndexType finalLoc;
 };
 
-
 //! specialization of ReduceSum for omp_target_reduce
-template <typename T>
+template<typename T>
 class ReduceSum<omp_target_reduce, T>
     : public TargetReduce<RAJA::reduce::sum<T>, T>
 {
 public:
-
-  using self = ReduceSum<omp_target_reduce, T>;
+  using self   = ReduceSum<omp_target_reduce, T>;
   using parent = TargetReduce<RAJA::reduce::sum<T>, T>;
   using parent::parent;
 
   //! enable operator+= for ReduceSum -- alias for reduce()
-  self &operator+=(T rhsVal)
+  self& operator+=(T rhsVal)
   {
     parent::reduce(rhsVal);
     return *this;
   }
 
   //! enable operator+= for ReduceSum -- alias for reduce()
-  const self &operator+=(T rhsVal) const
+  const self& operator+=(T rhsVal) const
   {
     parent::reduce(rhsVal);
     return *this;
@@ -393,25 +406,24 @@ class ReduceSum<omp_target_reduce, T>
 };
 
 //! specialization of ReduceBitOr for omp_target_reduce
-template <typename T>
+template<typename T>
 class ReduceBitOr<omp_target_reduce, T>
     : public TargetReduce<RAJA::reduce::or_bit<T>, T>
 {
 public:
-
-  using self = ReduceBitOr<omp_target_reduce, T>;
+  using self   = ReduceBitOr<omp_target_reduce, T>;
   using parent = TargetReduce<RAJA::reduce::or_bit<T>, T>;
   using parent::parent;
 
   //! enable operator|= for ReduceBitOr -- alias for reduce()
-  self &operator|=(T rhsVal)
+  self& operator|=(T rhsVal)
   {
     parent::reduce(rhsVal);
     return *this;
   }
 
   //! enable operator|= for ReduceBitOr -- alias for reduce()
-  const self &operator|=(T rhsVal) const
+  const self& operator|=(T rhsVal) const
   {
     parent::reduce(rhsVal);
     return *this;
@@ -419,25 +431,24 @@ class ReduceBitOr<omp_target_reduce, T>
 };
 
 //! specialization of ReduceBitAnd for omp_target_reduce
-template <typename T>
+template<typename T>
 class ReduceBitAnd<omp_target_reduce, T>
     : public TargetReduce<RAJA::reduce::and_bit<T>, T>
 {
 public:
-
-  using self = ReduceBitAnd<omp_target_reduce, T>;
+  using self   = ReduceBitAnd<omp_target_reduce, T>;
   using parent = TargetReduce<RAJA::reduce::and_bit<T>, T>;
   using parent::parent;
 
   //! enable operator&= for ReduceBitAnd -- alias for reduce()
-  self &operator&=(T rhsVal)
+  self& operator&=(T rhsVal)
   {
     parent::reduce(rhsVal);
     return *this;
   }
 
   //! enable operator&= for ReduceBitAnd -- alias for reduce()
-  const self &operator&=(T rhsVal) const
+  const self& operator&=(T rhsVal) const
   {
     parent::reduce(rhsVal);
     return *this;
@@ -445,52 +456,49 @@ class ReduceBitAnd<omp_target_reduce, T>
 };
 
 //! specialization of ReduceMin for omp_target_reduce
-template <typename T>
+template<typename T>
 class ReduceMin<omp_target_reduce, T>
     : public TargetReduce<RAJA::reduce::min<T>, T>
 {
 public:
-
-  using self = ReduceMin<omp_target_reduce, T>;
+  using self   = ReduceMin<omp_target_reduce, T>;
   using parent = TargetReduce<RAJA::reduce::min<T>, T>;
   using parent::parent;
 
   //! enable min() for ReduceMin -- alias for reduce()
-  self &min(T rhsVal)
+  self& min(T rhsVal)
   {
     parent::reduce(rhsVal);
     return *this;
   }
 
   //! enable min() for ReduceMin -- alias for reduce()
-  const self &min(T rhsVal) const
+  const self& min(T rhsVal) const
   {
     parent::reduce(rhsVal);
     return *this;
   }
 };
 
-
 //! specialization of ReduceMax for omp_target_reduce
-template <typename T>
+template<typename T>
 class ReduceMax<omp_target_reduce, T>
     : public TargetReduce<RAJA::reduce::max<T>, T>
 {
 public:
-
-  using self = ReduceMax<omp_target_reduce, T>;
+  using self   = ReduceMax<omp_target_reduce, T>;
   using parent = TargetReduce<RAJA::reduce::max<T>, T>;
   using parent::parent;
 
   //! enable max() for ReduceMax -- alias for reduce()
-  self &max(T rhsVal)
+  self& max(T rhsVal)
   {
     parent::reduce(rhsVal);
     return *this;
   }
 
   //! enable max() for ReduceMax -- alias for reduce()
-  const self &max(T rhsVal) const
+  const self& max(T rhsVal) const
   {
     parent::reduce(rhsVal);
     return *this;
@@ -498,54 +506,49 @@ class ReduceMax<omp_target_reduce, T>
 };
 
 //! specialization of ReduceMinLoc for omp_target_reduce
-template <typename T, typename IndexType>
+template<typename T, typename IndexType>
 class ReduceMinLoc<omp_target_reduce, T, IndexType>
     : public TargetReduceLoc<omp::minloc<T, IndexType>, T, IndexType>
 {
 public:
-
-  using self = ReduceMinLoc<omp_target_reduce, T, IndexType>;
-  using parent =
-      TargetReduceLoc<omp::minloc<T, IndexType>, T, IndexType>;
+  using self   = ReduceMinLoc<omp_target_reduce, T, IndexType>;
+  using parent = TargetReduceLoc<omp::minloc<T, IndexType>, T, IndexType>;
   using parent::parent;
 
   //! enable minloc() for ReduceMinLoc -- alias for reduce()
-  self &minloc(T rhsVal, IndexType rhsLoc)
+  self& minloc(T rhsVal, IndexType rhsLoc)
   {
     parent::reduce(rhsVal, rhsLoc);
     return *this;
   }
 
   //! enable minloc() for ReduceMinLoc -- alias for reduce()
-  const self &minloc(T rhsVal, IndexType rhsLoc) const
+  const self& minloc(T rhsVal, IndexType rhsLoc) const
   {
     parent::reduce(rhsVal, rhsLoc);
     return *this;
   }
 };
 
-
 //! specialization of ReduceMaxLoc for omp_target_reduce
-template <typename T, typename IndexType>
+template<typename T, typename IndexType>
 class ReduceMaxLoc<omp_target_reduce, T, IndexType>
     : public TargetReduceLoc<omp::maxloc<T, IndexType>, T, IndexType>
 {
 public:
-
-  using self = ReduceMaxLoc<omp_target_reduce, T, IndexType>;
-  using parent =
-      TargetReduceLoc<omp::maxloc<T, IndexType>, T, IndexType>;
+  using self   = ReduceMaxLoc<omp_target_reduce, T, IndexType>;
+  using parent = TargetReduceLoc<omp::maxloc<T, IndexType>, T, IndexType>;
   using parent::parent;
 
   //! enable maxloc() for ReduceMaxLoc -- alias for reduce()
-  self &maxloc(T rhsVal, IndexType rhsLoc)
+  self& maxloc(T rhsVal, IndexType rhsLoc)
   {
     parent::reduce(rhsVal, rhsLoc);
     return *this;
   }
 
   //! enable maxloc() for ReduceMaxLoc -- alias for reduce()
-  const self &maxloc(T rhsVal, IndexType rhsLoc) const
+  const self& maxloc(T rhsVal, IndexType rhsLoc) const
   {
     parent::reduce(rhsVal, rhsLoc);
     return *this;
diff --git a/include/RAJA/policy/sequential.hpp b/include/RAJA/policy/sequential.hpp
index 0963b31a01..90c6cb85ed 100644
--- a/include/RAJA/policy/sequential.hpp
+++ b/include/RAJA/policy/sequential.hpp
@@ -21,7 +21,7 @@
 #define RAJA_sequential_HPP
 
 #if !defined(RAJA_ENABLE_DESUL_ATOMICS)
-    #include "RAJA/policy/sequential/atomic.hpp"
+#include "RAJA/policy/sequential/atomic.hpp"
 #endif
 
 #include "RAJA/policy/sequential/forall.hpp"
diff --git a/include/RAJA/policy/sequential/WorkGroup/Dispatcher.hpp b/include/RAJA/policy/sequential/WorkGroup/Dispatcher.hpp
index 13796fd8a3..e8a97fafa8 100644
--- a/include/RAJA/policy/sequential/WorkGroup/Dispatcher.hpp
+++ b/include/RAJA/policy/sequential/WorkGroup/Dispatcher.hpp
@@ -24,7 +24,6 @@
 
 #include "RAJA/pattern/WorkGroup/Dispatcher.hpp"
 
-
 namespace RAJA
 {
 
@@ -32,12 +31,12 @@ namespace detail
 {
 
 /*!
-* Populate and return a Dispatcher object
-*/
-template < typename T, typename Dispatcher_T >
+ * Populate a static Dispatcher object and return a pointer to it
+ */
+template<typename T, typename Dispatcher_T>
 inline const Dispatcher_T* get_Dispatcher(seq_work const&)
 {
-  static Dispatcher_T dispatcher{ Dispatcher_T::template makeDispatcher<T>() };
+  static Dispatcher_T dispatcher {Dispatcher_T::template makeDispatcher<T>()};
   return &dispatcher;
 }
 
diff --git a/include/RAJA/policy/sequential/WorkGroup/WorkRunner.hpp b/include/RAJA/policy/sequential/WorkGroup/WorkRunner.hpp
index 31e401bf88..1b1c6c8d78 100644
--- a/include/RAJA/policy/sequential/WorkGroup/WorkRunner.hpp
+++ b/include/RAJA/policy/sequential/WorkGroup/WorkRunner.hpp
@@ -24,7 +24,6 @@
 
 #include "RAJA/pattern/WorkGroup/WorkRunner.hpp"
 
-
 namespace RAJA
 {
 
@@ -35,51 +34,45 @@ namespace detail
  * Runs work in a storage container in order
  * and returns any per run resources
  */
-template <typename DISPATCH_POLICY_T,
-          typename ALLOCATOR_T,
-          typename INDEX_T,
-          typename ... Args>
-struct WorkRunner<
-        RAJA::seq_work,
-        RAJA::ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-    : WorkRunnerForallOrdered<
-        RAJA::seq_exec,
-        RAJA::seq_work,
-        RAJA::ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-{ };
+template<typename DISPATCH_POLICY_T,
+         typename ALLOCATOR_T,
+         typename INDEX_T,
+         typename... Args>
+struct WorkRunner<RAJA::seq_work,
+                  RAJA::ordered,
+                  DISPATCH_POLICY_T,
+                  ALLOCATOR_T,
+                  INDEX_T,
+                  Args...> : WorkRunnerForallOrdered<RAJA::seq_exec,
+                                                     RAJA::seq_work,
+                                                     RAJA::ordered,
+                                                     DISPATCH_POLICY_T,
+                                                     ALLOCATOR_T,
+                                                     INDEX_T,
+                                                     Args...>
+{};
 
 /*!
  * Runs work in a storage container in reverse order
  * and returns any per run resources
  */
-template <typename DISPATCH_POLICY_T,
-          typename ALLOCATOR_T,
-          typename INDEX_T,
-          typename ... Args>
-struct WorkRunner<
-        RAJA::seq_work,
-        RAJA::reverse_ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-    : WorkRunnerForallReverse<
-        RAJA::seq_exec,
-        RAJA::seq_work,
-        RAJA::reverse_ordered,
-        DISPATCH_POLICY_T,
-        ALLOCATOR_T,
-        INDEX_T,
-        Args...>
-{ };
+template<typename DISPATCH_POLICY_T,
+         typename ALLOCATOR_T,
+         typename INDEX_T,
+         typename... Args>
+struct WorkRunner<RAJA::seq_work,
+                  RAJA::reverse_ordered,
+                  DISPATCH_POLICY_T,
+                  ALLOCATOR_T,
+                  INDEX_T,
+                  Args...> : WorkRunnerForallReverse<RAJA::seq_exec,
+                                                     RAJA::seq_work,
+                                                     RAJA::reverse_ordered,
+                                                     DISPATCH_POLICY_T,
+                                                     ALLOCATOR_T,
+                                                     INDEX_T,
+                                                     Args...>
+{};
 
 }  // namespace detail
 
diff --git a/include/RAJA/policy/sequential/atomic.hpp b/include/RAJA/policy/sequential/atomic.hpp
index 046e52e1c1..2b2aeaef8e 100644
--- a/include/RAJA/policy/sequential/atomic.hpp
+++ b/include/RAJA/policy/sequential/atomic.hpp
@@ -26,68 +26,58 @@ namespace RAJA
 {
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicLoad(seq_atomic, T *acc)
+template<typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicLoad(seq_atomic, T* acc)
 {
   return *acc;
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE void atomicStore(seq_atomic, T *acc, T value)
+template<typename T>
+RAJA_HOST_DEVICE RAJA_INLINE void atomicStore(seq_atomic, T* acc, T value)
 {
   *acc = value;
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicAdd(seq_atomic, T *acc, T value)
+template<typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicAdd(seq_atomic, T* acc, T value)
 {
   T ret = *acc;
   *acc += value;
   return ret;
 }
 
-
 RAJA_SUPPRESS_HD_WARN
-template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicSub(seq_atomic, T *acc, T value)
+template<typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicSub(seq_atomic, T* acc, T value)
 {
   T ret = *acc;
   *acc -= value;
   return ret;
 }
 
-
 RAJA_SUPPRESS_HD_WARN
-template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicMin(seq_atomic, T *acc, T value)
+template<typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicMin(seq_atomic, T* acc, T value)
 {
   T ret = *acc;
-  *acc = ret < value ? ret : value;
+  *acc  = ret < value ? ret : value;
   return ret;
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicMax(seq_atomic, T *acc, T value)
+template<typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicMax(seq_atomic, T* acc, T value)
 {
   T ret = *acc;
-  *acc = value < ret ? ret : value;
+  *acc  = value < ret ? ret : value;
   return ret;
 }
 
-
 RAJA_SUPPRESS_HD_WARN
-template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicInc(seq_atomic, T *acc)
+template<typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicInc(seq_atomic, T* acc)
 {
   T ret = *acc;
   (*acc) += T(1);
@@ -95,19 +85,17 @@ RAJA_INLINE T atomicInc(seq_atomic, T *acc)
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicInc(seq_atomic, T *acc, T val)
+template<typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicInc(seq_atomic, T* acc, T val)
 {
   T old = *acc;
-  *acc = val <= old ? T(0) : old + T(1);
+  *acc  = val <= old ? T(0) : old + T(1);
   return old;
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicDec(seq_atomic, T *acc)
+template<typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicDec(seq_atomic, T* acc)
 {
   T ret = *acc;
   (*acc) -= T(1);
@@ -115,19 +103,17 @@ RAJA_INLINE T atomicDec(seq_atomic, T *acc)
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicDec(seq_atomic, T *acc, T val)
+template<typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicDec(seq_atomic, T* acc, T val)
 {
   T old = *acc;
-  *acc = old == T(0) || val < old ? val : old - T(1);
+  *acc  = old == T(0) || val < old ? val : old - T(1);
   return old;
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicAnd(seq_atomic, T *acc, T value)
+template<typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicAnd(seq_atomic, T* acc, T value)
 {
   T ret = *acc;
   *acc &= value;
@@ -135,9 +121,8 @@ RAJA_INLINE T atomicAnd(seq_atomic, T *acc, T value)
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicOr(seq_atomic, T *acc, T value)
+template<typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicOr(seq_atomic, T* acc, T value)
 {
   T ret = *acc;
   *acc |= value;
@@ -145,9 +130,8 @@ RAJA_INLINE T atomicOr(seq_atomic, T *acc, T value)
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicXor(seq_atomic, T *acc, T value)
+template<typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicXor(seq_atomic, T* acc, T value)
 {
   T ret = *acc;
   *acc ^= value;
@@ -155,22 +139,20 @@ RAJA_INLINE T atomicXor(seq_atomic, T *acc, T value)
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicExchange(seq_atomic, T *acc, T value)
+template<typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicExchange(seq_atomic, T* acc, T value)
 {
   T ret = *acc;
-  *acc = value;
+  *acc  = value;
   return ret;
 }
 
 RAJA_SUPPRESS_HD_WARN
-template <typename T>
-RAJA_HOST_DEVICE
-RAJA_INLINE T atomicCAS(seq_atomic, T *acc, T compare, T value)
+template<typename T>
+RAJA_HOST_DEVICE RAJA_INLINE T atomicCAS(seq_atomic, T* acc, T compare, T value)
 {
   T ret = *acc;
-  *acc = ret == compare ? value : ret;
+  *acc  = ret == compare ? value : ret;
   return ret;
 }
 
diff --git a/include/RAJA/policy/sequential/forall.hpp b/include/RAJA/policy/sequential/forall.hpp
index 5d1d6d84b0..50493bafe2 100644
--- a/include/RAJA/policy/sequential/forall.hpp
+++ b/include/RAJA/policy/sequential/forall.hpp
@@ -55,24 +55,26 @@ namespace sequential
 //////////////////////////////////////////////////////////////////////
 //
 
-template <typename Iterable, typename Func, typename Resource, typename ForallParam>
-RAJA_INLINE
-concepts::enable_if_t<
-  resources::EventProxy<Resource>,
-  expt::type_traits::is_ForallParamPack<ForallParam>,
-  concepts::negate<expt::type_traits::is_ForallParamPack_empty<ForallParam>>
-  >
+template<typename Iterable,
+         typename Func,
+         typename Resource,
+         typename ForallParam>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<Resource>,
+    expt::type_traits::is_ForallParamPack<ForallParam>,
+    concepts::negate<expt::type_traits::is_ForallParamPack_empty<ForallParam>>>
 forall_impl(Resource res,
-            const seq_exec &,
-            Iterable &&iter,
-            Func &&body,
+            const seq_exec&,
+            Iterable&& iter,
+            Func&& body,
             ForallParam f_params)
 {
   expt::ParamMultiplexer::init<seq_exec>(f_params);
 
   RAJA_EXTRACT_BED_IT(iter);
 
-  for (decltype(distance_it) i = 0; i < distance_it; ++i) {
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
+  {
     expt::invoke_body(f_params, body, *(begin_it + i));
   }
 
@@ -80,22 +82,24 @@ forall_impl(Resource res,
   return resources::EventProxy<Resource>(res);
 }
 
-template <typename Iterable, typename Func, typename Resource, typename ForallParam>
-RAJA_INLINE
-concepts::enable_if_t<
-  resources::EventProxy<Resource>,
-  expt::type_traits::is_ForallParamPack<ForallParam>,
-  expt::type_traits::is_ForallParamPack_empty<ForallParam>
-  >
+template<typename Iterable,
+         typename Func,
+         typename Resource,
+         typename ForallParam>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<Resource>,
+    expt::type_traits::is_ForallParamPack<ForallParam>,
+    expt::type_traits::is_ForallParamPack_empty<ForallParam>>
 forall_impl(Resource res,
-            const seq_exec &,
-            Iterable &&iter,
-            Func &&body,
+            const seq_exec&,
+            Iterable&& iter,
+            Func&& body,
             ForallParam)
 {
   RAJA_EXTRACT_BED_IT(iter);
 
-  for (decltype(distance_it) i = 0; i < distance_it; ++i) {
+  for (decltype(distance_it) i = 0; i < distance_it; ++i)
+  {
     body(*(begin_it + i));
   }
   return resources::EventProxy<Resource>(res);
diff --git a/include/RAJA/policy/sequential/kernel/Collapse.hpp b/include/RAJA/policy/sequential/kernel/Collapse.hpp
index 8e600ec2e8..1136c50a82 100644
--- a/include/RAJA/policy/sequential/kernel/Collapse.hpp
+++ b/include/RAJA/policy/sequential/kernel/Collapse.hpp
@@ -30,30 +30,35 @@ namespace internal
 //
 // Termination case for seq_exec collapsed loops
 //
-template <typename... EnclosedStmts, typename Types>
+template<typename... EnclosedStmts, typename Types>
 struct StatementExecutor<
-    statement::Collapse<seq_exec, ArgList<>, EnclosedStmts...>, Types> {
+    statement::Collapse<seq_exec, ArgList<>, EnclosedStmts...>,
+    Types>
+{
 
-  template <typename Data>
-  static RAJA_INLINE void exec(Data &data)
+  template<typename Data>
+  static RAJA_INLINE void exec(Data& data)
   {
     // termination case: no more loops, just execute enclosed statements
     execute_statement_list<camp::list<EnclosedStmts...>, Types>(data);
   }
 };
 
-
 //
 // Executor that handles collapsing of an arbitrarily deep set of seq_exec
 // loops
 //
-template <camp::idx_t Arg0, camp::idx_t... ArgRest, typename... EnclosedStmts, typename Types>
-struct StatementExecutor<statement::Collapse<seq_exec,
-                                             ArgList<Arg0, ArgRest...>,
-                                             EnclosedStmts...>, Types> {
+template<camp::idx_t Arg0,
+         camp::idx_t... ArgRest,
+         typename... EnclosedStmts,
+         typename Types>
+struct StatementExecutor<
+    statement::Collapse<seq_exec, ArgList<Arg0, ArgRest...>, EnclosedStmts...>,
+    Types>
+{
 
-  template <typename Data>
-  static RAJA_INLINE void exec(Data &data)
+  template<typename Data>
+  static RAJA_INLINE void exec(Data& data)
   {
 
     // Set the argument type for this loop
@@ -61,11 +66,13 @@ struct StatementExecutor<statement::Collapse<seq_exec,
 
     // compute next-most inner loop Executor
     using next_loop_t = StatementExecutor<
-        statement::Collapse<seq_exec, ArgList<ArgRest...>, EnclosedStmts...>, NewTypes>;
+        statement::Collapse<seq_exec, ArgList<ArgRest...>, EnclosedStmts...>,
+        NewTypes>;
 
     auto len0 = segment_length<Arg0>(data);
 
-    for (auto i0 = 0; i0 < len0; ++i0) {
+    for (auto i0 = 0; i0 < len0; ++i0)
+    {
       data.template assign_offset<Arg0>(i0);
 
       next_loop_t::exec(data);
diff --git a/include/RAJA/policy/sequential/kernel/Reduce.hpp b/include/RAJA/policy/sequential/kernel/Reduce.hpp
index 7280844320..d98b22047b 100644
--- a/include/RAJA/policy/sequential/kernel/Reduce.hpp
+++ b/include/RAJA/policy/sequential/kernel/Reduce.hpp
@@ -29,15 +29,17 @@ namespace internal
 //
 // Executor that handles reductions for
 //
-template <template <typename...> class ReduceOperator,
-          typename ParamId,
-          typename... EnclosedStmts,
-          typename Types>
+template<template<typename...> class ReduceOperator,
+         typename ParamId,
+         typename... EnclosedStmts,
+         typename Types>
 struct StatementExecutor<
-    statement::Reduce<seq_reduce, ReduceOperator, ParamId, EnclosedStmts...>, Types> {
+    statement::Reduce<seq_reduce, ReduceOperator, ParamId, EnclosedStmts...>,
+    Types>
+{
 
-  template <typename Data>
-  static RAJA_INLINE void exec(Data &&data)
+  template<typename Data>
+  static RAJA_INLINE void exec(Data&& data)
   {
     // since a sequential reduction is a NOP, and the single thread always
     // has the reduced value, this is just a passthrough to the enclosed
diff --git a/include/RAJA/policy/sequential/launch.hpp b/include/RAJA/policy/sequential/launch.hpp
index a2025a71d5..12aeb010cd 100644
--- a/include/RAJA/policy/sequential/launch.hpp
+++ b/include/RAJA/policy/sequential/launch.hpp
@@ -25,9 +25,10 @@
 namespace RAJA
 {
 
-template <>
-struct LaunchExecute<RAJA::null_launch_t> {
-  template <typename BODY>
+template<>
+struct LaunchExecute<RAJA::null_launch_t>
+{
+  template<typename BODY>
   static void exec(LaunchContext const& RAJA_UNUSED_ARG(ctx),
                    BODY const& RAJA_UNUSED_ARG(body))
   {
@@ -35,22 +36,26 @@ struct LaunchExecute<RAJA::null_launch_t> {
   }
 };
 
+template<>
+struct LaunchExecute<RAJA::seq_launch_t>
+{
 
-template <>
-struct LaunchExecute<RAJA::seq_launch_t> {
-
-  template <typename BODY, typename ReduceParams>
-  static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                               RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                               RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
-  exec(RAJA::resources::Resource res, LaunchParams const &params, const char *RAJA_UNUSED_ARG(kernel_name),
-       BODY const &body, ReduceParams &RAJA_UNUSED_ARG(ReduceParams))
+  template<typename BODY, typename ReduceParams>
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
+  exec(RAJA::resources::Resource res,
+       LaunchParams const& params,
+       const char* RAJA_UNUSED_ARG(kernel_name),
+       BODY const& body,
+       ReduceParams& RAJA_UNUSED_ARG(ReduceParams))
   {
 
     LaunchContext ctx;
 
-    char *kernel_local_mem = new char[params.shared_mem_size];
-    ctx.shared_mem_ptr = kernel_local_mem;
+    char* kernel_local_mem = new char[params.shared_mem_size];
+    ctx.shared_mem_ptr     = kernel_local_mem;
 
     body(ctx);
 
@@ -61,17 +66,22 @@ struct LaunchExecute<RAJA::seq_launch_t> {
   }
 
   template<typename BODY, typename ReduceParams>
-    static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                                 RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                                 concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
-  exec(RAJA::resources::Resource res, LaunchParams const &launch_params,
-       const char *RAJA_UNUSED_ARG(kernel_name), BODY const &body, ReduceParams &launch_reducers)
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      concepts::negate<
+          RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
+  exec(RAJA::resources::Resource res,
+       LaunchParams const& launch_params,
+       const char* RAJA_UNUSED_ARG(kernel_name),
+       BODY const& body,
+       ReduceParams& launch_reducers)
   {
     expt::ParamMultiplexer::init<seq_exec>(launch_reducers);
 
     LaunchContext ctx;
-    char *kernel_local_mem = new char[launch_params.shared_mem_size];
-    ctx.shared_mem_ptr = kernel_local_mem;
+    char* kernel_local_mem = new char[launch_params.shared_mem_size];
+    ctx.shared_mem_ptr     = kernel_local_mem;
 
     expt::invoke_body(launch_reducers, body, ctx);
 
@@ -82,67 +92,69 @@ struct LaunchExecute<RAJA::seq_launch_t> {
 
     return resources::EventProxy<resources::Resource>(res);
   }
-
 };
 
-
-template <typename SEGMENT>
-struct LoopExecute<seq_exec, SEGMENT> {
+template<typename SEGMENT>
+struct LoopExecute<seq_exec, SEGMENT>
+{
 
   RAJA_SUPPRESS_HD_WARN
-  template <typename BODY>
-  static RAJA_INLINE RAJA_HOST_DEVICE void exec(
-      SEGMENT const &segment,
-      BODY const &body)
+  template<typename BODY>
+  static RAJA_INLINE RAJA_HOST_DEVICE void exec(SEGMENT const& segment,
+                                                BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
-    for (int i = 0; i < len; i++) {
+    for (int i = 0; i < len; i++)
+    {
 
       body(*(segment.begin() + i));
     }
   }
 
-  template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_HOST_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+      SEGMENT const& segment,
+      BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
-    for (int i = 0; i < len; i++) {
+    for (int i = 0; i < len; i++)
+    {
       body(*(segment.begin() + i));
     }
   }
 
-  template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_HOST_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+      SEGMENT const& segment0,
+      SEGMENT const& segment1,
+      BODY const& body)
   {
 
     // block stride loop
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
 
-    for (int j = 0; j < len1; j++) {
-      for (int i = 0; i < len0; i++) {
+    for (int j = 0; j < len1; j++)
+    {
+      for (int i = 0; i < len0; i++)
+      {
 
         body(*(segment0.begin() + i), *(segment1.begin() + j));
       }
     }
   }
 
-  template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_HOST_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+      SEGMENT const& segment0,
+      SEGMENT const& segment1,
+      SEGMENT const& segment2,
+      BODY const& body)
   {
 
     // block stride loop
@@ -150,62 +162,66 @@ struct LoopExecute<seq_exec, SEGMENT> {
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
 
-    for (int k = 0; k < len2; k++) {
-      for (int j = 0; j < len1; j++) {
-        for (int i = 0; i < len0; i++) {
-          body(*(segment0.begin() + i),
-               *(segment1.begin() + j),
+    for (int k = 0; k < len2; k++)
+    {
+      for (int j = 0; j < len1; j++)
+      {
+        for (int i = 0; i < len0; i++)
+        {
+          body(*(segment0.begin() + i), *(segment1.begin() + j),
                *(segment2.begin() + k));
         }
       }
     }
   }
-
 };
 
+template<typename SEGMENT>
+struct LoopICountExecute<seq_exec, SEGMENT>
+{
 
-template <typename SEGMENT>
-struct LoopICountExecute<seq_exec, SEGMENT> {
-
-  template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_HOST_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+      SEGMENT const& segment,
+      BODY const& body)
   {
     const int len = segment.end() - segment.begin();
-    for (int i = 0; i < len; i++) {
+    for (int i = 0; i < len; i++)
+    {
       body(*(segment.begin() + i), i);
     }
   }
 
-    template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_HOST_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+      SEGMENT const& segment0,
+      SEGMENT const& segment1,
+      BODY const& body)
   {
 
     // block stride loop
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
 
-    for (int j = 0; j < len1; j++) {
-      for (int i = 0; i < len0; i++) {
+    for (int j = 0; j < len1; j++)
+    {
+      for (int i = 0; i < len0; i++)
+      {
 
         body(*(segment0.begin() + i), *(segment1.begin() + j), i, j);
       }
     }
   }
 
-  template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_HOST_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+      SEGMENT const& segment0,
+      SEGMENT const& segment1,
+      SEGMENT const& segment2,
+      BODY const& body)
   {
 
     // block stride loop
@@ -213,30 +229,32 @@ struct LoopICountExecute<seq_exec, SEGMENT> {
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
 
-    for (int k = 0; k < len2; k++) {
-      for (int j = 0; j < len1; j++) {
-        for (int i = 0; i < len0; i++) {
-          body(*(segment0.begin() + i),
-               *(segment1.begin() + j),
+    for (int k = 0; k < len2; k++)
+    {
+      for (int j = 0; j < len1; j++)
+      {
+        for (int i = 0; i < len0; i++)
+        {
+          body(*(segment0.begin() + i), *(segment1.begin() + j),
                *(segment2.begin() + k), i, j, k);
         }
       }
     }
   }
-
 };
 
-//Tile Execute + variants
+// Tile Execute + variants
 
-template <typename SEGMENT>
-struct TileExecute<seq_exec, SEGMENT> {
+template<typename SEGMENT>
+struct TileExecute<seq_exec, SEGMENT>
+{
 
-  template <typename TILE_T, typename BODY>
+  template<typename TILE_T, typename BODY>
   static RAJA_HOST_DEVICE RAJA_INLINE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
       TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+      SEGMENT const& segment,
+      BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
@@ -246,28 +264,27 @@ struct TileExecute<seq_exec, SEGMENT> {
       body(segment.slice(tx, tile_size));
     }
   }
-
 };
 
-template <typename SEGMENT>
-struct TileTCountExecute<seq_exec, SEGMENT> {
+template<typename SEGMENT>
+struct TileTCountExecute<seq_exec, SEGMENT>
+{
 
-  template <typename TILE_T, typename BODY>
+  template<typename TILE_T, typename BODY>
   static RAJA_HOST_DEVICE RAJA_INLINE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
       TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+      SEGMENT const& segment,
+      BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
 
-    for (int tx = 0, bx=0; tx < len; tx += tile_size, bx++)
+    for (int tx = 0, bx = 0; tx < len; tx += tile_size, bx++)
     {
       body(segment.slice(tx, tile_size), bx);
     }
   }
-
 };
 
 }  // namespace RAJA
diff --git a/include/RAJA/policy/sequential/multi_reduce.hpp b/include/RAJA/policy/sequential/multi_reduce.hpp
index be3a3860f8..724a7963fa 100644
--- a/include/RAJA/policy/sequential/multi_reduce.hpp
+++ b/include/RAJA/policy/sequential/multi_reduce.hpp
@@ -47,7 +47,7 @@ namespace detail
  *
  **************************************************************************
  */
-template < typename T, typename t_MultiReduceOp, typename tuning >
+template<typename T, typename t_MultiReduceOp, typename tuning>
 struct MultiReduceDataSeq;
 
 /*!
@@ -59,59 +59,68 @@ struct MultiReduceDataSeq;
  *
  **************************************************************************
  */
-template < typename T, typename t_MultiReduceOp >
-struct MultiReduceDataSeq<T, t_MultiReduceOp,
+template<typename T, typename t_MultiReduceOp>
+struct MultiReduceDataSeq<
+    T,
+    t_MultiReduceOp,
     RAJA::sequential::MultiReduceTuning<
-      RAJA::sequential::multi_reduce_algorithm::left_fold>>
+        RAJA::sequential::multi_reduce_algorithm::left_fold>>
 {
-  using value_type = T;
+  using value_type    = T;
   using MultiReduceOp = t_MultiReduceOp;
 
   MultiReduceDataSeq() = delete;
 
-  template < typename Container,
-             std::enable_if_t<!std::is_same<Container, MultiReduceDataSeq>::value>* = nullptr >
+  template<typename Container,
+           std::enable_if_t<
+               !std::is_same<Container, MultiReduceDataSeq>::value>* = nullptr>
   MultiReduceDataSeq(Container const& container, T identity)
-      : m_parent(nullptr)
-      , m_num_bins(container.size())
-      , m_identity(identity)
-      , m_data(nullptr)
+      : m_parent(nullptr),
+        m_num_bins(container.size()),
+        m_identity(identity),
+        m_data(nullptr)
   {
     m_data = create_data(container, m_num_bins);
   }
 
-  MultiReduceDataSeq(MultiReduceDataSeq const &other)
-      : m_parent(other.m_parent ? other.m_parent : &other)
-      , m_num_bins(other.m_num_bins)
-      , m_identity(other.m_identity)
-      , m_data(other.m_data)
-  { }
+  MultiReduceDataSeq(MultiReduceDataSeq const& other)
+      : m_parent(other.m_parent ? other.m_parent : &other),
+        m_num_bins(other.m_num_bins),
+        m_identity(other.m_identity),
+        m_data(other.m_data)
+  {}
 
-  MultiReduceDataSeq(MultiReduceDataSeq &&) = delete;
+  MultiReduceDataSeq(MultiReduceDataSeq&&)                 = delete;
   MultiReduceDataSeq& operator=(MultiReduceDataSeq const&) = delete;
-  MultiReduceDataSeq& operator=(MultiReduceDataSeq &&) = delete;
+  MultiReduceDataSeq& operator=(MultiReduceDataSeq&&)      = delete;
 
   ~MultiReduceDataSeq()
   {
-    if (m_data) {
-      if (!m_parent) {
+    if (m_data)
+    {
+      if (!m_parent)
+      {
         destroy_data(m_data, m_num_bins);
       }
     }
   }
 
-  template < typename Container >
+  template<typename Container>
   void reset(Container const& container, T identity)
   {
-    m_identity = identity;
+    m_identity          = identity;
     size_t new_num_bins = container.size();
-    if (new_num_bins != m_num_bins) {
+    if (new_num_bins != m_num_bins)
+    {
       destroy_data(m_data, m_num_bins);
       m_num_bins = new_num_bins;
-      m_data = create_data(container, m_num_bins);
-    } else {
+      m_data     = create_data(container, m_num_bins);
+    }
+    else
+    {
       size_t bin = 0;
-      for (auto const& value : container) {
+      for (auto const& value : container)
+      {
         m_data[bin] = value;
         ++bin;
       }
@@ -122,27 +131,29 @@ struct MultiReduceDataSeq<T, t_MultiReduceOp,
 
   T identity() const { return m_identity; }
 
-  void combine(size_t bin, T const &val) { MultiReduceOp{}(m_data[bin], val); }
+  void combine(size_t bin, T const& val) { MultiReduceOp {}(m_data[bin], val); }
 
   T get(size_t bin) const { return m_data[bin]; }
 
 private:
-  MultiReduceDataSeq const *m_parent;
+  MultiReduceDataSeq const* m_parent;
   size_t m_num_bins;
   T m_identity;
   T* m_data;
 
-  template < typename Container >
+  template<typename Container>
   static T* create_data(Container const& container, size_t num_bins)
   {
-    if (num_bins == size_t(0)) {
+    if (num_bins == size_t(0))
+    {
       return nullptr;
     }
 
-    auto data = static_cast<T*>(malloc(num_bins*sizeof(T)));
+    auto data  = static_cast<T*>(malloc(num_bins * sizeof(T)));
     size_t bin = 0;
-    for (auto const& value : container) {
-      new(&data[bin]) T(value);
+    for (auto const& value : container)
+    {
+      new (&data[bin]) T(value);
       ++bin;
     }
     return data;
@@ -150,11 +161,13 @@ struct MultiReduceDataSeq<T, t_MultiReduceOp,
 
   static void destroy_data(T*& data, size_t num_bins)
   {
-    if (num_bins == size_t(0)) {
+    if (num_bins == size_t(0))
+    {
       return;
     }
 
-    for (size_t bin = 0; bin < num_bins; ++bin) {
+    for (size_t bin = 0; bin < num_bins; ++bin)
+    {
       data[bin].~T();
     }
     free(data);
@@ -164,7 +177,8 @@ struct MultiReduceDataSeq<T, t_MultiReduceOp,
 
 }  // namespace detail
 
-RAJA_DECLARE_ALL_MULTI_REDUCERS(policy::sequential::seq_multi_reduce_policy, detail::MultiReduceDataSeq)
+RAJA_DECLARE_ALL_MULTI_REDUCERS(policy::sequential::seq_multi_reduce_policy,
+                                detail::MultiReduceDataSeq)
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/policy/sequential/params/kernel_name.hpp b/include/RAJA/policy/sequential/params/kernel_name.hpp
index 00e6a1dc52..e5c3318c07 100644
--- a/include/RAJA/policy/sequential/params/kernel_name.hpp
+++ b/include/RAJA/policy/sequential/params/kernel_name.hpp
@@ -3,35 +3,39 @@
 
 #include "RAJA/pattern/params/kernel_name.hpp"
 
-namespace RAJA {
-namespace expt {
-namespace detail {
-
-  // Init
-  template<typename EXEC_POL>
-  camp::concepts::enable_if< std::is_same< EXEC_POL, RAJA::seq_exec> >
-  init(KernelName&)
-  {
-    //TODO: Define kernel naming
-  }
-
-  // Combine
-  template<typename EXEC_POL, typename T>
-  RAJA_HOST_DEVICE
-  camp::concepts::enable_if< std::is_same< EXEC_POL, RAJA::seq_exec> >
-  combine(KernelName&, T) {}
-
-  // Resolve
-  template<typename EXEC_POL>
-  camp::concepts::enable_if< std::is_same< EXEC_POL, RAJA::seq_exec> >
-  resolve(KernelName&)
-  {
-    //TODO: Define kernel naming
-  }
-
-} //  namespace detail
-} //  namespace expt
-} //  namespace RAJA
-
-
-#endif //  NEW_REDUCE_SEQ_REDUCE_HPP
+namespace RAJA
+{
+namespace expt
+{
+namespace detail
+{
+
+// Init: placeholder for KernelName under seq_exec (body is still a TODO)
+template<typename EXEC_POL>
+camp::concepts::enable_if<std::is_same<EXEC_POL, RAJA::seq_exec>> init(
+    KernelName&)
+{
+  // TODO: Define kernel naming
+}
+
+// Combine: empty body for KernelName under seq_exec
+template<typename EXEC_POL, typename T>
+RAJA_HOST_DEVICE camp::concepts::enable_if<
+    std::is_same<EXEC_POL, RAJA::seq_exec>>
+combine(KernelName&, T)
+{}
+
+// Resolve: placeholder for KernelName under seq_exec (body is still a TODO)
+template<typename EXEC_POL>
+camp::concepts::enable_if<std::is_same<EXEC_POL, RAJA::seq_exec>> resolve(
+    KernelName&)
+{
+  // TODO: Define kernel naming
+}
+
+}  //  namespace detail
+}  //  namespace expt
+}  //  namespace RAJA
+
+
+#endif  //  NEW_REDUCE_SEQ_REDUCE_HPP
diff --git a/include/RAJA/policy/sequential/params/reduce.hpp b/include/RAJA/policy/sequential/params/reduce.hpp
index b77028ca5f..6b95063481 100644
--- a/include/RAJA/policy/sequential/params/reduce.hpp
+++ b/include/RAJA/policy/sequential/params/reduce.hpp
@@ -3,33 +3,40 @@
 
 #include "RAJA/pattern/params/reducer.hpp"
 
-namespace RAJA {
-namespace expt {
-namespace detail {
+namespace RAJA
+{
+namespace expt
+{
+namespace detail
+{
 
-  // Init
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< std::is_same< EXEC_POL, RAJA::seq_exec> >
-  init(Reducer<OP, T, VOp>& red) {
-    red.m_valop.val = OP::identity();
-  }
+// Init: reset the reducer's working value to the identity element of OP
+template<typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<std::is_same<EXEC_POL, RAJA::seq_exec>> init(
+    Reducer<OP, T, VOp>& red)
+{
+  red.m_valop.val = OP::identity();
+}
 
-  // Combine
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< std::is_same< EXEC_POL, RAJA::seq_exec> >
-  combine(Reducer<OP, T, VOp>& out, const Reducer<OP, T, VOp>& in) {
-    out.m_valop.val = OP{}(out.m_valop.val, in.m_valop.val);
-  }
+// Combine: fold in's working value into out's working value by applying OP
+template<typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<std::is_same<EXEC_POL, RAJA::seq_exec>> combine(
+    Reducer<OP, T, VOp>& out,
+    const Reducer<OP, T, VOp>& in)
+{
+  out.m_valop.val = OP {}(out.m_valop.val, in.m_valop.val);
+}
 
-  // Resolve
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< std::is_same< EXEC_POL, RAJA::seq_exec> >
-  resolve(Reducer<OP, T, VOp>& red) {
-    red.combineTarget(red.m_valop.val);
-  }
+// Resolve: pass the final working value to the reducer's combineTarget()
+template<typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<std::is_same<EXEC_POL, RAJA::seq_exec>> resolve(
+    Reducer<OP, T, VOp>& red)
+{
+  red.combineTarget(red.m_valop.val);
+}
 
-} //  namespace detail
-} //  namespace expt
-} //  namespace RAJA
+}  //  namespace detail
+}  //  namespace expt
+}  //  namespace RAJA
 
-#endif //  NEW_REDUCE_SEQ_REDUCE_HPP
+#endif  //  NEW_REDUCE_SEQ_REDUCE_HPP
diff --git a/include/RAJA/policy/sequential/policy.hpp b/include/RAJA/policy/sequential/policy.hpp
index 287af42502..8677acd44b 100644
--- a/include/RAJA/policy/sequential/policy.hpp
+++ b/include/RAJA/policy/sequential/policy.hpp
@@ -30,7 +30,7 @@ enum struct multi_reduce_algorithm : int
   left_fold
 };
 
-template < multi_reduce_algorithm t_multi_algorithm >
+template<multi_reduce_algorithm t_multi_algorithm>
 struct MultiReduceTuning
 {
   static constexpr multi_reduce_algorithm algorithm = t_multi_algorithm;
@@ -38,7 +38,7 @@ struct MultiReduceTuning
       (algorithm == multi_reduce_algorithm::left_fold);
 };
 
-} // namspace sequential
+}  // namespace sequential
 
 namespace policy
 {
@@ -60,20 +60,20 @@ namespace sequential
 struct seq_region : make_policy_pattern_launch_platform_t<Policy::sequential,
                                                           Pattern::region,
                                                           Launch::sync,
-                                                          Platform::host> {
-};
+                                                          Platform::host>
+{};
 
 struct seq_launch_t : make_policy_pattern_launch_platform_t<Policy::sequential,
                                                             Pattern::region,
                                                             Launch::sync,
-                                                            Platform::host> {
-};
+                                                            Platform::host>
+{};
 
 struct seq_exec : make_policy_pattern_launch_platform_t<Policy::sequential,
                                                         Pattern::forall,
                                                         Launch::undefined,
-                                                        Platform::host> {
-};
+                                                        Platform::host>
+{};
 
 ///
 /// Index set segment iteration policies
@@ -86,8 +86,8 @@ using seq_segit = seq_exec;
 struct seq_work : make_policy_pattern_launch_platform_t<Policy::sequential,
                                                         Pattern::workgroup_exec,
                                                         Launch::sync,
-                                                        Platform::host> {
-};
+                                                        Platform::host>
+{};
 
 ///
 ///////////////////////////////////////////////////////////////////////
@@ -99,20 +99,20 @@ struct seq_work : make_policy_pattern_launch_platform_t<Policy::sequential,
 struct seq_reduce : make_policy_pattern_launch_platform_t<Policy::sequential,
                                                           Pattern::reduce,
                                                           Launch::undefined,
-                                                          Platform::host> {
-};
+                                                          Platform::host>
+{};
 
 ///
-template < typename tuning >
-struct seq_multi_reduce_policy
-    : make_policy_pattern_launch_platform_t<Policy::sequential,
-                                            Pattern::multi_reduce,
-                                            Launch::undefined,
-                                            Platform::host,
-                                            std::conditional_t<tuning::consistent,
-                                                               reduce::ordered,
-                                                               reduce::unordered>> {
-};
+template<typename tuning>
+struct seq_multi_reduce_policy : make_policy_pattern_launch_platform_t<
+                                     Policy::sequential,
+                                     Pattern::multi_reduce,
+                                     Launch::undefined,
+                                     Platform::host,
+                                     std::conditional_t<tuning::consistent,
+                                                        reduce::ordered,
+                                                        reduce::unordered>>
+{};
 
 ///
 ///////////////////////////////////////////////////////////////////////
@@ -121,13 +121,12 @@ struct seq_multi_reduce_policy
 ///
 ///////////////////////////////////////////////////////////////////////
 ///
-struct seq_atomic {
-};
-
+struct seq_atomic
+{};
 
-template < RAJA::sequential::multi_reduce_algorithm algorithm >
-using seq_multi_reduce_tuning = seq_multi_reduce_policy<
-    RAJA::sequential::MultiReduceTuning<algorithm> >;
+template<RAJA::sequential::multi_reduce_algorithm algorithm>
+using seq_multi_reduce_tuning =
+    seq_multi_reduce_policy<RAJA::sequential::MultiReduceTuning<algorithm>>;
 
 // Policies for RAJA::MultiReduce* objects with specific behaviors.
 // - left_fold policies combine new values into a single value.
@@ -143,12 +142,12 @@ using seq_multi_reduce = seq_multi_reduce_left_fold;
 
 using policy::sequential::seq_atomic;
 using policy::sequential::seq_exec;
-using policy::sequential::seq_reduce;
+using policy::sequential::seq_launch_t;
 using policy::sequential::seq_multi_reduce;
+using policy::sequential::seq_reduce;
 using policy::sequential::seq_region;
 using policy::sequential::seq_segit;
 using policy::sequential::seq_work;
-using policy::sequential::seq_launch_t;
 
 
 }  // namespace RAJA
diff --git a/include/RAJA/policy/sequential/reduce.hpp b/include/RAJA/policy/sequential/reduce.hpp
index 0870726183..d547185fbd 100644
--- a/include/RAJA/policy/sequential/reduce.hpp
+++ b/include/RAJA/policy/sequential/reduce.hpp
@@ -37,7 +37,7 @@ namespace RAJA
 
 namespace detail
 {
-template <typename T, typename Reduce>
+template<typename T, typename Reduce>
 class ReduceSeq
     : public reduce::detail::BaseCombinable<T, Reduce, ReduceSeq<T, Reduce>>
 {
diff --git a/include/RAJA/policy/sequential/region.hpp b/include/RAJA/policy/sequential/region.hpp
index 84d03ae202..ebbf8f53d9 100644
--- a/include/RAJA/policy/sequential/region.hpp
+++ b/include/RAJA/policy/sequential/region.hpp
@@ -34,8 +34,8 @@ namespace sequential
  *
  */
 
-template <typename Func>
-RAJA_INLINE void region_impl(const seq_region &, Func &&body)
+template<typename Func>
+RAJA_INLINE void region_impl(const seq_region&, Func&& body)
 {
   body();
 }
diff --git a/include/RAJA/policy/sequential/scan.hpp b/include/RAJA/policy/sequential/scan.hpp
index 4bcc73366d..e3008267a9 100644
--- a/include/RAJA/policy/sequential/scan.hpp
+++ b/include/RAJA/policy/sequential/scan.hpp
@@ -40,23 +40,22 @@ namespace scan
         \brief explicit inclusive inplace scan given range, function, and
    initial value
 */
-template <typename ExecPolicy, typename Iter, typename BinFn>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<resources::Host>,
-                      type_traits::is_sequential_policy<ExecPolicy>>
-inclusive_inplace(
-    resources::Host host_res,
-    const ExecPolicy &,
-    Iter begin,
-    Iter end,
-    BinFn f)
+template<typename ExecPolicy, typename Iter, typename BinFn>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<resources::Host>,
+                                  type_traits::is_sequential_policy<ExecPolicy>>
+inclusive_inplace(resources::Host host_res,
+                  const ExecPolicy&,
+                  Iter begin,
+                  Iter end,
+                  BinFn f)
 {
   using ValueT = typename std::remove_reference<decltype(*begin)>::type;
-  ValueT agg = *begin;
+  ValueT agg   = *begin;
 
-  for (Iter i = ++begin; i != end; ++i) {
+  for (Iter i = ++begin; i != end; ++i)
+  {
     agg = f(agg, *i);
-    *i = agg;
+    *i  = agg;
   }
 
   return resources::EventProxy<resources::Host>(host_res);
@@ -66,29 +65,28 @@ inclusive_inplace(
         \brief explicit exclusive inplace scan given range, function, and
    initial value
 */
-template <typename ExecPolicy, typename Iter, typename BinFn, typename T>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<resources::Host>,
-                      type_traits::is_sequential_policy<ExecPolicy>>
-exclusive_inplace(
-    resources::Host host_res,
-    const ExecPolicy &,
-    Iter begin,
-    Iter end,
-    BinFn f,
-    T v)
+template<typename ExecPolicy, typename Iter, typename BinFn, typename T>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<resources::Host>,
+                                  type_traits::is_sequential_policy<ExecPolicy>>
+exclusive_inplace(resources::Host host_res,
+                  const ExecPolicy&,
+                  Iter begin,
+                  Iter end,
+                  BinFn f,
+                  T v)
 {
   using std::distance;
-  const auto n = distance(begin, end);
+  const auto n    = distance(begin, end);
   using DistanceT = typename std::remove_const<decltype(n)>::type;
 
   using ValueT = typename std::remove_reference<decltype(*begin)>::type;
-  ValueT agg = v;
+  ValueT agg   = v;
 
-  for (DistanceT i = 0; i < n; ++i) {
-    auto t = begin[i];
+  for (DistanceT i = 0; i < n; ++i)
+  {
+    auto t   = begin[i];
     begin[i] = agg;
-    agg = f(agg, t);
+    agg      = f(agg, t);
   }
 
   return resources::EventProxy<resources::Host>(host_res);
@@ -98,24 +96,23 @@ exclusive_inplace(
         \brief explicit inclusive scan given input range, output, function, and
    initial value
 */
-template <typename ExecPolicy, typename Iter, typename OutIter, typename BinFn>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<resources::Host>,
-                      type_traits::is_sequential_policy<ExecPolicy>>
-inclusive(
-    resources::Host host_res,
-    const ExecPolicy &,
-    const Iter begin,
-    const Iter end,
-    OutIter out,
-    BinFn f)
+template<typename ExecPolicy, typename Iter, typename OutIter, typename BinFn>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<resources::Host>,
+                                  type_traits::is_sequential_policy<ExecPolicy>>
+inclusive(resources::Host host_res,
+          const ExecPolicy&,
+          const Iter begin,
+          const Iter end,
+          OutIter out,
+          BinFn f)
 {
   using ValueT = typename std::remove_reference<decltype(*out)>::type;
-  ValueT agg = *begin;
-  *out++ = agg;
+  ValueT agg   = *begin;
+  *out++       = agg;
 
-  for (Iter i = begin + 1; i != end; ++i) {
-    agg = f(agg, *i);
+  for (Iter i = begin + 1; i != end; ++i)
+  {
+    agg    = f(agg, *i);
     *out++ = agg;
   }
 
@@ -126,31 +123,30 @@ inclusive(
         \brief explicit exclusive scan given input range, output, function, and
    initial value
 */
-template <typename ExecPolicy,
-          typename Iter,
-          typename OutIter,
-          typename BinFn,
-          typename T>
-RAJA_INLINE
-concepts::enable_if_t<resources::EventProxy<resources::Host>,
-                      type_traits::is_sequential_policy<ExecPolicy>>
-exclusive(
-    resources::Host host_res,
-    const ExecPolicy &,
-    const Iter begin,
-    const Iter end,
-    OutIter out,
-    BinFn f,
-    T v)
+template<typename ExecPolicy,
+         typename Iter,
+         typename OutIter,
+         typename BinFn,
+         typename T>
+RAJA_INLINE concepts::enable_if_t<resources::EventProxy<resources::Host>,
+                                  type_traits::is_sequential_policy<ExecPolicy>>
+exclusive(resources::Host host_res,
+          const ExecPolicy&,
+          const Iter begin,
+          const Iter end,
+          OutIter out,
+          BinFn f,
+          T v)
 {
   using ValueT = typename std::remove_reference<decltype(*out)>::type;
-  ValueT agg = v;
-  OutIter o = out;
-  *o++ = v;
+  ValueT agg   = v;
+  OutIter o    = out;
+  *o++         = v;
 
-  for (Iter i = begin; i != end - 1; ++i, ++o) {
+  for (Iter i = begin; i != end - 1; ++i, ++o)
+  {
     agg = f(agg, *i);
-    *o = agg;
+    *o  = agg;
   }
 
   return resources::EventProxy<resources::Host>(host_res);
diff --git a/include/RAJA/policy/sequential/sort.hpp b/include/RAJA/policy/sequential/sort.hpp
index 98dcf6fc27..61ab228c58 100644
--- a/include/RAJA/policy/sequential/sort.hpp
+++ b/include/RAJA/policy/sequential/sort.hpp
@@ -30,7 +30,7 @@
 
 #include "RAJA/util/zip.hpp"
 
-#include "RAJA/util/sort.hpp" 
+#include "RAJA/util/sort.hpp"
 
 #include "RAJA/policy/sequential/policy.hpp"
 
@@ -50,9 +50,8 @@ namespace detail
 */
 struct UnstableSorter
 {
-  template < typename... Args >
-  RAJA_INLINE
-  void operator()(Args&&... args) const
+  template<typename... Args>
+  RAJA_INLINE void operator()(Args&&... args) const
   {
     RAJA::detail::intro_sort(std::forward<Args>(args)...);
   }
@@ -64,30 +63,28 @@ struct UnstableSorter
 */
 struct StableSorter
 {
-  template < typename... Args >
-  RAJA_INLINE
-  void operator()(Args&&... args) const
+  template<typename... Args>
+  RAJA_INLINE void operator()(Args&&... args) const
   {
     RAJA::detail::merge_sort(std::forward<Args>(args)...);
   }
 };
 
-} // namespace detail
+}  // namespace detail
 
 /*!
         \brief sort given range using comparison function
 */
-template <typename ExecPolicy, typename Iter, typename Compare>
+template<typename ExecPolicy, typename Iter, typename Compare>
 concepts::enable_if_t<resources::EventProxy<resources::Host>,
                       type_traits::is_sequential_policy<ExecPolicy>>
-unstable(
-    resources::Host host_res,
-    const ExecPolicy&,
-    Iter begin,
-    Iter end,
-    Compare comp)
+unstable(resources::Host host_res,
+         const ExecPolicy&,
+         Iter begin,
+         Iter end,
+         Compare comp)
 {
-  detail::UnstableSorter{}(begin, end, comp);
+  detail::UnstableSorter {}(begin, end, comp);
 
   return resources::EventProxy<resources::Host>(host_res);
 }
@@ -95,17 +92,16 @@ unstable(
 /*!
         \brief stable sort given range using comparison function
 */
-template <typename ExecPolicy, typename Iter, typename Compare>
+template<typename ExecPolicy, typename Iter, typename Compare>
 concepts::enable_if_t<resources::EventProxy<resources::Host>,
                       type_traits::is_sequential_policy<ExecPolicy>>
-stable(
-    resources::Host host_res,
-    const ExecPolicy&,
-    Iter begin,
-    Iter end,
-    Compare comp)
+stable(resources::Host host_res,
+       const ExecPolicy&,
+       Iter begin,
+       Iter end,
+       Compare comp)
 {
-  detail::StableSorter{}(begin, end, comp);
+  detail::StableSorter {}(begin, end, comp);
 
   return resources::EventProxy<resources::Host>(host_res);
 }
@@ -113,43 +109,48 @@ stable(
 /*!
         \brief sort given range of pairs using comparison function on keys
 */
-template <typename ExecPolicy, typename KeyIter, typename ValIter, typename Compare>
+template<typename ExecPolicy,
+         typename KeyIter,
+         typename ValIter,
+         typename Compare>
 concepts::enable_if_t<resources::EventProxy<resources::Host>,
                       type_traits::is_sequential_policy<ExecPolicy>>
-unstable_pairs(
-    resources::Host host_res,
-    const ExecPolicy&,
-    KeyIter keys_begin,
-    KeyIter keys_end,
-    ValIter vals_begin,
-    Compare comp)
+unstable_pairs(resources::Host host_res,
+               const ExecPolicy&,
+               KeyIter keys_begin,
+               KeyIter keys_end,
+               ValIter vals_begin,
+               Compare comp)
 {
-  auto begin = RAJA::zip(keys_begin, vals_begin);
-  auto end = RAJA::zip(keys_end, vals_begin+(keys_end-keys_begin));
+  auto begin    = RAJA::zip(keys_begin, vals_begin);
+  auto end      = RAJA::zip(keys_end, vals_begin + (keys_end - keys_begin));
   using zip_ref = RAJA::detail::IterRef<camp::decay<decltype(begin)>>;
-  detail::UnstableSorter{}(begin, end, RAJA::compare_first<zip_ref>(comp));
+  detail::UnstableSorter {}(begin, end, RAJA::compare_first<zip_ref>(comp));
 
   return resources::EventProxy<resources::Host>(host_res);
 }
 
 /*!
-        \brief stable sort given range of pairs using comparison function on keys
+        \brief stable sort given range of pairs using comparison function on
+   keys
 */
-template <typename ExecPolicy, typename KeyIter, typename ValIter, typename Compare>
+template<typename ExecPolicy,
+         typename KeyIter,
+         typename ValIter,
+         typename Compare>
 concepts::enable_if_t<resources::EventProxy<resources::Host>,
                       type_traits::is_sequential_policy<ExecPolicy>>
-stable_pairs(
-    resources::Host host_res,
-    const ExecPolicy&,
-    KeyIter keys_begin,
-    KeyIter keys_end,
-    ValIter vals_begin,
-    Compare comp)
+stable_pairs(resources::Host host_res,
+             const ExecPolicy&,
+             KeyIter keys_begin,
+             KeyIter keys_end,
+             ValIter vals_begin,
+             Compare comp)
 {
-  auto begin = RAJA::zip(keys_begin, vals_begin);
-  auto end = RAJA::zip(keys_end, vals_begin+(keys_end-keys_begin));
+  auto begin    = RAJA::zip(keys_begin, vals_begin);
+  auto end      = RAJA::zip(keys_end, vals_begin + (keys_end - keys_begin));
   using zip_ref = RAJA::detail::IterRef<camp::decay<decltype(begin)>>;
-  detail::StableSorter{}(begin, end, RAJA::compare_first<zip_ref>(comp));
+  detail::StableSorter {}(begin, end, RAJA::compare_first<zip_ref>(comp));
 
   return resources::EventProxy<resources::Host>(host_res);
 }
diff --git a/include/RAJA/policy/simd/forall.hpp b/include/RAJA/policy/simd/forall.hpp
index 8c5b38af9c..680bf4cfee 100644
--- a/include/RAJA/policy/simd/forall.hpp
+++ b/include/RAJA/policy/simd/forall.hpp
@@ -47,26 +47,25 @@ namespace simd
 {
 
 
-template <typename Iterable, typename Func, typename ForallParam>
-RAJA_INLINE
-concepts::enable_if_t<
-  resources::EventProxy<resources::Host>,
-  expt::type_traits::is_ForallParamPack<ForallParam>,
-  concepts::negate<expt::type_traits::is_ForallParamPack_empty<ForallParam>>
-  >
+template<typename Iterable, typename Func, typename ForallParam>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Host>,
+    expt::type_traits::is_ForallParamPack<ForallParam>,
+    concepts::negate<expt::type_traits::is_ForallParamPack_empty<ForallParam>>>
 forall_impl(RAJA::resources::Host host_res,
-            const simd_exec &,
-            Iterable &&iter,
-            Func &&loop_body,
+            const simd_exec&,
+            Iterable&& iter,
+            Func&& loop_body,
             ForallParam f_params)
 {
   expt::ParamMultiplexer::init<seq_exec>(f_params);
 
-  auto begin = std::begin(iter);
-  auto end = std::end(iter);
+  auto begin    = std::begin(iter);
+  auto end      = std::end(iter);
   auto distance = std::distance(begin, end);
   RAJA_SIMD
-  for (decltype(distance) i = 0; i < distance; ++i) {
+  for (decltype(distance) i = 0; i < distance; ++i)
+  {
     expt::invoke_body(f_params, loop_body, *(begin + i));
   }
 
@@ -74,24 +73,23 @@ forall_impl(RAJA::resources::Host host_res,
   return RAJA::resources::EventProxy<resources::Host>(host_res);
 }
 
-template <typename Iterable, typename Func, typename ForallParam>
-RAJA_INLINE
-concepts::enable_if_t<
-  resources::EventProxy<resources::Host>,
-  expt::type_traits::is_ForallParamPack<ForallParam>,
-  expt::type_traits::is_ForallParamPack_empty<ForallParam>
-  >
+template<typename Iterable, typename Func, typename ForallParam>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Host>,
+    expt::type_traits::is_ForallParamPack<ForallParam>,
+    expt::type_traits::is_ForallParamPack_empty<ForallParam>>
 forall_impl(RAJA::resources::Host host_res,
-            const simd_exec &,
-            Iterable &&iter,
-            Func &&loop_body,
+            const simd_exec&,
+            Iterable&& iter,
+            Func&& loop_body,
             ForallParam)
 {
-  auto begin = std::begin(iter);
-  auto end = std::end(iter);
+  auto begin    = std::begin(iter);
+  auto end      = std::end(iter);
   auto distance = std::distance(begin, end);
   RAJA_SIMD
-  for (decltype(distance) i = 0; i < distance; ++i) {
+  for (decltype(distance) i = 0; i < distance; ++i)
+  {
     loop_body(*(begin + i));
   }
 
diff --git a/include/RAJA/policy/simd/kernel/For.hpp b/include/RAJA/policy/simd/kernel/For.hpp
index 53ed45ad1f..223aecfdf5 100644
--- a/include/RAJA/policy/simd/kernel/For.hpp
+++ b/include/RAJA/policy/simd/kernel/For.hpp
@@ -39,13 +39,15 @@ namespace internal
  * Helper structs to detect lambdas
  *
  */
-template <class T>
-struct TypeIsLambda {
+template<class T>
+struct TypeIsLambda
+{
   static const bool value = false;
 };
 
-template <camp::idx_t BodyIdx, typename ... Args>
-struct TypeIsLambda<RAJA::statement::Lambda<BodyIdx, Args...>> {
+template<camp::idx_t BodyIdx, typename... Args>
+struct TypeIsLambda<RAJA::statement::Lambda<BodyIdx, Args...>>
+{
   static const bool value = true;
 };
 
@@ -55,30 +57,31 @@ struct TypeIsLambda<RAJA::statement::Lambda<BodyIdx, Args...>> {
  *
  */
 
-template <typename Types, class... Statements>
+template<typename Types, class... Statements>
 struct Invoke_all_Lambda;
 
-template <typename Types>
-struct Invoke_all_Lambda<Types> {
+template<typename Types>
+struct Invoke_all_Lambda<Types>
+{
 
-  template <typename Data>
-  static RAJA_INLINE void lambda_special(Data &&)
+  template<typename Data>
+  static RAJA_INLINE void lambda_special(Data&&)
   {
     // NOP terminator
   }
 };
 
-
-template <typename Types, class Statement, class... StatementRest>
-struct Invoke_all_Lambda<Types, Statement, StatementRest...> {
+template<typename Types, class Statement, class... StatementRest>
+struct Invoke_all_Lambda<Types, Statement, StatementRest...>
+{
 
   // Lambda check
   static const bool value = TypeIsLambda<camp::decay<Statement>>::value;
   static_assert(value, "Lambdas are only supported post RAJA::simd_exec");
 
   // Invoke the chain of lambdas
-  template <typename Data>
-  static RAJA_INLINE void lambda_special(Data &&data)
+  template<typename Data>
+  static RAJA_INLINE void lambda_special(Data&& data)
   {
 
     // Execute this Lambda
@@ -89,41 +92,44 @@ struct Invoke_all_Lambda<Types, Statement, StatementRest...> {
   }
 };
 
-
 /*!
  * RAJA::kernel forall_impl executor specialization for statement::For.
  * Assumptions: RAJA::simd_exec is the inner most policy,
  * only one lambda is used, no reductions are done within the lambda.
  * Assigns the loop index to offset ArgumentId
  */
-template <camp::idx_t ArgumentId, typename... EnclosedStmts, typename Types>
+template<camp::idx_t ArgumentId, typename... EnclosedStmts, typename Types>
 struct StatementExecutor<
-    statement::For<ArgumentId, RAJA::simd_exec, EnclosedStmts...>, Types> {
+    statement::For<ArgumentId, RAJA::simd_exec, EnclosedStmts...>,
+    Types>
+{
 
-  template <typename Data>
-  static RAJA_INLINE void exec(Data &&data)
+  template<typename Data>
+  static RAJA_INLINE void exec(Data&& data)
   {
 
     // Set the argument type for this loop
     using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
-    auto iter = get<ArgumentId>(data.segment_tuple);
-    auto begin = std::begin(iter);
-    auto end = std::end(iter);
+    auto iter     = get<ArgumentId>(data.segment_tuple);
+    auto begin    = std::begin(iter);
+    auto end      = std::end(iter);
     auto distance = std::distance(begin, end);
 
     RAJA_SIMD
-    for (decltype(distance) i = 0; i < distance; ++i) {
+    for (decltype(distance) i = 0; i < distance; ++i)
+    {
 
       // Privatize data for SIMD correctness reasons
       using RAJA::internal::thread_privatize;
-      auto privatizer = thread_privatize(data);
+      auto privatizer    = thread_privatize(data);
       auto& private_data = privatizer.get_priv();
 
       // Assign offset on privatized data
       private_data.template assign_offset<ArgumentId>(i);
 
-      Invoke_all_Lambda<NewTypes, EnclosedStmts...>::lambda_special(private_data);
+      Invoke_all_Lambda<NewTypes, EnclosedStmts...>::lambda_special(
+          private_data);
     }
   }
 };
diff --git a/include/RAJA/policy/simd/kernel/ForICount.hpp b/include/RAJA/policy/simd/kernel/ForICount.hpp
index 36a169f2bf..ac6e9c542a 100644
--- a/include/RAJA/policy/simd/kernel/ForICount.hpp
+++ b/include/RAJA/policy/simd/kernel/ForICount.hpp
@@ -42,26 +42,31 @@ namespace internal
  * Assigns the loop index to offset ArgumentId
  * Assigns the loop index to param ParamId
  */
-template <camp::idx_t ArgumentId, typename ParamId,
-          typename... EnclosedStmts, typename Types>
+template<camp::idx_t ArgumentId,
+         typename ParamId,
+         typename... EnclosedStmts,
+         typename Types>
 struct StatementExecutor<
-    statement::ForICount<ArgumentId, ParamId, RAJA::simd_exec,
-                         EnclosedStmts...>, Types> {
+    statement::
+        ForICount<ArgumentId, ParamId, RAJA::simd_exec, EnclosedStmts...>,
+    Types>
+{
 
-  template <typename Data>
-  static RAJA_INLINE void exec(Data &&data)
+  template<typename Data>
+  static RAJA_INLINE void exec(Data&& data)
   {
 
     // Set the argument type for this loop
     using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
-    auto iter = get<ArgumentId>(data.segment_tuple);
-    auto begin = std::begin(iter);
-    auto end = std::end(iter);
+    auto iter     = get<ArgumentId>(data.segment_tuple);
+    auto begin    = std::begin(iter);
+    auto end      = std::end(iter);
     auto distance = std::distance(begin, end);
 
     RAJA_SIMD
-    for (decltype(distance) i = 0; i < distance; ++i) {
+    for (decltype(distance) i = 0; i < distance; ++i)
+    {
 
       // Offsets and parameters need to be privatized
       data.template assign_offset<ArgumentId>(i);
@@ -69,10 +74,11 @@ struct StatementExecutor<
 
       // Privatize data for SIMD correctness reasons
       using RAJA::internal::thread_privatize;
-      auto privatizer = thread_privatize(data);
+      auto privatizer    = thread_privatize(data);
       auto& private_data = privatizer.get_priv();
 
-      Invoke_all_Lambda<NewTypes, EnclosedStmts...>::lambda_special(private_data);
+      Invoke_all_Lambda<NewTypes, EnclosedStmts...>::lambda_special(
+          private_data);
     }
   }
 };
@@ -81,4 +87,4 @@ struct StatementExecutor<
 }  // end namespace RAJA
 
 
-#endif 
+#endif
diff --git a/include/RAJA/policy/simd/launch.hpp b/include/RAJA/policy/simd/launch.hpp
index 1f8ba01ab3..d6c2b741e6 100644
--- a/include/RAJA/policy/simd/launch.hpp
+++ b/include/RAJA/policy/simd/launch.hpp
@@ -21,41 +21,44 @@
 #include "RAJA/pattern/launch/launch_core.hpp"
 #include "RAJA/policy/simd/policy.hpp"
 
-
 namespace RAJA
 {
 
-template <typename SEGMENT>
-struct LoopExecute<simd_exec, SEGMENT> {
+template<typename SEGMENT>
+struct LoopExecute<simd_exec, SEGMENT>
+{
 
-  template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_HOST_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+      SEGMENT const& segment,
+      BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
     RAJA_SIMD
-    for (int i = 0; i < len; i++) {
+    for (int i = 0; i < len; i++)
+    {
       body(*(segment.begin() + i));
     }
   }
 };
 
-template <typename SEGMENT>
-struct LoopICountExecute<simd_exec, SEGMENT> {
+template<typename SEGMENT>
+struct LoopICountExecute<simd_exec, SEGMENT>
+{
 
-  template <typename BODY>
+  template<typename BODY>
   static RAJA_INLINE RAJA_HOST_DEVICE void exec(
       LaunchContext const RAJA_UNUSED_ARG(&ctx),
-      SEGMENT const &segment,
-      BODY const &body)
+      SEGMENT const& segment,
+      BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
     RAJA_SIMD
-    for (int i = 0; i < len; i++) {
+    for (int i = 0; i < len; i++)
+    {
       body(*(segment.begin() + i), i);
     }
   }
diff --git a/include/RAJA/policy/simd/policy.hpp b/include/RAJA/policy/simd/policy.hpp
index a85811163f..fac158a36b 100644
--- a/include/RAJA/policy/simd/policy.hpp
+++ b/include/RAJA/policy/simd/policy.hpp
@@ -41,8 +41,8 @@ namespace simd
 struct simd_exec : make_policy_pattern_launch_platform_t<Policy::sequential,
                                                          Pattern::forall,
                                                          Launch::undefined,
-                                                         Platform::host> {
-};
+                                                         Platform::host>
+{};
 
 }  // end of namespace simd
 
diff --git a/include/RAJA/policy/sycl.hpp b/include/RAJA/policy/sycl.hpp
index 491e39910c..81f16d4918 100644
--- a/include/RAJA/policy/sycl.hpp
+++ b/include/RAJA/policy/sycl.hpp
@@ -29,13 +29,13 @@
 #include "RAJA/policy/sycl/forall.hpp"
 #include "RAJA/policy/sycl/policy.hpp"
 #include "RAJA/policy/sycl/reduce.hpp"
-//#include "RAJA/policy/sycl/multi_reduce.hpp"
-//#include "RAJA/policy/sycl/scan.hpp"
-//#include "RAJA/policy/sycl/sort.hpp"
+// #include "RAJA/policy/sycl/multi_reduce.hpp"
+// #include "RAJA/policy/sycl/scan.hpp"
+// #include "RAJA/policy/sycl/sort.hpp"
 #include "RAJA/policy/sycl/kernel.hpp"
-//#include "RAJA/policy/sycl/synchronize.hpp"
+// #include "RAJA/policy/sycl/synchronize.hpp"
 #include "RAJA/policy/sycl/launch.hpp"
-//#include "RAJA/policy/sycl/WorkGroup.hpp"
+// #include "RAJA/policy/sycl/WorkGroup.hpp"
 
 #endif  // closing endif for if defined(RAJA_ENABLE_SYCL)
 
diff --git a/include/RAJA/policy/sycl/MemUtils_SYCL.hpp b/include/RAJA/policy/sycl/MemUtils_SYCL.hpp
index 081a88dc23..e1c6cbc884 100644
--- a/include/RAJA/policy/sycl/MemUtils_SYCL.hpp
+++ b/include/RAJA/policy/sycl/MemUtils_SYCL.hpp
@@ -47,10 +47,11 @@ namespace detail
 {
 
 //! struct containing data necessary to coordinate kernel launches with reducers
-struct syclInfo {
-  sycl_dim_t gridDim{0};
-  sycl_dim_t blockDim{0};
-  cl::sycl::queue qu = cl::sycl::queue();
+struct syclInfo
+{
+  sycl_dim_t gridDim {0};
+  sycl_dim_t blockDim {0};
+  cl::sycl::queue qu  = cl::sycl::queue();
   bool setup_reducers = false;
 #if defined(RAJA_ENABLE_OPENMP)
   syclInfo* thread_states = nullptr;
@@ -67,14 +68,15 @@ extern std::unordered_map<cl::sycl::queue, bool> g_queue_info_map;
 }  // namespace detail
 
 //! Allocator for pinned memory for use in basic_mempool
-struct PinnedAllocator {
+struct PinnedAllocator
+{
 
   // returns a valid pointer on success, nullptr on failure
   void* malloc(size_t nbytes)
   {
     void* ptr;
     ::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();
-    ptr = ::sycl::malloc_host(nbytes, *q);
+    ptr              = ::sycl::malloc_host(nbytes, *q);
     return ptr;
   }
 
@@ -89,14 +91,15 @@ struct PinnedAllocator {
 };
 
 //! Allocator for device memory for use in basic_mempool
-struct DeviceAllocator {
+struct DeviceAllocator
+{
 
   // returns a valid pointer on success, nullptr on failure
   void* malloc(size_t nbytes)
   {
     void* ptr;
     ::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();
-    ptr = ::sycl::malloc_device(nbytes, *q);
+    ptr              = ::sycl::malloc_device(nbytes, *q);
     return ptr;
   }
 
@@ -112,14 +115,15 @@ struct DeviceAllocator {
 
 //! Allocator for pre-zeroed device memory for use in basic_mempool
 //  Note: Memory must be zero when returned to mempool
-struct DeviceZeroedAllocator {
+struct DeviceZeroedAllocator
+{
 
   // returns a valid pointer on success, nullptr on failure
   void* malloc(size_t nbytes)
   {
     void* ptr;
     ::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();
-    ptr = ::sycl::malloc_device(nbytes, *q);
+    ptr              = ::sycl::malloc_device(nbytes, *q);
     q->memset(ptr, 0, nbytes);
     return ptr;
   }
@@ -146,4 +150,3 @@ using pinned_mempool_type = basic_mempool::MemPool<PinnedAllocator>;
 #endif  // closing endif for RAJA_ENABLE_SYCL
 
 #endif  // closing endif for header file include guard
-
diff --git a/include/RAJA/policy/sycl/forall.hpp b/include/RAJA/policy/sycl/forall.hpp
index 901cc694f0..5b6f99b3df 100644
--- a/include/RAJA/policy/sycl/forall.hpp
+++ b/include/RAJA/policy/sycl/forall.hpp
@@ -76,7 +76,6 @@ ::sycl::range<1> getGridDim(size_t len, size_t block_size)
 
 }  // namespace impl
 
-
 //
 ////////////////////////////////////////////////////////////////////////
 //
@@ -85,14 +84,18 @@ ::sycl::range<1> getGridDim(size_t len, size_t block_size)
 ////////////////////////////////////////////////////////////////////////
 //
 
-template <typename Iterable, typename LoopBody, size_t BlockSize, bool Async, typename ForallParam,
-          typename std::enable_if<std::is_trivially_copyable<LoopBody>{},bool>::type = true>
-RAJA_INLINE
-concepts::enable_if_t<
-  resources::EventProxy<resources::Sycl>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
-forall_impl(resources::Sycl &sycl_res,
+template<typename Iterable,
+         typename LoopBody,
+         size_t BlockSize,
+         bool Async,
+         typename ForallParam,
+         typename std::enable_if<std::is_trivially_copyable<LoopBody> {},
+                                 bool>::type = true>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Sycl>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>
+forall_impl(resources::Sycl& sycl_res,
             sycl_exec<BlockSize, Async>,
             Iterable&& iter,
             LoopBody&& loop_body,
@@ -101,17 +104,19 @@ forall_impl(resources::Sycl &sycl_res,
 
   using Iterator  = camp::decay<decltype(std::begin(iter))>;
   using LOOP_BODY = camp::decay<LoopBody>;
-  using IndexType = camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
+  using IndexType =
+      camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
 
   //
   // Compute the requested iteration space size
   //
   Iterator begin = std::begin(iter);
-  Iterator end = std::end(iter);
-  IndexType len = std::distance(begin, end);
+  Iterator end   = std::end(iter);
+  IndexType len  = std::distance(begin, end);
 
   // Only launch kernel if we have something to iterate over
-  if (len > 0 && BlockSize > 0) {
+  if (len > 0 && BlockSize > 0)
+  {
     // Note: We could fix an incorrect workgroup size.
     //       It would change what was specified.
     //       For now, leave the device compiler to error with invalid WG size.
@@ -119,51 +124,60 @@ forall_impl(resources::Sycl &sycl_res,
     //
     // Compute the number of blocks
     //
-    sycl_dim_t blockSize{BlockSize};
+    sycl_dim_t blockSize {BlockSize};
     sycl_dim_t gridSize = impl::getGridDim(static_cast<size_t>(len), BlockSize);
 
     ::sycl::queue* q = sycl_res.get_queue();
 
     q->submit([&](::sycl::handler& h) {
-
-      h.parallel_for( ::sycl::nd_range<1>{gridSize, blockSize},
-                      [=]  (::sycl::nd_item<1> it) {
-
-        IndexType ii = it.get_global_id(0);
-        if (ii < len) {
-          loop_body(begin[ii]);
-        }
-      });
+      h.parallel_for(::sycl::nd_range<1> {gridSize, blockSize},
+                     [=](::sycl::nd_item<1> it) {
+                       IndexType ii = it.get_global_id(0);
+                       if (ii < len)
+                       {
+                         loop_body(begin[ii]);
+                       }
+                     });
     });
 
-    if (!Async) { q->wait(); }
+    if (!Async)
+    {
+      q->wait();
+    }
   }
 
   return resources::EventProxy<resources::Sycl>(sycl_res);
 }
 
-template <typename Iterable, typename LoopBody, size_t BlockSize, bool Async, typename ForallParam,
-          typename std::enable_if<!std::is_trivially_copyable<LoopBody>{},bool>::type = true>
-RAJA_INLINE 
-resources::EventProxy<resources::Sycl> forall_impl(resources::Sycl &sycl_res,
-            sycl_exec<BlockSize, Async>,
-            Iterable&& iter,
-            LoopBody&& loop_body,
-            ForallParam)
+template<typename Iterable,
+         typename LoopBody,
+         size_t BlockSize,
+         bool Async,
+         typename ForallParam,
+         typename std::enable_if<!std::is_trivially_copyable<LoopBody> {},
+                                 bool>::type = true>
+RAJA_INLINE resources::EventProxy<resources::Sycl> forall_impl(
+    resources::Sycl& sycl_res,
+    sycl_exec<BlockSize, Async>,
+    Iterable&& iter,
+    LoopBody&& loop_body,
+    ForallParam)
 {
   using Iterator  = camp::decay<decltype(std::begin(iter))>;
   using LOOP_BODY = camp::decay<LoopBody>;
-  using IndexType = camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
+  using IndexType =
+      camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
 
   //
   // Compute the requested iteration space size
   //
   Iterator begin = std::begin(iter);
-  Iterator end = std::end(iter);
-  IndexType len = std::distance(begin, end);
+  Iterator end   = std::end(iter);
+  IndexType len  = std::distance(begin, end);
 
   // Only launch kernel if we have something to iterate over
-  if (len > 0 && BlockSize > 0) {
+  if (len > 0 && BlockSize > 0)
+  {
 
     // Note: We could fix an incorrect workgroup size.
     //       It would change what was specified.
@@ -172,7 +186,7 @@ resources::EventProxy<resources::Sycl> forall_impl(resources::Sycl &sycl_res,
     //
     // Compute the number of blocks
     //
-    sycl_dim_t blockSize{BlockSize};
+    sycl_dim_t blockSize {BlockSize};
     sycl_dim_t gridSize = impl::getGridDim(static_cast<size_t>(len), BlockSize);
 
     ::sycl::queue* q = sycl_res.get_queue();
@@ -186,24 +200,23 @@ resources::EventProxy<resources::Sycl> forall_impl(resources::Sycl &sycl_res,
     // Kernel body is nontrivially copyable, create space on device and copy to
     // Workaround until "is_device_copyable" is supported
     //
-    lbody = (LOOP_BODY*) ::sycl::malloc_device(sizeof(LOOP_BODY), *q);
+    lbody = (LOOP_BODY*)::sycl::malloc_device(sizeof(LOOP_BODY), *q);
     q->memcpy(lbody, &loop_body, sizeof(LOOP_BODY)).wait();
 
-    beg = (Iterator*) ::sycl::malloc_device(sizeof(Iterator), *q);
+    beg = (Iterator*)::sycl::malloc_device(sizeof(Iterator), *q);
     q->memcpy(beg, &begin, sizeof(Iterator)).wait();
 
     q->submit([&](::sycl::handler& h) {
+       h.parallel_for(::sycl::nd_range<1> {gridSize, blockSize},
+                      [=](::sycl::nd_item<1> it) {
+                        Index_type ii = it.get_global_id(0);
 
-      h.parallel_for( ::sycl::nd_range<1>{gridSize, blockSize},
-                      [=]  (::sycl::nd_item<1> it) {
-
-        Index_type ii = it.get_global_id(0);
-
-        if (ii < len) {
-          (*lbody)((*beg)[ii]);
-        }
-      });
-    }).wait(); // Need to wait for completion to free memory
+                        if (ii < len)
+                        {
+                          (*lbody)((*beg)[ii]);
+                        }
+                      });
+     }).wait();  // Need to wait for completion to free memory
 
     // Free our device memory
     cl::sycl::free(lbody, *q);
@@ -215,14 +228,19 @@ resources::EventProxy<resources::Sycl> forall_impl(resources::Sycl &sycl_res,
   return resources::EventProxy<resources::Sycl>(sycl_res);
 }
 
-template <typename Iterable, typename LoopBody, size_t BlockSize, bool Async, typename ForallParam,
-          typename std::enable_if<std::is_trivially_copyable<LoopBody>{},bool>::type = true>
-RAJA_INLINE
-concepts::enable_if_t< 
-  resources::EventProxy<resources::Sycl>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>> >
-forall_impl(resources::Sycl &sycl_res,
+template<typename Iterable,
+         typename LoopBody,
+         size_t BlockSize,
+         bool Async,
+         typename ForallParam,
+         typename std::enable_if<std::is_trivially_copyable<LoopBody> {},
+                                 bool>::type = true>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Sycl>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    concepts::negate<
+        RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>>
+forall_impl(resources::Sycl& sycl_res,
             sycl_exec<BlockSize, Async>,
             Iterable&& iter,
             LoopBody&& loop_body,
@@ -231,70 +249,75 @@ forall_impl(resources::Sycl &sycl_res,
 {
   using Iterator  = camp::decay<decltype(std::begin(iter))>;
   using LOOP_BODY = camp::decay<LoopBody>;
-  using IndexType = camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
+  using IndexType =
+      camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
   using EXEC_POL = RAJA::sycl_exec<BlockSize, Async>;
   //
   // Compute the requested iteration space size
   //
   Iterator begin = std::begin(iter);
-  Iterator end = std::end(iter);
-  IndexType len = std::distance(begin, end);
+  Iterator end   = std::end(iter);
+  IndexType len  = std::distance(begin, end);
 
   RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
 
   // Only launch kernel if we have something to iterate over
-  if (len > 0 && BlockSize > 0) {
+  if (len > 0 && BlockSize > 0)
+  {
 
     //
     // Compute the number of blocks
     //
-    sycl_dim_t blockSize{BlockSize};
+    sycl_dim_t blockSize {BlockSize};
     sycl_dim_t gridSize = impl::getGridDim(static_cast<size_t>(len), BlockSize);
 
     ::sycl::queue* q = sycl_res.get_queue();
 
-    auto combiner = []( ForallParam x, ForallParam y ) {
-      RAJA::expt::ParamMultiplexer::combine<EXEC_POL>( x, y );
+    auto combiner = [](ForallParam x, ForallParam y) {
+      RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(x, y);
       return x;
     };
 
-    ForallParam* res = ::sycl::malloc_shared<ForallParam>(1,*q);
+    ForallParam* res = ::sycl::malloc_shared<ForallParam>(1, *q);
     RAJA::expt::ParamMultiplexer::init<EXEC_POL>(*res);
     auto reduction = ::sycl::reduction(res, f_params, combiner);
 
     q->submit([&](::sycl::handler& h) {
-      h.parallel_for( ::sycl::range<1>(len),
-                      reduction,
-                      [=]   (::sycl::item<1> it, auto & red)  {
-
-        ForallParam fp;
-	RAJA::expt::ParamMultiplexer::init<EXEC_POL>(fp);
-        IndexType ii = it.get_id(0);
-        if (ii < len) {
-          RAJA::expt::invoke_body(fp, loop_body, begin[ii]);
-        }
-        red.combine(fp);
-      });
+      h.parallel_for(::sycl::range<1>(len), reduction,
+                     [=](::sycl::item<1> it, auto& red) {
+                       ForallParam fp;
+                       RAJA::expt::ParamMultiplexer::init<EXEC_POL>(fp);
+                       IndexType ii = it.get_id(0);
+                       if (ii < len)
+                       {
+                         RAJA::expt::invoke_body(fp, loop_body, begin[ii]);
+                       }
+                       red.combine(fp);
+                     });
     });
 
     q->wait();
-    RAJA::expt::ParamMultiplexer::combine<EXEC_POL>( f_params, *res );
+    RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(f_params, *res);
     ::sycl::free(res, *q);
   }
   RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
 
   return resources::EventProxy<resources::Sycl>(sycl_res);
-
 }
 
-template <typename Iterable, typename LoopBody, size_t BlockSize, bool Async, typename ForallParam,
-          typename std::enable_if<!std::is_trivially_copyable<LoopBody>{},bool>::type = true>
-RAJA_INLINE
-concepts::enable_if_t< 
-  resources::EventProxy<resources::Sycl>,
-  RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
-  concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>> >
-forall_impl(resources::Sycl &sycl_res,
+template<typename Iterable,
+         typename LoopBody,
+         size_t BlockSize,
+         bool Async,
+         typename ForallParam,
+         typename std::enable_if<!std::is_trivially_copyable<LoopBody> {},
+                                 bool>::type = true>
+RAJA_INLINE concepts::enable_if_t<
+    resources::EventProxy<resources::Sycl>,
+    RAJA::expt::type_traits::is_ForallParamPack<ForallParam>,
+    concepts::negate<
+        RAJA::expt::type_traits::is_ForallParamPack_empty<ForallParam>>>
+forall_impl(resources::Sycl& sycl_res,
             sycl_exec<BlockSize, Async>,
             Iterable&& iter,
             LoopBody&& loop_body,
@@ -303,29 +326,31 @@ forall_impl(resources::Sycl &sycl_res,
 {
   using Iterator  = camp::decay<decltype(std::begin(iter))>;
   using LOOP_BODY = camp::decay<LoopBody>;
-  using IndexType = camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
+  using IndexType =
+      camp::decay<decltype(std::distance(std::begin(iter), std::end(iter)))>;
   using EXEC_POL = RAJA::sycl_exec<BlockSize, Async>;
   //
   // Compute the requested iteration space size
   //
   Iterator begin = std::begin(iter);
-  Iterator end = std::end(iter);
-  IndexType len = std::distance(begin, end);
+  Iterator end   = std::end(iter);
+  IndexType len  = std::distance(begin, end);
 
   RAJA::expt::ParamMultiplexer::init<EXEC_POL>(f_params);
 
   // Only launch kernel if we have something to iterate over
-  if (len > 0 && BlockSize > 0) {
+  if (len > 0 && BlockSize > 0)
+  {
     //
     // Compute the number of blocks
     //
-    sycl_dim_t blockSize{BlockSize};
+    sycl_dim_t blockSize {BlockSize};
     sycl_dim_t gridSize = impl::getGridDim(static_cast<size_t>(len), BlockSize);
 
     ::sycl::queue* q = sycl_res.get_queue();
 
-    auto combiner = []( ForallParam x, ForallParam y ) {
-      RAJA::expt::ParamMultiplexer::combine<EXEC_POL>( x, y );
+    auto combiner = [](ForallParam x, ForallParam y) {
+      RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(x, y);
       return x;
     };
 
@@ -339,48 +364,42 @@ forall_impl(resources::Sycl &sycl_res,
     // Kernel body is nontrivially copyable, create space on device and copy to
     // Workaround until "is_device_copyable" is supported
     //
-    lbody = (LOOP_BODY*) ::sycl::malloc_device(sizeof(LOOP_BODY), *q);
+    lbody = (LOOP_BODY*)::sycl::malloc_device(sizeof(LOOP_BODY), *q);
     q->memcpy(lbody, &loop_body, sizeof(LOOP_BODY)).wait();
 
-    beg = (Iterator*) ::sycl::malloc_device(sizeof(Iterator), *q);
+    beg = (Iterator*)::sycl::malloc_device(sizeof(Iterator), *q);
     q->memcpy(beg, &begin, sizeof(Iterator)).wait();
 
-    ForallParam* res = ::sycl::malloc_shared<ForallParam>(1,*q);
+    ForallParam* res = ::sycl::malloc_shared<ForallParam>(1, *q);
     RAJA::expt::ParamMultiplexer::init<EXEC_POL>(*res);
     auto reduction = ::sycl::reduction(res, f_params, combiner);
 
     q->submit([&](::sycl::handler& h) {
-      h.parallel_for( ::sycl::range<1>(len),
-                      reduction,
-                      [=]   (::sycl::item<1> it, auto & red)  {
-
-
-        Index_type ii = it.get_id(0);
-        ForallParam fp;
-	RAJA::expt::ParamMultiplexer::init<EXEC_POL>(fp);
-        if (ii < len) {
-          RAJA::expt::invoke_body(fp, *lbody, (*beg)[ii]);
-        }
-        red.combine(fp);
-
-      });
-    }).wait(); // Need to wait for completion to free memory
-    RAJA::expt::ParamMultiplexer::combine<EXEC_POL>( f_params, *res );
+       h.parallel_for(::sycl::range<1>(len), reduction,
+                      [=](::sycl::item<1> it, auto& red) {
+                        Index_type ii = it.get_id(0);
+                        ForallParam fp;
+                        RAJA::expt::ParamMultiplexer::init<EXEC_POL>(fp);
+                        if (ii < len)
+                        {
+                          RAJA::expt::invoke_body(fp, *lbody, (*beg)[ii]);
+                        }
+                        red.combine(fp);
+                      });
+     }).wait();  // Need to wait for completion to free memory
+    RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(f_params, *res);
     // Free our device memory
     ::sycl::free(res, *q);
     ::sycl::free(lbody, *q);
     ::sycl::free(beg, *q);
 
     RAJA_FT_END;
-
   }
   RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(f_params);
 
   return resources::EventProxy<resources::Sycl>(sycl_res);
-
 }
 
-
 //
 //////////////////////////////////////////////////////////////////////
 //
@@ -399,27 +418,27 @@ forall_impl(resources::Sycl &sycl_res,
  *
  ******************************************************************************
  */
-template <typename LoopBody,
-          size_t BlockSize,
-          bool Async,
-          typename... SegmentTypes>
-RAJA_INLINE resources::EventProxy<resources::Sycl> forall_impl(resources::Sycl &r,
-                                                    ExecPolicy<seq_segit, sycl_exec<BlockSize, Async>>,
-                                                    const TypedIndexSet<SegmentTypes...>& iset,
-                                                    LoopBody&& loop_body)
+template<typename LoopBody,
+         size_t BlockSize,
+         bool Async,
+         typename... SegmentTypes>
+RAJA_INLINE resources::EventProxy<resources::Sycl> forall_impl(
+    resources::Sycl& r,
+    ExecPolicy<seq_segit, sycl_exec<BlockSize, Async>>,
+    const TypedIndexSet<SegmentTypes...>& iset,
+    LoopBody&& loop_body)
 {
   int num_seg = iset.getNumSegments();
-  for (int isi = 0; isi < num_seg; ++isi) {
-    iset.segmentCall(r,
-                     isi,
-                     detail::CallForall(),
-                     sycl_exec<BlockSize, true>(),
+  for (int isi = 0; isi < num_seg; ++isi)
+  {
+    iset.segmentCall(r, isi, detail::CallForall(), sycl_exec<BlockSize, true>(),
                      loop_body);
   }  // iterate over segments of index set
 
-  if ( !Async ) {
+  if (!Async)
+  {
     ::sycl::queue* q = r.get_queue();
-    q->wait(); 
+    q->wait();
   }
 
   return resources::EventProxy<resources::Sycl>(r);
diff --git a/include/RAJA/policy/sycl/kernel.hpp b/include/RAJA/policy/sycl/kernel.hpp
index 641c3a9ef3..803bcd49e0 100644
--- a/include/RAJA/policy/sycl/kernel.hpp
+++ b/include/RAJA/policy/sycl/kernel.hpp
@@ -23,11 +23,11 @@
 #include "RAJA/policy/sycl/kernel/SyclKernel.hpp"
 #include "RAJA/policy/sycl/kernel/For.hpp"
 #include "RAJA/policy/sycl/kernel/ForICount.hpp"
-//#include "RAJA/policy/sycl/kernel/Hyperplane.hpp"
-//#include "RAJA/policy/sycl/kernel/InitLocalMem.hpp"
+// #include "RAJA/policy/sycl/kernel/Hyperplane.hpp"
+// #include "RAJA/policy/sycl/kernel/InitLocalMem.hpp"
 #include "RAJA/policy/sycl/kernel/Lambda.hpp"
-//#include "RAJA/policy/sycl/kernel/Reduce.hpp"
-//#include "RAJA/policy/sycl/kernel/Sync.hpp"
+// #include "RAJA/policy/sycl/kernel/Reduce.hpp"
+// #include "RAJA/policy/sycl/kernel/Sync.hpp"
 #include "RAJA/policy/sycl/kernel/Tile.hpp"
 #include "RAJA/policy/sycl/kernel/TileTCount.hpp"
 #include "RAJA/policy/sycl/kernel/internal.hpp"
diff --git a/include/RAJA/policy/sycl/kernel/Conditional.hpp b/include/RAJA/policy/sycl/kernel/Conditional.hpp
index 9149418518..1b832c8b6a 100644
--- a/include/RAJA/policy/sycl/kernel/Conditional.hpp
+++ b/include/RAJA/policy/sycl/kernel/Conditional.hpp
@@ -36,35 +36,31 @@ namespace internal
 {
 
 
-template <typename Data,
-          typename Conditional,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         typename Conditional,
+         typename... EnclosedStmts,
+         typename Types>
 struct SyclStatementExecutor<Data,
                              statement::If<Conditional, EnclosedStmts...>,
-                             Types> {
+                             Types>
+{
 
-  using stmt_list_t = StatementList<EnclosedStmts...>;
+  using stmt_list_t      = StatementList<EnclosedStmts...>;
   using enclosed_stmts_t = SyclStatementListExecutor<Data, stmt_list_t, Types>;
 
-
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data,
+                                      cl::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
-    if (Conditional::eval(data)) {
+    if (Conditional::eval(data))
+    {
 
       // execute enclosed statements
       enclosed_stmts_t::exec(data, item, thread_active);
     }
   }
 
-
-
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     return enclosed_stmts_t::calculateDimensions(data);
   }
diff --git a/include/RAJA/policy/sycl/kernel/For.hpp b/include/RAJA/policy/sycl/kernel/For.hpp
index d0976b931f..36a3fcd4ce 100644
--- a/include/RAJA/policy/sycl/kernel/For.hpp
+++ b/include/RAJA/policy/sycl/kernel/For.hpp
@@ -22,7 +22,6 @@
 
 #include "RAJA/policy/sycl/kernel/internal.hpp"
 
-
 namespace RAJA
 {
 
@@ -37,16 +36,19 @@ namespace internal
  * Mapping directly to indicies
  * Assigns the global index to offset ArgumentId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          int Dim,
-          int Local_Size,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         int Dim,
+         int Local_Size,
+         typename... EnclosedStmts,
+         typename Types>
 struct SyclStatementExecutor<
     Data,
-    statement::For<ArgumentId, RAJA::sycl_global_012<Dim, Local_Size>, EnclosedStmts...>,
-    Types> {
+    statement::For<ArgumentId,
+                   RAJA::sycl_global_012<Dim, Local_Size>,
+                   EnclosedStmts...>,
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -58,38 +60,40 @@ struct SyclStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static
-  inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data,
+                                      cl::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     auto len = segment_length<ArgumentId>(data);
-    auto i = item.get_global_id(Dim);
+    auto i   = item.get_global_id(Dim);
 
-      // Assign the x thread to the argument
-      data.template assign_offset<ArgumentId>(i);
+    // Assign the global id along dimension Dim to the argument
+    data.template assign_offset<ArgumentId>(i);
 
-      // execute enclosed statements
-      enclosed_stmts_t::exec(data, item, thread_active && (i<len));
+    // execute enclosed statements
+    enclosed_stmts_t::exec(data, item, thread_active && (i < len));
   }
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     auto len = segment_length<ArgumentId>(data);
 
     // Set Global Space for Dimension and Local Size
     LaunchDims dims;
-    if (Dim == 0) {
+    if (Dim == 0)
+    {
       dims.global.x = len;
-      dims.local.x = Local_Size;
+      dims.local.x  = Local_Size;
     }
-    if (Dim == 1) {
+    if (Dim == 1)
+    {
       dims.global.y = len;
-      dims.local.y = Local_Size;
+      dims.local.y  = Local_Size;
     }
-    if (Dim == 2) {
+    if (Dim == 2)
+    {
       dims.global.z = len;
-      dims.local.z = Local_Size;
+      dims.local.z  = Local_Size;
     }
 
     // combine with enclosed statements
@@ -103,15 +107,17 @@ struct SyclStatementExecutor<
  * Mapping directly to indicies
  * Assigns the loop index to offset ArgumentId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          int Dim,
-          typename... EnclosedStmts,
-          typename Types>
-struct SyclStatementExecutor<
-    Data,
-    statement::For<ArgumentId, RAJA::sycl_group_012_direct<Dim>, EnclosedStmts...>,
-    Types> {
+template<typename Data,
+         camp::idx_t ArgumentId,
+         int Dim,
+         typename... EnclosedStmts,
+         typename Types>
+struct SyclStatementExecutor<Data,
+                             statement::For<ArgumentId,
+                                            RAJA::sycl_group_012_direct<Dim>,
+                                            EnclosedStmts...>,
+                             Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -123,34 +129,36 @@ struct SyclStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static
-  inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data,
+                                      cl::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     auto len = segment_length<ArgumentId>(data);
-    auto i = item.get_group(Dim);
+    auto i   = item.get_group(Dim);
 
-      // Assign the x thread to the argument
-      data.template assign_offset<ArgumentId>(i);
+    // Assign the group id along dimension Dim to the argument
+    data.template assign_offset<ArgumentId>(i);
 
-      // execute enclosed statements
-      enclosed_stmts_t::exec(data, item, thread_active && (i<len));
+    // execute enclosed statements
+    enclosed_stmts_t::exec(data, item, thread_active && (i < len));
   }
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     auto len = segment_length<ArgumentId>(data);
 
     // request one block per element in the segment
     LaunchDims dims;
-    if (Dim == 0) {
+    if (Dim == 0)
+    {
       dims.group.x = len;
     }
-    if (Dim == 1) {
+    if (Dim == 1)
+    {
       dims.group.y = len;
     }
-    if (Dim == 2) {
+    if (Dim == 2)
+    {
       dims.group.z = len;
     }
 
@@ -166,15 +174,17 @@ struct SyclStatementExecutor<
  * each group in dims.
  * Assigns the loop index to offset ArgumentId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          int Dim,
-          typename... EnclosedStmts,
-          typename Types>
-struct SyclStatementExecutor<
-    Data,
-    statement::For<ArgumentId, RAJA::sycl_group_012_loop<Dim>, EnclosedStmts...>,
-    Types> {
+template<typename Data,
+         camp::idx_t ArgumentId,
+         int Dim,
+         typename... EnclosedStmts,
+         typename Types>
+struct SyclStatementExecutor<Data,
+                             statement::For<ArgumentId,
+                                            RAJA::sycl_group_012_loop<Dim>,
+                                            EnclosedStmts...>,
+                             Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -186,14 +196,16 @@ struct SyclStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static
-  inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data,
+                                      cl::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
-    auto len = segment_length<ArgumentId>(data);
-    auto i0 = item.get_group(Dim);
+    auto len      = segment_length<ArgumentId>(data);
+    auto i0       = item.get_group(Dim);
     auto i_stride = item.get_group_range(Dim);
 
-    for(auto i = i0;i < len;i += i_stride){
+    for (auto i = i0; i < len; i += i_stride)
+    {
 
       // Assign the x thread to the argument
       data.template assign_offset<ArgumentId>(i);
@@ -203,21 +215,22 @@ struct SyclStatementExecutor<
     }
   }
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     auto len = segment_length<ArgumentId>(data);
 
     // request one block per element in the segment
     LaunchDims dims;
-    if (Dim == 0) {
+    if (Dim == 0)
+    {
       dims.group.x = len;
-    } 
-    if (Dim == 1) {
+    }
+    if (Dim == 1)
+    {
       dims.group.y = len;
     }
-    if (Dim == 2) {
+    if (Dim == 2)
+    {
       dims.group.z = len;
     }
 
@@ -232,15 +245,17 @@ struct SyclStatementExecutor<
  * Mapping directly to indicies
  * Assigns the loop index to offset ArgumentId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          int Dim,
-          typename... EnclosedStmts,
-          typename Types>
-struct SyclStatementExecutor<
-    Data,
-    statement::For<ArgumentId, RAJA::sycl_local_012_direct<Dim>, EnclosedStmts...>,
-    Types> {
+template<typename Data,
+         camp::idx_t ArgumentId,
+         int Dim,
+         typename... EnclosedStmts,
+         typename Types>
+struct SyclStatementExecutor<Data,
+                             statement::For<ArgumentId,
+                                            RAJA::sycl_local_012_direct<Dim>,
+                                            EnclosedStmts...>,
+                             Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -252,35 +267,36 @@ struct SyclStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static
-  inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data,
+                                      cl::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     auto len = segment_length<ArgumentId>(data);
-    auto i = item.get_local_id(Dim);
+    auto i   = item.get_local_id(Dim);
 
     // assign thread id directly to offset
     data.template assign_offset<ArgumentId>(i);
 
     // execute enclosed statements if in bounds
-    enclosed_stmts_t::exec(data, item, thread_active && (i<len));
-
+    enclosed_stmts_t::exec(data, item, thread_active && (i < len));
   }
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     auto len = segment_length<ArgumentId>(data);
 
     // request one block per element in the segment
     LaunchDims dims;
-    if (Dim == 0) {
+    if (Dim == 0)
+    {
       dims.local.x = len;
     }
-    if (Dim == 1) {
+    if (Dim == 1)
+    {
       dims.local.y = len;
     }
-    if (Dim == 2) {
+    if (Dim == 2)
+    {
       dims.local.z = len;
     }
 
@@ -296,15 +312,17 @@ struct SyclStatementExecutor<
  * for each item in dim.
  * Assigns the loop index to offset ArgumentId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          int Dim,
-          typename... EnclosedStmts,
-          typename Types>
-struct SyclStatementExecutor<
-    Data,
-    statement::For<ArgumentId, RAJA::sycl_local_012_loop<Dim>, EnclosedStmts...>,
-    Types> {
+template<typename Data,
+         camp::idx_t ArgumentId,
+         int Dim,
+         typename... EnclosedStmts,
+         typename Types>
+struct SyclStatementExecutor<Data,
+                             statement::For<ArgumentId,
+                                            RAJA::sycl_local_012_loop<Dim>,
+                                            EnclosedStmts...>,
+                             Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -316,15 +334,17 @@ struct SyclStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static
-  inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data,
+                                      cl::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
-    auto len = segment_length<ArgumentId>(data);
-    auto i0 = item.get_local_id(Dim);
+    auto len      = segment_length<ArgumentId>(data);
+    auto i0       = item.get_local_id(Dim);
     auto i_stride = item.get_local_range(Dim);
-    auto i = i0;
+    auto i        = i0;
 
-    for(; i < len;i += i_stride){
+    for (; i < len; i += i_stride)
+    {
 
       // Assign the x thread to the argument
       data.template assign_offset<ArgumentId>(i);
@@ -333,7 +353,7 @@ struct SyclStatementExecutor<
       enclosed_stmts_t::exec(data, item, thread_active);
     }
     // do we need one more masked iteration?
-    if(i - i0 < len)
+    if (i - i0 < len)
     {
       // execute enclosed statements one more time, but masking them off
       // this is because there's at least one thread that isn't masked off
@@ -342,21 +362,22 @@ struct SyclStatementExecutor<
     }
   }
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     auto len = segment_length<ArgumentId>(data);
 
     // request one block per element in the segment
     LaunchDims dims;
-    if (Dim == 0) {
+    if (Dim == 0)
+    {
       dims.local.x = len;
     }
-    if (Dim == 1) {
+    if (Dim == 1)
+    {
       dims.local.y = len;
     }
-    if (Dim == 2) {
+    if (Dim == 2)
+    {
       dims.local.z = len;
     }
 
@@ -366,21 +387,21 @@ struct SyclStatementExecutor<
   }
 };
 
-
 /*
  * Executor for block work sharing inside SyclKernel.
  * Mapping directly to indicies
  * Assigns the loop index to offset ArgumentId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          int Local_Size,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         int Local_Size,
+         typename... EnclosedStmts,
+         typename Types>
 struct SyclStatementExecutor<
     Data,
     statement::For<ArgumentId, RAJA::sycl_exec<Local_Size>, EnclosedStmts...>,
-    Types> {
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -392,13 +413,13 @@ struct SyclStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static
-  inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item)
+  static inline RAJA_DEVICE void exec(Data& data, cl::sycl::nd_item<3> item)
   {
     auto len = segment_length<ArgumentId>(data);
-    auto i = item.get_global_id(0);
+    auto i   = item.get_global_id(0);
 
-    if (i < len) {
+    if (i < len)
+    {
 
       // Assign the x thread to the argument
       data.template assign_offset<ArgumentId>(i);
@@ -408,16 +429,13 @@ struct SyclStatementExecutor<
     }
   }
 
-
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     auto len = segment_length<ArgumentId>(data);
 
     // request one block per element in the segment
     LaunchDims dims;
-    dims.local.x = Local_Size;
+    dims.local.x  = Local_Size;
     dims.global.x = len;
 
     // combine with enclosed statements
@@ -432,14 +450,15 @@ struct SyclStatementExecutor<
  * This is specialized since it need to execute the loop immediately.
  * Assigns the loop index to offset ArgumentId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename... EnclosedStmts,
+         typename Types>
 struct SyclStatementExecutor<
     Data,
     statement::For<ArgumentId, seq_exec, EnclosedStmts...>,
-    Types> {
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -451,17 +470,18 @@ struct SyclStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data,
+                                      cl::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
 
-    using idx_type = camp::decay<decltype(camp::get<ArgumentId>(data.offset_tuple))>;
+    using idx_type =
+        camp::decay<decltype(camp::get<ArgumentId>(data.offset_tuple))>;
 
     idx_type len = segment_length<ArgumentId>(data);
 
-    for(idx_type i = 0;i < len;++ i){
+    for (idx_type i = 0; i < len; ++i)
+    {
       // Assign i to the argument
       data.template assign_offset<ArgumentId>(i);
 
@@ -470,9 +490,7 @@ struct SyclStatementExecutor<
     }
   }
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     return enclosed_stmts_t::calculateDimensions(data);
   }
@@ -483,4 +501,4 @@ struct SyclStatementExecutor<
 }  // end namespace RAJA
 
 
-#endif 
+#endif
diff --git a/include/RAJA/policy/sycl/kernel/ForICount.hpp b/include/RAJA/policy/sycl/kernel/ForICount.hpp
index 9c25bb0ab9..9d276b6dac 100644
--- a/include/RAJA/policy/sycl/kernel/ForICount.hpp
+++ b/include/RAJA/policy/sycl/kernel/ForICount.hpp
@@ -23,7 +23,6 @@
 
 #include "RAJA/policy/sycl/kernel/internal.hpp"
 
-
 namespace RAJA
 {
 
@@ -31,166 +30,177 @@ namespace internal
 {
 
 
-
 /*
  * Executor for local work sharing loop inside SyclKernel.
  * Mapping directly from local id to indices
  * Assigns the loop iterate to offset ArgumentId
  * Assigns the loop count to param ParamId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename ParamId,
-          int ThreadDim,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename ParamId,
+         int ThreadDim,
+         typename... EnclosedStmts,
+         typename Types>
 struct SyclStatementExecutor<
     Data,
-    statement::ForICount<ArgumentId, ParamId, RAJA::sycl_local_012_direct<ThreadDim>, EnclosedStmts...>,
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::sycl_local_012_direct<ThreadDim>,
+                         EnclosedStmts...>,
     Types>
     : public SyclStatementExecutor<
-        Data,
-        statement::For<ArgumentId, RAJA::sycl_local_012_direct<ThreadDim>, EnclosedStmts...>, Types> {
+          Data,
+          statement::For<ArgumentId,
+                         RAJA::sycl_local_012_direct<ThreadDim>,
+                         EnclosedStmts...>,
+          Types>
+{
 
   using Base = SyclStatementExecutor<
-        Data,
-        statement::For<ArgumentId, RAJA::sycl_local_012_direct<ThreadDim>, EnclosedStmts...>,
-        Types>;
+      Data,
+      statement::For<ArgumentId,
+                     RAJA::sycl_local_012_direct<ThreadDim>,
+                     EnclosedStmts...>,
+      Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data,
+                                      cl::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     diff_t len = segment_length<ArgumentId>(data);
-    auto i = item.get_local_id(ThreadDim);
+    auto i     = item.get_local_id(ThreadDim);
 
     // assign thread id directly to offset
     data.template assign_offset<ArgumentId>(i);
     data.template assign_param<ParamId>(i);
 
     // execute enclosed statements if in bounds
-    enclosed_stmts_t::exec(data, item, thread_active && (i<len));
-
+    enclosed_stmts_t::exec(data, item, thread_active && (i < len));
   }
 };
 
-
-
-
 /*
  * Executor for local work sharing loop inside SyclKernel.
  * Assigns the loop index to offset ArgumentId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename ParamId,
-          typename Mask,
-          typename ... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename ParamId,
+         typename Mask,
+         typename... EnclosedStmts,
+         typename Types>
 struct SyclStatementExecutor<
-  Data,
-  statement::ForICount<ArgumentId, ParamId,
-                       RAJA::sycl_local_masked_direct<Mask>,
-                       EnclosedStmts ...>, Types >
-  : public SyclStatementExecutor<
     Data,
-    statement::For<ArgumentId, RAJA::sycl_local_masked_direct<Mask>,
-                   EnclosedStmts ...>, Types > {
-
-  using Base = SyclStatementExecutor<
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::sycl_local_masked_direct<Mask>,
+                         EnclosedStmts...>,
+    Types>
+    : public SyclStatementExecutor<
           Data,
-          statement::For<ArgumentId, RAJA::sycl_local_masked_direct<Mask>,
-                         EnclosedStmts ...>, Types >;
+          statement::For<ArgumentId,
+                         RAJA::sycl_local_masked_direct<Mask>,
+                         EnclosedStmts...>,
+          Types>
+{
+
+  using Base =
+      SyclStatementExecutor<Data,
+                            statement::For<ArgumentId,
+                                           RAJA::sycl_local_masked_direct<Mask>,
+                                           EnclosedStmts...>,
+                            Types>;
 
   using typename Base::diff_t;
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          SyclStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      SyclStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data,
+                                      cl::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     diff_t len = segment_length<ArgumentId>(data);
-    auto i0 = item.get_local_id(0);
-    diff_t i = mask_t::maskValue(i0);
+    auto i0    = item.get_local_id(0);
+    diff_t i   = mask_t::maskValue(i0);
 
     // assign thread id directly to offset
     data.template assign_offset<ArgumentId>(i);
     data.template assign_param<ParamId>(i);
 
     // execute enclosed statements if in bounds
-    enclosed_stmts_t::exec(data, item, thread_active && (i<len));
+    enclosed_stmts_t::exec(data, item, thread_active && (i < len));
   }
-
 };
 
-
-
-
-
 /*
  * Executor for local work sharing loop inside SyclKernel.
  * Assigns the loop index to offset ArgumentId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename ParamId,
-          typename Mask,
-          typename ... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename ParamId,
+         typename Mask,
+         typename... EnclosedStmts,
+         typename Types>
 struct SyclStatementExecutor<
-  Data,
-  statement::ForICount<ArgumentId, ParamId,
-                       RAJA::sycl_local_masked_loop<Mask>,
-                       EnclosedStmts ...>, Types >
-  : public SyclStatementExecutor<
     Data,
-    statement::For<ArgumentId, RAJA::sycl_local_masked_loop<Mask>,
-                   EnclosedStmts ...>, Types > {
-
-  using Base = SyclStatementExecutor<
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::sycl_local_masked_loop<Mask>,
+                         EnclosedStmts...>,
+    Types>
+    : public SyclStatementExecutor<
           Data,
-          statement::For<ArgumentId, RAJA::sycl_local_masked_loop<Mask>,
-                         EnclosedStmts ...>, Types >;
+          statement::For<ArgumentId,
+                         RAJA::sycl_local_masked_loop<Mask>,
+                         EnclosedStmts...>,
+          Types>
+{
+
+  using Base =
+      SyclStatementExecutor<Data,
+                            statement::For<ArgumentId,
+                                           RAJA::sycl_local_masked_loop<Mask>,
+                                           EnclosedStmts...>,
+                            Types>;
 
   using typename Base::diff_t;
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   // Set the argument type for this loop
   using NewTypes = setSegmentTypeFromData<Types, ArgumentId, Data>;
 
   using enclosed_stmts_t =
-          SyclStatementListExecutor<Data, stmt_list_t, NewTypes>;
+      SyclStatementListExecutor<Data, stmt_list_t, NewTypes>;
 
   using mask_t = Mask;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data,
+                                      cl::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     // masked size strided loop
-    diff_t len = segment_length<ArgumentId>(data);
-    auto i0 = item.get_local_id(0);
-    diff_t i_init = mask_t::maskValue(i0);
-    diff_t i_stride = (diff_t) mask_t::max_masked_size;
+    diff_t len      = segment_length<ArgumentId>(data);
+    auto i0         = item.get_local_id(0);
+    diff_t i_init   = mask_t::maskValue(i0);
+    diff_t i_stride = (diff_t)mask_t::max_masked_size;
 
     // Iterate through grid stride of chunks
-    for (diff_t ii = 0; ii < len; ii += i_stride) {
+    for (diff_t ii = 0; ii < len; ii += i_stride)
+    {
       diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -205,13 +215,8 @@ struct SyclStatementExecutor<
       enclosed_stmts_t::exec(data, item, thread_active && have_work);
     }
   }
-
 };
 
-
-
-
-
 /*
  * Executor for thread work sharing loop inside SyclKernel.
  * Provides a block-stride loop (stride of blockDim.xyz) for
@@ -219,39 +224,49 @@ struct SyclStatementExecutor<
  * Assigns the loop iterate to offset ArgumentId
  * Assigns the loop offset to param ParamId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename ParamId,
-          int ThreadDim,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename ParamId,
+         int ThreadDim,
+         typename... EnclosedStmts,
+         typename Types>
 struct SyclStatementExecutor<
     Data,
-    statement::ForICount<ArgumentId, ParamId, RAJA::sycl_local_012_loop<ThreadDim>, EnclosedStmts...>,
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::sycl_local_012_loop<ThreadDim>,
+                         EnclosedStmts...>,
     Types>
     : public SyclStatementExecutor<
-        Data,
-        statement::For<ArgumentId, RAJA::sycl_local_012_loop<ThreadDim>, EnclosedStmts...>,
-        Types> {
+          Data,
+          statement::For<ArgumentId,
+                         RAJA::sycl_local_012_loop<ThreadDim>,
+                         EnclosedStmts...>,
+          Types>
+{
 
-  using Base = SyclStatementExecutor<
-        Data,
-        statement::For<ArgumentId, RAJA::sycl_local_012_loop<ThreadDim>, EnclosedStmts...>,
-        Types>;
+  using Base =
+      SyclStatementExecutor<Data,
+                            statement::For<ArgumentId,
+                                           RAJA::sycl_local_012_loop<ThreadDim>,
+                                           EnclosedStmts...>,
+                            Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static
-  inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data,
+                                      cl::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     // block stride loop
-    diff_t len = segment_length<ArgumentId>(data);
-    auto i_init = item.get_local_id(ThreadDim);
+    diff_t len    = segment_length<ArgumentId>(data);
+    auto i_init   = item.get_local_id(ThreadDim);
     auto i_stride = item.get_local_range(ThreadDim);
 
     // Iterate through grid stride of chunks
-    for (diff_t ii = 0; ii < len; ii += i_stride) {
+    for (diff_t ii = 0; ii < len; ii += i_stride)
+    {
       diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -268,45 +283,53 @@ struct SyclStatementExecutor<
   }
 };
 
-
-
 /*
  * Executor for group work sharing inside SyclKernel.
  * Provides a direct mapping of each block in 012.
  * Assigns the loop index to offset ArgumentId
  * Assigns the loop index to param ParamId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename ParamId,
-          int BlockDim,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename ParamId,
+         int BlockDim,
+         typename... EnclosedStmts,
+         typename Types>
 struct SyclStatementExecutor<
     Data,
-    statement::ForICount<ArgumentId, ParamId, RAJA::sycl_group_012_direct<BlockDim>, EnclosedStmts...>,
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::sycl_group_012_direct<BlockDim>,
+                         EnclosedStmts...>,
     Types>
     : public SyclStatementExecutor<
-        Data,
-        statement::For<ArgumentId, RAJA::sycl_group_012_direct<BlockDim>, EnclosedStmts...>,
-        Types> {
+          Data,
+          statement::For<ArgumentId,
+                         RAJA::sycl_group_012_direct<BlockDim>,
+                         EnclosedStmts...>,
+          Types>
+{
 
   using Base = SyclStatementExecutor<
       Data,
-      statement::For<ArgumentId, RAJA::sycl_group_012_direct<BlockDim>, EnclosedStmts...>,
+      statement::For<ArgumentId,
+                     RAJA::sycl_group_012_direct<BlockDim>,
+                     EnclosedStmts...>,
       Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static
-  inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data,
+                                      cl::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     // grid stride loop
     diff_t len = segment_length<ArgumentId>(data);
-    auto i = item.get_group(BlockDim);
+    auto i     = item.get_group(BlockDim);
 
-    if (i < len) {
+    if (i < len)
+    {
 
       // Assign the x thread to the argument
       data.template assign_offset<ArgumentId>(i);
@@ -325,39 +348,49 @@ struct SyclStatementExecutor<
  * Assigns the loop index to offset ArgumentId
  * Assigns the loop index to param ParamId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename ParamId,
-          int BlockDim,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename ParamId,
+         int BlockDim,
+         typename... EnclosedStmts,
+         typename Types>
 struct SyclStatementExecutor<
     Data,
-    statement::ForICount<ArgumentId, ParamId, RAJA::sycl_group_012_loop<BlockDim>, EnclosedStmts...>,
+    statement::ForICount<ArgumentId,
+                         ParamId,
+                         RAJA::sycl_group_012_loop<BlockDim>,
+                         EnclosedStmts...>,
     Types>
     : public SyclStatementExecutor<
-        Data,
-        statement::For<ArgumentId, RAJA::sycl_group_012_loop<BlockDim>, EnclosedStmts...>,
-        Types> {
+          Data,
+          statement::For<ArgumentId,
+                         RAJA::sycl_group_012_loop<BlockDim>,
+                         EnclosedStmts...>,
+          Types>
+{
 
-  using Base = SyclStatementExecutor<
-      Data,
-      statement::For<ArgumentId, RAJA::sycl_group_012_loop<BlockDim>, EnclosedStmts...>,
-      Types>;
+  using Base =
+      SyclStatementExecutor<Data,
+                            statement::For<ArgumentId,
+                                           RAJA::sycl_group_012_loop<BlockDim>,
+                                           EnclosedStmts...>,
+                            Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static
-  inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data,
+                                      cl::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     // grid stride loop
-    diff_t len = segment_length<ArgumentId>(data);
-    auto i_init = item.get_group(BlockDim);
+    diff_t len    = segment_length<ArgumentId>(data);
+    auto i_init   = item.get_group(BlockDim);
     auto i_stride = item.get_group_range(BlockDim);
 
     // Iterate through grid stride of chunks
-    for (diff_t i = i_init; i < len; i += i_stride) {
+    for (diff_t i = i_init; i < len; i += i_stride)
+    {
 
       // Assign the x thread to the argument
       data.template assign_offset<ArgumentId>(i);
@@ -369,7 +402,6 @@ struct SyclStatementExecutor<
   }
 };
 
-
 /*
  * Executor for sequential loops inside of a SyclKernel.
  *
@@ -377,33 +409,37 @@ struct SyclStatementExecutor<
  * Assigns the loop index to offset ArgumentId
  * Assigns the loop index to param ParamId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename ParamId,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename ParamId,
+         typename... EnclosedStmts,
+         typename Types>
 struct SyclStatementExecutor<
     Data,
-    statement::ForICount<ArgumentId, ParamId, seq_exec, EnclosedStmts...>, Types >
+    statement::ForICount<ArgumentId, ParamId, seq_exec, EnclosedStmts...>,
+    Types>
     : public SyclStatementExecutor<
-        Data,
-        statement::For<ArgumentId, seq_exec, EnclosedStmts...>, Types > {
+          Data,
+          statement::For<ArgumentId, seq_exec, EnclosedStmts...>,
+          Types>
+{
 
   using Base = SyclStatementExecutor<
       Data,
-      statement::For<ArgumentId, seq_exec, EnclosedStmts...>, Types >;
+      statement::For<ArgumentId, seq_exec, EnclosedStmts...>,
+      Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data,
+                                      cl::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     diff_t len = segment_length<ArgumentId>(data);
 
-    for(diff_t i = 0;i < len;++ i){
+    for (diff_t i = 0; i < len; ++i)
+    {
       // Assign i to the argument
       data.template assign_offset<ArgumentId>(i);
       data.template assign_param<ParamId>(i);
@@ -415,9 +451,6 @@ struct SyclStatementExecutor<
 };
 
 
-
-
-
 }  // namespace internal
 }  // end namespace RAJA
 
diff --git a/include/RAJA/policy/sycl/kernel/Lambda.hpp b/include/RAJA/policy/sycl/kernel/Lambda.hpp
index 0542f4b81e..3e005aa084 100644
--- a/include/RAJA/policy/sycl/kernel/Lambda.hpp
+++ b/include/RAJA/policy/sycl/kernel/Lambda.hpp
@@ -34,7 +34,6 @@
 #include "RAJA/pattern/kernel.hpp"
 #include "RAJA/pattern/kernel/Lambda.hpp"
 
-
 namespace RAJA
 {
 namespace internal
@@ -42,22 +41,29 @@ namespace internal
 
 // SyclStatementExecutor for actually invoking the lambda
 
-template <typename Data, camp::idx_t LambdaIndex, typename... Args, typename Types>
-struct SyclStatementExecutor<Data, statement::Lambda<LambdaIndex, Args...>, Types> {
+template<typename Data,
+         camp::idx_t LambdaIndex,
+         typename... Args,
+         typename Types>
+struct SyclStatementExecutor<Data,
+                             statement::Lambda<LambdaIndex, Args...>,
+                             Types>
+{
 
-  static
-  inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data,
+                                      cl::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     // Only execute the lambda if it hasn't been masked off
-    if(thread_active){
-      StatementExecutor<statement::Lambda<LambdaIndex, Args...>, Types>::exec(data);
+    if (thread_active)
+    {
+      StatementExecutor<statement::Lambda<LambdaIndex, Args...>, Types>::exec(
+          data);
     }
-
   }
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const & RAJA_UNUSED_ARG(data))
+  static inline LaunchDims calculateDimensions(
+      Data const& RAJA_UNUSED_ARG(data))
   {
     return LaunchDims();
   }
diff --git a/include/RAJA/policy/sycl/kernel/SyclKernel.hpp b/include/RAJA/policy/sycl/kernel/SyclKernel.hpp
index 88c789c062..0882cba92b 100644
--- a/include/RAJA/policy/sycl/kernel/SyclKernel.hpp
+++ b/include/RAJA/policy/sycl/kernel/SyclKernel.hpp
@@ -47,13 +47,13 @@ namespace RAJA
  * SYCL kernel launch policy where the user may specify the number of physical
  * work group and work items per group.
  */
-template <bool async0>
+template<bool async0>
 struct sycl_launch : public RAJA::make_policy_pattern_launch_platform_t<
-                            RAJA::Policy::sycl,
-                            RAJA::Pattern::forall,
-                            detail::get_launch<async0>::value,
-                            RAJA::Platform::sycl>{
-};
+                         RAJA::Policy::sycl,
+                         RAJA::Pattern::forall,
+                         detail::get_launch<async0>::value,
+                         RAJA::Platform::sycl>
+{};
 
 namespace statement
 {
@@ -61,30 +61,26 @@ namespace statement
 /*
  * ! RAJA::kernel statement that launches a SYCL kernel.
  */
-template <typename LaunchConfig, typename... EnclosedStmts>
+template<typename LaunchConfig, typename... EnclosedStmts>
 struct SyclKernelExt
-    : public internal::Statement<LaunchConfig, EnclosedStmts...> {
-};
+    : public internal::Statement<LaunchConfig, EnclosedStmts...>
+{};
 
 /*
  * A RAJA::kernel statement that launches a SYCL kernel.
  * The kernel launch is synchronous.
  */
-template <typename... EnclosedStmts>
-using SyclKernel =
-    SyclKernelExt<sycl_launch<false>,
-                  EnclosedStmts...>;
+template<typename... EnclosedStmts>
+using SyclKernel = SyclKernelExt<sycl_launch<false>, EnclosedStmts...>;
 
 /*!
  * A RAJA::kernel statement that launches a SYCL kernel.
  * The kernel launch is asynchronous.
  */
-template <typename... EnclosedStmts>
-using SyclKernelAsync =
-    SyclKernelExt<sycl_launch<true>,
-                  EnclosedStmts...>;
+template<typename... EnclosedStmts>
+using SyclKernelAsync = SyclKernelExt<sycl_launch<true>, EnclosedStmts...>;
 
-} // namespace statement
+}  // namespace statement
 
 namespace internal
 {
@@ -92,11 +88,11 @@ namespace internal
 /*!
  * SYCL global function for launching SyclKernel policies.
  */
-template <typename Data, typename Exec>
+template<typename Data, typename Exec>
 void SyclKernelLauncher(Data data, cl::sycl::nd_item<3> item)
 {
 
-  using data_t = camp::decay<Data>;
+  using data_t        = camp::decay<Data>;
   data_t private_data = data;
 
   // execute the the object
@@ -107,7 +103,11 @@ void SyclKernelLauncher(Data data, cl::sycl::nd_item<3> item)
  * Helper class that handles SYCL kernel launching, and computing
  * maximum number of threads/blocks
  */
-template<bool IsTriviallyCopyable, typename LaunchPolicy, typename StmtList, typename Data, typename Types>
+template<bool IsTriviallyCopyable,
+         typename LaunchPolicy,
+         typename StmtList,
+         typename Data,
+         typename Types>
 struct SyclLaunchHelper;
 
 /*!
@@ -116,16 +116,17 @@ struct SyclLaunchHelper;
  * determined at runtime using the SYCL occupancy calculator.
  */
 template<bool async0, typename StmtList, typename Data, typename Types>
-struct SyclLaunchHelper<false,sycl_launch<async0>,StmtList,Data,Types>
+struct SyclLaunchHelper<false, sycl_launch<async0>, StmtList, Data, Types>
 {
   using Self = SyclLaunchHelper;
 
   static constexpr bool async = async0;
 
-  using executor_t = internal::sycl_statement_list_executor_t<StmtList, Data, Types>;
+  using executor_t =
+      internal::sycl_statement_list_executor_t<StmtList, Data, Types>;
   using data_t = camp::decay<Data>;
 
-  static void launch(Data &&data,
+  static void launch(Data&& data,
                      internal::LaunchDims launch_dims,
                      size_t shmem,
                      cl::sycl::queue* qu)
@@ -136,21 +137,17 @@ struct SyclLaunchHelper<false,sycl_launch<async0>,StmtList,Data,Types>
     // Kernel body is nontrivially copyable, create space on device and copy to
     // Workaround until "is_device_copyable" is supported
     //
-    data_t* m_data = (data_t*) cl::sycl::malloc_device(sizeof(data_t), *qu);
+    data_t* m_data = (data_t*)cl::sycl::malloc_device(sizeof(data_t), *qu);
     qu->memcpy(m_data, &data, sizeof(data_t)).wait();
 
     qu->submit([&](cl::sycl::handler& h) {
- 
-      h.parallel_for(launch_dims.fit_nd_range(qu),
-                     [=] (cl::sycl::nd_item<3> item) {
-        
-        SyclKernelLauncher<Data, executor_t>(*m_data, item);
-
-      });
-    }).wait(); // Need to wait to free memory
+        h.parallel_for(launch_dims.fit_nd_range(qu),
+                       [=](cl::sycl::nd_item<3> item) {
+                         SyclKernelLauncher<Data, executor_t>(*m_data, item);
+                       });
+      }).wait();  // Need to wait to free memory
 
     cl::sycl::free(m_data, *qu);
-
   }
 };
 
@@ -160,73 +157,75 @@ struct SyclLaunchHelper<false,sycl_launch<async0>,StmtList,Data,Types>
  * determined at runtime using the SYCL occupancy calculator.
  */
 template<bool async0, typename StmtList, typename Data, typename Types>
-struct SyclLaunchHelper<true,sycl_launch<async0>,StmtList,Data,Types>
+struct SyclLaunchHelper<true, sycl_launch<async0>, StmtList, Data, Types>
 {
   using Self = SyclLaunchHelper;
 
   static constexpr bool async = async0;
 
-  using executor_t = internal::sycl_statement_list_executor_t<StmtList, Data, Types>;
+  using executor_t =
+      internal::sycl_statement_list_executor_t<StmtList, Data, Types>;
   using data_t = camp::decay<Data>;
 
-  static void launch(Data &&data,
+  static void launch(Data&& data,
                      internal::LaunchDims launch_dims,
                      size_t shmem,
                      cl::sycl::queue* qu)
   {
 
     qu->submit([&](cl::sycl::handler& h) {
- 
       h.parallel_for(launch_dims.fit_nd_range(qu),
-                     [=] (cl::sycl::nd_item<3> item) {
-
-        SyclKernelLauncher<Data, executor_t>(data, item);
-
-      });
+                     [=](cl::sycl::nd_item<3> item) {
+                       SyclKernelLauncher<Data, executor_t>(data, item);
+                     });
     });
 
-    if (!async) { qu->wait(); };
-
+    if (!async)
+    {
+      qu->wait();  // synchronous policy: block until the queue is idle
+    }
   }
 };
 
 /*!
  * Specialization that launches SYCL kernels for RAJA::kernel from host code
  */
-template <typename LaunchConfig, typename... EnclosedStmts, typename Types>
+template<typename LaunchConfig, typename... EnclosedStmts, typename Types>
 struct StatementExecutor<
-    statement::SyclKernelExt<LaunchConfig, EnclosedStmts...>, Types> {
+    statement::SyclKernelExt<LaunchConfig, EnclosedStmts...>,
+    Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
   using StatementType =
       statement::SyclKernelExt<LaunchConfig, EnclosedStmts...>;
 
-  template <typename Data>
-  static inline void exec(Data &&data)
+  template<typename Data>
+  static inline void exec(Data&& data)
   {
 
     using data_t = camp::decay<Data>;
-    using executor_t = sycl_statement_list_executor_t<stmt_list_t, data_t, Types>;
+    using executor_t =
+        sycl_statement_list_executor_t<stmt_list_t, data_t, Types>;
     using launch_t = SyclLaunchHelper<std::is_trivially_copyable<data_t>::value,
                                       LaunchConfig, stmt_list_t, data_t, Types>;
 
     camp::resources::Sycl res = data.get_resource();
-    ::sycl::queue* q = res.get_queue();;
+    // Queue on which launch_t::launch() below submits the kernel
+    ::sycl::queue* q = res.get_queue();
 
     //
     // Compute the requested kernel dimensions
     //
     LaunchDims launch_dims = executor_t::calculateDimensions(data);
-    
+
     int shmem = 0;
 
     //
     // Launch the kernels
     //
     launch_t::launch(std::move(data), launch_dims, shmem, q);
-
   }
-
 };
 
 
diff --git a/include/RAJA/policy/sycl/kernel/Tile.hpp b/include/RAJA/policy/sycl/kernel/Tile.hpp
index 81a57cdecb..76db101791 100644
--- a/include/RAJA/policy/sycl/kernel/Tile.hpp
+++ b/include/RAJA/policy/sycl/kernel/Tile.hpp
@@ -1,12 +1,12 @@
- /*!
- ******************************************************************************
- *
- * \file
- *
- * \brief   Header file for SYCL tiled executors.
- *
- ******************************************************************************
- */
+/*!
+ ******************************************************************************
+ *
+ * \file
+ *
+ * \brief   Header file for SYCL tiled executors.
+ *
+ ******************************************************************************
+ */
 
 
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
@@ -47,26 +47,30 @@ namespace internal
  * Assigns the tile segment to segment ArgumentId
  *
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename TPol,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename TPol,
+         typename... EnclosedStmts,
+         typename Types>
 struct SyclStatementExecutor<
     Data,
-    statement::Tile<ArgumentId, TPol, seq_exec, EnclosedStmts...>, Types>
+    statement::Tile<ArgumentId, TPol, seq_exec, EnclosedStmts...>,
+    Types>
 {
 
-  using stmt_list_t = StatementList<EnclosedStmts...>;
+  using stmt_list_t      = StatementList<EnclosedStmts...>;
   using enclosed_stmts_t = SyclStatementListExecutor<Data, stmt_list_t, Types>;
-  using diff_t = segment_diff_type<ArgumentId, Data>;
+  using diff_t           = segment_diff_type<ArgumentId, Data>;
 
-  static inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active){
+  static inline RAJA_DEVICE void exec(Data& data,
+                                      cl::sycl::nd_item<3> item,
+                                      bool thread_active)
+  {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Keep copy of original segment, so we can restore it
-    using segment_t = camp::decay<decltype(segment)>;
+    using segment_t        = camp::decay<decltype(segment)>;
     segment_t orig_segment = segment;
 
     diff_t chunk_size = TPol::chunk_size;
@@ -75,7 +79,8 @@ struct SyclStatementExecutor<
     diff_t len = segment.end() - segment.begin();
 
     // Iterate through tiles
-    for (diff_t i = 0; i < len; i += chunk_size) {
+    for (diff_t i = 0; i < len; i += chunk_size)
+    {
 
       // Assign our new tiled segment
       segment = orig_segment.slice(i, chunk_size);
@@ -88,18 +93,15 @@ struct SyclStatementExecutor<
     segment = orig_segment;
   }
 
-
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
 
     // privatize data, so we can mess with the segments
-    using data_t = camp::decay<Data>;
+    using data_t        = camp::decay<Data>;
     data_t private_data = data;
 
     // Get original segment
-    auto &segment = camp::get<ArgumentId>(private_data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(private_data.segment_tuple);
 
     // restrict to first tile
     segment = segment.slice(0, TPol::chunk_size);
@@ -112,26 +114,24 @@ struct SyclStatementExecutor<
   }
 };
 
-
 /*!
  * A specialized RAJA::kernel sycl_impl executor for statement::Tile
  * Assigns the tile segment to segment ArgumentId
  *
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          camp::idx_t chunk_size,
-          int BlockDim,
-          typename... EnclosedStmts,
-          typename Types>
-struct SyclStatementExecutor<
-    Data,
-    statement::Tile<ArgumentId,
-                    RAJA::tile_fixed<chunk_size>,
-                    sycl_group_012_direct<BlockDim>,
-                    EnclosedStmts...>,
-                    Types>
-  {
+template<typename Data,
+         camp::idx_t ArgumentId,
+         camp::idx_t chunk_size,
+         int BlockDim,
+         typename... EnclosedStmts,
+         typename Types>
+struct SyclStatementExecutor<Data,
+                             statement::Tile<ArgumentId,
+                                             RAJA::tile_fixed<chunk_size>,
+                                             sycl_group_012_direct<BlockDim>,
+                                             EnclosedStmts...>,
+                             Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -139,20 +139,25 @@ struct SyclStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data,
+                                      cl::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     using segment_t = camp::decay<decltype(segment)>;
 
     // compute trip count
     diff_t len = segment.end() - segment.begin();
-    //diff_t i = get_sycl_dim<BlockDim>(blockIdx) * chunk_size; // TODO
-    diff_t i = item.get_group(BlockDim) * chunk_size;//get_sycl_dim<BlockDim>(blockIdx) * chunk_size; // TODO
+    // TODO: alternative form kept from the original source:
+    //   diff_t i = get_sycl_dim<BlockDim>(blockIdx) * chunk_size;
+    // Start offset of this work-group's chunk: group index times chunk size
+    diff_t i = item.get_group(BlockDim) * chunk_size;
 
     // check have chunk
-    if (i < len) {
+    if (i < len)
+    {
 
       // Keep copy of original segment, so we can restore it
       segment_t orig_segment = segment;
@@ -168,16 +173,14 @@ struct SyclStatementExecutor<
     }
   }
 
-
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
 
     // Compute how many blocks
-    diff_t len = segment_length<ArgumentId>(data);
+    diff_t len        = segment_length<ArgumentId>(data);
     diff_t num_blocks = len / chunk_size;
-    if (num_blocks * chunk_size < len) {
+    if (num_blocks * chunk_size < len)
+    {
       num_blocks++;
     }
 
@@ -189,11 +192,11 @@ struct SyclStatementExecutor<
 
 
     // privatize data, so we can mess with the segments
-    using data_t = camp::decay<Data>;
+    using data_t        = camp::decay<Data>;
     data_t private_data = data;
 
     // Get original segment
-    auto &segment = camp::get<ArgumentId>(private_data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(private_data.segment_tuple);
 
     // restrict to first tile
     segment = segment.slice(0, chunk_size);
@@ -211,19 +214,19 @@ struct SyclStatementExecutor<
  * Assigns the tile segment to segment ArgumentId
  *
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          camp::idx_t chunk_size,
-          int BlockDim,
-          typename... EnclosedStmts,
-          typename Types>
-struct SyclStatementExecutor<
-    Data,
-    statement::Tile<ArgumentId,
-                    RAJA::tile_fixed<chunk_size>,
-                    sycl_group_012_loop<BlockDim>,
-                    EnclosedStmts...>, Types>
-  {
+template<typename Data,
+         camp::idx_t ArgumentId,
+         camp::idx_t chunk_size,
+         int BlockDim,
+         typename... EnclosedStmts,
+         typename Types>
+struct SyclStatementExecutor<Data,
+                             statement::Tile<ArgumentId,
+                                             RAJA::tile_fixed<chunk_size>,
+                                             sycl_group_012_loop<BlockDim>,
+                                             EnclosedStmts...>,
+                             Types>
+{
 
   using stmt_list_t = StatementList<EnclosedStmts...>;
 
@@ -231,22 +234,25 @@ struct SyclStatementExecutor<
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data,
+                                      cl::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Keep copy of original segment, so we can restore it
-    using segment_t = camp::decay<decltype(segment)>;
+    using segment_t        = camp::decay<decltype(segment)>;
     segment_t orig_segment = segment;
 
     // compute trip count
-    diff_t len = segment.end() - segment.begin();
-    diff_t i_init = item.get_group(BlockDim) * chunk_size; // TODO
-    diff_t i_stride = item.get_group_range(BlockDim) * chunk_size; // TODO
+    diff_t len      = segment.end() - segment.begin();
+    diff_t i_init   = item.get_group(BlockDim) * chunk_size;        // TODO
+    diff_t i_stride = item.get_group_range(BlockDim) * chunk_size;  // TODO
 
     // Iterate through grid stride of chunks
-    for (diff_t i = i_init; i < len; i += i_stride) {
+    for (diff_t i = i_init; i < len; i += i_stride)
+    {
 
       // Assign our new tiled segment
       segment = orig_segment.slice(i, chunk_size);
@@ -259,16 +265,14 @@ struct SyclStatementExecutor<
     segment = orig_segment;
   }
 
-
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
 
     // Compute how many blocks
-    diff_t len = segment_length<ArgumentId>(data);
+    diff_t len        = segment_length<ArgumentId>(data);
     diff_t num_blocks = len / chunk_size;
-    if (num_blocks * chunk_size < len) {
+    if (num_blocks * chunk_size < len)
+    {
       num_blocks++;
     }
 
@@ -276,13 +280,12 @@ struct SyclStatementExecutor<
     set_sycl_dim<BlockDim>(dims.group, num_blocks);
 
 
-
     // privatize data, so we can mess with the segments
-    using data_t = camp::decay<Data>;
+    using data_t        = camp::decay<Data>;
     data_t private_data = data;
 
     // Get original segment
-    auto &segment = camp::get<ArgumentId>(private_data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(private_data.segment_tuple);
 
     // restrict to first tile
     segment = segment.slice(0, chunk_size);
@@ -295,44 +298,45 @@ struct SyclStatementExecutor<
   }
 };
 
-
-
 /*!
  * A specialized RAJA::kernel sycl_impl executor for statement::Tile
  * Assigns the tile segment to segment ArgumentId
  *
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          camp::idx_t chunk_size,
-          int ThreadDim,
-          typename ... EnclosedStmts,
-          typename Types>
-struct SyclStatementExecutor<
-  Data,
-  statement::Tile<ArgumentId,
-                  RAJA::tile_fixed<chunk_size>,
-                  sycl_local_012_direct<ThreadDim>,
-                  EnclosedStmts ...>, Types>{
+template<typename Data,
+         camp::idx_t ArgumentId,
+         camp::idx_t chunk_size,
+         int ThreadDim,
+         typename... EnclosedStmts,
+         typename Types>
+struct SyclStatementExecutor<Data,
+                             statement::Tile<ArgumentId,
+                                             RAJA::tile_fixed<chunk_size>,
+                                             sycl_local_012_direct<ThreadDim>,
+                                             EnclosedStmts...>,
+                             Types>
+{
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   using enclosed_stmts_t = SyclStatementListExecutor<Data, stmt_list_t, Types>;
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data,
+                                      cl::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Keep copy of original segment, so we can restore it
-    using segment_t = camp::decay<decltype(segment)>;
+    using segment_t        = camp::decay<decltype(segment)>;
     segment_t orig_segment = segment;
 
     // compute trip count
     diff_t len = segment.end() - segment.begin();
-    diff_t i = item.get_local_id(ThreadDim) * chunk_size;
+    diff_t i   = item.get_local_id(ThreadDim) * chunk_size;
 
     // execute enclosed statements if any thread will
     // but mask off threads without work
@@ -340,7 +344,7 @@ struct SyclStatementExecutor<
 
     // Assign our new tiled segment
     diff_t slice_size = have_work ? chunk_size : 0;
-    segment = orig_segment.slice(i, slice_size);
+    segment           = orig_segment.slice(i, slice_size);
 
     // execute enclosed statements
     enclosed_stmts_t::exec(data, item, thread_active && have_work);
@@ -349,16 +353,14 @@ struct SyclStatementExecutor<
     segment = orig_segment;
   }
 
-
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
 
     // Compute how many blocks
-    diff_t len = segment_length<ArgumentId>(data);
+    diff_t len         = segment_length<ArgumentId>(data);
     diff_t num_threads = len / chunk_size;
-    if(num_threads * chunk_size < len){
+    if (num_threads * chunk_size < len)
+    {
       num_threads++;
     }
 
@@ -367,64 +369,67 @@ struct SyclStatementExecutor<
     set_sycl_dim<ThreadDim>(dims.min_locals, num_threads);
 
     // privatize data, so we can mess with the segments
-    using data_t = camp::decay<Data>;
+    using data_t        = camp::decay<Data>;
     data_t private_data = data;
 
     // Get original segment
-    auto &segment = camp::get<ArgumentId>(private_data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(private_data.segment_tuple);
 
     // restrict to first tile
     segment = segment.slice(0, chunk_size);
 
 
     LaunchDims enclosed_dims =
-      enclosed_stmts_t::calculateDimensions(private_data);
+        enclosed_stmts_t::calculateDimensions(private_data);
 
-    return(dims.max(enclosed_dims));
+    return (dims.max(enclosed_dims));
   }
 };
 
-
 /*!
  * A specialized RAJA::kernel sycl_impl executor for statement::Tile
  * Assigns the tile segment to segment ArgumentId
  *
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          camp::idx_t chunk_size,
-          int ThreadDim,
-          typename ... EnclosedStmts,
-          typename Types>
-struct SyclStatementExecutor<
-  Data,
-  statement::Tile<ArgumentId,
-                  RAJA::tile_fixed<chunk_size>,
-                  sycl_local_012_loop<ThreadDim>,
-                  EnclosedStmts ...>, Types>{
+template<typename Data,
+         camp::idx_t ArgumentId,
+         camp::idx_t chunk_size,
+         int ThreadDim,
+         typename... EnclosedStmts,
+         typename Types>
+struct SyclStatementExecutor<Data,
+                             statement::Tile<ArgumentId,
+                                             RAJA::tile_fixed<chunk_size>,
+                                             sycl_local_012_loop<ThreadDim>,
+                                             EnclosedStmts...>,
+                             Types>
+{
 
-  using stmt_list_t = StatementList<EnclosedStmts ...>;
+  using stmt_list_t = StatementList<EnclosedStmts...>;
 
   using enclosed_stmts_t = SyclStatementListExecutor<Data, stmt_list_t, Types>;
 
   using diff_t = segment_diff_type<ArgumentId, Data>;
 
-  static inline RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data,
+                                      cl::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Keep copy of original segment, so we can restore it
-    using segment_t = camp::decay<decltype(segment)>;
+    using segment_t        = camp::decay<decltype(segment)>;
     segment_t orig_segment = segment;
 
     // compute trip count
-    diff_t len = segment_length<ArgumentId>(data);
-    diff_t i_init = item.get_local_id(ThreadDim) * chunk_size;
-    diff_t i_stride = item.get_group_range(ThreadDim) * chunk_size;
+    diff_t len      = segment_length<ArgumentId>(data);
+    diff_t i_init   = item.get_local_id(ThreadDim) * chunk_size;
+    diff_t i_stride = item.get_local_range(ThreadDim) * chunk_size;
 
     // Iterate through grid stride of chunks
-    for (diff_t ii = 0; ii < len; ii += i_stride) {
+    for (diff_t ii = 0; ii < len; ii += i_stride)
+    {
       diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -433,7 +438,7 @@ struct SyclStatementExecutor<
 
       // Assign our new tiled segment
       diff_t slice_size = have_work ? chunk_size : 0;
-      segment = orig_segment.slice(i, slice_size);
+      segment           = orig_segment.slice(i, slice_size);
 
       // execute enclosed statements
       enclosed_stmts_t::exec(data, item, thread_active && have_work);
@@ -443,16 +448,14 @@ struct SyclStatementExecutor<
     segment = orig_segment;
   }
 
-
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
 
     // Compute how many blocks
-    diff_t len = segment_length<ArgumentId>(data);
+    diff_t len         = segment_length<ArgumentId>(data);
     diff_t num_threads = len / chunk_size;
-    if(num_threads * chunk_size < len){
+    if (num_threads * chunk_size < len)
+    {
       num_threads++;
     }
     num_threads = std::max(num_threads, (diff_t)1);
@@ -462,26 +465,24 @@ struct SyclStatementExecutor<
     set_sycl_dim<ThreadDim>(dims.min_locals, 1);
 
     // privatize data, so we can mess with the segments
-    using data_t = camp::decay<Data>;
+    using data_t        = camp::decay<Data>;
     data_t private_data = data;
 
     // Get original segment
-    auto &segment = camp::get<ArgumentId>(private_data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(private_data.segment_tuple);
 
     // restrict to first tile
     segment = segment.slice(0, chunk_size);
 
 
     LaunchDims enclosed_dims =
-      enclosed_stmts_t::calculateDimensions(private_data);
+        enclosed_stmts_t::calculateDimensions(private_data);
 
-    return(dims.max(enclosed_dims));
+    return (dims.max(enclosed_dims));
   }
 };
 
 
-
-
 }  // end namespace internal
 }  // end namespace RAJA
 
diff --git a/include/RAJA/policy/sycl/kernel/TileTCount.hpp b/include/RAJA/policy/sycl/kernel/TileTCount.hpp
index b1d263a263..ef590724fe 100644
--- a/include/RAJA/policy/sycl/kernel/TileTCount.hpp
+++ b/include/RAJA/policy/sycl/kernel/TileTCount.hpp
@@ -47,35 +47,40 @@ namespace internal
  * Assigns the tile segment to segment ArgumentId
  * Assigns the tile index to param ParamId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename ParamId,
-          typename TPol,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename ParamId,
+         typename TPol,
+         typename... EnclosedStmts,
+         typename Types>
 struct SyclStatementExecutor<
     Data,
-    statement::TileTCount<ArgumentId, ParamId, TPol, seq_exec, EnclosedStmts...>, Types>
+    statement::
+        TileTCount<ArgumentId, ParamId, TPol, seq_exec, EnclosedStmts...>,
+    Types>
     : public SyclStatementExecutor<
-        Data,
-        statement::Tile<ArgumentId, TPol, seq_exec, EnclosedStmts...>, Types> {
+          Data,
+          statement::Tile<ArgumentId, TPol, seq_exec, EnclosedStmts...>,
+          Types>
+{
 
   using Base = SyclStatementExecutor<
       Data,
-      statement::Tile<ArgumentId, TPol, seq_exec, EnclosedStmts...>, Types>;
+      statement::Tile<ArgumentId, TPol, seq_exec, EnclosedStmts...>,
+      Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active){
+  static inline RAJA_DEVICE void exec(Data& data,
+                                      cl::sycl::nd_item<3> item,
+                                      bool thread_active)
+  {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Keep copy of original segment, so we can restore it
-    using segment_t = camp::decay<decltype(segment)>;
+    using segment_t        = camp::decay<decltype(segment)>;
     segment_t orig_segment = segment;
 
     diff_t chunk_size = TPol::chunk_size;
@@ -84,7 +89,8 @@ struct SyclStatementExecutor<
     diff_t len = segment.end() - segment.begin();
 
     // Iterate through tiles
-    for (diff_t i = 0, t = 0; i < len; i += chunk_size, ++t) {
+    for (diff_t i = 0, t = 0; i < len; i += chunk_size, ++t)
+    {
 
       // Assign our new tiled segment
       segment = orig_segment.slice(i, chunk_size);
@@ -99,63 +105,64 @@ struct SyclStatementExecutor<
   }
 };
 
-
 /*!
  * A specialized RAJA::kernel sycl_impl executor for statement::TileTCount
  * Assigns the tile segment to segment ArgumentId
  * Assigns the tile index to param ParamId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename ParamId,
-          camp::idx_t chunk_size,
-          int BlockDim,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename ParamId,
+         camp::idx_t chunk_size,
+         int BlockDim,
+         typename... EnclosedStmts,
+         typename Types>
 struct SyclStatementExecutor<
     Data,
-    statement::TileTCount<ArgumentId, ParamId,
-                    RAJA::tile_fixed<chunk_size>,
-                    sycl_group_012_direct<BlockDim>,
-                    EnclosedStmts...>,
-                    Types>
+    statement::TileTCount<ArgumentId,
+                          ParamId,
+                          RAJA::tile_fixed<chunk_size>,
+                          sycl_group_012_direct<BlockDim>,
+                          EnclosedStmts...>,
+    Types>
     : public SyclStatementExecutor<
-        Data,
-        statement::Tile<ArgumentId,
-                        RAJA::tile_fixed<chunk_size>,
-                        sycl_group_012_direct<BlockDim>,
-                        EnclosedStmts...>,
-                        Types> {
+          Data,
+          statement::Tile<ArgumentId,
+                          RAJA::tile_fixed<chunk_size>,
+                          sycl_group_012_direct<BlockDim>,
+                          EnclosedStmts...>,
+          Types>
+{
 
-  using Base = SyclStatementExecutor<
-      Data,
-      statement::Tile<ArgumentId,
-                      RAJA::tile_fixed<chunk_size>,
-                      sycl_group_012_direct<BlockDim>,
-                      EnclosedStmts...>,
-                      Types>;
+  using Base =
+      SyclStatementExecutor<Data,
+                            statement::Tile<ArgumentId,
+                                            RAJA::tile_fixed<chunk_size>,
+                                            sycl_group_012_direct<BlockDim>,
+                                            EnclosedStmts...>,
+                            Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data,
+                                      cl::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     using segment_t = camp::decay<decltype(segment)>;
 
     // compute trip count
     diff_t len = segment.end() - segment.begin();
-    //diff_t t = get_sycl_dim<BlockDim>(blockIdx);
+    // diff_t t = get_sycl_dim<BlockDim>(blockIdx);
     diff_t t = item.get_group(BlockDim);
     diff_t i = t * chunk_size;
 
     // check have a chunk
-    if (i < len) {
+    if (i < len)
+    {
 
       // Keep copy of original segment, so we can restore it
       segment_t orig_segment = segment;
@@ -178,60 +185,62 @@ struct SyclStatementExecutor<
  * Assigns the tile segment to segment ArgumentId
  * Assigns the tile index to param ParamId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename ParamId,
-          camp::idx_t chunk_size,
-          int BlockDim,
-          typename... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename ParamId,
+         camp::idx_t chunk_size,
+         int BlockDim,
+         typename... EnclosedStmts,
+         typename Types>
 struct SyclStatementExecutor<
     Data,
-    statement::TileTCount<ArgumentId, ParamId,
-                    RAJA::tile_fixed<chunk_size>,
-                    sycl_group_012_loop<BlockDim>,
-                    EnclosedStmts...>,
-                    Types>
+    statement::TileTCount<ArgumentId,
+                          ParamId,
+                          RAJA::tile_fixed<chunk_size>,
+                          sycl_group_012_loop<BlockDim>,
+                          EnclosedStmts...>,
+    Types>
     : public SyclStatementExecutor<
-        Data,
-        statement::Tile<ArgumentId,
-                        RAJA::tile_fixed<chunk_size>,
-                        sycl_group_012_loop<BlockDim>,
-                        EnclosedStmts...>,
-                        Types> {
+          Data,
+          statement::Tile<ArgumentId,
+                          RAJA::tile_fixed<chunk_size>,
+                          sycl_group_012_loop<BlockDim>,
+                          EnclosedStmts...>,
+          Types>
+{
 
-  using Base = SyclStatementExecutor<
-      Data,
-      statement::Tile<ArgumentId,
-                      RAJA::tile_fixed<chunk_size>,
-                      sycl_group_012_loop<BlockDim>,
-                      EnclosedStmts...>,
-                      Types>;
+  using Base =
+      SyclStatementExecutor<Data,
+                            statement::Tile<ArgumentId,
+                                            RAJA::tile_fixed<chunk_size>,
+                                            sycl_group_012_loop<BlockDim>,
+                                            EnclosedStmts...>,
+                            Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data,
+                                      cl::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Keep copy of original segment, so we can restore it
-    using segment_t = camp::decay<decltype(segment)>;
+    using segment_t        = camp::decay<decltype(segment)>;
     segment_t orig_segment = segment;
 
     // compute trip count
-    diff_t len = segment.end() - segment.begin();
-    diff_t t_init = item.get_group(BlockDim);
-    diff_t i_init = t_init * chunk_size;
+    diff_t len      = segment.end() - segment.begin();
+    diff_t t_init   = item.get_group(BlockDim);
+    diff_t i_init   = t_init * chunk_size;
     diff_t t_stride = item.get_group_range(BlockDim);
     diff_t i_stride = t_stride * chunk_size;
 
     // Iterate through grid stride of chunks
-    for (diff_t i = i_init, t = t_init; i < len; i += i_stride, t += t_stride) {
+    for (diff_t i = i_init, t = t_init; i < len; i += i_stride, t += t_stride)
+    {
 
       // Assign our new tiled segment
       segment = orig_segment.slice(i, chunk_size);
@@ -246,61 +255,60 @@ struct SyclStatementExecutor<
   }
 };
 
-
-
 /*!
  * A specialized RAJA::kernel sycl_impl executor for statement::TileTCount
  * Assigns the tile segment to segment ArgumentId
  * Assigns the tile index to param ParamId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename ParamId,
-          camp::idx_t chunk_size,
-          int ThreadDim,
-          typename ... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename ParamId,
+         camp::idx_t chunk_size,
+         int ThreadDim,
+         typename... EnclosedStmts,
+         typename Types>
 struct SyclStatementExecutor<
-  Data,
-  statement::TileTCount<ArgumentId, ParamId,
-                        RAJA::tile_fixed<chunk_size>,
-                        sycl_local_012_direct<ThreadDim>,
-                        EnclosedStmts ...>,
-                        Types>
-  : public SyclStatementExecutor<
     Data,
-    statement::Tile<ArgumentId,
-                    RAJA::tile_fixed<chunk_size>,
-                    sycl_local_012_direct<ThreadDim>,
-                    EnclosedStmts ...>,
-                    Types> {
-
-  using Base = SyclStatementExecutor<
+    statement::TileTCount<ArgumentId,
+                          ParamId,
+                          RAJA::tile_fixed<chunk_size>,
+                          sycl_local_012_direct<ThreadDim>,
+                          EnclosedStmts...>,
+    Types>
+    : public SyclStatementExecutor<
           Data,
           statement::Tile<ArgumentId,
                           RAJA::tile_fixed<chunk_size>,
                           sycl_local_012_direct<ThreadDim>,
-                          EnclosedStmts ...>,
-                          Types>;
+                          EnclosedStmts...>,
+          Types>
+{
+
+  using Base =
+      SyclStatementExecutor<Data,
+                            statement::Tile<ArgumentId,
+                                            RAJA::tile_fixed<chunk_size>,
+                                            sycl_local_012_direct<ThreadDim>,
+                                            EnclosedStmts...>,
+                            Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data,
+                                      cl::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Keep copy of original segment, so we can restore it
-    using segment_t = camp::decay<decltype(segment)>;
+    using segment_t        = camp::decay<decltype(segment)>;
     segment_t orig_segment = segment;
 
     // compute trip count
     diff_t len = segment.end() - segment.begin();
-    //diff_t t = get_sycl_dim<ThreadDim>(threadIdx);
+    // diff_t t = get_sycl_dim<ThreadDim>(threadIdx);
     diff_t t = item.get_local_id(ThreadDim);
     diff_t i = t * chunk_size;
 
@@ -310,7 +318,7 @@ struct SyclStatementExecutor<
 
     // Assign our new tiled segment
     diff_t slice_size = have_work ? chunk_size : 0;
-    segment = orig_segment.slice(i, slice_size);
+    segment           = orig_segment.slice(i, slice_size);
     data.template assign_param<ParamId>(t);
 
     // execute enclosed statements
@@ -321,68 +329,69 @@ struct SyclStatementExecutor<
   }
 };
 
-
 /*!
  * A specialized RAJA::kernel sycl_impl executor for statement::TileTCount
  * Assigns the tile segment to segment ArgumentId
  * Assigns the tile index to param ParamId
  */
-template <typename Data,
-          camp::idx_t ArgumentId,
-          typename ParamId,
-          camp::idx_t chunk_size,
-          int ThreadDim,
-          typename ... EnclosedStmts,
-          typename Types>
+template<typename Data,
+         camp::idx_t ArgumentId,
+         typename ParamId,
+         camp::idx_t chunk_size,
+         int ThreadDim,
+         typename... EnclosedStmts,
+         typename Types>
 struct SyclStatementExecutor<
-  Data,
-  statement::TileTCount<ArgumentId, ParamId,
-                        RAJA::tile_fixed<chunk_size>,
-                        sycl_local_012_loop<ThreadDim>,
-                        EnclosedStmts ...>,
-                        Types>
-  : public SyclStatementExecutor<
     Data,
-    statement::Tile<ArgumentId,
-                    RAJA::tile_fixed<chunk_size>,
-                    sycl_local_012_loop<ThreadDim>,
-                    EnclosedStmts ...>,
-                    Types> {
-
-  using Base = SyclStatementExecutor<
+    statement::TileTCount<ArgumentId,
+                          ParamId,
+                          RAJA::tile_fixed<chunk_size>,
+                          sycl_local_012_loop<ThreadDim>,
+                          EnclosedStmts...>,
+    Types>
+    : public SyclStatementExecutor<
           Data,
           statement::Tile<ArgumentId,
                           RAJA::tile_fixed<chunk_size>,
                           sycl_local_012_loop<ThreadDim>,
-                          EnclosedStmts ...>,
-                          Types>;
+                          EnclosedStmts...>,
+          Types>
+{
+
+  using Base =
+      SyclStatementExecutor<Data,
+                            statement::Tile<ArgumentId,
+                                            RAJA::tile_fixed<chunk_size>,
+                                            sycl_local_012_loop<ThreadDim>,
+                                            EnclosedStmts...>,
+                            Types>;
 
-  using typename Base::enclosed_stmts_t;
   using typename Base::diff_t;
+  using typename Base::enclosed_stmts_t;
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data,
+                                      cl::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     // Get the segment referenced by this Tile statement
-    auto &segment = camp::get<ArgumentId>(data.segment_tuple);
+    auto& segment = camp::get<ArgumentId>(data.segment_tuple);
 
     // Keep copy of original segment, so we can restore it
-    using segment_t = camp::decay<decltype(segment)>;
+    using segment_t        = camp::decay<decltype(segment)>;
     segment_t orig_segment = segment;
 
     // compute trip count
     diff_t len = segment_length<ArgumentId>(data);
-//    diff_t t_init = get_sycl_dim<ThreadDim>(threadIdx);
+    //    diff_t t_init = get_sycl_dim<ThreadDim>(threadIdx);
     diff_t t_init = item.get_local_id(ThreadDim);
     diff_t i_init = t_init * chunk_size;
-//    diff_t t_stride = get_sycl_dim<ThreadDim>(blockDim);
+    //    diff_t t_stride = get_sycl_dim<ThreadDim>(blockDim);
     diff_t t_stride = item.get_local_range(ThreadDim);
     diff_t i_stride = t_stride * chunk_size;
 
     // Iterate through grid stride of chunks
-    for(diff_t ii = 0, t = t_init; ii < len; ii += i_stride, t += t_stride) {
+    for (diff_t ii = 0, t = t_init; ii < len; ii += i_stride, t += t_stride)
+    {
       diff_t i = ii + i_init;
 
       // execute enclosed statements if any thread will
@@ -391,7 +400,7 @@ struct SyclStatementExecutor<
 
       // Assign our new tiled segment
       diff_t slice_size = have_work ? chunk_size : 0;
-      segment = orig_segment.slice(i, slice_size);
+      segment           = orig_segment.slice(i, slice_size);
       data.template assign_param<ParamId>(t);
 
       // execute enclosed statements
diff --git a/include/RAJA/policy/sycl/kernel/internal.hpp b/include/RAJA/policy/sycl/kernel/internal.hpp
index 56e3a9aa1e..36f188fcce 100644
--- a/include/RAJA/policy/sycl/kernel/internal.hpp
+++ b/include/RAJA/policy/sycl/kernel/internal.hpp
@@ -43,7 +43,8 @@ namespace internal
 {
 
 // LaunchDims and Helper functions
-struct LaunchDims {
+struct LaunchDims
+{
   sycl_dim_3_t group;
   sycl_dim_3_t local;
   sycl_dim_3_t global;
@@ -51,23 +52,27 @@ struct LaunchDims {
   sycl_dim_3_t min_locals;
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
-  LaunchDims() : group{0,0,0},
-                 local{1,1,1},
-                 global{1,1,1},
-                 min_groups{0,0,0},
-                 min_locals{0,0,0} {}
+  LaunchDims()
+      : group {0, 0, 0},
+        local {1, 1, 1},
+        global {1, 1, 1},
+        min_groups {0, 0, 0},
+        min_locals {0, 0, 0}
+  {}
 
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
-  LaunchDims(LaunchDims const &c) : group(c.group),
-                                    local(c.local),
-                                    global(c.global)
-  {
-  }
+  LaunchDims(LaunchDims const& c)
+      : group(c.group),
+        local(c.local),
+        global(c.global)
+  {}
 
   RAJA_INLINE
-  LaunchDims max(LaunchDims const &c) const
+  LaunchDims max(LaunchDims const& c) const
   {
     LaunchDims result;
 
@@ -86,97 +91,125 @@ struct LaunchDims {
     return result;
   }
 
-  cl::sycl::nd_range<3> fit_nd_range(::sycl::queue* q) {
+  cl::sycl::nd_range<3> fit_nd_range(::sycl::queue* q)
+  {
 
     sycl_dim_3_t launch_global;
 
-    sycl_dim_3_t launch_local {1,1,1};
-    launch_local.x = std::max(launch_local.x, local.x); 
+    sycl_dim_3_t launch_local {1, 1, 1};
+    launch_local.x = std::max(launch_local.x, local.x);
     launch_local.y = std::max(launch_local.y, local.y);
     launch_local.z = std::max(launch_local.z, local.z);
 
     cl::sycl::device dev = q->get_device();
 
-    auto max_work_group_size = dev.get_info< ::cl::sycl::info::device::max_work_group_size>();
+    auto max_work_group_size =
+        dev.get_info<::cl::sycl::info::device::max_work_group_size>();
 
-    if(launch_local.x > max_work_group_size) {
+    if (launch_local.x > max_work_group_size)
+    {
       launch_local.x = max_work_group_size;
     }
-    if(launch_local.y > max_work_group_size) {
+    if (launch_local.y > max_work_group_size)
+    {
       launch_local.y = max_work_group_size;
     }
-    if(launch_local.z > max_work_group_size) {
+    if (launch_local.z > max_work_group_size)
+    {
       launch_local.z = max_work_group_size;
     }
 
 
     // Make sure the multiple of locals fits
     // Prefer larger z -> y -> x
-    if(launch_local.x * launch_local.y * launch_local.z > max_work_group_size) {
+    if (launch_local.x * launch_local.y * launch_local.z > max_work_group_size)
+    {
       int remaining = 1;
       // local z cannot be > max_wrk from above
-      // if equal then remaining is 1, on handle < 
-      if(max_work_group_size > launch_local.z) {
+      // if equal then remaining is 1, only handle <
+      if (max_work_group_size > launch_local.z)
+      {
         // keep local z
         remaining = max_work_group_size / launch_local.z;
       }
-      if(remaining >= launch_local.y) {
+      if (remaining >= launch_local.y)
+      {
         // keep local y
         remaining = remaining / launch_local.y;
-      } else {
+      }
+      else
+      {
         launch_local.y = remaining;
-        remaining = remaining / launch_local.y;
+        remaining      = remaining / launch_local.y;
       }
-      if(remaining < launch_local.x) {
+      if (remaining < launch_local.x)
+      {
         launch_local.x = remaining;
       }
     }
 
 
     // User gave group policy, use to calculate global space
-    if (group.x != 0 || group.y != 0 || group.z != 0) {
-      sycl_dim_3_t launch_group {1,1,1};
+    if (group.x != 0 || group.y != 0 || group.z != 0)
+    {
+      sycl_dim_3_t launch_group {1, 1, 1};
       launch_group.x = std::max(launch_group.x, group.x);
       launch_group.y = std::max(launch_group.y, group.y);
       launch_group.z = std::max(launch_group.z, group.z);
 
       launch_global.x = launch_local.x * launch_group.x;
-      launch_global.y = launch_local.y * launch_group.y; 
+      launch_global.y = launch_local.y * launch_group.y;
       launch_global.z = launch_local.z * launch_group.z;
-    } else {
-      launch_global.x = launch_local.x * ((global.x + (launch_local.x - 1)) / launch_local.x);
-      launch_global.y = launch_local.y * ((global.y + (launch_local.y - 1)) / launch_local.y);
-      launch_global.z = launch_local.z * ((global.z + (launch_local.z - 1)) / launch_local.z);
+    }
+    else
+    {
+      launch_global.x =
+          launch_local.x * ((global.x + (launch_local.x - 1)) / launch_local.x);
+      launch_global.y =
+          launch_local.y * ((global.y + (launch_local.y - 1)) / launch_local.y);
+      launch_global.z =
+          launch_local.z * ((global.z + (launch_local.z - 1)) / launch_local.z);
     }
 
 
-    if(launch_global.x % launch_local.x != 0) {
-      launch_global.x = ((launch_global.x / launch_local.x) + 1) * launch_local.x; 
+    if (launch_global.x % launch_local.x != 0)
+    {
+      launch_global.x =
+          ((launch_global.x / launch_local.x) + 1) * launch_local.x;
     }
-    if(launch_global.y % launch_local.y != 0) {
-      launch_global.y = ((launch_global.y / launch_local.y) + 1) * launch_local.y; 
+    if (launch_global.y % launch_local.y != 0)
+    {
+      launch_global.y =
+          ((launch_global.y / launch_local.y) + 1) * launch_local.y;
     }
-    if(launch_global.z % launch_local.z != 0) {
-      launch_global.z = ((launch_global.z / launch_local.z) + 1) * launch_local.z; 
+    if (launch_global.z % launch_local.z != 0)
+    {
+      launch_global.z =
+          ((launch_global.z / launch_local.z) + 1) * launch_local.z;
     }
 
-    cl::sycl::range<3> ret_th = {launch_local.x, launch_local.y, launch_local.z};
-    cl::sycl::range<3> ret_gl = {launch_global.x, launch_global.y, launch_global.z};
+    cl::sycl::range<3> ret_th = {launch_local.x, launch_local.y,
+                                 launch_local.z};
+    cl::sycl::range<3> ret_gl = {launch_global.x, launch_global.y,
+                                 launch_global.z};
 
     return cl::sycl::nd_range<3>(ret_gl, ret_th);
   }
 };
 
-template <camp::idx_t cur_stmt, camp::idx_t num_stmts, typename StmtList>
-struct SyclStatementListExecutorHelper {
+template<camp::idx_t cur_stmt, camp::idx_t num_stmts, typename StmtList>
+struct SyclStatementListExecutorHelper
+{
 
   using next_helper_t =
       SyclStatementListExecutorHelper<cur_stmt + 1, num_stmts, StmtList>;
 
   using cur_stmt_t = camp::at_v<StmtList, cur_stmt>;
 
-  template <typename Data>
-  inline static RAJA_DEVICE void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  template<typename Data>
+  inline static RAJA_DEVICE void exec(Data& data,
+                                      cl::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     // Execute stmt
     cur_stmt_t::exec(data, item, thread_active);
@@ -185,8 +218,8 @@ struct SyclStatementListExecutorHelper {
     next_helper_t::exec(data, item, thread_active);
   }
 
-  template <typename Data>
-  inline static LaunchDims calculateDimensions(Data &data)
+  template<typename Data>
+  inline static LaunchDims calculateDimensions(Data& data)
   {
     // Compute this statements launch dimensions
     LaunchDims statement_dims = cur_stmt_t::calculateDimensions(data);
@@ -199,61 +232,58 @@ struct SyclStatementListExecutorHelper {
   }
 };
 
-template <camp::idx_t num_stmts, typename StmtList>
-struct SyclStatementListExecutorHelper<num_stmts, num_stmts, StmtList> {
+template<camp::idx_t num_stmts, typename StmtList>
+struct SyclStatementListExecutorHelper<num_stmts, num_stmts, StmtList>
+{
 
-  template <typename Data>
-  inline static RAJA_DEVICE void exec(Data &, cl::sycl::nd_item<3> item, bool)
+  template<typename Data>
+  inline static RAJA_DEVICE void exec(Data&, cl::sycl::nd_item<3> item, bool)
   {
     // nop terminator
   }
 
-  template <typename Data>
-  inline static LaunchDims calculateDimensions(Data &)
+  template<typename Data>
+  inline static LaunchDims calculateDimensions(Data&)
   {
     return LaunchDims();
   }
 };
 
-template <typename Data, typename Policy, typename Types>
+template<typename Data, typename Policy, typename Types>
 struct SyclStatementExecutor;
 
-template <typename Data, typename StmtList, typename Types>
+template<typename Data, typename StmtList, typename Types>
 struct SyclStatementListExecutor;
 
-
-template <typename Data, typename... Stmts, typename Types>
-struct SyclStatementListExecutor<Data, StatementList<Stmts...>, Types> {
+template<typename Data, typename... Stmts, typename Types>
+struct SyclStatementListExecutor<Data, StatementList<Stmts...>, Types>
+{
 
   using enclosed_stmts_t =
       camp::list<SyclStatementExecutor<Data, Stmts, Types>...>;
 
   static constexpr size_t num_stmts = sizeof...(Stmts);
 
-  static
-  inline
-  RAJA_DEVICE
-  void exec(Data &data, cl::sycl::nd_item<3> item, bool thread_active)
+  static inline RAJA_DEVICE void exec(Data& data,
+                                      cl::sycl::nd_item<3> item,
+                                      bool thread_active)
   {
     // Execute statements in order with helper class
-    SyclStatementListExecutorHelper<0, num_stmts, enclosed_stmts_t>::exec(data, item, thread_active);
+    SyclStatementListExecutorHelper<0, num_stmts, enclosed_stmts_t>::exec(
+        data, item, thread_active);
   }
 
-  static
-  inline
-  LaunchDims calculateDimensions(Data const &data)
+  static inline LaunchDims calculateDimensions(Data const& data)
   {
     // Compute this statements launch dimensions
-    return SyclStatementListExecutorHelper<0, num_stmts, enclosed_stmts_t>::
-        calculateDimensions(data);
+    return SyclStatementListExecutorHelper<
+        0, num_stmts, enclosed_stmts_t>::calculateDimensions(data);
   }
 };
 
-template <typename StmtList, typename Data, typename Types>
-using sycl_statement_list_executor_t = SyclStatementListExecutor<
-    Data,
-    StmtList,
-    Types>;
+template<typename StmtList, typename Data, typename Types>
+using sycl_statement_list_executor_t =
+    SyclStatementListExecutor<Data, StmtList, Types>;
 
 }  // namespace internal
 }  // namespace RAJA
diff --git a/include/RAJA/policy/sycl/launch.hpp b/include/RAJA/policy/sycl/launch.hpp
index ad9fecc222..78804771ca 100644
--- a/include/RAJA/policy/sycl/launch.hpp
+++ b/include/RAJA/policy/sycl/launch.hpp
@@ -22,23 +22,30 @@
 #include "RAJA/pattern/detail/privatizer.hpp"
 #include "RAJA/policy/sycl/policy.hpp"
 #include "RAJA/policy/sycl/MemUtils_SYCL.hpp"
-//#include "RAJA/policy/sycl/raja_syclerrchk.hpp"
+// #include "RAJA/policy/sycl/raja_syclerrchk.hpp"
 #include "RAJA/util/resource.hpp"
 
 namespace RAJA
 {
 
-template <bool async>
-struct LaunchExecute<RAJA::sycl_launch_t<async, 0>> {
-
- //If the launch lambda is trivially copyable
-  template <typename BODY_IN, typename ReduceParams,
-	    typename std::enable_if<std::is_trivially_copyable<BODY_IN>{},bool>::type = true>
-    static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                                 RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                                 RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
-  exec(RAJA::resources::Resource res, const LaunchParams &params, const char *kernel_name,
-       BODY_IN &&body_in, ReduceParams &RAJA_UNUSED_ARG(launch_reducers))
+template<bool async>
+struct LaunchExecute<RAJA::sycl_launch_t<async, 0>>
+{
+
+  // If the launch lambda is trivially copyable
+  template<typename BODY_IN,
+           typename ReduceParams,
+           typename std::enable_if<std::is_trivially_copyable<BODY_IN> {},
+                                   bool>::type = true>
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
+  exec(RAJA::resources::Resource res,
+       const LaunchParams& params,
+       const char* kernel_name,
+       BODY_IN&& body_in,
+       ReduceParams& RAJA_UNUSED_ARG(launch_reducers))
   {
 
     /*Get the queue from concrete resource */
@@ -49,57 +56,67 @@ struct LaunchExecute<RAJA::sycl_launch_t<async, 0>> {
     //
 
     const ::sycl::range<3> blockSize(params.threads.value[2],
-				     params.threads.value[1],
-				     params.threads.value[0]);
+                                     params.threads.value[1],
+                                     params.threads.value[0]);
 
-    const ::sycl::range<3> gridSize(params.threads.value[2] * params.teams.value[2],
-				    params.threads.value[1] * params.teams.value[1],
-				    params.threads.value[0] * params.teams.value[0]);
+    const ::sycl::range<3> gridSize(
+        params.threads.value[2] * params.teams.value[2],
+        params.threads.value[1] * params.teams.value[1],
+        params.threads.value[0] * params.teams.value[0]);
 
     // Only launch kernel if we have something to iterate over
     constexpr size_t zero = 0;
-    if ( params.threads.value[0]  > zero && params.threads.value[1]  > zero && params.threads.value[2] > zero &&
-         params.teams.value[0] > zero && params.teams.value[1] > zero && params.teams.value[2]> zero ) {
+    if (params.threads.value[0] > zero && params.threads.value[1] > zero &&
+        params.threads.value[2] > zero && params.teams.value[0] > zero &&
+        params.teams.value[1] > zero && params.teams.value[2] > zero)
+    {
 
       RAJA_FT_BEGIN;
 
       q->submit([&](cl::sycl::handler& h) {
+        auto s_vec = ::sycl::local_accessor<char, 1>(params.shared_mem_size, h);
 
-        auto s_vec = ::sycl::local_accessor<char, 1> (params.shared_mem_size, h);
-
-        h.parallel_for
-          (cl::sycl::nd_range<3>(gridSize, blockSize),
-           [=] (cl::sycl::nd_item<3> itm) {
-
-            LaunchContext ctx;
-            ctx.itm = &itm;
+        h.parallel_for(
+            cl::sycl::nd_range<3>(gridSize, blockSize),
+            [=](cl::sycl::nd_item<3> itm) {
+              LaunchContext ctx;
+              ctx.itm = &itm;
 
-            //Point to shared memory
-            ctx.shared_mem_ptr = s_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
-
-            body_in(ctx);
-
-           });
+              // Point to shared memory
+              ctx.shared_mem_ptr =
+                  s_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
 
+              body_in(ctx);
+            });
       });
 
-    if (!async) { q->wait(); }
+      if (!async)
+      {
+        q->wait();
+      }
 
       RAJA_FT_END;
-
     }
 
     return resources::EventProxy<resources::Resource>(res);
   }
 
- //If the launch lambda is trivially copyable and we have explcit reduction parameters
-  template <typename BODY_IN, typename ReduceParams,
-	    typename std::enable_if<std::is_trivially_copyable<BODY_IN>{},bool>::type = true>
-    static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                                 RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                                 concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
-  exec(RAJA::resources::Resource res, const LaunchParams &launch_params, const char *kernel_name,
-       BODY_IN &&body_in, ReduceParams launch_reducers)
+  // If the launch lambda is trivially copyable and we have explicit reduction
+  // parameters
+  template<typename BODY_IN,
+           typename ReduceParams,
+           typename std::enable_if<std::is_trivially_copyable<BODY_IN> {},
+                                   bool>::type = true>
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      concepts::negate<
+          RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
+  exec(RAJA::resources::Resource res,
+       const LaunchParams& launch_params,
+       const char* kernel_name,
+       BODY_IN&& body_in,
+       ReduceParams launch_reducers)
   {
 
     /*Get the queue from concrete resource */
@@ -112,57 +129,60 @@ struct LaunchExecute<RAJA::sycl_launch_t<async, 0>> {
     // Compute the number of blocks and threads
     //
     const ::sycl::range<3> blockSize(launch_params.threads.value[2],
-				     launch_params.threads.value[1],
-				     launch_params.threads.value[0]);
+                                     launch_params.threads.value[1],
+                                     launch_params.threads.value[0]);
 
-    const ::sycl::range<3> gridSize(launch_params.threads.value[2] * launch_params.teams.value[2],
-				    launch_params.threads.value[1] * launch_params.teams.value[1],
-				    launch_params.threads.value[0] * launch_params.teams.value[0]);
+    const ::sycl::range<3> gridSize(
+        launch_params.threads.value[2] * launch_params.teams.value[2],
+        launch_params.threads.value[1] * launch_params.teams.value[1],
+        launch_params.threads.value[0] * launch_params.teams.value[0]);
 
     // Only launch kernel if we have something to iterate over
     constexpr size_t zero = 0;
-    if ( launch_params.threads.value[0]  > zero && launch_params.threads.value[1]  > zero && launch_params.threads.value[2] > zero &&
-         launch_params.teams.value[0] > zero && launch_params.teams.value[1] > zero && launch_params.teams.value[2]> zero ) {
+    if (launch_params.threads.value[0] > zero &&
+        launch_params.threads.value[1] > zero &&
+        launch_params.threads.value[2] > zero &&
+        launch_params.teams.value[0] > zero &&
+        launch_params.teams.value[1] > zero &&
+        launch_params.teams.value[2] > zero)
+    {
 
 
-      auto combiner = []( ReduceParams x, ReduceParams y ) {
-        RAJA::expt::ParamMultiplexer::combine<EXEC_POL>( x, y );
+      auto combiner = [](ReduceParams x, ReduceParams y) {
+        RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(x, y);
         return x;
-       };
+      };
 
       RAJA_FT_BEGIN;
 
-      ReduceParams* res = ::sycl::malloc_shared<ReduceParams>(1,*q);
+      ReduceParams* res = ::sycl::malloc_shared<ReduceParams>(1, *q);
       RAJA::expt::ParamMultiplexer::init<EXEC_POL>(*res);
       auto reduction = ::sycl::reduction(res, launch_reducers, combiner);
 
       q->submit([&](cl::sycl::handler& h) {
+         auto s_vec =
+             ::sycl::local_accessor<char, 1>(launch_params.shared_mem_size, h);
 
-       auto s_vec = ::sycl::local_accessor<char, 1> (launch_params.shared_mem_size, h);
-
-        h.parallel_for
-          (cl::sycl::nd_range<3>(gridSize, blockSize),
-           reduction,
-           [=] (cl::sycl::nd_item<3> itm, auto & red) {
-
-            LaunchContext ctx;
-            ctx.itm = &itm;
+         h.parallel_for(
+             cl::sycl::nd_range<3>(gridSize, blockSize), reduction,
+             [=](cl::sycl::nd_item<3> itm, auto& red) {
+               LaunchContext ctx;
+               ctx.itm = &itm;
 
-            //Point to shared memory
-            ctx.shared_mem_ptr = s_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
+               // Point to shared memory
+               ctx.shared_mem_ptr =
+                   s_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
 
-            ReduceParams fp;
-            RAJA::expt::ParamMultiplexer::init<EXEC_POL>(fp);
+               ReduceParams fp;
+               RAJA::expt::ParamMultiplexer::init<EXEC_POL>(fp);
 
-            RAJA::expt::invoke_body(fp, body_in, ctx);
+               RAJA::expt::invoke_body(fp, body_in, ctx);
 
-            red.combine(fp);
+               red.combine(fp);
+             });
+       }).wait();  // Need to wait for completion to free memory
 
-           });
-
-      }).wait(); // Need to wait for completion to free memory
-
-      RAJA::expt::ParamMultiplexer::combine<EXEC_POL>( launch_reducers, *res );
+      RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(launch_reducers, *res);
       ::sycl::free(res, *q);
 
       RAJA_FT_END;
@@ -170,17 +190,23 @@ struct LaunchExecute<RAJA::sycl_launch_t<async, 0>> {
 
     RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(launch_reducers);
 
-   return resources::EventProxy<resources::Resource>(res);
+    return resources::EventProxy<resources::Resource>(res);
   }
 
-  //If the launch lambda is not trivially copyable
-  template <typename BODY_IN, typename ReduceParams,
-	    typename std::enable_if<!std::is_trivially_copyable<BODY_IN>{},bool>::type = true>
-    static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                                 RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                                 RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
-  exec(RAJA::resources::Resource res, const LaunchParams &params, const char *kernel_name,
-       BODY_IN &&body_in, ReduceParams &RAJA_UNUSED_ARG(launch_reducers))
+  // If the launch lambda is not trivially copyable
+  template<typename BODY_IN,
+           typename ReduceParams,
+           typename std::enable_if<!std::is_trivially_copyable<BODY_IN> {},
+                                   bool>::type = true>
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>
+  exec(RAJA::resources::Resource res,
+       const LaunchParams& params,
+       const char* kernel_name,
+       BODY_IN&& body_in,
+       ReduceParams& RAJA_UNUSED_ARG(launch_reducers))
   {
 
     /*Get the queue from concrete resource */
@@ -191,67 +217,73 @@ struct LaunchExecute<RAJA::sycl_launch_t<async, 0>> {
     //
 
     const ::sycl::range<3> blockSize(params.threads.value[2],
-				     params.threads.value[1],
-				     params.threads.value[0]);
+                                     params.threads.value[1],
+                                     params.threads.value[0]);
 
-    const ::sycl::range<3> gridSize(params.threads.value[2] * params.teams.value[2],
-				    params.threads.value[1] * params.teams.value[1],
-				    params.threads.value[0] * params.teams.value[0]);
+    const ::sycl::range<3> gridSize(
+        params.threads.value[2] * params.teams.value[2],
+        params.threads.value[1] * params.teams.value[1],
+        params.threads.value[0] * params.teams.value[0]);
 
     // Only launch kernel if we have something to iterate over
     constexpr size_t zero = 0;
-    if ( params.threads.value[0]  > zero && params.threads.value[1]  > zero && params.threads.value[2] > zero &&
-         params.teams.value[0] > zero && params.teams.value[1] > zero && params.teams.value[2]> zero ) {
+    if (params.threads.value[0] > zero && params.threads.value[1] > zero &&
+        params.threads.value[2] > zero && params.teams.value[0] > zero &&
+        params.teams.value[1] > zero && params.teams.value[2] > zero)
+    {
 
       RAJA_FT_BEGIN;
 
       //
-      // Kernel body is nontrivially copyable, create space on device and copy to
-      // Workaround until "is_device_copyable" is supported
+      // Kernel body is nontrivially copyable, create space on device and
+      // copy to it. Workaround until "is_device_copyable" is supported.
       //
       using LOOP_BODY = camp::decay<BODY_IN>;
       LOOP_BODY* lbody;
-      lbody = (LOOP_BODY*) cl::sycl::malloc_device(sizeof(LOOP_BODY), *q);
+      lbody = (LOOP_BODY*)cl::sycl::malloc_device(sizeof(LOOP_BODY), *q);
       q->memcpy(lbody, &body_in, sizeof(LOOP_BODY)).wait();
 
       q->submit([&](cl::sycl::handler& h) {
+         auto s_vec =
+             ::sycl::local_accessor<char, 1>(params.shared_mem_size, h);
 
-        auto s_vec = ::sycl::local_accessor<char, 1> (params.shared_mem_size, h);
+         h.parallel_for(
+             cl::sycl::nd_range<3>(gridSize, blockSize),
+             [=](cl::sycl::nd_item<3> itm) {
+               LaunchContext ctx;
+               ctx.itm = &itm;
 
-        h.parallel_for
-          (cl::sycl::nd_range<3>(gridSize, blockSize),
-           [=] (cl::sycl::nd_item<3> itm) {
+               // Point to shared memory
+               ctx.shared_mem_ptr =
+                   s_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
 
-            LaunchContext ctx;
-            ctx.itm = &itm;
-
-            //Point to shared memory
-            ctx.shared_mem_ptr = s_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
-
-            (*lbody)(ctx);
-
-           });
-
-      }).wait(); // Need to wait for completion to free memory
+               (*lbody)(ctx);
+             });
+       }).wait();  // Need to wait for completion to free memory
 
       cl::sycl::free(lbody, *q);
 
       RAJA_FT_END;
-
     }
 
     return resources::EventProxy<resources::Resource>(res);
   }
 
-
-  //If the launch lambda is not trivially copyable
-  template <typename BODY_IN, typename ReduceParams,
-	    typename std::enable_if<!std::is_trivially_copyable<BODY_IN>{},bool>::type = true>
-    static concepts::enable_if_t<resources::EventProxy<resources::Resource>,
-                                 RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
-                                 concepts::negate<RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
-    exec(RAJA::resources::Resource res, const LaunchParams &launch_params, const char *kernel_name,
-         BODY_IN &&body_in, ReduceParams launch_reducers)
+  // If the launch lambda is not trivially copyable
+  template<typename BODY_IN,
+           typename ReduceParams,
+           typename std::enable_if<!std::is_trivially_copyable<BODY_IN> {},
+                                   bool>::type = true>
+  static concepts::enable_if_t<
+      resources::EventProxy<resources::Resource>,
+      RAJA::expt::type_traits::is_ForallParamPack<ReduceParams>,
+      concepts::negate<
+          RAJA::expt::type_traits::is_ForallParamPack_empty<ReduceParams>>>
+  exec(RAJA::resources::Resource res,
+       const LaunchParams& launch_params,
+       const char* kernel_name,
+       BODY_IN&& body_in,
+       ReduceParams launch_reducers)
   {
 
     /*Get the queue from concrete resource */
@@ -264,66 +296,69 @@ struct LaunchExecute<RAJA::sycl_launch_t<async, 0>> {
     // Compute the number of blocks and threads
     //
     const ::sycl::range<3> blockSize(launch_params.threads.value[2],
-				     launch_params.threads.value[1],
-				     launch_params.threads.value[0]);
+                                     launch_params.threads.value[1],
+                                     launch_params.threads.value[0]);
 
-    const ::sycl::range<3> gridSize(launch_params.threads.value[2] * launch_params.teams.value[2],
-				    launch_params.threads.value[1] * launch_params.teams.value[1],
-				    launch_params.threads.value[0] * launch_params.teams.value[0]);
+    const ::sycl::range<3> gridSize(
+        launch_params.threads.value[2] * launch_params.teams.value[2],
+        launch_params.threads.value[1] * launch_params.teams.value[1],
+        launch_params.threads.value[0] * launch_params.teams.value[0]);
 
     // Only launch kernel if we have something to iterate over
     constexpr size_t zero = 0;
-    if ( launch_params.threads.value[0]  > zero && launch_params.threads.value[1]  > zero && launch_params.threads.value[2] > zero &&
-         launch_params.teams.value[0] > zero && launch_params.teams.value[1] > zero && launch_params.teams.value[2]> zero ) {
+    if (launch_params.threads.value[0] > zero &&
+        launch_params.threads.value[1] > zero &&
+        launch_params.threads.value[2] > zero &&
+        launch_params.teams.value[0] > zero &&
+        launch_params.teams.value[1] > zero &&
+        launch_params.teams.value[2] > zero)
+    {
 
 
-      auto combiner = []( ReduceParams x, ReduceParams y ) {
-        RAJA::expt::ParamMultiplexer::combine<EXEC_POL>( x, y );
+      auto combiner = [](ReduceParams x, ReduceParams y) {
+        RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(x, y);
         return x;
-       };
+      };
 
       RAJA_FT_BEGIN;
 
       //
-      // Kernel body is nontrivially copyable, create space on device and copy to
-      // Workaround until "is_device_copyable" is supported
+      // Kernel body is nontrivially copyable, create space on device and
+      // copy to it. Workaround until "is_device_copyable" is supported.
       //
       using LOOP_BODY = camp::decay<BODY_IN>;
       LOOP_BODY* lbody;
-      lbody = (LOOP_BODY*) cl::sycl::malloc_device(sizeof(LOOP_BODY), *q);
+      lbody = (LOOP_BODY*)cl::sycl::malloc_device(sizeof(LOOP_BODY), *q);
       q->memcpy(lbody, &body_in, sizeof(LOOP_BODY)).wait();
 
-      ReduceParams* res = ::sycl::malloc_shared<ReduceParams>(1,*q);
+      ReduceParams* res = ::sycl::malloc_shared<ReduceParams>(1, *q);
       RAJA::expt::ParamMultiplexer::init<EXEC_POL>(*res);
       auto reduction = ::sycl::reduction(res, launch_reducers, combiner);
 
       q->submit([&](cl::sycl::handler& h) {
+         auto s_vec =
+             ::sycl::local_accessor<char, 1>(launch_params.shared_mem_size, h);
 
-       auto s_vec = ::sycl::local_accessor<char, 1> (launch_params.shared_mem_size, h);
-
-        h.parallel_for
-          (cl::sycl::nd_range<3>(gridSize, blockSize),
-           reduction,
-           [=] (cl::sycl::nd_item<3> itm, auto & red) {
+         h.parallel_for(
+             cl::sycl::nd_range<3>(gridSize, blockSize), reduction,
+             [=](cl::sycl::nd_item<3> itm, auto& red) {
+               LaunchContext ctx;
+               ctx.itm = &itm;
 
-            LaunchContext ctx;
-            ctx.itm = &itm;
+               // Point to shared memory
+               ctx.shared_mem_ptr =
+                   s_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
 
-            //Point to shared memory
-            ctx.shared_mem_ptr = s_vec.get_multi_ptr<::sycl::access::decorated::yes>().get();
+               ReduceParams fp;
+               RAJA::expt::ParamMultiplexer::init<EXEC_POL>(fp);
 
-            ReduceParams fp;
-            RAJA::expt::ParamMultiplexer::init<EXEC_POL>(fp);
+               RAJA::expt::invoke_body(fp, *lbody, ctx);
 
-            RAJA::expt::invoke_body(fp, *lbody, ctx);
+               red.combine(fp);
+             });
+       }).wait();  // Need to wait for completion to free memory
 
-            red.combine(fp);
-
-           });
-
-      }).wait(); // Need to wait for completion to free memory
-
-      RAJA::expt::ParamMultiplexer::combine<EXEC_POL>( launch_reducers, *res );
+      RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(launch_reducers, *res);
       ::sycl::free(res, *q);
       cl::sycl::free(lbody, *q);
 
@@ -332,69 +367,65 @@ struct LaunchExecute<RAJA::sycl_launch_t<async, 0>> {
 
     RAJA::expt::ParamMultiplexer::resolve<EXEC_POL>(launch_reducers);
 
-   return resources::EventProxy<resources::Resource>(res);
+    return resources::EventProxy<resources::Resource>(res);
   }
-
 };
 
 /*
    SYCL global thread mapping
 */
-template<int ... DIM>
+template<int... DIM>
 struct sycl_global_item;
 
 using sycl_global_item_0 = sycl_global_item<0>;
 using sycl_global_item_1 = sycl_global_item<1>;
 using sycl_global_item_2 = sycl_global_item<2>;
 
-template <typename SEGMENT, int DIM>
-struct LoopExecute<sycl_global_item<DIM>, SEGMENT> {
+template<typename SEGMENT, int DIM>
+struct LoopExecute<sycl_global_item<DIM>, SEGMENT>
+{
 
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+  template<typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           SEGMENT const& segment,
+                                           BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
     {
-      const int tx =
-        ctx.itm->get_group(DIM) * ctx.itm->get_local_range(DIM) +
-        ctx.itm->get_local_id(DIM);
+      const int tx = ctx.itm->get_group(DIM) * ctx.itm->get_local_range(DIM) +
+                     ctx.itm->get_local_id(DIM);
 
       if (tx < len) body(*(segment.begin() + tx));
     }
   }
 };
 
-using sycl_global_item_01 = sycl_global_item<0,1>;
-using sycl_global_item_02 = sycl_global_item<0,2>;
-using sycl_global_item_10 = sycl_global_item<1,0>;
-using sycl_global_item_12 = sycl_global_item<1,2>;
-using sycl_global_item_20 = sycl_global_item<2,0>;
-using sycl_global_item_21 = sycl_global_item<2,1>;
-
-template <typename SEGMENT, int DIM0, int DIM1>
-struct LoopExecute<sycl_global_item<DIM0, DIM1>, SEGMENT> {
-
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+using sycl_global_item_01 = sycl_global_item<0, 1>;
+using sycl_global_item_02 = sycl_global_item<0, 2>;
+using sycl_global_item_10 = sycl_global_item<1, 0>;
+using sycl_global_item_12 = sycl_global_item<1, 2>;
+using sycl_global_item_20 = sycl_global_item<2, 0>;
+using sycl_global_item_21 = sycl_global_item<2, 1>;
+
+template<typename SEGMENT, int DIM0, int DIM1>
+struct LoopExecute<sycl_global_item<DIM0, DIM1>, SEGMENT>
+{
+
+  template<typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           SEGMENT const& segment0,
+                                           SEGMENT const& segment1,
+                                           BODY const& body)
   {
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
     {
-      const int tx =
-        ctx.itm->get_group(DIM0) * ctx.itm->get_local_range(DIM0) +
-        ctx.itm->get_local_id(DIM0);
+      const int tx = ctx.itm->get_group(DIM0) * ctx.itm->get_local_range(DIM0) +
+                     ctx.itm->get_local_id(DIM0);
 
-      const int ty =
-        ctx.itm->get_group(DIM1) * ctx.itm->get_local_range(DIM1) +
-        ctx.itm->get_local_id(DIM1);
+      const int ty = ctx.itm->get_group(DIM1) * ctx.itm->get_local_range(DIM1) +
+                     ctx.itm->get_local_id(DIM1);
 
 
       if (tx < len0 && ty < len1)
@@ -403,44 +434,39 @@ struct LoopExecute<sycl_global_item<DIM0, DIM1>, SEGMENT> {
   }
 };
 
+using sycl_global_item_012 = sycl_global_item<0, 1, 2>;
+using sycl_global_item_021 = sycl_global_item<0, 2, 1>;
+using sycl_global_item_102 = sycl_global_item<1, 0, 2>;
+using sycl_global_item_120 = sycl_global_item<1, 2, 0>;
+using sycl_global_item_201 = sycl_global_item<2, 0, 1>;
+using sycl_global_item_210 = sycl_global_item<2, 1, 0>;
 
-using sycl_global_item_012 = sycl_global_item<0,1,2>;
-using sycl_global_item_021 = sycl_global_item<0,2,1>;
-using sycl_global_item_102 = sycl_global_item<1,0,2>;
-using sycl_global_item_120 = sycl_global_item<1,2,0>;
-using sycl_global_item_201 = sycl_global_item<2,0,1>;
-using sycl_global_item_210 = sycl_global_item<2,1,0>;
-
-template <typename SEGMENT, int DIM0, int DIM1, int DIM2>
-struct LoopExecute<sycl_global_item<DIM0, DIM1, DIM2>, SEGMENT> {
+template<typename SEGMENT, int DIM0, int DIM1, int DIM2>
+struct LoopExecute<sycl_global_item<DIM0, DIM1, DIM2>, SEGMENT>
+{
 
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+  template<typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           SEGMENT const& segment0,
+                                           SEGMENT const& segment1,
+                                           SEGMENT const& segment2,
+                                           BODY const& body)
   {
     const int len2 = segment2.end() - segment2.begin();
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
     {
-      const int tx =
-        ctx.itm->get_group(DIM0) * ctx.itm->get_local_range(DIM0) +
-        ctx.itm->get_local_id(DIM0);
+      const int tx = ctx.itm->get_group(DIM0) * ctx.itm->get_local_range(DIM0) +
+                     ctx.itm->get_local_id(DIM0);
 
-      const int ty =
-        ctx.itm->get_group(DIM1) * ctx.itm->get_local_range(DIM1) +
-        ctx.itm->get_local_id(DIM1);
+      const int ty = ctx.itm->get_group(DIM1) * ctx.itm->get_local_range(DIM1) +
+                     ctx.itm->get_local_id(DIM1);
 
-      const int tz =
-        ctx.itm->get_group(DIM2) * ctx.itm->get_local_range(DIM2) +
-        ctx.itm->get_local_id(DIM2);
+      const int tz = ctx.itm->get_group(DIM2) * ctx.itm->get_local_range(DIM2) +
+                     ctx.itm->get_local_id(DIM2);
 
       if (tx < len0 && ty < len1 && tz < len2)
-        body(*(segment0.begin() + tx),
-             *(segment1.begin() + ty),
-             *(segment1.begin() + ty));
+        body(*(segment0.begin() + tx), *(segment1.begin() + ty),
+             *(segment2.begin() + tz));
     }
   }
@@ -449,56 +475,75 @@ struct LoopExecute<sycl_global_item<DIM0, DIM1, DIM2>, SEGMENT> {
 /*
 Reshape threads in a block into a 1D iteration space
 */
-template<int ... dim>
-struct sycl_flatten_group_local_direct{};
-
-using sycl_flatten_group_local_01_direct = sycl_flatten_group_local_direct<0,1>;
-using sycl_flatten_group_local_02_direct = sycl_flatten_group_local_direct<0,2>;
-using sycl_flatten_group_local_10_direct = sycl_flatten_group_local_direct<1,0>;
-using sycl_flatten_group_local_12_direct = sycl_flatten_group_local_direct<1,2>;
-using sycl_flatten_group_local_20_direct = sycl_flatten_group_local_direct<2,0>;
-using sycl_flatten_group_local_21_direct = sycl_flatten_group_local_direct<2,1>;
-
-using sycl_flatten_group_local_012_direct = sycl_flatten_group_local_direct<0,1,2>;
-using sycl_flatten_group_local_021_direct = sycl_flatten_group_local_direct<0,2,1>;
-using sycl_flatten_group_local_102_direct = sycl_flatten_group_local_direct<1,0,2>;
-using sycl_flatten_group_local_120_direct = sycl_flatten_group_local_direct<1,2,0>;
-using sycl_flatten_group_local_201_direct = sycl_flatten_group_local_direct<2,0,1>;
-using sycl_flatten_group_local_210_direct = sycl_flatten_group_local_direct<2,1,0>;
-
-template<int ... dim>
-struct sycl_flatten_group_local_loop{};
-
-using sycl_flatten_group_local_01_loop = sycl_flatten_group_local_loop<0,1>;
-using sycl_flatten_group_local_02_loop = sycl_flatten_group_local_loop<0,2>;
-using sycl_flatten_group_local_10_loop = sycl_flatten_group_local_loop<1,0>;
-using sycl_flatten_group_local_12_loop = sycl_flatten_group_local_loop<1,2>;
-using sycl_flatten_group_local_20_loop = sycl_flatten_group_local_loop<2,0>;
-using sycl_flatten_group_local_21_loop = sycl_flatten_group_local_loop<2,1>;
-
-using sycl_flatten_group_local_012_loop = sycl_flatten_group_local_loop<0,1,2>;
-using sycl_flatten_group_local_021_loop = sycl_flatten_group_local_loop<0,2,1>;
-using sycl_flatten_group_local_102_loop = sycl_flatten_group_local_loop<1,0,2>;
-using sycl_flatten_group_local_120_loop = sycl_flatten_group_local_loop<1,2,0>;
-using sycl_flatten_group_local_201_loop = sycl_flatten_group_local_loop<2,0,1>;
-using sycl_flatten_group_local_210_loop = sycl_flatten_group_local_loop<2,1,0>;
+template<int... dim>
+struct sycl_flatten_group_local_direct
+{};
+
+using sycl_flatten_group_local_01_direct =
+    sycl_flatten_group_local_direct<0, 1>;
+using sycl_flatten_group_local_02_direct =
+    sycl_flatten_group_local_direct<0, 2>;
+using sycl_flatten_group_local_10_direct =
+    sycl_flatten_group_local_direct<1, 0>;
+using sycl_flatten_group_local_12_direct =
+    sycl_flatten_group_local_direct<1, 2>;
+using sycl_flatten_group_local_20_direct =
+    sycl_flatten_group_local_direct<2, 0>;
+using sycl_flatten_group_local_21_direct =
+    sycl_flatten_group_local_direct<2, 1>;
+
+using sycl_flatten_group_local_012_direct =
+    sycl_flatten_group_local_direct<0, 1, 2>;
+using sycl_flatten_group_local_021_direct =
+    sycl_flatten_group_local_direct<0, 2, 1>;
+using sycl_flatten_group_local_102_direct =
+    sycl_flatten_group_local_direct<1, 0, 2>;
+using sycl_flatten_group_local_120_direct =
+    sycl_flatten_group_local_direct<1, 2, 0>;
+using sycl_flatten_group_local_201_direct =
+    sycl_flatten_group_local_direct<2, 0, 1>;
+using sycl_flatten_group_local_210_direct =
+    sycl_flatten_group_local_direct<2, 1, 0>;
+
+template<int... dim>
+struct sycl_flatten_group_local_loop
+{};
+
+using sycl_flatten_group_local_01_loop = sycl_flatten_group_local_loop<0, 1>;
+using sycl_flatten_group_local_02_loop = sycl_flatten_group_local_loop<0, 2>;
+using sycl_flatten_group_local_10_loop = sycl_flatten_group_local_loop<1, 0>;
+using sycl_flatten_group_local_12_loop = sycl_flatten_group_local_loop<1, 2>;
+using sycl_flatten_group_local_20_loop = sycl_flatten_group_local_loop<2, 0>;
+using sycl_flatten_group_local_21_loop = sycl_flatten_group_local_loop<2, 1>;
+
+using sycl_flatten_group_local_012_loop =
+    sycl_flatten_group_local_loop<0, 1, 2>;
+using sycl_flatten_group_local_021_loop =
+    sycl_flatten_group_local_loop<0, 2, 1>;
+using sycl_flatten_group_local_102_loop =
+    sycl_flatten_group_local_loop<1, 0, 2>;
+using sycl_flatten_group_local_120_loop =
+    sycl_flatten_group_local_loop<1, 2, 0>;
+using sycl_flatten_group_local_201_loop =
+    sycl_flatten_group_local_loop<2, 0, 1>;
+using sycl_flatten_group_local_210_loop =
+    sycl_flatten_group_local_loop<2, 1, 0>;
 
 template<typename SEGMENT, int DIM0, int DIM1>
 struct LoopExecute<sycl_flatten_group_local_direct<DIM0, DIM1>, SEGMENT>
 {
   template<typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           SEGMENT const& segment,
+                                           BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
     {
-      const int tx = ctx.itm->get_local_id(DIM0);
-      const int ty = ctx.itm->get_local_id(DIM1);
-      const int bx = ctx.itm->get_local_range(DIM0);
-      const int tid = tx + bx*ty;
+      const int tx  = ctx.itm->get_local_id(DIM0);
+      const int ty  = ctx.itm->get_local_id(DIM1);
+      const int bx  = ctx.itm->get_local_range(DIM0);
+      const int tid = tx + bx * ty;
 
       if (tid < len) body(*(segment.begin() + tid));
     }
@@ -509,10 +554,9 @@ template<typename SEGMENT, int DIM0, int DIM1>
 struct LoopExecute<sycl_flatten_group_local_loop<DIM0, DIM1>, SEGMENT>
 {
   template<typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           SEGMENT const& segment,
+                                           BODY const& body)
   {
     const int len = segment.end() - segment.begin();
 
@@ -522,10 +566,10 @@ struct LoopExecute<sycl_flatten_group_local_loop<DIM0, DIM1>, SEGMENT>
     const int bx = ctx.itm->get_local_range(DIM0);
     const int by = ctx.itm->get_local_range(DIM1);
 
-    for(int tid = tx + bx*ty; tid < len; tid += bx*by) {
+    for (int tid = tx + bx * ty; tid < len; tid += bx * by)
+    {
       body(*(segment.begin() + tid));
     }
-
   }
 };
 
@@ -533,10 +577,9 @@ template<typename SEGMENT, int DIM0, int DIM1, int DIM2>
 struct LoopExecute<sycl_flatten_group_local_direct<DIM0, DIM1, DIM2>, SEGMENT>
 {
   template<typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           SEGMENT const& segment,
+                                           BODY const& body)
   {
     const int len = segment.end() - segment.begin();
     {
@@ -546,7 +589,7 @@ struct LoopExecute<sycl_flatten_group_local_direct<DIM0, DIM1, DIM2>, SEGMENT>
       const int bx = ctx.itm->get_local_range(DIM0);
       const int by = ctx.itm->get_local_range(DIM1);
 
-      const int tid = tx + bx*(ty + by*tz);
+      const int tid = tx + bx * (ty + by * tz);
 
       if (tid < len) body(*(segment.begin() + tid));
     }
@@ -557,10 +600,9 @@ template<typename SEGMENT, int DIM0, int DIM1, int DIM2>
 struct LoopExecute<sycl_flatten_group_local_loop<DIM0, DIM1, DIM2>, SEGMENT>
 {
   template<typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           SEGMENT const& segment,
+                                           BODY const& body)
   {
     const int len = segment.end() - segment.begin();
 
@@ -571,30 +613,29 @@ struct LoopExecute<sycl_flatten_group_local_loop<DIM0, DIM1, DIM2>, SEGMENT>
     const int by = ctx.itm->get_local_range(DIM1);
     const int bz = ctx.itm->get_local_range(DIM2);
 
-    for(int tid = tx + bx*(ty + by*tz); tid < len; tid += bx*by*bz) {
+    for (int tid = tx + bx * (ty + by * tz); tid < len; tid += bx * by * bz)
+    {
       body(*(segment.begin() + tid));
     }
-
   }
 };
 
 /*
   SYCL thread loops with block strides
 */
-template <typename SEGMENT, int DIM>
-struct LoopExecute<sycl_local_012_loop<DIM>, SEGMENT> {
-
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+template<typename SEGMENT, int DIM>
+struct LoopExecute<sycl_local_012_loop<DIM>, SEGMENT>
+{
+
+  template<typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           SEGMENT const& segment,
+                                           BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
 
-    for (int tx = ctx.itm->get_local_id(DIM);
-         tx < len;
+    for (int tx = ctx.itm->get_local_id(DIM); tx < len;
          tx += ctx.itm->get_local_range(DIM))
     {
       body(*(segment.begin() + tx));
@@ -605,14 +646,14 @@ struct LoopExecute<sycl_local_012_loop<DIM>, SEGMENT> {
 /*
   SYCL thread direct mappings
 */
-template <typename SEGMENT, int DIM>
-struct LoopExecute<sycl_local_012_direct<DIM>, SEGMENT> {
-
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+template<typename SEGMENT, int DIM>
+struct LoopExecute<sycl_local_012_direct<DIM>, SEGMENT>
+{
+
+  template<typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           SEGMENT const& segment,
+                                           BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
@@ -626,21 +667,21 @@ struct LoopExecute<sycl_local_012_direct<DIM>, SEGMENT> {
 /*
   SYCL block loops with grid strides
 */
-template <typename SEGMENT, int DIM>
-struct LoopExecute<sycl_group_012_loop<DIM>, SEGMENT> {
-
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+template<typename SEGMENT, int DIM>
+struct LoopExecute<sycl_group_012_loop<DIM>, SEGMENT>
+{
+
+  template<typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           SEGMENT const& segment,
+                                           BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
 
-    for (int bx = ctx.itm->get_group(DIM);
-         bx < len;
-         bx += ctx.itm->get_group_range(DIM) ) {
+    for (int bx = ctx.itm->get_group(DIM); bx < len;
+         bx += ctx.itm->get_group_range(DIM))
+    {
       body(*(segment.begin() + bx));
     }
   }
@@ -649,14 +690,14 @@ struct LoopExecute<sycl_group_012_loop<DIM>, SEGMENT> {
 /*
   SYCL block direct mappings
 */
-template <typename SEGMENT, int DIM>
-struct LoopExecute<sycl_group_012_direct<DIM>, SEGMENT> {
-
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+template<typename SEGMENT, int DIM>
+struct LoopExecute<sycl_group_012_direct<DIM>, SEGMENT>
+{
+
+  template<typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           SEGMENT const& segment,
+                                           BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
@@ -670,21 +711,20 @@ struct LoopExecute<sycl_group_012_direct<DIM>, SEGMENT> {
 /*
   SYCL thread loops with block strides + Return Index
 */
-template <typename SEGMENT, int DIM>
-struct LoopICountExecute<sycl_local_012_loop<DIM>, SEGMENT> {
-
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+template<typename SEGMENT, int DIM>
+struct LoopICountExecute<sycl_local_012_loop<DIM>, SEGMENT>
+{
+
+  template<typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           SEGMENT const& segment,
+                                           BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
 
-    for (int tx = ctx.itm->get_local_id(DIM);
-         tx < len;
-         tx += ctx.itm->get_local_range(DIM) )
+    for (int tx = ctx.itm->get_local_id(DIM); tx < len;
+         tx += ctx.itm->get_local_range(DIM))
     {
       body(*(segment.begin() + tx), tx);
     }
@@ -694,14 +734,14 @@ struct LoopICountExecute<sycl_local_012_loop<DIM>, SEGMENT> {
 /*
   SYCL thread direct mappings
 */
-template <typename SEGMENT, int DIM>
-struct LoopICountExecute<sycl_local_012_direct<DIM>, SEGMENT> {
-
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+template<typename SEGMENT, int DIM>
+struct LoopICountExecute<sycl_local_012_direct<DIM>, SEGMENT>
+{
+
+  template<typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           SEGMENT const& segment,
+                                           BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
@@ -715,21 +755,21 @@ struct LoopICountExecute<sycl_local_012_direct<DIM>, SEGMENT> {
 /*
   SYCL block loops with grid strides
 */
-template <typename SEGMENT, int DIM>
-struct LoopICountExecute<sycl_group_012_loop<DIM>, SEGMENT> {
-
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+template<typename SEGMENT, int DIM>
+struct LoopICountExecute<sycl_group_012_loop<DIM>, SEGMENT>
+{
+
+  template<typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           SEGMENT const& segment,
+                                           BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
 
-    for (int bx =  ctx.itm->get_group(DIM);
-         bx < len;
-         bx += ctx.itm->get_group_range(DIM) ) {
+    for (int bx = ctx.itm->get_group(DIM); bx < len;
+         bx += ctx.itm->get_group_range(DIM))
+    {
       body(*(segment.begin() + bx), bx);
     }
   }
@@ -738,14 +778,14 @@ struct LoopICountExecute<sycl_group_012_loop<DIM>, SEGMENT> {
 /*
   SYCL block direct mappings
 */
-template <typename SEGMENT, int DIM>
-struct LoopICountExecute<sycl_group_012_direct<DIM>, SEGMENT> {
-
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment,
-      BODY const &body)
+template<typename SEGMENT, int DIM>
+struct LoopICountExecute<sycl_group_012_direct<DIM>, SEGMENT>
+{
+
+  template<typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           SEGMENT const& segment,
+                                           BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
@@ -757,29 +797,29 @@ struct LoopICountExecute<sycl_group_012_direct<DIM>, SEGMENT> {
 };
 
 // perfectly nested sycl direct policies
-using sycl_group_01_nested_direct = sycl_group_012_direct<0,1>;
-using sycl_group_02_nested_direct = sycl_group_012_direct<0,2>;
-using sycl_group_10_nested_direct = sycl_group_012_direct<1,0>;
-using sycl_group_12_nested_direct = sycl_group_012_direct<1,2>;
-using sycl_group_20_nested_direct = sycl_group_012_direct<2,0>;
-using sycl_group_21_nested_direct = sycl_group_012_direct<2,1>;
-
-using sycl_group_012_nested_direct = sycl_group_012_direct<0,1,2>;
-using sycl_group_021_nested_direct = sycl_group_012_direct<0,2,1>;
-using sycl_group_102_nested_direct = sycl_group_012_direct<1,0,2>;
-using sycl_group_120_nested_direct = sycl_group_012_direct<1,2,0>;
-using sycl_group_201_nested_direct = sycl_group_012_direct<2,0,1>;
-using sycl_group_210_nested_direct = sycl_group_012_direct<2,1,0>;
-
-template <typename SEGMENT, int DIM0, int DIM1>
-struct LoopExecute<sycl_group_012_direct<DIM0, DIM1>, SEGMENT> {
-
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+using sycl_group_01_nested_direct = sycl_group_012_direct<0, 1>;
+using sycl_group_02_nested_direct = sycl_group_012_direct<0, 2>;
+using sycl_group_10_nested_direct = sycl_group_012_direct<1, 0>;
+using sycl_group_12_nested_direct = sycl_group_012_direct<1, 2>;
+using sycl_group_20_nested_direct = sycl_group_012_direct<2, 0>;
+using sycl_group_21_nested_direct = sycl_group_012_direct<2, 1>;
+
+using sycl_group_012_nested_direct = sycl_group_012_direct<0, 1, 2>;
+using sycl_group_021_nested_direct = sycl_group_012_direct<0, 2, 1>;
+using sycl_group_102_nested_direct = sycl_group_012_direct<1, 0, 2>;
+using sycl_group_120_nested_direct = sycl_group_012_direct<1, 2, 0>;
+using sycl_group_201_nested_direct = sycl_group_012_direct<2, 0, 1>;
+using sycl_group_210_nested_direct = sycl_group_012_direct<2, 1, 0>;
+
+template<typename SEGMENT, int DIM0, int DIM1>
+struct LoopExecute<sycl_group_012_direct<DIM0, DIM1>, SEGMENT>
+{
+
+  template<typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           SEGMENT const& segment0,
+                                           SEGMENT const& segment1,
+                                           BODY const& body)
   {
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
@@ -792,16 +832,16 @@ struct LoopExecute<sycl_group_012_direct<DIM0, DIM1>, SEGMENT> {
   }
 };
 
-template <typename SEGMENT, int DIM0, int DIM1, int DIM2>
-struct LoopExecute<sycl_group_012_direct<DIM0, DIM1, DIM2>, SEGMENT> {
+template<typename SEGMENT, int DIM0, int DIM1, int DIM2>
+struct LoopExecute<sycl_group_012_direct<DIM0, DIM1, DIM2>, SEGMENT>
+{
 
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+  template<typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           SEGMENT const& segment0,
+                                           SEGMENT const& segment1,
+                                           SEGMENT const& segment2,
+                                           BODY const& body)
   {
     const int len2 = segment2.end() - segment2.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -811,8 +851,7 @@ struct LoopExecute<sycl_group_012_direct<DIM0, DIM1, DIM2>, SEGMENT> {
       const int ty = ctx.itm->get_group(DIM1);
       const int tz = ctx.itm->get_group(DIM2);
       if (tx < len0 && ty < len1 && tz < len2)
-        body(*(segment0.begin() + tx),
-             *(segment1.begin() + ty),
+        body(*(segment0.begin() + tx), *(segment1.begin() + ty),
              *(segment2.begin() + tz));
     }
   }
@@ -822,38 +861,37 @@ struct LoopExecute<sycl_group_012_direct<DIM0, DIM1, DIM2>, SEGMENT> {
   Perfectly nested sycl direct policies
   Return local index
 */
-template <typename SEGMENT, int DIM0, int DIM1>
-struct LoopICountExecute<sycl_group_012_direct<DIM0, DIM1>, SEGMENT> {
-
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+template<typename SEGMENT, int DIM0, int DIM1>
+struct LoopICountExecute<sycl_group_012_direct<DIM0, DIM1>, SEGMENT>
+{
+
+  template<typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           SEGMENT const& segment0,
+                                           SEGMENT const& segment1,
+                                           BODY const& body)
   {
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
     {
-      const int tx =  ctx.itm->get_group(DIM0);
-      const int ty =  ctx.itm->get_group(DIM1);
+      const int tx = ctx.itm->get_group(DIM0);
+      const int ty = ctx.itm->get_group(DIM1);
       if (tx < len0 && ty < len1)
-        body(*(segment0.begin() + tx), *(segment1.begin() + ty),
-             tx, ty);
+        body(*(segment0.begin() + tx), *(segment1.begin() + ty), tx, ty);
     }
   }
 };
 
-template <typename SEGMENT, int DIM0, int DIM1, int DIM2>
-struct LoopICountExecute<sycl_group_012_direct<DIM0, DIM1, DIM2>, SEGMENT> {
+template<typename SEGMENT, int DIM0, int DIM1, int DIM2>
+struct LoopICountExecute<sycl_group_012_direct<DIM0, DIM1, DIM2>, SEGMENT>
+{
 
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+  template<typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           SEGMENT const& segment0,
+                                           SEGMENT const& segment1,
+                                           SEGMENT const& segment2,
+                                           BODY const& body)
   {
     const int len2 = segment2.end() - segment2.begin();
     const int len1 = segment1.end() - segment1.begin();
@@ -863,48 +901,45 @@ struct LoopICountExecute<sycl_group_012_direct<DIM0, DIM1, DIM2>, SEGMENT> {
       const int ty = ctx.itm->get_group(DIM1);
       const int tz = ctx.itm->get_group(DIM2);
       if (tx < len0 && ty < len1 && tz < len2)
-        body(*(segment0.begin() + tx),
-             *(segment1.begin() + ty),
+        body(*(segment0.begin() + tx), *(segment1.begin() + ty),
              *(segment2.begin() + tz), tx, ty, tz);
     }
   }
 };
 
 // perfectly nested sycl loop policies
-using sycl_group_01_nested_loop = sycl_group_012_loop<0,1>;
-using sycl_group_02_nested_loop = sycl_group_012_loop<0,2>;
-using sycl_group_10_nested_loop = sycl_group_012_loop<1,0>;
-using sycl_group_12_nested_loop = sycl_group_012_loop<1,2>;
-using sycl_group_20_nested_loop = sycl_group_012_loop<2,0>;
-using sycl_group_21_nested_loop = sycl_group_012_loop<2,1>;
-
-using sycl_group_012_nested_loop = sycl_group_012_loop<0,1,2>;
-using sycl_group_021_nested_loop = sycl_group_012_loop<0,2,1>;
-using sycl_group_102_nested_loop = sycl_group_012_loop<1,0,2>;
-using sycl_group_120_nested_loop = sycl_group_012_loop<1,2,0>;
-using sycl_group_201_nested_loop = sycl_group_012_loop<2,0,1>;
-using sycl_group_210_nested_loop = sycl_group_012_loop<2,1,0>;
-
-template <typename SEGMENT, int DIM0, int DIM1>
-struct LoopExecute<sycl_group_012_loop<DIM0, DIM1>, SEGMENT> {
-
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+using sycl_group_01_nested_loop = sycl_group_012_loop<0, 1>;
+using sycl_group_02_nested_loop = sycl_group_012_loop<0, 2>;
+using sycl_group_10_nested_loop = sycl_group_012_loop<1, 0>;
+using sycl_group_12_nested_loop = sycl_group_012_loop<1, 2>;
+using sycl_group_20_nested_loop = sycl_group_012_loop<2, 0>;
+using sycl_group_21_nested_loop = sycl_group_012_loop<2, 1>;
+
+using sycl_group_012_nested_loop = sycl_group_012_loop<0, 1, 2>;
+using sycl_group_021_nested_loop = sycl_group_012_loop<0, 2, 1>;
+using sycl_group_102_nested_loop = sycl_group_012_loop<1, 0, 2>;
+using sycl_group_120_nested_loop = sycl_group_012_loop<1, 2, 0>;
+using sycl_group_201_nested_loop = sycl_group_012_loop<2, 0, 1>;
+using sycl_group_210_nested_loop = sycl_group_012_loop<2, 1, 0>;
+
+template<typename SEGMENT, int DIM0, int DIM1>
+struct LoopExecute<sycl_group_012_loop<DIM0, DIM1>, SEGMENT>
+{
+
+  template<typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           SEGMENT const& segment0,
+                                           SEGMENT const& segment1,
+                                           BODY const& body)
   {
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
     {
 
-      for (int bx = ctx.itm->get_group(DIM0);
-           bx < len0;
+      for (int bx = ctx.itm->get_group(DIM0); bx < len0;
            bx += ctx.itm->get_group_range(DIM0))
       {
-        for (int by = ctx.itm->get_group(DIM1);
-             by < len1;
+        for (int by = ctx.itm->get_group(DIM1); by < len1;
              bx += ctx.itm->get_group_range(DIM1))
         {
           body(*(segment0.begin() + bx), *(segment1.begin() + by));
@@ -914,38 +949,34 @@ struct LoopExecute<sycl_group_012_loop<DIM0, DIM1>, SEGMENT> {
   }
 };
 
-template <typename SEGMENT, int DIM0, int DIM1, int DIM2>
-struct LoopExecute<sycl_group_012_loop<DIM0, DIM1, DIM2>, SEGMENT> {
+template<typename SEGMENT, int DIM0, int DIM1, int DIM2>
+struct LoopExecute<sycl_group_012_loop<DIM0, DIM1, DIM2>, SEGMENT>
+{
 
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+  template<typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           SEGMENT const& segment0,
+                                           SEGMENT const& segment1,
+                                           SEGMENT const& segment2,
+                                           BODY const& body)
   {
     const int len2 = segment2.end() - segment2.begin();
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
 
-    for (int bx = ctx.itm->get_group(DIM0);
-         bx < len0;
+    for (int bx = ctx.itm->get_group(DIM0); bx < len0;
          bx += ctx.itm->get_group_range(DIM0))
     {
 
-      for (int by = ctx.itm->get_group(DIM1);
-           by < len1;
+      for (int by = ctx.itm->get_group(DIM1); by < len1;
            by += ctx.itm->get_group_range(DIM1))
       {
 
-        for (int bz = ctx.itm->get_group(DIM2);
-             bz < len2;
+        for (int bz = ctx.itm->get_group(DIM2); bz < len2;
              bz += ctx.itm->get_group_range(DIM2))
         {
 
-          body(*(segment0.begin() + bx),
-               *(segment1.begin() + by),
+          body(*(segment0.begin() + bx), *(segment1.begin() + by),
                *(segment2.begin() + bz));
         }
       }
@@ -956,26 +987,24 @@ struct LoopExecute<sycl_group_012_loop<DIM0, DIM1, DIM2>, SEGMENT> {
 /*
   perfectly nested sycl loop policies + returns local index
 */
-template <typename SEGMENT, int DIM0, int DIM1>
-struct LoopICountExecute<sycl_group_012_loop<DIM0, DIM1>, SEGMENT> {
-
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      BODY const &body)
+template<typename SEGMENT, int DIM0, int DIM1>
+struct LoopICountExecute<sycl_group_012_loop<DIM0, DIM1>, SEGMENT>
+{
+
+  template<typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           SEGMENT const& segment0,
+                                           SEGMENT const& segment1,
+                                           BODY const& body)
   {
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
     {
 
-      for (int bx = ctx.itm->get_group(DIM0);
-           bx < len0;
+      for (int bx = ctx.itm->get_group(DIM0); bx < len0;
            bx += ctx.itm->get_group_range(DIM0))
       {
-        for (int by = ctx.itm->get_group(DIM0);
-             by < len1;
+        for (int by = ctx.itm->get_group(DIM0); by < len1;
              by += ctx.itm->get_group_range(DIM1))
         {
 
@@ -986,38 +1015,34 @@ struct LoopICountExecute<sycl_group_012_loop<DIM0, DIM1>, SEGMENT> {
   }
 };
 
-template <typename SEGMENT, int DIM0, int DIM1, int DIM2>
-struct LoopICountExecute<sycl_group_012_loop<DIM0, DIM1, DIM2>, SEGMENT> {
+template<typename SEGMENT, int DIM0, int DIM1, int DIM2>
+struct LoopICountExecute<sycl_group_012_loop<DIM0, DIM1, DIM2>, SEGMENT>
+{
 
-  template <typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      SEGMENT const &segment0,
-      SEGMENT const &segment1,
-      SEGMENT const &segment2,
-      BODY const &body)
+  template<typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           SEGMENT const& segment0,
+                                           SEGMENT const& segment1,
+                                           SEGMENT const& segment2,
+                                           BODY const& body)
   {
     const int len2 = segment2.end() - segment2.begin();
     const int len1 = segment1.end() - segment1.begin();
     const int len0 = segment0.end() - segment0.begin();
 
-    for (int bx = ctx.itm->get_group(DIM0);
-         bx < len0;
+    for (int bx = ctx.itm->get_group(DIM0); bx < len0;
          bx += ctx.itm->get_group_range(DIM0))
     {
 
-      for (int by = ctx.itm->get_group(DIM0);
-           by < len1;
+      for (int by = ctx.itm->get_group(DIM0); by < len1;
            by += ctx.itm->get_group_range(DIM0))
       {
 
-        for (int bz =  ctx.itm->get_group(DIM0);
-             bz < len2;
+        for (int bz = ctx.itm->get_group(DIM0); bz < len2;
              bz += ctx.itm->get_group_range(DIM0))
         {
 
-          body(*(segment0.begin() + bx),
-               *(segment1.begin() + by),
+          body(*(segment0.begin() + bx), *(segment1.begin() + by),
                *(segment2.begin() + bz), bx, by, bz);
         }
       }
@@ -1025,21 +1050,20 @@ struct LoopICountExecute<sycl_group_012_loop<DIM0, DIM1, DIM2>, SEGMENT> {
   }
 };
 
-template <typename SEGMENT, int DIM>
-struct TileExecute<sycl_local_012_loop<DIM>, SEGMENT> {
+template<typename SEGMENT, int DIM>
+struct TileExecute<sycl_local_012_loop<DIM>, SEGMENT>
+{
 
-  template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  template<typename TILE_T, typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           TILE_T tile_size,
+                                           SEGMENT const& segment,
+                                           BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
 
-    for (int tx = ctx.itm->get_local_id(DIM) * tile_size;
-         tx < len;
+    for (int tx = ctx.itm->get_local_id(DIM) * tile_size; tx < len;
          tx += ctx.itm->get_local_range(DIM) * tile_size)
     {
       body(segment.slice(tx, tile_size));
@@ -1047,43 +1071,41 @@ struct TileExecute<sycl_local_012_loop<DIM>, SEGMENT> {
   }
 };
 
+template<typename SEGMENT, int DIM>
+struct TileExecute<sycl_local_012_direct<DIM>, SEGMENT>
+{
 
-template <typename SEGMENT, int DIM>
-struct TileExecute<sycl_local_012_direct<DIM>, SEGMENT> {
-
-  template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  template<typename TILE_T, typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           TILE_T tile_size,
+                                           SEGMENT const& segment,
+                                           BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
 
     int tx = ctx.itm->get_local_id(DIM) * tile_size;
-    if(tx < len)
+    if (tx < len)
     {
       body(segment.slice(tx, tile_size));
     }
   }
 };
 
+template<typename SEGMENT, int DIM>
+struct TileExecute<sycl_group_012_loop<DIM>, SEGMENT>
+{
 
-template <typename SEGMENT, int DIM>
-struct TileExecute<sycl_group_012_loop<DIM>, SEGMENT> {
-
-  template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  template<typename TILE_T, typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           TILE_T tile_size,
+                                           SEGMENT const& segment,
+                                           BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
 
-    for (int tx = ctx.itm->get_group(DIM)* tile_size;
+    for (int tx = ctx.itm->get_group(DIM) * tile_size;
 
          tx < len;
 
@@ -1094,111 +1116,108 @@ struct TileExecute<sycl_group_012_loop<DIM>, SEGMENT> {
   }
 };
 
-template <typename SEGMENT, int DIM>
-struct TileExecute<sycl_group_012_direct<DIM>, SEGMENT> {
+template<typename SEGMENT, int DIM>
+struct TileExecute<sycl_group_012_direct<DIM>, SEGMENT>
+{
 
-  template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  template<typename TILE_T, typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           TILE_T tile_size,
+                                           SEGMENT const& segment,
+                                           BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
 
     int tx = ctx.itm->get_group(DIM) * tile_size;
-    if(tx < len){
+    if (tx < len)
+    {
       body(segment.slice(tx, tile_size));
     }
   }
 };
 
-//Tile execute + return index
-template <typename SEGMENT, int DIM>
-struct TileTCountExecute<sycl_local_012_loop<DIM>, SEGMENT> {
+// Tile execute + return index
+template<typename SEGMENT, int DIM>
+struct TileTCountExecute<sycl_local_012_loop<DIM>, SEGMENT>
+{
 
-  template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  template<typename TILE_T, typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           TILE_T tile_size,
+                                           SEGMENT const& segment,
+                                           BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
 
-    for (int tx = ctx.itm->get_local_id(DIM) * tile_size;
-         tx < len;
+    for (int tx = ctx.itm->get_local_id(DIM) * tile_size; tx < len;
          tx += ctx.itm->get_local_range(DIM) * tile_size)
     {
-      body(segment.slice(tx, tile_size), tx/tile_size);
+      body(segment.slice(tx, tile_size), tx / tile_size);
     }
   }
 };
 
+template<typename SEGMENT, int DIM>
+struct TileTCountExecute<sycl_local_012_direct<DIM>, SEGMENT>
+{
 
-template <typename SEGMENT, int DIM>
-struct TileTCountExecute<sycl_local_012_direct<DIM>, SEGMENT> {
-
-  template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  template<typename TILE_T, typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           TILE_T tile_size,
+                                           SEGMENT const& segment,
+                                           BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
 
     int tx = ctx.itm->get_local_id(DIM) * tile_size;
-    if(tx < len)
+    if (tx < len)
     {
-      body(segment.slice(tx, tile_size), tx/tile_size);
+      body(segment.slice(tx, tile_size), tx / tile_size);
     }
   }
 };
 
+template<typename SEGMENT, int DIM>
+struct TileTCountExecute<sycl_group_012_loop<DIM>, SEGMENT>
+{
 
-template <typename SEGMENT, int DIM>
-struct TileTCountExecute<sycl_group_012_loop<DIM>, SEGMENT> {
-
-  template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  template<typename TILE_T, typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           TILE_T tile_size,
+                                           SEGMENT const& segment,
+                                           BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
 
-    for (int bx = ctx.itm->get_group(DIM) * tile_size;
-         bx < len;
+    for (int bx = ctx.itm->get_group(DIM) * tile_size; bx < len;
          bx += ctx.itm->get_group_range(DIM) * tile_size)
     {
-      body(segment.slice(bx, tile_size), bx/tile_size);
+      body(segment.slice(bx, tile_size), bx / tile_size);
     }
   }
 };
 
+template<typename SEGMENT, int DIM>
+struct TileTCountExecute<sycl_group_012_direct<DIM>, SEGMENT>
+{
 
-template <typename SEGMENT, int DIM>
-struct TileTCountExecute<sycl_group_012_direct<DIM>, SEGMENT> {
-
-  template <typename TILE_T, typename BODY>
-  static RAJA_INLINE RAJA_DEVICE void exec(
-      LaunchContext const &ctx,
-      TILE_T tile_size,
-      SEGMENT const &segment,
-      BODY const &body)
+  template<typename TILE_T, typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           TILE_T tile_size,
+                                           SEGMENT const& segment,
+                                           BODY const& body)
   {
 
     const int len = segment.end() - segment.begin();
 
     int bx = ctx.itm->get_group(DIM) * tile_size;
-    if(bx < len){
-      body(segment.slice(bx, tile_size), bx/tile_size);
+    if (bx < len)
+    {
+      body(segment.slice(bx, tile_size), bx / tile_size);
     }
   }
 };
diff --git a/include/RAJA/policy/sycl/params/kernel_name.hpp b/include/RAJA/policy/sycl/params/kernel_name.hpp
index 1f33be19bb..770d3d1cc2 100644
--- a/include/RAJA/policy/sycl/params/kernel_name.hpp
+++ b/include/RAJA/policy/sycl/params/kernel_name.hpp
@@ -3,39 +3,42 @@
 
 #include "RAJA/pattern/params/kernel_name.hpp"
 
-namespace RAJA {
-namespace expt {
-namespace detail {
-
-#if defined(RAJA_ENABLE_SYCL)  
-  
-  // Init
-  template<typename EXEC_POL>
-  camp::concepts::enable_if< type_traits::is_sycl_policy<EXEC_POL> >
-  init(KernelName&)
-  {
-    //TODO: Define kernel naming
-  }
-
-  // Combine
-  template<typename EXEC_POL, typename T>
-  camp::concepts::enable_if< type_traits::is_sycl_policy<EXEC_POL> >
-  SYCL_EXTERNAL
-  combine(KernelName&, T) {}
-
-  // Resolve
-  template<typename EXEC_POL>
-  camp::concepts::enable_if< type_traits::is_sycl_policy<EXEC_POL> >
-  resolve(KernelName&)
-  {
-    //TODO: Define kernel naming
-  }
-
-#endif  
-
-} //  namespace detail
-} //  namespace expt
-} //  namespace RAJA
-
-
-#endif //  NEW_REDUCE_SYCL_REDUCE_HPP
+namespace RAJA
+{
+namespace expt
+{
+namespace detail
+{
+
+#if defined(RAJA_ENABLE_SYCL)
+
+// Init
+template<typename EXEC_POL>
+camp::concepts::enable_if<type_traits::is_sycl_policy<EXEC_POL>> init(
+    KernelName&)
+{
+  // TODO: Define kernel naming
+}
+
+// Combine
+template<typename EXEC_POL, typename T>
+camp::concepts::enable_if<type_traits::is_sycl_policy<EXEC_POL>> SYCL_EXTERNAL
+combine(KernelName&, T)
+{}
+
+// Resolve
+template<typename EXEC_POL>
+camp::concepts::enable_if<type_traits::is_sycl_policy<EXEC_POL>> resolve(
+    KernelName&)
+{
+  // TODO: Define kernel naming
+}
+
+#endif
+
+}  //  namespace detail
+}  //  namespace expt
+}  //  namespace RAJA
+
+
+#endif  //  NEW_REDUCE_SYCL_REDUCE_HPP
diff --git a/include/RAJA/policy/sycl/params/reduce.hpp b/include/RAJA/policy/sycl/params/reduce.hpp
index e2fb7e1a5a..1679b43c3c 100644
--- a/include/RAJA/policy/sycl/params/reduce.hpp
+++ b/include/RAJA/policy/sycl/params/reduce.hpp
@@ -3,37 +3,44 @@
 
 #include "RAJA/pattern/params/reducer.hpp"
 
-namespace RAJA {
-namespace expt {
-namespace detail {
+namespace RAJA
+{
+namespace expt
+{
+namespace detail
+{
 
 #if defined(RAJA_ENABLE_SYCL)
 
-  // Init
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_sycl_policy<EXEC_POL> >
-  init(Reducer<OP, T, VOp>& red) {
-    red.m_valop.val = OP::identity();
-  }
-
-  // Combine
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_sycl_policy<EXEC_POL> >
-  combine(Reducer<OP, T, VOp>& out, const Reducer<OP, T, VOp>& in) {
-    out.m_valop.val = OP{}(out.m_valop.val, in.m_valop.val);
-  }
-
-  // Resolve
-  template<typename EXEC_POL, typename OP, typename T, typename VOp>
-  camp::concepts::enable_if< type_traits::is_sycl_policy<EXEC_POL> >
-  resolve(Reducer<OP, T, VOp>& red) {
-    red.combineTarget(red.m_valop.val);
-  }
+// Init
+template<typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_sycl_policy<EXEC_POL>> init(
+    Reducer<OP, T, VOp>& red)
+{
+  red.m_valop.val = OP::identity();
+}
+
+// Combine
+template<typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_sycl_policy<EXEC_POL>> combine(
+    Reducer<OP, T, VOp>& out,
+    const Reducer<OP, T, VOp>& in)
+{
+  out.m_valop.val = OP {}(out.m_valop.val, in.m_valop.val);
+}
+
+// Resolve
+template<typename EXEC_POL, typename OP, typename T, typename VOp>
+camp::concepts::enable_if<type_traits::is_sycl_policy<EXEC_POL>> resolve(
+    Reducer<OP, T, VOp>& red)
+{
+  red.combineTarget(red.m_valop.val);
+}
 
 #endif
 
-} //  namespace detail
-} //  namespace expt
-} //  namespace RAJA
+}  //  namespace detail
+}  //  namespace expt
+}  //  namespace RAJA
 
-#endif //  NEW_REDUCE_SYCL_REDUCE_HPP
+#endif  //  NEW_REDUCE_SYCL_REDUCE_HPP
diff --git a/include/RAJA/policy/sycl/policy.hpp b/include/RAJA/policy/sycl/policy.hpp
index 0f92fe27e1..13ca35c07c 100644
--- a/include/RAJA/policy/sycl/policy.hpp
+++ b/include/RAJA/policy/sycl/policy.hpp
@@ -35,7 +35,8 @@
 namespace RAJA
 {
 
-struct uint3 {
+struct uint3
+{
   unsigned long x, y, z;
 };
 
@@ -45,13 +46,15 @@ using sycl_dim_3_t = uint3;
 
 namespace detail
 {
-template <bool Async>
-struct get_launch {
+template<bool Async>
+struct get_launch
+{
   static constexpr RAJA::Launch value = RAJA::Launch::async;
 };
 
-template <>
-struct get_launch<false> {
+template<>
+struct get_launch<false>
+{
   static constexpr RAJA::Launch value = RAJA::Launch::sync;
 };
 }  // end namespace detail
@@ -68,32 +71,33 @@ namespace sycl
 //
 //////////////////////////////////////////////////////////////////////
 
-template <size_t BLOCK_SIZE, bool Async = false>
+template<size_t BLOCK_SIZE, bool Async = false>
 struct sycl_exec : public RAJA::make_policy_pattern_launch_platform_t<
                        RAJA::Policy::sycl,
                        RAJA::Pattern::forall,
                        detail::get_launch<Async>::value,
-                       RAJA::Platform::sycl> {
-};
+                       RAJA::Platform::sycl>
+{};
 
-template <bool Async, int num_threads = 0>
+template<bool Async, int num_threads = 0>
 struct sycl_launch_t : public RAJA::make_policy_pattern_launch_platform_t<
-                       RAJA::Policy::sycl,
-                       RAJA::Pattern::region,
-                       detail::get_launch<Async>::value,
-                       RAJA::Platform::sycl> {
-};
+                           RAJA::Policy::sycl,
+                           RAJA::Pattern::region,
+                           detail::get_launch<Async>::value,
+                           RAJA::Platform::sycl>
+{};
 
 struct sycl_reduce
-    : make_policy_pattern_t<RAJA::Policy::sycl, RAJA::Pattern::reduce> {
-};
+    : make_policy_pattern_t<RAJA::Policy::sycl, RAJA::Pattern::reduce>
+{};
 
 //
 // Sycl atomic policy for using sycl atomics on the device and
 // the provided Policy on the host
 //
 template<typename host_policy>
-struct sycl_atomic_explicit{};
+struct sycl_atomic_explicit
+{};
 
 //
 // Default sycl atomic policy uses sycl atomics on the device and non-atomics
@@ -102,10 +106,12 @@ struct sycl_atomic_explicit{};
 using sycl_atomic = sycl_atomic_explicit<seq_atomic>;
 
 template<typename Mask>
-struct sycl_local_masked_direct {};
+struct sycl_local_masked_direct
+{};
 
 template<typename Mask>
-struct sycl_local_masked_loop {};
+struct sycl_local_masked_loop
+{};
 
 }  // namespace sycl
 }  // namespace policy
@@ -120,13 +126,14 @@ using policy::sycl::sycl_local_masked_direct;
 using policy::sycl::sycl_local_masked_loop;
 
 using policy::sycl::sycl_launch_t;
-  
+
 /*!
  * Maps indices to SYCL global id
  * Optional WORK_GROUP_SIZE to
  */
 template<int dim, int WORK_GROUP_SIZE = 1>
-struct sycl_global_012{};
+struct sycl_global_012
+{};
 
 template<int WORK_GROUP_SIZE>
 using sycl_global_0 = sycl_global_012<0, WORK_GROUP_SIZE>;
@@ -139,8 +146,9 @@ using sycl_global_2 = sycl_global_012<2, WORK_GROUP_SIZE>;
  * Maps segment indices to SYCL group ids.
  * Loops to allow for any value
  */
-template<int ... dim>
-struct sycl_group_012_loop{};
+template<int... dim>
+struct sycl_group_012_loop
+{};
 
 using sycl_group_0_loop = sycl_group_012_loop<0>;
 using sycl_group_1_loop = sycl_group_012_loop<1>;
@@ -150,8 +158,9 @@ using sycl_group_2_loop = sycl_group_012_loop<2>;
  * Maps segment indices to SYCL local ids.
  * Loops to allow for any value
  */
-template<int ... dim>
-struct sycl_local_012_loop{};
+template<int... dim>
+struct sycl_local_012_loop
+{};
 
 using sycl_local_0_loop = sycl_local_012_loop<0>;
 using sycl_local_1_loop = sycl_local_012_loop<1>;
@@ -160,8 +169,9 @@ using sycl_local_2_loop = sycl_local_012_loop<2>;
 /*!
  * Maps segment indices to SYCL group ids.
  */
-template<int ... dim>
-struct sycl_group_012_direct{};
+template<int... dim>
+struct sycl_group_012_direct
+{};
 
 using sycl_group_0_direct = sycl_group_012_direct<0>;
 using sycl_group_1_direct = sycl_group_012_direct<1>;
@@ -170,102 +180,86 @@ using sycl_group_2_direct = sycl_group_012_direct<2>;
 /*!
  * Maps segment indices to SYCL local ids.
  */
-template<int ... dim>
-struct sycl_local_012_direct{};
+template<int... dim>
+struct sycl_local_012_direct
+{};
 
 using sycl_local_0_direct = sycl_local_012_direct<0>;
 using sycl_local_1_direct = sycl_local_012_direct<1>;
 using sycl_local_2_direct = sycl_local_012_direct<2>;
 
-
-namespace internal{
+namespace internal
+{
 
 template<int dim>
 struct SyclDimHelper;
 
 template<>
-struct SyclDimHelper<0>{
+struct SyclDimHelper<0>
+{
 
   template<typename dim_t>
-  inline
-  static
-  constexpr
-  auto get(dim_t const &d) ->
-    decltype(d.x)
+  inline static constexpr auto get(dim_t const& d) -> decltype(d.x)
   {
     return d.x;
   }
 
   template<typename dim_t>
-  inline
-  static
-  void set(dim_t &d, int value)
+  inline static void set(dim_t& d, int value)
   {
     d.x = value;
   }
 };
 
 template<>
-struct SyclDimHelper<1>{
+struct SyclDimHelper<1>
+{
 
   template<typename dim_t>
-  inline
-  static
-  constexpr
-  auto get(dim_t const &d) ->
-    decltype(d.x)
+  inline static constexpr auto get(dim_t const& d) -> decltype(d.x)
   {
     return d.y;
   }
 
   template<typename dim_t>
-  inline
-  static
-  void set(dim_t &d, int value)
+  inline static void set(dim_t& d, int value)
   {
     d.y = value;
   }
 };
 
 template<>
-struct SyclDimHelper<2>{
+struct SyclDimHelper<2>
+{
 
   template<typename dim_t>
-  inline
-  static
-  constexpr
-  auto get(dim_t const &d) ->
-    decltype(d.x)
+  inline static constexpr auto get(dim_t const& d) -> decltype(d.x)
   {
     return d.z;
   }
 
   template<typename dim_t>
-  inline
-  static
-  void set(dim_t &d, int value)
+  inline static void set(dim_t& d, int value)
   {
     d.z = value;
   }
 };
 
 template<int dim, typename dim_t>
-constexpr
-auto get_sycl_dim(dim_t const &d) ->
-  decltype(d.x)
+constexpr auto get_sycl_dim(dim_t const& d) -> decltype(d.x)
 {
   return SyclDimHelper<dim>::get(d);
 }
 
 template<int dim, typename dim_t>
-void set_sycl_dim(dim_t &d, int value)
+void set_sycl_dim(dim_t& d, int value)
 {
   return SyclDimHelper<dim>::set(d, value);
 }
-} // namespace internal
+}  // namespace internal
 
 }  // namespace RAJA
 
-#endif // RAJA_ENABLE_SYCL
+#endif  // RAJA_ENABLE_SYCL
 
 #endif
diff --git a/include/RAJA/policy/sycl/reduce.hpp b/include/RAJA/policy/sycl/reduce.hpp
index 49d89b3cd2..8074d71d5d 100644
--- a/include/RAJA/policy/sycl/reduce.hpp
+++ b/include/RAJA/policy/sycl/reduce.hpp
@@ -4,7 +4,7 @@
  * \file
  *
  * \brief   Header file for SYCL reduction stucts/classes.
- *          
+ *
  ******************************************************************************
  */
 
@@ -37,32 +37,36 @@ namespace RAJA
 namespace sycl
 {
 
-template <typename T, typename I>
-struct minloc 
+template<typename T, typename I>
+struct minloc
 {
   static constexpr T identity = T(::RAJA::operators::limits<T>::max());
-  RAJA_HOST_DEVICE RAJA_INLINE void operator()(T &val,
-                                               I &loc,
+
+  RAJA_HOST_DEVICE RAJA_INLINE void operator()(T& val,
+                                               I& loc,
                                                const T v,
                                                const I l)
   {
-    if (v < val) {
+    if (v < val)
+    {
       loc = l;
       val = v;
     }
   }
 };
 
-template <typename T, typename I>
-struct maxloc 
+template<typename T, typename I>
+struct maxloc
 {
   static constexpr T identity = T(::RAJA::operators::limits<T>::min());
-  RAJA_HOST_DEVICE RAJA_INLINE void operator()(T &val,
-                                               I &loc,
+
+  RAJA_HOST_DEVICE RAJA_INLINE void operator()(T& val,
+                                               I& loc,
                                                const T v,
                                                const I l)
   {
-    if (v > val) {
+    if (v > val)
+    {
       loc = l;
       val = v;
     }
@@ -74,28 +78,29 @@ struct maxloc
 static int MaxNumTeams = 1;
 
 //! Information necessary for SYCL offload to be considered
-struct Offload_Info 
+struct Offload_Info
 {
-  int hostID{1};
-  int deviceID{2};
-  bool isMapped{false};
+  int hostID {1};
+  int deviceID {2};
+  bool isMapped {false};
 
   Offload_Info() = default;
 
-  Offload_Info(const Offload_Info &other)
-      : hostID{other.hostID}, deviceID{other.deviceID}, isMapped{other.isMapped}
-  {
-  }
+  Offload_Info(const Offload_Info& other)
+      : hostID {other.hostID},
+        deviceID {other.deviceID},
+        isMapped {other.isMapped}
+  {}
 };
 
 //! Reduction data for SYCL Offload -- stores value, host pointer, and device
 //! pointer
-template <typename T>
+template<typename T>
 struct Reduce_Data
 {
   mutable T value;
-  T *device;
-  T *host;
+  T* device;
+  T* host;
 
   //! disallow default constructor
   Reduce_Data() = delete;
@@ -104,20 +109,24 @@ struct Reduce_Data
    *
    *  allocates data on the host and device and initializes values to default
    */
-  Reduce_Data(T initValue, T identityValue, Offload_Info &info)
+  Reduce_Data(T initValue, T identityValue, Offload_Info& info)
       : value(initValue)
   {
     cl::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();
 
 
-    device = reinterpret_cast<T *>(cl::sycl::malloc_device(sycl::MaxNumTeams * sizeof(T), *(q)));
-    host = reinterpret_cast<T *>(cl::sycl::malloc_host(sycl::MaxNumTeams * sizeof(T), *(q)));
+    device = reinterpret_cast<T*>(
+        cl::sycl::malloc_device(sycl::MaxNumTeams * sizeof(T), *(q)));
+    host = reinterpret_cast<T*>(
+        cl::sycl::malloc_host(sycl::MaxNumTeams * sizeof(T), *(q)));
 
-    if (!host) {
+    if (!host)
+    {
       printf("Unable to allocate space on host\n");
       exit(1);
     }
-    if (!device) {
+    if (!device)
+    {
       printf("Unable to allocate space on device\n");
       exit(1);
     }
@@ -125,62 +134,63 @@ struct Reduce_Data
     hostToDevice(info);
   }
 
-  void reset(T initValue)
-  {
-    value = initValue;
-  }
+  void reset(T initValue) { value = initValue; }
 
   //! default copy constructor for POD
-  Reduce_Data(const Reduce_Data &) = default;
+  Reduce_Data(const Reduce_Data&) = default;
 
   //! transfers from the host to the device -- exit() is called upon failure
-  RAJA_INLINE void hostToDevice(Offload_Info &info)
+  RAJA_INLINE void hostToDevice(Offload_Info& info)
   {
     cl::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();
 
-    if(!q) {
+    if (!q)
+    {
       camp::resources::Resource res = camp::resources::Sycl();
       q = res.get<camp::resources::Sycl>().get_queue();
     }
 
     // precondition: host and device are valid pointers
-    auto e = q->memcpy(reinterpret_cast<void *>(device),
-                       reinterpret_cast<void *>(host),
-                       sycl::MaxNumTeams * sizeof(T));
+    auto e =
+        q->memcpy(reinterpret_cast<void*>(device),
+                  reinterpret_cast<void*>(host), sycl::MaxNumTeams * sizeof(T));
 
     e.wait();
   }
 
   //! transfers from the device to the host -- exit() is called upon failure
-  RAJA_INLINE void deviceToHost(Offload_Info &info)
+  RAJA_INLINE void deviceToHost(Offload_Info& info)
   {
     cl::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();
 
-    if(!q) {
+    if (!q)
+    {
       camp::resources::Resource res = camp::resources::Sycl();
       q = res.get<camp::resources::Sycl>().get_queue();
-    } 
+    }
 
     // precondition: host and device are valid pointers
-    auto e = q->memcpy(reinterpret_cast<void *>(host),
-                       reinterpret_cast<void *>(device),
+    auto e = q->memcpy(reinterpret_cast<void*>(host),
+                       reinterpret_cast<void*>(device),
                        sycl::MaxNumTeams * sizeof(T));
- 
+
     e.wait();
   }
 
   //! frees all data from the offload information passed
-  RAJA_INLINE void cleanup(Offload_Info &info)
+  RAJA_INLINE void cleanup(Offload_Info& info)
   {
     cl::sycl::queue* q = ::camp::resources::Sycl::get_default().get_queue();
 
-    if (device) {
-      cl::sycl::free(reinterpret_cast<void *>(device), *q);
+    if (device)
+    {
+      cl::sycl::free(reinterpret_cast<void*>(device), *q);
       device = nullptr;
     }
-    if (host) {
-      cl::sycl::free(reinterpret_cast<void *>(host), *q);
-      //delete[] host;
+    if (host)
+    {
+      cl::sycl::free(reinterpret_cast<void*>(host), *q);
+      // delete[] host;
       host = nullptr;
     }
   }
@@ -190,79 +200,87 @@ struct Reduce_Data
 
 //! SYCL Target Reduction entity -- generalize on # of teams, reduction, and
 //! type
-template <typename Reducer, typename T>
-struct TargetReduce 
+template<typename Reducer, typename T>
+struct TargetReduce
 {
-  TargetReduce() = delete;
-  TargetReduce(const TargetReduce &) = default;
+  TargetReduce()                    = delete;
+  TargetReduce(const TargetReduce&) = default;
 
   explicit TargetReduce(T init_val)
       : info(),
         val(Reducer::identity(), Reducer::identity(), info),
         initVal(init_val),
         finalVal(Reducer::identity())
-  {
-  }
+  {}
 
   void reset(T init_val_, T identity_ = Reducer::identity())
   {
     val.cleanup(info);
-    val = sycl::Reduce_Data<T>(identity_, identity_, info);
+    val           = sycl::Reduce_Data<T>(identity_, identity_, info);
     info.isMapped = false;
-    initVal = init_val_;
-    finalVal = identity_;
+    initVal       = init_val_;
+    finalVal      = identity_;
   }
 
   //! apply reduction on device upon destruction
-  ~TargetReduce()
-  {
-  }
+  ~TargetReduce() {}
 
   //! map result value back to host if not done already; return aggregate value
   operator T()
   {
-    if (!info.isMapped) {
+    if (!info.isMapped)
+    {
       val.deviceToHost(info);
-      for (int i =0; i < sycl::MaxNumTeams; ++i) {
-        Reducer{}(val.value, val.host[i]);
+      for (int i = 0; i < sycl::MaxNumTeams; ++i)
+      {
+        Reducer {}(val.value, val.host[i]);
       }
-//      val.cleanup(info);
+      //      val.cleanup(info);
       info.isMapped = true;
     }
     finalVal = Reducer::identity();
-    Reducer{}(finalVal, initVal);
-    Reducer{}(finalVal, val.value);
+    Reducer {}(finalVal, initVal);
+    Reducer {}(finalVal, val.value);
     T returnVal = finalVal;
     reset(finalVal);
     return returnVal;
   }
+
   //! alias for operator T()
   T get() { return operator T(); }
 
   //! apply reduction
-  TargetReduce &reduce(T rhsVal)
+  TargetReduce& reduce(T rhsVal)
   {
 #ifdef __SYCL_DEVICE_ONLY__
-    auto i = 0; //__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
-    auto atm = ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel, cl::sycl::memory_scope::device, cl::sycl::access::address_space::global_space>(val.device[i]);
-    Reducer{}(atm, rhsVal);
+    auto i = 0;  //__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
+    auto atm =
+        ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel,
+                           cl::sycl::memory_scope::device,
+                           cl::sycl::access::address_space::global_space>(
+            val.device[i]);
+    Reducer {}(atm, rhsVal);
     return *this;
 #else
-    Reducer{}(val.value, rhsVal);
+    Reducer {}(val.value, rhsVal);
     return *this;
 #endif
   }
 
   //! apply reduction (const version) -- still reduces internal values
-  const TargetReduce &reduce(T rhsVal) const
+  const TargetReduce& reduce(T rhsVal) const
   {
 #ifdef __SYCL_DEVICE_ONLY__
-    auto i = 0; //__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
-    auto atm = ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel, cl::sycl::memory_scope::device, cl::sycl::access::address_space::global_space>(val.device[i]);
-    Reducer{}(atm, rhsVal);  
+    auto i = 0;  //__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
+    auto atm =
+        ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel,
+                           cl::sycl::memory_scope::device,
+                           cl::sycl::access::address_space::global_space>(
+            val.device[i]);
+    Reducer {}(atm, rhsVal);
     return *this;
 #else
-    Reducer{}(val.value, rhsVal);
+    Reducer {}(val.value, rhsVal);
     return *this;
 #endif
   }
@@ -280,14 +298,18 @@ struct TargetReduce
 
 //! SYCL Target Reduction Location entity -- generalize on # of teams,
 //! reduction, and type
-template <typename Reducer, typename T, typename IndexType>
-struct TargetReduceLoc 
+template<typename Reducer, typename T, typename IndexType>
+struct TargetReduceLoc
 {
-  TargetReduceLoc() = delete;
-  TargetReduceLoc(const TargetReduceLoc &) = default;
-  explicit TargetReduceLoc(T init_val, IndexType init_loc,
-                           T identity_val_ = Reducer::identity,
-                           IndexType identity_loc_ = RAJA::reduce::detail::DefaultLoc<IndexType>().value())
+  TargetReduceLoc()                       = delete;
+  TargetReduceLoc(const TargetReduceLoc&) = default;
+
+  explicit TargetReduceLoc(
+      T init_val,
+      IndexType init_loc,
+      T identity_val_ = Reducer::identity,
+      IndexType identity_loc_ =
+          RAJA::reduce::detail::DefaultLoc<IndexType>().value())
       : info(),
         val(identity_val_, identity_val_, info),
         loc(identity_loc_, identity_loc_, info),
@@ -295,54 +317,55 @@ struct TargetReduceLoc
         finalVal(identity_val_),
         initLoc(init_loc),
         finalLoc(identity_loc_)
-  {
-  }
+  {}
 
-  void reset(T init_val_, IndexType init_loc_,
+  void reset(T init_val_,
+             IndexType init_loc_,
              T identity_val_ = Reducer::identity,
-             IndexType identity_loc_ = RAJA::reduce::detail::DefaultLoc<IndexType>().value())
+             IndexType identity_loc_ =
+                 RAJA::reduce::detail::DefaultLoc<IndexType>().value())
   {
     val.cleanup(info);
     val = sycl::Reduce_Data<T>(identity_val_, identity_val_, info);
     loc.cleanup(info);
     loc = sycl::Reduce_Data<IndexType>(identity_loc_, identity_loc_, info);
     info.isMapped = false;
-    initVal = init_val_;
-    finalVal = identity_val_;
-    initLoc = init_loc_;
-    finalLoc = identity_loc_;
+    initVal       = init_val_;
+    finalVal      = identity_val_;
+    initLoc       = init_loc_;
+    finalLoc      = identity_loc_;
   }
 
   //! apply reduction on device upon destruction
-  ~TargetReduceLoc()
-  {
-  }
+  ~TargetReduceLoc() {}
 
   //! map result value back to host if not done already; return aggregate value
   operator T()
   {
-    if (!info.isMapped) {
+    if (!info.isMapped)
+    {
       val.deviceToHost(info);
       loc.deviceToHost(info);
-      
-      for (int i = 0; i < sycl::MaxNumTeams; ++i) {
-        Reducer{}(val.value, loc.value, val.host[i], loc.host[i]);
+
+      for (int i = 0; i < sycl::MaxNumTeams; ++i)
+      {
+        Reducer {}(val.value, loc.value, val.host[i], loc.host[i]);
       }
       info.isMapped = true;
     }
     finalVal = Reducer::identity;
     finalLoc = IndexType(RAJA::reduce::detail::DefaultLoc<IndexType>().value());
-    Reducer{}(finalVal, finalLoc, initVal, initLoc);
-    Reducer{}(finalVal, finalLoc, val.value, loc.value);
+    Reducer {}(finalVal, finalLoc, initVal, initLoc);
+    Reducer {}(finalVal, finalLoc, val.value, loc.value);
     returnVal = finalVal;
     returnLoc = finalLoc;
     reset(finalVal, finalLoc);
     return returnVal;
   }
+
   //! alias for operator T()
   T get() { return operator T(); }
 
-
   //! map result value back to host if not done already; return aggregate
   //! location
   IndexType getLoc()
@@ -353,24 +376,26 @@ struct TargetReduceLoc
   }
 
   //! apply reduction
-  TargetReduceLoc &reduce(T rhsVal, IndexType rhsLoc)
+  TargetReduceLoc& reduce(T rhsVal, IndexType rhsLoc)
   {
 #ifdef __SYCL_DEVICE_ONLY__
-    auto i = 0; //__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
-    cl::sycl::atomic_fence(cl::sycl::memory_order_acquire, cl::sycl::memory_scope::device);
-    Reducer{}(val.device[i], loc.device[i], rhsVal, rhsLoc);
-    cl::sycl::atomic_fence(cl::sycl::memory_order_release, cl::sycl::memory_scope::device);
+    auto i = 0;  //__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
+    cl::sycl::atomic_fence(cl::sycl::memory_order_acquire,
+                           cl::sycl::memory_scope::device);
+    Reducer {}(val.device[i], loc.device[i], rhsVal, rhsLoc);
+    cl::sycl::atomic_fence(cl::sycl::memory_order_release,
+                           cl::sycl::memory_scope::device);
     return *this;
 #else
-    Reducer{}(val.value, loc.value, rhsVal, rhsLoc);
+    Reducer {}(val.value, loc.value, rhsVal, rhsLoc);
     return *this;
 #endif
   }
 
   //! apply reduction (const version) -- still reduces internal values
-  const TargetReduceLoc &reduce(T rhsVal, IndexType rhsLoc) const
+  const TargetReduceLoc& reduce(T rhsVal, IndexType rhsLoc) const
   {
-    Reducer{}(val.value, loc.value, rhsVal, rhsLoc);
+    Reducer {}(val.value, loc.value, rhsVal, rhsLoc);
     return *this;
   }
 
@@ -382,7 +407,7 @@ struct TargetReduceLoc
   //! storage for offload information
   sycl::Offload_Info info;
   //! storage for reduction data for value
-//  sycl::Reduce_Data<T> val;
+  //  sycl::Reduce_Data<T> val;
   //! storage for redcution data for location
   T initVal;
   T finalVal;
@@ -392,31 +417,32 @@ struct TargetReduceLoc
   IndexType returnLoc;
 };
 
-
 //! specialization of ReduceSum for omp_target_reduce
-template <typename T>
-class ReduceSum<sycl_reduce, T>
-    : public TargetReduce<RAJA::reduce::sum<T>, T>
+template<typename T>
+class ReduceSum<sycl_reduce, T> : public TargetReduce<RAJA::reduce::sum<T>, T>
 {
 public:
-
-  using self = ReduceSum<sycl_reduce, T>;
+  using self   = ReduceSum<sycl_reduce, T>;
   using parent = TargetReduce<RAJA::reduce::sum<T>, T>;
   using parent::parent;
 
   //! enable operator+= for ReduceSum -- alias for reduce()
-  self &operator+=(T rhsVal)
+  self& operator+=(T rhsVal)
   {
     parent::reduce(rhsVal);
     return *this;
   }
 
   //! enable operator+= for ReduceSum -- alias for reduce()
-  const self &operator+=(T rhsVal) const
+  const self& operator+=(T rhsVal) const
   {
 #ifdef __SYCL_DEVICE_ONLY__
-    auto i = 0;//__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
-    auto atm = ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel, cl::sycl::memory_scope::device, cl::sycl::access::address_space::global_space>(parent::val.device[i]);
+    auto i = 0;  //__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
+    auto atm =
+        ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel,
+                           cl::sycl::memory_scope::device,
+                           cl::sycl::access::address_space::global_space>(
+            parent::val.device[i]);
     atm.fetch_add(rhsVal);
     return *this;
 #else
@@ -427,22 +453,25 @@ class ReduceSum<sycl_reduce, T>
 };
 
 //! specialization of ReduceBitOr for sycl_reduce
-template <typename T>
+template<typename T>
 class ReduceBitOr<sycl_reduce, T>
     : public TargetReduce<RAJA::reduce::or_bit<T>, T>
 {
 public:
-
-  using self = ReduceBitOr<sycl_reduce, T>;
+  using self   = ReduceBitOr<sycl_reduce, T>;
   using parent = TargetReduce<RAJA::reduce::or_bit<T>, T>;
   using parent::parent;
 
   //! enable operator|= for ReduceBitOr -- alias for reduce()
-  self &operator|=(T rhsVal)
+  self& operator|=(T rhsVal)
   {
 #ifdef __SYCL_DEVICE_ONLY__
-    auto i = 0;//__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
-    auto atm = ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel, cl::sycl::memory_scope::device, cl::sycl::access::address_space::global_space>(parent::val.device[i]);
+    auto i = 0;  //__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
+    auto atm =
+        ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel,
+                           cl::sycl::memory_scope::device,
+                           cl::sycl::access::address_space::global_space>(
+            parent::val.device[i]);
     atm |= rhsVal;
     return *this;
 #else
@@ -452,11 +481,15 @@ class ReduceBitOr<sycl_reduce, T>
   }
 
   //! enable operator|= for ReduceBitOr -- alias for reduce()
-  const self &operator|=(T rhsVal) const
+  const self& operator|=(T rhsVal) const
   {
 #ifdef __SYCL_DEVICE_ONLY__
-    auto i = 0;//__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
-    auto atm = ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel, cl::sycl::memory_scope::device, cl::sycl::access::address_space::global_space>(parent::val.device[i]);
+    auto i = 0;  //__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
+    auto atm =
+        ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel,
+                           cl::sycl::memory_scope::device,
+                           cl::sycl::access::address_space::global_space>(
+            parent::val.device[i]);
     atm |= rhsVal;
     return *this;
 #else
@@ -467,22 +500,25 @@ class ReduceBitOr<sycl_reduce, T>
 };
 
 //! specialization of ReduceBitAnd for sycl_reduce
-template <typename T>
+template<typename T>
 class ReduceBitAnd<sycl_reduce, T>
     : public TargetReduce<RAJA::reduce::and_bit<T>, T>
 {
 public:
-
-  using self = ReduceBitAnd<sycl_reduce, T>;
+  using self   = ReduceBitAnd<sycl_reduce, T>;
   using parent = TargetReduce<RAJA::reduce::and_bit<T>, T>;
   using parent::parent;
 
   //! enable operator&= for ReduceBitAnd -- alias for reduce()
-  self &operator&=(T rhsVal)
+  self& operator&=(T rhsVal)
   {
 #ifdef __SYCL_DEVICE_ONLY__
-    auto i = 0;//__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
-    auto atm = ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel, cl::sycl::memory_scope::device, cl::sycl::access::address_space::global_space>(parent::val.device[i]);
+    auto i = 0;  //__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
+    auto atm =
+        ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel,
+                           cl::sycl::memory_scope::device,
+                           cl::sycl::access::address_space::global_space>(
+            parent::val.device[i]);
     atm &= rhsVal;
     return *this;
 #else
@@ -492,11 +528,15 @@ class ReduceBitAnd<sycl_reduce, T>
   }
 
   //! enable operator&= for ReduceBitAnd -- alias for reduce()
-  const self &operator&=(T rhsVal) const
+  const self& operator&=(T rhsVal) const
   {
 #ifdef __SYCL_DEVICE_ONLY__
-    auto i = 0;//__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
-    auto atm = ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel, cl::sycl::memory_scope::device, cl::sycl::access::address_space::global_space>(parent::val.device[i]);
+    auto i = 0;  //__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
+    auto atm =
+        ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel,
+                           cl::sycl::memory_scope::device,
+                           cl::sycl::access::address_space::global_space>(
+            parent::val.device[i]);
     atm &= rhsVal;
     return *this;
 #else
@@ -506,24 +546,25 @@ class ReduceBitAnd<sycl_reduce, T>
   }
 };
 
-
 //! specialization of ReduceMin for omp_target_reduce
-template <typename T>
-class ReduceMin<sycl_reduce, T>
-    : public TargetReduce<RAJA::reduce::min<T>, T>
+template<typename T>
+class ReduceMin<sycl_reduce, T> : public TargetReduce<RAJA::reduce::min<T>, T>
 {
 public:
-
-  using self = ReduceMin<sycl_reduce, T>;
+  using self   = ReduceMin<sycl_reduce, T>;
   using parent = TargetReduce<RAJA::reduce::min<T>, T>;
   using parent::parent;
 
   //! enable min() for ReduceMin -- alias for reduce()
-  self &min(T rhsVal)
+  self& min(T rhsVal)
   {
 #ifdef __SYCL_DEVICE_ONLY__
-    auto i = 0;//__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
-    auto atm = ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel, cl::sycl::memory_scope::device, cl::sycl::access::address_space::global_space>(parent::val.device[i]);
+    auto i = 0;  //__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
+    auto atm =
+        ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel,
+                           cl::sycl::memory_scope::device,
+                           cl::sycl::access::address_space::global_space>(
+            parent::val.device[i]);
     atm.fetch_min(rhsVal);
     return *this;
 #else
@@ -533,11 +574,15 @@ class ReduceMin<sycl_reduce, T>
   }
 
   //! enable min() for ReduceMin -- alias for reduce()
-  const self &min(T rhsVal) const
+  const self& min(T rhsVal) const
   {
 #ifdef __SYCL_DEVICE_ONLY__
-    auto i = 0;//__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
-    auto atm = ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel, cl::sycl::memory_scope::device, cl::sycl::access::address_space::global_space>(parent::val.device[i]);
+    auto i = 0;  //__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
+    auto atm =
+        ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel,
+                           cl::sycl::memory_scope::device,
+                           cl::sycl::access::address_space::global_space>(
+            parent::val.device[i]);
     atm.fetch_min(rhsVal);
     return *this;
 #else
@@ -547,24 +592,25 @@ class ReduceMin<sycl_reduce, T>
   }
 };
 
-
 //! specialization of ReduceMax for omp_target_reduce
-template <typename T>
-class ReduceMax<sycl_reduce, T>
-    : public TargetReduce<RAJA::reduce::max<T>, T>
+template<typename T>
+class ReduceMax<sycl_reduce, T> : public TargetReduce<RAJA::reduce::max<T>, T>
 {
 public:
-
-  using self = ReduceMax<sycl_reduce, T>;
+  using self   = ReduceMax<sycl_reduce, T>;
   using parent = TargetReduce<RAJA::reduce::max<T>, T>;
   using parent::parent;
 
   //! enable max() for ReduceMax -- alias for reduce()
-  self &max(T rhsVal)
+  self& max(T rhsVal)
   {
 #ifdef __SYCL_DEVICE_ONLY__
-    auto i = 0;//__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
-    auto atm = ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel, cl::sycl::memory_scope::device, cl::sycl::access::address_space::global_space>(parent::val.device[i]);
+    auto i = 0;  //__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
+    auto atm =
+        ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel,
+                           cl::sycl::memory_scope::device,
+                           cl::sycl::access::address_space::global_space>(
+            parent::val.device[i]);
     atm.fetch_max(rhsVal);
     return *this;
 #else
@@ -574,11 +620,15 @@ class ReduceMax<sycl_reduce, T>
   }
 
   //! enable max() for ReduceMax -- alias for reduce()
-  const self &max(T rhsVal) const
+  const self& max(T rhsVal) const
   {
 #ifdef __SYCL_DEVICE_ONLY__
-    auto i = 0;//__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
-    auto atm = ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel, cl::sycl::memory_scope::device, cl::sycl::access::address_space::global_space>(parent::val.device[i]);
+    auto i = 0;  //__spirv::initLocalInvocationId<1, cl::sycl::id<1>>()[0];
+    auto atm =
+        ::sycl::atomic_ref<T, cl::sycl::memory_order_acq_rel,
+                           cl::sycl::memory_scope::device,
+                           cl::sycl::access::address_space::global_space>(
+            parent::val.device[i]);
     atm.fetch_max(rhsVal);
     return *this;
 #else
diff --git a/include/RAJA/policy/tensor/arch.hpp b/include/RAJA/policy/tensor/arch.hpp
index 771adea64f..508c62dfb5 100644
--- a/include/RAJA/policy/tensor/arch.hpp
+++ b/include/RAJA/policy/tensor/arch.hpp
@@ -23,26 +23,28 @@
 namespace RAJA
 {
 
-namespace internal {
+namespace internal
+{
 
 namespace expt
 {
 
 
-  /*!
-   * Provides architectural details for a given architecture and data type.
-   */
-  template<typename REGISTER_POLICY, typename T>
-  struct RegisterTraits;
-  /*
-   * using element_type = T;
-   * using register_policy = REGISTER_POLICY;
-   * static constexpr camp::idx s_num_bits = X;
-   * static constexpr camp::idx s_num_elem = Y;
-   *
-   */
-} //namespace expt
-} //namespace internal
+/*!
+ * Provides architectural details for a given architecture and data type.
+ */
+template<typename REGISTER_POLICY, typename T>
+struct RegisterTraits;
+/*
+ * using element_type = T;
+ * using register_policy = REGISTER_POLICY;
+ * static constexpr camp::idx s_num_bits = X;
+ * static constexpr camp::idx s_num_elem = Y;
+ *
+ */
+}  // namespace expt
+}  // namespace internal
+
 //
 //////////////////////////////////////////////////////////////////////
 //
@@ -54,7 +56,8 @@ namespace expt
 {
 
 #ifdef __AVX512F__
-struct avx512_register {};
+struct avx512_register
+{};
 
 #ifndef RAJA_TENSOR_REGISTER_TYPE
 #define RAJA_TENSOR_REGISTER_TYPE RAJA::expt::avx512_register
@@ -63,7 +66,8 @@ struct avx512_register {};
 
 
 #ifdef __AVX2__
-struct avx2_register {};
+struct avx2_register
+{};
 
 #ifndef RAJA_TENSOR_REGISTER_TYPE
 #define RAJA_TENSOR_REGISTER_TYPE RAJA::expt::avx2_register
@@ -72,7 +76,8 @@ struct avx2_register {};
 
 
 #ifdef __AVX__
-struct avx_register {};
+struct avx_register
+{};
 
 #ifndef RAJA_TENSOR_REGISTER_TYPE
 #define RAJA_TENSOR_REGISTER_TYPE RAJA::expt::avx_register
@@ -85,7 +90,8 @@ struct avx_register {};
 /*!
  * A CUDA warp distributed vector register
  */
-struct cuda_warp_register {};
+struct cuda_warp_register
+{};
 
 #endif
 
@@ -96,12 +102,14 @@ struct cuda_warp_register {};
  * A HIP wavefront distributed vector register
  * On AMD GPUs this is rally just a vector register
  */
-struct hip_wave_register {};
+struct hip_wave_register
+{};
 
 #endif
 
 // The scalar register is always supported (doesn't require any SIMD/SIMT)
-struct scalar_register {};
+struct scalar_register
+{};
 
 #ifndef RAJA_TENSOR_REGISTER_TYPE
 #define RAJA_TENSOR_REGISTER_TYPE RAJA::expt::scalar_register
@@ -109,14 +117,12 @@ struct scalar_register {};
 #endif
 
 
-  // This sets the default SIMD register that will be used
-  using default_register = RAJA_TENSOR_REGISTER_TYPE;
-
-
-} // namespace expt
-} // namespace RAJA
+// This sets the default SIMD register that will be used
+using default_register = RAJA_TENSOR_REGISTER_TYPE;
 
 
+}  // namespace expt
+}  // namespace RAJA
 
 //
 // Now include all of the traits files
diff --git a/include/RAJA/policy/tensor/arch/avx.hpp b/include/RAJA/policy/tensor/arch/avx.hpp
index ed25f1f3e3..c0df27fac9 100644
--- a/include/RAJA/policy/tensor/arch/avx.hpp
+++ b/include/RAJA/policy/tensor/arch/avx.hpp
@@ -17,11 +17,11 @@
 
 #ifdef __AVX__
 
-#include<RAJA/policy/tensor/arch/avx/traits.hpp>
-#include<RAJA/policy/tensor/arch/avx/avx_int64.hpp>
-#include<RAJA/policy/tensor/arch/avx/avx_int32.hpp>
-#include<RAJA/policy/tensor/arch/avx/avx_float.hpp>
-#include<RAJA/policy/tensor/arch/avx/avx_double.hpp>
+#include <RAJA/policy/tensor/arch/avx/traits.hpp>
+#include <RAJA/policy/tensor/arch/avx/avx_int64.hpp>
+#include <RAJA/policy/tensor/arch/avx/avx_int32.hpp>
+#include <RAJA/policy/tensor/arch/avx/avx_float.hpp>
+#include <RAJA/policy/tensor/arch/avx/avx_double.hpp>
 
 
-#endif // __AVX__
+#endif  // __AVX__
diff --git a/include/RAJA/policy/tensor/arch/avx/avx_double.hpp b/include/RAJA/policy/tensor/arch/avx/avx_double.hpp
index 8a23d66e57..f57ef035bb 100644
--- a/include/RAJA/policy/tensor/arch/avx/avx_double.hpp
+++ b/include/RAJA/policy/tensor/arch/avx/avx_double.hpp
@@ -28,450 +28,466 @@
 #include <immintrin.h>
 #include <cmath>
 
-
 namespace RAJA
 {
 namespace expt
 {
 
-  template<>
-  class Register<double, avx_register> :
-    public internal::expt::RegisterBase<Register<double, avx_register>>
+template<>
+class Register<double, avx_register>
+    : public internal::expt::RegisterBase<Register<double, avx_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<double, avx_register>>;
+
+  using register_policy = avx_register;
+  using self_type       = Register<double, avx_register>;
+  using element_type    = double;
+  using register_type   = __m256d;
+
+  using int_vector_type = Register<int64_t, avx_register>;
+
+
+private:
+  register_type m_value;
+
+  RAJA_INLINE
+  __m256i createMask(camp::idx_t N) const
+  {
+    // Generate a mask
+    return _mm256_set_epi64x(N >= 4 ? -1 : 0, N >= 3 ? -1 : 0, N >= 2 ? -1 : 0,
+                             N >= 1 ? -1 : 0);
+  }
+
+  RAJA_INLINE
+  __m256i createStridedOffsets(camp::idx_t stride) const
+  {
+    // Generate a strided offset list
+    return _mm256_set_epi64x(3 * stride, 2 * stride, stride, 0);
+  }
+
+public:
+  static constexpr camp::idx_t s_num_elem = 4;
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  RAJA_INLINE
+  Register() : base_type(), m_value(_mm256_setzero_pd()) {}
+
+  /*!
+   * @brief Construct register with explicit values
+   */
+  RAJA_INLINE
+  Register(element_type x0, element_type x1, element_type x2, element_type x3)
+      : base_type(),
+        m_value(_mm256_set_pd(x3, x2, x1, x0))
+  {}
+
+  /*!
+   * @brief Copy constructor from underlying simd register
+   */
+  RAJA_INLINE
+  explicit Register(register_type const& c) : base_type(), m_value(c) {}
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_INLINE
+  Register(self_type const& c) : base_type(), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  self_type& operator=(self_type const& c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+  /*!
+   * @brief Construct from scalar.
+   * Sets all elements to same value (broadcast).
+   */
+  RAJA_INLINE
+  Register(element_type const& c) : m_value(_mm256_set1_pd(c)) {}
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed(element_type const* ptr)
+  {
+    m_value = _mm256_loadu_pd(ptr);
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed_n(element_type const* ptr, camp::idx_t N)
+  {
+    m_value = _mm256_maskload_pd(ptr, createMask(N));
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_strided(element_type const* ptr, camp::idx_t stride)
+  {
+    for (camp::idx_t i = 0; i < 4; ++i)
+    {
+      m_value[i] = ptr[i * stride];
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Partially gather a register from a strided memory location given
+   *        a run-time number of elements; remaining lanes are zeroed.
+   * @param ptr base address; lane i (for i < N) reads ptr[i * stride]
+   */
+  RAJA_INLINE
+  self_type& load_strided_n(element_type const* ptr,
+                            camp::idx_t stride,
+                            camp::idx_t N)
+  {
+    m_value = _mm256_setzero_pd();
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      m_value[i] = ptr[i * stride];
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed(element_type* ptr) const
+  {
+    _mm256_storeu_pd(ptr, m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store the first N elements to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed_n(element_type* ptr, camp::idx_t N) const
+  {
+    _mm256_maskstore_pd(ptr, createMask(N), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_strided(element_type* ptr, camp::idx_t stride) const
+  {
+    for (camp::idx_t i = 0; i < 4; ++i)
+    {
+      ptr[i * stride] = m_value[i];
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Store the first N elements to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_strided_n(element_type* ptr,
+                                   camp::idx_t stride,
+                                   camp::idx_t N) const
+  {
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      ptr[i * stride] = m_value[i];
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i
+   */
+  RAJA_INLINE
+  element_type get(camp::idx_t i) const { return m_value[i]; }
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param i Offset of scalar to set
+   * @param value Value of scalar to set
+   */
+  RAJA_INLINE
+  self_type& set(element_type value, camp::idx_t i)
+  {
+    m_value[i] = value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& broadcast(element_type const& value)
+  {
+    m_value = _mm256_set1_pd(value);
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& copy(self_type const& src)
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<double, avx_register>>;
-
-      using register_policy = avx_register;
-      using self_type = Register<double, avx_register>;
-      using element_type = double;
-      using register_type = __m256d;
-
-      using int_vector_type = Register<int64_t, avx_register>;
-
-
-    private:
-      register_type m_value;
-
-      RAJA_INLINE
-      __m256i createMask(camp::idx_t N) const {
-        // Generate a mask
-        return  _mm256_set_epi64x(
-            N >= 4 ? -1 : 0,
-            N >= 3 ? -1 : 0,
-            N >= 2 ? -1 : 0,
-            N >= 1 ? -1 : 0);
-      }
-
-      RAJA_INLINE
-      __m256i createStridedOffsets(camp::idx_t stride) const {
-        // Generate a strided offset list
-        return  _mm256_set_epi64x(3*stride, 2*stride, stride, 0);
-      }
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 4;
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-      RAJA_INLINE
-      Register() : base_type(), m_value(_mm256_setzero_pd()) {
-      }
-
-      /*!
-       * @brief Construct register with explicit values
-       */
-      RAJA_INLINE
-      Register(element_type x0,
-                     element_type x1,
-                     element_type x2,
-                     element_type x3) :
-                     base_type(), m_value(_mm256_set_pd(x3,x2,x1,x0))
-      {}
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_INLINE
-      explicit Register(register_type const &c) : base_type(), m_value(c) {}
-
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      Register(self_type const &c) : base_type(), m_value(c.m_value) {}
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-
-      /*!
-       * @brief Construct from scalar.
-       * Sets all elements to same value (broadcast).
-       */
-      RAJA_INLINE
-      Register(element_type const &c) : m_value(_mm256_set1_pd(c)) {}
-
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
-        m_value = _mm256_loadu_pd(ptr);
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
-        m_value = _mm256_maskload_pd(ptr, createMask(N));
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, camp::idx_t stride){
-        for(camp::idx_t i = 0;i < 4;++ i){
-          m_value[i] = ptr[i*stride];
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t stride, camp::idx_t N){
-        m_value = _mm256_setzero_pd();
-        for(camp::idx_t i = 0;i < N;++ i){
-          m_value[i] = ptr[i*stride];
-        };
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
-        _mm256_storeu_pd(ptr, m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
-        _mm256_maskstore_pd(ptr, createMask(N), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t stride) const{
-        for(camp::idx_t i = 0;i < 4;++ i){
-          ptr[i*stride] = m_value[i];
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t stride, camp::idx_t N) const{
-        for(camp::idx_t i = 0;i < N;++ i){
-          ptr[i*stride] = m_value[i];
-        }
-        return *this;
-      }
-
-
-
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      RAJA_INLINE
-      element_type get(camp::idx_t i) const
-      {return m_value[i];}
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      self_type &set(element_type value, camp::idx_t i)
-      {
-        m_value[i] = value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &value){
-        m_value =  _mm256_set1_pd(value);
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(_mm256_add_pd(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(_mm256_sub_pd(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        return self_type(_mm256_mul_pd(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        return self_type(_mm256_div_pd(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t N) const {
-        // AVX2 does not supply a masked divide, so do it manually
-        return self_type(_mm256_set_pd(
-            N >= 4 ? get(3)/b.get(3) : 0,
-            N >= 3 ? get(2)/b.get(2) : 0,
-            N >= 2 ? get(1)/b.get(1) : 0,
-            N >= 1 ? get(0)/b.get(0) : 0
-            ));
-      }
-
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      element_type sum() const
-      {
-        auto sh1 = _mm256_permute_pd(m_value, 0x5);
-        auto red1 = _mm256_add_pd(m_value, sh1);
-        return red1[0]+red1[2];
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max() const
-      {
-        // permute the first two and last two lanes of the register
-        // A = { v[1], v[0], v[3], v[2] }
-        register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
-
-        // take the maximum value of each lane
-        // B = { max{v[0], v[1]},
-        //       max{v[0], v[1]},
-        //       max{v[2], v[3]},
-        //       max{v[2], v[3]} }
-        register_type b = _mm256_max_pd(m_value, a);
-
-        // now take the maximum of a lower and upper halves
-        return RAJA::max<element_type>(b[0], b[2]);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max_n(camp::idx_t N) const
-      {
-        if(N == 4){
-          // permute the first two and last two lanes of the register
-          // A = { v[1], v[0], v[3], v[2] }
-          register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
-
-          // take the maximum value of each lane
-          // B = { max{v[0], v[1]},
-          //       max{v[0], v[1]},
-          //       max{v[2], v[3]},
-          //       max{v[2], v[3]} }
-          register_type b = _mm256_max_pd(m_value, a);
-
-          // now take the maximum of a lower and upper halves
-          return RAJA::max<element_type>(b[0], b[2]);
-        }
-        else if(N == 3){
-          // permute the first two and last two lanes of the register
-          // use the third element TWICE, so we effectively remove the 4th
-          // lane
-          // A = { v[1], v[0], v[2], v[2] }
-          register_type a = _mm256_shuffle_pd(m_value, m_value, 0x3);
-
-          // take the maximum value of each lane
-          // B = { max{v[0], v[1]},
-          //       max{v[0], v[1]},
-          //       max{v[2], v[2]},   <-- just v[2]
-          //       max{v[2], v[3]} }
-          register_type b = _mm256_max_pd(m_value, a);
-
-          // now take the maximum of a lower and upper lane
-          return RAJA::max<element_type>(b[0], b[2]);
-        }
-        else if(N == 2){
-          return RAJA::max<element_type>(m_value[0], m_value[1]);
-        }
-        else if(N == 1){
-          return m_value[0];
-        }
-        return RAJA::operators::limits<double>::min();
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmax(self_type a) const
-      {
-        return self_type(_mm256_max_pd(m_value, a.m_value));
-      }
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min() const
-      {
-        // permute the first two and last two lanes of the register
-        // A = { v[1], v[0], v[3], v[2] }
-        register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
-
-        // take the minimum value of each lane
-        // B = { min{v[0], v[1]},
-        //       min{v[0], v[1]},
-        //       min{v[2], v[3]},
-        //       min{v[2], v[3]} }
-        register_type b = _mm256_min_pd(m_value, a);
-
-        // now take the minimum of a lower and upper halves
-        return RAJA::min<element_type>(b[0], b[2]);
-      }
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min_n(camp::idx_t N) const
-      {
-        if(N == 4){
-          // permute the first two and last two lanes of the register
-          // A = { v[1], v[0], v[3], v[2] }
-          register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
-
-          // take the minimum value of each lane
-          // B = { min{v[0], v[1]},
-          //       min{v[0], v[1]},
-          //       min{v[2], v[3]},
-          //       min{v[2], v[3]} }
-          register_type b = _mm256_min_pd(m_value, a);
-
-          // now take the minimum of a lower and upper halves
-          return RAJA::min<element_type>(b[0], b[2]);
-        }
-        else if(N == 3){
-          // permute the first two and last two lanes of the register
-          // use the third element TWICE, so we effectively remove the 4th
-          // lane
-          // A = { v[1], v[0], v[2], v[2] }
-          register_type a = _mm256_shuffle_pd(m_value, m_value, 0x3);
-
-          // take the minimum value of each lane
-          // B = { min{v[0], v[1]},
-          //       min{v[0], v[1]},
-          //       min{v[2], v[2]},   <-- just v[2]
-          //       min{v[2], v[3]} }
-          register_type b = _mm256_min_pd(m_value, a);
-
-          // now take the minimum of a lower and upper lane
-          return RAJA::min<element_type>(b[0], b[2]);
-        }
-        else if(N == 2){
-          return RAJA::min<element_type>(m_value[0], m_value[1]);
-        }
-        else if(N == 1){
-          return m_value[0];
-        }
-        return RAJA::operators::limits<double>::max();
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmin(self_type a) const
-      {
-        return self_type(_mm256_min_pd(m_value, a.m_value));
-      }
-  };
-
-
-}   // namespace expt
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type add(self_type const& b) const
+  {
+    return self_type(_mm256_add_pd(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type subtract(self_type const& b) const
+  {
+    return self_type(_mm256_sub_pd(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type multiply(self_type const& b) const
+  {
+    return self_type(_mm256_mul_pd(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type divide(self_type const& b) const
+  {
+    return self_type(_mm256_div_pd(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type divide_n(self_type const& b, camp::idx_t N) const
+  {
+    // AVX2 does not supply a masked divide, so do it manually
+    return self_type(_mm256_set_pd(
+        N >= 4 ? get(3) / b.get(3) : 0, N >= 3 ? get(2) / b.get(2) : 0,
+        N >= 2 ? get(1) / b.get(1) : 0, N >= 1 ? get(0) / b.get(0) : 0));
+  }
+
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vectors scalar elements
+   */
+  RAJA_INLINE
+  element_type sum() const
+  {
+    auto sh1  = _mm256_permute_pd(m_value, 0x5);
+    auto red1 = _mm256_add_pd(m_value, sh1);
+    return red1[0] + red1[2];
+  }
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max() const
+  {
+    // permute the first two and last two lanes of the register
+    // A = { v[1], v[0], v[3], v[2] }
+    register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
+
+    // take the maximum value of each lane
+    // B = { max{v[0], v[1]},
+    //       max{v[0], v[1]},
+    //       max{v[2], v[3]},
+    //       max{v[2], v[3]} }
+    register_type b = _mm256_max_pd(m_value, a);
+
+    // now take the maximum of a lower and upper halves
+    return RAJA::max<element_type>(b[0], b[2]);
+  }
+
+  /*!
+   * @brief Returns the largest of the first N elements
+   * @return The largest scalar element among lanes 0..N-1 of the register
+   */
+  RAJA_INLINE
+  element_type max_n(camp::idx_t N) const
+  {
+    if (N == 4)
+    {
+      // permute the first two and last two lanes of the register
+      // A = { v[1], v[0], v[3], v[2] }
+      register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
+
+      // take the maximum value of each lane
+      // B = { max{v[0], v[1]},
+      //       max{v[0], v[1]},
+      //       max{v[2], v[3]},
+      //       max{v[2], v[3]} }
+      register_type b = _mm256_max_pd(m_value, a);
+
+      // now take the maximum of a lower and upper halves
+      return RAJA::max<element_type>(b[0], b[2]);
+    }
+    else if (N == 3)
+    {
+      // permute the first two and last two lanes of the register
+      // use the third element TWICE, so we effectively remove the 4th
+      // lane
+      // A = { v[1], v[1], v[2], v[2] }
+      register_type a = _mm256_shuffle_pd(m_value, m_value, 0x3);
+
+      // take the maximum value of each lane
+      // B = { max{v[0], v[1]},
+      //       max{v[1], v[1]},   <-- just v[1]
+      //       max{v[2], v[2]},   <-- just v[2]
+      //       max{v[2], v[3]} }
+      register_type b = _mm256_max_pd(m_value, a);
+
+      // now take the maximum of a lower and upper lane
+      return RAJA::max<element_type>(b[0], b[2]);
+    }
+    else if (N == 2)
+    {
+      return RAJA::max<element_type>(m_value[0], m_value[1]);
+    }
+    else if (N == 1)
+    {
+      return m_value[0];
+    }
+    return RAJA::operators::limits<double>::min();
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+  self_type vmax(self_type a) const
+  {
+    return self_type(_mm256_max_pd(m_value, a.m_value));
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min() const
+  {
+    // permute the first two and last two lanes of the register
+    // A = { v[1], v[0], v[3], v[2] }
+    register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
+
+    // take the minimum value of each lane
+    // B = { min{v[0], v[1]},
+    //       min{v[0], v[1]},
+    //       min{v[2], v[3]},
+    //       min{v[2], v[3]} }
+    register_type b = _mm256_min_pd(m_value, a);
+
+    // now take the minimum of a lower and upper halves
+    return RAJA::min<element_type>(b[0], b[2]);
+  }
+
+  /*!
+   * @brief Returns the smallest of the first N elements
+   * @return The smallest scalar element among lanes 0..N-1 of the register
+   */
+  RAJA_INLINE
+  element_type min_n(camp::idx_t N) const
+  {
+    if (N == 4)
+    {
+      // permute the first two and last two lanes of the register
+      // A = { v[1], v[0], v[3], v[2] }
+      register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
+
+      // take the minimum value of each lane
+      // B = { min{v[0], v[1]},
+      //       min{v[0], v[1]},
+      //       min{v[2], v[3]},
+      //       min{v[2], v[3]} }
+      register_type b = _mm256_min_pd(m_value, a);
+
+      // now take the minimum of a lower and upper halves
+      return RAJA::min<element_type>(b[0], b[2]);
+    }
+    else if (N == 3)
+    {
+      // permute the first two and last two lanes of the register
+      // use the third element TWICE, so we effectively remove the 4th
+      // lane
+      // A = { v[1], v[1], v[2], v[2] }
+      register_type a = _mm256_shuffle_pd(m_value, m_value, 0x3);
+
+      // take the minimum value of each lane
+      // B = { min{v[0], v[1]},
+      //       min{v[1], v[1]},   <-- just v[1]
+      //       min{v[2], v[2]},   <-- just v[2]
+      //       min{v[2], v[3]} }
+      register_type b = _mm256_min_pd(m_value, a);
+
+      // now take the minimum of a lower and upper lane
+      return RAJA::min<element_type>(b[0], b[2]);
+    }
+    else if (N == 2)
+    {
+      return RAJA::min<element_type>(m_value[0], m_value[1]);
+    }
+    else if (N == 1)
+    {
+      return m_value[0];
+    }
+    return RAJA::operators::limits<double>::max();
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  self_type vmin(self_type a) const
+  {
+    return self_type(_mm256_min_pd(m_value, a.m_value));
+  }
+};
+
+
+}  // namespace expt
 }  // namespace RAJA
 
 
 #endif
 
-#endif //__AVX__
+#endif  //__AVX__
diff --git a/include/RAJA/policy/tensor/arch/avx/avx_float.hpp b/include/RAJA/policy/tensor/arch/avx/avx_float.hpp
index 1e6563742a..6ea7188f3a 100644
--- a/include/RAJA/policy/tensor/arch/avx/avx_float.hpp
+++ b/include/RAJA/policy/tensor/arch/avx/avx_float.hpp
@@ -28,463 +28,484 @@
 #include <immintrin.h>
 #include <cmath>
 
-
 namespace RAJA
 {
 namespace expt
 {
 
-  template<>
-  class Register<float, avx_register> :
-    public internal::expt::RegisterBase<Register<float, avx_register>>
+template<>
+class Register<float, avx_register>
+    : public internal::expt::RegisterBase<Register<float, avx_register>>
+{
+public:
+  using base_type = internal::expt::RegisterBase<Register<float, avx_register>>;
+
+  using register_policy = avx_register;
+  using self_type       = Register<float, avx_register>;
+  using element_type    = float;
+  using register_type   = __m256;
+
+  using int_vector_type = Register<int32_t, avx_register>;
+
+
+private:
+  register_type m_value;
+
+  RAJA_INLINE
+  __m256i createMask(camp::idx_t N) const
+  {
+    // Generate a mask
+    return _mm256_set_epi32(N >= 8 ? -1 : 0, N >= 7 ? -1 : 0, N >= 6 ? -1 : 0,
+                            N >= 5 ? -1 : 0, N >= 4 ? -1 : 0, N >= 3 ? -1 : 0,
+                            N >= 2 ? -1 : 0, N >= 1 ? -1 : 0);
+  }
+
+public:
+  static constexpr camp::idx_t s_num_elem = 8;
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  RAJA_INLINE
+  Register() : base_type(), m_value(_mm256_setzero_ps()) {}
+
+  /*!
+   * @brief Copy constructor from underlying simd register
+   */
+  RAJA_INLINE
+  explicit Register(register_type const& c) : base_type(), m_value(c) {}
+
+  /*!
+   * @brief Construct register with explicit values
+   */
+  RAJA_INLINE
+  Register(element_type x0,
+           element_type x1,
+           element_type x2,
+           element_type x3,
+           element_type x4,
+           element_type x5,
+           element_type x6,
+           element_type x7)
+      : m_value(_mm256_set_ps(x7, x6, x5, x4, x3, x2, x1, x0))
+  {}
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_INLINE
+  Register(self_type const& c) : base_type(), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  self_type& operator=(self_type const& c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+  /*!
+   * @brief Construct from scalar.
+   * Sets all elements to same value (broadcast).
+   */
+  RAJA_INLINE
+  Register(element_type const& c) : m_value(_mm256_set1_ps(c)) {}
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed(element_type const* ptr)
+  {
+    m_value = _mm256_loadu_ps(ptr);
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed_n(element_type const* ptr, camp::idx_t N)
+  {
+    m_value = _mm256_maskload_ps(ptr, createMask(N));
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_strided(element_type const* ptr, camp::idx_t stride)
+  {
+    for (camp::idx_t i = 0; i < 8; ++i)
+    {
+      m_value[i] = ptr[i * stride];
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Partially gather a register from a strided memory location given
+   *        a run-time number of elements; remaining lanes are zeroed.
+   *
+   */
+  RAJA_INLINE
+  self_type& load_strided_n(element_type const* ptr,
+                            camp::idx_t stride,
+                            camp::idx_t N)
+  {
+    m_value = _mm256_setzero_ps();
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      m_value[i] = ptr[i * stride];
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed(element_type* ptr) const
+  {
+    _mm256_storeu_ps(ptr, m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store the first N elements to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed_n(element_type* ptr, camp::idx_t N) const
+  {
+    _mm256_maskstore_ps(ptr, createMask(N), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_strided(element_type* ptr, camp::idx_t stride) const
+  {
+    for (camp::idx_t i = 0; i < 8; ++i)
+    {
+      ptr[i * stride] = m_value[i];
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Store the first N elements to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_strided_n(element_type* ptr,
+                                   camp::idx_t stride,
+                                   camp::idx_t N) const
+  {
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      ptr[i * stride] = m_value[i];
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i
+   */
+  RAJA_INLINE
+  element_type get(camp::idx_t i) const { return m_value[i]; }
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param i Offset of scalar to set
+   * @param value Value of scalar to set
+   */
+  RAJA_INLINE
+  self_type& set(element_type value, camp::idx_t i)
+  {
+    m_value[i] = value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& broadcast(element_type const& value)
+  {
+    m_value = _mm256_set1_ps(value);
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& copy(self_type const& src)
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<float, avx_register>>;
-
-      using register_policy = avx_register;
-      using self_type = Register<float, avx_register>;
-      using element_type = float;
-      using register_type = __m256;
-
-      using int_vector_type = Register<int32_t, avx_register>;
-
-
-    private:
-      register_type m_value;
-
-      RAJA_INLINE
-      __m256i createMask(camp::idx_t N) const {
-        // Generate a mask
-        return  _mm256_set_epi32(
-            N >= 8 ? -1 : 0,
-            N >= 7 ? -1 : 0,
-            N >= 6 ? -1 : 0,
-            N >= 5 ? -1 : 0,
-            N >= 4 ? -1 : 0,
-            N >= 3 ? -1 : 0,
-            N >= 2 ? -1 : 0,
-            N >= 1 ? -1 : 0);
-      }
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 8;
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-      RAJA_INLINE
-      Register() : base_type(), m_value(_mm256_setzero_ps()) {
-      }
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_INLINE
-      explicit Register(register_type const &c) : base_type(), m_value(c) {}
-
-
-      /*!
-       * @brief Construct register with explicit values
-       */
-      RAJA_INLINE
-      Register(element_type x0,
-                     element_type x1,
-                     element_type x2,
-                     element_type x3,
-                     element_type x4,
-                     element_type x5,
-                     element_type x6,
-                     element_type x7) :
-        m_value(_mm256_set_ps(x7,x6,x5,x4,x3,x2,x1,x0))
-      {}
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      Register(self_type const &c) : base_type(), m_value(c.m_value) {}
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-
-      /*!
-       * @brief Construct from scalar.
-       * Sets all elements to same value (broadcast).
-       */
-      RAJA_INLINE
-      Register(element_type const &c) : m_value(_mm256_set1_ps(c)) {}
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
-        m_value = _mm256_loadu_ps(ptr);
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
-        m_value = _mm256_maskload_ps(ptr, createMask(N));
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, camp::idx_t stride){
-        for(camp::idx_t i = 0;i < 8;++ i){
-          m_value[i] = ptr[i*stride];
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t stride, camp::idx_t N){
-        m_value = _mm256_setzero_ps();
-        for(camp::idx_t i = 0;i < N;++ i){
-          m_value[i] = ptr[i*stride];
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
-        _mm256_storeu_ps(ptr, m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
-        _mm256_maskstore_ps(ptr, createMask(N), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t stride) const{
-        for(camp::idx_t i = 0;i < 8;++ i){
-          ptr[i*stride] = m_value[i];
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t stride, camp::idx_t N) const{
-        for(camp::idx_t i = 0;i < N;++ i){
-          ptr[i*stride] = m_value[i];
-        }
-        return *this;
-      }
-
-
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      RAJA_INLINE
-      element_type get(camp::idx_t i) const
-      {return m_value[i];}
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      self_type &set(element_type value, camp::idx_t i)
-      {
-        m_value[i] = value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &value){
-        m_value =  _mm256_set1_ps(value);
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(_mm256_add_ps(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(_mm256_sub_ps(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        return self_type(_mm256_mul_ps(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        return self_type(_mm256_div_ps(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t N ) const {
-        // AVX2 does not supply a masked divide
-        return self_type(_mm256_set_ps(
-            N >= 8 ? get(7)/b.get(7) : 0,
-            N >= 7 ? get(6)/b.get(6) : 0,
-            N >= 6 ? get(5)/b.get(5) : 0,
-            N >= 5 ? get(4)/b.get(4) : 0,
-            N >= 4 ? get(3)/b.get(3) : 0,
-            N >= 3 ? get(2)/b.get(2) : 0,
-            N >= 2 ? get(1)/b.get(1) : 0,
-            N >= 1 ? get(0)/b.get(0) : 0
-            ));
-      }
-
-
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      element_type sum() const
-      {
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_permute_ps(m_value, 0xB1);
-        auto red1 = _mm256_add_ps(m_value, sh1);
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_permute_ps(red1, 0x4E);
-        auto red2 = _mm256_add_ps(red1, sh2);
-
-        return red2[0] + red2[4];
-      }
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max() const
-      {
-        // swap odd-even pairs and combine
-        auto sh1 = _mm256_permute_ps(m_value, 0xB1);
-        auto red1 = _mm256_max_ps(m_value, sh1);
-
-        // swap odd-even quads and combine
-        auto sh2 = _mm256_permute_ps(red1, 0x4E);
-        auto red2 = _mm256_max_ps(red1, sh2);
-
-        // combine quads
-        return RAJA::max<element_type>(red2[0], red2[4]);
-      }
-
-      /*!
-       * @brief Returns the largest element of first N lanes
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max_n(camp::idx_t N) const
-      {
-        // Some simple cases
-        if(N <= 0 || N >8){
-          return RAJA::operators::limits<float>::min();
-        }
-        if(N == 1){
-          return m_value[0];
-        }
-        if(N == 2){
-          return RAJA::max<element_type>(m_value[0], m_value[1]);
-        }
-
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_permute_ps(m_value, 0xB1);
-
-        if(N == 7){
-          // blend out the 8th lane of the permute
-          sh1 = _mm256_blend_ps(sh1, m_value, 0x40);
-        }
-
-        auto red1 = _mm256_max_ps(m_value, sh1);
-
-        // Some more simple shortcuts
-        if(N == 3){
-          return RAJA::max<element_type>(red1[0], m_value[2]);
-        }
-
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_permute_ps(red1, 0x4E);
-        auto red2 = _mm256_max_ps(red1, sh2);
-
-        if(N == 4){
-          return red2[0];
-        }
-        if(N == 5){
-          return RAJA::max<element_type>(red2[0], m_value[4]);
-        }
-        if(N == 6){
-          return RAJA::max<element_type>(red2[0], red1[4]);
-        }
-
-        // 7 or 8 lanes
-        return RAJA::max<element_type>(red2[0], red2[4]);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmax(self_type a) const
-      {
-        return self_type(_mm256_max_ps(m_value, a.m_value));
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min() const
-      {
-        // swap odd-even pairs and combine
-        auto sh1 = _mm256_permute_ps(m_value, 0xB1);
-        auto red1 = _mm256_min_ps(m_value, sh1);
-
-        // swap odd-even quads and combine
-        auto sh2 = _mm256_permute_ps(red1, 0x4E);
-        auto red2 = _mm256_min_ps(red1, sh2);
-
-        // combine quads
-        return RAJA::min<element_type>(red2[0], red2[4]);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min_n(camp::idx_t N) const
-      {
-        // Some simple cases
-        if(N <= 0 || N >8){
-          return RAJA::operators::limits<float>::max();
-        }
-        if(N == 1){
-          return m_value[0];
-        }
-        if(N == 2){
-          return RAJA::min<element_type>(m_value[0], m_value[1]);
-        }
-
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_permute_ps(m_value, 0xB1);
-
-        if(N == 7){
-          // blend out the 8th lane of the permute
-          sh1 = _mm256_blend_ps(sh1, m_value, 0x40);
-        }
-
-        auto red1 = _mm256_min_ps(m_value, sh1);
-
-        // Some more simple shortcuts
-        if(N == 3){
-          return RAJA::min<element_type>(red1[0], m_value[2]);
-        }
-
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_permute_ps(red1, 0x4E);
-        auto red2 = _mm256_min_ps(red1, sh2);
-
-        if(N == 4){
-          return red2[0];
-        }
-        if(N == 5){
-          return RAJA::min<element_type>(red2[0], m_value[4]);
-        }
-        if(N == 6){
-          return RAJA::min<element_type>(red2[0], red1[4]);
-        }
-
-        // 7 or 8 lanes
-        return RAJA::min<element_type>(red2[0], red2[4]);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmin(self_type a) const
-      {
-        return self_type(_mm256_min_ps(m_value, a.m_value));
-      }
-  };
-
-
-}   // namespace expt
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type add(self_type const& b) const
+  {
+    return self_type(_mm256_add_ps(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type subtract(self_type const& b) const
+  {
+    return self_type(_mm256_sub_ps(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type multiply(self_type const& b) const
+  {
+    return self_type(_mm256_mul_ps(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type divide(self_type const& b) const
+  {
+    return self_type(_mm256_div_ps(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type divide_n(self_type const& b, camp::idx_t N) const
+  {
+    // AVX2 does not supply a masked divide
+    return self_type(_mm256_set_ps(
+        N >= 8 ? get(7) / b.get(7) : 0, N >= 7 ? get(6) / b.get(6) : 0,
+        N >= 6 ? get(5) / b.get(5) : 0, N >= 5 ? get(4) / b.get(4) : 0,
+        N >= 4 ? get(3) / b.get(3) : 0, N >= 3 ? get(2) / b.get(2) : 0,
+        N >= 2 ? get(1) / b.get(1) : 0, N >= 1 ? get(0) / b.get(0) : 0));
+  }
+
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vector's scalar elements
+   */
+  RAJA_INLINE
+  element_type sum() const
+  {
+    // swap odd-even pairs and add
+    auto sh1  = _mm256_permute_ps(m_value, 0xB1);
+    auto red1 = _mm256_add_ps(m_value, sh1);
+
+    // swap odd-even quads and add
+    auto sh2  = _mm256_permute_ps(red1, 0x4E);
+    auto red2 = _mm256_add_ps(red1, sh2);
+
+    return red2[0] + red2[4];
+  }
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max() const
+  {
+    // swap odd-even pairs and combine
+    auto sh1  = _mm256_permute_ps(m_value, 0xB1);
+    auto red1 = _mm256_max_ps(m_value, sh1);
+
+    // swap odd-even quads and combine
+    auto sh2  = _mm256_permute_ps(red1, 0x4E);
+    auto red2 = _mm256_max_ps(red1, sh2);
+
+    // combine quads
+    return RAJA::max<element_type>(red2[0], red2[4]);
+  }
+
+  /*!
+   * @brief Returns the largest element of first N lanes
+   * @return The largest scalar element among the first N lanes
+   */
+  RAJA_INLINE
+  element_type max_n(camp::idx_t N) const
+  {
+    // Some simple cases
+    if (N <= 0 || N > 8)
+    {
+      return RAJA::operators::limits<float>::min();
+    }
+    if (N == 1)
+    {
+      return m_value[0];
+    }
+    if (N == 2)
+    {
+      return RAJA::max<element_type>(m_value[0], m_value[1]);
+    }
+
+    // swap odd-even pairs and combine
+    auto sh1 = _mm256_permute_ps(m_value, 0xB1);
+
+    if (N == 7)
+    {
+      // overwrite lane 6 (0x40), where the permute put the unused 8th element
+      sh1 = _mm256_blend_ps(sh1, m_value, 0x40);
+    }
+
+    auto red1 = _mm256_max_ps(m_value, sh1);
+
+    // Some more simple shortcuts
+    if (N == 3)
+    {
+      return RAJA::max<element_type>(red1[0], m_value[2]);
+    }
+
+
+    // swap odd-even quads and combine
+    auto sh2  = _mm256_permute_ps(red1, 0x4E);
+    auto red2 = _mm256_max_ps(red1, sh2);
+
+    if (N == 4)
+    {
+      return red2[0];
+    }
+    if (N == 5)
+    {
+      return RAJA::max<element_type>(red2[0], m_value[4]);
+    }
+    if (N == 6)
+    {
+      return RAJA::max<element_type>(red2[0], red1[4]);
+    }
+
+    // 7 or 8 lanes
+    return RAJA::max<element_type>(red2[0], red2[4]);
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+  self_type vmax(self_type a) const
+  {
+    return self_type(_mm256_max_ps(m_value, a.m_value));
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min() const
+  {
+    // swap odd-even pairs and combine
+    auto sh1  = _mm256_permute_ps(m_value, 0xB1);
+    auto red1 = _mm256_min_ps(m_value, sh1);
+
+    // swap odd-even quads and combine
+    auto sh2  = _mm256_permute_ps(red1, 0x4E);
+    auto red2 = _mm256_min_ps(red1, sh2);
+
+    // combine quads
+    return RAJA::min<element_type>(red2[0], red2[4]);
+  }
+
+  /*!
+   * @brief Returns the smallest element of first N lanes
+   * @return The smallest scalar element among the first N lanes
+   */
+  RAJA_INLINE
+  element_type min_n(camp::idx_t N) const
+  {
+    // Some simple cases
+    if (N <= 0 || N > 8)
+    {
+      return RAJA::operators::limits<float>::max();
+    }
+    if (N == 1)
+    {
+      return m_value[0];
+    }
+    if (N == 2)
+    {
+      return RAJA::min<element_type>(m_value[0], m_value[1]);
+    }
+
+    // swap odd-even pairs and combine
+    auto sh1 = _mm256_permute_ps(m_value, 0xB1);
+
+    if (N == 7)
+    {
+      // overwrite lane 6 (0x40), where the permute put the unused 8th element
+      sh1 = _mm256_blend_ps(sh1, m_value, 0x40);
+    }
+
+    auto red1 = _mm256_min_ps(m_value, sh1);
+
+    // Some more simple shortcuts
+    if (N == 3)
+    {
+      return RAJA::min<element_type>(red1[0], m_value[2]);
+    }
+
+
+    // swap odd-even quads and combine
+    auto sh2  = _mm256_permute_ps(red1, 0x4E);
+    auto red2 = _mm256_min_ps(red1, sh2);
+
+    if (N == 4)
+    {
+      return red2[0];
+    }
+    if (N == 5)
+    {
+      return RAJA::min<element_type>(red2[0], m_value[4]);
+    }
+    if (N == 6)
+    {
+      return RAJA::min<element_type>(red2[0], red1[4]);
+    }
+
+    // 7 or 8 lanes
+    return RAJA::min<element_type>(red2[0], red2[4]);
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  self_type vmin(self_type a) const
+  {
+    return self_type(_mm256_min_ps(m_value, a.m_value));
+  }
+};
+
+
+}  // namespace expt
 
 }  // namespace RAJA
 
 
 #endif
 
-#endif //__AVX__
+#endif  //__AVX__
diff --git a/include/RAJA/policy/tensor/arch/avx/avx_int32.hpp b/include/RAJA/policy/tensor/arch/avx/avx_int32.hpp
index 11ab97be16..fc5a7a1a58 100644
--- a/include/RAJA/policy/tensor/arch/avx/avx_int32.hpp
+++ b/include/RAJA/policy/tensor/arch/avx/avx_int32.hpp
@@ -28,743 +28,765 @@
 #include <immintrin.h>
 #include <cmath>
 
-
 namespace RAJA
 {
 namespace expt
 {
-  template<>
-  class Register<int32_t, avx_register> :
-    public internal::expt::RegisterBase<Register<int32_t, avx_register>>
+template<>
+class Register<int32_t, avx_register>
+    : public internal::expt::RegisterBase<Register<int32_t, avx_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<int32_t, avx_register>>;
+
+  using register_policy = avx_register;
+  using self_type       = Register<int32_t, avx_register>;
+  using element_type    = int32_t;
+  using register_type   = __m256i;
+
+  using int_vector_type = Register<int32_t, avx_register>;
+
+
+private:
+  register_type m_value;
+
+  RAJA_INLINE
+  __m256i createMask(camp::idx_t N) const
+  {
+    // Generate a mask
+    return _mm256_set_epi32(N >= 8 ? -1 : 0, N >= 7 ? -1 : 0, N >= 6 ? -1 : 0,
+                            N >= 5 ? -1 : 0, N >= 4 ? -1 : 0, N >= 3 ? -1 : 0,
+                            N >= 2 ? -1 : 0, N >= 1 ? -1 : 0);
+  }
+
+  RAJA_INLINE
+  __m256i createStridedOffsets(camp::idx_t stride) const
+  {
+    // Generate a strided offset list
+    return _mm256_set_epi32(7 * stride, 6 * stride, 5 * stride, 4 * stride,
+                            3 * stride, 2 * stride, stride, 0);
+  }
+
+  RAJA_INLINE
+  __m256i createPermute1(camp::idx_t N) const
+  {
+    // Generate a permutation for first round of min/max routines
+    return _mm256_set_epi32(N >= 7 ? 6 : 0, N >= 8 ? 7 : 0, N >= 5 ? 4 : 0,
+                            N >= 6 ? 5 : 0, N >= 3 ? 2 : 0, N >= 4 ? 3 : 0,
+                            N >= 1 ? 0 : 0, N >= 2 ? 1 : 0);
+  }
+
+  RAJA_INLINE
+  __m256i createPermute2(camp::idx_t N) const
+  {
+    // Generate a permutation for second round of min/max routines
+    return _mm256_set_epi32(N >= 6 ? 5 : 0, N >= 5 ? 4 : 0, N >= 8 ? 7 : 0,
+                            N >= 7 ? 6 : 0, N >= 2 ? 1 : 0, N >= 1 ? 0 : 0,
+                            N >= 4 ? 3 : 0, N >= 2 ? 2 : 0);
+  }
+
+public:
+  static constexpr camp::idx_t s_num_elem = 8;
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  RAJA_INLINE
+  Register() : base_type(), m_value(_mm256_setzero_si256()) {}
+
+  /*!
+   * @brief Copy constructor from underlying simd register
+   */
+  RAJA_INLINE
+  explicit Register(register_type const& c) : base_type(), m_value(c) {}
+
+  /*!
+   * @brief Construct register with explicit values
+   */
+  RAJA_INLINE
+  Register(element_type x0,
+           element_type x1,
+           element_type x2,
+           element_type x3,
+           element_type x4,
+           element_type x5,
+           element_type x6,
+           element_type x7)
+      : m_value(_mm256_set_epi32(x7, x6, x5, x4, x3, x2, x1, x0))
+  {}
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_INLINE
+  Register(self_type const& c) : base_type(), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  self_type& operator=(self_type const& c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+  /*!
+   * @brief Construct from scalar.
+   * Sets all elements to same value (broadcast).
+   */
+  RAJA_INLINE
+  Register(element_type const& c) : m_value(_mm256_set1_epi32(c)) {}
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed(element_type const* ptr)
+  {
+    m_value = _mm256_loadu_si256((__m256i const*)ptr);
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed_n(element_type const* ptr, camp::idx_t N)
+  {
+    m_value = _mm256_setzero_si256();
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      set(ptr[i], i);
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_strided(element_type const* ptr, camp::idx_t stride)
+  {
+    for (camp::idx_t i = 0; i < 8; ++i)
+    {
+      set(ptr[i * stride], i);
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Partially gather a register from a strided memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type& load_strided_n(element_type const* ptr,
+                            camp::idx_t stride,
+                            camp::idx_t N)
+  {
+    m_value = _mm256_setzero_si256();
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      set(ptr[i * stride], i);
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed(element_type* ptr) const
+  {
+    _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store the first N elements to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed_n(element_type* ptr, camp::idx_t N) const
+  {
+    _mm256_maskstore_ps(reinterpret_cast<float*>(ptr), createMask(N),
+                        reinterpret_cast<__m256>(m_value));
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_strided(element_type* ptr, camp::idx_t stride) const
+  {
+    for (camp::idx_t i = 0; i < 8; ++i)
+    {
+      ptr[i * stride] = get(i);
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Store the first N elements to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_strided_n(element_type* ptr,
+                                   camp::idx_t stride,
+                                   camp::idx_t N) const
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<int32_t, avx_register>>;
-
-      using register_policy = avx_register;
-      using self_type = Register<int32_t, avx_register>;
-      using element_type = int32_t;
-      using register_type = __m256i;
-
-      using int_vector_type = Register<int32_t, avx_register>;
-
-
-    private:
-      register_type m_value;
-
-      RAJA_INLINE
-      __m256i createMask(camp::idx_t N) const {
-        // Generate a mask
-        return  _mm256_set_epi32(
-            N >= 8 ? -1 : 0,
-            N >= 7 ? -1 : 0,
-            N >= 6 ? -1 : 0,
-            N >= 5 ? -1 : 0,
-            N >= 4 ? -1 : 0,
-            N >= 3 ? -1 : 0,
-            N >= 2 ? -1 : 0,
-            N >= 1 ? -1 : 0);
-      }
-
-      RAJA_INLINE
-      __m256i createStridedOffsets(camp::idx_t stride) const {
-        // Generate a strided offset list
-        return  _mm256_set_epi32(
-            7*stride, 6*stride, 5*stride, 4*stride,
-            3*stride, 2*stride, stride, 0);
-      }
-
-      RAJA_INLINE
-      __m256i createPermute1(camp::idx_t N) const {
-        // Generate a permutation for first round of min/max routines
-        return  _mm256_set_epi32(
-            N >= 7 ? 6 : 0,
-            N >= 8 ? 7 : 0,
-            N >= 5 ? 4 : 0,
-            N >= 6 ? 5 : 0,
-            N >= 3 ? 2 : 0,
-            N >= 4 ? 3 : 0,
-            N >= 1 ? 0 : 0,
-            N >= 2 ? 1 : 0);
-      }
-
-      RAJA_INLINE
-      __m256i createPermute2(camp::idx_t N) const {
-        // Generate a permutation for second round of min/max routines
-        return  _mm256_set_epi32(
-            N >= 6 ? 5 : 0,
-            N >= 5 ? 4 : 0,
-            N >= 8 ? 7 : 0,
-            N >= 7 ? 6 : 0,
-            N >= 2 ? 1 : 0,
-            N >= 1 ? 0 : 0,
-            N >= 4 ? 3 : 0,
-            N >= 2 ? 2 : 0);
-      }
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 8;
-
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-      RAJA_INLINE
-      Register() : base_type(), m_value(_mm256_setzero_si256()) {
-      }
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_INLINE
-      explicit Register(register_type const &c) : base_type(), m_value(c) {}
-
-
-      /*!
-       * @brief Construct register with explicit values
-       */
-      RAJA_INLINE
-      Register(element_type x0,
-                     element_type x1,
-                     element_type x2,
-                     element_type x3,
-                     element_type x4,
-                     element_type x5,
-                     element_type x6,
-                     element_type x7) :
-        m_value(_mm256_set_epi32(x7,x6,x5,x4,x3,x2,x1,x0))
-      {}
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      Register(self_type const &c) : base_type(), m_value(c.m_value) {}
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-
-      /*!
-       * @brief Construct from scalar.
-       * Sets all elements to same value (broadcast).
-       */
-      RAJA_INLINE
-      Register(element_type const &c) : m_value(_mm256_set1_epi32(c)) {}
-
-
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
-        m_value = _mm256_loadu_si256((__m256i const *)ptr);
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
-        m_value = _mm256_setzero_si256();
-        for(camp::idx_t i = 0;i < N;++ i){
-          set(ptr[i], i);
-        }
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, camp::idx_t stride){
-        for(camp::idx_t i = 0;i < 8;++ i){
-          set(ptr[i*stride], i);
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t stride, camp::idx_t N){
-        m_value = _mm256_setzero_si256();
-        for(camp::idx_t i = 0;i < N;++ i){
-          set(ptr[i*stride], i);
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
-        _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
-        _mm256_maskstore_ps(reinterpret_cast<float*>(ptr), createMask(N), reinterpret_cast<__m256>(m_value));
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t stride) const{
-        for(camp::idx_t i = 0;i < 8;++ i){
-          ptr[i*stride] = get(i);
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t stride, camp::idx_t N) const{
-        for(camp::idx_t i = 0;i < N;++ i){
-          ptr[i*stride] = get(i);
-        }
-        return *this;
-      }
-
-
-
-
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      RAJA_INLINE
-      element_type get(camp::idx_t i) const
-      {
-        // got to be a nicer way to do this!?!?
-        switch(i){
-          case 0: return _mm256_extract_epi32(m_value, 0);
-          case 1: return _mm256_extract_epi32(m_value, 1);
-          case 2: return _mm256_extract_epi32(m_value, 2);
-          case 3: return _mm256_extract_epi32(m_value, 3);
-          case 4: return _mm256_extract_epi32(m_value, 4);
-          case 5: return _mm256_extract_epi32(m_value, 5);
-          case 6: return _mm256_extract_epi32(m_value, 6);
-          case 7: return _mm256_extract_epi32(m_value, 7);
-        }
-        return 0;
-      }
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      self_type &set(element_type value, camp::idx_t i)
-      {
-        // got to be a nicer way to do this!?!?
-        switch(i){
-          case 0: m_value = _mm256_insert_epi32(m_value, value, 0); break;
-          case 1: m_value = _mm256_insert_epi32(m_value, value, 1); break;
-          case 2: m_value = _mm256_insert_epi32(m_value, value, 2); break;
-          case 3: m_value = _mm256_insert_epi32(m_value, value, 3); break;
-          case 4: m_value = _mm256_insert_epi32(m_value, value, 4); break;
-          case 5: m_value = _mm256_insert_epi32(m_value, value, 5); break;
-          case 6: m_value = _mm256_insert_epi32(m_value, value, 6); break;
-          case 7: m_value = _mm256_insert_epi32(m_value, value, 7); break;
-        }
-
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &value){
-        m_value =  _mm256_set1_epi32(value);
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        // no 8-way 32-bit add, but there is a 4-way... split and conquer
-
-        // Low 128-bits  - use _mm256_castsi256_si128???
-        auto low_a = _mm256_castsi256_si128(m_value);
-        auto low_b = _mm256_castsi256_si128(b.m_value);
-        auto res_low = _mm256_castsi128_si256(_mm_add_epi32(low_a, low_b));
-
-        // Hi 128-bits
-        auto hi_a = _mm256_extractf128_si256(m_value, 1);
-        auto hi_b = _mm256_extractf128_si256(b.m_value, 1);
-        auto res_hi = _mm_add_epi32(hi_a, hi_b);
-
-        // Stitch back together
-        return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        // no 8-way 32-bit subtract, but there is a 4-way... split and conquer
-
-        // Low 128-bits
-        auto low_a = _mm256_castsi256_si128(m_value);
-        auto low_b = _mm256_castsi256_si128(b.m_value);
-        auto res_low = _mm256_castsi128_si256(_mm_sub_epi32(low_a, low_b));
-
-        // Hi 128-bits
-        auto hi_a = _mm256_extractf128_si256(m_value, 1);
-        auto hi_b = _mm256_extractf128_si256(b.m_value, 1);
-        auto res_hi = _mm_sub_epi32(hi_a, hi_b);
-
-        // Stitch back together
-        return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        // no 8-way 32-bit multiply, but there is a 32x32 -> 64
-        // This gets ugly :)
-
-        // Low 128-bits
-        auto low_a = _mm256_castsi256_si128(m_value);
-        auto low_b = _mm256_castsi256_si128(b.m_value);
-        // multiply even lanes 0, 2
-        auto res_low_even = _mm_mul_epi32(low_a, low_b);
-
-        // multiply odd lanes 1, 3
-        auto low_a_sh = _mm_shuffle_epi32(low_a, 0xB1);
-        auto low_b_sh = _mm_shuffle_epi32(low_b, 0xB1);
-        auto res_low_odd = _mm_mul_epi32(low_a_sh, low_b_sh);
-
-        // recombine to get all 4 lanes
-        // note: AVX doesn't have a int32 blend, so we use the float32 blend
-        res_low_odd = _mm_shuffle_epi32(res_low_odd, 0xB1);
-        auto res_low = _mm256_castsi128_si256(_mm_castps_si128(
-            _mm_blend_ps(_mm_castsi128_ps(res_low_odd),
-                         _mm_castsi128_ps(res_low_even),
-                         0x05)
-            ));
-
-
-        // High 128-bits
-        auto hi_a = _mm256_extractf128_si256(m_value, 1);
-        auto hi_b = _mm256_extractf128_si256(b.m_value, 1);
-        // multiply even lanes 0, 2
-        auto res_hi_even = _mm_mul_epi32(hi_a, hi_b);
-
-        // multiply odd lanes 1, 3
-        auto hi_a_sh = _mm_shuffle_epi32(hi_a, 0xB1);
-        auto hi_b_sh = _mm_shuffle_epi32(hi_b, 0xB1);
-        auto res_hi_odd = _mm_mul_epi32(hi_a_sh, hi_b_sh);
-
-        // recombine to get all 4 lanes
-        // note: AVX doesn't have a int32 blend, so we use the float32 blend
-        res_hi_odd = _mm_shuffle_epi32(res_hi_odd, 0xB1);
-        auto res_hi = _mm_castps_si128(
-            _mm_blend_ps(_mm_castsi128_ps(res_hi_odd),
-                         _mm_castsi128_ps(res_hi_even),
-                         0x05)
-            );
-
-        // Stitch back together
-        return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        // AVX2 does not supply an integer divide, so do it manually
-        return self_type(_mm256_set_epi32(
-            get(7)/b.get(7),
-            get(6)/b.get(6),
-            get(5)/b.get(5),
-            get(4)/b.get(4),
-            get(3)/b.get(3),
-            get(2)/b.get(2),
-            get(1)/b.get(1),
-            get(0)/b.get(0)
-            ));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t N) const {
-        // AVX2 does not supply an integer divide, so do it manually
-        return self_type(_mm256_set_epi32(
-            N >= 8 ? get(7)/b.get(7) : 0,
-            N >= 7 ? get(6)/b.get(6) : 0,
-            N >= 6 ? get(5)/b.get(5) : 0,
-            N >= 5 ? get(4)/b.get(4) : 0,
-            N >= 4 ? get(3)/b.get(3) : 0,
-            N >= 3 ? get(2)/b.get(2) : 0,
-            N >= 2 ? get(1)/b.get(1) : 0,
-            N >= 1 ? get(0)/b.get(0) : 0
-            ));
-      }
-
-
-
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      element_type sum() const
-      {
-        // Low 128-bits
-        auto low = _mm256_castsi256_si128(m_value);
-
-        auto low_sh1 = _mm_shuffle_epi32(low, 0xB1);
-        auto low_red1 = _mm_add_epi32(low, low_sh1);
-
-        auto low_sh2 = _mm_shuffle_epi32(low_red1, 0x1B);
-        auto low_red2 = _mm_add_epi32(low_red1, low_sh2);
-
-
-        // High 128-bits
-        auto hi = _mm256_extractf128_si256(m_value, 1);
-
-        auto hi_sh1 = _mm_shuffle_epi32(hi, 0xB1);
-        auto hi_red1 = _mm_add_epi32(hi, hi_sh1);
-
-        auto hi_sh2 = _mm_shuffle_epi32(hi_red1, 0x1B);
-        auto hi_red2 = _mm_add_epi32(hi_red1, hi_sh2);
-
-
-        // Sum halves, extract total sum
-        auto hi_low = _mm_add_epi32(hi_red2, low_red2);
-        return _mm_extract_epi32(hi_low, 0);
-      }
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max() const
-      {
-        // this is just painful, since we don't have a proper masked permute
-        // in AVX.  Lots of special cases to make sure we compare just the
-        // right lanes
-
-
-        // Low 128-bits
-        auto low = _mm256_castsi256_si128(m_value);
-
-        auto low_sh1 = _mm_shuffle_epi32(low, 0xB1);
-        auto low_red1 = _mm_max_epi32(low, low_sh1);
-
-        auto low_sh2 = _mm_shuffle_epi32(low_red1, 0x1B);
-
-        // lane 0 of low_red2 now has reduction of 0,1,2,3
-        auto low_red2 = _mm_max_epi32(low_red1, low_sh2);
-
-
-
-        // High 128-bits
-        auto hi = _mm256_extractf128_si256(m_value, 1);
-
-
-        auto hi_sh1 = _mm_shuffle_epi32(hi, 0xB1);
-        auto hi_red1 = _mm_max_epi32(hi, hi_sh1);
-
-        auto hi_sh2 = _mm_shuffle_epi32(hi_red1, 0x1B);
-        auto hi_red2 = _mm_max_epi32(hi_red1, hi_sh2);
-
-
-        // Sum halves, extract final reduction
-        auto hi_low = _mm_max_epi32(hi_red2, low_red2);
-        return _mm_extract_epi32(hi_low, 0);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max_n(camp::idx_t N) const
-      {
-        // Some simple cases
-        if(N <= 0 || N > 8){
-          return RAJA::operators::limits<int32_t>::min();
-        }
-
-        // this is just painful, since we don't have a proper masked permute
-        // in AVX.  Lots of special cases to make sure we compare just the
-        // right lanes
-        if(N==1){
-          return _mm256_extract_epi32(m_value, 0);
-        }
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      ptr[i * stride] = get(i);
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Lane index of the scalar to read; cases 0 through 7 are handled
+   * @return Scalar value in lane i, or 0 when i matches none of the cases
+   */
+  RAJA_INLINE
+  element_type get(camp::idx_t i) const
+  {
+    // _mm256_extract_epi32 takes a literal lane index, hence the switch on i
+    switch (i)
+    {
+      case 0:
+        return _mm256_extract_epi32(m_value, 0);
+      case 1:
+        return _mm256_extract_epi32(m_value, 1);
+      case 2:
+        return _mm256_extract_epi32(m_value, 2);
+      case 3:
+        return _mm256_extract_epi32(m_value, 3);
+      case 4:
+        return _mm256_extract_epi32(m_value, 4);
+      case 5:
+        return _mm256_extract_epi32(m_value, 5);
+      case 6:
+        return _mm256_extract_epi32(m_value, 6);
+      case 7:
+        return _mm256_extract_epi32(m_value, 7);
+    }
+    return 0;
+  }
+
+  /*!
+   * @brief Set one scalar lane of the vector register; returns *this
+   * @param value Value to store in lane i
+   * @param i Lane index; no case matches outside 0..7, so nothing is written
+   */
+  RAJA_INLINE
+  self_type& set(element_type value, camp::idx_t i)
+  {
+    // _mm256_insert_epi32 takes a literal lane index, hence the switch on i
+    switch (i)
+    {
+      case 0:
+        m_value = _mm256_insert_epi32(m_value, value, 0);
+        break;
+      case 1:
+        m_value = _mm256_insert_epi32(m_value, value, 1);
+        break;
+      case 2:
+        m_value = _mm256_insert_epi32(m_value, value, 2);
+        break;
+      case 3:
+        m_value = _mm256_insert_epi32(m_value, value, 3);
+        break;
+      case 4:
+        m_value = _mm256_insert_epi32(m_value, value, 4);
+        break;
+      case 5:
+        m_value = _mm256_insert_epi32(m_value, value, 5);
+        break;
+      case 6:
+        m_value = _mm256_insert_epi32(m_value, value, 6);
+        break;
+      case 7:
+        m_value = _mm256_insert_epi32(m_value, value, 7);
+        break;
+    }
+
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& broadcast(element_type const& value)
+  {
+    m_value = _mm256_set1_epi32(value);
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& copy(self_type const& src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type add(self_type const& b) const
+  {
+    // no 8-way 32-bit add, but there is a 4-way... split and conquer
+
+    // Low 128-bits  - use _mm256_castsi256_si128???
+    auto low_a   = _mm256_castsi256_si128(m_value);
+    auto low_b   = _mm256_castsi256_si128(b.m_value);
+    auto res_low = _mm256_castsi128_si256(_mm_add_epi32(low_a, low_b));
 
-        // Low 128-bits
-        auto low = _mm256_castsi256_si128(m_value);
+    // Hi 128-bits
+    auto hi_a   = _mm256_extractf128_si256(m_value, 1);
+    auto hi_b   = _mm256_extractf128_si256(b.m_value, 1);
+    auto res_hi = _mm_add_epi32(hi_a, hi_b);
 
-        auto low_sh1 = _mm_shuffle_epi32(low, 0xB1);
-        auto low_red1 = _mm_max_epi32(low, low_sh1);
+    // Stitch back together
+    return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
+  }
 
-        if(N==2){
-          return _mm_extract_epi32(low_red1, 0);
-        }
+  RAJA_HOST_DEVICE
 
-        if(N==3){
-          // get lane 2 into lane 0
-          auto low_sh1a = _mm_shuffle_epi32(low, 0x2);
-          auto low_red1a = _mm_max_epi32(low_red1, low_sh1a);
-          return _mm_extract_epi32(low_red1a, 0);
-        }
+  RAJA_INLINE
+  self_type subtract(self_type const& b) const
+  {
+    // no 8-way 32-bit subtract, but there is a 4-way... split and conquer
+
+    // Low 128-bits
+    auto low_a   = _mm256_castsi256_si128(m_value);
+    auto low_b   = _mm256_castsi256_si128(b.m_value);
+    auto res_low = _mm256_castsi128_si256(_mm_sub_epi32(low_a, low_b));
+
+    // Hi 128-bits
+    auto hi_a   = _mm256_extractf128_si256(m_value, 1);
+    auto hi_b   = _mm256_extractf128_si256(b.m_value, 1);
+    auto res_hi = _mm_sub_epi32(hi_a, hi_b);
+
+    // Stitch back together
+    return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
+  }
+
+  RAJA_HOST_DEVICE
 
-        auto low_sh2 = _mm_shuffle_epi32(low_red1, 0x1B);
+  RAJA_INLINE
+  self_type multiply(self_type const& b) const
+  {
+    // no 8-way 32-bit multiply, but there is a 32x32 -> 64
+    // This gets ugly :)
+
+    // Low 128-bits
+    auto low_a = _mm256_castsi256_si128(m_value);
+    auto low_b = _mm256_castsi256_si128(b.m_value);
+    // multiply even lanes 0, 2
+    auto res_low_even = _mm_mul_epi32(low_a, low_b);
+
+    // multiply odd lanes 1, 3
+    auto low_a_sh    = _mm_shuffle_epi32(low_a, 0xB1);
+    auto low_b_sh    = _mm_shuffle_epi32(low_b, 0xB1);
+    auto res_low_odd = _mm_mul_epi32(low_a_sh, low_b_sh);
+
+    // recombine to get all 4 lanes
+    // note: AVX doesn't have a int32 blend, so we use the float32 blend
+    res_low_odd  = _mm_shuffle_epi32(res_low_odd, 0xB1);
+    auto res_low = _mm256_castsi128_si256(_mm_castps_si128(_mm_blend_ps(
+        _mm_castsi128_ps(res_low_odd), _mm_castsi128_ps(res_low_even), 0x05)));
+
+
+    // High 128-bits
+    auto hi_a = _mm256_extractf128_si256(m_value, 1);
+    auto hi_b = _mm256_extractf128_si256(b.m_value, 1);
+    // multiply even lanes 0, 2
+    auto res_hi_even = _mm_mul_epi32(hi_a, hi_b);
+
+    // multiply odd lanes 1, 3
+    auto hi_a_sh    = _mm_shuffle_epi32(hi_a, 0xB1);
+    auto hi_b_sh    = _mm_shuffle_epi32(hi_b, 0xB1);
+    auto res_hi_odd = _mm_mul_epi32(hi_a_sh, hi_b_sh);
+
+    // recombine to get all 4 lanes
+    // note: AVX doesn't have a int32 blend, so we use the float32 blend
+    res_hi_odd  = _mm_shuffle_epi32(res_hi_odd, 0xB1);
+    auto res_hi = _mm_castps_si128(_mm_blend_ps(
+        _mm_castsi128_ps(res_hi_odd), _mm_castsi128_ps(res_hi_even), 0x05));
+
+    // Stitch back together
+    return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type divide(self_type const& b) const
+  {
+    // AVX2 does not supply an integer divide, so do it manually
+    return self_type(_mm256_set_epi32(get(7) / b.get(7), get(6) / b.get(6),
+                                      get(5) / b.get(5), get(4) / b.get(4),
+                                      get(3) / b.get(3), get(2) / b.get(2),
+                                      get(1) / b.get(1), get(0) / b.get(0)));
+  }
+
+  RAJA_HOST_DEVICE
 
-        // lane 0 of low_red2 now has reduction of 0,1,2,3
-        auto low_red2 = _mm_max_epi32(low_red1, low_sh2);
+  RAJA_INLINE
+  self_type divide_n(self_type const& b, camp::idx_t N) const
+  {
+    // AVX2 does not supply an integer divide, so do it manually
+    return self_type(_mm256_set_epi32(
+        N >= 8 ? get(7) / b.get(7) : 0, N >= 7 ? get(6) / b.get(6) : 0,
+        N >= 6 ? get(5) / b.get(5) : 0, N >= 5 ? get(4) / b.get(4) : 0,
+        N >= 4 ? get(3) / b.get(3) : 0, N >= 3 ? get(2) / b.get(2) : 0,
+        N >= 2 ? get(1) / b.get(1) : 0, N >= 1 ? get(0) / b.get(0) : 0));
+  }
+
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vectors scalar elements
+   */
+  RAJA_INLINE
+  element_type sum() const
+  {
+    // Low 128-bits
+    auto low = _mm256_castsi256_si128(m_value);
 
-        if(N==4){
-          return _mm_extract_epi32(low_red2, 0);
-        }
+    auto low_sh1  = _mm_shuffle_epi32(low, 0xB1);
+    auto low_red1 = _mm_add_epi32(low, low_sh1);
+
+    auto low_sh2  = _mm_shuffle_epi32(low_red1, 0x1B);
+    auto low_red2 = _mm_add_epi32(low_red1, low_sh2);
+
+
+    // High 128-bits
+    auto hi = _mm256_extractf128_si256(m_value, 1);
+
+    auto hi_sh1  = _mm_shuffle_epi32(hi, 0xB1);
+    auto hi_red1 = _mm_add_epi32(hi, hi_sh1);
+
+    auto hi_sh2  = _mm_shuffle_epi32(hi_red1, 0x1B);
+    auto hi_red2 = _mm_add_epi32(hi_red1, hi_sh2);
+
+
+    // Sum halves, extract total sum
+    auto hi_low = _mm_add_epi32(hi_red2, low_red2);
+    return _mm_extract_epi32(hi_low, 0);
+  }
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max() const
+  {
+    // Horizontal max over all 8 lanes: reduce each 128-bit half with two
+    // shuffle + 4-way max steps, then take the max of the two halves.
+    // Only lane 0 of each intermediate result is relied upon.
+
+
+    // Low 128-bits: 0xB1 swaps adjacent lanes, 0x1B reverses all four
+    auto low = _mm256_castsi256_si128(m_value);
+
+    auto low_sh1  = _mm_shuffle_epi32(low, 0xB1);
+    auto low_red1 = _mm_max_epi32(low, low_sh1);
+
+    auto low_sh2 = _mm_shuffle_epi32(low_red1, 0x1B);
+
+    // lane 0 of low_red2 now has reduction of 0,1,2,3
+    auto low_red2 = _mm_max_epi32(low_red1, low_sh2);
+
+
+    // High 128-bits
+    auto hi = _mm256_extractf128_si256(m_value, 1);
+
+
+    auto hi_sh1  = _mm_shuffle_epi32(hi, 0xB1);
+    auto hi_red1 = _mm_max_epi32(hi, hi_sh1);
+
+    auto hi_sh2  = _mm_shuffle_epi32(hi_red1, 0x1B);
+    auto hi_red2 = _mm_max_epi32(hi_red1, hi_sh2);
+
+
+    // Max of both halves, extract final reduction from lane 0
+    auto hi_low = _mm_max_epi32(hi_red2, low_red2);
+    return _mm_extract_epi32(hi_low, 0);
+  }
+
+  /*!
+   * @brief Returns the largest of the first N elements
+   * @return Largest of lanes 0..N-1; limits<int32_t>::min() if N <= 0 or N > 8
+   */
+  RAJA_INLINE
+  element_type max_n(camp::idx_t N) const
+  {
+    // Some simple cases
+    if (N <= 0 || N > 8)
+    {
+      return RAJA::operators::limits<int32_t>::min();
+    }
+
+    // this is just painful, since we don't have a proper masked permute
+    // in AVX.  Lots of special cases to make sure we compare just the
+    // right lanes
+    if (N == 1)
+    {
+      return _mm256_extract_epi32(m_value, 0);
+    }
+
+    // Low 128-bits
+    auto low = _mm256_castsi256_si128(m_value);
+
+    auto low_sh1  = _mm_shuffle_epi32(low, 0xB1);
+    auto low_red1 = _mm_max_epi32(low, low_sh1);
+
+    if (N == 2)
+    {
+      return _mm_extract_epi32(low_red1, 0);
+    }
+
+    if (N == 3)
+    {
+      // get lane 2 into lane 0
+      auto low_sh1a  = _mm_shuffle_epi32(low, 0x2);
+      auto low_red1a = _mm_max_epi32(low_red1, low_sh1a);
+      return _mm_extract_epi32(low_red1a, 0);
+    }
+
+    auto low_sh2 = _mm_shuffle_epi32(low_red1, 0x1B);
+
+    // lane 0 of low_red2 now has reduction of 0,1,2,3
+    auto low_red2 = _mm_max_epi32(low_red1, low_sh2);
+
+    if (N == 4)
+    {
+      return _mm_extract_epi32(low_red2, 0);
+    }
+
+    // High 128-bits
+    auto hi = _mm256_extractf128_si256(m_value, 1);
+
+    if (N == 5)
+    {
+      auto red_5 = _mm_max_epi32(low_red2, hi);
+      return _mm_extract_epi32(red_5, 0);
+    }
+
+    auto hi_sh1  = _mm_shuffle_epi32(hi, 0xB1);
+    auto hi_red1 = _mm_max_epi32(hi, hi_sh1);
+
+    if (N == 6)
+    {
+      auto red_6 = _mm_max_epi32(low_red2, hi_red1);
+      return _mm_extract_epi32(red_6, 0);
+    }
+    if (N == 7)
+    {
+      // get lane 6 (lane 2 of hi) into lane 0
+      auto hi_sh7   = _mm_shuffle_epi32(hi, 0x2);
+      auto hi_red_6 = _mm_max_epi32(hi_sh7, hi_red1);
+      auto red_7    = _mm_max_epi32(low_red2, hi_red_6);
+      return _mm_extract_epi32(red_7, 0);
+    }
+
+    auto hi_sh2  = _mm_shuffle_epi32(hi_red1, 0x1B);
+    auto hi_red2 = _mm_max_epi32(hi_red1, hi_sh2);
+
+
+    // N == 8: max of both halves, extract final reduction from lane 0
+    auto hi_low = _mm_max_epi32(hi_red2, low_red2);
+    return _mm_extract_epi32(hi_low, 0);
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+  self_type vmax(self_type b) const
+  {
+    // no 8-way 32-bit max, but there is a 4-way... split and conquer
+
+    // Low 128-bits
+    auto low_a   = _mm256_castsi256_si128(m_value);
+    auto low_b   = _mm256_castsi256_si128(b.m_value);
+    auto res_low = _mm256_castsi128_si256(_mm_max_epi32(low_a, low_b));
+
+    // Hi 128-bits
+    auto hi_a   = _mm256_extractf128_si256(m_value, 1);
+    auto hi_b   = _mm256_extractf128_si256(b.m_value, 1);
+    auto res_hi = _mm_max_epi32(hi_a, hi_b);
+
+    // Stitch back together
+    return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min() const
+  {
+    // Horizontal min over all 8 lanes: reduce each 128-bit half with two
+    // shuffle + 4-way min steps, then take the min of the two halves.
+    // Only lane 0 of each intermediate result is relied upon.
+
+    // Low 128-bits: 0xB1 swaps adjacent lanes, 0x1B reverses all four
+    auto low = _mm256_castsi256_si128(m_value);
+
+    auto low_sh1  = _mm_shuffle_epi32(low, 0xB1);
+    auto low_red1 = _mm_min_epi32(low, low_sh1);
+
+    auto low_sh2 = _mm_shuffle_epi32(low_red1, 0x1B);
+
+    // lane 0 of low_red2 now has reduction of 0,1,2,3
+    auto low_red2 = _mm_min_epi32(low_red1, low_sh2);
+
+
+    // High 128-bits
+    auto hi = _mm256_extractf128_si256(m_value, 1);
+
+    auto hi_sh1  = _mm_shuffle_epi32(hi, 0xB1);
+    auto hi_red1 = _mm_min_epi32(hi, hi_sh1);
+
+
+    auto hi_sh2  = _mm_shuffle_epi32(hi_red1, 0x1B);
+    auto hi_red2 = _mm_min_epi32(hi_red1, hi_sh2);
+
+
+    // Min of both halves, extract final reduction from lane 0
+    auto hi_low = _mm_min_epi32(hi_red2, low_red2);
+    return _mm_extract_epi32(hi_low, 0);
+  }
+
+  /*!
+   * @brief Returns the smallest of the first N elements
+   * @return Smallest of lanes 0..N-1; limits<int32_t>::max() if N <= 0 or N > 8
+   */
+  RAJA_INLINE
+  element_type min_n(camp::idx_t N) const
+  {
+    // Some simple cases
+    if (N <= 0 || N > 8)
+    {
+      return RAJA::operators::limits<int32_t>::max();
+    }
+    // this is just painful, since we don't have a proper masked permute
+    // in AVX.  Lots of special cases to make sure we compare just the
+    // right lanes
+    if (N == 1)
+    {
+      return _mm256_extract_epi32(m_value, 0);
+    }
+
+    // Low 128-bits
+    auto low = _mm256_castsi256_si128(m_value);
+
+    auto low_sh1  = _mm_shuffle_epi32(low, 0xB1);
+    auto low_red1 = _mm_min_epi32(low, low_sh1);
+
+    if (N == 2)
+    {
+      return _mm_extract_epi32(low_red1, 0);
+    }
+
+    if (N == 3)
+    {
+      // get lane 2 into lane 0
+      auto low_sh1a  = _mm_shuffle_epi32(low, 0x2);
+      auto low_red1a = _mm_min_epi32(low_red1, low_sh1a);
+      return _mm_extract_epi32(low_red1a, 0);
+    }
+
+    auto low_sh2 = _mm_shuffle_epi32(low_red1, 0x1B);
+
+    // lane 0 of low_red2 now has reduction of 0,1,2,3
+    auto low_red2 = _mm_min_epi32(low_red1, low_sh2);
+
+    if (N == 4)
+    {
+      return _mm_extract_epi32(low_red2, 0);
+    }
+
+    // High 128-bits
+    auto hi = _mm256_extractf128_si256(m_value, 1);
+
+    if (N == 5)
+    {
+      auto red_5 = _mm_min_epi32(low_red2, hi);
+      return _mm_extract_epi32(red_5, 0);
+    }
+
+    auto hi_sh1  = _mm_shuffle_epi32(hi, 0xB1);
+    auto hi_red1 = _mm_min_epi32(hi, hi_sh1);
+
+    if (N == 6)
+    {
+      auto red_6 = _mm_min_epi32(low_red2, hi_red1);
+      return _mm_extract_epi32(red_6, 0);
+    }
+    if (N == 7)
+    {
+      // get lane 6 (lane 2 of hi) into lane 0
+      auto hi_sh7   = _mm_shuffle_epi32(hi, 0x2);
+      auto hi_red_6 = _mm_min_epi32(hi_sh7, hi_red1);
+      auto red_7    = _mm_min_epi32(low_red2, hi_red_6);
+      return _mm_extract_epi32(red_7, 0);
+    }
+
+    auto hi_sh2  = _mm_shuffle_epi32(hi_red1, 0x1B);
+    auto hi_red2 = _mm_min_epi32(hi_red1, hi_sh2);
+
+
+    // N == 8: min of both halves, extract final reduction from lane 0
+    auto hi_low = _mm_min_epi32(hi_red2, low_red2);
+    return _mm_extract_epi32(hi_low, 0);
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  self_type vmin(self_type b) const
+  {
+    // no 8-way 32-bit min, but there is a 4-way... split and conquer
 
-        // High 128-bits
-        auto hi = _mm256_extractf128_si256(m_value, 1);
+    // Low 128-bits  - use _mm256_castsi256_si128???
+    auto low_a   = _mm256_castsi256_si128(m_value);
+    auto low_b   = _mm256_castsi256_si128(b.m_value);
+    auto res_low = _mm256_castsi128_si256(_mm_min_epi32(low_a, low_b));
 
-        if(N==5){
-          auto red_5 = _mm_max_epi32(low_red2, hi);
-          return _mm_extract_epi32(red_5, 0);
-        }
-
-        auto hi_sh1 = _mm_shuffle_epi32(hi, 0xB1);
-        auto hi_red1 = _mm_max_epi32(hi, hi_sh1);
-
-        if(N==6){
-          auto red_6 = _mm_max_epi32(low_red2, hi_red1);
-          return _mm_extract_epi32(red_6, 0);
-        }
-        if(N==7){
-          // get lane 6 (lane 2 of hi) into lane 0
-          auto hi_sh7 = _mm_shuffle_epi32(hi, 0x2);
-          auto hi_red_6 = _mm_max_epi32(hi_sh7, hi_red1);
-          auto red_7 = _mm_max_epi32(low_red2, hi_red_6);
-          return _mm_extract_epi32(red_7, 0);
-        }
+    // Hi 128-bits
+    auto hi_a   = _mm256_extractf128_si256(m_value, 1);
+    auto hi_b   = _mm256_extractf128_si256(b.m_value, 1);
+    auto res_hi = _mm_min_epi32(hi_a, hi_b);
 
-        auto hi_sh2 = _mm_shuffle_epi32(hi_red1, 0x1B);
-        auto hi_red2 = _mm_max_epi32(hi_red1, hi_sh2);
-
-
-        // Sum halves, extract total sum
-        auto hi_low = _mm_max_epi32(hi_red2, low_red2);
-        return _mm_extract_epi32(hi_low, 0);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmax(self_type b) const
-      {
-        // no 8-way 32-bit min, but there is a 4-way... split and conquer
-
-        // Low 128-bits  - use _mm256_castsi256_si128???
-        auto low_a = _mm256_castsi256_si128(m_value);
-        auto low_b = _mm256_castsi256_si128(b.m_value);
-        auto res_low = _mm256_castsi128_si256(_mm_max_epi32(low_a, low_b));
-
-        // Hi 128-bits
-        auto hi_a = _mm256_extractf128_si256(m_value, 1);
-        auto hi_b = _mm256_extractf128_si256(b.m_value, 1);
-        auto res_hi = _mm_max_epi32(hi_a, hi_b);
-
-        // Stitch back together
-        return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min() const
-      {
-        // this is just painful, since we don't have a proper masked permute
-        // in AVX.  Lots of special cases to make sure we compare just the
-        // right lanes
-
-        // Low 128-bits
-        auto low = _mm256_castsi256_si128(m_value);
-
-        auto low_sh1 = _mm_shuffle_epi32(low, 0xB1);
-        auto low_red1 = _mm_min_epi32(low, low_sh1);
-
-        auto low_sh2 = _mm_shuffle_epi32(low_red1, 0x1B);
-
-        // lane 0 of low_red2 now has reduction of 0,1,2,3
-        auto low_red2 = _mm_min_epi32(low_red1, low_sh2);
-
-
-        // High 128-bits
-        auto hi = _mm256_extractf128_si256(m_value, 1);
-
-        auto hi_sh1 = _mm_shuffle_epi32(hi, 0xB1);
-        auto hi_red1 = _mm_min_epi32(hi, hi_sh1);
-
-
-        auto hi_sh2 = _mm_shuffle_epi32(hi_red1, 0x1B);
-        auto hi_red2 = _mm_min_epi32(hi_red1, hi_sh2);
-
-
-        // Sum halves, extract total sum
-        auto hi_low = _mm_min_epi32(hi_red2, low_red2);
-        return _mm_extract_epi32(hi_low, 0);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min_n(camp::idx_t N) const
-      {
-        // Some simple cases
-        if(N <= 0 || N > 8){
-          return RAJA::operators::limits<int32_t>::max();
-        }
-        // this is just painful, since we don't have a proper masked permute
-        // in AVX.  Lots of special cases to make sure we compare just the
-        // right lanes
-        if(N==1){
-          return _mm256_extract_epi32(m_value, 0);
-        }
-
-        // Low 128-bits
-        auto low = _mm256_castsi256_si128(m_value);
-
-        auto low_sh1 = _mm_shuffle_epi32(low, 0xB1);
-        auto low_red1 = _mm_min_epi32(low, low_sh1);
-
-        if(N==2){
-          return _mm_extract_epi32(low_red1, 0);
-        }
-
-        if(N==3){
-          // get lane 2 into lane 0
-          auto low_sh1a = _mm_shuffle_epi32(low, 0x2);
-          auto low_red1a = _mm_min_epi32(low_red1, low_sh1a);
-          return _mm_extract_epi32(low_red1a, 0);
-        }
-
-        auto low_sh2 = _mm_shuffle_epi32(low_red1, 0x1B);
-
-        // lane 0 of low_red2 now has reduction of 0,1,2,3
-        auto low_red2 = _mm_min_epi32(low_red1, low_sh2);
-
-        if(N==4){
-          return _mm_extract_epi32(low_red2, 0);
-        }
-
-        // High 128-bits
-        auto hi = _mm256_extractf128_si256(m_value, 1);
-
-        if(N==5){
-          auto red_5 = _mm_min_epi32(low_red2, hi);
-          return _mm_extract_epi32(red_5, 0);
-        }
-
-        auto hi_sh1 = _mm_shuffle_epi32(hi, 0xB1);
-        auto hi_red1 = _mm_min_epi32(hi, hi_sh1);
-
-        if(N==6){
-          auto red_6 = _mm_min_epi32(low_red2, hi_red1);
-          return _mm_extract_epi32(red_6, 0);
-        }
-        if(N==7){
-          // get lane 6 (lane 2 of hi) into lane 0
-          auto hi_sh7 = _mm_shuffle_epi32(hi, 0x2);
-          auto hi_red_6 = _mm_min_epi32(hi_sh7, hi_red1);
-          auto red_7 = _mm_min_epi32(low_red2, hi_red_6);
-          return _mm_extract_epi32(red_7, 0);
-        }
-
-        auto hi_sh2 = _mm_shuffle_epi32(hi_red1, 0x1B);
-        auto hi_red2 = _mm_min_epi32(hi_red1, hi_sh2);
+    // Stitch back together
+    return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
+  }
+};
 
 
-        // Sum halves, extract total sum
-        auto hi_low = _mm_min_epi32(hi_red2, low_red2);
-        return _mm_extract_epi32(hi_low, 0);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmin(self_type b) const
-      {
-        // no 8-way 32-bit min, but there is a 4-way... split and conquer
-
-        // Low 128-bits  - use _mm256_castsi256_si128???
-        auto low_a = _mm256_castsi256_si128(m_value);
-        auto low_b = _mm256_castsi256_si128(b.m_value);
-        auto res_low = _mm256_castsi128_si256(_mm_min_epi32(low_a, low_b));
-
-        // Hi 128-bits
-        auto hi_a = _mm256_extractf128_si256(m_value, 1);
-        auto hi_b = _mm256_extractf128_si256(b.m_value, 1);
-        auto res_hi = _mm_min_epi32(hi_a, hi_b);
-
-        // Stitch back together
-        return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
-      }
-  };
-
-
-}   // namespace expt
+}  // namespace expt
 
 }  // namespace RAJA
 
 
 #endif
 
-#endif //__AVX2__
+#endif  //__AVX2__
diff --git a/include/RAJA/policy/tensor/arch/avx/avx_int64.hpp b/include/RAJA/policy/tensor/arch/avx/avx_int64.hpp
index 1c7fae3dc7..463ef595fa 100644
--- a/include/RAJA/policy/tensor/arch/avx/avx_int64.hpp
+++ b/include/RAJA/policy/tensor/arch/avx/avx_int64.hpp
@@ -28,511 +28,526 @@
 #include <immintrin.h>
 #include <cmath>
 
-
 namespace RAJA
 {
 namespace expt
 {
-  template<>
-  class Register<int64_t, avx_register> :
-    public internal::expt::RegisterBase<Register<int64_t, avx_register>>
+template<>
+class Register<int64_t, avx_register>
+    : public internal::expt::RegisterBase<Register<int64_t, avx_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<int64_t, avx_register>>;
+
+  using register_policy = avx_register;
+  using self_type       = Register<int64_t, avx_register>;
+  using element_type    = int64_t;
+  using register_type   = __m256i;
+
+  using int_vector_type = Register<int64_t, avx_register>;
+
+
+private:
+  register_type m_value;
+
+  RAJA_INLINE
+  __m256i createMask(camp::idx_t N) const
+  {
+    // Generate a mask
+    return _mm256_set_epi64x(N >= 4 ? -1 : 0, N >= 3 ? -1 : 0, N >= 2 ? -1 : 0,
+                             N >= 1 ? -1 : 0);
+  }
+
+  RAJA_INLINE
+  __m256i createStridedOffsets(camp::idx_t stride) const
+  {
+    // Generate a strided offset list
+    return _mm256_set_epi64x(3 * stride, 2 * stride, stride, 0);
+  }
+
+  /*
+   * Use the packed-double permute function because there isn't one
+   * specifically for int64
+   *
+   * Just adds a bunch of casting, should be same cost
+   */
+  template<int perm>
+  RAJA_INLINE __m256i permute(__m256i x) const
+  {
+    return _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(x), perm));
+  }
+
+public:
+  static constexpr camp::idx_t s_num_elem = 4;
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  RAJA_INLINE
+  Register() : base_type(), m_value(_mm256_setzero_si256()) {}
+
+  /*!
+   * @brief Copy constructor from underlying simd register
+   */
+  RAJA_INLINE
+  explicit Register(register_type const& c) : base_type(), m_value(c) {}
+
+  /*!
+   * @brief Construct register with explicit values
+   */
+  RAJA_INLINE
+  Register(element_type x0, element_type x1, element_type x2, element_type x3)
+      : m_value(_mm256_set_epi64x(x3, x2, x1, x0))
+  {}
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_INLINE
+  Register(self_type const& c) : base_type(), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  self_type& operator=(self_type const& c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+  /*!
+   * @brief Construct from scalar.
+   * Sets all elements to same value (broadcast).
+   */
+  RAJA_INLINE
+  Register(element_type const& c) : m_value(_mm256_set1_epi64x(c)) {}
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed(element_type const* ptr)
+  {
+    m_value = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(ptr));
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed_n(element_type const* ptr, camp::idx_t N)
+  {
+    m_value = _mm256_castpd_si256(_mm256_maskload_pd(
+        reinterpret_cast<double const*>(ptr), createMask(N)));
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_strided(element_type const* ptr, camp::idx_t stride)
+  {
+    for (camp::idx_t i = 0; i < 4; ++i)
+    {
+      m_value[i] = ptr[i * stride];
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Partially gather a register from a strided memory location given
+   *        a run-time number of elements; remaining elements are zeroed.
+   *
+   */
+  RAJA_INLINE
+  self_type& load_strided_n(element_type const* ptr,
+                            camp::idx_t stride,
+                            camp::idx_t N)
+  {
+    m_value = _mm256_setzero_si256();
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      m_value[i] = ptr[i * stride];
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed(element_type* ptr) const
+  {
+    _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store the first N elements of the register to consecutive memory
+   *        locations; the bits are stored unchanged via a packed-double view.
+   */
+  RAJA_INLINE
+  self_type const& store_packed_n(element_type* ptr, camp::idx_t N) const
+  {
+    _mm256_maskstore_pd(reinterpret_cast<double*>(ptr), createMask(N),
+                        _mm256_castsi256_pd(m_value));
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_strided(element_type* ptr, camp::idx_t stride) const
+  {
+    for (camp::idx_t i = 0; i < 4; ++i)
+    {
+      ptr[i * stride] = m_value[i];
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Store partial register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_strided_n(element_type* ptr,
+                                   camp::idx_t stride,
+                                   camp::idx_t N) const
+  {
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      ptr[i * stride] = m_value[i];
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i
+   */
+  RAJA_INLINE
+  element_type get(camp::idx_t i) const
+  {
+    // got to be a nicer way to do this!?!?
+    switch (i)
+    {
+      case 0:
+        return _mm256_extract_epi64(m_value, 0);
+      case 1:
+        return _mm256_extract_epi64(m_value, 1);
+      case 2:
+        return _mm256_extract_epi64(m_value, 2);
+      case 3:
+        return _mm256_extract_epi64(m_value, 3);
+    }
+    return 0;
+  }
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param i Offset of scalar to set
+   * @param value Value of scalar to set
+   */
+  RAJA_INLINE
+  self_type& set(element_type value, camp::idx_t i)
+  {
+    // got to be a nicer way to do this!?!?
+    switch (i)
+    {
+      case 0:
+        m_value = _mm256_insert_epi64(m_value, value, 0);
+        break;
+      case 1:
+        m_value = _mm256_insert_epi64(m_value, value, 1);
+        break;
+      case 2:
+        m_value = _mm256_insert_epi64(m_value, value, 2);
+        break;
+      case 3:
+        m_value = _mm256_insert_epi64(m_value, value, 3);
+        break;
+    }
+
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& broadcast(element_type const& value)
+  {
+    m_value = _mm256_set1_epi64x(value);
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& copy(self_type const& src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type add(self_type const& b) const
+  {
+    // no 4-way 64-bit add, but there is a 2-way SSE... split and conquer
+
+    // Low 128-bits  - use _mm256_castsi256_si128???
+    auto low_a   = _mm256_castsi256_si128(m_value);
+    auto low_b   = _mm256_castsi256_si128(b.m_value);
+    auto res_low = _mm256_castsi128_si256(_mm_add_epi64(low_a, low_b));
+
+    // Hi 128-bits
+    auto hi_a   = _mm256_extractf128_si256(m_value, 1);
+    auto hi_b   = _mm256_extractf128_si256(b.m_value, 1);
+    auto res_hi = _mm_add_epi64(hi_a, hi_b);
+
+    // Stitch back together
+    return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type subtract(self_type const& b) const
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<int64_t, avx_register>>;
-
-      using register_policy = avx_register;
-      using self_type = Register<int64_t, avx_register>;
-      using element_type = int64_t;
-      using register_type = __m256i;
-
-      using int_vector_type = Register<int64_t, avx_register>;
-
-
-    private:
-      register_type m_value;
-
-      RAJA_INLINE
-      __m256i createMask(camp::idx_t N) const {
-        // Generate a mask
-        return  _mm256_set_epi64x(
-            N >= 4 ? -1 : 0,
-            N >= 3 ? -1 : 0,
-            N >= 2 ? -1 : 0,
-            N >= 1 ? -1 : 0);
-      }
-
-      RAJA_INLINE
-      __m256i createStridedOffsets(camp::idx_t stride) const {
-        // Generate a strided offset list
-        return  _mm256_set_epi64x(3*stride, 2*stride, stride, 0);
-      }
-
-      /*
-       * Use the packed-double permute function because there isn't one
-       * specifically for int64
-       *
-       * Just adds a bunch of casting, should be same cost
-       */
-      template<int perm>
-      RAJA_INLINE
-      __m256i permute(__m256i x) const {
-        return _mm256_castpd_si256(
-            _mm256_permute_pd(_mm256_castsi256_pd(x), perm));
-      }
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 4;
-
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-      RAJA_INLINE
-      Register() : base_type(),  m_value(_mm256_setzero_si256()) {
-      }
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_INLINE
-      explicit Register(register_type const &c) : base_type(), m_value(c) {}
-
-
-      /*!
-       * @brief Construct register with explicit values
-       */
-      RAJA_INLINE
-      Register(element_type x0,
-                     element_type x1,
-                     element_type x2,
-                     element_type x3) :
-        m_value(_mm256_set_epi64x(x3,x2,x1,x0))
-      {}
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      Register(self_type const &c) : base_type(), m_value(c.m_value) {}
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-
-      /*!
-       * @brief Construct from scalar.
-       * Sets all elements to same value (broadcast).
-       */
-      RAJA_INLINE
-      Register(element_type const &c) : m_value(_mm256_set1_epi64x(c)) {}
-
-
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
-        m_value = _mm256_loadu_si256(reinterpret_cast<__m256i const *>(ptr));
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
-        m_value = _mm256_castpd_si256(
-            _mm256_maskload_pd(reinterpret_cast<double const *>(ptr), createMask(N))
-        );
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, camp::idx_t stride){
-        for(camp::idx_t i = 0;i < 4;++ i){
-          m_value[i] = ptr[i*stride];
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t stride, camp::idx_t N){
-        m_value = _mm256_setzero_si256();
-        for(camp::idx_t i = 0;i < N;++ i){
-          m_value[i] = ptr[i*stride];
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
-        _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
-        _mm256_maskstore_pd(reinterpret_cast<double*>(ptr), createMask(N), reinterpret_cast<__m256d>(m_value));
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t stride) const{
-        for(camp::idx_t i = 0;i < 4;++ i){
-          ptr[i*stride] = m_value[i];
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t stride, camp::idx_t N) const{
-        for(camp::idx_t i = 0;i < N;++ i){
-          ptr[i*stride] = m_value[i];
-        }
-        return *this;
-      }
-
-
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      RAJA_INLINE
-      element_type get(camp::idx_t i) const
-      {
-        // got to be a nicer way to do this!?!?
-        switch(i){
-          case 0: return _mm256_extract_epi64(m_value, 0);
-          case 1: return _mm256_extract_epi64(m_value, 1);
-          case 2: return _mm256_extract_epi64(m_value, 2);
-          case 3: return _mm256_extract_epi64(m_value, 3);
-        }
-        return 0;
-      }
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      self_type &set(element_type value, camp::idx_t i)
-      {
-        // got to be a nicer way to do this!?!?
-        switch(i){
-          case 0: m_value = _mm256_insert_epi64(m_value, value, 0); break;
-          case 1: m_value = _mm256_insert_epi64(m_value, value, 1); break;
-          case 2: m_value = _mm256_insert_epi64(m_value, value, 2); break;
-          case 3: m_value = _mm256_insert_epi64(m_value, value, 3); break;
-        }
-
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &value){
-        m_value =  _mm256_set1_epi64x(value);
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        // no 4-way 64-bit add, but there is a 2-way SSE... split and conquer
-
-        // Low 128-bits  - use _mm256_castsi256_si128???
-        auto low_a = _mm256_castsi256_si128(m_value);
-        auto low_b = _mm256_castsi256_si128(b.m_value);
-        auto res_low = _mm256_castsi128_si256(_mm_add_epi64(low_a, low_b));
-
-        // Hi 128-bits
-        auto hi_a = _mm256_extractf128_si256(m_value, 1);
-        auto hi_b = _mm256_extractf128_si256(b.m_value, 1);
-        auto res_hi = _mm_add_epi64(hi_a, hi_b);
-
-        // Stitch back together
-        return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        // no 4-way 64-bit subtract, but there is a 2-way SSE... split and conquer
-
-        // Low 128-bits  - use _mm256_castsi256_si128???
-        auto low_a = _mm256_castsi256_si128(m_value);
-        auto low_b = _mm256_castsi256_si128(b.m_value);
-        auto res_low = _mm256_castsi128_si256(_mm_sub_epi64(low_a, low_b));
-
-        // Hi 128-bits
-        auto hi_a = _mm256_extractf128_si256(m_value, 1);
-        auto hi_b = _mm256_extractf128_si256(b.m_value, 1);
-        auto res_hi = _mm_sub_epi64(hi_a, hi_b);
-
-        // Stitch back together
-        return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        // AVX2 does not supply an int64_t multiply, so do it manually
-        return self_type(_mm256_set_epi64x(
-            get(3)*b.get(3),
-            get(2)*b.get(2),
-            get(1)*b.get(1),
-            get(0)*b.get(0)
-            ));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        // AVX2 does not supply an integer divide, so do it manually
-        return self_type(_mm256_set_epi64x(
-            get(3)/b.get(3),
-            get(2)/b.get(2),
-            get(1)/b.get(1),
-            get(0)/b.get(0)
-            ));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t N) const {
-        // AVX2 does not supply an integer divide, so do it manually
-        return self_type(_mm256_set_epi64x(
-            N >= 4 ? get(3)/b.get(3) : 0,
-            N >= 3 ? get(2)/b.get(2) : 0,
-            N >= 2 ? get(1)/b.get(1) : 0,
-            N >= 1 ? get(0)/b.get(0) : 0
-            ));
-      }
-
-
-
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      element_type sum() const
-      {
-        // swap pairs and add
-        auto sh1 = permute<0x5>(m_value);
-
-        // Add lower 128-bits
-        auto low_a = _mm256_castsi256_si128(m_value);
-        auto low_b = _mm256_castsi256_si128(sh1);
-        auto res_low = _mm_add_epi64(low_a, low_b);
-
-        // Add upper 128-bits
-        auto hi_a = _mm256_extractf128_si256(m_value, 1);
-        auto hi_b = _mm256_extractf128_si256(sh1, 1);
-        auto res_hi = _mm_add_epi64(hi_a, hi_b);
-
-        // Sum upper and lower
-        auto res = _mm_add_epi64(res_hi, res_low);
-
-        // add lower and upper
-        return _mm_extract_epi64(res, 0);
-      }
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max() const
-      {
-        // AVX2 does not supply an 64bit integer max!
-        auto red = get(0);
-
-        auto v1 = get(1);
-        red = red < v1 ? v1 : red;
-
-        auto v2 = get(2);
-        red = red < v2 ? v2 : red;
-
-        auto v3 = get(3);
-        red = red < v3 ? v3 : red;
-
-        return red;
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max_n(camp::idx_t N) const
-      {
-        if(N <= 0 || N > 4){
-          return RAJA::operators::limits<int64_t>::min();
-        }
-
-        // AVX2 does not supply an 64bit integer max?!?
-        auto red = get(0);
-
-        if(N > 1){
-          auto v1 = get(1);
-          red = red < v1 ? v1 : red;
-        }
-        if(N > 2){
-          auto v2 = get(2);
-          red = red < v2 ? v2 : red;
-        }
-        if(N > 3){
-          auto v3 = get(3);
-          red = red < v3 ? v3 : red;
-        }
-
-        return red;
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmax(self_type a) const
-      {
-          return self_type(_mm256_set_epi64x(
-              get(3) > a.get(3) ? get(3) : a.get(3),
-              get(2) > a.get(2) ? get(2) : a.get(2),
-              get(1) > a.get(1) ? get(1) : a.get(1),
-              get(0) > a.get(0) ? get(0) : a.get(0) ));
-        
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min() const
-      {
-
-        // AVX2 does not supply an 64bit integer max?!?
-        auto red = get(0);
-
-        auto v1 = get(1);
-        red = red > v1 ? v1 : red;
-
-        auto v2 = get(2);
-        red = red > v2 ? v2 : red;
-
-        auto v3 = get(3);
-        red = red > v3 ? v3 : red;
-
-        return red;
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min_n(camp::idx_t N) const
-      {
-        if(N <= 0 || N > 4){
-          return RAJA::operators::limits<int64_t>::max();
-        }
-
-        // AVX2 does not supply an 64bit integer max?!?
-        auto red = get(0);
-
-        if(N > 1){
-          auto v1 = get(1);
-          red = red > v1 ? v1 : red;
-        }
-        if(N > 2){
-          auto v2 = get(2);
-          red = red > v2 ? v2 : red;
-        }
-        if(N > 3){
-          auto v3 = get(3);
-          red = red > v3 ? v3 : red;
-        }
-
-        return red;
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmin(self_type a) const
-      {
-          return self_type(_mm256_set_epi64x(
-              get(3) < a.get(3) ? get(3) : a.get(3),
-              get(2) < a.get(2) ? get(2) : a.get(2),
-              get(1) < a.get(1) ? get(1) : a.get(1),
-              get(0) < a.get(0) ? get(0) : a.get(0) ));
-        
-      }
-  };
-
-
-}   // namespace expt
+    // no 4-way 64-bit subtract, but there is a 2-way SSE... split and conquer
+
+    // Low 128-bits  - use _mm256_castsi256_si128???
+    auto low_a   = _mm256_castsi256_si128(m_value);
+    auto low_b   = _mm256_castsi256_si128(b.m_value);
+    auto res_low = _mm256_castsi128_si256(_mm_sub_epi64(low_a, low_b));
+
+    // Hi 128-bits
+    auto hi_a   = _mm256_extractf128_si256(m_value, 1);
+    auto hi_b   = _mm256_extractf128_si256(b.m_value, 1);
+    auto res_hi = _mm_sub_epi64(hi_a, hi_b);
+
+    // Stitch back together
+    return self_type(_mm256_insertf128_si256(res_low, res_hi, 1));
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type multiply(self_type const& b) const
+  {
+    // AVX2 does not supply an int64_t multiply, so do it manually
+    return self_type(_mm256_set_epi64x(get(3) * b.get(3), get(2) * b.get(2),
+                                       get(1) * b.get(1), get(0) * b.get(0)));
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type divide(self_type const& b) const
+  {
+    // AVX2 does not supply an integer divide, so do it manually
+    return self_type(_mm256_set_epi64x(get(3) / b.get(3), get(2) / b.get(2),
+                                       get(1) / b.get(1), get(0) / b.get(0)));
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type divide_n(self_type const& b, camp::idx_t N) const
+  {
+    // AVX2 does not supply an integer divide, so do it manually
+    return self_type(_mm256_set_epi64x(
+        N >= 4 ? get(3) / b.get(3) : 0, N >= 3 ? get(2) / b.get(2) : 0,
+        N >= 2 ? get(1) / b.get(1) : 0, N >= 1 ? get(0) / b.get(0) : 0));
+  }
+
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vectors scalar elements
+   */
+  RAJA_INLINE
+  element_type sum() const
+  {
+    // swap pairs and add
+    auto sh1 = permute<0x5>(m_value);
+
+    // Add lower 128-bits
+    auto low_a   = _mm256_castsi256_si128(m_value);
+    auto low_b   = _mm256_castsi256_si128(sh1);
+    auto res_low = _mm_add_epi64(low_a, low_b);
+
+    // Add upper 128-bits
+    auto hi_a   = _mm256_extractf128_si256(m_value, 1);
+    auto hi_b   = _mm256_extractf128_si256(sh1, 1);
+    auto res_hi = _mm_add_epi64(hi_a, hi_b);
+
+    // Sum upper and lower
+    auto res = _mm_add_epi64(res_hi, res_low);
+
+    // add lower and upper
+    return _mm_extract_epi64(res, 0);
+  }
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max() const
+  {
+    // AVX2 does not supply an 64bit integer max!
+    auto red = get(0);
+
+    auto v1 = get(1);
+    red     = red < v1 ? v1 : red;
+
+    auto v2 = get(2);
+    red     = red < v2 ? v2 : red;
+
+    auto v3 = get(3);
+    red     = red < v3 ? v3 : red;
+
+    return red;
+  }
+
+  /*!
+   * @brief Returns the largest of the first N elements
+   * @return Max of elements 0..N-1, or limits<int64_t>::min() if N not in [1,4]
+   */
+  RAJA_INLINE
+  element_type max_n(camp::idx_t N) const
+  {
+    if (N <= 0 || N > 4)
+    {
+      return RAJA::operators::limits<int64_t>::min();
+    }
+
+    // AVX2 does not supply an 64bit integer max?!?
+    auto red = get(0);
+
+    if (N > 1)
+    {
+      auto v1 = get(1);
+      red     = red < v1 ? v1 : red;
+    }
+    if (N > 2)
+    {
+      auto v2 = get(2);
+      red     = red < v2 ? v2 : red;
+    }
+    if (N > 3)
+    {
+      auto v3 = get(3);
+      red     = red < v3 ? v3 : red;
+    }
+
+    return red;
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+  self_type vmax(self_type a) const
+  {
+    return self_type(_mm256_set_epi64x(get(3) > a.get(3) ? get(3) : a.get(3),
+                                       get(2) > a.get(2) ? get(2) : a.get(2),
+                                       get(1) > a.get(1) ? get(1) : a.get(1),
+                                       get(0) > a.get(0) ? get(0) : a.get(0)));
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min() const
+  {
+
+    // AVX2 does not supply a 64-bit integer min, so reduce with scalars
+    auto red = get(0);
+
+    auto v1 = get(1);
+    red     = red > v1 ? v1 : red;
+
+    auto v2 = get(2);
+    red     = red > v2 ? v2 : red;
+
+    auto v3 = get(3);
+    red     = red > v3 ? v3 : red;
+
+    return red;
+  }
+
+  /*!
+   * @brief Returns the smallest of the first N elements
+   * @return Min of elements 0..N-1, or limits<int64_t>::max() if N not in [1,4]
+   */
+  RAJA_INLINE
+  element_type min_n(camp::idx_t N) const
+  {
+    if (N <= 0 || N > 4)
+    {
+      return RAJA::operators::limits<int64_t>::max();
+    }
+
+    // AVX2 does not supply a 64-bit integer min, so reduce with scalars
+    auto red = get(0);
+
+    if (N > 1)
+    {
+      auto v1 = get(1);
+      red     = red > v1 ? v1 : red;
+    }
+    if (N > 2)
+    {
+      auto v2 = get(2);
+      red     = red > v2 ? v2 : red;
+    }
+    if (N > 3)
+    {
+      auto v3 = get(3);
+      red     = red > v3 ? v3 : red;
+    }
+
+    return red;
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  self_type vmin(self_type a) const
+  {
+    return self_type(_mm256_set_epi64x(get(3) < a.get(3) ? get(3) : a.get(3),
+                                       get(2) < a.get(2) ? get(2) : a.get(2),
+                                       get(1) < a.get(1) ? get(1) : a.get(1),
+                                       get(0) < a.get(0) ? get(0) : a.get(0)));
+  }
+};
+
+
+}  // namespace expt
 
 }  // namespace RAJA
 
 
 #endif
 
-#endif //__AVX2__
+#endif  //__AVX2__
diff --git a/include/RAJA/policy/tensor/arch/avx/traits.hpp b/include/RAJA/policy/tensor/arch/avx/traits.hpp
index 33c18e2c5f..be8dac8a7a 100644
--- a/include/RAJA/policy/tensor/arch/avx/traits.hpp
+++ b/include/RAJA/policy/tensor/arch/avx/traits.hpp
@@ -20,52 +20,59 @@
 #ifndef RAJA_policy_tensor_arch_avx_traits_HPP
 #define RAJA_policy_tensor_arch_avx_traits_HPP
 
-namespace RAJA {
-namespace internal {
-namespace expt {
+namespace RAJA
+{
+namespace internal
+{
+namespace expt
+{
 
-  template<>
-  struct RegisterTraits<RAJA::expt::avx_register, int32_t>{
-      using element_type = int32_t;
-      using register_policy = RAJA::expt::avx_register;
-      static constexpr camp::idx_t s_num_bits = 256;
-      static constexpr camp::idx_t s_num_elem = 8;
-      using int_element_type = int32_t;
-  };
+template<>
+struct RegisterTraits<RAJA::expt::avx_register, int32_t>
+{
+  using element_type                      = int32_t;
+  using register_policy                   = RAJA::expt::avx_register;
+  static constexpr camp::idx_t s_num_bits = 256;
+  static constexpr camp::idx_t s_num_elem = 8;
+  using int_element_type                  = int32_t;
+};
 
-  template<>
-  struct RegisterTraits<RAJA::expt::avx_register, int64_t>{
-      using element_type = int64_t;
-      using register_policy = RAJA::expt::avx_register;
-      static constexpr camp::idx_t s_num_bits = 256;
-      static constexpr camp::idx_t s_num_elem = 4;
-      using int_element_type = int64_t;
-  };
+template<>
+struct RegisterTraits<RAJA::expt::avx_register, int64_t>
+{
+  using element_type                      = int64_t;
+  using register_policy                   = RAJA::expt::avx_register;
+  static constexpr camp::idx_t s_num_bits = 256;
+  static constexpr camp::idx_t s_num_elem = 4;
+  using int_element_type                  = int64_t;
+};
 
-  template<>
-  struct RegisterTraits<RAJA::expt::avx_register, float>{
-      using element_type = float;
-      using register_policy = RAJA::expt::avx_register;
-      static constexpr camp::idx_t s_num_bits = 256;
-      static constexpr camp::idx_t s_num_elem = 8;
-      using int_element_type = int32_t;
-  };
+template<>
+struct RegisterTraits<RAJA::expt::avx_register, float>
+{
+  using element_type                      = float;
+  using register_policy                   = RAJA::expt::avx_register;
+  static constexpr camp::idx_t s_num_bits = 256;
+  static constexpr camp::idx_t s_num_elem = 8;
+  using int_element_type                  = int32_t;
+};
 
-  template<>
-  struct RegisterTraits<RAJA::expt::avx_register, double>{
-      using element_type = double;
-      using register_policy = RAJA::expt::avx_register;
-      static constexpr camp::idx_t s_num_bits = 256;
-      static constexpr camp::idx_t s_num_elem = 4;
-      using int_element_type = int64_t;
-  };
+template<>
+struct RegisterTraits<RAJA::expt::avx_register, double>
+{
+  using element_type                      = double;
+  using register_policy                   = RAJA::expt::avx_register;
+  static constexpr camp::idx_t s_num_bits = 256;
+  static constexpr camp::idx_t s_num_elem = 4;
+  using int_element_type                  = int64_t;
+};
 
-} // namespace intenral
-} // namespace expt
-} // namespace RAJA
+}  // namespace expt
+}  // namespace internal
+}  // namespace RAJA
 
 
 #endif
 
 
-#endif // __AVX__
+#endif  // __AVX__
diff --git a/include/RAJA/policy/tensor/arch/avx2.hpp b/include/RAJA/policy/tensor/arch/avx2.hpp
index b462257924..4ae2ca6bdd 100644
--- a/include/RAJA/policy/tensor/arch/avx2.hpp
+++ b/include/RAJA/policy/tensor/arch/avx2.hpp
@@ -17,11 +17,11 @@
 
 #ifdef __AVX2__
 
-#include<RAJA/policy/tensor/arch/avx2/traits.hpp>
-#include<RAJA/policy/tensor/arch/avx2/avx2_int32.hpp>
-#include<RAJA/policy/tensor/arch/avx2/avx2_int64.hpp>
-#include<RAJA/policy/tensor/arch/avx2/avx2_float.hpp>
-#include<RAJA/policy/tensor/arch/avx2/avx2_double.hpp>
+#include <RAJA/policy/tensor/arch/avx2/traits.hpp>
+#include <RAJA/policy/tensor/arch/avx2/avx2_int32.hpp>
+#include <RAJA/policy/tensor/arch/avx2/avx2_int64.hpp>
+#include <RAJA/policy/tensor/arch/avx2/avx2_float.hpp>
+#include <RAJA/policy/tensor/arch/avx2/avx2_double.hpp>
 
 
-#endif // __AVX2__
+#endif  // __AVX2__
diff --git a/include/RAJA/policy/tensor/arch/avx2/avx2_double.hpp b/include/RAJA/policy/tensor/arch/avx2/avx2_double.hpp
index 852003a4f9..47f9ca5d3f 100644
--- a/include/RAJA/policy/tensor/arch/avx2/avx2_double.hpp
+++ b/include/RAJA/policy/tensor/arch/avx2/avx2_double.hpp
@@ -28,535 +28,553 @@
 #include <immintrin.h>
 #include <cmath>
 
-
 namespace RAJA
 {
 namespace expt
 {
 
-  template<>
-  class Register<double, avx2_register> :
-    public internal::expt::RegisterBase<Register<double, avx2_register>>
+template<>
+class Register<double, avx2_register>
+    : public internal::expt::RegisterBase<Register<double, avx2_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<double, avx2_register>>;
+
+  using register_policy = avx2_register;
+  using self_type       = Register<double, avx2_register>;
+  using element_type    = double;
+  using register_type   = __m256d;
+
+  using int_vector_type = Register<int64_t, avx2_register>;
+
+private:
+  register_type m_value;
+
+  RAJA_INLINE
+  __m256i createMask(camp::idx_t N) const
+  {
+    // Generate a mask
+    return _mm256_set_epi64x(N >= 4 ? -1 : 0, N >= 3 ? -1 : 0, N >= 2 ? -1 : 0,
+                             N >= 1 ? -1 : 0);
+  }
+
+  RAJA_INLINE
+  __m256i createStridedOffsets(camp::idx_t stride) const
+  {
+    // Generate a strided offset list
+    return _mm256_set_epi64x(3 * stride, 2 * stride, stride, 0);
+  }
+
+public:
+  static constexpr camp::idx_t s_num_elem = 4;
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  RAJA_INLINE
+  Register() : m_value(_mm256_setzero_pd()) {}
+
+  /*!
+   * @brief Construct register with explicit values
+   */
+  RAJA_INLINE
+  Register(element_type x0, element_type x1, element_type x2, element_type x3)
+      : m_value(_mm256_set_pd(x3, x2, x1, x0))
+  {}
+
+  /*!
+   * @brief Copy constructor from underlying simd register
+   */
+  RAJA_INLINE
+  explicit Register(register_type const& c) : m_value(c) {}
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_INLINE
+  Register(self_type const& c) : base_type(c), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  self_type& operator=(self_type const& c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+  /*!
+   * @brief Construct from scalar.
+   * Sets all elements to same value (broadcast).
+   */
+  RAJA_INLINE
+  Register(element_type const& c) : m_value(_mm256_set1_pd(c)) {}
+
+  /*!
+   * @brief Returns underlying SIMD register.
+   */
+  RAJA_INLINE
+  constexpr register_type get_register() const { return m_value; }
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed(element_type const* ptr)
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<double, avx2_register>>;
-
-      using register_policy = avx2_register;
-      using self_type = Register<double, avx2_register>;
-      using element_type = double;
-      using register_type = __m256d;
-
-      using int_vector_type = Register<int64_t, avx2_register>;
-
-    private:
-      register_type m_value;
-
-      RAJA_INLINE
-      __m256i createMask(camp::idx_t N) const {
-        // Generate a mask
-        return  _mm256_set_epi64x(
-            N >= 4 ? -1 : 0,
-            N >= 3 ? -1 : 0,
-            N >= 2 ? -1 : 0,
-            N >= 1 ? -1 : 0);
-      }
-
-      RAJA_INLINE
-      __m256i createStridedOffsets(camp::idx_t stride) const {
-        // Generate a strided offset list
-        return  _mm256_set_epi64x(3*stride, 2*stride, stride, 0);
-      }
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 4;
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-      RAJA_INLINE
-      Register() : m_value(_mm256_setzero_pd()) {
-      }
-
-      /*!
-       * @brief Construct register with explicit values
-       */
-      RAJA_INLINE
-      Register(element_type x0,
-                     element_type x1,
-                     element_type x2,
-                     element_type x3) :
-        m_value(_mm256_set_pd(x3,x2,x1,x0))
-      {}
-
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_INLINE
-      explicit Register(register_type const &c) : m_value(c) {}
-
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      Register(self_type const &c) : base_type(c), m_value(c.m_value) {}
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-      /*!
-       * @brief Construct from scalar.
-       * Sets all elements to same value (broadcast).
-       */
-      RAJA_INLINE
-      Register(element_type const &c) : m_value(_mm256_set1_pd(c)) {}
-
-
-      /*!
-       * @brief Returns underlying SIMD register.
-       */
-      RAJA_INLINE
-      constexpr
-      register_type get_register() const {
-        return m_value;
-      }
-
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_load_packed ++;
+    RAJA::tensor_stats::num_vector_load_packed++;
 #endif
-        m_value = _mm256_loadu_pd(ptr);
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
+    m_value = _mm256_loadu_pd(ptr);
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed_n(element_type const* ptr, camp::idx_t N)
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_load_packed_n ++;
+    RAJA::tensor_stats::num_vector_load_packed_n++;
 #endif
-        m_value = _mm256_maskload_pd(ptr, createMask(N));
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, camp::idx_t stride){
+    m_value = _mm256_maskload_pd(ptr, createMask(N));
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_strided(element_type const* ptr, camp::idx_t stride)
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_load_strided ++;
+    RAJA::tensor_stats::num_vector_load_strided++;
 #endif
-        m_value = _mm256_i64gather_pd(ptr,
-                                      createStridedOffsets(stride),
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t stride, camp::idx_t N){
+    m_value = _mm256_i64gather_pd(ptr, createStridedOffsets(stride),
+                                  sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Partially gather a register from a strided memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type& load_strided_n(element_type const* ptr,
+                            camp::idx_t stride,
+                            camp::idx_t N)
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_load_strided_n ++;
+    RAJA::tensor_stats::num_vector_load_strided_n++;
 #endif
-        m_value = _mm256_mask_i64gather_pd(_mm256_setzero_pd(),
-                                      ptr,
-                                      createStridedOffsets(stride),
-                                      _mm256_castsi256_pd(createMask(N)),
-                                      sizeof(element_type));
-        return *this;
-      }
-
-      /*!
-       * @brief Generic gather operation for full vector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      self_type &gather(element_type const *ptr, int_vector_type offsets){
+    m_value = _mm256_mask_i64gather_pd(
+        _mm256_setzero_pd(), ptr, createStridedOffsets(stride),
+        _mm256_castsi256_pd(createMask(N)), sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Generic gather operation for full vector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
+  self_type& gather(element_type const* ptr, int_vector_type offsets)
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_load_strided_n ++;
+    RAJA::tensor_stats::num_vector_load_strided_n++;
 #endif
-        m_value = _mm256_i64gather_pd(ptr,
-                                      offsets.get_register(),
-                                      sizeof(element_type));
-        return *this;
-      }
-
-      /*!
-       * @brief Generic gather operation for n-length subvector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      self_type &gather_n(element_type const *ptr, int_vector_type offsets, camp::idx_t N){
+    m_value =
+        _mm256_i64gather_pd(ptr, offsets.get_register(), sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Generic gather operation for n-length subvector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
+  self_type& gather_n(element_type const* ptr,
+                      int_vector_type offsets,
+                      camp::idx_t N)
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_load_strided_n ++;
+    RAJA::tensor_stats::num_vector_load_strided_n++;
 #endif
-        m_value = _mm256_mask_i64gather_pd(_mm256_setzero_pd(),
-                                      ptr,
-                                      offsets.get_register(),
-                                      _mm256_castsi256_pd(createMask(N)),
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
+    m_value = _mm256_mask_i64gather_pd(
+        _mm256_setzero_pd(), ptr, offsets.get_register(),
+        _mm256_castsi256_pd(createMask(N)), sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed(element_type* ptr) const
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_store_packed ++;
+    RAJA::tensor_stats::num_vector_store_packed++;
 #endif
-        _mm256_storeu_pd(ptr, m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
+    _mm256_storeu_pd(ptr, m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store first N register elements to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed_n(element_type* ptr, camp::idx_t N) const
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_store_packed_n ++;
+    RAJA::tensor_stats::num_vector_store_packed_n++;
 #endif
-        _mm256_maskstore_pd(ptr, createMask(N), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t stride) const{
+    _mm256_maskstore_pd(ptr, createMask(N), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_strided(element_type* ptr, camp::idx_t stride) const
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_store_strided ++;
+    RAJA::tensor_stats::num_vector_store_strided++;
 #endif
-        for(camp::idx_t i = 0;i < 4;++ i){
-          ptr[i*stride] = m_value[i];
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t stride, camp::idx_t N) const{
+    for (camp::idx_t i = 0; i < 4; ++i)
+    {
+      ptr[i * stride] = m_value[i];
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Store partial register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_strided_n(element_type* ptr,
+                                   camp::idx_t stride,
+                                   camp::idx_t N) const
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_store_strided_n ++;
+    RAJA::tensor_stats::num_vector_store_strided_n++;
 #endif
-        for(camp::idx_t i = 0;i < N;++ i){
-          ptr[i*stride] = m_value[i];
-        }
-        return *this;
-      }
-
-
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      RAJA_INLINE
-      element_type get(camp::idx_t i) const
-      {return m_value[i];}
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      self_type &set(element_type value, camp::idx_t i)
-      {
-        m_value[i] = value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &value){
-        m_value =  _mm256_set1_pd(value);
-        return *this;
-      }
-
-      /*!
-       * @brief Extracts a scalar value and broadcasts to a new register
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type get_and_broadcast(int i) const {
-        switch(i){
-          case 0: return self_type(_mm256_permute4x64_pd (m_value, 0x00));
-          case 1: return self_type(_mm256_permute4x64_pd (m_value, 0x55));
-          case 2: return self_type(_mm256_permute4x64_pd (m_value, 0xAA));
-          case 3: return self_type(_mm256_permute4x64_pd (m_value, 0xFF));
-        }
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(_mm256_add_pd(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(_mm256_sub_pd(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        return self_type(_mm256_mul_pd(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        return self_type(_mm256_div_pd(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t N) const {
-        // AVX2 does not supply a masked divide, so do it manually
-        return self_type(_mm256_set_pd(
-            N >= 4 ? get(3)/b.get(3) : 0,
-            N >= 3 ? get(2)/b.get(2) : 0,
-            N >= 2 ? get(1)/b.get(1) : 0,
-            N >= 1 ? get(0)/b.get(0) : 0
-            ));
-      }
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      ptr[i * stride] = m_value[i];
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i
+   */
+  RAJA_INLINE
+  element_type get(camp::idx_t i) const { return m_value[i]; }
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param i Offset of scalar to set
+   * @param value Value of scalar to set
+   */
+  RAJA_INLINE
+  self_type& set(element_type value, camp::idx_t i)
+  {
+    m_value[i] = value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& broadcast(element_type const& value)
+  {
+    m_value = _mm256_set1_pd(value);
+    return *this;
+  }
+
+  /*!
+   * @brief Extracts a scalar value and broadcasts to a new register
+   */
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type get_and_broadcast(int i) const
+  {
+    switch (i)
+    {
+      case 0:
+        return self_type(_mm256_permute4x64_pd(m_value, 0x00));
+      case 1:
+        return self_type(_mm256_permute4x64_pd(m_value, 0x55));
+      case 2:
+        return self_type(_mm256_permute4x64_pd(m_value, 0xAA));
+      case 3:
+        return self_type(_mm256_permute4x64_pd(m_value, 0xFF));
+    }
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& copy(self_type const& src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type add(self_type const& b) const
+  {
+    return self_type(_mm256_add_pd(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type subtract(self_type const& b) const
+  {
+    return self_type(_mm256_sub_pd(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type multiply(self_type const& b) const
+  {
+    return self_type(_mm256_mul_pd(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type divide(self_type const& b) const
+  {
+    return self_type(_mm256_div_pd(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type divide_n(self_type const& b, camp::idx_t N) const
+  {
+    // AVX2 does not supply a masked divide, so do it manually
+    return self_type(_mm256_set_pd(
+        N >= 4 ? get(3) / b.get(3) : 0, N >= 3 ? get(2) / b.get(2) : 0,
+        N >= 2 ? get(1) / b.get(1) : 0, N >= 1 ? get(0) / b.get(0) : 0));
+  }
 
 // only use FMA's if the compiler has them turned on
 #ifdef __FMA__
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_add(self_type const &b, self_type const &c) const
-      {
-        return self_type(_mm256_fmadd_pd(m_value, b.m_value, c.m_value));
-      }
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_subtract(self_type const &b, self_type const &c) const
-      {
-        return self_type(_mm256_fmsub_pd(m_value, b.m_value, c.m_value));
-      }
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  self_type multiply_add(self_type const& b, self_type const& c) const
+  {
+    return self_type(_mm256_fmadd_pd(m_value, b.m_value, c.m_value));
+  }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  self_type multiply_subtract(self_type const& b, self_type const& c) const
+  {
+    return self_type(_mm256_fmsub_pd(m_value, b.m_value, c.m_value));
+  }
 #endif
 
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      element_type sum(camp::idx_t = 4) const
-      {
-        auto sh1 = _mm256_permute_pd(m_value, 0x5);
-        auto red1 = _mm256_add_pd(m_value, sh1);
-        return red1[0]+red1[2];
-      }
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max(camp::idx_t N = 4) const
-      {
-        if(N == 4){
-          // permute the first two and last two lanes of the register
-          // A = { v[1], v[0], v[3], v[2] }
-          register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
-
-          // take the maximum value of each lane
-          // B = { max{v[0], v[1]},
-          //       max{v[0], v[1]},
-          //       max{v[2], v[3]},
-          //       max{v[2], v[3]} }
-          register_type b = _mm256_max_pd(m_value, a);
-
-          // now take the maximum of a lower and upper halves
-          return RAJA::max<element_type>(b[0], b[2]);
-        }
-        else if(N == 3){
-          // permute the first two and last two lanes of the register
-          // use the third element TWICE, so we effectively remove the 4th
-          // lane
-          // A = { v[1], v[0], v[2], v[2] }
-          register_type a = _mm256_shuffle_pd(m_value, m_value, 0x3);
-
-          // take the maximum value of each lane
-          // B = { max{v[0], v[1]},
-          //       max{v[0], v[1]},
-          //       max{v[2], v[2]},   <-- just v[2]
-          //       max{v[2], v[3]} }
-          register_type b = _mm256_max_pd(m_value, a);
-
-          // now take the maximum of a lower and upper lane
-          return RAJA::max<element_type>(b[0], b[2]);
-        }
-        else if(N == 2){
-          return RAJA::max<element_type>(m_value[0], m_value[1]);
-        }
-        else if(N == 1){
-          return m_value[0];
-        }
-        return RAJA::operators::limits<double>::min();
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmax(self_type a) const
-      {
-        return self_type(_mm256_max_pd(m_value, a.m_value));
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min() const
-      {
-        // permute the first two and last two lanes of the register
-        // A = { v[1], v[0], v[3], v[2] }
-        register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
-
-        // take the minimum value of each lane
-        // B = { min{v[0], v[1]},
-        //       min{v[0], v[1]},
-        //       min{v[2], v[3]},
-        //       min{v[2], v[3]} }
-        register_type b = _mm256_min_pd(m_value, a);
-
-        // now take the minimum of a lower and upper halves
-        return RAJA::min<element_type>(b[0], b[2]);
-      }
-
-      /*!
-       * @brief Returns the largest element from first N lanes
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min_n(camp::idx_t N) const
-      {
-        if(N == 4){
-          // permute the first two and last two lanes of the register
-          // A = { v[1], v[0], v[3], v[2] }
-          register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
-
-          // take the minimum value of each lane
-          // B = { min{v[0], v[1]},
-          //       min{v[0], v[1]},
-          //       min{v[2], v[3]},
-          //       min{v[2], v[3]} }
-          register_type b = _mm256_min_pd(m_value, a);
-
-          // now take the minimum of a lower and upper halves
-          return std::min<element_type>(b[0], b[2]);
-        }
-        else if(N == 3){
-          // permute the first two and last two lanes of the register
-          // use the third element TWICE, so we effectively remove the 4th
-          // lane
-          // A = { v[1], v[0], v[2], v[2] }
-          register_type a = _mm256_shuffle_pd(m_value, m_value, 0x3);
-
-          // take the minimum value of each lane
-          // B = { min{v[0], v[1]},
-          //       min{v[0], v[1]},
-          //       min{v[2], v[2]},   <-- just v[2]
-          //       min{v[2], v[3]} }
-          register_type b = _mm256_min_pd(m_value, a);
-
-          // now take the minimum of a lower and upper lane
-          return std::min<element_type>(b[0], b[2]);
-        }
-        else if(N == 2){
-          return std::min<element_type>(m_value[0], m_value[1]);
-        }
-        else if(N == 1){
-          return m_value[0];
-        }
-        return RAJA::operators::limits<double>::max();
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmin(self_type a) const
-      {
-        return self_type(_mm256_min_pd(m_value, a.m_value));
-      }
-  };
-
-
-}   // namespace expt
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vector's scalar elements
+   */
+  RAJA_INLINE
+  element_type sum(camp::idx_t = 4) const
+  {
+    auto sh1  = _mm256_permute_pd(m_value, 0x5);
+    auto red1 = _mm256_add_pd(m_value, sh1);
+    return red1[0] + red1[2];
+  }
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max(camp::idx_t N = 4) const
+  {
+    if (N == 4)
+    {
+      // permute the first two and last two lanes of the register
+      // A = { v[1], v[0], v[3], v[2] }
+      register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
+
+      // take the maximum value of each lane
+      // B = { max{v[0], v[1]},
+      //       max{v[0], v[1]},
+      //       max{v[2], v[3]},
+      //       max{v[2], v[3]} }
+      register_type b = _mm256_max_pd(m_value, a);
+
+      // now take the maximum of a lower and upper halves
+      return RAJA::max<element_type>(b[0], b[2]);
+    }
+    else if (N == 3)
+    {
+      // permute the first two and last two lanes of the register
+      // use the third element TWICE, so we effectively remove the 4th
+      // lane
+      // A = { v[1], v[0], v[2], v[2] }
+      register_type a = _mm256_shuffle_pd(m_value, m_value, 0x3);
+
+      // take the maximum value of each lane
+      // B = { max{v[0], v[1]},
+      //       max{v[0], v[1]},
+      //       max{v[2], v[2]},   <-- just v[2]
+      //       max{v[2], v[3]} }
+      register_type b = _mm256_max_pd(m_value, a);
+
+      // now take the maximum of a lower and upper lane
+      return RAJA::max<element_type>(b[0], b[2]);
+    }
+    else if (N == 2)
+    {
+      return RAJA::max<element_type>(m_value[0], m_value[1]);
+    }
+    else if (N == 1)
+    {
+      return m_value[0];
+    }
+    return RAJA::operators::limits<double>::min();
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+  self_type vmax(self_type a) const
+  {
+    return self_type(_mm256_max_pd(m_value, a.m_value));
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min() const
+  {
+    // permute the first two and last two lanes of the register
+    // A = { v[1], v[0], v[3], v[2] }
+    register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
+
+    // take the minimum value of each lane
+    // B = { min{v[0], v[1]},
+    //       min{v[0], v[1]},
+    //       min{v[2], v[3]},
+    //       min{v[2], v[3]} }
+    register_type b = _mm256_min_pd(m_value, a);
+
+    // now take the minimum of a lower and upper halves
+    return RAJA::min<element_type>(b[0], b[2]);
+  }
+
+  /*!
+   * @brief Returns the smallest element from first N lanes
+   * @return The smallest scalar element in the first N lanes
+   */
+  RAJA_INLINE
+  element_type min_n(camp::idx_t N) const
+  {
+    if (N == 4)
+    {
+      // permute the first two and last two lanes of the register
+      // A = { v[1], v[0], v[3], v[2] }
+      register_type a = _mm256_shuffle_pd(m_value, m_value, 0x5);
+
+      // take the minimum value of each lane
+      // B = { min{v[0], v[1]},
+      //       min{v[0], v[1]},
+      //       min{v[2], v[3]},
+      //       min{v[2], v[3]} }
+      register_type b = _mm256_min_pd(m_value, a);
+
+      // now take the minimum of a lower and upper halves
+      return std::min<element_type>(b[0], b[2]);
+    }
+    else if (N == 3)
+    {
+      // permute the first two and last two lanes of the register
+      // use the third element TWICE, so we effectively remove the 4th
+      // lane
+      // A = { v[1], v[0], v[2], v[2] }
+      register_type a = _mm256_shuffle_pd(m_value, m_value, 0x3);
+
+      // take the minimum value of each lane
+      // B = { min{v[0], v[1]},
+      //       min{v[0], v[1]},
+      //       min{v[2], v[2]},   <-- just v[2]
+      //       min{v[2], v[3]} }
+      register_type b = _mm256_min_pd(m_value, a);
+
+      // now take the minimum of a lower and upper lane
+      return std::min<element_type>(b[0], b[2]);
+    }
+    else if (N == 2)
+    {
+      return std::min<element_type>(m_value[0], m_value[1]);
+    }
+    else if (N == 1)
+    {
+      return m_value[0];
+    }
+    return RAJA::operators::limits<double>::max();
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  self_type vmin(self_type a) const
+  {
+    return self_type(_mm256_min_pd(m_value, a.m_value));
+  }
+};
+
+
+}  // namespace expt
 
 }  // namespace RAJA
 
 
 #endif
 
-#endif //__AVX2__
+#endif  //__AVX2__
diff --git a/include/RAJA/policy/tensor/arch/avx2/avx2_float.hpp b/include/RAJA/policy/tensor/arch/avx2/avx2_float.hpp
index 4b1e11419d..6854c25e04 100644
--- a/include/RAJA/policy/tensor/arch/avx2/avx2_float.hpp
+++ b/include/RAJA/policy/tensor/arch/avx2/avx2_float.hpp
@@ -28,492 +28,493 @@
 #include <immintrin.h>
 #include <cmath>
 
-
 namespace RAJA
 {
 namespace expt
 {
-  template<>
-  class Register<float, avx2_register> :
-    public internal::expt::RegisterBase<Register<float, avx2_register>>
+template<>
+class Register<float, avx2_register>
+    : public internal::expt::RegisterBase<Register<float, avx2_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<float, avx2_register>>;
+
+  using register_policy = avx2_register;
+  using self_type       = Register<float, avx2_register>;
+  using element_type    = float;
+  using register_type   = __m256;
+
+  using int_vector_type = Register<int32_t, avx2_register>;
+
+
+private:
+  register_type m_value;
+
+  RAJA_INLINE
+  __m256i createMask(camp::idx_t N) const
+  {
+    // Generate a mask
+    return _mm256_set_epi32(N >= 8 ? -1 : 0, N >= 7 ? -1 : 0, N >= 6 ? -1 : 0,
+                            N >= 5 ? -1 : 0, N >= 4 ? -1 : 0, N >= 3 ? -1 : 0,
+                            N >= 2 ? -1 : 0, N >= 1 ? -1 : 0);
+  }
+
+  RAJA_INLINE
+  __m256i createStridedOffsets(camp::idx_t stride) const
+  {
+    // Generate a strided offset list
+    return _mm256_set_epi32(7 * stride, 6 * stride, 5 * stride, 4 * stride,
+                            3 * stride, 2 * stride, stride, 0);
+  }
+
+  RAJA_INLINE
+  __m256i createPermute1(camp::idx_t N) const
+  {
+    // Generate a permutation for first round of min/max routines
+    return _mm256_set_epi32(N >= 7 ? 6 : 0, N >= 8 ? 7 : 0, N >= 5 ? 4 : 0,
+                            N >= 6 ? 5 : 0, N >= 3 ? 2 : 0, N >= 4 ? 3 : 0,
+                            N >= 1 ? 0 : 0, N >= 2 ? 1 : 0);
+  }
+
+  RAJA_INLINE
+  __m256i createPermute2(camp::idx_t N) const
+  {
+    // Round-2 min/max permutation; source indices >= N fall back to element 0
+    return _mm256_set_epi32(N >= 6 ? 5 : 0, N >= 5 ? 4 : 0, N >= 8 ? 7 : 0,
+                            N >= 7 ? 6 : 0, N >= 2 ? 1 : 0, N >= 1 ? 0 : 0,
+                            N >= 4 ? 3 : 0, N >= 3 ? 2 : 0);
+  }
+
+public:
+  static constexpr camp::idx_t s_num_elem = 8;
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  RAJA_INLINE
+  Register() : m_value(_mm256_setzero_ps()) {}
+
+  /*!
+   * @brief Construct register with explicit values
+   */
+  RAJA_INLINE
+  Register(element_type x0,
+           element_type x1,
+           element_type x2,
+           element_type x3,
+           element_type x4,
+           element_type x5,
+           element_type x6,
+           element_type x7)
+      : m_value(_mm256_set_ps(x7, x6, x5, x4, x3, x2, x1, x0))
+  {}
+
+  /*!
+   * @brief Copy constructor from underlying simd register
+   */
+  RAJA_INLINE
+  explicit Register(register_type const& c) : m_value(c) {}
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_INLINE
+  Register(self_type const& c) : base_type(c), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  self_type& operator=(self_type const& c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+  /*!
+   * @brief Construct from scalar.
+   * Sets all elements to same value (broadcast).
+   */
+  RAJA_INLINE
+  Register(element_type const& c) : m_value(_mm256_set1_ps(c)) {}
+
+  /*!
+   * @brief Returns underlying SIMD register.
+   */
+  RAJA_INLINE
+  constexpr register_type get_register() const { return m_value; }
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed(element_type const* ptr)
+  {
+    m_value = _mm256_loadu_ps(ptr);
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed_n(element_type const* ptr, camp::idx_t N)
+  {
+    m_value = _mm256_maskload_ps(ptr, createMask(N));
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_strided(element_type const* ptr, camp::idx_t stride)
+  {
+    m_value = _mm256_i32gather_ps(ptr, createStridedOffsets(stride),
+                                  sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Partially gather a register from a strided memory location given
+   *        a run-time number of elements; lanes >= N are set to zero.
+   *
+   */
+  RAJA_INLINE
+  self_type& load_strided_n(element_type const* ptr,
+                            camp::idx_t stride,
+                            camp::idx_t N)
+  {
+    m_value = _mm256_mask_i32gather_ps(
+        _mm256_setzero_ps(), ptr, createStridedOffsets(stride),
+        _mm256_castsi256_ps(createMask(N)), sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed(element_type* ptr) const
+  {
+    _mm256_storeu_ps(ptr, m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store first N register elements to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed_n(element_type* ptr, camp::idx_t N) const
+  {
+    _mm256_maskstore_ps(ptr, createMask(N), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_strided(element_type* ptr, camp::idx_t stride) const
+  {
+    for (camp::idx_t i = 0; i < 8; ++i)
+    {
+      ptr[i * stride] = m_value[i];
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Store first N register elements to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_strided_n(element_type* ptr,
+                                   camp::idx_t stride,
+                                   camp::idx_t N) const
+  {
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      ptr[i * stride] = m_value[i];
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i
+   */
+  RAJA_INLINE
+  element_type get(camp::idx_t i) const { return m_value[i]; }
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param i Offset of scalar to set
+   * @param value Value of scalar to set
+   */
+  RAJA_INLINE
+  self_type& set(element_type value, camp::idx_t i)
+  {
+    m_value[i] = value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& broadcast(element_type const& value)
+  {
+    m_value = _mm256_set1_ps(value);
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& copy(self_type const& src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type add(self_type const& b) const
+  {
+    return self_type(_mm256_add_ps(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type subtract(self_type const& b) const
+  {
+    return self_type(_mm256_sub_ps(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type multiply(self_type const& b) const
+  {
+    return self_type(_mm256_mul_ps(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type divide(self_type const& b) const
+  {
+    return self_type(_mm256_div_ps(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type divide_n(self_type const& b, camp::idx_t N) const
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<float, avx2_register>>;
-
-      using register_policy = avx2_register;
-      using self_type = Register<float, avx2_register>;
-      using element_type = float;
-      using register_type = __m256;
-
-      using int_vector_type = Register<int32_t, avx2_register>;
-
-
-    private:
-      register_type m_value;
-
-      RAJA_INLINE
-      __m256i createMask(camp::idx_t N) const {
-        // Generate a mask
-        return  _mm256_set_epi32(
-            N >= 8 ? -1 : 0,
-            N >= 7 ? -1 : 0,
-            N >= 6 ? -1 : 0,
-            N >= 5 ? -1 : 0,
-            N >= 4 ? -1 : 0,
-            N >= 3 ? -1 : 0,
-            N >= 2 ? -1 : 0,
-            N >= 1 ? -1 : 0);
-      }
-
-      RAJA_INLINE
-      __m256i createStridedOffsets(camp::idx_t stride) const {
-        // Generate a strided offset list
-        return  _mm256_set_epi32(
-            7*stride, 6*stride, 5*stride, 4*stride,
-            3*stride, 2*stride, stride, 0);
-      }
-
-      RAJA_INLINE
-      __m256i createPermute1(camp::idx_t N) const {
-        // Generate a permutation for first round of min/max routines
-        return  _mm256_set_epi32(
-            N >= 7 ? 6 : 0,
-            N >= 8 ? 7 : 0,
-            N >= 5 ? 4 : 0,
-            N >= 6 ? 5 : 0,
-            N >= 3 ? 2 : 0,
-            N >= 4 ? 3 : 0,
-            N >= 1 ? 0 : 0,
-            N >= 2 ? 1 : 0);
-      }
-
-      RAJA_INLINE
-      __m256i createPermute2(camp::idx_t N) const {
-        // Generate a permutation for second round of min/max routines
-        return  _mm256_set_epi32(
-            N >= 6 ? 5 : 0,
-            N >= 5 ? 4 : 0,
-            N >= 8 ? 7 : 0,
-            N >= 7 ? 6 : 0,
-            N >= 2 ? 1 : 0,
-            N >= 1 ? 0 : 0,
-            N >= 4 ? 3 : 0,
-            N >= 2 ? 2 : 0);
-      }
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 8;
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-      RAJA_INLINE
-      Register() : m_value(_mm256_setzero_ps()) {
-      }
-
-      /*!
-       * @brief Construct register with explicit values
-       */
-      RAJA_INLINE
-      Register(element_type x0,
-                     element_type x1,
-                     element_type x2,
-                     element_type x3,
-                     element_type x4,
-                     element_type x5,
-                     element_type x6,
-                     element_type x7) :
-        m_value(_mm256_set_ps(x7,x6,x5,x4,x3,x2,x1,x0))
-      {}
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_INLINE
-      explicit Register(register_type const &c) : m_value(c) {}
-
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      Register(self_type const &c) : base_type(c), m_value(c.m_value) {}
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-      /*!
-       * @brief Construct from scalar.
-       * Sets all elements to same value (broadcast).
-       */
-      RAJA_INLINE
-      Register(element_type const &c) : m_value(_mm256_set1_ps(c)) {}
-
-      /*!
-       * @brief Returns underlying SIMD register.
-       */
-      RAJA_INLINE
-      constexpr
-      register_type get_register() const {
-        return m_value;
-      }
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
-        m_value = _mm256_loadu_ps(ptr);
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
-        m_value = _mm256_maskload_ps(ptr, createMask(N));
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, camp::idx_t stride){
-        m_value = _mm256_i32gather_ps(ptr,
-                                      createStridedOffsets(stride),
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t stride, camp::idx_t N){
-        m_value = _mm256_mask_i32gather_ps(_mm256_setzero_ps(),
-                                      ptr,
-                                      createStridedOffsets(stride),
-                                      _mm256_castsi256_ps(createMask(N)),
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
-        _mm256_storeu_ps(ptr, m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
-        _mm256_maskstore_ps(ptr, createMask(N), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t stride) const{
-        for(camp::idx_t i = 0;i < 8;++ i){
-          ptr[i*stride] = m_value[i];
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t stride, camp::idx_t N) const{
-        for(camp::idx_t i = 0;i < N;++ i){
-          ptr[i*stride] = m_value[i];
-        }
-        return *this;
-      }
-
-
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      RAJA_INLINE
-      element_type get(camp::idx_t i) const
-      {return m_value[i];}
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      self_type &set(element_type value, camp::idx_t i)
-      {
-        m_value[i] = value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &value){
-        m_value =  _mm256_set1_ps(value);
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(_mm256_add_ps(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(_mm256_sub_ps(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        return self_type(_mm256_mul_ps(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        return self_type(_mm256_div_ps(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t N ) const {
-        // AVX2 does not supply a masked divide
-        return self_type(_mm256_set_ps(
-            N >= 8 ? get(7)/b.get(7) : 0,
-            N >= 7 ? get(6)/b.get(6) : 0,
-            N >= 6 ? get(5)/b.get(5) : 0,
-            N >= 5 ? get(4)/b.get(4) : 0,
-            N >= 4 ? get(3)/b.get(3) : 0,
-            N >= 3 ? get(2)/b.get(2) : 0,
-            N >= 2 ? get(1)/b.get(1) : 0,
-            N >= 1 ? get(0)/b.get(0) : 0
-            ));
-      }
+    // AVX2 does not supply a masked divide
+    return self_type(_mm256_set_ps(
+        N >= 8 ? get(7) / b.get(7) : 0, N >= 7 ? get(6) / b.get(6) : 0,
+        N >= 6 ? get(5) / b.get(5) : 0, N >= 5 ? get(4) / b.get(4) : 0,
+        N >= 4 ? get(3) / b.get(3) : 0, N >= 3 ? get(2) / b.get(2) : 0,
+        N >= 2 ? get(1) / b.get(1) : 0, N >= 1 ? get(0) / b.get(0) : 0));
+  }
 
 // only use FMA's if the compiler has them turned on
 #ifdef __FMA__
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_add(self_type const &b, self_type const &c) const
-      {
-        return self_type(_mm256_fmadd_ps(m_value, b.m_value, c.m_value));
-      }
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_subtract(self_type const &b, self_type const &c) const
-      {
-        return self_type(_mm256_fmsub_ps(m_value, b.m_value, c.m_value));
-      }
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  self_type multiply_add(self_type const& b, self_type const& c) const
+  {
+    return self_type(_mm256_fmadd_ps(m_value, b.m_value, c.m_value));
+  }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  self_type multiply_subtract(self_type const& b, self_type const& c) const
+  {
+    return self_type(_mm256_fmsub_ps(m_value, b.m_value, c.m_value));
+  }
 #endif
 
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      element_type sum() const
-      {
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_permute_ps(m_value, 0xB1);
-        auto red1 = _mm256_add_ps(m_value, sh1);
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_permute_ps(red1, 0x4E);
-        auto red2 = _mm256_add_ps(red1, sh2);
-
-        return red2[0] + red2[4];
-      }
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max() const
-      {
-
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_permutevar8x32_ps(m_value, createPermute1(8));
-        auto red1 = _mm256_max_ps(m_value, sh1);
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_permutevar8x32_ps(red1, createPermute2(8));
-        auto red2 = _mm256_max_ps(red1, sh2);
-
-        return std::max<element_type>(red2[0], red2[4]);
-
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max_n(camp::idx_t N) const
-      {
-        // Some simple cases
-        if(N <= 0 || N >8){
-          return RAJA::operators::limits<float>::min();
-        }
-        if(N == 1){
-          return m_value[0];
-        }
-        if(N == 2){
-          return std::max<element_type>(m_value[0], m_value[1]);
-        }
-
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_permutevar8x32_ps(m_value, createPermute1(N));
-        auto red1 = _mm256_max_ps(m_value, sh1);
-
-        if(N == 3){
-          return std::max<element_type>(red1[0], m_value[2]);
-        }
-        if(N == 4){
-          return std::max<element_type>(red1[0], red1[2]);
-        }
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_permutevar8x32_ps(red1, createPermute2(N));
-        auto red2 = _mm256_max_ps(red1, sh2);
-
-        return std::max<element_type>(red2[0], red2[4]);
-
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmax(self_type a) const
-      {
-        return self_type(_mm256_max_ps(m_value, a.m_value));
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min() const
-      {
-
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_permutevar8x32_ps(m_value, createPermute1(8));
-        auto red1 = _mm256_min_ps(m_value, sh1);
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_permutevar8x32_ps(red1, createPermute2(8));
-        auto red2 = _mm256_min_ps(red1, sh2);
-
-        return std::min<element_type>(red2[0], red2[4]);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min_n(camp::idx_t N) const
-      {
-        // Some simple cases
-        if(N <= 0 || N >8){
-          return RAJA::operators::limits<float>::max();
-        }
-        if(N == 1){
-          return m_value[0];
-        }
-        if(N == 2){
-          return std::min<element_type>(m_value[0], m_value[1]);
-        }
-
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_permutevar8x32_ps(m_value, createPermute1(N));
-        auto red1 = _mm256_min_ps(m_value, sh1);
-
-        if(N == 3){
-          return std::min<element_type>(red1[0], m_value[2]);
-        }
-        if(N == 4){
-          return std::min<element_type>(red1[0], red1[2]);
-        }
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_permutevar8x32_ps(red1, createPermute2(N));
-        auto red2 = _mm256_min_ps(red1, sh2);
-
-        return std::min<element_type>(red2[0], red2[4]);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmin(self_type a) const
-      {
-        return self_type(_mm256_min_ps(m_value, a.m_value));
-      }
-  };
-
-
-}   // namespace expt
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vectors scalar elements
+   */
+  RAJA_INLINE
+  element_type sum() const
+  {
+    // swap odd-even pairs and add
+    auto sh1  = _mm256_permute_ps(m_value, 0xB1);
+    auto red1 = _mm256_add_ps(m_value, sh1);
+
+    // swap odd-even quads and add
+    auto sh2  = _mm256_permute_ps(red1, 0x4E);
+    auto red2 = _mm256_add_ps(red1, sh2);
+
+    return red2[0] + red2[4];
+  }
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max() const
+  {
+
+    // swap odd-even pairs and take the element-wise max
+    auto sh1  = _mm256_permutevar8x32_ps(m_value, createPermute1(8));
+    auto red1 = _mm256_max_ps(m_value, sh1);
+
+    // swap the two pairs inside each group of four lanes and take the max
+    auto sh2  = _mm256_permutevar8x32_ps(red1, createPermute2(8));
+    auto red2 = _mm256_max_ps(red1, sh2);
+
+    return std::max<element_type>(red2[0], red2[4]);
+  }
+
+  /*!
+   * @brief Returns the largest of the first N elements
+   * @return Max over lanes [0, N); limits<float>::min() if N is not in 1..8
+   */
+  RAJA_INLINE
+  element_type max_n(camp::idx_t N) const
+  {
+    // Some simple cases
+    if (N <= 0 || N > 8)
+    {
+      return RAJA::operators::limits<float>::min();
+    }
+    if (N == 1)
+    {
+      return m_value[0];
+    }
+    if (N == 2)
+    {
+      return std::max<element_type>(m_value[0], m_value[1]);
+    }
+
+    // swap odd-even pairs (lanes >= N read element 0) and take the max
+    auto sh1  = _mm256_permutevar8x32_ps(m_value, createPermute1(N));
+    auto red1 = _mm256_max_ps(m_value, sh1);
+
+    if (N == 3)
+    {
+      return std::max<element_type>(red1[0], m_value[2]);
+    }
+    if (N == 4)
+    {
+      return std::max<element_type>(red1[0], red1[2]);
+    }
+
+    // swap the two pairs inside each group of four lanes and take the max
+    auto sh2  = _mm256_permutevar8x32_ps(red1, createPermute2(N));
+    auto red2 = _mm256_max_ps(red1, sh2);
+
+    return std::max<element_type>(red2[0], red2[4]);
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+  self_type vmax(self_type a) const
+  {
+    return self_type(_mm256_max_ps(m_value, a.m_value));
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min() const
+  {
+
+    // swap odd-even pairs and take the element-wise min
+    auto sh1  = _mm256_permutevar8x32_ps(m_value, createPermute1(8));
+    auto red1 = _mm256_min_ps(m_value, sh1);
+
+    // swap the two pairs inside each group of four lanes and take the min
+    auto sh2  = _mm256_permutevar8x32_ps(red1, createPermute2(8));
+    auto red2 = _mm256_min_ps(red1, sh2);
+
+    return std::min<element_type>(red2[0], red2[4]);
+  }
+
+  /*!
+   * @brief Returns the smallest of the first N elements
+   * @return Min over lanes [0, N); limits<float>::max() if N is not in 1..8
+   */
+  RAJA_INLINE
+  element_type min_n(camp::idx_t N) const
+  {
+    // Some simple cases
+    if (N <= 0 || N > 8)
+    {
+      return RAJA::operators::limits<float>::max();
+    }
+    if (N == 1)
+    {
+      return m_value[0];
+    }
+    if (N == 2)
+    {
+      return std::min<element_type>(m_value[0], m_value[1]);
+    }
+
+    // swap odd-even pairs (lanes >= N read element 0) and take the min
+    auto sh1  = _mm256_permutevar8x32_ps(m_value, createPermute1(N));
+    auto red1 = _mm256_min_ps(m_value, sh1);
+
+    if (N == 3)
+    {
+      return std::min<element_type>(red1[0], m_value[2]);
+    }
+    if (N == 4)
+    {
+      return std::min<element_type>(red1[0], red1[2]);
+    }
+
+    // swap the two pairs inside each group of four lanes and take the min
+    auto sh2  = _mm256_permutevar8x32_ps(red1, createPermute2(N));
+    auto red2 = _mm256_min_ps(red1, sh2);
+
+    return std::min<element_type>(red2[0], red2[4]);
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  self_type vmin(self_type a) const
+  {
+    return self_type(_mm256_min_ps(m_value, a.m_value));
+  }
+};
+
+
+}  // namespace expt
 
 }  // namespace RAJA
 
 
 #endif
 
-#endif //__AVX2__
+#endif  //__AVX2__
diff --git a/include/RAJA/policy/tensor/arch/avx2/avx2_int32.hpp b/include/RAJA/policy/tensor/arch/avx2/avx2_int32.hpp
index ab5948a3f2..6d9709c9fc 100644
--- a/include/RAJA/policy/tensor/arch/avx2/avx2_int32.hpp
+++ b/include/RAJA/policy/tensor/arch/avx2/avx2_int32.hpp
@@ -28,541 +28,564 @@
 #include <immintrin.h>
 #include <cmath>
 
-
 namespace RAJA
 {
 namespace expt
 {
 
-  template<>
-  class Register<int32_t, avx2_register> :
-    public internal::expt::RegisterBase<Register<int32_t, avx2_register>>
+template<>
+class Register<int32_t, avx2_register>
+    : public internal::expt::RegisterBase<Register<int32_t, avx2_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<int32_t, avx2_register>>;
+
+  using register_policy = avx2_register;
+  using self_type       = Register<int32_t, avx2_register>;
+  using element_type    = int32_t;
+  using register_type   = __m256i;
+
+  using int_vector_type = Register<int32_t, avx2_register>;
+
+
+private:
+  register_type m_value;
+
+  RAJA_INLINE
+  __m256i createMask(camp::idx_t N) const
+  {
+    // Generate a mask
+    return _mm256_set_epi32(N >= 8 ? -1 : 0, N >= 7 ? -1 : 0, N >= 6 ? -1 : 0,
+                            N >= 5 ? -1 : 0, N >= 4 ? -1 : 0, N >= 3 ? -1 : 0,
+                            N >= 2 ? -1 : 0, N >= 1 ? -1 : 0);
+  }
+
+  RAJA_INLINE
+  __m256i createStridedOffsets(camp::idx_t stride) const
+  {
+    // Generate a strided offset list
+    return _mm256_set_epi32(7 * stride, 6 * stride, 5 * stride, 4 * stride,
+                            3 * stride, 2 * stride, stride, 0);
+  }
+
+  RAJA_INLINE
+  __m256i createPermute1(camp::idx_t N) const
+  {
+    // Generate a permutation for first round of min/max routines
+    return _mm256_set_epi32(N >= 7 ? 6 : 0, N >= 8 ? 7 : 0, N >= 5 ? 4 : 0,
+                            N >= 6 ? 5 : 0, N >= 3 ? 2 : 0, N >= 4 ? 3 : 0,
+                            N >= 1 ? 0 : 0, N >= 2 ? 1 : 0);
+  }
+
+  RAJA_INLINE
+  __m256i createPermute2(camp::idx_t N) const
+  {
+    // Round-2 min/max permutation; source indices >= N fall back to element 0
+    return _mm256_set_epi32(N >= 6 ? 5 : 0, N >= 5 ? 4 : 0, N >= 8 ? 7 : 0,
+                            N >= 7 ? 6 : 0, N >= 2 ? 1 : 0, N >= 1 ? 0 : 0,
+                            N >= 4 ? 3 : 0, N >= 3 ? 2 : 0);
+  }
+
+public:
+  static constexpr camp::idx_t s_num_elem = 8;
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  RAJA_INLINE
+  Register() : m_value(_mm256_setzero_si256()) {}
+
+  /*!
+   * @brief Construct register with explicit values
+   */
+  RAJA_INLINE
+  Register(element_type x0,
+           element_type x1,
+           element_type x2,
+           element_type x3,
+           element_type x4,
+           element_type x5,
+           element_type x6,
+           element_type x7)
+      : m_value(_mm256_set_epi32(x7, x6, x5, x4, x3, x2, x1, x0))
+  {}
+
+  /*!
+   * @brief Copy constructor from underlying simd register
+   */
+  RAJA_INLINE
+  explicit Register(register_type const& c) : m_value(c) {}
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_INLINE
+  Register(self_type const& c) : base_type(c), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  self_type& operator=(self_type const& c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+  /*!
+   * @brief Construct from scalar.
+   * Sets all elements to same value (broadcast).
+   */
+  RAJA_INLINE
+  Register(element_type const& c) : m_value(_mm256_set1_epi32(c)) {}
+
+  /*!
+   * @brief Returns underlying SIMD register.
+   */
+  RAJA_INLINE
+  constexpr register_type get_register() const { return m_value; }
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed(element_type const* ptr)
+  {
+    m_value = _mm256_loadu_si256((__m256i const*)ptr);
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed_n(element_type const* ptr, camp::idx_t N)
+  {
+    m_value = _mm256_maskload_epi32(ptr, createMask(N));
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_strided(element_type const* ptr, camp::idx_t stride)
+  {
+    m_value = _mm256_i32gather_epi32(ptr, createStridedOffsets(stride),
+                                     sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Partially gather a register from a strided memory location given
+   *        a run-time number of elements; lanes >= N are set to zero.
+   *
+   */
+  RAJA_INLINE
+  self_type& load_strided_n(element_type const* ptr,
+                            camp::idx_t stride,
+                            camp::idx_t N)
+  {
+    m_value = _mm256_mask_i32gather_epi32(_mm256_setzero_si256(), ptr,
+                                          createStridedOffsets(stride),
+                                          createMask(N), sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed(element_type* ptr) const
+  {
+    _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store first N register elements to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed_n(element_type* ptr, camp::idx_t N) const
+  {
+    _mm256_maskstore_epi32(ptr, createMask(N), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_strided(element_type* ptr, camp::idx_t stride) const
+  {
+    for (camp::idx_t i = 0; i < 8; ++i)
+    {
+      ptr[i * stride] = get(i);
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Store first N register elements to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_strided_n(element_type* ptr,
+                                   camp::idx_t stride,
+                                   camp::idx_t N) const
+  {
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      ptr[i * stride] = get(i);
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i, or 0 if i is outside [0, 7]
+   */
+  RAJA_INLINE
+  element_type get(camp::idx_t i) const
+  {
+    // _mm256_extract_epi32 takes a compile-time index, hence the switch
+    switch (i)
+    {
+      case 0:
+        return _mm256_extract_epi32(m_value, 0);
+      case 1:
+        return _mm256_extract_epi32(m_value, 1);
+      case 2:
+        return _mm256_extract_epi32(m_value, 2);
+      case 3:
+        return _mm256_extract_epi32(m_value, 3);
+      case 4:
+        return _mm256_extract_epi32(m_value, 4);
+      case 5:
+        return _mm256_extract_epi32(m_value, 5);
+      case 6:
+        return _mm256_extract_epi32(m_value, 6);
+      case 7:
+        return _mm256_extract_epi32(m_value, 7);
+    }
+    return 0;
+  }
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param value Value of scalar to set
+   * @param i Offset of scalar to set; no-op if i is outside [0, 7]
+   */
+  RAJA_INLINE
+  self_type& set(element_type value, camp::idx_t i)
+  {
+    // _mm256_insert_epi32 takes a compile-time index, hence the switch
+    switch (i)
+    {
+      case 0:
+        m_value = _mm256_insert_epi32(m_value, value, 0);
+        break;
+      case 1:
+        m_value = _mm256_insert_epi32(m_value, value, 1);
+        break;
+      case 2:
+        m_value = _mm256_insert_epi32(m_value, value, 2);
+        break;
+      case 3:
+        m_value = _mm256_insert_epi32(m_value, value, 3);
+        break;
+      case 4:
+        m_value = _mm256_insert_epi32(m_value, value, 4);
+        break;
+      case 5:
+        m_value = _mm256_insert_epi32(m_value, value, 5);
+        break;
+      case 6:
+        m_value = _mm256_insert_epi32(m_value, value, 6);
+        break;
+      case 7:
+        m_value = _mm256_insert_epi32(m_value, value, 7);
+        break;
+    }
+
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  // Set all eight elements of this register to value; returns *this.
+  RAJA_INLINE
+  self_type& broadcast(element_type const& value)
+  {
+    m_value = _mm256_set1_epi32(value);
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  // Overwrite this register with the SIMD value held by src; returns *this.
+  RAJA_INLINE
+  self_type& copy(self_type const& src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  // Element-wise 32-bit add: returns a new register holding this + b.
+  RAJA_INLINE
+  self_type add(self_type const& b) const
+  {
+    return self_type(_mm256_add_epi32(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  // Element-wise 32-bit subtract: returns a new register holding this - b.
+  RAJA_INLINE
+  self_type subtract(self_type const& b) const
+  {
+    return self_type(_mm256_sub_epi32(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  // Element-wise multiply; each result keeps the low 32 bits of the product.
+  RAJA_INLINE
+  self_type multiply(self_type const& b) const
+  {
+    // _mm256_mul_epi32 multiplies only the even 32-bit elements and yields
+    // 64-bit results, so even and odd products are computed separately and
+    // the low 32 bits of each are repacked into a single register.
+    // NOTE(review): _mm256_mullo_epi32 (also AVX2) looks like a one-instruction
+    // equivalent of this whole sequence -- confirm and consider using it.
+    // multiply 0, 2, 4, 6
+    auto prod_even = _mm256_mul_epi32(m_value, b.m_value);
+
+    // Swap each even/odd pair of 32-bit words (0xB1): odd elements move to even
+    auto sh_a = _mm256_castps_si256(
+        _mm256_permute_ps(_mm256_castsi256_ps(m_value), 0xB1));
+
+    auto sh_b = _mm256_castps_si256(
+        _mm256_permute_ps(_mm256_castsi256_ps(b.m_value), 0xB1));
+
+    // multiply 1, 3, 5, 7
+    auto prod_odd = _mm256_mul_epi32(sh_a, sh_b);
+
+    // Swap the odd products' low words back into the odd slots
+    auto sh_odd = _mm256_castps_si256(
+        _mm256_permute_ps(_mm256_castsi256_ps(prod_odd), 0xB1));
+    // Blend mask 0xAA: odd slots come from sh_odd, even slots from prod_even
+    return self_type(_mm256_blend_epi32(prod_even, sh_odd, 0xAA));
+  }
+
+  RAJA_HOST_DEVICE
+  // Element-wise integer division this / b; all eight elements of b are used.
+  RAJA_INLINE
+  self_type divide(self_type const& b) const
+  {
+    // AVX2 does not supply an integer divide, so do it one scalar at a time
+    return self_type(_mm256_set_epi32(get(7) / b.get(7), get(6) / b.get(6),
+                                      get(5) / b.get(5), get(4) / b.get(4),
+                                      get(3) / b.get(3), get(2) / b.get(2),
+                                      get(1) / b.get(1), get(0) / b.get(0)));
+  }
+
+  RAJA_HOST_DEVICE
+  // Element-wise integer division of the first N elements; the rest become 0.
+  RAJA_INLINE
+  self_type divide_n(self_type const& b, camp::idx_t N) const
+  {
+    // AVX2 does not supply an integer divide; elements >= N are never divided
+    return self_type(_mm256_set_epi32(
+        N >= 8 ? get(7) / b.get(7) : 0, N >= 7 ? get(6) / b.get(6) : 0,
+        N >= 6 ? get(5) / b.get(5) : 0, N >= 5 ? get(4) / b.get(4) : 0,
+        N >= 4 ? get(3) / b.get(3) : 0, N >= 3 ? get(2) / b.get(2) : 0,
+        N >= 2 ? get(1) / b.get(1) : 0, N >= 1 ? get(0) / b.get(0) : 0));
+  }
+
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vector's eight scalar elements
+   */
+  RAJA_INLINE
+  element_type sum() const
+  {
+    // swap adjacent elements (0xB1) and add: each pair now holds its sum
+    auto sh1 = _mm256_castps_si256(
+        _mm256_permute_ps(_mm256_castsi256_ps(m_value), 0xB1));
+    auto red1 = _mm256_add_epi32(m_value, sh1);
+
+
+    // swap adjacent pairs within each 128-bit half (0x4E) and add
+    auto sh2 =
+        _mm256_castps_si256(_mm256_permute_ps(_mm256_castsi256_ps(red1), 0x4E));
+    auto red2 = _mm256_add_epi32(red1, sh2);
+    // elements 0 and 4 now hold the sums of the low and high halves
+    return _mm256_extract_epi32(red2, 0) + _mm256_extract_epi32(red2, 4);
+  }
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max() const
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<int32_t, avx2_register>>;
-
-      using register_policy = avx2_register;
-      using self_type = Register<int32_t, avx2_register>;
-      using element_type = int32_t;
-      using register_type = __m256i;
-
-      using int_vector_type = Register<int32_t, avx2_register>;
-
-
-    private:
-      register_type m_value;
-
-      RAJA_INLINE
-      __m256i createMask(camp::idx_t N) const {
-        // Generate a mask
-        return  _mm256_set_epi32(
-            N >= 8 ? -1 : 0,
-            N >= 7 ? -1 : 0,
-            N >= 6 ? -1 : 0,
-            N >= 5 ? -1 : 0,
-            N >= 4 ? -1 : 0,
-            N >= 3 ? -1 : 0,
-            N >= 2 ? -1 : 0,
-            N >= 1 ? -1 : 0);
-      }
-
-      RAJA_INLINE
-      __m256i createStridedOffsets(camp::idx_t stride) const {
-        // Generate a strided offset list
-        return  _mm256_set_epi32(
-            7*stride, 6*stride, 5*stride, 4*stride,
-            3*stride, 2*stride, stride, 0);
-      }
-
-      RAJA_INLINE
-      __m256i createPermute1(camp::idx_t N) const {
-        // Generate a permutation for first round of min/max routines
-        return  _mm256_set_epi32(
-            N >= 7 ? 6 : 0,
-            N >= 8 ? 7 : 0,
-            N >= 5 ? 4 : 0,
-            N >= 6 ? 5 : 0,
-            N >= 3 ? 2 : 0,
-            N >= 4 ? 3 : 0,
-            N >= 1 ? 0 : 0,
-            N >= 2 ? 1 : 0);
-      }
-
-      RAJA_INLINE
-      __m256i createPermute2(camp::idx_t N) const {
-        // Generate a permutation for second round of min/max routines
-        return  _mm256_set_epi32(
-            N >= 6 ? 5 : 0,
-            N >= 5 ? 4 : 0,
-            N >= 8 ? 7 : 0,
-            N >= 7 ? 6 : 0,
-            N >= 2 ? 1 : 0,
-            N >= 1 ? 0 : 0,
-            N >= 4 ? 3 : 0,
-            N >= 2 ? 2 : 0);
-      }
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 8;
-
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-      RAJA_INLINE
-      Register() : m_value(_mm256_setzero_si256()) {
-      }
-
-      /*!
-       * @brief Construct register with explicit values
-       */
-      RAJA_INLINE
-      Register(element_type x0,
-                     element_type x1,
-                     element_type x2,
-                     element_type x3,
-                     element_type x4,
-                     element_type x5,
-                     element_type x6,
-                     element_type x7) :
-        m_value(_mm256_set_epi32(x7,x6,x5,x4,x3,x2,x1,x0))
-      {}
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_INLINE
-      explicit Register(register_type const &c) : m_value(c) {}
-
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      Register(self_type const &c) : base_type(c), m_value(c.m_value) {}
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-
-      /*!
-       * @brief Construct from scalar.
-       * Sets all elements to same value (broadcast).
-       */
-      RAJA_INLINE
-      Register(element_type const &c) : m_value(_mm256_set1_epi32(c)) {}
-
-
-      /*!
-       * @brief Returns underlying SIMD register.
-       */
-      RAJA_INLINE
-      constexpr
-      register_type get_register() const {
-        return m_value;
-      }
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
-        m_value = _mm256_loadu_si256((__m256i const *)ptr);
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
-        m_value = _mm256_maskload_epi32(ptr, createMask(N));
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, camp::idx_t stride){
-        m_value = _mm256_i32gather_epi32(ptr,
-                                      createStridedOffsets(stride),
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t stride, camp::idx_t N){
-        m_value = _mm256_mask_i32gather_epi32(_mm256_setzero_si256(),
-                                      ptr,
-                                      createStridedOffsets(stride),
-                                      createMask(N),
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
-        _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
-        _mm256_maskstore_epi32(ptr, createMask(N), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t stride) const{
-        for(camp::idx_t i = 0;i < 8;++ i){
-          ptr[i*stride] = get(i);
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t stride, camp::idx_t N) const{
-        for(camp::idx_t i = 0;i < N;++ i){
-          ptr[i*stride] = get(i);
-        }
-        return *this;
-      }
-
-
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      RAJA_INLINE
-      element_type get(camp::idx_t i) const
-      {
-        // got to be a nicer way to do this!?!?
-        switch(i){
-          case 0: return _mm256_extract_epi32(m_value, 0);
-          case 1: return _mm256_extract_epi32(m_value, 1);
-          case 2: return _mm256_extract_epi32(m_value, 2);
-          case 3: return _mm256_extract_epi32(m_value, 3);
-          case 4: return _mm256_extract_epi32(m_value, 4);
-          case 5: return _mm256_extract_epi32(m_value, 5);
-          case 6: return _mm256_extract_epi32(m_value, 6);
-          case 7: return _mm256_extract_epi32(m_value, 7);
-        }
-        return 0;
-      }
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      self_type &set(element_type value, camp::idx_t i)
-      {
-        // got to be a nicer way to do this!?!?
-        switch(i){
-          case 0: m_value = _mm256_insert_epi32(m_value, value, 0); break;
-          case 1: m_value = _mm256_insert_epi32(m_value, value, 1); break;
-          case 2: m_value = _mm256_insert_epi32(m_value, value, 2); break;
-          case 3: m_value = _mm256_insert_epi32(m_value, value, 3); break;
-          case 4: m_value = _mm256_insert_epi32(m_value, value, 4); break;
-          case 5: m_value = _mm256_insert_epi32(m_value, value, 5); break;
-          case 6: m_value = _mm256_insert_epi32(m_value, value, 6); break;
-          case 7: m_value = _mm256_insert_epi32(m_value, value, 7); break;
-        }
-
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &value){
-        m_value =  _mm256_set1_epi32(value);
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(_mm256_add_epi32(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(_mm256_sub_epi32(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-
-        // the AVX2 epi32 multiply only multiplies the even elements
-        // and provides 64-bit results
-        // need to do some repacking to get this to work
-
-        // multiply 0, 2, 4, 6
-        auto prod_even = _mm256_mul_epi32(m_value, b.m_value);
-
-        // Swap 32-bit words
-        auto sh_a = _mm256_castps_si256(
-            _mm256_permute_ps(_mm256_castsi256_ps(m_value), 0xB1));
-
-        auto sh_b = _mm256_castps_si256(
-                    _mm256_permute_ps(_mm256_castsi256_ps(b.m_value), 0xB1));
-
-        // multiply 1, 3, 5, 7
-        auto prod_odd = _mm256_mul_epi32(sh_a, sh_b);
-
-        // Stitch prod_odd and prod_even back together
-        auto sh_odd = _mm256_castps_si256(
-                    _mm256_permute_ps(_mm256_castsi256_ps(prod_odd), 0xB1));
-
-        return self_type(_mm256_blend_epi32(prod_even, sh_odd, 0xAA));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        // AVX2 does not supply an integer divide, so do it manually
-        return self_type(_mm256_set_epi32(
-            get(7)/b.get(7),
-            get(6)/b.get(6),
-            get(5)/b.get(5),
-            get(4)/b.get(4),
-            get(3)/b.get(3),
-            get(2)/b.get(2),
-            get(1)/b.get(1),
-            get(0)/b.get(0)
-            ));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t N) const {
-        // AVX2 does not supply an integer divide, so do it manually
-        return self_type(_mm256_set_epi32(
-            N >= 8 ? get(7)/b.get(7) : 0,
-            N >= 7 ? get(6)/b.get(6) : 0,
-            N >= 6 ? get(5)/b.get(5) : 0,
-            N >= 5 ? get(4)/b.get(4) : 0,
-            N >= 4 ? get(3)/b.get(3) : 0,
-            N >= 3 ? get(2)/b.get(2) : 0,
-            N >= 2 ? get(1)/b.get(1) : 0,
-            N >= 1 ? get(0)/b.get(0) : 0
-            ));
-      }
-
-
-
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      element_type sum() const
-      {
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_castps_si256(
-            _mm256_permute_ps(_mm256_castsi256_ps(m_value), 0xB1) );
-        auto red1 = _mm256_add_epi32(m_value, sh1);
-
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_castps_si256(
-            _mm256_permute_ps(_mm256_castsi256_ps(red1), 0x4E));
-        auto red2 = _mm256_add_epi32(red1, sh2);
-
-        return _mm256_extract_epi32(red2, 0) + _mm256_extract_epi32(red2, 4);
-      }
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max() const
-      {
-
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_permutevar8x32_epi32(m_value, createPermute1(8));
-        auto red1 = _mm256_max_epi32(m_value, sh1);
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_permutevar8x32_epi32(red1, createPermute2(8));
-        auto red2 = _mm256_max_epi32(red1, sh2);
-
-        return std::max<element_type>(_mm256_extract_epi32(red2, 0), _mm256_extract_epi32(red2, 4));
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max_n(camp::idx_t N) const
-      {
-        // Some simple cases
-        if(N <= 0 || N > 8){
-          return RAJA::operators::limits<int32_t>::min();
-        }
-        if(N == 1){
-          return get(0);
-        }
-
-        if(N == 2){
-          return std::max<element_type>(get(0), get(1));
-        }
-
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_permutevar8x32_epi32(m_value, createPermute1(N));
-        auto red1 = _mm256_max_epi32(m_value, sh1);
-
-        if(N == 3){
-          return std::max<element_type>(_mm256_extract_epi32(red1, 0), get(2));
-        }
-        if(N == 4){
-          return std::max<element_type>(_mm256_extract_epi32(red1, 0), _mm256_extract_epi32(red1, 2));
-        }
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_permutevar8x32_epi32(red1, createPermute2(N));
-        auto red2 = _mm256_max_epi32(red1, sh2);
-
-        return std::max<element_type>(_mm256_extract_epi32(red2, 0), _mm256_extract_epi32(red2, 4));
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmax(self_type a) const
-      {
-        return self_type(_mm256_max_epi32(m_value, a.m_value));
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min() const
-      {
-
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_permutevar8x32_epi32(m_value, createPermute1(8));
-        auto red1 = _mm256_min_epi32(m_value, sh1);
-
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_permutevar8x32_epi32(red1, createPermute2(8));
-        auto red2 = _mm256_min_epi32(red1, sh2);
-
-        return std::min<element_type>(_mm256_extract_epi32(red2, 0), _mm256_extract_epi32(red2, 4));
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min_n(camp::idx_t N) const
-      {
-        // Some simple cases
-        if(N <= 0 || N > 8){
-          return RAJA::operators::limits<int32_t>::max();
-        }
-        if(N == 1){
-          return get(0);
-        }
-
-        if(N == 2){
-          return std::min<element_type>(get(0), get(1));
-        }
-
-        // swap odd-even pairs and add
-        auto sh1 = _mm256_permutevar8x32_epi32(m_value, createPermute1(N));
-        auto red1 = _mm256_min_epi32(m_value, sh1);
-
-        if(N == 3){
-          return std::min<element_type>(_mm256_extract_epi32(red1, 0), get(2));
-        }
-        if(N == 4){
-          return std::min<element_type>(_mm256_extract_epi32(red1, 0), _mm256_extract_epi32(red1, 2));
-        }
-
-        // swap odd-even quads and add
-        auto sh2 = _mm256_permutevar8x32_epi32(red1, createPermute2(N));
-        auto red2 = _mm256_min_epi32(red1, sh2);
-
-        return std::min<element_type>(_mm256_extract_epi32(red2, 0), _mm256_extract_epi32(red2, 4));
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmin(self_type a) const
-      {
-        return self_type(_mm256_min_epi32(m_value, a.m_value));
-      }
-  };
-
-
-}   // namespace expt
+
+    // swap odd-even pairs and take the element-wise max
+    auto sh1  = _mm256_permutevar8x32_epi32(m_value, createPermute1(8));
+    auto red1 = _mm256_max_epi32(m_value, sh1);
+
+    // swap odd-even quads and take the element-wise max
+    auto sh2  = _mm256_permutevar8x32_epi32(red1, createPermute2(8));
+    auto red2 = _mm256_max_epi32(red1, sh2);
+
+    return std::max<element_type>(_mm256_extract_epi32(red2, 0),
+                                  _mm256_extract_epi32(red2, 4));
+  }
+
+  /*!
+   * @brief Returns the largest of the first N elements
+   * @return Largest of elements 0..N-1; limits<int32_t>::min() if N not in 1..8
+   */
+  RAJA_INLINE
+  element_type max_n(camp::idx_t N) const
+  {
+    // Some simple cases: invalid N, or few enough elements for scalar compares
+    if (N <= 0 || N > 8)
+    {
+      return RAJA::operators::limits<int32_t>::min();
+    }
+    if (N == 1)
+    {
+      return get(0);
+    }
+
+    if (N == 2)
+    {
+      return std::max<element_type>(get(0), get(1));
+    }
+
+    // swap odd-even pairs and take the element-wise max
+    auto sh1  = _mm256_permutevar8x32_epi32(m_value, createPermute1(N));
+    auto red1 = _mm256_max_epi32(m_value, sh1);
+
+    if (N == 3)
+    {
+      return std::max<element_type>(_mm256_extract_epi32(red1, 0), get(2));
+    }
+    if (N == 4)
+    {
+      return std::max<element_type>(_mm256_extract_epi32(red1, 0),
+                                    _mm256_extract_epi32(red1, 2));
+    }
+
+    // swap odd-even quads and take the element-wise max
+    auto sh2  = _mm256_permutevar8x32_epi32(red1, createPermute2(N));
+    auto red2 = _mm256_max_epi32(red1, sh2);
+
+    return std::max<element_type>(_mm256_extract_epi32(red2, 0),
+                                  _mm256_extract_epi32(red2, 4));
+  }
+
+  /*!
+   * @brief Element-wise maximum of this register and a
+   * @return Register whose element i is the larger of this[i] and a[i]
+   */
+  RAJA_INLINE
+  self_type vmax(self_type a) const
+  {
+    return self_type(_mm256_max_epi32(m_value, a.m_value));
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min() const
+  {
+
+    // swap odd-even pairs and take the element-wise min
+    auto sh1  = _mm256_permutevar8x32_epi32(m_value, createPermute1(8));
+    auto red1 = _mm256_min_epi32(m_value, sh1);
+
+
+    // swap odd-even quads and take the element-wise min
+    auto sh2  = _mm256_permutevar8x32_epi32(red1, createPermute2(8));
+    auto red2 = _mm256_min_epi32(red1, sh2);
+
+    return std::min<element_type>(_mm256_extract_epi32(red2, 0),
+                                  _mm256_extract_epi32(red2, 4));
+  }
+
+  /*!
+   * @brief Returns the smallest of the first N elements
+   * @return Smallest of elements 0..N-1; limits<int32_t>::max() if N not in 1..8
+   */
+  RAJA_INLINE
+  element_type min_n(camp::idx_t N) const
+  {
+    // Some simple cases: invalid N, or few enough elements for scalar compares
+    if (N <= 0 || N > 8)
+    {
+      return RAJA::operators::limits<int32_t>::max();
+    }
+    if (N == 1)
+    {
+      return get(0);
+    }
+
+    if (N == 2)
+    {
+      return std::min<element_type>(get(0), get(1));
+    }
+
+    // swap odd-even pairs and take the element-wise min
+    auto sh1  = _mm256_permutevar8x32_epi32(m_value, createPermute1(N));
+    auto red1 = _mm256_min_epi32(m_value, sh1);
+
+    if (N == 3)
+    {
+      return std::min<element_type>(_mm256_extract_epi32(red1, 0), get(2));
+    }
+    if (N == 4)
+    {
+      return std::min<element_type>(_mm256_extract_epi32(red1, 0),
+                                    _mm256_extract_epi32(red1, 2));
+    }
+
+    // swap odd-even quads and take the element-wise min
+    auto sh2  = _mm256_permutevar8x32_epi32(red1, createPermute2(N));
+    auto red2 = _mm256_min_epi32(red1, sh2);
+
+    return std::min<element_type>(_mm256_extract_epi32(red2, 0),
+                                  _mm256_extract_epi32(red2, 4));
+  }
+
+  /*!
+   * @brief Element-wise minimum of this register and a
+   * @return Register whose element i is the smaller of this[i] and a[i]
+   */
+  RAJA_INLINE
+  self_type vmin(self_type a) const
+  {
+    return self_type(_mm256_min_epi32(m_value, a.m_value));
+  }
+};
+
+
+}  // namespace expt
 
 }  // namespace RAJA
 
 
 #endif
 
-#endif //__AVX2__
+#endif  //__AVX2__
diff --git a/include/RAJA/policy/tensor/arch/avx2/avx2_int64.hpp b/include/RAJA/policy/tensor/arch/avx2/avx2_int64.hpp
index 00eea542cd..be7886007d 100644
--- a/include/RAJA/policy/tensor/arch/avx2/avx2_int64.hpp
+++ b/include/RAJA/policy/tensor/arch/avx2/avx2_int64.hpp
@@ -28,524 +28,534 @@
 #include <immintrin.h>
 #include <cmath>
 
-
 namespace RAJA
 {
 namespace expt
 {
-  template<>
-  class Register<int64_t, avx2_register> :
-    public internal::expt::RegisterBase<Register<int64_t, avx2_register>>
+template<>
+class Register<int64_t, avx2_register>
+    : public internal::expt::RegisterBase<Register<int64_t, avx2_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<int64_t, avx2_register>>;
+
+  using register_policy = avx2_register;
+  using self_type       = Register<int64_t, avx2_register>;
+  using element_type    = int64_t;
+  using register_type   = __m256i;
+
+  using int_vector_type = Register<int64_t, avx2_register>;
+
+private:
+  register_type m_value;
+
+  RAJA_INLINE
+  __m256i createMask(camp::idx_t N) const
+  {
+    // All bits set in 64-bit elements 0..N-1 (N capped at 4), zero elsewhere
+    return _mm256_set_epi64x(N >= 4 ? -1 : 0, N >= 3 ? -1 : 0, N >= 2 ? -1 : 0,
+                             N >= 1 ? -1 : 0);
+  }
+
+  RAJA_INLINE
+  __m256i createStridedOffsets(camp::idx_t stride) const
+  {
+    // Element offsets for elements 0..3: {0, stride, 2 * stride, 3 * stride}
+    return _mm256_set_epi64x(3 * stride, 2 * stride, stride, 0);
+  }
+
+  /*
+   * Permute the 64-bit elements of x with the packed-double permute
+   * function, because there isn't one specifically for int64.
+   * perm is the immediate control value handed to _mm256_permute_pd.
+   * Just adds a bunch of casting, should be same cost
+   */
+  template<int perm>
+  RAJA_INLINE __m256i permute(__m256i x) const
+  {
+    return _mm256_castpd_si256(_mm256_permute_pd(_mm256_castsi256_pd(x), perm));
+  }
+
+public:
+  static constexpr camp::idx_t s_num_elem = 4;
+
+  /*!
+   * @brief Default constructor, sets all four elements to zero
+   */
+  RAJA_INLINE
+  Register() : m_value(_mm256_setzero_si256()) {}
+
+  /*!
+   * @brief Construct register with explicit values; x0 is element 0 ... x3 is 3
+   */
+  RAJA_INLINE
+  Register(element_type x0, element_type x1, element_type x2, element_type x3)
+      : m_value(_mm256_set_epi64x(x3, x2, x1, x0))
+  {}
+
+  /*!
+   * @brief Construct from a raw underlying simd register value (explicit only)
+   */
+  RAJA_INLINE
+  explicit Register(register_type const& c) : m_value(c) {}
+
+  /*!
+   * @brief Copy constructor; copies the base sub-object and the simd value
+   */
+  RAJA_INLINE
+  Register(self_type const& c) : base_type(c), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator; copies the simd value and returns *this
+   */
+  RAJA_INLINE
+  self_type& operator=(self_type const& c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+  /*!
+   * @brief Construct from scalar (non-explicit, so scalars convert implicitly).
+   * Sets all four elements to the same value (broadcast).
+   */
+  RAJA_INLINE
+  Register(element_type const& c) : m_value(_mm256_set1_epi64x(c)) {}
+
+  /*!
+   * @brief Returns a copy of the underlying SIMD register value.
+   */
+  RAJA_INLINE
+  constexpr register_type get_register() const { return m_value; }
+
+  /*!
+   * @brief Load all four elements from ptr[0..3] with an unaligned load
+   * @return Reference to this register, now holding the loaded values
+   */
+  RAJA_INLINE
+  self_type& load_packed(element_type const* ptr)
+  {
+    m_value = _mm256_loadu_si256(reinterpret_cast<__m256i const*>(ptr));
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *        Masked-off elements become 0; the pd maskload only moves raw bits.
+   */
+  RAJA_INLINE
+  self_type& load_packed_n(element_type const* ptr, camp::idx_t N)
+  {
+    m_value = _mm256_castpd_si256(_mm256_maskload_pd(
+        reinterpret_cast<double const*>(ptr), createMask(N)));
+    return *this;
+  }
+
+  /*!
+   * @brief Gather all four elements; element i is read from ptr[i * stride]
+   * @return Reference to this register, now holding the gathered values
+   */
+  RAJA_INLINE
+  self_type& load_strided(element_type const* ptr, camp::idx_t stride)
+  {
+    m_value = _mm256_i64gather_epi64(reinterpret_cast<long long const*>(ptr),
+                                     createStridedOffsets(stride),
+                                     sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Gather the first N elements from a strided memory location;
+   *        element i (i < N) is read from ptr[i * stride].
+   *        Masked-off elements take the zero source operand, i.e. become 0.
+   */
+  RAJA_INLINE
+  self_type& load_strided_n(element_type const* ptr,
+                            camp::idx_t stride,
+                            camp::idx_t N)
+  {
+    m_value = _mm256_mask_i64gather_epi64(
+        _mm256_set1_epi64x(0), reinterpret_cast<long long const*>(ptr),
+        createStridedOffsets(stride), createMask(N), sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Generic gather operation for full vector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
+  self_type& gather(element_type const* ptr, int_vector_type offsets)
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<int64_t, avx2_register>>;
-
-      using register_policy = avx2_register;
-      using self_type = Register<int64_t, avx2_register>;
-      using element_type = int64_t;
-      using register_type = __m256i;
-
-      using int_vector_type = Register<int64_t, avx2_register>;
-
-    private:
-      register_type m_value;
-
-      RAJA_INLINE
-      __m256i createMask(camp::idx_t N) const {
-        // Generate a mask
-        return  _mm256_set_epi64x(
-            N >= 4 ? -1 : 0,
-            N >= 3 ? -1 : 0,
-            N >= 2 ? -1 : 0,
-            N >= 1 ? -1 : 0);
-      }
-
-      RAJA_INLINE
-      __m256i createStridedOffsets(camp::idx_t stride) const {
-        // Generate a strided offset list
-        return  _mm256_set_epi64x(3*stride, 2*stride, stride, 0);
-      }
-
-      /*
-       * Use the packed-double permute function because there isn't one
-       * specifically for int64
-       *
-       * Just adds a bunch of casting, should be same cost
-       */
-      template<int perm>
-      RAJA_INLINE
-      __m256i permute(__m256i x) const {
-        return _mm256_castpd_si256(
-            _mm256_permute_pd(_mm256_castsi256_pd(x), perm));
-      }
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 4;
-
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-      RAJA_INLINE
-      Register() : m_value(_mm256_setzero_si256()) {
-      }
-
-      /*!
-       * @brief Construct register with explicit values
-       */
-      RAJA_INLINE
-      Register(element_type x0,
-                     element_type x1,
-                     element_type x2,
-                     element_type x3) :
-        m_value(_mm256_set_epi64x(x3,x2,x1,x0))
-      {}
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_INLINE
-      explicit Register(register_type const &c) : m_value(c) {}
-
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      Register(self_type const &c) : base_type(c), m_value(c.m_value) {}
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-
-      /*!
-       * @brief Construct from scalar.
-       * Sets all elements to same value (broadcast).
-       */
-      RAJA_INLINE
-      Register(element_type const &c) : m_value(_mm256_set1_epi64x(c)) {}
-
-
-      /*!
-       * @brief Returns underlying SIMD register.
-       */
-      RAJA_INLINE
-      constexpr
-      register_type get_register() const {
-        return m_value;
-      }
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
-        m_value = _mm256_loadu_si256(reinterpret_cast<__m256i const *>(ptr));
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
-        m_value = _mm256_castpd_si256(
-            _mm256_maskload_pd(reinterpret_cast<double const *>(ptr), createMask(N))
-        );
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided(int64_t const *ptr, camp::idx_t stride){
-        m_value = _mm256_i64gather_epi64(reinterpret_cast<long long const *>(ptr),
-                                      createStridedOffsets(stride),
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t stride, camp::idx_t N){
-        m_value = _mm256_mask_i64gather_epi64(_mm256_set1_epi64x(0),
-                                      reinterpret_cast<long long const *>(ptr),
-                                      createStridedOffsets(stride),
-                                      createMask(N),
-                                      sizeof(element_type));
-        return *this;
-      }
-
-      /*!
-       * @brief Generic gather operation for full vector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      self_type &gather(element_type const *ptr, int_vector_type offsets){
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_load_strided_n ++;
+    RAJA::tensor_stats::num_vector_load_strided_n++;
 #endif
-        m_value = _mm256_i64gather_epi64(reinterpret_cast<long long const *>(ptr),
-                                      offsets.get_register(),
-                                      sizeof(element_type));
-        return *this;
-      }
-
-      /*!
-       * @brief Generic gather operation for n-length subvector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      self_type &gather_n(element_type const *ptr, int_vector_type offsets, camp::idx_t N){
+    m_value =
+        _mm256_i64gather_epi64(reinterpret_cast<long long const*>(ptr),
+                               offsets.get_register(), sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Generic gather operation for n-length subvector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
+  self_type& gather_n(element_type const* ptr,
+                      int_vector_type offsets,
+                      camp::idx_t N)
+  {
 #ifdef RAJA_ENABLE_VECTOR_STATS
-          RAJA::tensor_stats::num_vector_load_strided_n ++;
+    RAJA::tensor_stats::num_vector_load_strided_n++;
 #endif
-        m_value = _mm256_mask_i64gather_epi64(_mm256_setzero_si256(),
-                                      reinterpret_cast<long long const *>(ptr),
-                                      offsets.get_register(),
-                                      createMask(N),
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
-        _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
-        _mm256_maskstore_epi64(reinterpret_cast<long long*>(ptr), createMask(N), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t stride) const{
-        for(camp::idx_t i = 0;i < 4;++ i){
-          ptr[i*stride] = m_value[i];
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t stride, camp::idx_t N) const{
-        for(camp::idx_t i = 0;i < N;++ i){
-          ptr[i*stride] = m_value[i];
-        }
-        return *this;
-      }
-
-
-
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      RAJA_INLINE
-      element_type get(camp::idx_t i) const
-      {
-        // got to be a nicer way to do this!?!?
-        switch(i){
-          case 0: return _mm256_extract_epi64(m_value, 0);
-          case 1: return _mm256_extract_epi64(m_value, 1);
-          case 2: return _mm256_extract_epi64(m_value, 2);
-          case 3: return _mm256_extract_epi64(m_value, 3);
-        }
-        return 0;
-      }
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      self_type &set(element_type value, camp::idx_t i)
-      {
-        // got to be a nicer way to do this!?!?
-        switch(i){
-          case 0: m_value = _mm256_insert_epi64(m_value, value, 0); break;
-          case 1: m_value = _mm256_insert_epi64(m_value, value, 1); break;
-          case 2: m_value = _mm256_insert_epi64(m_value, value, 2); break;
-          case 3: m_value = _mm256_insert_epi64(m_value, value, 3); break;
-        }
-
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &value){
-        m_value =  _mm256_set1_epi64x(value);
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(_mm256_add_epi64(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(_mm256_sub_epi64(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        // AVX2 does not supply an int64_t multiply, so do it manually
-        return self_type(_mm256_set_epi64x(
-            get(3)*b.get(3),
-            get(2)*b.get(2),
-            get(1)*b.get(1),
-            get(0)*b.get(0)
-            ));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        // AVX2 does not supply an integer divide, so do it manually
-        return self_type(_mm256_set_epi64x(
-            get(3)/b.get(3),
-            get(2)/b.get(2),
-            get(1)/b.get(1),
-            get(0)/b.get(0)
-            ));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t N) const {
-        // AVX2 does not supply an integer divide, so do it manually
-        return self_type(_mm256_set_epi64x(
-            N >= 4 ? get(3)/b.get(3) : 0,
-            N >= 3 ? get(2)/b.get(2) : 0,
-            N >= 2 ? get(1)/b.get(1) : 0,
-            N >= 1 ? get(0)/b.get(0) : 0
-            ));
-      }
-
-
-
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      element_type sum() const
-      {
-
-        // swap pairs and add
-        auto sh1 = permute<0x5>(m_value);
-        auto red1 = _mm256_add_epi64(m_value, sh1);
-
-        // add lower and upper
-        return _mm256_extract_epi64(red1, 0) + _mm256_extract_epi64(red1, 2);
-      }
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max() const
-      {
-        // AVX2 does not supply an 64bit integer max?!?
-        auto red = get(0);
-
-        auto v1 = get(1);
-        red = red < v1 ? v1 : red;
-
-        auto v2 = get(2);
-        red = red < v2 ? v2 : red;
-
-        auto v3 = get(3);
-        red = red < v3 ? v3 : red;
-
-        return red;
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max_n(camp::idx_t N) const
-      {
-        if(N <= 0 || N > 4){
-          return RAJA::operators::limits<int64_t>::min();
-        }
-
-        // AVX2 does not supply an 64bit integer max?!?
-        auto red = get(0);
-
-        if(N > 1){
-          auto v1 = get(1);
-          red = red < v1 ? v1 : red;
-        }
-        if(N > 2){
-          auto v2 = get(2);
-          red = red < v2 ? v2 : red;
-        }
-        if(N > 3){
-          auto v3 = get(3);
-          red = red < v3 ? v3 : red;
-        }
-
-        return red;
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmax(self_type a) const
-      {
-          return self_type(_mm256_set_epi64x(
-              get(3) > a.get(3) ? get(3) : a.get(3),
-              get(2) > a.get(2) ? get(2) : a.get(2),
-              get(1) > a.get(1) ? get(1) : a.get(1),
-              get(0) > a.get(0) ? get(0) : a.get(0) ));
-        
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min() const
-      {
-        // AVX2 does not supply an 64bit integer max?!?
-        auto red = get(0);
-
-        auto v1 = get(1);
-        red = red > v1 ? v1 : red;
-
-        auto v2 = get(2);
-        red = red > v2 ? v2 : red;
-
-        auto v3 = get(3);
-        red = red > v3 ? v3 : red;
-
-        return red;
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min_n(camp::idx_t N) const
-      {
-        if(N <= 0 || N > 4){
-          return RAJA::operators::limits<int64_t>::max();
-        }
-
-        // AVX2 does not supply an 64bit integer max?!?
-        auto red = get(0);
-
-        if(N > 1){
-          auto v1 = get(1);
-          red = red > v1 ? v1 : red;
-        }
-        if(N > 2){
-          auto v2 = get(2);
-          red = red > v2 ? v2 : red;
-        }
-        if(N > 3){
-          auto v3 = get(3);
-          red = red > v3 ? v3 : red;
-        }
-
-        return red;
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmin(self_type a) const
-      {
-          return self_type(_mm256_set_epi64x(
-              get(3) < a.get(3) ? get(3) : a.get(3),
-              get(2) < a.get(2) ? get(2) : a.get(2),
-              get(1) < a.get(1) ? get(1) : a.get(1),
-              get(0) < a.get(0) ? get(0) : a.get(0) ));
-        
-      }
-  };
-
-
-}   // namespace expt
+    m_value = _mm256_mask_i64gather_epi64(
+        _mm256_setzero_si256(), reinterpret_cast<long long const*>(ptr),
+        offsets.get_register(), createMask(N), sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed(element_type* ptr) const
+  {
+    _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store the first N elements to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed_n(element_type* ptr, camp::idx_t N) const
+  {
+    _mm256_maskstore_epi64(reinterpret_cast<long long*>(ptr), createMask(N),
+                           m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_strided(element_type* ptr, camp::idx_t stride) const
+  {
+    for (camp::idx_t i = 0; i < 4; ++i)
+    {
+      ptr[i * stride] = m_value[i];
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Store the first N elements to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_strided_n(element_type* ptr,
+                                   camp::idx_t stride,
+                                   camp::idx_t N) const
+  {
+    for (camp::idx_t i = 0; i < N; ++i)
+    {
+      ptr[i * stride] = m_value[i];
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i
+   */
+  RAJA_INLINE
+  element_type get(camp::idx_t i) const
+  {
+    // got to be a nicer way to do this!?!?
+    switch (i)
+    {
+      case 0:
+        return _mm256_extract_epi64(m_value, 0);
+      case 1:
+        return _mm256_extract_epi64(m_value, 1);
+      case 2:
+        return _mm256_extract_epi64(m_value, 2);
+      case 3:
+        return _mm256_extract_epi64(m_value, 3);
+    }
+    return 0;
+  }
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param i Offset of scalar to set
+   * @param value Value of scalar to set
+   */
+  RAJA_INLINE
+  self_type& set(element_type value, camp::idx_t i)
+  {
+    // got to be a nicer way to do this!?!?
+    switch (i)
+    {
+      case 0:
+        m_value = _mm256_insert_epi64(m_value, value, 0);
+        break;
+      case 1:
+        m_value = _mm256_insert_epi64(m_value, value, 1);
+        break;
+      case 2:
+        m_value = _mm256_insert_epi64(m_value, value, 2);
+        break;
+      case 3:
+        m_value = _mm256_insert_epi64(m_value, value, 3);
+        break;
+    }
+
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& broadcast(element_type const& value)
+  {
+    m_value = _mm256_set1_epi64x(value);
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& copy(self_type const& src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type add(self_type const& b) const
+  {
+    return self_type(_mm256_add_epi64(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type subtract(self_type const& b) const
+  {
+    return self_type(_mm256_sub_epi64(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type multiply(self_type const& b) const
+  {
+    // AVX2 does not supply an int64_t multiply, so do it manually
+    return self_type(_mm256_set_epi64x(get(3) * b.get(3), get(2) * b.get(2),
+                                       get(1) * b.get(1), get(0) * b.get(0)));
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type divide(self_type const& b) const
+  {
+    // AVX2 does not supply an integer divide, so do it manually
+    return self_type(_mm256_set_epi64x(get(3) / b.get(3), get(2) / b.get(2),
+                                       get(1) / b.get(1), get(0) / b.get(0)));
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type divide_n(self_type const& b, camp::idx_t N) const
+  {
+    // AVX2 does not supply an integer divide, so do it manually
+    return self_type(_mm256_set_epi64x(
+        N >= 4 ? get(3) / b.get(3) : 0, N >= 3 ? get(2) / b.get(2) : 0,
+        N >= 2 ? get(1) / b.get(1) : 0, N >= 1 ? get(0) / b.get(0) : 0));
+  }
+
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vectors scalar elements
+   */
+  RAJA_INLINE
+  element_type sum() const
+  {
+
+    // swap pairs and add
+    auto sh1  = permute<0x5>(m_value);
+    auto red1 = _mm256_add_epi64(m_value, sh1);
+
+    // add lower and upper
+    return _mm256_extract_epi64(red1, 0) + _mm256_extract_epi64(red1, 2);
+  }
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max() const
+  {
+    // AVX2 does not supply an 64bit integer max?!?
+    auto red = get(0);
+
+    auto v1 = get(1);
+    red     = red < v1 ? v1 : red;
+
+    auto v2 = get(2);
+    red     = red < v2 ? v2 : red;
+
+    auto v3 = get(3);
+    red     = red < v3 ? v3 : red;
+
+    return red;
+  }
+
+  /*!
+   * @brief Returns the largest of the first N elements
+   * @return Largest of first N elements; limits::min() if N <= 0 or N > 4
+   */
+  RAJA_INLINE
+  element_type max_n(camp::idx_t N) const
+  {
+    if (N <= 0 || N > 4)
+    {
+      return RAJA::operators::limits<int64_t>::min();
+    }
+
+    // AVX2 does not supply an 64bit integer max?!?
+    auto red = get(0);
+
+    if (N > 1)
+    {
+      auto v1 = get(1);
+      red     = red < v1 ? v1 : red;
+    }
+    if (N > 2)
+    {
+      auto v2 = get(2);
+      red     = red < v2 ? v2 : red;
+    }
+    if (N > 3)
+    {
+      auto v3 = get(3);
+      red     = red < v3 ? v3 : red;
+    }
+
+    return red;
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+  self_type vmax(self_type a) const
+  {
+    return self_type(_mm256_set_epi64x(get(3) > a.get(3) ? get(3) : a.get(3),
+                                       get(2) > a.get(2) ? get(2) : a.get(2),
+                                       get(1) > a.get(1) ? get(1) : a.get(1),
+                                       get(0) > a.get(0) ? get(0) : a.get(0)));
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min() const
+  {
+    // AVX2 does not supply a 64bit integer min?!?
+    auto red = get(0);
+
+    auto v1 = get(1);
+    red     = red > v1 ? v1 : red;
+
+    auto v2 = get(2);
+    red     = red > v2 ? v2 : red;
+
+    auto v3 = get(3);
+    red     = red > v3 ? v3 : red;
+
+    return red;
+  }
+
+  /*!
+   * @brief Returns the smallest of the first N elements
+   * @return Smallest of first N elements; limits::max() if N <= 0 or N > 4
+   */
+  RAJA_INLINE
+  element_type min_n(camp::idx_t N) const
+  {
+    if (N <= 0 || N > 4)
+    {
+      return RAJA::operators::limits<int64_t>::max();
+    }
+
+    // AVX2 does not supply a 64bit integer min?!?
+    auto red = get(0);
+
+    if (N > 1)
+    {
+      auto v1 = get(1);
+      red     = red > v1 ? v1 : red;
+    }
+    if (N > 2)
+    {
+      auto v2 = get(2);
+      red     = red > v2 ? v2 : red;
+    }
+    if (N > 3)
+    {
+      auto v3 = get(3);
+      red     = red > v3 ? v3 : red;
+    }
+
+    return red;
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  self_type vmin(self_type a) const
+  {
+    return self_type(_mm256_set_epi64x(get(3) < a.get(3) ? get(3) : a.get(3),
+                                       get(2) < a.get(2) ? get(2) : a.get(2),
+                                       get(1) < a.get(1) ? get(1) : a.get(1),
+                                       get(0) < a.get(0) ? get(0) : a.get(0)));
+  }
+};
+
+
+}  // namespace expt
 
 }  // namespace RAJA
 
 
 #endif
 
-#endif //__AVX2__
+#endif  //__AVX2__
diff --git a/include/RAJA/policy/tensor/arch/avx2/traits.hpp b/include/RAJA/policy/tensor/arch/avx2/traits.hpp
index e95c661335..1265f39d52 100644
--- a/include/RAJA/policy/tensor/arch/avx2/traits.hpp
+++ b/include/RAJA/policy/tensor/arch/avx2/traits.hpp
@@ -20,56 +20,60 @@
 #ifndef RAJA_policy_tensor_arch_avx2_traits_HPP
 #define RAJA_policy_tensor_arch_avx2_traits_HPP
 
-
-namespace RAJA {
-namespace internal {
-namespace expt {
-
-
-
-  template<>
-  struct RegisterTraits<RAJA::expt::avx2_register, int32_t>{
-      using element_type = int32_t;
-      using register_policy = RAJA::expt::avx2_register;
-      static constexpr camp::idx_t s_num_bits = 256;
-      static constexpr camp::idx_t s_num_elem = 8;
-      using int_element_type = int32_t;
-  };
-
-  template<>
-  struct RegisterTraits<RAJA::expt::avx2_register, int64_t>{
-      using element_type = int64_t;
-      using register_policy = RAJA::expt::avx2_register;
-      static constexpr camp::idx_t s_num_bits = 256;
-      static constexpr camp::idx_t s_num_elem = 4;
-      using int_element_type = int64_t;
-  };
-
-  template<>
-  struct RegisterTraits<RAJA::expt::avx2_register, float>{
-      using element_type = float;
-      using register_policy = RAJA::expt::avx2_register;
-      static constexpr camp::idx_t s_num_bits = 256;
-      static constexpr camp::idx_t s_num_elem = 8;
-      using int_element_type = int32_t;
-  };
-
-  template<>
-  struct RegisterTraits<RAJA::expt::avx2_register, double>{
-      using element_type = double;
-      using register_policy = RAJA::expt::avx2_register;
-      static constexpr camp::idx_t s_num_bits = 256;
-      static constexpr camp::idx_t s_num_elem = 4;
-      using int_element_type = int64_t;
-  };
-
-} // namespace intenral
-} // namespace expt
-} // namespace RAJA
-
-
-#endif // guard
-
-
-
-#endif // __AVX2__
+namespace RAJA
+{
+namespace internal
+{
+namespace expt
+{
+
+
+template<>
+struct RegisterTraits<RAJA::expt::avx2_register, int32_t>
+{
+  using element_type                      = int32_t;
+  using register_policy                   = RAJA::expt::avx2_register;
+  static constexpr camp::idx_t s_num_bits = 256;
+  static constexpr camp::idx_t s_num_elem = 8;
+  using int_element_type                  = int32_t;
+};
+
+template<>
+struct RegisterTraits<RAJA::expt::avx2_register, int64_t>
+{
+  using element_type                      = int64_t;
+  using register_policy                   = RAJA::expt::avx2_register;
+  static constexpr camp::idx_t s_num_bits = 256;
+  static constexpr camp::idx_t s_num_elem = 4;
+  using int_element_type                  = int64_t;
+};
+
+template<>
+struct RegisterTraits<RAJA::expt::avx2_register, float>
+{
+  using element_type                      = float;
+  using register_policy                   = RAJA::expt::avx2_register;
+  static constexpr camp::idx_t s_num_bits = 256;
+  static constexpr camp::idx_t s_num_elem = 8;
+  using int_element_type                  = int32_t;
+};
+
+template<>
+struct RegisterTraits<RAJA::expt::avx2_register, double>
+{
+  using element_type                      = double;
+  using register_policy                   = RAJA::expt::avx2_register;
+  static constexpr camp::idx_t s_num_bits = 256;
+  static constexpr camp::idx_t s_num_elem = 4;
+  using int_element_type                  = int64_t;
+};
+
+}  // namespace expt
+}  // namespace internal
+}  // namespace RAJA
+
+
+#endif  // guard
+
+
+#endif  // __AVX2__
diff --git a/include/RAJA/policy/tensor/arch/avx512.hpp b/include/RAJA/policy/tensor/arch/avx512.hpp
index 597563da35..71d0212c5e 100644
--- a/include/RAJA/policy/tensor/arch/avx512.hpp
+++ b/include/RAJA/policy/tensor/arch/avx512.hpp
@@ -18,11 +18,11 @@
 // Check if the base AVX512 instructions are present
 #ifdef __AVX512F__
 
-#include<RAJA/policy/tensor/arch/avx512/traits.hpp>
-#include<RAJA/policy/tensor/arch/avx512/avx512_int32.hpp>
-#include<RAJA/policy/tensor/arch/avx512/avx512_int64.hpp>
-#include<RAJA/policy/tensor/arch/avx512/avx512_float.hpp>
-#include<RAJA/policy/tensor/arch/avx512/avx512_double.hpp>
+#include <RAJA/policy/tensor/arch/avx512/traits.hpp>
+#include <RAJA/policy/tensor/arch/avx512/avx512_int32.hpp>
+#include <RAJA/policy/tensor/arch/avx512/avx512_int64.hpp>
+#include <RAJA/policy/tensor/arch/avx512/avx512_float.hpp>
+#include <RAJA/policy/tensor/arch/avx512/avx512_double.hpp>
 
 
-#endif // __AVX512F__
+#endif  // __AVX512F__
diff --git a/include/RAJA/policy/tensor/arch/avx512/avx512_double.hpp b/include/RAJA/policy/tensor/arch/avx512/avx512_double.hpp
index a7b7ebaafa..f6acc2ac24 100644
--- a/include/RAJA/policy/tensor/arch/avx512/avx512_double.hpp
+++ b/include/RAJA/policy/tensor/arch/avx512/avx512_double.hpp
@@ -28,366 +28,380 @@
 #include <immintrin.h>
 #include <cmath>
 
-
 namespace RAJA
 {
 namespace expt
 {
 
-  template<>
-  class Register<double, avx512_register> :
-    public internal::expt::RegisterBase<Register<double, avx512_register>>
+template<>
+class Register<double, avx512_register>
+    : public internal::expt::RegisterBase<Register<double, avx512_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<double, avx512_register>>;
+
+
+  using register_policy = avx512_register;
+  using self_type       = Register<double, avx512_register>;
+  using element_type    = double;
+  using register_type   = __m512d;
+
+  using int_vector_type = Register<int64_t, avx512_register>;
+
+
+private:
+  register_type m_value;
+
+  RAJA_INLINE
+  __mmask8 createMask(camp::idx_t N) const
+  {
+    // Generate a mask
+    switch (N)
+    {
+      case 0:
+        return __mmask8(0x00);
+      case 1:
+        return __mmask8(0x01);
+      case 2:
+        return __mmask8(0x03);
+      case 3:
+        return __mmask8(0x07);
+      case 4:
+        return __mmask8(0x0F);
+      case 5:
+        return __mmask8(0x1F);
+      case 6:
+        return __mmask8(0x3F);
+      case 7:
+        return __mmask8(0x7F);
+      case 8:
+        return __mmask8(0xFF);
+    }
+    return __mmask8(0);
+  }
+
+  RAJA_INLINE
+  __m512i createStridedOffsets(camp::idx_t stride) const
+  {
+    // Strided offsets {0, stride, ..., 7 * stride}; _mm512_set_epi64 keeps
+    // this AVX512F-only (_mm512_mullo_epi64 would require AVX512DQ).
+    return _mm512_set_epi64(7 * stride, 6 * stride, 5 * stride, 4 * stride,
+                            3 * stride, 2 * stride, stride, 0);
+  }
+
+public:
+  static constexpr camp::idx_t s_num_elem = 8;
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  // AVX512F
+  RAJA_INLINE
+  Register() : base_type(), m_value(_mm512_setzero_pd()) {}
+
+  /*!
+   * @brief Copy constructor from underlying simd register
+   */
+  RAJA_INLINE
+  explicit Register(register_type const& c) : base_type(), m_value(c) {}
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_INLINE
+  Register(self_type const& c) : base_type(), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  self_type& operator=(self_type const& c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+  /*!
+   * @brief Construct from scalar.
+   * Sets all elements to same value (broadcast).
+   */
+  // AVX512F
+  RAJA_INLINE
+  Register(element_type const& c) : base_type(), m_value(_mm512_set1_pd(c)) {}
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed(element_type const* ptr)
+  {
+    // AVX512F
+    m_value = _mm512_loadu_pd(ptr);
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed_n(element_type const* ptr, camp::idx_t N)
+  {
+    // AVX512F
+    m_value = _mm512_mask_loadu_pd(_mm512_setzero_pd(), createMask(N), ptr);
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_strided(element_type const* ptr, camp::idx_t stride)
+  {
+    // AVX512F
+    m_value = _mm512_i64gather_pd(createStridedOffsets(stride), ptr,
+                                  sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a strided memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type& load_strided_n(element_type const* ptr,
+                            camp::idx_t stride,
+                            camp::idx_t N)
+  {
+    // AVX512F
+    m_value = _mm512_mask_i64gather_pd(_mm512_setzero_pd(), createMask(N),
+                                       createStridedOffsets(stride), ptr,
+                                       sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed(element_type* ptr) const
+  {
+    // AVX512F
+    _mm512_storeu_pd(ptr, m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store the first N elements to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed_n(element_type* ptr, camp::idx_t N) const
+  {
+    // AVX512F
+    _mm512_mask_storeu_pd(ptr, createMask(N), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_strided(element_type* ptr, camp::idx_t stride) const
+  {
+    // AVX512F
+    _mm512_i64scatter_pd(ptr, createStridedOffsets(stride), m_value,
+                         sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Store partial register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_strided_n(element_type* ptr,
+                                   camp::idx_t stride,
+                                   camp::idx_t N) const
+  {
+    // AVX512F
+    _mm512_mask_i64scatter_pd(ptr, createMask(N), createStridedOffsets(stride),
+                              m_value, sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i
+   */
+  RAJA_INLINE
+  element_type get(camp::idx_t i) const { return m_value[i]; }
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param i Offset of scalar to set
+   * @param value Value of scalar to set
+   */
+  RAJA_INLINE
+  self_type& set(element_type value, camp::idx_t i)
+  {
+    m_value[i] = value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& broadcast(element_type const& value)
+  {
+    m_value = _mm512_set1_pd(value);
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& copy(self_type const& src)
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<double, avx512_register>>;
-
-
-      using register_policy = avx512_register;
-      using self_type = Register<double, avx512_register>;
-      using element_type = double;
-      using register_type = __m512d;
-
-      using int_vector_type = Register<int64_t, avx512_register>;
-
-
-    private:
-      register_type m_value;
-
-      RAJA_INLINE
-      __mmask8 createMask(camp::idx_t N) const {
-        // Generate a mask
-				switch(N){
-					case 0: return __mmask8(0x00);
-					case 1: return __mmask8(0x01);
-					case 2: return __mmask8(0x03);
-					case 3: return __mmask8(0x07);
-					case 4: return __mmask8(0x0F);
-					case 5: return __mmask8(0x1F);
-					case 6: return __mmask8(0x3F);
-					case 7: return __mmask8(0x7F);
-					case 8: return __mmask8(0xFF);
-				}
-				return __mmask8(0);
-      }
-
-      RAJA_INLINE
-      __m512i createStridedOffsets(camp::idx_t stride) const {
-        // Generate a strided offset list
-				auto vstride = _mm512_set1_epi64(stride);
-				auto vseq = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0);
-				return _mm512_mullo_epi64(vstride, vseq);
-      }
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 8;
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-			// AVX512F
-      RAJA_INLINE
-      Register() : base_type(), m_value(_mm512_setzero_pd()) {
-      }
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_INLINE
-      explicit Register(register_type const &c) : base_type(), m_value(c) {}
-
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      Register(self_type const &c) : base_type(), m_value(c.m_value) {}
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-      /*!
-       * @brief Construct from scalar.
-       * Sets all elements to same value (broadcast).
-       */
-			// AVX512F
-      RAJA_INLINE
-      Register(element_type const &c) : base_type(), m_value(_mm512_set1_pd(c)) {}
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
-			  // AVX512F
-        m_value = _mm512_loadu_pd(ptr);
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
-			  // AVX512F
-        m_value = _mm512_mask_loadu_pd(_mm512_setzero_pd(), createMask(N), ptr);
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, camp::idx_t stride){
-			  // AVX512F
-        m_value = _mm512_i64gather_pd(createStridedOffsets(stride),
-				                              ptr,
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t stride, camp::idx_t N){
-				// AVX512F
-        m_value = _mm512_mask_i64gather_pd(_mm512_setzero_pd(),
-                                      createMask(N),
-                                      createStridedOffsets(stride),
-                                      ptr,
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
-				// AVX512F
-        _mm512_storeu_pd(ptr, m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
-				// AVX512F
-        _mm512_mask_storeu_pd(ptr, createMask(N), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t stride) const{
-				// AVX512F
-				_mm512_i64scatter_pd(ptr, 
-				                     createStridedOffsets(stride),
-														 m_value,
-														 sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t stride, camp::idx_t N) const{
-				// AVX512F
-				_mm512_mask_i64scatter_pd(ptr, 
-                           				createMask(N),
-				                          createStridedOffsets(stride),
-																	m_value,
-														      sizeof(element_type));
-        return *this;
-      }
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      RAJA_INLINE
-      element_type get(camp::idx_t i) const
-      {return m_value[i];}
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      self_type &set(element_type value, camp::idx_t i)
-      {
-        m_value[i] = value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &value){
-        m_value =  _mm512_set1_pd(value);
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(_mm512_add_pd(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(_mm512_sub_pd(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        return self_type(_mm512_mul_pd(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        return self_type(_mm512_div_pd(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t N) const {
-        return self_type(_mm512_maskz_div_pd(createMask(N), m_value, b.m_value));
-      }
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type add(self_type const& b) const
+  {
+    return self_type(_mm512_add_pd(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type subtract(self_type const& b) const
+  {
+    return self_type(_mm512_sub_pd(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type multiply(self_type const& b) const
+  {
+    return self_type(_mm512_mul_pd(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type divide(self_type const& b) const
+  {
+    return self_type(_mm512_div_pd(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type divide_n(self_type const& b, camp::idx_t N) const
+  {
+    return self_type(_mm512_maskz_div_pd(createMask(N), m_value, b.m_value));
+  }
 
 // only use FMA's if the compiler has them turned on
 #ifdef __FMA__
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_add(self_type const &b, self_type const &c) const
-      {
-        return self_type(_mm512_fmadd_pd(m_value, b.m_value, c.m_value));
-      }
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_subtract(self_type const &b, self_type const &c) const
-      {
-        return self_type(_mm512_fmsub_pd(m_value, b.m_value, c.m_value));
-      }
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  self_type multiply_add(self_type const& b, self_type const& c) const
+  {
+    return self_type(_mm512_fmadd_pd(m_value, b.m_value, c.m_value));
+  }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  self_type multiply_subtract(self_type const& b, self_type const& c) const
+  {
+    return self_type(_mm512_fmsub_pd(m_value, b.m_value, c.m_value));
+  }
 #endif
 
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      element_type sum() const
-      {
-				return _mm512_reduce_add_pd(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max() const
-      {
-        return _mm512_reduce_max_pd(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max_n(camp::idx_t N) const
-      {
-				return _mm512_mask_reduce_max_pd(createMask(N), m_value);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmax(self_type a) const
-      {
-        return self_type(_mm512_max_pd(m_value, a.m_value));
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min() const
-      {
-        return _mm512_reduce_min_pd(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min_n(camp::idx_t N) const
-      {
-				return _mm512_mask_reduce_min_pd(createMask(N), m_value);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmin(self_type a) const
-      {
-        return self_type(_mm512_min_pd(m_value, a.m_value));
-      }
-  };
-
-
-}   // namespace expt
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vectors scalar elements
+   */
+  RAJA_INLINE
+  element_type sum() const { return _mm512_reduce_add_pd(m_value); }
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max() const { return _mm512_reduce_max_pd(m_value); }
+
+  /*!
+   * @brief Returns the largest of the first N elements
+   * @return The largest scalar element among the first N in the register
+   */
+  RAJA_INLINE
+  element_type max_n(camp::idx_t N) const
+  {
+    return _mm512_mask_reduce_max_pd(createMask(N), m_value);
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+  self_type vmax(self_type a) const
+  {
+    return self_type(_mm512_max_pd(m_value, a.m_value));
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min() const { return _mm512_reduce_min_pd(m_value); }
+
+  /*!
+   * @brief Returns the smallest of the first N elements
+   * @return The smallest scalar element among the first N in the register
+   */
+  RAJA_INLINE
+  element_type min_n(camp::idx_t N) const
+  {
+    return _mm512_mask_reduce_min_pd(createMask(N), m_value);
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  self_type vmin(self_type a) const
+  {
+    return self_type(_mm512_min_pd(m_value, a.m_value));
+  }
+};
+
+
+}  // namespace expt
 
 }  // namespace RAJA
 
 
 #endif
 
-#endif //__AVX512F__
+#endif  //__AVX512F__
diff --git a/include/RAJA/policy/tensor/arch/avx512/avx512_float.hpp b/include/RAJA/policy/tensor/arch/avx512/avx512_float.hpp
index 84cb034a56..f6712a3e4f 100644
--- a/include/RAJA/policy/tensor/arch/avx512/avx512_float.hpp
+++ b/include/RAJA/policy/tensor/arch/avx512/avx512_float.hpp
@@ -28,372 +28,395 @@
 #include <immintrin.h>
 #include <cmath>
 
-
 namespace RAJA
 {
 namespace expt
 {
-  template<>
-  class Register<float, avx512_register> :
-    public internal::expt::RegisterBase<Register<float, avx512_register>>
+template<>
+class Register<float, avx512_register>
+    : public internal::expt::RegisterBase<Register<float, avx512_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<float, avx512_register>>;
+
+  using register_policy = avx512_register;
+  using self_type       = Register<float, avx512_register>;
+  using element_type    = float;
+  using register_type   = __m512;
+
+  using int_vector_type = Register<int32_t, avx512_register>;
+
+
+private:
+  register_type m_value;
+
+  RAJA_INLINE
+  __mmask16 createMask(camp::idx_t N) const
+  {
+    // Mask with the low N bits set; N outside 0..16 yields an empty mask
+    switch (N)
+    {
+      case 0:
+        return __mmask16(0x0000);
+      case 1:
+        return __mmask16(0x0001);
+      case 2:
+        return __mmask16(0x0003);
+      case 3:
+        return __mmask16(0x0007);
+      case 4:
+        return __mmask16(0x000F);
+      case 5:
+        return __mmask16(0x001F);
+      case 6:
+        return __mmask16(0x003F);
+      case 7:
+        return __mmask16(0x007F);
+      case 8:
+        return __mmask16(0x00FF);
+      case 9:
+        return __mmask16(0x01FF);
+      case 10:
+        return __mmask16(0x03FF);
+      case 11:
+        return __mmask16(0x07FF);
+      case 12:
+        return __mmask16(0x0FFF);
+      case 13:
+        return __mmask16(0x1FFF);
+      case 14:
+        return __mmask16(0x3FFF);
+      case 15:
+        return __mmask16(0x7FFF);
+      case 16:
+        return __mmask16(0xFFFF);
+    }
+    return __mmask16(0);
+  }
+
+  RAJA_INLINE
+  __m512i createStridedOffsets(camp::idx_t stride) const
+  {
+    // Lane k holds k * stride as a 32-bit offset (stride is passed as int)
+    auto vstride = _mm512_set1_epi32(stride);
+    auto vseq =
+        _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+    return _mm512_mullo_epi32(vstride, vseq);
+  }
+
+public:
+  static constexpr camp::idx_t s_num_elem = 16;
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  // AVX512F
+  RAJA_INLINE
+  Register() : base_type(), m_value(_mm512_setzero_ps()) {}
+
+  /*!
+   * @brief Copy constructor from underlying simd register
+   */
+  RAJA_INLINE
+  explicit Register(register_type const& c) : base_type(), m_value(c) {}
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_INLINE
+  Register(self_type const& c) : base_type(), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  self_type& operator=(self_type const& c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+  /*!
+   * @brief Construct from scalar.
+   * Sets all elements to same value (broadcast).
+   */
+  // AVX512F
+  RAJA_INLINE
+  Register(element_type const& c) : base_type(), m_value(_mm512_set1_ps(c)) {}
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed(element_type const* ptr)
+  {
+    // AVX512F
+    m_value = _mm512_loadu_ps(ptr);
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed_n(element_type const* ptr, camp::idx_t N)
+  {
+    // AVX512F
+    m_value = _mm512_mask_loadu_ps(_mm512_setzero_ps(), createMask(N), ptr);
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_strided(element_type const* ptr, camp::idx_t stride)
+  {
+    // AVX512F
+    m_value = _mm512_i32gather_ps(createStridedOffsets(stride), ptr,
+                                  sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Partially gather a register from a strided memory location given
+   *        a run-time number of elements.
+   *        Lanes not selected by the N-element mask are set to zero.
+   */
+  RAJA_INLINE
+  self_type& load_strided_n(element_type const* ptr,
+                            camp::idx_t stride,
+                            camp::idx_t N)
+  {
+    // AVX512F
+    m_value = _mm512_mask_i32gather_ps(_mm512_setzero_ps(), createMask(N),
+                                       createStridedOffsets(stride), ptr,
+                                       sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed(element_type* ptr) const
+  {
+    // AVX512F
+    _mm512_storeu_ps(ptr, m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store the first N elements of the register to consecutive memory
+   *        locations; elements N and above are masked out and not written.
+   */
+  RAJA_INLINE
+  self_type const& store_packed_n(element_type* ptr, camp::idx_t N) const
+  {
+    // AVX512F
+    _mm512_mask_storeu_ps(ptr, createMask(N), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to strided memory locations
+   *        (element k is scattered to ptr[k * stride])
+   */
+  RAJA_INLINE
+  self_type const& store_strided(element_type* ptr, camp::idx_t stride) const
+  {
+    // AVX512F
+    _mm512_i32scatter_ps(ptr, createStridedOffsets(stride), m_value,
+                         sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Store the first N elements of the register to strided memory
+   *        locations (element k is scattered to ptr[k * stride])
+   */
+  RAJA_INLINE
+  self_type const& store_strided_n(element_type* ptr,
+                                   camp::idx_t stride,
+                                   camp::idx_t N) const
+  {
+    // AVX512F
+    _mm512_mask_i32scatter_ps(ptr, createMask(N), createStridedOffsets(stride),
+                              m_value, sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i
+   */
+  RAJA_INLINE
+  element_type get(camp::idx_t i) const { return m_value[i]; }
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param i Offset of scalar to set
+   * @param value Value of scalar to set
+   */
+  RAJA_INLINE
+  self_type& set(element_type value, camp::idx_t i)
+  {
+    m_value[i] = value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& broadcast(element_type const& value)
+  {
+    m_value = _mm512_set1_ps(value);
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& copy(self_type const& src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type add(self_type const& b) const
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<float, avx512_register>>;
-
-      using register_policy = avx512_register;
-      using self_type = Register<float, avx512_register>;
-      using element_type = float;
-      using register_type = __m512;
-
-      using int_vector_type = Register<int32_t, avx512_register>;
-
-
-    private:
-      register_type m_value;
-
-      RAJA_INLINE
-      __mmask16 createMask(camp::idx_t N) const {
-        // Generate a mask
-				switch(N){
-					case 0:  return __mmask16(0x0000);
-					case 1:  return __mmask16(0x0001);
-					case 2:  return __mmask16(0x0003);
-					case 3:  return __mmask16(0x0007);
-					case 4:  return __mmask16(0x000F);
-					case 5:  return __mmask16(0x001F);
-					case 6:  return __mmask16(0x003F);
-					case 7:  return __mmask16(0x007F);
-					case 8:  return __mmask16(0x00FF);
-          case 9:  return __mmask16(0x01FF);
-          case 10: return __mmask16(0x03FF);
-          case 11: return __mmask16(0x07FF);
-          case 12: return __mmask16(0x0FFF);
-          case 13: return __mmask16(0x1FFF);
-          case 14: return __mmask16(0x3FFF);
-          case 15: return __mmask16(0x7FFF);
-          case 16: return __mmask16(0xFFFF);
-				}
-				return __mmask16(0);
-      }
-
-      RAJA_INLINE
-      __m512i createStridedOffsets(camp::idx_t stride) const {
-        // Generate a strided offset list
-				auto vstride = _mm512_set1_epi32(stride);
-				auto vseq = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-				return _mm512_mullo_epi32(vstride, vseq);
-      }
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 16;
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-			// AVX512F
-      RAJA_INLINE
-      Register() : base_type(), m_value(_mm512_setzero_ps()) {
-      }
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_INLINE
-      explicit Register(register_type const &c) : base_type(), m_value(c) {}
-
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      Register(self_type const &c) : base_type(), m_value(c.m_value) {}
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-      /*!
-       * @brief Construct from scalar.
-       * Sets all elements to same value (broadcast).
-       */
-			// AVX512F
-      RAJA_INLINE
-      Register(element_type const &c) : base_type(), m_value(_mm512_set1_ps(c)) {}
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
-			  // AVX512F
-        m_value = _mm512_loadu_ps(ptr);
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
-			  // AVX512F
-        m_value = _mm512_mask_loadu_ps(_mm512_setzero_ps(), createMask(N), ptr);
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, camp::idx_t stride){
-			  // AVX512F
-        m_value = _mm512_i32gather_ps(createStridedOffsets(stride),
-				                              ptr,
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t stride, camp::idx_t N){
-				// AVX512F
-        m_value = _mm512_mask_i32gather_ps(_mm512_setzero_ps(),
-                                      createMask(N),
-                                      createStridedOffsets(stride),
-                                      ptr,
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
-				// AVX512F
-        _mm512_storeu_ps(ptr, m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
-				// AVX512F
-        _mm512_mask_storeu_ps(ptr, createMask(N), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t stride) const{
-				// AVX512F
-				_mm512_i32scatter_ps(ptr,
-				                     createStridedOffsets(stride),
-														 m_value,
-														 sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t stride, camp::idx_t N) const{
-				// AVX512F
-				_mm512_mask_i32scatter_ps(ptr,
-                           				createMask(N),
-				                          createStridedOffsets(stride),
-																	m_value,
-														      sizeof(element_type));
-        return *this;
-      }
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      RAJA_INLINE
-      element_type get(camp::idx_t i) const
-      {return m_value[i];}
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      self_type &set(element_type value, camp::idx_t i)
-      {
-        m_value[i] = value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &value){
-        m_value =  _mm512_set1_ps(value);
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(_mm512_add_ps(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(_mm512_sub_ps(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        return self_type(_mm512_mul_ps(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        return self_type(_mm512_div_ps(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t N) const {
-        return self_type(_mm512_maskz_div_ps(createMask(N), m_value, b.m_value));
-      }
+    return self_type(_mm512_add_ps(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type subtract(self_type const& b) const
+  {
+    return self_type(_mm512_sub_ps(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type multiply(self_type const& b) const
+  {
+    return self_type(_mm512_mul_ps(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type divide(self_type const& b) const
+  {
+    return self_type(_mm512_div_ps(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type divide_n(self_type const& b, camp::idx_t N) const
+  {
+    return self_type(_mm512_maskz_div_ps(createMask(N), m_value, b.m_value));
+  }
 
 // only use FMA's if the compiler has them turned on
 #ifdef __FMA__
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_add(self_type const &b, self_type const &c) const
-      {
-        return self_type(_mm512_fmadd_ps(m_value, b.m_value, c.m_value));
-      }
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_subtract(self_type const &b, self_type const &c) const
-      {
-        return self_type(_mm512_fmsub_ps(m_value, b.m_value, c.m_value));
-      }
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  self_type multiply_add(self_type const& b, self_type const& c) const
+  {
+    return self_type(_mm512_fmadd_ps(m_value, b.m_value, c.m_value));
+  }
+
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  self_type multiply_subtract(self_type const& b, self_type const& c) const
+  {
+    return self_type(_mm512_fmsub_ps(m_value, b.m_value, c.m_value));
+  }
 #endif
 
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      element_type sum() const
-      {
-				return _mm512_reduce_add_ps(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max() const
-      {
-        return _mm512_reduce_max_ps(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max_n(camp::idx_t N) const
-      {
-				return _mm512_mask_reduce_max_ps(createMask(N), m_value);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmax(self_type a) const
-      {
-        return self_type(_mm512_max_ps(m_value, a.m_value));
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min() const
-      {
-        return _mm512_reduce_min_ps(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min_n(camp::idx_t N) const
-      {
-				return _mm512_mask_reduce_min_ps(createMask(N), m_value);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmin(self_type a) const
-      {
-        return self_type(_mm512_min_ps(m_value, a.m_value));
-      }
-  };
-
-
-}   // namespace expt
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vectors scalar elements
+   */
+  RAJA_INLINE
+  element_type sum() const { return _mm512_reduce_add_ps(m_value); }
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max() const { return _mm512_reduce_max_ps(m_value); }
+
+  /*!
+   * @brief Returns the largest of the first N elements
+   * @return The largest scalar element among the first N in the register
+   */
+  RAJA_INLINE
+  element_type max_n(camp::idx_t N) const
+  {
+    return _mm512_mask_reduce_max_ps(createMask(N), m_value);
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+  self_type vmax(self_type a) const
+  {
+    return self_type(_mm512_max_ps(m_value, a.m_value));
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min() const { return _mm512_reduce_min_ps(m_value); }
+
+  /*!
+   * @brief Returns the smallest of the first N elements
+   * @return The smallest scalar element among the first N in the register
+   */
+  RAJA_INLINE
+  element_type min_n(camp::idx_t N) const
+  {
+    return _mm512_mask_reduce_min_ps(createMask(N), m_value);
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  self_type vmin(self_type a) const
+  {
+    return self_type(_mm512_min_ps(m_value, a.m_value));
+  }
+};
+
+
+}  // namespace expt
 
 }  // namespace RAJA
 
 
 #endif
 
-#endif //__AVX512F__
+#endif  //__AVX512F__
diff --git a/include/RAJA/policy/tensor/arch/avx512/avx512_int32.hpp b/include/RAJA/policy/tensor/arch/avx512/avx512_int32.hpp
index 021ca90fbe..66283de39c 100644
--- a/include/RAJA/policy/tensor/arch/avx512/avx512_int32.hpp
+++ b/include/RAJA/policy/tensor/arch/avx512/avx512_int32.hpp
@@ -28,424 +28,445 @@
 #include <immintrin.h>
 #include <cmath>
 
-
 namespace RAJA
 {
 namespace expt
 {
-  template<>
-  class Register<int32_t, avx512_register> :
-    public internal::expt::RegisterBase<Register<int32_t, avx512_register>>
+template<>
+class Register<int32_t, avx512_register>
+    : public internal::expt::RegisterBase<Register<int32_t, avx512_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<int32_t, avx512_register>>;
+
+  using register_policy = avx512_register;
+  using self_type       = Register<int32_t, avx512_register>;
+  using element_type    = int32_t;
+  using register_type   = __m512i;
+
+  using int_vector_type = Register<int32_t, avx512_register>;
+
+
+private:
+  register_type m_value;
+
+  RAJA_INLINE
+  __mmask16 createMask(camp::idx_t N) const
+  {
+    // Generate a mask with the low N bits set (N in 0..16); any other N yields 0
+    switch (N)
+    {
+      case 0:
+        return __mmask16(0x0000);
+      case 1:
+        return __mmask16(0x0001);
+      case 2:
+        return __mmask16(0x0003);
+      case 3:
+        return __mmask16(0x0007);
+      case 4:
+        return __mmask16(0x000F);
+      case 5:
+        return __mmask16(0x001F);
+      case 6:
+        return __mmask16(0x003F);
+      case 7:
+        return __mmask16(0x007F);
+      case 8:
+        return __mmask16(0x00FF);
+      case 9:
+        return __mmask16(0x01FF);
+      case 10:
+        return __mmask16(0x03FF);
+      case 11:
+        return __mmask16(0x07FF);
+      case 12:
+        return __mmask16(0x0FFF);
+      case 13:
+        return __mmask16(0x1FFF);
+      case 14:
+        return __mmask16(0x3FFF);
+      case 15:
+        return __mmask16(0x7FFF);
+      case 16:
+        return __mmask16(0xFFFF);
+    }
+    return __mmask16(0);
+  }
+
+  RAJA_INLINE
+  __m512i createStridedOffsets(camp::idx_t stride) const
+  {
+    // Per-lane element offsets: lane k holds k * stride (stride is passed as a 32-bit int)
+    auto vstride = _mm512_set1_epi32(stride);
+    auto vseq =
+        _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+    return _mm512_mullo_epi32(vstride, vseq);
+  }
+
+public:
+  static constexpr camp::idx_t s_num_elem = 16;
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  // Requires AVX512F (_mm512_setzero_epi32)
+  RAJA_INLINE
+  Register() : base_type(), m_value(_mm512_setzero_epi32()) {}
+
+  /*!
+   * @brief Construct from an underlying SIMD register value (explicit)
+   */
+  RAJA_INLINE
+  explicit Register(register_type const& c) : base_type(), m_value(c) {}
+
+  /*!
+   * @brief Copy constructor; duplicates the register value held by c
+   */
+  RAJA_INLINE
+  Register(self_type const& c) : base_type(), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  self_type& operator=(self_type const& c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+  /*!
+   * @brief Construct from scalar (not explicit, so scalars convert implicitly).
+   * Sets all 16 elements to the same value (broadcast).
+   */
+  // Requires AVX512F (_mm512_set1_epi32)
+  RAJA_INLINE
+  Register(element_type const& c) : base_type(), m_value(_mm512_set1_epi32(c))
+  {}
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   * Uses the unaligned load _mm512_loadu_si512 on GCC 7-9, which lack _mm512_loadu_epi32.
+   */
+  RAJA_INLINE
+  self_type& load_packed(element_type const* ptr)
+  {
+    // AVX512F
+#if defined(__GNUC__) && ((__GNUC__ >= 7) && (__GNUC__ <= 9))
+    m_value = _mm512_loadu_si512(ptr);
+#else
+    m_value = _mm512_loadu_epi32(ptr);  // GNU 7-9 are missing this instruction.
+#endif
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   * Lanes not selected by createMask(N) (index N and above) are set to zero.
+   */
+  RAJA_INLINE
+  self_type& load_packed_n(element_type const* ptr, camp::idx_t N)
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<int32_t, avx512_register>>;
-
-      using register_policy = avx512_register;
-      using self_type = Register<int32_t, avx512_register>;
-      using element_type = int32_t;
-      using register_type = __m512i;
-
-      using int_vector_type = Register<int32_t, avx512_register>;
-
-
-    private:
-      register_type m_value;
-
-      RAJA_INLINE
-      __mmask16 createMask(camp::idx_t N) const {
-        // Generate a mask
-				switch(N){
-					case 0:  return __mmask16(0x0000);
-					case 1:  return __mmask16(0x0001);
-					case 2:  return __mmask16(0x0003);
-					case 3:  return __mmask16(0x0007);
-					case 4:  return __mmask16(0x000F);
-					case 5:  return __mmask16(0x001F);
-					case 6:  return __mmask16(0x003F);
-					case 7:  return __mmask16(0x007F);
-					case 8:  return __mmask16(0x00FF);
-          case 9:  return __mmask16(0x01FF);
-          case 10: return __mmask16(0x03FF);
-          case 11: return __mmask16(0x07FF);
-          case 12: return __mmask16(0x0FFF);
-          case 13: return __mmask16(0x1FFF);
-          case 14: return __mmask16(0x3FFF);
-          case 15: return __mmask16(0x7FFF);
-          case 16: return __mmask16(0xFFFF);
-				}
-				return __mmask16(0);
-      }
-
-      RAJA_INLINE
-      __m512i createStridedOffsets(camp::idx_t stride) const {
-        // Generate a strided offset list
-				auto vstride = _mm512_set1_epi32(stride);
-				auto vseq = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
-				return _mm512_mullo_epi32(vstride, vseq);
-      }
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 16;
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-			// AVX512F
-      RAJA_INLINE
-      Register() : base_type(), m_value(_mm512_setzero_epi32()) {
-      }
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_INLINE
-      explicit Register(register_type const &c) : base_type(), m_value(c) {}
-
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      Register(self_type const &c) : base_type(), m_value(c.m_value) {}
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-      /*!
-       * @brief Construct from scalar.
-       * Sets all elements to same value (broadcast).
-       */
-			// AVX512F
-      RAJA_INLINE
-      Register(element_type const &c) : base_type(), m_value(_mm512_set1_epi32(c)) {}
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
-			  // AVX512F
-        #if defined(__GNUC__) && ((__GNUC__ >= 7) && (__GNUC__ <= 9))
-        m_value = _mm512_loadu_si512(ptr);
-        #else
-        m_value = _mm512_loadu_epi32(ptr);  // GNU 7-9 are missing this instruction.
-        #endif
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
-			  // AVX512F
-        m_value = _mm512_mask_loadu_epi32(_mm512_setzero_epi32(), createMask(N), ptr);
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, camp::idx_t stride){
-			  // AVX512F
-        m_value = _mm512_i32gather_epi32(createStridedOffsets(stride),
-				                              ptr,
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t stride, camp::idx_t N){
-				// AVX512F
-        m_value = _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(),
-                                      createMask(N),
-                                      createStridedOffsets(stride),
-                                      ptr,
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
-				// AVX512F
-        #if defined(__GNUC__) && ((__GNUC__ >= 7) && (__GNUC__ <= 9))
-        _mm512_storeu_si512(ptr, m_value);
-        #else
-        _mm512_storeu_epi32(ptr, m_value);  // GNU 7-9 are missing this instruction.
-        #endif
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
-				// AVX512F
-        _mm512_mask_storeu_epi32(ptr, createMask(N), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t stride) const{
-				// AVX512F
-				_mm512_i32scatter_epi32(ptr,
-				                     createStridedOffsets(stride),
-														 m_value,
-														 sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t stride, camp::idx_t N) const{
-				// AVX512F
-				_mm512_mask_i32scatter_epi32(ptr,
-                           				createMask(N),
-				                          createStridedOffsets(stride),
-																	m_value,
-														      sizeof(element_type));
-        return *this;
-      }
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      RAJA_INLINE
-      element_type get(camp::idx_t i) const
-      {
-        // GNU 7-10 are missing this instruction.
-        #if defined(__GNUC__) && ((__GNUC__ >= 7) && (__GNUC__ <= 10))
-        #define _mm512_cvtsi512_si32(x) _mm_cvtsi128_si32(_mm512_castsi512_si128(x))
-        #endif
-
-				switch(i){	
-					case 0: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 0));
-					case 1: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 1));
-					case 2: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 2));
-					case 3: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 3));
-					case 4: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 4));
-					case 5: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 5));
-					case 6: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 6));
-					case 7: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 7));
-					case 8: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 8));
-					case 9: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 9));
-					case 10: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 10));
-					case 11: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 11));
-					case 12: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 12));
-					case 13: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 13));
-					case 14: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 14));
-					case 15: return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 15));
-				}
-				return 0;
-			}
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      self_type &set(element_type value, camp::idx_t i)
-      {
-				m_value = _mm512_mask_set1_epi32(m_value, 1 << i, value);
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &value){
-        m_value =  _mm512_set1_epi32(value);
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(_mm512_add_epi32(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(_mm512_sub_epi32(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        return self_type(_mm512_mullo_epi32(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        // AVX512 does not supply an integer divide, so do it manually
-        return self_type(_mm512_set_epi32(
-            get(15)/b.get(15),
-            get(14)/b.get(14),
-            get(13)/b.get(13),
-            get(12)/b.get(12),
-            get(11)/b.get(11),
-            get(10)/b.get(10),
-            get(9)/b.get(9),
-            get(8)/b.get(8),
-            get(7)/b.get(7),
-            get(6)/b.get(6),
-            get(5)/b.get(5),
-            get(4)/b.get(4),
-            get(3)/b.get(3),
-            get(2)/b.get(2),
-            get(1)/b.get(1),
-            get(0)/b.get(0)
-            ));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t N) const {
-        // AVX512 does not supply an integer divide, so do it manually
-        return self_type(_mm512_set_epi32(
-            N >= 16 ? get(15)/b.get(15) : 0,
-            N >= 15 ? get(14)/b.get(14) : 0,
-            N >= 14 ? get(13)/b.get(13) : 0,
-            N >= 13 ? get(12)/b.get(12) : 0,
-            N >= 12 ? get(11)/b.get(11) : 0,
-            N >= 11 ? get(10)/b.get(10) : 0,
-            N >= 10 ? get(9)/b.get(9) : 0,
-            N >= 9 ? get(8)/b.get(8) : 0,
-            N >= 8 ? get(7)/b.get(7) : 0,
-            N >= 7 ? get(6)/b.get(6) : 0,
-            N >= 6 ? get(5)/b.get(5) : 0,
-            N >= 5 ? get(4)/b.get(4) : 0,
-            N >= 4 ? get(3)/b.get(3) : 0,
-            N >= 3 ? get(2)/b.get(2) : 0,
-            N >= 2 ? get(1)/b.get(1) : 0,
-            N >= 1 ? get(0)/b.get(0) : 0
-            ));
-      }
-
-
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      element_type sum() const
-      {
-				return _mm512_reduce_add_epi32(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max() const
-      {
-        return _mm512_reduce_max_epi32(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max_n(camp::idx_t N) const
-      {
-				return _mm512_mask_reduce_max_epi32(createMask(N), m_value);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmax(self_type a) const
-      {
-        return self_type(_mm512_max_epi32(m_value, a.m_value));
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min() const
-      {
-        return _mm512_reduce_min_epi32(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min(camp::idx_t N) const
-      {
-				return _mm512_mask_reduce_min_epi32(createMask(N), m_value);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmin(self_type a) const
-      {
-        return self_type(_mm512_min_epi32(m_value, a.m_value));
-      }
-  };
-
-}   // namespace expt
+    // AVX512F
+    m_value =
+        _mm512_mask_loadu_epi32(_mm512_setzero_epi32(), createMask(N), ptr);
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   * Lane k is read from ptr[k * stride].
+   */
+  RAJA_INLINE
+  self_type& load_strided(element_type const* ptr, camp::idx_t stride)
+  {
+    // AVX512F
+    m_value = _mm512_i32gather_epi32(createStridedOffsets(stride), ptr,
+                                     sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Partially gather a register from a strided memory location given
+   *        a run-time number of elements.
+   * Lane k < N is read from ptr[k * stride]; lanes N and above are set to zero.
+   */
+  RAJA_INLINE
+  self_type& load_strided_n(element_type const* ptr,
+                            camp::idx_t stride,
+                            camp::idx_t N)
+  {
+    // AVX512F
+    m_value = _mm512_mask_i32gather_epi32(_mm512_setzero_epi32(), createMask(N),
+                                          createStridedOffsets(stride), ptr,
+                                          sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   * Uses the unaligned store _mm512_storeu_si512 on GCC 7-9, which lack _mm512_storeu_epi32.
+   */
+  RAJA_INLINE
+  self_type const& store_packed(element_type* ptr) const
+  {
+    // AVX512F
+#if defined(__GNUC__) && ((__GNUC__ >= 7) && (__GNUC__ <= 9))
+    _mm512_storeu_si512(ptr, m_value);
+#else
+    _mm512_storeu_epi32(ptr, m_value);  // GNU 7-9 are missing this instruction.
+#endif
+    return *this;
+  }
+
+  /*!
+   * @brief Store the first N elements of the register to consecutive memory locations
+   * Memory for lanes N and above is not written (masked store).
+   */
+  RAJA_INLINE
+  self_type const& store_packed_n(element_type* ptr, camp::idx_t N) const
+  {
+    // AVX512F
+    _mm512_mask_storeu_epi32(ptr, createMask(N), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Scatter entire register to strided memory locations
+   * Lane k is written to ptr[k * stride].
+   */
+  RAJA_INLINE
+  self_type const& store_strided(element_type* ptr, camp::idx_t stride) const
+  {
+    // AVX512F
+    _mm512_i32scatter_epi32(ptr, createStridedOffsets(stride), m_value,
+                            sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Scatter the first N elements of the register to strided memory locations
+   * Lane k < N is written to ptr[k * stride]; other lanes are not written.
+   */
+  RAJA_INLINE
+  self_type const& store_strided_n(element_type* ptr,
+                                   camp::idx_t stride,
+                                   camp::idx_t N) const
+  {
+    // AVX512F
+    _mm512_mask_i32scatter_epi32(ptr, createMask(N),
+                                 createStridedOffsets(stride), m_value,
+                                 sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get (0..15)
+   * @return Returns scalar value at i, or 0 if i is outside 0..15
+   */
+  RAJA_INLINE
+  element_type get(camp::idx_t i) const
+  {
+// GNU 7-10 are missing _mm512_cvtsi512_si32, so define an equivalent macro (it stays defined after this function).
+#if defined(__GNUC__) && ((__GNUC__ >= 7) && (__GNUC__ <= 10))
+#define _mm512_cvtsi512_si32(x) _mm_cvtsi128_si32(_mm512_castsi512_si128(x))
+#endif
+    // Rotate element i down into lane 0, then extract lane 0 as a scalar.
+    switch (i)
+    {
+      case 0:
+        return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 0));
+      case 1:
+        return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 1));
+      case 2:
+        return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 2));
+      case 3:
+        return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 3));
+      case 4:
+        return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 4));
+      case 5:
+        return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 5));
+      case 6:
+        return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 6));
+      case 7:
+        return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 7));
+      case 8:
+        return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 8));
+      case 9:
+        return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 9));
+      case 10:
+        return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 10));
+      case 11:
+        return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 11));
+      case 12:
+        return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 12));
+      case 13:
+        return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 13));
+      case 14:
+        return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 14));
+      case 15:
+        return _mm512_cvtsi512_si32(_mm512_alignr_epi32(m_value, m_value, 15));
+    }
+    return 0;
+  }
+
+  /*!
+   * @brief Set scalar value in vector register; other elements are unchanged
+   * @param value Value of scalar to set
+   * @param i Offset of scalar to set (selects mask bit 1 << i)
+   */
+  RAJA_INLINE
+  self_type& set(element_type value, camp::idx_t i)
+  {
+    m_value = _mm512_mask_set1_epi32(m_value, 1 << i, value);
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  // Sets every element of the register to value; returns *this.
+  RAJA_INLINE
+  self_type& broadcast(element_type const& value)
+  {
+    m_value = _mm512_set1_epi32(value);
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  // Copies the register value of src into this register; returns *this.
+  RAJA_INLINE
+  self_type& copy(self_type const& src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+  // Returns the element-wise sum of this register and b.
+  RAJA_INLINE
+  self_type add(self_type const& b) const
+  {
+    return self_type(_mm512_add_epi32(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  // Returns the element-wise difference (this - b).
+  RAJA_INLINE
+  self_type subtract(self_type const& b) const
+  {
+    return self_type(_mm512_sub_epi32(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  // Returns the element-wise product, keeping the low 32 bits of each result.
+  RAJA_INLINE
+  self_type multiply(self_type const& b) const
+  {
+    return self_type(_mm512_mullo_epi32(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+  // Returns the element-wise quotient (this / b), computed with scalar integer division per lane.
+  RAJA_INLINE
+  self_type divide(self_type const& b) const
+  {
+    // AVX512 does not supply an integer divide, so do it manually
+    return self_type(_mm512_set_epi32(
+        get(15) / b.get(15), get(14) / b.get(14), get(13) / b.get(13),
+        get(12) / b.get(12), get(11) / b.get(11), get(10) / b.get(10),
+        get(9) / b.get(9), get(8) / b.get(8), get(7) / b.get(7),
+        get(6) / b.get(6), get(5) / b.get(5), get(4) / b.get(4),
+        get(3) / b.get(3), get(2) / b.get(2), get(1) / b.get(1),
+        get(0) / b.get(0)));
+  }
+
+  RAJA_HOST_DEVICE
+  // Returns the element-wise quotient for lanes 0..N-1; lanes N and above are set to 0 without dividing.
+  RAJA_INLINE
+  self_type divide_n(self_type const& b, camp::idx_t N) const
+  {
+    // AVX512 does not supply an integer divide, so do it manually
+    return self_type(_mm512_set_epi32(
+        N >= 16 ? get(15) / b.get(15) : 0, N >= 15 ? get(14) / b.get(14) : 0,
+        N >= 14 ? get(13) / b.get(13) : 0, N >= 13 ? get(12) / b.get(12) : 0,
+        N >= 12 ? get(11) / b.get(11) : 0, N >= 11 ? get(10) / b.get(10) : 0,
+        N >= 10 ? get(9) / b.get(9) : 0, N >= 9 ? get(8) / b.get(8) : 0,
+        N >= 8 ? get(7) / b.get(7) : 0, N >= 7 ? get(6) / b.get(6) : 0,
+        N >= 6 ? get(5) / b.get(5) : 0, N >= 5 ? get(4) / b.get(4) : 0,
+        N >= 4 ? get(3) / b.get(3) : 0, N >= 3 ? get(2) / b.get(2) : 0,
+        N >= 2 ? get(1) / b.get(1) : 0, N >= 1 ? get(0) / b.get(0) : 0));
+  }
+
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of all 16 scalar elements held in the register
+   */
+  RAJA_INLINE
+  element_type sum() const { return _mm512_reduce_add_epi32(m_value); }
+
+  /*!
+   * @brief Returns the largest element of the full register
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max() const { return _mm512_reduce_max_epi32(m_value); }
+
+  /*!
+   * @brief Returns the largest of the first N elements
+   * @return The largest scalar element among lanes 0..N-1
+   */
+  RAJA_INLINE
+  element_type max_n(camp::idx_t N) const
+  {
+    return _mm512_mask_reduce_max_epi32(createMask(N), m_value);
+  }
+
+  /*!
+   * @brief Returns element-wise largest values of this register and a
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+  self_type vmax(self_type a) const
+  {
+    return self_type(_mm512_max_epi32(m_value, a.m_value));
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min() const { return _mm512_reduce_min_epi32(m_value); }
+
+  /*!
+   * @brief Returns the smallest of the first N elements (NOTE(review): the _ps register names this min_n — confirm intended name)
+   * @return The smallest scalar element among lanes 0..N-1
+   */
+  RAJA_INLINE
+  element_type min(camp::idx_t N) const
+  {
+    return _mm512_mask_reduce_min_epi32(createMask(N), m_value);
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values of this register and a
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  self_type vmin(self_type a) const
+  {
+    return self_type(_mm512_min_epi32(m_value, a.m_value));
+  }
+};
+
+}  // namespace expt
 
 }  // namespace RAJA
 
 
 #endif
 
-#endif //__AVX512F__
+#endif  //__AVX512F__
diff --git a/include/RAJA/policy/tensor/arch/avx512/avx512_int64.hpp b/include/RAJA/policy/tensor/arch/avx512/avx512_int64.hpp
index 17f929c607..6fbd7cd485 100644
--- a/include/RAJA/policy/tensor/arch/avx512/avx512_int64.hpp
+++ b/include/RAJA/policy/tensor/arch/avx512/avx512_int64.hpp
@@ -28,378 +28,391 @@
 #include <immintrin.h>
 #include <cmath>
 
-
 namespace RAJA
 {
 namespace expt
 {
-  template<>
-  class Register<int64_t, avx512_register> :
-    public internal::expt::RegisterBase<Register<int64_t, avx512_register>>
+template<>
+class Register<int64_t, avx512_register>
+    : public internal::expt::RegisterBase<Register<int64_t, avx512_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<int64_t, avx512_register>>;
+
+  using register_policy = avx512_register;
+  using self_type       = Register<int64_t, avx512_register>;
+  using element_type    = int64_t;
+  using register_type   = __m512i;
+
+  using int_vector_type = Register<int64_t, avx512_register>;
+
+
+private:
+  register_type m_value;
+
+  RAJA_INLINE
+  __mmask8 createMask(camp::idx_t N) const
+  {
+    // Generate a mask with the low N bits set (N in 0..8); any other N yields 0
+    switch (N)
+    {
+      case 0:
+        return __mmask8(0x00);
+      case 1:
+        return __mmask8(0x01);
+      case 2:
+        return __mmask8(0x03);
+      case 3:
+        return __mmask8(0x07);
+      case 4:
+        return __mmask8(0x0F);
+      case 5:
+        return __mmask8(0x1F);
+      case 6:
+        return __mmask8(0x3F);
+      case 7:
+        return __mmask8(0x7F);
+      case 8:
+        return __mmask8(0xFF);
+    }
+    return __mmask8(0);
+  }
+
+  RAJA_INLINE
+  __m512i createStridedOffsets(camp::idx_t stride) const
+  {
+    // Per-lane element offsets: lane k holds k * stride. NOTE(review): _mm512_mullo_epi64 is an AVX512DQ intrinsic — confirm the build guard covers it
+    auto vstride = _mm512_set1_epi64(stride);
+    auto vseq    = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0);
+    return _mm512_mullo_epi64(vstride, vseq);
+  }
+
+public:
+  static constexpr camp::idx_t s_num_elem = 8;
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  // Requires AVX512F; the epi32 zeroing intrinsic clears all 512 bits, so it also zeros the int64 lanes
+  RAJA_INLINE
+  Register() : base_type(), m_value(_mm512_setzero_epi32()) {}
+
+  /*!
+   * @brief Construct from an underlying SIMD register value (explicit)
+   */
+  RAJA_INLINE
+  explicit Register(register_type const& c) : base_type(), m_value(c) {}
+
+  /*!
+   * @brief Copy constructor; duplicates the register value held by c
+   */
+  RAJA_INLINE
+  Register(self_type const& c) : base_type(), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+  self_type& operator=(self_type const& c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+  /*!
+   * @brief Construct from scalar (not explicit, so scalars convert implicitly).
+   * Sets all 8 elements to the same value (broadcast).
+   */
+  // Requires AVX512F (_mm512_set1_epi64)
+  RAJA_INLINE
+  Register(element_type const& c) : base_type(), m_value(_mm512_set1_epi64(c))
+  {}
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   * Falls back to a full-mask _mm512_maskz_loadu_epi64 on compilers that lack _mm512_loadu_epi64.
+   */
+  RAJA_INLINE
+  self_type& load_packed(element_type const* ptr)
+  {
+    // AVX512F
+#if (defined(__GNUC__) && ((__GNUC__ >= 7) && (__GNUC__ <= 10))) ||            \
+    (!defined(SYCL_LANGUAGE_VERSION) &&                                        \
+     defined(__INTEL_LLVM_COMPILER))  // Check for oneapi's icpx.
+    m_value = _mm512_maskz_loadu_epi64(
+        ~0,
+        ptr);  // May cause slowdown due to looping over 8 bytes, one at a time.
+#else
+    m_value =
+        _mm512_loadu_epi64(ptr);  // GNU 7-10 are missing this instruction, as
+                                  // is icpx as of version 2022.2.
+#endif
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type& load_packed_n(element_type const* ptr, camp::idx_t N)
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<int64_t, avx512_register>>;
-
-      using register_policy = avx512_register;
-      using self_type = Register<int64_t, avx512_register>;
-      using element_type = int64_t;
-      using register_type = __m512i;
-
-      using int_vector_type = Register<int64_t, avx512_register>;
-
-
-    private:
-      register_type m_value;
-
-      RAJA_INLINE
-      __mmask8 createMask(camp::idx_t N) const {
-        // Generate a mask
-				switch(N){
-					case 0: return __mmask8(0x00);
-					case 1: return __mmask8(0x01);
-					case 2: return __mmask8(0x03);
-					case 3: return __mmask8(0x07);
-					case 4: return __mmask8(0x0F);
-					case 5: return __mmask8(0x1F);
-					case 6: return __mmask8(0x3F);
-					case 7: return __mmask8(0x7F);
-					case 8: return __mmask8(0xFF);
-				}
-				return __mmask8(0);
-      }
-
-      RAJA_INLINE
-      __m512i createStridedOffsets(camp::idx_t stride) const {
-        // Generate a strided offset list
-				auto vstride = _mm512_set1_epi64(stride);
-				auto vseq = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0);
-				return _mm512_mullo_epi64(vstride, vseq);
-      }
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 8;
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-			// AVX512F
-      RAJA_INLINE
-      Register() : base_type(), m_value(_mm512_setzero_epi32()) {
-      }
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_INLINE
-      explicit Register(register_type const &c) : base_type(), m_value(c) {}
-
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      Register(self_type const &c) : base_type(), m_value(c.m_value) {}
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-      /*!
-       * @brief Construct from scalar.
-       * Sets all elements to same value (broadcast).
-       */
-			// AVX512F
-      RAJA_INLINE
-      Register(element_type const &c) : base_type(), m_value(_mm512_set1_epi64(c)) {}
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
-			  // AVX512F
-        #if (defined(__GNUC__) && ((__GNUC__ >= 7) && (__GNUC__ <= 10))) || \
-            (!defined(SYCL_LANGUAGE_VERSION) && defined(__INTEL_LLVM_COMPILER))  // Check for oneapi's icpx.
-        m_value = _mm512_maskz_loadu_epi64(~0, ptr);  // May cause slowdown due to looping over 8 bytes, one at a time.
-        #else
-        m_value = _mm512_loadu_epi64(ptr);  // GNU 7-10 are missing this instruction, as is icpx as of version 2022.2.
-        #endif
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
-			  // AVX512F
-        m_value = _mm512_mask_loadu_epi64(_mm512_setzero_epi32(), createMask(N), ptr);
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, camp::idx_t stride){
-			  // AVX512F
-        m_value = _mm512_i64gather_epi64(createStridedOffsets(stride),
-				                              ptr,
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t stride, camp::idx_t N){
-				// AVX512F
-        m_value = _mm512_mask_i64gather_epi64(_mm512_setzero_epi32(),
-                                      createMask(N),
-                                      createStridedOffsets(stride),
-                                      ptr,
-                                      sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
-				// AVX512F
-        #if (defined(__GNUC__) && ((__GNUC__ >= 7) && (__GNUC__ <= 10))) || \
-            (!defined(SYCL_LANGUAGE_VERSION) && defined(__INTEL_LLVM_COMPILER))  // Check for oneapi's icpx.
-        _mm512_mask_storeu_epi64(ptr, ~0, m_value);  // May cause slowdown due to looping over 8 bytes, one at a time.
-        #else
-        _mm512_storeu_epi64(ptr, m_value);  // GNU 7-10 are missing this instruction, as is icpx as of version 2022.2.
-        #endif
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
-				// AVX512F
-        _mm512_mask_storeu_epi64(ptr, createMask(N), m_value);
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t stride) const{
-				// AVX512F
-				_mm512_i64scatter_epi64(ptr,
-				                     createStridedOffsets(stride),
-														 m_value,
-														 sizeof(element_type));
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t stride, camp::idx_t N) const{
-				// AVX512F
-				_mm512_mask_i64scatter_epi64(ptr,
-                           				createMask(N),
-				                          createStridedOffsets(stride),
-																	m_value,
-														      sizeof(element_type));
-        return *this;
-      }
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      RAJA_INLINE
-      element_type get(camp::idx_t i) const
-      {return m_value[i];}
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      self_type &set(element_type value, camp::idx_t i)
-      {
-        m_value[i] = value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &value){
-        m_value =  _mm512_set1_epi64(value);
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(_mm512_add_epi64(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(_mm512_sub_epi64(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        return self_type(_mm512_mullo_epi64(m_value, b.m_value));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        // AVX512 does not supply an integer divide, so do it manually
-        return self_type(_mm512_set_epi64(
-            get(7)/b.get(7),
-            get(6)/b.get(6),
-            get(5)/b.get(5),
-            get(4)/b.get(4),
-            get(3)/b.get(3),
-            get(2)/b.get(2),
-            get(1)/b.get(1),
-            get(0)/b.get(0)
-            ));
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, camp::idx_t N ) const {
-        // AVX512 does not supply an integer divide, so do it manually
-        return self_type(_mm512_set_epi64(
-            N >= 8 ? get(7)/b.get(7) : 0,
-            N >= 7 ? get(6)/b.get(6) : 0,
-            N >= 6 ? get(5)/b.get(5) : 0,
-            N >= 5 ? get(4)/b.get(4) : 0,
-            N >= 4 ? get(3)/b.get(3) : 0,
-            N >= 3 ? get(2)/b.get(2) : 0,
-            N >= 2 ? get(1)/b.get(1) : 0,
-            N >= 1 ? get(0)/b.get(0) : 0
-            ));
-      }
-
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      element_type sum() const
-      {
-				return _mm512_reduce_add_epi64(m_value);
-      }
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max() const
-      {
-        return _mm512_reduce_max_epi64(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type max_n(camp::idx_t N) const
-      {
-				return _mm512_mask_reduce_max_epi64(createMask(N), m_value);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmax(self_type a) const
-      {
-        return self_type(_mm512_max_epi64(m_value, a.m_value));
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min() const
-      {
-        return _mm512_reduce_min_epi64(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      element_type min_n(camp::idx_t N) const
-      {
-				return _mm512_mask_reduce_min_epi64(createMask(N), m_value);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      self_type vmin(self_type a) const
-      {
-        return self_type(_mm512_min_epi64(m_value, a.m_value));
-      }
-  };
-
-
-}   // namespace expt
+    // AVX512F
+    m_value =
+        _mm512_mask_loadu_epi64(_mm512_setzero_epi32(), createMask(N), ptr);
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_INLINE
+  self_type& load_strided(element_type const* ptr, camp::idx_t stride)
+  {
+    // AVX512F
+    m_value = _mm512_i64gather_epi64(createStridedOffsets(stride), ptr,
+                                     sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Partially gather a register from a strided memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+  self_type& load_strided_n(element_type const* ptr,
+                            camp::idx_t stride,
+                            camp::idx_t N)
+  {
+    // AVX512F masked gather; masked-off lanes take the zero source value
+    m_value = _mm512_mask_i64gather_epi64(_mm512_setzero_epi32(), createMask(N),
+                                          createStridedOffsets(stride), ptr,
+                                          sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed(element_type* ptr) const
+  {
+    // AVX512F
+#if (defined(__GNUC__) && ((__GNUC__ >= 7) && (__GNUC__ <= 10))) ||            \
+    (!defined(SYCL_LANGUAGE_VERSION) &&                                        \
+     defined(__INTEL_LLVM_COMPILER))  // Check for oneapi's icpx.
+    _mm512_mask_storeu_epi64(ptr, ~0,
+                             m_value);  // May cause slowdown due to looping
+                                        // over 8 bytes, one at a time.
+#else
+    _mm512_storeu_epi64(ptr,
+                        m_value);  // GNU 7-10 are missing this instruction, as
+                                   // is icpx as of version 2022.2.
+#endif
+    return *this;
+  }
+
+  /*!
+   * @brief Store partial register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_packed_n(element_type* ptr, camp::idx_t N) const
+  {
+    // AVX512F masked store; only lanes enabled by createMask(N) are written
+    _mm512_mask_storeu_epi64(ptr, createMask(N), m_value);
+    return *this;
+  }
+
+  /*!
+   * @brief Scatter entire register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_strided(element_type* ptr, camp::idx_t stride) const
+  {
+    // AVX512F scatter using the offsets built by createStridedOffsets(stride)
+    _mm512_i64scatter_epi64(ptr, createStridedOffsets(stride), m_value,
+                            sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Scatter partial register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+  self_type const& store_strided_n(element_type* ptr,
+                                   camp::idx_t stride,
+                                   camp::idx_t N) const
+  {
+    // AVX512F masked scatter; only lanes enabled by createMask(N) are written
+    _mm512_mask_i64scatter_epi64(ptr, createMask(N),
+                                 createStridedOffsets(stride), m_value,
+                                 sizeof(element_type));
+    return *this;
+  }
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i
+   */
+  RAJA_INLINE
+  element_type get(camp::idx_t i) const { return m_value[i]; }
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param i Offset of scalar to set
+   * @param value Value of scalar to set
+   */
+  RAJA_INLINE
+  self_type& set(element_type value, camp::idx_t i)
+  {
+    m_value[i] = value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& broadcast(element_type const& value)
+  {
+    m_value = _mm512_set1_epi64(value);
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& copy(self_type const& src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type add(self_type const& b) const
+  {
+    return self_type(_mm512_add_epi64(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type subtract(self_type const& b) const
+  {
+    return self_type(_mm512_sub_epi64(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type multiply(self_type const& b) const
+  {
+    return self_type(_mm512_mullo_epi64(m_value, b.m_value));
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type divide(self_type const& b) const
+  {
+    // AVX512 does not supply an integer divide, so do it manually
+    return self_type(_mm512_set_epi64(get(7) / b.get(7), get(6) / b.get(6),
+                                      get(5) / b.get(5), get(4) / b.get(4),
+                                      get(3) / b.get(3), get(2) / b.get(2),
+                                      get(1) / b.get(1), get(0) / b.get(0)));
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type divide_n(self_type const& b, camp::idx_t N) const
+  {
+    // AVX512 does not supply an integer divide, so do it manually
+    return self_type(_mm512_set_epi64(
+        N >= 8 ? get(7) / b.get(7) : 0, N >= 7 ? get(6) / b.get(6) : 0,
+        N >= 6 ? get(5) / b.get(5) : 0, N >= 5 ? get(4) / b.get(4) : 0,
+        N >= 4 ? get(3) / b.get(3) : 0, N >= 3 ? get(2) / b.get(2) : 0,
+        N >= 2 ? get(1) / b.get(1) : 0, N >= 1 ? get(0) / b.get(0) : 0));
+  }
+
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vector's scalar elements
+   */
+  RAJA_INLINE
+  element_type sum() const { return _mm512_reduce_add_epi64(m_value); }
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type max() const { return _mm512_reduce_max_epi64(m_value); }
+
+  /*!
+   * @brief Returns the largest of the elements selected by createMask(N)
+   * @return The largest scalar element among the mask-selected lanes
+   */
+  RAJA_INLINE
+  element_type max_n(camp::idx_t N) const
+  {
+    return _mm512_mask_reduce_max_epi64(createMask(N), m_value);
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+  self_type vmax(self_type a) const
+  {
+    return self_type(_mm512_max_epi64(m_value, a.m_value));
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+  element_type min() const { return _mm512_reduce_min_epi64(m_value); }
+
+  /*!
+   * @brief Returns the smallest of the elements selected by createMask(N)
+   * @return The smallest scalar element among the mask-selected lanes
+   */
+  RAJA_INLINE
+  element_type min_n(camp::idx_t N) const
+  {
+    return _mm512_mask_reduce_min_epi64(createMask(N), m_value);
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+  self_type vmin(self_type a) const
+  {
+    return self_type(_mm512_min_epi64(m_value, a.m_value));
+  }
+};
+
+
+}  // namespace expt
 
 }  // namespace RAJA
 
 
 #endif
 
-#endif //__AVX512F__
+#endif  //__AVX512F__
diff --git a/include/RAJA/policy/tensor/arch/avx512/traits.hpp b/include/RAJA/policy/tensor/arch/avx512/traits.hpp
index b2b5cf6731..33689b2095 100644
--- a/include/RAJA/policy/tensor/arch/avx512/traits.hpp
+++ b/include/RAJA/policy/tensor/arch/avx512/traits.hpp
@@ -21,53 +21,59 @@
 #ifndef RAJA_policy_tensor_arch_avx512_traits_HPP
 #define RAJA_policy_tensor_arch_avx512_traits_HPP
 
-namespace RAJA {
-namespace internal {
-namespace expt {
+namespace RAJA
+{
+namespace internal
+{
+namespace expt
+{
 
 
-  template<>
-  struct RegisterTraits<RAJA::expt::avx512_register, int32_t>{
-      using element_type = int32_t;
-      using register_policy = RAJA::expt::avx512_register;
-      static constexpr camp::idx_t s_num_bits = 512;
-      static constexpr camp::idx_t s_num_elem = 16;
-      using int_element_type = int32_t;
-  };
+template<>
+struct RegisterTraits<RAJA::expt::avx512_register, int32_t>
+{
+  using element_type                      = int32_t;
+  using register_policy                   = RAJA::expt::avx512_register;
+  static constexpr camp::idx_t s_num_bits = 512;
+  static constexpr camp::idx_t s_num_elem = 16;
+  using int_element_type                  = int32_t;
+};
 
-  template<>
-  struct RegisterTraits<RAJA::expt::avx512_register, int64_t>{
-      using element_type = int64_t;
-      using register_policy = RAJA::expt::avx512_register;
-      static constexpr camp::idx_t s_num_bits = 512;
-      static constexpr camp::idx_t s_num_elem = 8;
-      using int_element_type = int64_t;
-  };
+template<>
+struct RegisterTraits<RAJA::expt::avx512_register, int64_t>
+{
+  using element_type                      = int64_t;
+  using register_policy                   = RAJA::expt::avx512_register;
+  static constexpr camp::idx_t s_num_bits = 512;
+  static constexpr camp::idx_t s_num_elem = 8;
+  using int_element_type                  = int64_t;
+};
 
-  template<>
-  struct RegisterTraits<RAJA::expt::avx512_register, float>{
-      using element_type = float;
-      using register_policy = RAJA::expt::avx512_register;
-      static constexpr camp::idx_t s_num_bits = 512;
-      static constexpr camp::idx_t s_num_elem = 16;
-      using int_element_type = int32_t;
-  };
+template<>
+struct RegisterTraits<RAJA::expt::avx512_register, float>
+{
+  using element_type                      = float;
+  using register_policy                   = RAJA::expt::avx512_register;
+  static constexpr camp::idx_t s_num_bits = 512;
+  static constexpr camp::idx_t s_num_elem = 16;
+  using int_element_type                  = int32_t;
+};
 
-  template<>
-  struct RegisterTraits<RAJA::expt::avx512_register, double>{
-      using element_type = double;
-      using register_policy = RAJA::expt::avx512_register;
-      static constexpr camp::idx_t s_num_bits = 512;
-      static constexpr camp::idx_t s_num_elem = 8;
-      using int_element_type = int64_t;
-  };
+template<>
+struct RegisterTraits<RAJA::expt::avx512_register, double>
+{
+  using element_type                      = double;
+  using register_policy                   = RAJA::expt::avx512_register;
+  static constexpr camp::idx_t s_num_bits = 512;
+  static constexpr camp::idx_t s_num_elem = 8;
+  using int_element_type                  = int64_t;
+};
 
-} // namespace internal
-} // namespace expt
-} // namespace RAJA
+}  // namespace expt
+}  // namespace internal
+}  // namespace RAJA
 
-#endif // guard
+#endif  // guard
 
 
-
-#endif // __AVX512F__
+#endif  // __AVX512F__
diff --git a/include/RAJA/policy/tensor/arch/cuda.hpp b/include/RAJA/policy/tensor/arch/cuda.hpp
index a840c63d85..cfda807e68 100644
--- a/include/RAJA/policy/tensor/arch/cuda.hpp
+++ b/include/RAJA/policy/tensor/arch/cuda.hpp
@@ -21,11 +21,11 @@
 #ifndef RAJA_policy_tensor_arch_cuda_HPP
 #define RAJA_policy_tensor_arch_cuda_HPP
 
-#include<RAJA/policy/tensor/arch/cuda/traits.hpp>
-#include<RAJA/policy/tensor/arch/cuda/cuda_warp.hpp>
+#include <RAJA/policy/tensor/arch/cuda/traits.hpp>
+#include <RAJA/policy/tensor/arch/cuda/cuda_warp.hpp>
 
 
 #endif
 
 
-#endif // RAJA_ENABLE_CUDA
+#endif  // RAJA_ENABLE_CUDA
diff --git a/include/RAJA/policy/tensor/arch/cuda/cuda_warp.hpp b/include/RAJA/policy/tensor/arch/cuda/cuda_warp.hpp
index e23eb92bed..4ea3aa6fd1 100644
--- a/include/RAJA/policy/tensor/arch/cuda/cuda_warp.hpp
+++ b/include/RAJA/policy/tensor/arch/cuda/cuda_warp.hpp
@@ -29,982 +29,1045 @@
 #ifndef RAJA_policy_tensor_arch_cuda_cuda_warp_register_HPP
 #define RAJA_policy_tensor_arch_cuda_cuda_warp_register_HPP
 
-
-
 namespace RAJA
 {
 namespace expt
 {
 
-  template<typename ELEMENT_TYPE>
-  class Register<ELEMENT_TYPE, cuda_warp_register> :
-    public internal::expt::RegisterBase<Register<ELEMENT_TYPE, cuda_warp_register>>
-  {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<ELEMENT_TYPE, cuda_warp_register>>;
-
-      using register_policy = cuda_warp_register;
-      using self_type = Register<ELEMENT_TYPE, cuda_warp_register>;
-      using element_type = ELEMENT_TYPE;
-      using register_type = ELEMENT_TYPE;
-
-      using int_vector_type = Register<int64_t, cuda_warp_register>;
-
-
-		private:
-      element_type m_value;
+template<typename ELEMENT_TYPE>
+class Register<ELEMENT_TYPE, cuda_warp_register>
+    : public internal::expt::RegisterBase<
+          Register<ELEMENT_TYPE, cuda_warp_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<ELEMENT_TYPE, cuda_warp_register>>;
+
+  using register_policy = cuda_warp_register;
+  using self_type       = Register<ELEMENT_TYPE, cuda_warp_register>;
+  using element_type    = ELEMENT_TYPE;
+  using register_type   = ELEMENT_TYPE;
+
+  using int_vector_type = Register<int64_t, cuda_warp_register>;
+
+
+private:
+  element_type m_value;
+
+public:
+  static constexpr int s_num_elem = 32;
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  constexpr Register() : base_type(), m_value(0) {}
+
+  /*!
+   * @brief Construct from a raw element value, stored directly in m_value
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  constexpr Register(element_type c) : base_type(), m_value(c) {}
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  constexpr Register(self_type const& c) : base_type(), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type& operator=(self_type const& c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+  RAJA_INLINE
 
-		public:
-
-      static constexpr int s_num_elem = 32;
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      constexpr
-      Register() : base_type(), m_value(0) {
-
-      }
-
-
-      /*!
-       * @brief Copy constructor from raw value
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      constexpr
-      Register(element_type c) : base_type(), m_value(c) {}
-
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      constexpr
-      Register(self_type const &c) : base_type(), m_value(c.m_value) {}
-
-
-      /*!
-       * @brief Copy assignment operator
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &operator=(element_type c){
-        m_value = c;
-        return *this;
-      }
-
-      /*!
-       * @brief Gets our warp lane
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      constexpr
-      static
-      int get_lane() {
-        return threadIdx.x;
-      }
-
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      constexpr
-      element_type const &get_raw_value() const {
-        return m_value;
-      }
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      element_type &get_raw_value() {
-        return m_value;
-      }
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      static
-      constexpr
-      bool is_root() {
-        return get_lane() == 0;
-      }
-
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &load_packed(element_type const *ptr){
-
-        auto lane = get_lane();
-
-        m_value = ptr[lane];
-
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &load_packed_n(element_type const *ptr, int N){
-        auto lane = get_lane();
-        if(lane < N){
-          m_value = ptr[lane];
-        }
-        else{
-          m_value = element_type(0);
-        }
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &load_strided(element_type const *ptr, int stride){
-
-        auto lane = get_lane();
-
-        m_value = ptr[stride*lane];
-
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &load_strided_n(element_type const *ptr, int stride, int N){
-        auto lane = get_lane();
-
-        if(lane < N){
-          m_value = ptr[stride*lane];
-        }
-        else{
-          m_value = element_type(0);
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Generic gather operation for full vector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &gather(element_type const *ptr, int_vector_type offsets){
-
-        m_value = ptr[offsets.get_raw_value()];
-
-        return *this;
-      }
-
-      /*!
-       * @brief Generic gather operation for n-length subvector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &gather_n(element_type const *ptr, int_vector_type offsets, camp::idx_t N){
-        if(get_lane() < N){
-          m_value = ptr[offsets.get_raw_value()];
-        }
-        else{
-          m_value = element_type(0);
-        }
-
-        return *this;
-      }
-
-
-      /*!
-       * @brief Generic segmented load operation used for loading sub-matrices
-       * from larger arrays.
-       *
-       * The default operation combines the s_segmented_offsets and gather
-       * operations.
-       *
-       *
-       */
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type &segmented_load(element_type const *ptr, camp::idx_t segbits, camp::idx_t stride_inner, camp::idx_t stride_outer){
-        auto lane = get_lane();
-
-        // compute segment and segment_size
-        auto seg = lane >> segbits;
-        auto i = lane & ((1<<segbits)-1);
-
-        m_value = ptr[seg*stride_outer + i*stride_inner];
-
-        return *this;
-      }
-
-      /*!
-       * @brief Generic segmented load operation used for loading sub-matrices
-       * from larger arrays where we load partial segments.
-       *
-       *
-       *
-       */
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type &segmented_load_nm(element_type const *ptr, camp::idx_t segbits,
-          camp::idx_t stride_inner, camp::idx_t stride_outer,
-          camp::idx_t num_inner, camp::idx_t num_outer)
-      {
-        auto lane = get_lane();
-
-        // compute segment and segment_size
-        auto seg = lane >> segbits;
-        auto i = lane & ((1<<segbits)-1);
-
-        if(seg >= num_outer || i >= num_inner){
-          m_value = element_type(0);
-        }
-        else{
-          m_value = ptr[seg*stride_outer + i*stride_inner];
-        }
-
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type const &store_packed(element_type *ptr) const{
-
-        auto lane = get_lane();
-
-        ptr[lane] = m_value;
-
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type const &store_packed_n(element_type *ptr, int N) const{
-
-        auto lane = get_lane();
-
-        if(lane < N){
-          ptr[lane] = m_value;
-        }
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type const &store_strided(element_type *ptr, int stride) const{
-
-        auto lane = get_lane();
-
-        ptr[lane*stride] = m_value;
-
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type const &store_strided_n(element_type *ptr, int stride, int N) const{
-
-        auto lane = get_lane();
-
-        if(lane < N){
-          ptr[lane*stride] = m_value;
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Generic scatter operation for full vector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be stored relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      template<typename T2>
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type const &scatter(element_type *ptr, T2 const &offsets) const {
-
-        ptr[offsets.get_raw_value()] = m_value;
-
-        return *this;
-      }
-
-      /*!
-       * @brief Generic scatter operation for n-length subvector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be stored relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      template<typename T2>
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type const &scatter_n(element_type *ptr, T2 const &offsets, camp::idx_t N) const {
-        if(get_lane() < N){
-          ptr[offsets.get_raw_value()] = m_value;
-        }
-
-        return *this;
-      }
-
-      /*!
-       * @brief Generic segmented store operation used for storing sub-matrices
-       * to larger arrays.
-       *
-       */
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type const &segmented_store(element_type *ptr, camp::idx_t segbits, camp::idx_t stride_inner, camp::idx_t stride_outer) const {
-        auto lane = get_lane();
-
-        // compute segment and segment_size
-        auto seg = lane >> segbits;
-        auto i = lane & ((1<<segbits)-1);
-
-        ptr[seg*stride_outer + i*stride_inner] = m_value;
-
-        return *this;
-      }
-
-      /*!
-       * @brief Generic segmented store operation used for storing sub-matrices
-       * to larger arrays where we store partial segments.
-       *
-       */
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type const &segmented_store_nm(element_type *ptr, camp::idx_t segbits,
-          camp::idx_t stride_inner, camp::idx_t stride_outer,
-          camp::idx_t num_inner, camp::idx_t num_outer) const
-      {
-        auto lane = get_lane();
-
-        // compute segment and segment_size
-        auto seg = lane >> segbits;
-        auto i = lane & ((1<<segbits)-1);
-
-        if(seg >= num_outer || i >= num_inner){
-          // nop
-        }
-        else{
-          ptr[seg*stride_outer + i*stride_inner] = m_value;
-        }
-
-        return *this;
-      }
-
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      constexpr
-      RAJA_INLINE
-      RAJA_DEVICE
-      element_type get(int i) const
-			{
-        return  __shfl_sync(0xffffffff, m_value, i, 32);
-			}
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &set(element_type value, int i)
-			{
-				auto lane = get_lane();
-      	if(lane == i){
-					m_value = value;
-				}
-        return *this;
-			}
-
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &a){
-        m_value = a;
-        return *this;
-      }
-
-      /*!
-       * @brief Extracts a scalar value and broadcasts to a new register
-       */
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type get_and_broadcast(int i) const {
-        self_type x;
-        x.m_value = __shfl_sync(0xffffffff, m_value, i, 32);
-        return x;
-      }
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(m_value + b.m_value);
-      }
-
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(m_value - b.m_value);
-      }
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        return self_type(m_value * b.m_value);
-      }
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        return self_type(m_value / b.m_value);
-      }
-
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, int N) const {
-        return get_lane() < N ? self_type(m_value / b.m_value) : self_type(element_type(0));
-      }
-
-      /**
-       * floats and doubles use the CUDA instrinsic FMA
-       */
-      template<typename RETURN_TYPE = self_type>
-      RAJA_DEVICE
-      RAJA_INLINE
+  RAJA_DEVICE
+  self_type& operator=(element_type c)
+  {
+    m_value = c;
+    return *this;
+  }
+
+  /*!
+   * @brief Gets our warp lane
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  constexpr static int get_lane() { return threadIdx.x; }
+
+  RAJA_DEVICE
+
+  RAJA_INLINE
+  constexpr element_type const& get_raw_value() const { return m_value; }
+
+  RAJA_DEVICE
+
+  RAJA_INLINE
+  element_type& get_raw_value() { return m_value; }
+
+  RAJA_DEVICE
+
+  RAJA_INLINE
+  static constexpr bool is_root() { return get_lane() == 0; }
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type& load_packed(element_type const* ptr)
+  {
+
+    auto lane = get_lane();
+
+    m_value = ptr[lane];
+
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type& load_packed_n(element_type const* ptr, int N)
+  {
+    auto lane = get_lane();
+    if (lane < N)
+    {
+      m_value = ptr[lane];
+    }
+    else
+    {
+      m_value = element_type(0);
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type& load_strided(element_type const* ptr, int stride)
+  {
+
+    auto lane = get_lane();
+
+    m_value = ptr[stride * lane];
+
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a strided memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type& load_strided_n(element_type const* ptr, int stride, int N)
+  {
+    auto lane = get_lane();
+
+    if (lane < N)
+    {
+      m_value = ptr[stride * lane];
+    }
+    else
+    {
+      m_value = element_type(0);
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Generic gather operation for full vector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type& gather(element_type const* ptr, int_vector_type offsets)
+  {
+
+    m_value = ptr[offsets.get_raw_value()];
+
+    return *this;
+  }
+
+  /*!
+   * @brief Generic gather operation for n-length subvector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type& gather_n(element_type const* ptr,
+                      int_vector_type offsets,
+                      camp::idx_t N)
+  {
+    if (get_lane() < N)
+    {
+      m_value = ptr[offsets.get_raw_value()];
+    }
+    else
+    {
+      m_value = element_type(0);
+    }
+
+    return *this;
+  }
+
+  /*!
+   * @brief Generic segmented load operation used for loading sub-matrices
+   * from larger arrays.
+   *
+   * The default operation combines the s_segmented_offsets and gather
+   * operations.
+   *
+   *
+   */
+  RAJA_DEVICE
+
+  RAJA_INLINE
+  self_type& segmented_load(element_type const* ptr,
+                            camp::idx_t segbits,
+                            camp::idx_t stride_inner,
+                            camp::idx_t stride_outer)
+  {
+    auto lane = get_lane();
+
+    // compute segment and segment_size
+    auto seg = lane >> segbits;
+    auto i   = lane & ((1 << segbits) - 1);
+
+    m_value = ptr[seg * stride_outer + i * stride_inner];
+
+    return *this;
+  }
+
+  /*!
+   * @brief Generic segmented load operation used for loading sub-matrices
+   * from larger arrays where we load partial segments.
+   *
+   *
+   *
+   */
+  RAJA_DEVICE
+
+  RAJA_INLINE
+  self_type& segmented_load_nm(element_type const* ptr,
+                               camp::idx_t segbits,
+                               camp::idx_t stride_inner,
+                               camp::idx_t stride_outer,
+                               camp::idx_t num_inner,
+                               camp::idx_t num_outer)
+  {
+    auto lane = get_lane();
+
+    // compute segment and segment_size
+    auto seg = lane >> segbits;
+    auto i   = lane & ((1 << segbits) - 1);
+
+    if (seg >= num_outer || i >= num_inner)
+    {
+      m_value = element_type(0);
+    }
+    else
+    {
+      m_value = ptr[seg * stride_outer + i * stride_inner];
+    }
+
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type const& store_packed(element_type* ptr) const
+  {
+
+    auto lane = get_lane();
+
+    ptr[lane] = m_value;
+
+    return *this;
+  }
+
+  /*!
+   * @brief Store partial register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type const& store_packed_n(element_type* ptr, int N) const
+  {
+
+    auto lane = get_lane();
+
+    if (lane < N)
+    {
+      ptr[lane] = m_value;
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type const& store_strided(element_type* ptr, int stride) const
+  {
+
+    auto lane = get_lane();
+
+    ptr[lane * stride] = m_value;
+
+    return *this;
+  }
+
+  /*!
+   * @brief Store partial register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type const& store_strided_n(element_type* ptr, int stride, int N) const
+  {
+
+    auto lane = get_lane();
+
+    if (lane < N)
+    {
+      ptr[lane * stride] = m_value;
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Generic scatter operation for full vector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be stored relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  template<typename T2>
+  RAJA_DEVICE RAJA_INLINE self_type const& scatter(element_type* ptr,
+                                                   T2 const& offsets) const
+  {
+
+    ptr[offsets.get_raw_value()] = m_value;
+
+    return *this;
+  }
+
+  /*!
+   * @brief Generic scatter operation for n-length subvector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be stored relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  template<typename T2>
+  RAJA_DEVICE RAJA_INLINE self_type const& scatter_n(element_type* ptr,
+                                                     T2 const& offsets,
+                                                     camp::idx_t N) const
+  {
+    if (get_lane() < N)
+    {
+      ptr[offsets.get_raw_value()] = m_value;
+    }
+
+    return *this;
+  }
+
+  /*!
+   * @brief Generic segmented store operation used for storing sub-matrices
+   * to larger arrays.
+   *
+   */
+  RAJA_DEVICE
+
+  RAJA_INLINE
+  self_type const& segmented_store(element_type* ptr,
+                                   camp::idx_t segbits,
+                                   camp::idx_t stride_inner,
+                                   camp::idx_t stride_outer) const
+  {
+    auto lane = get_lane();
+
+    // compute segment and segment_size
+    auto seg = lane >> segbits;
+    auto i   = lane & ((1 << segbits) - 1);
+
+    ptr[seg * stride_outer + i * stride_inner] = m_value;
+
+    return *this;
+  }
+
+  /*!
+   * @brief Generic segmented store operation used for storing sub-matrices
+   * to larger arrays where we store partial segments.
+   *
+   */
+  RAJA_DEVICE
+
+  RAJA_INLINE
+  self_type const& segmented_store_nm(element_type* ptr,
+                                      camp::idx_t segbits,
+                                      camp::idx_t stride_inner,
+                                      camp::idx_t stride_outer,
+                                      camp::idx_t num_inner,
+                                      camp::idx_t num_outer) const
+  {
+    auto lane = get_lane();
+
+    // compute segment and segment_size
+    auto seg = lane >> segbits;
+    auto i   = lane & ((1 << segbits) - 1);
+
+    if (seg >= num_outer || i >= num_inner)
+    {
+      // nop
+    }
+    else
+    {
+      ptr[seg * stride_outer + i * stride_inner] = m_value;
+    }
+
+    return *this;
+  }
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get
+   * @return Returns scalar value at i
+   */
+  constexpr RAJA_INLINE RAJA_DEVICE element_type get(int i) const
+  {
+    return __shfl_sync(0xffffffff, m_value, i, 32);
+  }
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param i Offset of scalar to set
+   * @param value Value of scalar to set
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type& set(element_type value, int i)
+  {
+    auto lane = get_lane();
+    if (lane == i)
+    {
+      m_value = value;
+    }
+    return *this;
+  }
+
+  RAJA_DEVICE
+
+  RAJA_INLINE
+  self_type& broadcast(element_type const& a)
+  {
+    m_value = a;
+    return *this;
+  }
+
+  /*!
+   * @brief Extracts a scalar value and broadcasts to a new register
+   */
+  RAJA_DEVICE
+
+  RAJA_INLINE
+  self_type get_and_broadcast(int i) const
+  {
+    self_type x;
+    x.m_value = __shfl_sync(0xffffffff, m_value, i, 32);
+    return x;
+  }
+
+  RAJA_DEVICE
+
+  RAJA_INLINE
+  self_type& copy(self_type const& src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_DEVICE
+
+  RAJA_INLINE
+  self_type add(self_type const& b) const
+  {
+    return self_type(m_value + b.m_value);
+  }
+
+  RAJA_DEVICE
+
+  RAJA_INLINE
+  self_type subtract(self_type const& b) const
+  {
+    return self_type(m_value - b.m_value);
+  }
+
+  RAJA_DEVICE
+
+  RAJA_INLINE
+  self_type multiply(self_type const& b) const
+  {
+    return self_type(m_value * b.m_value);
+  }
+
+  RAJA_DEVICE
+
+  RAJA_INLINE
+  self_type divide(self_type const& b) const
+  {
+    return self_type(m_value / b.m_value);
+  }
+
+  RAJA_DEVICE
+
+  RAJA_INLINE
+  self_type divide_n(self_type const& b, int N) const
+  {
+    return get_lane() < N ? self_type(m_value / b.m_value)
+                          : self_type(element_type(0));
+  }
+
+  /**
+   * floats and doubles use the CUDA intrinsic FMA
+   */
+  template<typename RETURN_TYPE = self_type>
+  RAJA_DEVICE RAJA_INLINE
       typename std::enable_if<!std::numeric_limits<element_type>::is_integer,
-      RETURN_TYPE>::type
-      multiply_add(self_type const &b, self_type const &c) const
-      {
-        return self_type(fma(m_value, b.m_value, c.m_value));
-      }
-
-      /**
-       * int32 and int64 don't have a CUDA intrinsic FMA, do unfused ops
-       */
-      template<typename RETURN_TYPE = self_type>
-      RAJA_DEVICE
-      RAJA_INLINE
+                              RETURN_TYPE>::type
+      multiply_add(self_type const& b, self_type const& c) const
+  {
+    return self_type(fma(m_value, b.m_value, c.m_value));
+  }
+
+  /**
+   * int32 and int64 don't have a CUDA intrinsic FMA, do unfused ops
+   */
+  template<typename RETURN_TYPE = self_type>
+  RAJA_DEVICE RAJA_INLINE
       typename std::enable_if<std::numeric_limits<element_type>::is_integer,
-      RETURN_TYPE>::type
-      multiply_add(self_type const &b, self_type const &c) const
-      {
-        return self_type(m_value * b.m_value + c.m_value);
-      }
-
-      /**
-       * floats and doubles use the CUDA instrinsic FMS
-       */
-      template<typename RETURN_TYPE = self_type>
-      RAJA_DEVICE
-      RAJA_INLINE
+                              RETURN_TYPE>::type
+      multiply_add(self_type const& b, self_type const& c) const
+  {
+    return self_type(m_value * b.m_value + c.m_value);
+  }
+
+  /**
+   * floats and doubles compute FMS via the CUDA intrinsic FMA with c negated
+   */
+  template<typename RETURN_TYPE = self_type>
+  RAJA_DEVICE RAJA_INLINE
       typename std::enable_if<!std::numeric_limits<element_type>::is_integer,
-      RETURN_TYPE>::type
-      multiply_subtract(self_type const &b, self_type const &c) const
-      {
-        return self_type(fma(m_value, b.m_value, -c.m_value));
-      }
-
-      /**
-       * int32 and int64 don't have a CUDA intrinsic FMS, do unfused ops
-       */
-      template<typename RETURN_TYPE = self_type>
-      RAJA_DEVICE
-      RAJA_INLINE
+                              RETURN_TYPE>::type
+      multiply_subtract(self_type const& b, self_type const& c) const
+  {
+    return self_type(fma(m_value, b.m_value, -c.m_value));
+  }
+
+  /**
+   * int32 and int64 don't have a CUDA intrinsic FMS, do unfused ops
+   */
+  template<typename RETURN_TYPE = self_type>
+  RAJA_DEVICE RAJA_INLINE
       typename std::enable_if<std::numeric_limits<element_type>::is_integer,
-      RETURN_TYPE>::type
-      multiply_subtract(self_type const &b, self_type const &c) const
-      {
-        return self_type(m_value * b.m_value - c.m_value);
-      }
-
-
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      element_type sum() const
-      {
-				// Allreduce sum
-				using combiner_t = RAJA::reduce::detail::op_adapter<element_type, RAJA::operators::plus>;
-
-				return RAJA::cuda::impl::warp_allreduce<combiner_t, element_type>(m_value);
-      }
-
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      element_type max() const
-      {
-        // Allreduce maximum
-        using combiner_t = RAJA::reduce::detail::op_adapter<element_type, RAJA::operators::maximum>;
-
-        return RAJA::cuda::impl::warp_allreduce<combiner_t, element_type>(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      element_type max_n(int N) const
-      {
-        // Allreduce maximum
-        using combiner_t = RAJA::reduce::detail::op_adapter<element_type, RAJA::operators::maximum>;
-
-        auto ident = RAJA::operators::limits<element_type>::min();
-        auto lane = get_lane();
-        auto value = lane < N ? m_value : ident;
-        return RAJA::cuda::impl::warp_allreduce<combiner_t, element_type>(value);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type vmax(self_type a) const
-      {
-        return self_type{RAJA::max<element_type>(m_value, a.m_value)};
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      element_type min() const
-      {
-        // Allreduce minimum
-        using combiner_t = RAJA::reduce::detail::op_adapter<element_type, RAJA::operators::minimum>;
-
-        return RAJA::cuda::impl::warp_allreduce<combiner_t, element_type>(m_value);
-
-      }
-
-      /*!
-       * @brief Returns the largest element from first N lanes
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      element_type min_n(int N) const
-      {
-        // Allreduce minimum
-        using combiner_t = RAJA::reduce::detail::op_adapter<element_type, RAJA::operators::minimum>;
-
-        auto ident = RAJA::operators::limits<element_type>::max();
-        auto lane = get_lane();
-        auto value = lane < N ? m_value : ident;
-        return RAJA::cuda::impl::warp_allreduce<combiner_t, element_type>(value);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type vmin(self_type a) const
-      {
-        return self_type{RAJA::min<element_type>(m_value, a.m_value)};
-      }
-
-
-
-
-      /*!
-       * Provides gather/scatter indices for segmented loads and stores
-       *
-       * THe number of segment bits (segbits) is specified, as well as the
-       * stride between elements in a segment (stride_inner),
-       * and the stride between segments (stride_outer)
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      static
-      int_vector_type s_segmented_offsets(camp::idx_t segbits, camp::idx_t stride_inner, camp::idx_t stride_outer)
-      {
-        int_vector_type result;
-
-        auto lane = get_lane();
-
-        // compute segment and segment_size
-        auto seg = lane >> segbits;
-        auto i = lane & ((1<<segbits)-1);
-
-        result.get_raw_value() = seg*stride_outer + i*stride_inner;
-
-        return result;
-      }
-
-
-      /*!
-       * Sum elements within each segment, with segment size defined by segbits.
-       * Stores each segments sum consecutively, but shifed to the
-       * corresponding output_segment slot.
-       *
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=0 is equivalent to the input vector,  since there are 8
-       *      outputs, there is only 1 output segment
-       *
-       *      Result= x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=1 sums neighboring pairs of values.  There are 4 output,
-       *      so there are possible output segments.
-       *
-       *      output_segment=0:
-       *      Result= x0+x1, x2+x3, x4+x5, x6+x7, 0, 0, 0, 0
-       *
-       *      output_segment=1:
-       *      Result= 0, 0, 0, 0, x0+x1, x2+x3, x4+x5, x6+x7
-       *
-       *  and so on up to segbits=3, which is a full sum of x0..x7, and the
-       *      output_segment denotes the vector position of the sum
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type segmented_sum_inner(camp::idx_t segbits, camp::idx_t output_segment) const
-      {
-
-        // First: tree reduce values within each segment
-        element_type x = m_value;
-        RAJA_UNROLL
-        for(int delta = 1;delta < 1<<segbits;delta = delta<<1){
-
-          // tree shuffle
-          element_type y = __shfl_sync(0xffffffff, x, get_lane()+delta);
-
-          // reduce
-          x += y;
-        }
-
-        // Second: send result to output segment lanes
-        self_type result;
-        result.get_raw_value() = __shfl_sync(0xffffffff, x, get_lane()<<segbits);
-
-        // Third: mask off everything but output_segment
-        //        this is because all output segments are valid at this point
-        // (5-segbits), the 5 is since the warp-width is 32 == 1<<5
-        int our_output_segment = get_lane()>>(5-segbits);
-        bool in_output_segment = our_output_segment == output_segment;
-        if(!in_output_segment){
-          result.get_raw_value() = 0;
-        }
-
-        return result;
-      }
-
-      /*!
-       * Sum across segments, with segment size defined by segbits
-       *
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=0 is equivalent to the input vector,  since there are 8
-       *      outputs, there is only 1 output segment
-       *
-       *      Result= x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=1 sums strided pairs of values.  There are 4 output,
-       *      so there are possible output segments.
-       *
-       *      output_segment=0:
-       *      Result= x0+x4, x1+x5, x2+x6, x3+x7, 0, 0, 0, 0
-       *
-       *      output_segment=1:
-       *      Result= 0, 0, 0, 0, x0+x4, x1+x5, x2+x6, x3+x7
-       *
-       *  and so on up to segbits=3, which is a full sum of x0..x7, and the
-       *      output_segment denotes the vector position of the sum
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type segmented_sum_outer(camp::idx_t segbits, camp::idx_t output_segment) const
-      {
-
-        // First: tree reduce values within each segment
-        element_type x = m_value;
-        RAJA_UNROLL
-        for(int i = 0;i < 5-segbits; ++ i){
-
-          // tree shuffle
-          int delta = s_num_elem >> (i+1);
-          element_type y = __shfl_sync(0xffffffff, x, get_lane()+delta);
-
-          // reduce
-          x += y;
-        }
-
-        // Second: send result to output segment lanes
-        self_type result;
-        int get_from = get_lane()&( (1<<segbits)-1);
-        result.get_raw_value() = __shfl_sync(0xffffffff, x, get_from);
-
-        int mask = (get_lane()>>segbits) == output_segment;
-
-
-        // Third: mask off everything but output_segment
-        if(!mask){
-          result.get_raw_value() = 0;
-        }
-
-        return result;
-      }
-
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type segmented_divide_nm(self_type den, camp::idx_t segbits, camp::idx_t num_inner, camp::idx_t num_outer) const
-      {
-        self_type result;
-
-        auto lane = get_lane();
-
-        // compute segment and segment_size
-        auto seg = lane >> segbits;
-        auto i = lane & ((1<<segbits)-1);
-
-        if(seg >= num_outer || i >= num_inner){
-          // nop
-        }
-        else{
-          result.get_raw_value() = m_value / den.get_raw_value();
-        }
-
-        return result;
-      }
-
-
-      /*!
-       * Segmented broadcast copies a segment to all output segments of a vector
-       *
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=0 means the input segment size is 1, so this selects the
-       *      value at x[input_segmnet] and broadcasts it to the rest of the
-       *      vector
-       *
-       *      input segments allowed are from 0 to 7, inclusive
-       *
-       *      input_segment=0
-       *      Result= x0, x0, x0, x0, x0, x0, x0, x0
-       *
-       *      input_segment=5
-       *      Result= x5, x5, x5, x5, x5, x5, x5, x5
-       *
-       *  segbits=1 means that the input segments are each pair of x values:
-       *
-       *      input segments allowed are from 0 to 3, inclusive
-       *
-       *      output_segment=0:
-       *      Result= x0, x1, x0, x1, x0, x1, x0, x1
-       *
-       *      output_segment=1:
-       *      Result= x2, x3, x2, x3, x2, x3, x2, x3
-       *
-       *      output_segment=3:
-       *      Result= x6, x7, x6, x7, x6, x7, x6, x7
-       *
-       *  and so on up to segbits=2, the input segments are 4 wide:
-       *
-       *      input segments allowed are from 0 or 1
-       *
-       *      output_segment=0:
-       *      Result= x0, x1, x2, x3, x0, x1, x2, x3
-       *
-       *      output_segment=1:
-       *      Result= x4, x5, x6, x7, x4, x5, x6, x7
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type segmented_broadcast_inner(camp::idx_t segbits, camp::idx_t input_segment) const
-      {
-        self_type result;
-
-        camp::idx_t mask = (1<<segbits)-1;
-        camp::idx_t offset = input_segment << segbits;
-
-
-        camp::idx_t i = (get_lane()&mask) + offset;
-
-        result.get_raw_value() = __shfl_sync(0xffffffff, m_value, i);
-
-
-        return result;
-      }
-
-
-      /*!
-       * Segmented broadcast spreads a segment to all output segments of a vector
-       *
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=0 means the input segment size is 1, so this selects the
-       *      value at x[input_segmnet] and broadcasts it to the rest of the
-       *      vector
-       *
-       *      input segments allowed are from 0 to 7, inclusive
-       *
-       *      input_segment=0
-       *      Result= x0, x0, x0, x0, x0, x0, x0, x0
-       *
-       *      input_segment=5
-       *      Result= x5, x5, x5, x5, x5, x5, x5, x5
-       *
-       *  segbits=1 means that the input segments are each pair of x values:
-       *
-       *      input segments allowed are from 0 to 3, inclusive
-       *
-       *      output_segment=0:
-       *      Result= x0, x0, x0, x0, x1, x1, x1, x1
-       *
-       *      output_segment=1:
-       *      Result= x2, x2, x2, x2, x3, x3, x3, x3
-       *
-       *      output_segment=3:
-       *      Result= x6, x6, x6, x6, x7, x7, x7, x7
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type segmented_broadcast_outer(camp::idx_t segbits, camp::idx_t input_segment) const
-      {
-        self_type result;
-
-        camp::idx_t offset = input_segment * (self_type::s_num_elem >> segbits);
-
-        camp::idx_t i = (get_lane() >> segbits) + offset;
-
-        result.get_raw_value() = __shfl_sync(0xffffffff, m_value, i);
-
-        return result;
-      }
-
-
-
-
-  };
-
-
-
-}   // namespace expt
-
-} // namespace RAJA
-
-
-#endif // Guard
-
-#endif // CUDA
+                              RETURN_TYPE>::type
+      multiply_subtract(self_type const& b, self_type const& c) const
+  {
+    return self_type(m_value * b.m_value - c.m_value);
+  }
+
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vector's scalar elements
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  element_type sum() const
+  {
+    // Allreduce sum
+    using combiner_t =
+        RAJA::reduce::detail::op_adapter<element_type, RAJA::operators::plus>;
+
+    return RAJA::cuda::impl::warp_allreduce<combiner_t, element_type>(m_value);
+  }
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  element_type max() const
+  {
+    // Allreduce maximum
+    using combiner_t =
+        RAJA::reduce::detail::op_adapter<element_type,
+                                         RAJA::operators::maximum>;
+
+    return RAJA::cuda::impl::warp_allreduce<combiner_t, element_type>(m_value);
+  }
+
+  /*!
+   * @brief Returns the largest element from first N lanes
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  element_type max_n(int N) const
+  {
+    // Allreduce maximum
+    using combiner_t =
+        RAJA::reduce::detail::op_adapter<element_type,
+                                         RAJA::operators::maximum>;
+
+    auto ident = RAJA::operators::limits<element_type>::min();
+    auto lane  = get_lane();
+    auto value = lane < N ? m_value : ident;
+    return RAJA::cuda::impl::warp_allreduce<combiner_t, element_type>(value);
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type vmax(self_type a) const
+  {
+    return self_type {RAJA::max<element_type>(m_value, a.m_value)};
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  element_type min() const
+  {
+    // Allreduce minimum
+    using combiner_t =
+        RAJA::reduce::detail::op_adapter<element_type,
+                                         RAJA::operators::minimum>;
+
+    return RAJA::cuda::impl::warp_allreduce<combiner_t, element_type>(m_value);
+  }
+
+  /*!
+   * @brief Returns the smallest element from first N lanes
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  element_type min_n(int N) const
+  {
+    // Allreduce minimum
+    using combiner_t =
+        RAJA::reduce::detail::op_adapter<element_type,
+                                         RAJA::operators::minimum>;
+
+    auto ident = RAJA::operators::limits<element_type>::max();
+    auto lane  = get_lane();
+    auto value = lane < N ? m_value : ident;
+    return RAJA::cuda::impl::warp_allreduce<combiner_t, element_type>(value);
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type vmin(self_type a) const
+  {
+    return self_type {RAJA::min<element_type>(m_value, a.m_value)};
+  }
+
+  /*!
+   * Provides gather/scatter indices for segmented loads and stores
+   *
+   * The number of segment bits (segbits) is specified, as well as the
+   * stride between elements in a segment (stride_inner),
+   * and the stride between segments (stride_outer)
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  static int_vector_type s_segmented_offsets(camp::idx_t segbits,
+                                             camp::idx_t stride_inner,
+                                             camp::idx_t stride_outer)
+  {
+    int_vector_type result;
+
+    auto lane = get_lane();
+
+    // compute segment and segment_size
+    auto seg = lane >> segbits;
+    auto i   = lane & ((1 << segbits) - 1);
+
+    result.get_raw_value() = seg * stride_outer + i * stride_inner;
+
+    return result;
+  }
+
+  /*!
+   * Sum elements within each segment, with segment size defined by segbits.
+   * Stores each segment's sum consecutively, but shifted to the
+   * corresponding output_segment slot.
+   *
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>seg_bits
+   *
+   *
+   *
+   *
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=0 is equivalent to the input vector,  since there are 8
+   *      outputs, there is only 1 output segment
+   *
+   *      Result= x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=1 sums neighboring pairs of values.  There are 4 outputs,
+   *      so there are 2 possible output segments.
+   *
+   *      output_segment=0:
+   *      Result= x0+x1, x2+x3, x4+x5, x6+x7, 0, 0, 0, 0
+   *
+   *      output_segment=1:
+   *      Result= 0, 0, 0, 0, x0+x1, x2+x3, x4+x5, x6+x7
+   *
+   *  and so on up to segbits=3, which is a full sum of x0..x7, and the
+   *      output_segment denotes the vector position of the sum
+   *
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type segmented_sum_inner(camp::idx_t segbits,
+                                camp::idx_t output_segment) const
+  {
+    // Every shuffle below names all 32 lanes in its mask (0xffffffff).
+    // First: tree reduce values within each segment
+    element_type x = m_value;
+    RAJA_UNROLL
+    for (int delta = 1; delta < 1 << segbits; delta = delta << 1)
+    {
+      // delta doubles each pass (1, 2, 4, ...) while below 1 << segbits
+      // tree shuffle: fetch x from the lane `delta` positions above ours
+      element_type y = __shfl_sync(0xffffffff, x, get_lane() + delta);
+
+      // reduce: fold the fetched partial sum into our own
+      x += y;
+    }
+
+    // Second: lane k fetches the sum held by lane (k << segbits)
+    self_type result;
+    result.get_raw_value() = __shfl_sync(0xffffffff, x, get_lane() << segbits);
+
+    // Third: zero every lane outside the requested output_segment, since
+    //        after the shuffle all output segments hold the sums.
+    //        Shift by (5 - segbits): the warp-width is 32 == 1 << 5
+    int our_output_segment = get_lane() >> (5 - segbits);
+    bool in_output_segment = our_output_segment == output_segment;
+    if (!in_output_segment)
+    {
+      result.get_raw_value() = 0;
+    }
+
+    return result;
+  }
+
+  /*!
+   * Sum across segments, with segment size defined by segbits
+   *
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>segbits
+   *
+   *
+   *
+   *
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=0 is equivalent to the input vector,  since there are 8
+   *      outputs, there is only 1 output segment
+   *
+   *      Result= x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=1 sums strided pairs of values.  There are 4 outputs,
+   *      so there are 2 possible output segments.
+   *
+   *      output_segment=0:
+   *      Result= x0+x4, x1+x5, x2+x6, x3+x7, 0, 0, 0, 0
+   *
+   *      output_segment=1:
+   *      Result= 0, 0, 0, 0, x0+x4, x1+x5, x2+x6, x3+x7
+   *
+   *  and so on up to segbits=3, which is a full sum of x0..x7, and the
+   *      output_segment denotes the vector position of the sum
+   *
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type segmented_sum_outer(camp::idx_t segbits,
+                                camp::idx_t output_segment) const
+  {
+    // First: tree reduce across segments in (5 - segbits) passes; the 5 is
+    //        since the warp-width is 32 == 1 << 5
+    element_type x = m_value;
+    RAJA_UNROLL
+    for (int i = 0; i < 5 - segbits; ++i)
+    {
+      // the shuffle distance halves each pass: s_num_elem >> 1, >> 2, ...
+      // tree shuffle: fetch x from the lane `delta` positions above ours
+      int delta      = s_num_elem >> (i + 1);
+      element_type y = __shfl_sync(0xffffffff, x, get_lane() + delta);
+
+      // reduce: fold the fetched partial sum into our own
+      x += y;
+    }
+
+    // Second: each lane fetches from the lane given by its low segbits bits
+    self_type result;
+    int get_from           = get_lane() & ((1 << segbits) - 1);
+    result.get_raw_value() = __shfl_sync(0xffffffff, x, get_from);
+    // mask is nonzero only where (lane >> segbits) equals output_segment
+    int mask = (get_lane() >> segbits) == output_segment;
+
+
+    // Third: mask off everything but output_segment
+    if (!mask)
+    {
+      result.get_raw_value() = 0;
+    }
+
+    return result;
+  }
+
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type segmented_divide_nm(self_type den,
+                                camp::idx_t segbits,
+                                camp::idx_t num_inner,
+                                camp::idx_t num_outer) const
+  {
+    // Element-wise m_value / den restricted to the leading num_outer
+    // segments and the leading num_inner positions of each segment.
+    // Every other lane keeps the default-constructed result value.
+    auto const lane_id = get_lane();
+
+    // high bits of the lane id: segment index
+    auto const segment = lane_id >> segbits;
+    // low `segbits` bits of the lane id: position inside the segment
+    auto const within_seg = lane_id & ((1 << segbits) - 1);
+
+    bool const active = segment < num_outer && within_seg < num_inner;
+
+    self_type quotient;
+    if (active)
+    {
+      quotient.get_raw_value() = m_value / den.get_raw_value();
+    }
+    return quotient;
+  }
+
+  /*!
+   * Segmented broadcast copies a segment to all output segments of a vector
+   *
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>segbits
+   *
+   *
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=0 means the input segment size is 1, so this selects the
+   *      value at x[input_segment] and broadcasts it to the rest of the
+   *      vector
+   *
+   *      input segments allowed are from 0 to 7, inclusive
+   *
+   *      input_segment=0
+   *      Result= x0, x0, x0, x0, x0, x0, x0, x0
+   *
+   *      input_segment=5
+   *      Result= x5, x5, x5, x5, x5, x5, x5, x5
+   *
+   *  segbits=1 means that the input segments are each pair of x values:
+   *
+   *      input segments allowed are from 0 to 3, inclusive
+   *
+   *      input_segment=0:
+   *      Result= x0, x1, x0, x1, x0, x1, x0, x1
+   *
+   *      input_segment=1:
+   *      Result= x2, x3, x2, x3, x2, x3, x2, x3
+   *
+   *      input_segment=3:
+   *      Result= x6, x7, x6, x7, x6, x7, x6, x7
+   *
+   *  and so on up to segbits=2, the input segments are 4 wide:
+   *
+   *      input segments allowed are 0 or 1
+   *
+   *      input_segment=0:
+   *      Result= x0, x1, x2, x3, x0, x1, x2, x3
+   *
+   *      input_segment=1:
+   *      Result= x4, x5, x6, x7, x4, x5, x6, x7
+   *
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type segmented_broadcast_inner(camp::idx_t segbits,
+                                      camp::idx_t input_segment) const
+  {
+    // first lane of the requested input segment
+    camp::idx_t const seg_begin = input_segment << segbits;
+
+    // position of this lane inside its own segment (low `segbits` bits)
+    camp::idx_t const low_bits   = (1 << segbits) - 1;
+    camp::idx_t const within_seg = get_lane() & low_bits;
+
+    // every lane reads the matching position of the input segment
+    camp::idx_t const src_lane = within_seg + seg_begin;
+    self_type copied;
+    copied.get_raw_value() = __shfl_sync(0xffffffff, m_value, src_lane);
+    return copied;
+  }
+
+  /*!
+   * Segmented broadcast spreads a segment to all output segments of a vector
+   *
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>segbits
+   *
+   *
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=0 means the input segment size is 1, so this selects the
+   *      value at x[input_segment] and broadcasts it to the rest of the
+   *      vector
+   *
+   *      input segments allowed are from 0 to 7, inclusive
+   *
+   *      input_segment=0
+   *      Result= x0, x0, x0, x0, x0, x0, x0, x0
+   *
+   *      input_segment=5
+   *      Result= x5, x5, x5, x5, x5, x5, x5, x5
+   *
+   *  segbits=1 means that the input segments are each pair of x values:
+   *
+   *      input segments allowed are from 0 to 3, inclusive
+   *
+   *      input_segment=0:
+   *      Result= x0, x0, x0, x0, x1, x1, x1, x1
+   *
+   *      input_segment=1:
+   *      Result= x2, x2, x2, x2, x3, x3, x3, x3
+   *
+   *      input_segment=3:
+   *      Result= x6, x6, x6, x6, x7, x7, x7, x7
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type segmented_broadcast_outer(camp::idx_t segbits,
+                                      camp::idx_t input_segment) const
+  {
+    // source lanes for one input segment start at this lane index
+    camp::idx_t const seg_begin =
+        input_segment * (self_type::s_num_elem >> segbits);
+
+    // each run of (1 << segbits) consecutive lanes shares one source lane
+    camp::idx_t const src_lane = (get_lane() >> segbits) + seg_begin;
+    self_type spread;
+    spread.get_raw_value() = __shfl_sync(0xffffffff, m_value, src_lane);
+    return spread;
+  }
+};
+
+
+}  // namespace expt
+
+}  // namespace RAJA
+
+
+#endif  // Guard
+
+#endif  // CUDA
diff --git a/include/RAJA/policy/tensor/arch/cuda/traits.hpp b/include/RAJA/policy/tensor/arch/cuda/traits.hpp
index 032517677c..c076a19d58 100644
--- a/include/RAJA/policy/tensor/arch/cuda/traits.hpp
+++ b/include/RAJA/policy/tensor/arch/cuda/traits.hpp
@@ -21,26 +21,29 @@
 #ifndef RAJA_policy_tensor_arch_cuda_traits_HPP
 #define RAJA_policy_tensor_arch_cuda_traits_HPP
 
-namespace RAJA {
-namespace internal {
-namespace expt {
-
-  template<typename T>
-  struct RegisterTraits<RAJA::expt::cuda_warp_register, T>{
-      using element_type = T;
-      using register_policy = RAJA::expt::cuda_warp_register;
-      static constexpr camp::idx_t s_num_elem = 32;
-      static constexpr camp::idx_t s_num_bits = sizeof(T) * s_num_elem;
-      using int_element_type = int32_t;
-  };
-
-} // namespace internal
-} // namespace expt
-} // namespace RAJA
-
+namespace RAJA
+{
+namespace internal
+{
+namespace expt
+{
+// NOTE(review): s_num_bits is sizeof(T) (bytes) * s_num_elem - confirm unit.
+template<typename T>
+struct RegisterTraits<RAJA::expt::cuda_warp_register, T>
+{
+  using element_type                      = T;
+  using register_policy                   = RAJA::expt::cuda_warp_register;
+  static constexpr camp::idx_t s_num_elem = 32;
+  static constexpr camp::idx_t s_num_bits = sizeof(T) * s_num_elem;
+  using int_element_type                  = int32_t;
+};
+
+}  // namespace expt
+}  // namespace internal
+}  // namespace RAJA
 
 
 #endif
 
 
-#endif // RAJA_ENABLE_CUDA
+#endif  // RAJA_ENABLE_CUDA
diff --git a/include/RAJA/policy/tensor/arch/hip.hpp b/include/RAJA/policy/tensor/arch/hip.hpp
index 6e76772a29..3ddf27e39c 100644
--- a/include/RAJA/policy/tensor/arch/hip.hpp
+++ b/include/RAJA/policy/tensor/arch/hip.hpp
@@ -21,11 +21,11 @@
 #ifndef RAJA_policy_tensor_arch_hip_HPP
 #define RAJA_policy_tensor_arch_hip_HPP
 
-#include<RAJA/policy/tensor/arch/hip/traits.hpp>
-#include<RAJA/policy/tensor/arch/hip/hip_wave.hpp>
+#include <RAJA/policy/tensor/arch/hip/traits.hpp>
+#include <RAJA/policy/tensor/arch/hip/hip_wave.hpp>
 
 
 #endif
 
 
-#endif // RAJA_ENABLE_CUDA
+#endif  // RAJA_ENABLE_CUDA
diff --git a/include/RAJA/policy/tensor/arch/hip/hip_wave.hpp b/include/RAJA/policy/tensor/arch/hip/hip_wave.hpp
index 74bbc2f077..ad0185111d 100644
--- a/include/RAJA/policy/tensor/arch/hip/hip_wave.hpp
+++ b/include/RAJA/policy/tensor/arch/hip/hip_wave.hpp
@@ -29,984 +29,1047 @@
 #ifndef RAJA_policy_tensor_arch_hip_hip_wave_register_HPP
 #define RAJA_policy_tensor_arch_hip_hip_wave_register_HPP
 
-
-
 namespace RAJA
 {
 namespace expt
 {
 
 
-  template<typename ELEMENT_TYPE>
-  class Register<ELEMENT_TYPE, hip_wave_register> :
-    public internal::expt::RegisterBase<Register<ELEMENT_TYPE, hip_wave_register>>
+template<typename ELEMENT_TYPE>
+class Register<ELEMENT_TYPE, hip_wave_register>
+    : public internal::expt::RegisterBase<
+          Register<ELEMENT_TYPE, hip_wave_register>>
+{
+public:
+  using base_type =
+      internal::expt::RegisterBase<Register<ELEMENT_TYPE, hip_wave_register>>;
+
+  using register_policy = hip_wave_register;
+  using self_type       = Register<ELEMENT_TYPE, hip_wave_register>;
+  using element_type    = ELEMENT_TYPE;
+  using register_type   = ELEMENT_TYPE;
+
+  using int_vector_type = Register<int64_t, hip_wave_register>;
+
+
+private:
+  element_type m_value;
+
+public:
+  static constexpr int s_num_elem = 64;
+
+  /*!
+   * @brief Default constructor: initializes the stored element to 0
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  constexpr Register() : base_type(), m_value(0) {}
+
+  /*!
+   * @brief Converting constructor: stores the raw element value c
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  constexpr Register(element_type c) : base_type(), m_value(c) {}
+
+  /*!
+   * @brief Copy constructor: copies the stored element of c
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  constexpr Register(self_type const& c) : base_type(), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment: takes over the stored element of c
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type& operator=(self_type const& c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+  // Assignment from a raw element value: stores c and returns *this.
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type& operator=(element_type c)
+  {
+    m_value = c;
+    return *this;
+  }
+
+  /*!
+   * @brief Lane index of the calling thread, read directly from threadIdx.x
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  constexpr static int get_lane() { return threadIdx.x; }
+  // Read-only access to the stored element (m_value).
+  RAJA_DEVICE
+
+  RAJA_INLINE
+  constexpr element_type const& get_raw_value() const { return m_value; }
+  // Mutable access to the stored element (m_value).
+  RAJA_DEVICE
+
+  RAJA_INLINE
+  element_type& get_raw_value() { return m_value; }
+  // True only for the lane whose get_lane() is 0.
+  RAJA_DEVICE
+
+  RAJA_INLINE
+  static constexpr bool is_root() { return get_lane() == 0; }
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type& load_packed(element_type const* ptr)
+  {
+    // Each lane reads the element whose index equals its lane id,
+    // so consecutive lanes read consecutive elements of ptr.
+    auto const lane_id = get_lane();
+
+    m_value = ptr[lane_id];
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type& load_packed_n(element_type const* ptr, int N)
+  {
+    // Lanes [0, N) read ptr[lane]; the remaining lanes are zero-filled and
+    // never dereference ptr.
+    auto const lane_id = get_lane();
+
+    bool const in_range = lane_id < N;
+
+    // the conditional operator evaluates only the selected operand
+    m_value = in_range ? ptr[lane_id] : element_type(0);
+
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   *
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type& load_strided(element_type const* ptr, int stride)
+  {
+    // Lane k reads element k * stride, so consecutive lanes are `stride`
+    // elements apart in memory.
+    auto const elem_index = stride * get_lane();
+
+    m_value = ptr[elem_index];
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a strided memory location given
+   *        a run-time number of elements.
+   *
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type& load_strided_n(element_type const* ptr, int stride, int N)
+  {
+    auto const lane_id = get_lane();
+
+    // lanes at or beyond N are zero-filled and never touch ptr
+    if (lane_id >= N)
+    {
+      m_value = element_type(0);
+      return *this;
+    }
+    // lane k of the first N reads element k * stride
+    m_value = ptr[stride * lane_id];
+    return *this;
+  }
+
+  /*!
+   * @brief Generic gather operation for full vector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type& gather(element_type const* ptr, int_vector_type offsets)
+  {
+    // this lane's element offset, taken from the offsets register
+    auto const elem_offset = offsets.get_raw_value();
+    m_value                = ptr[elem_offset];
+    return *this;
+  }
+
+  /*!
+   * @brief Generic gather operation for n-length subvector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type& gather_n(element_type const* ptr,
+                      int_vector_type offsets,
+                      camp::idx_t N)
+  {
+    // Lanes [0, N) gather the element at their own offset; lanes at or
+    // beyond N are zero-filled.
+    bool const in_range = get_lane() < N;
+
+    // Only the selected operand of ?: is evaluated, so masked lanes never
+    // dereference ptr.
+    m_value = in_range ? ptr[offsets.get_raw_value()]
+                       : element_type(0);
+
+    return *this;
+  }
+
+  /*!
+   * @brief Generic segmented load operation used for loading sub-matrices
+   * from larger arrays.
+   *
+   * The default operation combines the s_segmented_offsets and gather
+   * operations.
+   *
+   *
+   */
+  RAJA_DEVICE
+
+  RAJA_INLINE
+  self_type& segmented_load(element_type const* ptr,
+                            camp::idx_t segbits,
+                            camp::idx_t stride_inner,
+                            camp::idx_t stride_outer)
+  {
+    auto const lane_id = get_lane();
+
+    // high bits of the lane id pick the segment, low `segbits` bits the
+    // position inside that segment
+    auto const segment    = lane_id >> segbits;
+    auto const within_seg = lane_id & ((1 << segbits) - 1);
+
+    m_value = ptr[segment * stride_outer + within_seg * stride_inner];
+    return *this;
+  }
+
+  /*!
+   * @brief Generic segmented load operation used for loading sub-matrices
+   * from larger arrays where we load partial segments.
+   *
+   *
+   *
+   */
+  RAJA_DEVICE
+
+  RAJA_INLINE
+  self_type& segmented_load_nm(element_type const* ptr,
+                               camp::idx_t segbits,
+                               camp::idx_t stride_inner,
+                               camp::idx_t stride_outer,
+                               camp::idx_t num_inner,
+                               camp::idx_t num_outer)
+  {
+    auto const lane_id = get_lane();
+
+    // segment index (high bits) and position inside the segment (low bits)
+    auto const segment    = lane_id >> segbits;
+    auto const within_seg = lane_id & ((1 << segbits) - 1);
+
+    // lanes outside the num_outer x num_inner window are zero-filled
+    if (segment < num_outer && within_seg < num_inner)
+    {
+      m_value = ptr[segment * stride_outer + within_seg * stride_inner];
+    }
+    else
+    {
+      m_value = element_type(0);
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type const& store_packed(element_type* ptr) const
+  {
+    // Each lane writes its element to the slot matching its lane id.
+    auto const lane_id = get_lane();
+
+    ptr[lane_id] = m_value;
+
+    return *this;
+  }
+
+  /*!
+   * @brief Store partial register (lanes < N) to consecutive memory locations
+   *
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type const& store_packed_n(element_type* ptr, int N) const
+  {
+    auto const lane_id = get_lane();
+    // lanes at or beyond N write nothing
+    if (lane_id >= N)
+    {
+      return *this;
+    }
+    ptr[lane_id] = m_value;
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to strided memory locations
+   *
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type const& store_strided(element_type* ptr, int stride) const
+  {
+    // Lane k writes to element k * stride, so consecutive lanes are
+    // `stride` elements apart in memory.
+    auto const elem_index = get_lane() * stride;
+
+    ptr[elem_index] = m_value;
+    return *this;
+  }
+
+  /*!
+   * @brief Store partial register (lanes < N) to strided memory locations
+   *
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type const& store_strided_n(element_type* ptr, int stride, int N) const
+  {
+    auto const lane_id = get_lane();
+    // only lanes below N store; lane k targets element k * stride
+    bool const writes = lane_id < N;
+    if (writes)
+    {
+      ptr[lane_id * stride] = m_value;
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Generic scatter operation for full vector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be stored relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  template<typename T2>
+  RAJA_DEVICE RAJA_INLINE self_type const& scatter(element_type* ptr,
+                                                   T2 const& offsets) const
+  {
+    // this lane's element offset, read from the offsets register
+    auto const& elem_offset = offsets.get_raw_value();
+
+    ptr[elem_offset] = m_value;
+    return *this;
+  }
+
+  /*!
+   * @brief Generic scatter operation for n-length subvector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be stored relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  template<typename T2>
+  RAJA_DEVICE RAJA_INLINE self_type const& scatter_n(element_type* ptr,
+                                                     T2 const& offsets,
+                                                     camp::idx_t N) const
+  {
+    // lanes at or beyond N store nothing
+    if (get_lane() >= N) return *this;
+
+    // the remaining lanes write to their own element offset
+    ptr[offsets.get_raw_value()] = m_value;
+    return *this;
+  }
+
+  /*!
+   * @brief Generic segmented store operation used for storing sub-matrices
+   * to larger arrays.
+   *
+   */
+  RAJA_DEVICE
+
+  RAJA_INLINE
+  self_type const& segmented_store(element_type* ptr,
+                                   camp::idx_t segbits,
+                                   camp::idx_t stride_inner,
+                                   camp::idx_t stride_outer) const
+  {
+    auto const lane_id = get_lane();
+
+    // high bits of the lane id pick the segment, low `segbits` bits the
+    // position inside that segment
+    auto const segment    = lane_id >> segbits;
+    auto const within_seg = lane_id & ((1 << segbits) - 1);
+
+    ptr[segment * stride_outer + within_seg * stride_inner] = m_value;
+    return *this;
+  }
+
+  /*!
+   * @brief Generic segmented store operation used for storing sub-matrices
+   * to larger arrays where we store partial segments.
+   *
+   */
+  RAJA_DEVICE
+
+  RAJA_INLINE
+  self_type const& segmented_store_nm(element_type* ptr,
+                                      camp::idx_t segbits,
+                                      camp::idx_t stride_inner,
+                                      camp::idx_t stride_outer,
+                                      camp::idx_t num_inner,
+                                      camp::idx_t num_outer) const
+  {
+    auto const lane_id = get_lane();
+
+    // segment index (high bits) and position inside the segment (low bits)
+    auto const segment    = lane_id >> segbits;
+    auto const within_seg = lane_id & ((1 << segbits) - 1);
+
+    // Store only inside the num_outer x num_inner window; lanes outside
+    // it leave memory untouched.
+    bool const inside_window =
+        segment < num_outer && within_seg < num_inner;
+    if (inside_window)
+    {
+      ptr[segment * stride_outer + within_seg * stride_inner] = m_value;
+    }
+
+    return *this;
+  }
+
+  /*!
+   * @brief Get the scalar held at position i of the vector register
+   * @param i Offset (lane) of the scalar to get; forwarded to shfl_sync
+   * @return Result of hip::impl::shfl_sync(m_value, i)
+   */
+  constexpr RAJA_INLINE RAJA_DEVICE element_type get(int i) const
+  {
+    return hip::impl::shfl_sync(m_value, i);
+  }
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param value Value of scalar to set
+   * @param i Offset of scalar to set
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type& set(element_type value, int i)
+  {
+    // Only the lane whose id equals i takes the new value; every other
+    // lane keeps what it already holds.
+    auto const lane_id = get_lane();
+
+    if (lane_id == i) m_value = value;
+    return *this;
+  }
+  // Sets the stored element to the scalar a and returns *this.
+  RAJA_DEVICE
+
+  RAJA_INLINE
+  self_type& broadcast(element_type const& a)
+  {
+    m_value = a;
+    return *this;
+  }
+
+  /*!
+   * @brief Extracts a scalar value and broadcasts to a new register
+   */
+  RAJA_DEVICE
+
+  RAJA_INLINE
+  self_type get_and_broadcast(int i) const
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<ELEMENT_TYPE, hip_wave_register>>;
-
-      using register_policy = hip_wave_register;
-      using self_type = Register<ELEMENT_TYPE, hip_wave_register>;
-      using element_type = ELEMENT_TYPE;
-      using register_type = ELEMENT_TYPE;
-
-      using int_vector_type = Register<int64_t, hip_wave_register>;
-
+    self_type x;
+    x.m_value = hip::impl::shfl_sync(m_value, i, 32);
+    return x;
+  }
 
-		private:
-      element_type m_value;
+  RAJA_DEVICE
 
-		public:
+  RAJA_INLINE
+  self_type& copy(self_type const& src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+  // Element-wise sum m_value + b.m_value, returned as a new register.
+  RAJA_DEVICE
+
+  RAJA_INLINE
+  self_type add(self_type const& b) const
+  {
+    return self_type(m_value + b.m_value);
+  }
+
+  RAJA_DEVICE
 
-      static constexpr int s_num_elem = 64;
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      constexpr
-      Register() : base_type(), m_value(0) {
-
-      }
-
-
-      /*!
-       * @brief Copy constructor from raw value
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      constexpr
-      Register(element_type c) : base_type(), m_value(c) {}
-
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      constexpr
-      Register(self_type const &c) : base_type(), m_value(c.m_value) {}
-
-
-      /*!
-       * @brief Copy assignment operator
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &operator=(element_type c){
-        m_value = c;
-        return *this;
-      }
-
-      /*!
-       * @brief Gets our warp lane
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      constexpr
-      static
-      int get_lane() {
-        return threadIdx.x;
-      }
-
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      constexpr
-      element_type const &get_raw_value() const {
-        return m_value;
-      }
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      element_type &get_raw_value() {
-        return m_value;
-      }
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      static
-      constexpr
-      bool is_root() {
-        return get_lane() == 0;
-      }
-
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &load_packed(element_type const *ptr){
-
-        auto lane = get_lane();
-
-        m_value = ptr[lane];
-
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &load_packed_n(element_type const *ptr, int N){
-        auto lane = get_lane();
-        if(lane < N){
-          m_value = ptr[lane];
-        }
-        else{
-          m_value = element_type(0);
-        }
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &load_strided(element_type const *ptr, int stride){
-
-        auto lane = get_lane();
-
-        m_value = ptr[stride*lane];
-
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &load_strided_n(element_type const *ptr, int stride, int N){
-        auto lane = get_lane();
-
-        if(lane < N){
-          m_value = ptr[stride*lane];
-        }
-        else{
-          m_value = element_type(0);
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Generic gather operation for full vector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &gather(element_type const *ptr, int_vector_type offsets){
-
-        m_value = ptr[offsets.get_raw_value()];
-
-        return *this;
-      }
-
-      /*!
-       * @brief Generic gather operation for n-length subvector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &gather_n(element_type const *ptr, int_vector_type offsets, camp::idx_t N){
-        if(get_lane() < N){
-          m_value = ptr[offsets.get_raw_value()];
-        }
-        else{
-          m_value = element_type(0);
-        }
-
-        return *this;
-      }
-
-
-      /*!
-       * @brief Generic segmented load operation used for loading sub-matrices
-       * from larger arrays.
-       *
-       * The default operation combines the s_segmented_offsets and gather
-       * operations.
-       *
-       *
-       */
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type &segmented_load(element_type const *ptr, camp::idx_t segbits, camp::idx_t stride_inner, camp::idx_t stride_outer){
-        auto lane = get_lane();
-
-        // compute segment and segment_size
-        auto seg = lane >> segbits;
-        auto i = lane & ((1<<segbits)-1);
-
-        m_value = ptr[seg*stride_outer + i*stride_inner];
-
-        return *this;
-      }
-
-      /*!
-       * @brief Generic segmented load operation used for loading sub-matrices
-       * from larger arrays where we load partial segments.
-       *
-       *
-       *
-       */
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type &segmented_load_nm(element_type const *ptr, camp::idx_t segbits,
-          camp::idx_t stride_inner, camp::idx_t stride_outer,
-          camp::idx_t num_inner, camp::idx_t num_outer)
-      {
-        auto lane = get_lane();
-
-        // compute segment and segment_size
-        auto seg = lane >> segbits;
-        auto i = lane & ((1<<segbits)-1);
-
-        if(seg >= num_outer || i >= num_inner){
-          m_value = element_type(0);
-        }
-        else{
-          m_value = ptr[seg*stride_outer + i*stride_inner];
-        }
-
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type const &store_packed(element_type *ptr) const{
-
-        auto lane = get_lane();
-
-        ptr[lane] = m_value;
-
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type const &store_packed_n(element_type *ptr, int N) const{
-
-        auto lane = get_lane();
-
-        if(lane < N){
-          ptr[lane] = m_value;
-        }
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type const &store_strided(element_type *ptr, int stride) const{
-
-        auto lane = get_lane();
-
-        ptr[lane*stride] = m_value;
-
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type const &store_strided_n(element_type *ptr, int stride, int N) const{
-
-        auto lane = get_lane();
-
-        if(lane < N){
-          ptr[lane*stride] = m_value;
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Generic scatter operation for full vector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be stored relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      template<typename T2>
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type const &scatter(element_type *ptr, T2 const &offsets) const {
-
-        ptr[offsets.get_raw_value()] = m_value;
-
-
-        return *this;
-      }
-
-      /*!
-       * @brief Generic scatter operation for n-length subvector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be stored relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      template<typename T2>
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type const &scatter_n(element_type *ptr, T2 const &offsets, camp::idx_t N) const {
-        if(get_lane() < N){
-          ptr[offsets.get_raw_value()] = m_value;
-        }
-
-        return *this;
-      }
-
-      /*!
-       * @brief Generic segmented store operation used for storing sub-matrices
-       * to larger arrays.
-       *
-       */
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type const &segmented_store(element_type *ptr, camp::idx_t segbits, camp::idx_t stride_inner, camp::idx_t stride_outer) const {
-        auto lane = get_lane();
-
-        // compute segment and segment_size
-        auto seg = lane >> segbits;
-        auto i = lane & ((1<<segbits)-1);
-
-        ptr[seg*stride_outer + i*stride_inner] = m_value;
-
-        return *this;
-      }
-
-      /*!
-       * @brief Generic segmented store operation used for storing sub-matrices
-       * to larger arrays where we store partial segments.
-       *
-       */
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type const &segmented_store_nm(element_type *ptr, camp::idx_t segbits,
-          camp::idx_t stride_inner, camp::idx_t stride_outer,
-          camp::idx_t num_inner, camp::idx_t num_outer) const
-      {
-        auto lane = get_lane();
-
-        // compute segment and segment_size
-        auto seg = lane >> segbits;
-        auto i = lane & ((1<<segbits)-1);
-
-        if(seg >= num_outer || i >= num_inner){
-          // nop
-        }
-        else{
-          ptr[seg*stride_outer + i*stride_inner] = m_value;
-        }
-
-        return *this;
-      }
-
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      constexpr
-      RAJA_INLINE
-      RAJA_DEVICE
-      element_type get(int i) const
-			{
-        return hip::impl::shfl_sync(m_value, i);
-			}
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type &set(element_type value, int i)
-			{
-				auto lane = get_lane();
-      	if(lane == i){
-					m_value = value;
-				}
-        return *this;
-			}
-
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &a){
-        m_value = a;
-        return *this;
-      }
-
-      /*!
-       * @brief Extracts a scalar value and broadcasts to a new register
-       */
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type get_and_broadcast(int i) const {
-        self_type x;
-        x.m_value = hip::impl::shfl_sync(m_value, i, 32);
-        return x;
-      }
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(m_value + b.m_value);
-      }
-
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(m_value - b.m_value);
-      }
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        return self_type(m_value * b.m_value);
-      }
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        return self_type(m_value / b.m_value);
-      }
-
-
-      RAJA_DEVICE
-      RAJA_INLINE
-      self_type divide_n(self_type const &b, int N) const {
-        return get_lane() < N ? self_type(m_value / b.m_value) : self_type(element_type(0));
-      }
-
-      /**
-       * floats and doubles use the CUDA instrinsic FMA
-       */
-      template<typename RETURN_TYPE = self_type>
-      RAJA_DEVICE
-      RAJA_INLINE
+  RAJA_INLINE
+  self_type subtract(self_type const& b) const
+  {
+    return self_type(m_value - b.m_value);
+  }
+
+  RAJA_DEVICE
+
+  RAJA_INLINE
+  self_type multiply(self_type const& b) const
+  {
+    return self_type(m_value * b.m_value);
+  }
+
+  RAJA_DEVICE
+
+  RAJA_INLINE
+  self_type divide(self_type const& b) const
+  {
+    return self_type(m_value / b.m_value);
+  }
+
+  RAJA_DEVICE
+
+  RAJA_INLINE
+  self_type divide_n(self_type const& b, int N) const
+  {
+    return get_lane() < N ? self_type(m_value / b.m_value)
+                          : self_type(element_type(0));
+  }
+
+  /**
+   * floats and doubles use the CUDA intrinsic FMA
+   */
+  template<typename RETURN_TYPE = self_type>
+  RAJA_DEVICE RAJA_INLINE
       typename std::enable_if<!std::numeric_limits<element_type>::is_integer,
-      RETURN_TYPE>::type
-      multiply_add(self_type const &b, self_type const &c) const
-      {
-        return self_type(fma(m_value, b.m_value, c.m_value));
-      }
-
-      /**
-       * int32 and int64 don't have a CUDA intrinsic FMA, do unfused ops
-       */
-      template<typename RETURN_TYPE = self_type>
-      RAJA_DEVICE
-      RAJA_INLINE
+                              RETURN_TYPE>::type
+      multiply_add(self_type const& b, self_type const& c) const
+  {
+    return self_type(fma(m_value, b.m_value, c.m_value));
+  }
+
+  /**
+   * int32 and int64 don't have a CUDA intrinsic FMA, do unfused ops
+   */
+  template<typename RETURN_TYPE = self_type>
+  RAJA_DEVICE RAJA_INLINE
       typename std::enable_if<std::numeric_limits<element_type>::is_integer,
-      RETURN_TYPE>::type
-      multiply_add(self_type const &b, self_type const &c) const
-      {
-        return self_type(m_value * b.m_value + c.m_value);
-      }
-
-      /**
-       * floats and doubles use the CUDA instrinsic FMS
-       */
-      template<typename RETURN_TYPE = self_type>
-      RAJA_DEVICE
-      RAJA_INLINE
+                              RETURN_TYPE>::type
+      multiply_add(self_type const& b, self_type const& c) const
+  {
+    return self_type(m_value * b.m_value + c.m_value);
+  }
+
+  /**
+   * floats and doubles use the CUDA intrinsic FMS
+   */
+  template<typename RETURN_TYPE = self_type>
+  RAJA_DEVICE RAJA_INLINE
       typename std::enable_if<!std::numeric_limits<element_type>::is_integer,
-      RETURN_TYPE>::type
-      multiply_subtract(self_type const &b, self_type const &c) const
-      {
-        return self_type(fma(m_value, b.m_value, -c.m_value));
-      }
-
-      /**
-       * int32 and int64 don't have a CUDA intrinsic FMS, do unfused ops
-       */
-      template<typename RETURN_TYPE = self_type>
-      RAJA_DEVICE
-      RAJA_INLINE
+                              RETURN_TYPE>::type
+      multiply_subtract(self_type const& b, self_type const& c) const
+  {
+    return self_type(fma(m_value, b.m_value, -c.m_value));
+  }
+
+  /**
+   * int32 and int64 don't have a CUDA intrinsic FMS, do unfused ops
+   */
+  template<typename RETURN_TYPE = self_type>
+  RAJA_DEVICE RAJA_INLINE
       typename std::enable_if<std::numeric_limits<element_type>::is_integer,
-      RETURN_TYPE>::type
-      multiply_subtract(self_type const &b, self_type const &c) const
-      {
-        return self_type(m_value * b.m_value - c.m_value);
-      }
-
-
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      element_type sum() const
-      {
-				// Allreduce sum
-				using combiner_t = RAJA::reduce::detail::op_adapter<element_type, RAJA::operators::plus>;
-
-				return RAJA::hip::impl::warp_allreduce<combiner_t, element_type>(m_value);
-      }
-
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      element_type max() const
-      {
-        // Allreduce maximum
-        using combiner_t = RAJA::reduce::detail::op_adapter<element_type, RAJA::operators::maximum>;
-
-        return RAJA::hip::impl::warp_allreduce<combiner_t, element_type>(m_value);
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      element_type max_n(int N) const
-      {
-        // Allreduce maximum
-        using combiner_t = RAJA::reduce::detail::op_adapter<element_type, RAJA::operators::maximum>;
-
-        auto ident = RAJA::operators::limits<element_type>::min();
-        auto lane = get_lane();
-        auto value = lane < N ? m_value : ident;
-        return RAJA::hip::impl::warp_allreduce<combiner_t, element_type>(value);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type vmax(self_type a) const
-      {
-        return self_type{RAJA::max<element_type>(m_value, a.m_value)};
-      }
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      element_type min() const
-      {
-        // Allreduce minimum
-        using combiner_t = RAJA::reduce::detail::op_adapter<element_type, RAJA::operators::minimum>;
-
-        return RAJA::hip::impl::warp_allreduce<combiner_t, element_type>(m_value);
-
-      }
-
-      /*!
-       * @brief Returns the largest element from first N lanes
-       * @return The largest scalar element in the register
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      element_type min_n(int N) const
-      {
-        // Allreduce minimum
-        using combiner_t = RAJA::reduce::detail::op_adapter<element_type, RAJA::operators::minimum>;
-
-        auto ident = RAJA::operators::limits<element_type>::max();
-        auto lane = get_lane();
-        auto value = lane < N ? m_value : ident;
-        return RAJA::hip::impl::warp_allreduce<combiner_t, element_type>(value);
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type vmin(self_type a) const
-      {
-        return self_type{RAJA::min<element_type>(m_value, a.m_value)};
-      }
-
-
-
-
-      /*!
-       * Provides gather/scatter indices for segmented loads and stores
-       *
-       * THe number of segment bits (segbits) is specified, as well as the
-       * stride between elements in a segment (stride_inner),
-       * and the stride between segments (stride_outer)
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      static
-      int_vector_type s_segmented_offsets(camp::idx_t segbits, camp::idx_t stride_inner, camp::idx_t stride_outer)
-      {
-        int_vector_type result;
-
-        auto lane = get_lane();
-
-        // compute segment and segment_size
-        auto seg = lane >> segbits;
-        auto i = lane & ((1<<segbits)-1);
-
-        result.get_raw_value() = seg*stride_outer + i*stride_inner;
-
-        return result;
-      }
-
-
-      /*!
-       * Sum elements within each segment, with segment size defined by segbits.
-       * Stores each segments sum consecutively, but shifed to the
-       * corresponding output_segment slot.
-       *
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=0 is equivalent to the input vector,  since there are 8
-       *      outputs, there is only 1 output segment
-       *
-       *      Result= x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=1 sums neighboring pairs of values.  There are 4 output,
-       *      so there are possible output segments.
-       *
-       *      output_segment=0:
-       *      Result= x0+x1, x2+x3, x4+x5, x6+x7, 0, 0, 0, 0
-       *
-       *      output_segment=1:
-       *      Result= 0, 0, 0, 0, x0+x1, x2+x3, x4+x5, x6+x7
-       *
-       *  and so on up to segbits=3, which is a full sum of x0..x7, and the
-       *      output_segment denotes the vector position of the sum
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type segmented_sum_inner(camp::idx_t segbits, camp::idx_t output_segment) const
-      {
-
-        // First: tree reduce values within each segment
-        element_type x = m_value;
-        RAJA_UNROLL
-        for(int delta = 1;delta < 1<<segbits;delta = delta<<1){
-
-          // tree shuffle
-          element_type y = hip::impl::shfl_sync(x, get_lane()+delta);
-
-          // reduce
-          x += y;
-        }
-
-        // Second: send result to output segment lanes
-        self_type result;
-        result.get_raw_value() = hip::impl::shfl_sync(x, get_lane()<<segbits);
-
-        // Third: mask off everything but output_segment
-        //        this is because all output segments are valid at this point
-        // (5-segbits), the 5 is since the warp-width is 32 == 1<<5
-        int our_output_segment = get_lane()>>(6-segbits);
-        bool in_output_segment = our_output_segment == output_segment;
-        if(!in_output_segment){
-          result.get_raw_value() = 0;
-        }
-
-        return result;
-      }
-
-      /*!
-       * Sum across segments, with segment size defined by segbits
-       *
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=0 is equivalent to the input vector,  since there are 8
-       *      outputs, there is only 1 output segment
-       *
-       *      Result= x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=1 sums strided pairs of values.  There are 4 output,
-       *      so there are possible output segments.
-       *
-       *      output_segment=0:
-       *      Result= x0+x4, x1+x5, x2+x6, x3+x7, 0, 0, 0, 0
-       *
-       *      output_segment=1:
-       *      Result= 0, 0, 0, 0, x0+x4, x1+x5, x2+x6, x3+x7
-       *
-       *  and so on up to segbits=3, which is a full sum of x0..x7, and the
-       *      output_segment denotes the vector position of the sum
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type segmented_sum_outer(camp::idx_t segbits, camp::idx_t output_segment) const
-      {
-
-        // First: tree reduce values within each segment
-        element_type x = m_value;
-        RAJA_UNROLL
-        for(int i = 0;i < 6-segbits; ++ i){
-
-          // tree shuffle
-          int delta = s_num_elem >> (i+1);
-          element_type y = hip::impl::shfl_sync(x, get_lane()+delta);
-
-          // reduce
-          x += y;
-        }
-
-        // Second: send result to output segment lanes
-        self_type result;
-        int get_from = get_lane()&( (1<<segbits)-1);
-        result.get_raw_value() = hip::impl::shfl_sync(x, get_from);
-
-        int mask = (get_lane()>>segbits) == output_segment;
-
-
-        // Third: mask off everything but output_segment
-        if(!mask){
-          result.get_raw_value() = 0;
-        }
-
-        return result;
-      }
-
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type segmented_divide_nm(self_type den, camp::idx_t segbits, camp::idx_t num_inner, camp::idx_t num_outer) const
-      {
-        self_type result;
-
-        auto lane = get_lane();
-
-        // compute segment and segment_size
-        auto seg = lane >> segbits;
-        auto i = lane & ((1<<segbits)-1);
-
-        if(seg >= num_outer || i >= num_inner){
-          // nop
-        }
-        else{
-          result.get_raw_value() = m_value / den.get_raw_value();
-        }
-
-        return result;
-      }
-
-
-      /*!
-       * Segmented broadcast copies a segment to all output segments of a vector
-       *
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=0 means the input segment size is 1, so this selects the
-       *      value at x[input_segmnet] and broadcasts it to the rest of the
-       *      vector
-       *
-       *      input segments allowed are from 0 to 7, inclusive
-       *
-       *      input_segment=0
-       *      Result= x0, x0, x0, x0, x0, x0, x0, x0
-       *
-       *      input_segment=5
-       *      Result= x5, x5, x5, x5, x5, x5, x5, x5
-       *
-       *  segbits=1 means that the input segments are each pair of x values:
-       *
-       *      input segments allowed are from 0 to 3, inclusive
-       *
-       *      output_segment=0:
-       *      Result= x0, x1, x0, x1, x0, x1, x0, x1
-       *
-       *      output_segment=1:
-       *      Result= x2, x3, x2, x3, x2, x3, x2, x3
-       *
-       *      output_segment=3:
-       *      Result= x6, x7, x6, x7, x6, x7, x6, x7
-       *
-       *  and so on up to segbits=2, the input segments are 4 wide:
-       *
-       *      input segments allowed are from 0 or 1
-       *
-       *      output_segment=0:
-       *      Result= x0, x1, x2, x3, x0, x1, x2, x3
-       *
-       *      output_segment=1:
-       *      Result= x4, x5, x6, x7, x4, x5, x6, x7
-       *
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type segmented_broadcast_inner(camp::idx_t segbits, camp::idx_t input_segment) const
-      {
-        self_type result;
-
-        camp::idx_t mask = (1<<segbits)-1;
-        camp::idx_t offset = input_segment << segbits;
-
-
-        camp::idx_t i = (get_lane()&mask) + offset;
-
-        result.get_raw_value() = hip::impl::shfl_sync(m_value, i);
-
-
-        return result;
-      }
-
-
-      /*!
-       * Segmented broadcast spreads a segment to all output segments of a vector
-       *
-       * Note: segment size is 1<<segbits elements
-       *       number of segments is s_num_elem>>seg_bits
-       *
-       *
-       *  Example:
-       *
-       *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
-       *
-       *  segbits=0 means the input segment size is 1, so this selects the
-       *      value at x[input_segmnet] and broadcasts it to the rest of the
-       *      vector
-       *
-       *      input segments allowed are from 0 to 7, inclusive
-       *
-       *      input_segment=0
-       *      Result= x0, x0, x0, x0, x0, x0, x0, x0
-       *
-       *      input_segment=5
-       *      Result= x5, x5, x5, x5, x5, x5, x5, x5
-       *
-       *  segbits=1 means that the input segments are each pair of x values:
-       *
-       *      input segments allowed are from 0 to 3, inclusive
-       *
-       *      output_segment=0:
-       *      Result= x0, x0, x0, x0, x1, x1, x1, x1
-       *
-       *      output_segment=1:
-       *      Result= x2, x2, x2, x2, x3, x3, x3, x3
-       *
-       *      output_segment=3:
-       *      Result= x6, x6, x6, x6, x7, x7, x7, x7
-       */
-      RAJA_INLINE
-      RAJA_DEVICE
-      self_type segmented_broadcast_outer(camp::idx_t segbits, camp::idx_t input_segment) const
-      {
-        self_type result;
-
-        camp::idx_t offset = input_segment * (self_type::s_num_elem >> segbits);
-
-        camp::idx_t i = (get_lane() >> segbits) + offset;
-
-        result.get_raw_value() = hip::impl::shfl_sync(m_value, i);
-
-        return result;
-      }
-
-
-
-
-  };
-
-
-
-}   // namespace expt
-
-} // namespace RAJA
-
-
-#endif // Guard
-
-#endif // HIP
+                              RETURN_TYPE>::type
+      multiply_subtract(self_type const& b, self_type const& c) const
+  {
+    return self_type(m_value * b.m_value - c.m_value);
+  }
+
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vector's scalar elements
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  element_type sum() const
+  {
+    // Allreduce sum
+    using combiner_t =
+        RAJA::reduce::detail::op_adapter<element_type, RAJA::operators::plus>;
+
+    return RAJA::hip::impl::warp_allreduce<combiner_t, element_type>(m_value);
+  }
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  element_type max() const
+  {
+    // Allreduce maximum
+    using combiner_t =
+        RAJA::reduce::detail::op_adapter<element_type,
+                                         RAJA::operators::maximum>;
+
+    return RAJA::hip::impl::warp_allreduce<combiner_t, element_type>(m_value);
+  }
+
+  /*!
+   * @brief Returns the largest element from first N lanes
+   * @return The largest scalar element among the first N lanes
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  element_type max_n(int N) const
+  {
+    // Allreduce maximum
+    using combiner_t =
+        RAJA::reduce::detail::op_adapter<element_type,
+                                         RAJA::operators::maximum>;
+
+    auto ident = RAJA::operators::limits<element_type>::min();
+    auto lane  = get_lane();
+    auto value = lane < N ? m_value : ident;
+    return RAJA::hip::impl::warp_allreduce<combiner_t, element_type>(value);
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type vmax(self_type a) const
+  {
+    return self_type {RAJA::max<element_type>(m_value, a.m_value)};
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  element_type min() const
+  {
+    // Allreduce minimum
+    using combiner_t =
+        RAJA::reduce::detail::op_adapter<element_type,
+                                         RAJA::operators::minimum>;
+
+    return RAJA::hip::impl::warp_allreduce<combiner_t, element_type>(m_value);
+  }
+
+  /*!
+   * @brief Returns the smallest element from first N lanes
+   * @return The smallest scalar element among the first N lanes
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  element_type min_n(int N) const
+  {
+    // Allreduce minimum
+    using combiner_t =
+        RAJA::reduce::detail::op_adapter<element_type,
+                                         RAJA::operators::minimum>;
+
+    auto ident = RAJA::operators::limits<element_type>::max();
+    auto lane  = get_lane();
+    auto value = lane < N ? m_value : ident;
+    return RAJA::hip::impl::warp_allreduce<combiner_t, element_type>(value);
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type vmin(self_type a) const
+  {
+    return self_type {RAJA::min<element_type>(m_value, a.m_value)};
+  }
+
+  /*!
+   * Provides gather/scatter indices for segmented loads and stores
+   *
+   * The number of segment bits (segbits) is specified, as well as the
+   * stride between elements in a segment (stride_inner),
+   * and the stride between segments (stride_outer)
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  static int_vector_type s_segmented_offsets(camp::idx_t segbits,
+                                             camp::idx_t stride_inner,
+                                             camp::idx_t stride_outer)
+  {
+    int_vector_type result;
+
+    auto lane = get_lane();
+
+    // compute segment and segment_size
+    auto seg = lane >> segbits;
+    auto i   = lane & ((1 << segbits) - 1);
+
+    result.get_raw_value() = seg * stride_outer + i * stride_inner;
+
+    return result;
+  }
+
+  /*!
+   * Sum elements within each segment, with segment size defined by segbits.
+   * Stores each segment's sum consecutively, but shifted to the
+   * corresponding output_segment slot.
+   *
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>seg_bits
+   *
+   *
+   *
+   *
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=0 is equivalent to the input vector,  since there are 8
+   *      outputs, there is only 1 output segment
+   *
+   *      Result= x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=1 sums neighboring pairs of values.  There are 4 outputs,
+   *      so there are 2 possible output segments.
+   *
+   *      output_segment=0:
+   *      Result= x0+x1, x2+x3, x4+x5, x6+x7, 0, 0, 0, 0
+   *
+   *      output_segment=1:
+   *      Result= 0, 0, 0, 0, x0+x1, x2+x3, x4+x5, x6+x7
+   *
+   *  and so on up to segbits=3, which is a full sum of x0..x7, and the
+   *      output_segment denotes the vector position of the sum
+   *
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type segmented_sum_inner(camp::idx_t segbits,
+                                camp::idx_t output_segment) const
+  {
+
+    // First: tree reduce values within each segment
+    element_type x = m_value;
+    RAJA_UNROLL
+    for (int delta = 1; delta < 1 << segbits; delta = delta << 1)
+    {
+
+      // tree shuffle
+      element_type y = hip::impl::shfl_sync(x, get_lane() + delta);
+
+      // reduce
+      x += y;
+    }
+
+    // Second: send result to output segment lanes
+    self_type result;
+    result.get_raw_value() = hip::impl::shfl_sync(x, get_lane() << segbits);
+
+    // Third: mask off everything but output_segment
+    //        this is because all output segments are valid at this point
+    // (6-segbits), the 6 is since the wavefront width is 64 == 1<<6
+    int our_output_segment = get_lane() >> (6 - segbits);
+    bool in_output_segment = our_output_segment == output_segment;
+    if (!in_output_segment)
+    {
+      result.get_raw_value() = 0;
+    }
+
+    return result;
+  }
+
+  /*!
+   * Sum across segments, with segment size defined by segbits
+   *
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>seg_bits
+   *
+   *
+   *
+   *
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=0 is equivalent to the input vector,  since there are 8
+   *      outputs, there is only 1 output segment
+   *
+   *      Result= x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=1 sums strided pairs of values.  There are 4 outputs,
+   *      so there are 2 possible output segments.
+   *
+   *      output_segment=0:
+   *      Result= x0+x4, x1+x5, x2+x6, x3+x7, 0, 0, 0, 0
+   *
+   *      output_segment=1:
+   *      Result= 0, 0, 0, 0, x0+x4, x1+x5, x2+x6, x3+x7
+   *
+   *  and so on up to segbits=3, which is a full sum of x0..x7, and the
+   *      output_segment denotes the vector position of the sum
+   *
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type segmented_sum_outer(camp::idx_t segbits,
+                                camp::idx_t output_segment) const
+  {
+
+    // First: tree reduce values across segments
+    element_type x = m_value;
+    RAJA_UNROLL
+    for (int i = 0; i < 6 - segbits; ++i)
+    {
+
+      // tree shuffle
+      int delta      = s_num_elem >> (i + 1);
+      element_type y = hip::impl::shfl_sync(x, get_lane() + delta);
+
+      // reduce
+      x += y;
+    }
+
+    // Second: send result to output segment lanes
+    self_type result;
+    int get_from           = get_lane() & ((1 << segbits) - 1);
+    result.get_raw_value() = hip::impl::shfl_sync(x, get_from);
+
+    int mask = (get_lane() >> segbits) == output_segment;
+
+
+    // Third: mask off everything but output_segment
+    if (!mask)
+    {
+      result.get_raw_value() = 0;
+    }
+
+    return result;
+  }
+
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type segmented_divide_nm(self_type den,
+                                camp::idx_t segbits,
+                                camp::idx_t num_inner,
+                                camp::idx_t num_outer) const
+  {
+    self_type result;
+
+    auto lane = get_lane();
+
+    // compute segment and segment_size
+    auto seg = lane >> segbits;
+    auto i   = lane & ((1 << segbits) - 1);
+
+    if (seg >= num_outer || i >= num_inner)
+    {
+      // nop
+    }
+    else
+    {
+      result.get_raw_value() = m_value / den.get_raw_value();
+    }
+
+    return result;
+  }
+
+  /*!
+   * Segmented broadcast copies a segment to all output segments of a vector
+   *
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>seg_bits
+   *
+   *
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=0 means the input segment size is 1, so this selects the
+   *      value at x[input_segment] and broadcasts it to the rest of the
+   *      vector
+   *
+   *      input segments allowed are from 0 to 7, inclusive
+   *
+   *      input_segment=0
+   *      Result= x0, x0, x0, x0, x0, x0, x0, x0
+   *
+   *      input_segment=5
+   *      Result= x5, x5, x5, x5, x5, x5, x5, x5
+   *
+   *  segbits=1 means that the input segments are each pair of x values:
+   *
+   *      input segments allowed are from 0 to 3, inclusive
+   *
+   *      input_segment=0:
+   *      Result= x0, x1, x0, x1, x0, x1, x0, x1
+   *
+   *      input_segment=1:
+   *      Result= x2, x3, x2, x3, x2, x3, x2, x3
+   *
+   *      input_segment=3:
+   *      Result= x6, x7, x6, x7, x6, x7, x6, x7
+   *
+   *  and so on up to segbits=2, the input segments are 4 wide:
+   *
+   *      input segments allowed are 0 or 1
+   *
+   *      input_segment=0:
+   *      Result= x0, x1, x2, x3, x0, x1, x2, x3
+   *
+   *      input_segment=1:
+   *      Result= x4, x5, x6, x7, x4, x5, x6, x7
+   *
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type segmented_broadcast_inner(camp::idx_t segbits,
+                                      camp::idx_t input_segment) const
+  {
+    self_type result;
+
+    camp::idx_t mask   = (1 << segbits) - 1;
+    camp::idx_t offset = input_segment << segbits;
+
+
+    camp::idx_t i = (get_lane() & mask) + offset;
+
+    result.get_raw_value() = hip::impl::shfl_sync(m_value, i);
+
+
+    return result;
+  }
+
+  /*!
+   * Segmented broadcast spreads a segment to all output segments of a vector
+   *
+   * Note: segment size is 1<<segbits elements
+   *       number of segments is s_num_elem>>seg_bits
+   *
+   *
+   *  Example:
+   *
+   *  Given input vector  X = x0, x1, x2, x3, x4, x5, x6, x7
+   *
+   *  segbits=0 means the input segment size is 1, so this selects the
+   *      value at x[input_segment] and broadcasts it to the rest of the
+   *      vector
+   *
+   *      input segments allowed are from 0 to 7, inclusive
+   *
+   *      input_segment=0
+   *      Result= x0, x0, x0, x0, x0, x0, x0, x0
+   *
+   *      input_segment=5
+   *      Result= x5, x5, x5, x5, x5, x5, x5, x5
+   *
+   *  segbits=1 means that the input segments are each pair of x values:
+   *
+   *      input segments allowed are from 0 to 3, inclusive
+   *
+   *      input_segment=0:
+   *      Result= x0, x0, x0, x0, x1, x1, x1, x1
+   *
+   *      input_segment=1:
+   *      Result= x2, x2, x2, x2, x3, x3, x3, x3
+   *
+   *      input_segment=3:
+   *      Result= x6, x6, x6, x6, x7, x7, x7, x7
+   */
+  RAJA_INLINE
+
+  RAJA_DEVICE
+  self_type segmented_broadcast_outer(camp::idx_t segbits,
+                                      camp::idx_t input_segment) const
+  {
+    self_type result;
+
+    camp::idx_t offset = input_segment * (self_type::s_num_elem >> segbits);
+
+    camp::idx_t i = (get_lane() >> segbits) + offset;
+
+    result.get_raw_value() = hip::impl::shfl_sync(m_value, i);
+
+    return result;
+  }
+};
+
+
+}  // namespace expt
+
+}  // namespace RAJA
+
+
+#endif  // Guard
+
+#endif  // HIP
diff --git a/include/RAJA/policy/tensor/arch/hip/traits.hpp b/include/RAJA/policy/tensor/arch/hip/traits.hpp
index 4c4d959599..807a71e924 100644
--- a/include/RAJA/policy/tensor/arch/hip/traits.hpp
+++ b/include/RAJA/policy/tensor/arch/hip/traits.hpp
@@ -21,26 +21,29 @@
 #ifndef RAJA_policy_tensor_arch_hip_traits_HPP
 #define RAJA_policy_tensor_arch_hip_traits_HPP
 
-namespace RAJA {
-namespace internal {
-namespace expt {
-
-  template<typename T>
-  struct RegisterTraits<RAJA::expt::hip_wave_register, T>{
-      using element_type = T;
-      using register_policy = RAJA::expt::hip_wave_register;
-      static constexpr camp::idx_t s_num_elem = 64;
-      static constexpr camp::idx_t s_num_bits = sizeof(T) * s_num_elem;
-      using int_element_type = int32_t;
-  };
-
-} // namespace internal
-} // namespace expt
-} // namespace RAJA
-
+namespace RAJA
+{
+namespace internal
+{
+namespace expt
+{
+
+template<typename T>
+struct RegisterTraits<RAJA::expt::hip_wave_register, T>
+{
+  using element_type                      = T;
+  using register_policy                   = RAJA::expt::hip_wave_register;
+  static constexpr camp::idx_t s_num_elem = 64;
+  static constexpr camp::idx_t s_num_bits = sizeof(T) * s_num_elem;
+  using int_element_type                  = int32_t;
+};
+
+}  // namespace expt
+}  // namespace internal
+}  // namespace RAJA
 
 
 #endif
 
 
-#endif // RAJA_ENABLE_HIP
+#endif  // RAJA_ENABLE_HIP
diff --git a/include/RAJA/policy/tensor/arch/scalar.hpp b/include/RAJA/policy/tensor/arch/scalar.hpp
index 5e139f41f0..29b3788e80 100644
--- a/include/RAJA/policy/tensor/arch/scalar.hpp
+++ b/include/RAJA/policy/tensor/arch/scalar.hpp
@@ -16,16 +16,12 @@
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
 
 
-
 #ifndef RAJA_policy_tensor_arch_scalar_HPP
 #define RAJA_policy_tensor_arch_scalar_HPP
 
 
-
-#include<RAJA/policy/tensor/arch/scalar/traits.hpp>
-#include<RAJA/policy/tensor/arch/scalar/scalar.hpp>
+#include <RAJA/policy/tensor/arch/scalar/traits.hpp>
+#include <RAJA/policy/tensor/arch/scalar/scalar.hpp>
 
 
 #endif
-
-
diff --git a/include/RAJA/policy/tensor/arch/scalar/scalar.hpp b/include/RAJA/policy/tensor/arch/scalar/scalar.hpp
index 139c5d27a5..77c75132bd 100644
--- a/include/RAJA/policy/tensor/arch/scalar/scalar.hpp
+++ b/include/RAJA/policy/tensor/arch/scalar/scalar.hpp
@@ -22,449 +22,482 @@
 
 namespace RAJA
 {
-namespace expt {
+namespace expt
+{
+
+/**
+ * A specialization for a single element register.
+ * We will implement this as a scalar value, and let the compiler use
+ * whatever registers it deems appropriate.
+ */
+template<typename T>
+class Register<T, scalar_register>
+    : public internal::expt::RegisterBase<Register<T, scalar_register>>
+{
+public:
+  using base_type = internal::expt::RegisterBase<Register<T, scalar_register>>;
+
+  using register_policy = scalar_register;
+  using self_type       = Register<T, scalar_register>;
+  using element_type    = T;
+  using register_type   = T;
+
+  using int_vector_type =
+      Register<typename internal::expt::RegisterTraits<scalar_register,
+                                                       T>::int_element_type,
+               scalar_register>;
+
+
+private:
+  T m_value;
+
+public:
+  static constexpr camp::idx_t s_num_elem = 1;
+
+  /*!
+   * @brief Default constructor, zeros register contents
+   */
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  constexpr Register() : base_type(), m_value(0) {}
+
+  /*!
+   * @brief Construct from a scalar value, which becomes the single element
+   */
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  constexpr Register(element_type const& c) : base_type(), m_value(c) {}
+
+  /*!
+   * @brief Copy constructor
+   */
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  constexpr Register(self_type const& c) : base_type(), m_value(c.m_value) {}
+
+  /*!
+   * @brief Copy assignment operator
+   */
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& operator=(self_type const& c)
+  {
+    m_value = c.m_value;
+    return *this;
+  }
+
+  /*!
+   * @brief Load a full register from a stride-one memory location
+   *
+   */
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& load_packed(element_type const* ptr)
+  {
+    m_value = ptr[0];
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a stride-one memory location given
+   *        a run-time number of elements.
+   * Reads ptr[0] when N > 0; otherwise the register is set to zero.
+   */
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& load_packed_n(element_type const* ptr, camp::idx_t N)
+  {
+    if (N > 0)
+    {
+      m_value = ptr[0];
+    }
+    else
+    {
+      m_value = element_type(0);
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Gather a full register from a strided memory location
+   * The stride argument is unused: only ptr[0] is read.
+   */
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& load_strided(element_type const* ptr, camp::idx_t)
+  {
+    m_value = ptr[0];
+    return *this;
+  }
+
+  /*!
+   * @brief Partially load a register from a strided memory location given
+   *        a run-time number of elements.
+   * The stride is unused: reads ptr[0] when N > 0, otherwise zeros.
+   */
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& load_strided_n(element_type const* ptr, camp::idx_t, camp::idx_t N)
+  {
+    if (N > 0)
+    {
+      m_value = ptr[0];
+    }
+    else
+    {
+      m_value = element_type(0);
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Generic gather operation for full vector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
+  self_type& gather(element_type const* ptr, int_vector_type offsets)
+  {
+
+    m_value = ptr[offsets.get(0)];
+
+    return *this;
+  }
+
+  /*!
+   * @brief Generic gather operation for n-length subvector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be loaded relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
+  self_type& gather_n(element_type const* ptr,
+                      int_vector_type offsets,
+                      camp::idx_t N)
+  {
+    if (N > 0)
+    {
+      m_value = ptr[offsets.get(0)];
+    }
+    else
+    {
+      m_value = element_type(0);
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to consecutive memory locations
+   *
+   */
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type const& store_packed(element_type* ptr) const
+  {
+    ptr[0] = m_value;
+    return *this;
+  }
+
+  /*!
+   * @brief Store partial register to consecutive memory locations
+   * Writes ptr[0] only when N > 0; otherwise nothing is stored.
+   */
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type const& store_packed_n(element_type* ptr, camp::idx_t N) const
+  {
+    if (N > 0)
+    {
+      ptr[0] = m_value;
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Store entire register to strided memory locations
+   * The stride argument is unused: only ptr[0] is written.
+   */
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type const& store_strided(element_type* ptr, camp::idx_t) const
+  {
+    ptr[0] = m_value;
+    return *this;
+  }
+
+  /*!
+   * @brief Store partial register to strided memory locations
+   * The stride is unused: writes ptr[0] only when N > 0.
+   */
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type const& store_strided_n(element_type* ptr,
+                                   camp::idx_t,
+                                   camp::idx_t N) const
+  {
+    if (N > 0)
+    {
+      ptr[0] = m_value;
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Generic scatter operation for full vector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be stored relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
+   */
+  RAJA_INLINE
+  self_type const& scatter(element_type* ptr, int_vector_type offsets) const
+  {
+
+    ptr[offsets.get(0)] = m_value;
+
+    return *this;
+  }
 
-  /**
-   * A specialization for a single element register.
-   * We will implement this as a scalar value, and let the compiler use
-   * whatever registers it deems appropriate.
+  /*!
+   * @brief Generic scatter operation for n-length subvector.
+   *
+   * Must provide another register containing offsets of all values
+   * to be stored relative to supplied pointer.
+   *
+   * Offsets are element-wise, not byte-wise.
+   *
    */
-  template<typename T>
-  class Register<T, scalar_register> :
-      public internal::expt::RegisterBase<Register<T, scalar_register>>
+  RAJA_INLINE
+  self_type const& scatter_n(element_type* ptr,
+                             int_vector_type offsets,
+                             camp::idx_t N) const
+  {
+    if (N > 0)
+    {
+      ptr[offsets.get(0)] = m_value;
+    }
+    return *this;
+  }
+
+  /*!
+   * @brief Get scalar value from vector register
+   * @param i Offset of scalar to get (ignored: the parameter is unnamed)
+   * @return Returns the single stored value
+   */
+  constexpr RAJA_INLINE RAJA_HOST_DEVICE element_type get(camp::idx_t) const
+  {
+    return m_value;
+  }
+
+  /*!
+   * @brief Set scalar value in vector register
+   * @param value Value of scalar to set
+   * @param i Offset of scalar to set (ignored: the parameter is unnamed)
+   */
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  self_type& set(element_type value, camp::idx_t)
+  {
+    m_value = value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& broadcast(element_type const& a)
+  {
+    m_value = a;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type& copy(self_type const& src)
+  {
+    m_value = src.m_value;
+    return *this;
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type add(self_type const& b) const
+  {
+    return self_type(m_value + b.m_value);
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type subtract(self_type const& b) const
+  {
+    return self_type(m_value - b.m_value);
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type multiply(self_type const& b) const
+  {
+    return self_type(m_value * b.m_value);
+  }
+
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type divide(self_type const& b) const
+  {
+    return self_type(m_value / b.m_value);
+  }
+
+  /*!
+   * @brief Fused multiply add: fma(b, c) = (*this)*b+c
+   *
+   * Derived types can override this to implement intrinsic FMA's
+   *
+   * @param b Second product operand
+   * @param c Sum operand
+   * @return Value of (*this)*b+c
+   */
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  self_type multiply_add(self_type const& b, self_type const& c) const
+  {
+    return m_value * b.m_value + c.m_value;
+  }
+
+  /*!
+   * @brief Fused multiply subtract: fms(b, c) = (*this)*b-c
+   *
+   * Derived types can override this to implement intrinsic FMS's
+   *
+   * @param b Second product operand
+   * @param c Subtraction operand
+   * @return Value of (*this)*b-c
+   */
+  RAJA_INLINE
+
+  RAJA_HOST_DEVICE
+  self_type multiply_subtract(self_type const& b, self_type const& c) const
+  {
+    return m_value * b.m_value - c.m_value;
+  }
+
+  /*!
+   * @brief Sum the elements of this vector
+   * @return Sum of the values of the vectors scalar elements
+   */
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  constexpr element_type sum() const { return m_value; }
+
+  /*!
+   * @brief Dot product of this register with b
+   * @return Product of the two registers' single elements
+   */
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  constexpr element_type dot(self_type const& b) const
+  {
+    return m_value * b.m_value;
+  }
+
+  /*!
+   * @brief Returns the largest element
+   * @return The largest scalar element in the register
+   */
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  constexpr element_type max() const { return m_value; }
+
+  /*!
+   * @brief Returns the largest element from first N lanes
+   * @return Stored value if N is nonzero, else limits<element_type>::min()
+   */
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  element_type max_n(camp::idx_t N) const
+  {
+    return N ? m_value : RAJA::operators::limits<element_type>::min();
+    ;  // stray empty statement after the return; it has no effect
+  }
+
+  /*!
+   * @brief Returns element-wise largest values
+   * @return Vector of the element-wise max values
+   */
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type vmax(self_type a) const
+  {
+    return self_type(RAJA::max<element_type>(m_value, a.m_value));
+  }
+
+  /*!
+   * @brief Returns the smallest element
+   * @return The smallest scalar element in the register
+   */
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  element_type min() const { return m_value; }
+
+  /*!
+   * @brief Returns the smallest element from first N lanes
+   * @return Stored value if N is nonzero, else limits<element_type>::max()
+   */
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  element_type min_n(camp::idx_t N) const
+  {
+    return N ? m_value : RAJA::operators::limits<element_type>::max();
+    ;  // stray empty statement after the return; it has no effect
+  }
+
+  /*!
+   * @brief Returns element-wise smallest values
+   * @return Vector of the element-wise min values
+   */
+  RAJA_HOST_DEVICE
+
+  RAJA_INLINE
+  self_type vmin(self_type a) const
   {
-    public:
-      using base_type = internal::expt::RegisterBase<Register<T, scalar_register>>;
-
-      using register_policy = scalar_register;
-      using self_type = Register<T, scalar_register>;
-      using element_type = T;
-      using register_type = T;
-
-      using int_vector_type = Register<typename internal::expt::RegisterTraits<scalar_register, T>::int_element_type, scalar_register>;
-
-
-    private:
-      T m_value;
-
-    public:
-
-      static constexpr camp::idx_t s_num_elem = 1;
-
-      /*!
-       * @brief Default constructor, zeros register contents
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      Register() : base_type(), m_value(0) {
-      }
-
-      /*!
-       * @brief Copy constructor from underlying simd register
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      Register(element_type const &c) : base_type(), m_value(c) {}
-
-
-      /*!
-       * @brief Copy constructor
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      Register(self_type const &c) : base_type(), m_value(c.m_value) {}
-
-
-      /*!
-       * @brief Copy assignment constructor
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &operator=(self_type const &c){
-        m_value = c.m_value;
-        return *this;
-      }
-
-
-      /*!
-       * @brief Load a full register from a stride-one memory location
-       *
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &load_packed(element_type const *ptr){
-        m_value = ptr[0];
-        return *this;
-      }
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &load_packed_n(element_type const *ptr, camp::idx_t N){
-        if(N > 0){
-          m_value = ptr[0];
-        }
-        else{
-          m_value = element_type(0);
-        }
-        return *this;
-      }
-
-      /*!
-       * @brief Gather a full register from a strided memory location
-       *
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &load_strided(element_type const *ptr, camp::idx_t ){
-        m_value = ptr[0];
-        return *this;
-      }
-
-
-      /*!
-       * @brief Partially load a register from a stride-one memory location given
-       *        a run-time number of elements.
-       *
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &load_strided_n(element_type const *ptr, camp::idx_t , camp::idx_t N){
-        if(N > 0){
-          m_value = ptr[0];
-        }
-        else{
-          m_value = element_type(0);
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Generic gather operation for full vector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      self_type &gather(element_type const *ptr, int_vector_type offsets){
-
-        m_value = ptr[offsets.get(0)];
-
-        return *this;
-      }
-
-      /*!
-       * @brief Generic gather operation for n-length subvector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be loaded relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      self_type &gather_n(element_type const *ptr, int_vector_type offsets, camp::idx_t N){
-        if(N > 0){
-          m_value = ptr[offsets.get(0)];
-        }
-        else{
-          m_value = element_type(0);
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_packed(element_type *ptr) const{
-        ptr[0] = m_value;
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_packed_n(element_type *ptr, camp::idx_t N) const{
-        if(N > 0){
-          ptr[0] = m_value;
-        }
-        return *this;
-      }
-
-      /*!
-       * @brief Store entire register to consecutive memory locations
-       *
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_strided(element_type *ptr, camp::idx_t ) const{
-        ptr[0] = m_value;
-        return *this;
-      }
-
-
-      /*!
-       * @brief Store partial register to consecutive memory locations
-       *
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type const &store_strided_n(element_type *ptr, camp::idx_t , camp::idx_t N) const{
-        if(N > 0){
-          ptr[0] = m_value;
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Generic scatter operation for full vector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be stored relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      self_type const &scatter(element_type *ptr, int_vector_type offsets) const {
-
-        ptr[offsets.get(0)] = m_value;
-
-        return *this;
-      }
-
-      /*!
-       * @brief Generic scatter operation for n-length subvector.
-       *
-       * Must provide another register containing offsets of all values
-       * to be stored relative to supplied pointer.
-       *
-       * Offsets are element-wise, not byte-wise.
-       *
-       */
-      RAJA_INLINE
-      self_type const &scatter_n(element_type *ptr, int_vector_type offsets, camp::idx_t N) const {
-        if(N > 0){
-          ptr[offsets.get(0)] = m_value;
-        }
-        return *this;
-      }
-
-
-      /*!
-       * @brief Get scalar value from vector register
-       * @param i Offset of scalar to get
-       * @return Returns scalar value at i
-       */
-      constexpr
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      element_type get(camp::idx_t) const
-      {return m_value;}
-
-
-      /*!
-       * @brief Set scalar value in vector register
-       * @param i Offset of scalar to set
-       * @param value Value of scalar to set
-       */
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type &set(element_type value, camp::idx_t)
-      {
-        m_value = value;
-        return *this;
-      }
-
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &broadcast(element_type const &a){
-        m_value = a;
-        return *this;
-      }
-
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type &copy(self_type const &src){
-        m_value = src.m_value;
-        return *this;
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type add(self_type const &b) const {
-        return self_type(m_value + b.m_value);
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type subtract(self_type const &b) const {
-        return self_type(m_value - b.m_value);
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type multiply(self_type const &b) const {
-        return self_type(m_value * b.m_value);
-      }
-
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type divide(self_type const &b) const {
-        return self_type(m_value / b.m_value);
-      }
-
-      /*!
-       * @brief Fused multiply add: fma(b, c) = (*this)*b+c
-       *
-       * Derived types can override this to implement intrinsic FMA's
-       *
-       * @param b Second product operand
-       * @param c Sum operand
-       * @return Value of (*this)*b+c
-       */
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_add(self_type const &b, self_type const &c) const
-      {
-        return m_value * b.m_value + c.m_value;
-      }
-
-      /*!
-       * @brief Fused multiply subtract: fms(b, c) = (*this)*b-c
-       *
-       * Derived types can override this to implement intrinsic FMS's
-       *
-       * @param b Second product operand
-       * @param c Subtraction operand
-       * @return Value of (*this)*b-c
-       */
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      self_type multiply_subtract(self_type const &b, self_type const &c) const
-      {
-        return m_value * b.m_value - c.m_value;
-      }
-
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      element_type sum() const
-      {
-        return m_value;
-      }
-
-
-      /*!
-       * @brief Sum the elements of this vector
-       * @return Sum of the values of the vectors scalar elements
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      element_type dot(self_type const &b) const
-      {
-        return m_value * b.m_value;
-      }
-
-
-      /*!
-       * @brief Returns the largest element
-       * @return The largest scalar element in the register
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      constexpr
-      element_type max() const
-      {
-        return m_value;
-      }
-
-      /*!
-       * @brief Returns the largest element from first N lanes
-       * @return The largest scalar element in the register
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      element_type max_n(camp::idx_t N) const
-      {
-        return N ? m_value : RAJA::operators::limits<element_type>::min();;
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type vmax(self_type a) const
-      {
-        return self_type(RAJA::max<element_type>(m_value, a.m_value));
-      }
-
-      /*!
-       * @brief Returns the smallest element
-       * @return The smallest scalar element in the register
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      element_type min() const
-      {
-        return m_value;
-      }
-
-      /*!
-       * @brief Returns the smallest element from first N lanes
-       * @return The smallest scalar element in the register
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      element_type min_n(camp::idx_t N) const
-      {
-        return N ? m_value : RAJA::operators::limits<element_type>::max();;
-      }
-
-      /*!
-       * @brief Returns element-wise largest values
-       * @return Vector of the element-wise max values
-       */
-      RAJA_HOST_DEVICE
-      RAJA_INLINE
-      self_type vmin(self_type a) const
-      {
-        return self_type(RAJA::min<element_type>(m_value, a.m_value));
-      }
-
-
-
-  };
-} // namespace expt
+    return self_type(RAJA::min<element_type>(m_value, a.m_value));
+  }
+};
+}  // namespace expt
 }  // namespace RAJA
 
 
diff --git a/include/RAJA/policy/tensor/arch/scalar/traits.hpp b/include/RAJA/policy/tensor/arch/scalar/traits.hpp
index dfeccbb86f..80d8b9f2f8 100644
--- a/include/RAJA/policy/tensor/arch/scalar/traits.hpp
+++ b/include/RAJA/policy/tensor/arch/scalar/traits.hpp
@@ -19,52 +19,57 @@
 #ifndef RAJA_policy_tensor_arch_scalar_traits_HPP
 #define RAJA_policy_tensor_arch_scalar_traits_HPP
 
-namespace RAJA {
-namespace internal {
-namespace expt {
+namespace RAJA
+{
+namespace internal
+{
+namespace expt
+{
 
 
-  template<>
-  struct RegisterTraits<RAJA::expt::scalar_register, int32_t>{
-      using element_type = int32_t;
-      using register_policy = RAJA::expt::scalar_register;
-      static constexpr camp::idx_t s_num_bits = sizeof(element_type)*8;
-      static constexpr camp::idx_t s_num_elem = 1;
-      using int_element_type = int32_t;
-  };
+template<>
+struct RegisterTraits<RAJA::expt::scalar_register, int32_t>
+{
+  using element_type                      = int32_t;
+  using register_policy                   = RAJA::expt::scalar_register;
+  static constexpr camp::idx_t s_num_bits = sizeof(element_type) * 8;
+  static constexpr camp::idx_t s_num_elem = 1;
+  using int_element_type                  = int32_t;
+};
 
-  template<>
-  struct RegisterTraits<RAJA::expt::scalar_register, int64_t>{
-      using element_type = int64_t;
-      using register_policy = RAJA::expt::scalar_register;
-      static constexpr camp::idx_t s_num_bits = sizeof(element_type)*8;
-      static constexpr camp::idx_t s_num_elem = 1;
-      using int_element_type = int64_t;
-  };
+template<>
+struct RegisterTraits<RAJA::expt::scalar_register, int64_t>
+{
+  using element_type                      = int64_t;
+  using register_policy                   = RAJA::expt::scalar_register;
+  static constexpr camp::idx_t s_num_bits = sizeof(element_type) * 8;
+  static constexpr camp::idx_t s_num_elem = 1;
+  using int_element_type                  = int64_t;
+};
 
-  template<>
-  struct RegisterTraits<RAJA::expt::scalar_register, float>{
-      using element_type = float;
-      using register_policy = RAJA::expt::scalar_register;
-      static constexpr camp::idx_t s_num_bits = sizeof(element_type)*8;
-      static constexpr camp::idx_t s_num_elem = 1;
-      using int_element_type = int32_t;
-  };
+template<>
+struct RegisterTraits<RAJA::expt::scalar_register, float>
+{
+  using element_type                      = float;
+  using register_policy                   = RAJA::expt::scalar_register;
+  static constexpr camp::idx_t s_num_bits = sizeof(element_type) * 8;
+  static constexpr camp::idx_t s_num_elem = 1;
+  using int_element_type                  = int32_t;
+};
 
-  template<>
-  struct RegisterTraits<RAJA::expt::scalar_register, double>{
-      using element_type = double;
-      using register_policy = RAJA::expt::scalar_register;
-      static constexpr camp::idx_t s_num_bits = sizeof(element_type)*8;
-      static constexpr camp::idx_t s_num_elem = 1;
-      using int_element_type = int64_t;
-  };
+template<>
+struct RegisterTraits<RAJA::expt::scalar_register, double>
+{
+  using element_type                      = double;
+  using register_policy                   = RAJA::expt::scalar_register;
+  static constexpr camp::idx_t s_num_bits = sizeof(element_type) * 8;
+  static constexpr camp::idx_t s_num_elem = 1;
+  using int_element_type                  = int64_t;
+};
 
 
-}
-}
-}
+}  // namespace expt
+}  // namespace internal
+}  // namespace RAJA
 
 #endif
-
-
diff --git a/include/RAJA/policy/tensor/arch_impl.hpp b/include/RAJA/policy/tensor/arch_impl.hpp
index e14451505a..0e7085b5e2 100644
--- a/include/RAJA/policy/tensor/arch_impl.hpp
+++ b/include/RAJA/policy/tensor/arch_impl.hpp
@@ -22,7 +22,6 @@
 #include "RAJA/policy/tensor/arch.hpp"
 
 
-
 //
 //////////////////////////////////////////////////////////////////////
 //
@@ -32,30 +31,29 @@
 //
 
 #ifdef __AVX512F__
-#include<RAJA/policy/tensor/arch/avx512.hpp>
+#include <RAJA/policy/tensor/arch/avx512.hpp>
 #endif
 
 
 #ifdef __AVX2__
-#include<RAJA/policy/tensor/arch/avx2.hpp>
+#include <RAJA/policy/tensor/arch/avx2.hpp>
 #endif
 
 
 #ifdef __AVX__
-#include<RAJA/policy/tensor/arch/avx.hpp>
+#include <RAJA/policy/tensor/arch/avx.hpp>
 #endif
 
 #ifdef RAJA_CUDA_ACTIVE
-#include<RAJA/policy/tensor/arch/cuda.hpp>
+#include <RAJA/policy/tensor/arch/cuda.hpp>
 #endif
 
 #ifdef RAJA_HIP_ACTIVE
-#include<RAJA/policy/tensor/arch/hip.hpp>
+#include <RAJA/policy/tensor/arch/hip.hpp>
 #endif
 
 // The scalar register is always supported (doesn't require any SIMD/SIMT)
-#include<RAJA/policy/tensor/arch/scalar.hpp>
-
+#include <RAJA/policy/tensor/arch/scalar.hpp>
 
 
 #endif
diff --git a/include/RAJA/policy/tensor/policy.hpp b/include/RAJA/policy/tensor/policy.hpp
index 8618d543b2..15cd886d66 100644
--- a/include/RAJA/policy/tensor/policy.hpp
+++ b/include/RAJA/policy/tensor/policy.hpp
@@ -21,7 +21,6 @@
 #include "RAJA/policy/PolicyBase.hpp"
 #include "RAJA/config.hpp"
 
-
 //
 //////////////////////////////////////////////////////////////////////
 //
@@ -40,37 +39,42 @@ namespace policy
 namespace tensor
 {
 
-template<typename EXEC_POLICY, typename TENSOR_TYPE, camp::idx_t DIM, camp::idx_t TILE_SIZE>
-struct tensor_exec : public EXEC_POLICY {
+template<typename EXEC_POLICY,
+         typename TENSOR_TYPE,
+         camp::idx_t DIM,
+         camp::idx_t TILE_SIZE>
+struct tensor_exec : public EXEC_POLICY
+{
   using exec_policy = EXEC_POLICY;
   using tensor_type = TENSOR_TYPE;
 
   static constexpr camp::idx_t s_tensor_dim = DIM;
-  static constexpr camp::idx_t s_tile_size = TILE_SIZE;
+  static constexpr camp::idx_t s_tile_size  = TILE_SIZE;
 };
 
 
-
 }  // end of namespace tensor
 
 }  // end of namespace policy
 
-namespace expt {
+namespace expt
+{
 
 
 template<typename TENSOR_TYPE, camp::idx_t TILE_SIZE = -1>
-using vector_exec = policy::tensor::tensor_exec<RAJA::seq_exec, TENSOR_TYPE, 0, TILE_SIZE>;
+using vector_exec =
+    policy::tensor::tensor_exec<RAJA::seq_exec, TENSOR_TYPE, 0, TILE_SIZE>;
 
 template<typename TENSOR_TYPE, camp::idx_t TILE_SIZE = -1>
-using matrix_row_exec = policy::tensor::tensor_exec<seq_exec, TENSOR_TYPE, 0, TILE_SIZE>;
+using matrix_row_exec =
+    policy::tensor::tensor_exec<seq_exec, TENSOR_TYPE, 0, TILE_SIZE>;
 
 template<typename TENSOR_TYPE, camp::idx_t TILE_SIZE = -1>
-using matrix_col_exec = policy::tensor::tensor_exec<seq_exec, TENSOR_TYPE, 1, TILE_SIZE>;
-
-
-} //  namespace expt
+using matrix_col_exec =
+    policy::tensor::tensor_exec<seq_exec, TENSOR_TYPE, 1, TILE_SIZE>;
 
 
+}  //  namespace expt
 
 
 }  // end of namespace RAJA
diff --git a/include/RAJA/util/BitMask.hpp b/include/RAJA/util/BitMask.hpp
index 63f011b689..1bf250b008 100644
--- a/include/RAJA/util/BitMask.hpp
+++ b/include/RAJA/util/BitMask.hpp
@@ -20,65 +20,64 @@
 
 #include "RAJA/config.hpp"
 
-
 namespace RAJA
 {
 
-  template<camp::idx_t N>
-  struct LogBase2
-  {
-      static constexpr camp::idx_t value = LogBase2<(N>>1)>::value + 1;
-      static constexpr bool is_exact = ((1<<value) == N);
-  };
-
-  template<>
-  struct LogBase2<0>
-  {
-      static constexpr camp::idx_t value = -1;
-      static constexpr bool is_exact = true;
-  };
-
-  /*!
-   * A bit-masking operator
-   *
-   * Provides an operator that shifts and masks in input value to extract
-   * a contiguous set of bits.
-   *
-   * result = (input >> Shift) & (Mask)
-   *
-   * Where mask is (1<<Width)-1, or the number of bits defined by Width.
-   *
-   *
-   */
-  template<int Width, int Shift>
-  struct BitMask {
-    static constexpr int shift = Shift;
-    static constexpr int width = Width;
-    static constexpr int max_input_size = 1<<(Shift+Width);
-    static constexpr int max_masked_size = 1<<Width;
-    static constexpr int max_shifted_size = 1<<Shift;
+template<camp::idx_t N>
+struct LogBase2
+{
+  static constexpr camp::idx_t value = LogBase2<(N >> 1)>::value + 1;
+  static constexpr bool is_exact     = ((1 << value) == N);
+};
 
-    template<typename T>
-    RAJA_HOST_DEVICE
-    static constexpr T maskValue(T input) {
-      return( (input>>( static_cast<T>(Shift) )) & static_cast<T>((1<<(Width))-1) );
-    }
+template<>
+struct LogBase2<0>
+{
+  static constexpr camp::idx_t value = -1;
+  static constexpr bool is_exact     = true;
+};
 
+/*!
+ * A bit-masking operator
+ *
+ * Provides an operator that shifts and masks in input value to extract
+ * a contiguous set of bits.
+ *
+ * result = (input >> Shift) & (Mask)
+ *
+ * Where mask is (1<<Width)-1, or the number of bits defined by Width.
+ *
+ *
+ */
+template<int Width, int Shift>
+struct BitMask
+{
+  static constexpr int shift            = Shift;
+  static constexpr int width            = Width;
+  static constexpr int max_input_size   = 1 << (Shift + Width);
+  static constexpr int max_masked_size  = 1 << Width;
+  static constexpr int max_shifted_size = 1 << Shift;
 
-    template<typename T>
-    RAJA_HOST_DEVICE
-    static constexpr T getOuter(T input) {
-      return(  (input>>(static_cast<T>(Shift))) >> Width );
-    }
+  template<typename T>
+  RAJA_HOST_DEVICE static constexpr T maskValue(T input)
+  {
+    return ((input >> (static_cast<T>(Shift))) &
+            static_cast<T>((1 << (Width)) - 1));
+  }
 
-    template<typename T>
-    RAJA_HOST_DEVICE
-    static constexpr T maskOuter(T input) {
-      return( input & (static_cast<T>(-1) << (Width+Shift) )  );
-    }
+  template<typename T>
+  RAJA_HOST_DEVICE static constexpr T getOuter(T input)
+  {
+    return ((input >> (static_cast<T>(Shift))) >> Width);
+  }
 
-  };
+  template<typename T>
+  RAJA_HOST_DEVICE static constexpr T maskOuter(T input)
+  {
+    return (input & (static_cast<T>(-1) << (Width + Shift)));
+  }
+};
 
 }  // namespace RAJA
 
-#endif //RAJA_util_BitMask_HPP
+#endif  // RAJA_util_BitMask_HPP
diff --git a/include/RAJA/util/CombiningAdapter.hpp b/include/RAJA/util/CombiningAdapter.hpp
index abe8197b93..f74249391c 100644
--- a/include/RAJA/util/CombiningAdapter.hpp
+++ b/include/RAJA/util/CombiningAdapter.hpp
@@ -77,16 +77,16 @@ namespace RAJA
  *     }
  *
  */
-template <typename Lambda, typename Layout_>
+template<typename Lambda, typename Layout_>
 struct CombiningAdapter
 {
   using Layout = Layout_;
 
-  using IndexRange = typename Layout::IndexRange;
+  using IndexRange     = typename Layout::IndexRange;
   using StrippedIdxLin = typename Layout::StrippedIdxLin;
-  using IndexLinear = typename Layout::IndexLinear;
-  using DimTuple = typename Layout::DimTuple;
-  using DimArr = typename Layout::DimArr;
+  using IndexLinear    = typename Layout::IndexLinear;
+  using DimTuple       = typename Layout::DimTuple;
+  using DimArr         = typename Layout::DimArr;
 
   using RangeLinear = RAJA::TypedRangeSegment<IndexLinear>;
 
@@ -95,21 +95,24 @@ struct CombiningAdapter
   Layout m_layout;
 
   RAJA_SUPPRESS_HD_WARN
-  template < camp::idx_t... RangeInts >
+  template<camp::idx_t... RangeInts>
   RAJA_HOST_DEVICE inline auto call_helper(IndexLinear linear_index,
                                            camp::idx_seq<RangeInts...>)
-    -> decltype(m_lambda(camp::val<camp::tuple_element_t<RangeInts, DimTuple>>()...))
+      -> decltype(m_lambda(
+          camp::val<camp::tuple_element_t<RangeInts, DimTuple>>()...))
   {
     DimTuple indices;
     m_layout.toIndices(linear_index, camp::get<RangeInts>(indices)...);
     return m_lambda(camp::get<RangeInts>(indices)...);
   }
+
   ///
   RAJA_SUPPRESS_HD_WARN
-  template < camp::idx_t... RangeInts >
+  template<camp::idx_t... RangeInts>
   RAJA_HOST_DEVICE inline auto call_helper(IndexLinear linear_index,
                                            camp::idx_seq<RangeInts...>) const
-    -> decltype(m_lambda(camp::val<camp::tuple_element_t<RangeInts, DimTuple>>()...))
+      -> decltype(m_lambda(
+          camp::val<camp::tuple_element_t<RangeInts, DimTuple>>()...))
   {
     DimTuple indices;
     m_layout.toIndices(linear_index, camp::get<RangeInts>(indices)...);
@@ -117,16 +120,14 @@ struct CombiningAdapter
   }
 
 public:
-
   /*!
    * Constructor from lambda and layout.
    */
-  template < typename C_Lambda, typename C_Layout >
+  template<typename C_Lambda, typename C_Layout>
   RAJA_HOST_DEVICE CombiningAdapter(C_Lambda&& lambda, C_Layout&& layout)
-      : m_lambda(std::forward<C_Lambda>(lambda))
-      , m_layout(std::forward<C_Layout>(layout))
-  {
-  }
+      : m_lambda(std::forward<C_Lambda>(lambda)),
+        m_layout(std::forward<C_Layout>(layout))
+  {}
 
   /*!
    * Call the lambda by converting the linear index to multidimensional indices.
@@ -134,13 +135,14 @@ struct CombiningAdapter
    * @return return value of lambda
    */
   RAJA_HOST_DEVICE RAJA_INLINE auto operator()(IndexLinear linear_index)
-    -> decltype(call_helper(linear_index, IndexRange()))
+      -> decltype(call_helper(linear_index, IndexRange()))
   {
     return call_helper(linear_index, IndexRange());
   }
+
   ///
   RAJA_HOST_DEVICE RAJA_INLINE auto operator()(IndexLinear linear_index) const
-    -> decltype(call_helper(linear_index, IndexRange()))
+      -> decltype(call_helper(linear_index, IndexRange()))
   {
     return call_helper(linear_index, IndexRange());
   }
@@ -206,59 +208,66 @@ struct CombiningAdapter
  *     }
  *
  */
-template <typename Lambda, typename Layout>
-RAJA_HOST_DEVICE RAJA_INLINE
-auto make_CombiningAdapter_from_layout(Lambda&& lambda, Layout&& layout)
-  // -> CombiningAdapter<camp::decay<Lambda>, camp::decay<Layout>>
+template<typename Lambda, typename Layout>
+RAJA_HOST_DEVICE RAJA_INLINE auto make_CombiningAdapter_from_layout(
+    Lambda&& lambda,
+    Layout&& layout)
+// -> CombiningAdapter<camp::decay<Lambda>, camp::decay<Layout>>
 {
   return CombiningAdapter<camp::decay<Lambda>, camp::decay<Layout>>(
       std::forward<Lambda>(lambda), std::forward<Layout>(layout));
 }
 ///
 RAJA_SUPPRESS_HD_WARN
-template <typename Lambda, typename... IdxTs>
-RAJA_INLINE
-auto make_CombiningAdapter(Lambda&& lambda, ::RAJA::TypedRangeSegment<IdxTs> const&... segs)
-  // -> decltype(make_CombiningAdapter_from_layout(std::forward<Lambda>(lambda),
-  //             camp::val<RAJA::TypedOffsetLayout<
-  //                 typename std::common_type< strip_index_type_t<IdxTs>... >::type,
-  //                 IdxTs...>>()))
+template<typename Lambda, typename... IdxTs>
+RAJA_INLINE auto make_CombiningAdapter(
+    Lambda&& lambda,
+    ::RAJA::TypedRangeSegment<IdxTs> const&... segs)
+// -> decltype(make_CombiningAdapter_from_layout(std::forward<Lambda>(lambda),
+//             camp::val<RAJA::TypedOffsetLayout<
+//                 typename std::common_type< strip_index_type_t<IdxTs>...
+//                 >::type, IdxTs...>>()))
 {
-  using std::begin; using std::end; using std::distance;
-  using IdxLin = typename std::common_type< strip_index_type_t<IdxTs>... >::type;
+  using std::begin;
+  using std::distance;
+  using std::end;
+  using IdxLin = typename std::common_type<strip_index_type_t<IdxTs>...>::type;
   using Layout = RAJA::Layout<sizeof...(IdxTs), IdxLin>;
   using OffsetLayout = RAJA::TypedOffsetLayout<IdxLin, camp::tuple<IdxTs...>>;
 
   Layout layout(static_cast<IdxLin>(distance(begin(segs), end(segs)))...);
   OffsetLayout offset_layout = OffsetLayout::from_layout_and_offsets(
-        {{(distance(begin(segs), end(segs)) ? static_cast<IdxLin>(*begin(segs))
-                                            : static_cast<IdxLin>(0))...}},
-        std::move(layout));
+      {{(distance(begin(segs), end(segs)) ? static_cast<IdxLin>(*begin(segs))
+                                          : static_cast<IdxLin>(0))...}},
+      std::move(layout));
   return make_CombiningAdapter_from_layout(std::forward<Lambda>(lambda),
                                            std::move(offset_layout));
 }
 ///
 RAJA_SUPPRESS_HD_WARN
-template <typename Perm, typename Lambda, typename... IdxTs>
-RAJA_INLINE
-auto make_PermutedCombiningAdapter(Lambda&& lambda, ::RAJA::TypedRangeSegment<IdxTs> const&... segs)
-  // -> decltype(make_CombiningAdapter_from_layout(std::forward<Lambda>(lambda),
-  //             camp::val<RAJA::TypedOffsetLayout<
-  //                 typename std::common_type< strip_index_type_t<IdxTs>... >::type,
-  //                 IdxTs...>>()))
+template<typename Perm, typename Lambda, typename... IdxTs>
+RAJA_INLINE auto make_PermutedCombiningAdapter(
+    Lambda&& lambda,
+    ::RAJA::TypedRangeSegment<IdxTs> const&... segs)
+// -> decltype(make_CombiningAdapter_from_layout(std::forward<Lambda>(lambda),
+//             camp::val<RAJA::TypedOffsetLayout<
+//                 typename std::common_type< strip_index_type_t<IdxTs>...
+//                 >::type, IdxTs...>>()))
 {
-  using std::begin; using std::end; using std::distance;
-  using IdxLin = typename std::common_type< strip_index_type_t<IdxTs>... >::type;
+  using std::begin;
+  using std::distance;
+  using std::end;
+  using IdxLin = typename std::common_type<strip_index_type_t<IdxTs>...>::type;
   using OffsetLayout = RAJA::TypedOffsetLayout<IdxLin, camp::tuple<IdxTs...>>;
 
   auto layout = make_permuted_layout<sizeof...(IdxTs), IdxLin>(
-              {{static_cast<IdxLin>(distance(begin(segs), end(segs)))...}},
-              RAJA::as_array<Perm>::get());
+      {{static_cast<IdxLin>(distance(begin(segs), end(segs)))...}},
+      RAJA::as_array<Perm>::get());
   OffsetLayout offset_layout = OffsetLayout::from_layout_and_offsets(
-        {{(distance(begin(segs), end(segs)) ? static_cast<IdxLin>(*begin(segs))
-                                            : static_cast<IdxLin>(0))...}},
+      {{(distance(begin(segs), end(segs)) ? static_cast<IdxLin>(*begin(segs))
+                                          : static_cast<IdxLin>(0))...}},
 
-        std::move(layout));
+      std::move(layout));
   return make_CombiningAdapter_from_layout(std::forward<Lambda>(lambda),
                                            std::move(offset_layout));
 }
diff --git a/include/RAJA/util/EnableIf.hpp b/include/RAJA/util/EnableIf.hpp
index 257e852bf9..61ac083818 100644
--- a/include/RAJA/util/EnableIf.hpp
+++ b/include/RAJA/util/EnableIf.hpp
@@ -29,26 +29,27 @@
 
 #include "RAJA/util/concepts.hpp"
 
-
 namespace RAJA
 {
 namespace util
 {
 
 
-template <typename T, typename TypeList>
+template<typename T, typename TypeList>
 struct is_any_of;
 
-template <typename T, typename... Types>
+template<typename T, typename... Types>
 struct is_any_of<T, ::camp::list<Types...>>
-  : ::RAJA::concepts::any_of<::camp::is_same<T, Types>...>
+    : ::RAJA::concepts::any_of<::camp::is_same<T, Types>...>
 {};
 
-template <typename T, typename TypeList>
+template<typename T, typename TypeList>
 using enable_if_is_any_of = std::enable_if_t<is_any_of<T, TypeList>::value, T>;
 
-template <typename T, typename TypeList>
-using enable_if_is_none_of = std::enable_if_t<::RAJA::concepts::negate<is_any_of<T, TypeList>>::value, T>;
+template<typename T, typename TypeList>
+using enable_if_is_none_of =
+    std::enable_if_t<::RAJA::concepts::negate<is_any_of<T, TypeList>>::value,
+                     T>;
 
 
 }  // namespace util
diff --git a/include/RAJA/util/IndexLayout.hpp b/include/RAJA/util/IndexLayout.hpp
index 6bb308d375..6ce6908d1b 100644
--- a/include/RAJA/util/IndexLayout.hpp
+++ b/include/RAJA/util/IndexLayout.hpp
@@ -3,7 +3,8 @@
  *
  * \file
  *
- * \brief   RAJA header file defining the IndexLayout class and IndexList classes.
+ * \brief   RAJA header file defining the IndexLayout class and IndexList
+ *          classes.
  *
  ******************************************************************************
  */
@@ -20,60 +21,69 @@
 
 #include "RAJA/util/Layout.hpp"
 
-namespace RAJA 
+namespace RAJA
 {
 
 /*!
-* DirectIndex struct contains call operator that returns the same index that was input
-*
-*/
+ * DirectIndex struct contains call operator that returns the same index that
+ * was input
+ *
+ */
 template<typename IdxLin = Index_type>
-struct DirectIndex {
+struct DirectIndex
+{
 
-  IdxLin RAJA_INLINE RAJA_HOST_DEVICE constexpr operator()(const IdxLin idx) const
+  IdxLin RAJA_INLINE RAJA_HOST_DEVICE constexpr operator()(
+      const IdxLin idx) const
   {
     return idx;
   }
-
 };
 
 /*!
-* IndexList struct stores a pointer to an array containing the index list.
-* Its call operator returns the entry at the input location (idx) of its index list.
-* 
-*/
+ * IndexList struct stores a pointer to an array containing the index list.
+ * Its call operator returns the entry at the input location (idx) of its index
+ * list.
+ *
+ */
 template<typename IdxLin = Index_type>
-struct IndexList {
+struct IndexList
+{
 
-  IdxLin* index_list{nullptr};
+  IdxLin* index_list {nullptr};
 
-  IdxLin RAJA_INLINE RAJA_HOST_DEVICE constexpr operator()(const IdxLin idx) const
+  IdxLin RAJA_INLINE RAJA_HOST_DEVICE constexpr operator()(
+      const IdxLin idx) const
   {
     return index_list[idx];
   }
-
 };
 
 /*!
-* ConditionalIndexList struct stores a pointer to an array containing the index list.
-* Its call operator returns the same index that was input if the index list is a nullptr, 
-* or otherwise returns the entry at the input location (idx) of its index list.
-* 
-*/
+ * ConditionalIndexList struct stores a pointer to an array containing the index
+ * list. Its call operator returns the same index that was input if the index
+ * list is a nullptr, or otherwise returns the entry at the input location (idx)
+ * of its index list.
+ *
+ */
 template<typename IdxLin = Index_type>
-struct ConditionalIndexList {
+struct ConditionalIndexList
+{
 
-  IdxLin* index_list{nullptr};  
+  IdxLin* index_list {nullptr};
 
-  IdxLin RAJA_INLINE RAJA_HOST_DEVICE constexpr operator()(const IdxLin idx) const
+  IdxLin RAJA_INLINE RAJA_HOST_DEVICE constexpr operator()(
+      const IdxLin idx) const
   {
-    if (index_list) {
+    if (index_list)
+    {
       return index_list[idx];
-    } else {
+    }
+    else
+    {
       return idx;
     }
   }
-
 };
 
 namespace internal
@@ -82,88 +92,92 @@ namespace internal
 template<typename Range, typename IdxLin, typename... IndexTypes>
 struct IndexLayout_impl;
 
-template <camp::idx_t... RangeInts, typename IdxLin, typename... IndexTypes>
-struct IndexLayout_impl<camp::idx_seq<RangeInts...>, IdxLin, IndexTypes...> {
-  using IndexRange = camp::idx_seq<RangeInts...>;
+template<camp::idx_t... RangeInts, typename IdxLin, typename... IndexTypes>
+struct IndexLayout_impl<camp::idx_seq<RangeInts...>, IdxLin, IndexTypes...>
+{
+  using IndexRange  = camp::idx_seq<RangeInts...>;
   using IndexLinear = IdxLin;
-  using Base = RAJA::detail::LayoutBase_impl<IndexRange, IdxLin>;
+  using Base        = RAJA::detail::LayoutBase_impl<IndexRange, IdxLin>;
   Base base_;
 
   static constexpr size_t n_dims = sizeof...(RangeInts);
 
   camp::tuple<IndexTypes...> tuple;
 
-  template <typename... Types>
+  template<typename... Types>
   constexpr RAJA_INLINE IndexLayout_impl(
       camp::tuple<IndexTypes...> index_tuple_in,
       Types... ns)
-      : base_{(ns)...},
+      : base_ {(ns)...},
         tuple(index_tuple_in)
-  {
-  }
+  {}
 
   /*!
    * Computes a linear space index from entries of index lists stored in tuple.
-   * This is accomplished through the inner product of the strides and the 
+   * This is accomplished through the inner product of the strides and the
    * entry in the index list along each dimension.
    * @param indices Indices in the n-dimensional space of this layout
    * @return Linear space index.
-   */  
-  template <typename... Indices>
+   */
+  template<typename... Indices>
   RAJA_INLINE RAJA_HOST_DEVICE constexpr IdxLin operator()(
       Indices... indices) const
   {
     return sum<IdxLin>(
-      (base_.strides[RangeInts] * camp::get<RangeInts>(tuple)(indices))...);
+        (base_.strides[RangeInts] * camp::get<RangeInts>(tuple)(indices))...);
   }
-
 };
 
-} // namespace internal
-
+}  // namespace internal
 
-template <size_t n_dims = 1, typename IdxLin = Index_type, typename... IndexTypes>
+template<size_t n_dims   = 1,
+         typename IdxLin = Index_type,
+         typename... IndexTypes>
 struct IndexLayout
-    : public internal::IndexLayout_impl<camp::make_idx_seq_t<n_dims>, IdxLin, IndexTypes...> {
-  using Base =
-      internal::IndexLayout_impl<camp::make_idx_seq_t<n_dims>, IdxLin, IndexTypes...>;
+    : public internal::
+          IndexLayout_impl<camp::make_idx_seq_t<n_dims>, IdxLin, IndexTypes...>
+{
+  using Base = internal::
+      IndexLayout_impl<camp::make_idx_seq_t<n_dims>, IdxLin, IndexTypes...>;
 
   using internal::IndexLayout_impl<camp::make_idx_seq_t<n_dims>,
-                                    IdxLin, IndexTypes...>::IndexLayout_impl;
-
-  constexpr RAJA_INLINE RAJA_HOST_DEVICE IndexLayout(
-      const internal::IndexLayout_impl<camp::make_idx_seq_t<n_dims>, IdxLin, IndexTypes...>&
-          rhs)
-      : Base{rhs}
-  {
-  }
-
+                                   IdxLin,
+                                   IndexTypes...>::IndexLayout_impl;
+
+  constexpr RAJA_INLINE RAJA_HOST_DEVICE
+  IndexLayout(const internal::IndexLayout_impl<camp::make_idx_seq_t<n_dims>,
+                                               IdxLin,
+                                               IndexTypes...>& rhs)
+      : Base {rhs}
+  {}
 };
 
 /*!
- * creates of a camp::tuple of index types 
+ * creates a camp::tuple of index types
  * (such as DirectIndex, IndexList, or ConditionalIndexList)
  *
  */
-template <typename... IndexTypes>
+template<typename... IndexTypes>
 auto make_index_tuple(IndexTypes... it) -> camp::tuple<IndexTypes...>
 {
-    return camp::tuple<IndexTypes...>(it...);
+  return camp::tuple<IndexTypes...>(it...);
 }
 
 /*!
  * creates an index layout based on the input camp::tuple of index types
  *
- */  
-template <typename IdxLin = Index_type, typename... Types, typename... IndexTypes>
-auto make_index_layout(
-  camp::tuple<IndexTypes...> index_tuple_in,
-  Types... ns) -> IndexLayout<sizeof...(Types), IdxLin, IndexTypes...>
+ */
+template<typename IdxLin = Index_type,
+         typename... Types,
+         typename... IndexTypes>
+auto make_index_layout(camp::tuple<IndexTypes...> index_tuple_in, Types... ns)
+    -> IndexLayout<sizeof...(Types), IdxLin, IndexTypes...>
 {
-    static_assert(sizeof...(Types) == sizeof...(IndexTypes), "");
-    return IndexLayout<sizeof...(Types), IdxLin, IndexTypes...>(index_tuple_in, ns...);
+  static_assert(sizeof...(Types) == sizeof...(IndexTypes), "");
+  return IndexLayout<sizeof...(Types), IdxLin, IndexTypes...>(index_tuple_in,
+                                                              ns...);
 }
 
-}
+}  // namespace RAJA
 
 #endif
diff --git a/include/RAJA/util/KokkosPluginLoader.hpp b/include/RAJA/util/KokkosPluginLoader.hpp
index c5060a0a96..7812306b71 100644
--- a/include/RAJA/util/KokkosPluginLoader.hpp
+++ b/include/RAJA/util/KokkosPluginLoader.hpp
@@ -14,39 +14,44 @@
 #include "RAJA/util/PluginOptions.hpp"
 #include "RAJA/util/PluginStrategy.hpp"
 
-namespace RAJA {
-namespace util {
+namespace RAJA
+{
+namespace util
+{
 
-  class KokkosPluginLoader : public ::RAJA::util::PluginStrategy
-  {
-  public:
-    using Parent = ::RAJA::util::PluginStrategy;
-    typedef void (*init_function)(const int, const uint64_t, const uint32_t, void*);
-    typedef void (*pre_function)(const char*, const uint32_t, uint64_t*);
-    typedef void (*post_function)(uint64_t);
-    typedef void (*finalize_function)();
+class KokkosPluginLoader : public ::RAJA::util::PluginStrategy
+{
+public:
+  using Parent = ::RAJA::util::PluginStrategy;
+  typedef void (*init_function)(const int,
+                                const uint64_t,
+                                const uint32_t,
+                                void*);
+  typedef void (*pre_function)(const char*, const uint32_t, uint64_t*);
+  typedef void (*post_function)(uint64_t);
+  typedef void (*finalize_function)();
 
-    KokkosPluginLoader();
+  KokkosPluginLoader();
 
-    void preLaunch(const RAJA::util::PluginContext& p) override;
+  void preLaunch(const RAJA::util::PluginContext& p) override;
 
-    void postLaunch(const RAJA::util::PluginContext& p) override;
+  void postLaunch(const RAJA::util::PluginContext& p) override;
 
-    void finalize() override;
+  void finalize() override;
 
-  private:
-    void initPlugin(const std::string &path);
-    
-    void initDirectory(const std::string &path);
+private:
+  void initPlugin(const std::string& path);
 
-    std::vector<init_function> init_functions;
-    std::vector<pre_function> pre_functions;
-    std::vector<post_function> post_functions;
-    std::vector<finalize_function> finalize_functions;
+  void initDirectory(const std::string& path);
 
-  };  // end KokkosPluginLoader class
+  std::vector<init_function> init_functions;
+  std::vector<pre_function> pre_functions;
+  std::vector<post_function> post_functions;
+  std::vector<finalize_function> finalize_functions;
 
-  void linkKokkosPluginLoader();
+};  // end KokkosPluginLoader class
+
+void linkKokkosPluginLoader();
 
 }  // end namespace util
 }  // end namespace RAJA
diff --git a/include/RAJA/util/Layout.hpp b/include/RAJA/util/Layout.hpp
index 948e37f498..02831d7396 100644
--- a/include/RAJA/util/Layout.hpp
+++ b/include/RAJA/util/Layout.hpp
@@ -38,28 +38,30 @@ namespace detail
 {
 
 
-
-template <typename Range,
-          typename IdxLin = Index_type,
-          ptrdiff_t StrideOneDim = -1>
+template<typename Range,
+         typename IdxLin        = Index_type,
+         ptrdiff_t StrideOneDim = -1>
 struct LayoutBase_impl;
 
 /*!
  * Helper function to compute the strides
  */
 
-template <size_t j, size_t n_dims, typename IdxLin = Index_type>
-struct stride_calculator {
+template<size_t j, size_t n_dims, typename IdxLin = Index_type>
+struct stride_calculator
+{
   RAJA_INLINE RAJA_HOST_DEVICE constexpr IdxLin operator()(
       IdxLin cur_stride,
       IdxLin const (&sizes)[n_dims]) const
   {
-    return stride_calculator<j + 1, n_dims, IdxLin>{}(
+    return stride_calculator<j + 1, n_dims, IdxLin> {}(
         cur_stride * (sizes[j] ? sizes[j] : 1), sizes);
   }
 };
-template <size_t n_dims, typename IdxLin>
-struct stride_calculator<n_dims, n_dims, IdxLin> {
+
+template<size_t n_dims, typename IdxLin>
+struct stride_calculator<n_dims, n_dims, IdxLin>
+{
   RAJA_INLINE RAJA_HOST_DEVICE constexpr IdxLin operator()(
       IdxLin cur_stride,
       IdxLin const (&)[n_dims]) const
@@ -68,44 +70,43 @@ struct stride_calculator<n_dims, n_dims, IdxLin> {
   }
 };
 
-template <camp::idx_t... RangeInts, typename IdxLin, ptrdiff_t StrideOneDim>
-struct LayoutBase_impl<camp::idx_seq<RangeInts...>, IdxLin, StrideOneDim> {
+template<camp::idx_t... RangeInts, typename IdxLin, ptrdiff_t StrideOneDim>
+struct LayoutBase_impl<camp::idx_seq<RangeInts...>, IdxLin, StrideOneDim>
+{
 public:
   using IndexLinear = IdxLin;
-  using IndexRange = camp::make_idx_seq_t<sizeof...(RangeInts)>;
+  using IndexRange  = camp::make_idx_seq_t<sizeof...(RangeInts)>;
 
   static constexpr size_t n_dims = sizeof...(RangeInts);
-  static constexpr IdxLin limit = RAJA::operators::limits<IdxLin>::max();
+  static constexpr IdxLin limit  = RAJA::operators::limits<IdxLin>::max();
   static constexpr ptrdiff_t stride_one_dim = StrideOneDim;
 
-  IdxLin sizes[n_dims] = {0};
-  IdxLin strides[n_dims] = {0};
+  IdxLin sizes[n_dims]       = {0};
+  IdxLin strides[n_dims]     = {0};
   IdxLin inv_strides[n_dims] = {0};
-  IdxLin inv_mods[n_dims] = {0};
+  IdxLin inv_mods[n_dims]    = {0};
 
 
   /*!
    * Default constructor with zero sizes and strides.
    */
-  constexpr RAJA_INLINE LayoutBase_impl() = default;
-  constexpr RAJA_INLINE LayoutBase_impl(LayoutBase_impl const &) = default;
-  constexpr RAJA_INLINE LayoutBase_impl(LayoutBase_impl &&) = default;
-  RAJA_INLINE LayoutBase_impl &operator=(LayoutBase_impl const &) =
-      default;
-  RAJA_INLINE LayoutBase_impl &operator=(LayoutBase_impl &&) =
-      default;
+  constexpr RAJA_INLINE LayoutBase_impl()                        = default;
+  constexpr RAJA_INLINE LayoutBase_impl(LayoutBase_impl const&)  = default;
+  constexpr RAJA_INLINE LayoutBase_impl(LayoutBase_impl&&)       = default;
+  RAJA_INLINE LayoutBase_impl& operator=(LayoutBase_impl const&) = default;
+  RAJA_INLINE LayoutBase_impl& operator=(LayoutBase_impl&&)      = default;
 
   /*!
    * Construct a layout given the size of each dimension.
    */
-  template <typename... Types>
+  template<typename... Types>
   RAJA_INLINE RAJA_HOST_DEVICE constexpr LayoutBase_impl(Types... ns)
-      : sizes{static_cast<IdxLin>(stripIndexType(ns))...},
-        strides{(detail::stride_calculator<RangeInts + 1, n_dims, IdxLin>{}(
+      : sizes {static_cast<IdxLin>(stripIndexType(ns))...},
+        strides {(detail::stride_calculator<RangeInts + 1, n_dims, IdxLin> {}(
             sizes[RangeInts] ? IdxLin(1) : IdxLin(0),
             sizes))...},
-        inv_strides{(strides[RangeInts] ? strides[RangeInts] : IdxLin(1))...},
-        inv_mods{(sizes[RangeInts] ? sizes[RangeInts] : IdxLin(1))...}
+        inv_strides {(strides[RangeInts] ? strides[RangeInts] : IdxLin(1))...},
+        inv_mods {(sizes[RangeInts] ? sizes[RangeInts] : IdxLin(1))...}
   {
     static_assert(n_dims == sizeof...(Types),
                   "number of dimensions must match");
@@ -114,31 +115,29 @@ struct LayoutBase_impl<camp::idx_seq<RangeInts...>, IdxLin, StrideOneDim> {
   /*!
    *  Templated copy ctor from simillar layout.
    */
-  template <typename CIdxLin, ptrdiff_t CStrideOneDim>
-  constexpr RAJA_INLINE RAJA_HOST_DEVICE LayoutBase_impl(
-      const LayoutBase_impl<camp::idx_seq<RangeInts...>, CIdxLin, CStrideOneDim>
-          &rhs)
-      : sizes{static_cast<IdxLin>(rhs.sizes[RangeInts])...},
-        strides{static_cast<IdxLin>(rhs.strides[RangeInts])...},
-        inv_strides{static_cast<IdxLin>(rhs.inv_strides[RangeInts])...},
-        inv_mods{static_cast<IdxLin>(rhs.inv_mods[RangeInts])...}
-  {
-  }
-
+  template<typename CIdxLin, ptrdiff_t CStrideOneDim>
+  constexpr RAJA_INLINE RAJA_HOST_DEVICE
+  LayoutBase_impl(const LayoutBase_impl<camp::idx_seq<RangeInts...>,
+                                        CIdxLin,
+                                        CStrideOneDim>& rhs)
+      : sizes {static_cast<IdxLin>(rhs.sizes[RangeInts])...},
+        strides {static_cast<IdxLin>(rhs.strides[RangeInts])...},
+        inv_strides {static_cast<IdxLin>(rhs.inv_strides[RangeInts])...},
+        inv_mods {static_cast<IdxLin>(rhs.inv_mods[RangeInts])...}
+  {}
 
   /*!
    *  Construct a Layout given the size and stride of each dimension
    */
-  template <typename... Types>
+  template<typename... Types>
   RAJA_INLINE constexpr LayoutBase_impl(
-      const std::array<IdxLin, n_dims> &sizes_in,
-      const std::array<IdxLin, n_dims> &strides_in)
-      : sizes{sizes_in[RangeInts]...},
-        strides{strides_in[RangeInts]...},
-        inv_strides{(strides[RangeInts] ? strides[RangeInts] : IdxLin(1))...},
-        inv_mods{(sizes[RangeInts] ? sizes[RangeInts] : IdxLin(1))...}
-  {
-  }
+      const std::array<IdxLin, n_dims>& sizes_in,
+      const std::array<IdxLin, n_dims>& strides_in)
+      : sizes {sizes_in[RangeInts]...},
+        strides {strides_in[RangeInts]...},
+        inv_strides {(strides[RangeInts] ? strides[RangeInts] : IdxLin(1))...},
+        inv_mods {(sizes[RangeInts] ? sizes[RangeInts] : IdxLin(1))...}
+  {}
 
   /*!
    * Methods to performs bounds checking in layout objects
@@ -147,20 +146,20 @@ struct LayoutBase_impl<camp::idx_seq<RangeInts...>, IdxLin, StrideOneDim> {
   RAJA_INLINE RAJA_HOST_DEVICE void BoundsCheckError(Idx idx) const
   {
     printf("Error at index %d, value %ld is not within bounds [0, %ld] \n",
-           static_cast<int>(N), static_cast<long int>(idx), static_cast<long int>(sizes[N] - 1));
+           static_cast<int>(N), static_cast<long int>(idx),
+           static_cast<long int>(sizes[N] - 1));
     RAJA_ABORT_OR_THROW("Out of bounds error \n");
   }
 
-  template <camp::idx_t N>
+  template<camp::idx_t N>
   RAJA_INLINE RAJA_HOST_DEVICE void BoundsCheck() const
-  {
-  }
+  {}
 
-  template <camp::idx_t N, typename Idx, typename... Indices>
+  template<camp::idx_t N, typename Idx, typename... Indices>
   RAJA_INLINE RAJA_HOST_DEVICE void BoundsCheck(Idx idx,
                                                 Indices... indices) const
   {
-    if(sizes[N] > 0 && !(0<=idx && idx < static_cast<Idx>(sizes[N])))
+    if (sizes[N] > 0 && !(0 <= idx && idx < static_cast<Idx>(sizes[N])))
     {
       BoundsCheckError<N>(idx);
     }
@@ -176,23 +175,22 @@ struct LayoutBase_impl<camp::idx_seq<RangeInts...>, IdxLin, StrideOneDim> {
    * @return Linear space index.
    */
 
-  template <typename... Indices>
+  template<typename... Indices>
   RAJA_INLINE RAJA_HOST_DEVICE RAJA_BOUNDS_CHECK_constexpr IdxLin
   operator()(Indices... indices) const
   {
-#if defined (RAJA_BOUNDS_CHECK_INTERNAL)
+#if defined(RAJA_BOUNDS_CHECK_INTERNAL)
     BoundsCheck<0>(indices...);
 #endif
     // dot product of strides and indices
-    return sum<IdxLin>(
-      (RangeInts==stride_one_dim ?   // Is this dimension stride-one?
-         indices :  // it's stride one, so dont bother with multiply
-         strides[RangeInts]*indices // it's not stride one
-			)...
-    );
+    return sum<IdxLin>((RangeInts == stride_one_dim
+                            ?  // Is this dimension stride-one?
+                            indices
+                            :  // it's stride one, so dont bother with multiply
+                            strides[RangeInts] * indices  // it's not stride one
+                        )...);
   }
 
-
   /*!
    * Given a linear-space index, compute the n-dimensional indices defined
    * by this layout.
@@ -203,22 +201,24 @@ struct LayoutBase_impl<camp::idx_seq<RangeInts...>, IdxLin, StrideOneDim> {
    * @param indices  Variadic list of indices to be assigned, number must match
    *                 dimensionality of this layout.
    */
-  template <typename... Indices>
+  template<typename... Indices>
   RAJA_INLINE RAJA_HOST_DEVICE void toIndices(IdxLin linear_index,
-                                              Indices &&... indices) const
+                                              Indices&&... indices) const
   {
 #if defined(RAJA_BOUNDS_CHECK_INTERNAL)
     IdxLin totSize = size_noproj();
-    if(totSize > 0 && (linear_index < 0 || linear_index >= totSize)) {
+    if (totSize > 0 && (linear_index < 0 || linear_index >= totSize))
+    {
       printf("Error! Linear index %ld is not within bounds [0, %ld]. \n",
-             static_cast<long int>(linear_index), static_cast<long int>(totSize-1));
+             static_cast<long int>(linear_index),
+             static_cast<long int>(totSize - 1));
       RAJA_ABORT_OR_THROW("Out of bounds error \n");
-     }
+    }
 #endif
 
-    camp::sink((indices =
-      (camp::decay<Indices>)((linear_index / inv_strides[RangeInts]) %
-                             inv_mods[RangeInts]))...);
+    camp::sink((indices = (camp::decay<Indices>)((linear_index /
+                                                  inv_strides[RangeInts]) %
+                                                 inv_mods[RangeInts]))...);
   }
 
   /*!
@@ -231,8 +231,9 @@ struct LayoutBase_impl<camp::idx_seq<RangeInts...>, IdxLin, StrideOneDim> {
   {
     // Multiply together all of the sizes,
     // replacing 1 for any zero-sized dimensions
-    return foldl(RAJA::operators::multiplies<IdxLin>(),
-                         (sizes[RangeInts] == IdxLin(0) ? IdxLin(1) : sizes[RangeInts])...);
+    return foldl(
+        RAJA::operators::multiplies<IdxLin>(),
+        (sizes[RangeInts] == IdxLin(0) ? IdxLin(1) : sizes[RangeInts])...);
   }
 
   /*!
@@ -248,34 +249,28 @@ struct LayoutBase_impl<camp::idx_seq<RangeInts...>, IdxLin, StrideOneDim> {
   }
 
   template<camp::idx_t DIM>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr
-  IndexLinear get_dim_stride() const {
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear get_dim_stride() const
+  {
     return strides[DIM];
   }
 
   template<camp::idx_t DIM>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr
-  IndexLinear get_dim_size() const {
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear get_dim_size() const
+  {
     return sizes[DIM];
   }
 
   template<camp::idx_t DIM>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr
-  IndexLinear get_dim_begin() const {
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear get_dim_begin() const
+  {
     return 0;
   }
 };
 
-template <camp::idx_t... RangeInts, typename IdxLin, ptrdiff_t StrideOneDim>
+template<camp::idx_t... RangeInts, typename IdxLin, ptrdiff_t StrideOneDim>
 constexpr size_t
     LayoutBase_impl<camp::idx_seq<RangeInts...>, IdxLin, StrideOneDim>::n_dims;
-template <camp::idx_t... RangeInts, typename IdxLin, ptrdiff_t StrideOneDim>
+template<camp::idx_t... RangeInts, typename IdxLin, ptrdiff_t StrideOneDim>
 constexpr IdxLin
     LayoutBase_impl<camp::idx_seq<RangeInts...>, IdxLin, StrideOneDim>::limit;
 }  // namespace detail
@@ -329,26 +324,26 @@ constexpr IdxLin
  *     layout.toIndices(lin2, i, j, k); // i,j,k = {0, 0, 1}
  *
  */
-template <size_t n_dims, typename IdxLin = Index_type, ptrdiff_t StrideOne = -1>
+template<size_t n_dims, typename IdxLin = Index_type, ptrdiff_t StrideOne = -1>
 using Layout =
     detail::LayoutBase_impl<camp::make_idx_seq_t<n_dims>, IdxLin, StrideOne>;
 
-template <typename IdxLin, typename DimTuple, ptrdiff_t StrideOne = -1>
+template<typename IdxLin, typename DimTuple, ptrdiff_t StrideOne = -1>
 struct TypedLayout;
 
-template <typename IdxLin, typename... DimTypes, ptrdiff_t StrideOne>
+template<typename IdxLin, typename... DimTypes, ptrdiff_t StrideOne>
 struct TypedLayout<IdxLin, camp::tuple<DimTypes...>, StrideOne>
-    : public Layout<sizeof...(DimTypes), strip_index_type_t<IdxLin>, StrideOne> {
+    : public Layout<sizeof...(DimTypes), strip_index_type_t<IdxLin>, StrideOne>
+{
 
   using StrippedIdxLin = strip_index_type_t<IdxLin>;
-  using Self = TypedLayout<IdxLin, camp::tuple<DimTypes...>, StrideOne>;
-  using Base = Layout<sizeof...(DimTypes), StrippedIdxLin, StrideOne>;
+  using Self   = TypedLayout<IdxLin, camp::tuple<DimTypes...>, StrideOne>;
+  using Base   = Layout<sizeof...(DimTypes), StrippedIdxLin, StrideOne>;
   using DimArr = std::array<StrippedIdxLin, sizeof...(DimTypes)>;
 
   // Pull in base constructors
   using Base::Base;
 
-
   /*!
    * Computes a linear space index from specified indices.
    * This is formed by the dot product of the indices and the layout strides.
@@ -362,7 +357,6 @@ struct TypedLayout<IdxLin, camp::tuple<DimTypes...>, StrideOne>
     return IdxLin(Base::operator()(stripIndexType(indices)...));
   }
 
-
   /*!
    * Given a linear-space index, compute the n-dimensional indices defined
    * by this layout.
@@ -374,11 +368,11 @@ struct TypedLayout<IdxLin, camp::tuple<DimTypes...>, StrideOne>
    *                 dimensionality of this layout.
    */
   RAJA_INLINE RAJA_HOST_DEVICE void toIndices(IdxLin linear_index,
-                                              DimTypes &... indices) const
+                                              DimTypes&... indices) const
   {
-    toIndicesHelper(camp::make_idx_seq_t<sizeof...(DimTypes)>{},
+    toIndicesHelper(camp::make_idx_seq_t<sizeof...(DimTypes)> {},
                     std::forward<IdxLin>(linear_index),
-                    std::forward<DimTypes &>(indices)...);
+                    std::forward<DimTypes&>(indices)...);
   }
 
 private:
@@ -389,41 +383,40 @@ struct TypedLayout<IdxLin, camp::tuple<DimTypes...>, StrideOne>
    * result to typed indices
    *
    */
-  template <typename... Indices, camp::idx_t... RangeInts>
+  template<typename... Indices, camp::idx_t... RangeInts>
   RAJA_INLINE RAJA_HOST_DEVICE void toIndicesHelper(camp::idx_seq<RangeInts...>,
                                                     IdxLin linear_index,
-                                                    Indices &... indices) const
+                                                    Indices&... indices) const
   {
     StrippedIdxLin locals[sizeof...(DimTypes)];
     Base::toIndices(stripIndexType(linear_index), locals[RangeInts]...);
-		camp::sink((indices = Indices{static_cast<Indices>(locals[RangeInts])})...);
+    camp::sink(
+        (indices = Indices {static_cast<Indices>(locals[RangeInts])})...);
   }
 };
 
-
 /*!
  * Convert a non-stride-one Layout to a stride-1 Layout
  *
  */
-template <ptrdiff_t s1_dim, size_t n_dims, typename IdxLin>
+template<ptrdiff_t s1_dim, size_t n_dims, typename IdxLin>
 RAJA_INLINE Layout<n_dims, IdxLin, s1_dim> make_stride_one(
-    Layout<n_dims, IdxLin> const &l)
+    Layout<n_dims, IdxLin> const& l)
 {
   return Layout<n_dims, IdxLin, s1_dim>(l);
 }
 
-
 /*!
  * Convert a non-stride-one TypedLayout to a stride-1 TypedLayout
  *
  */
-template <ptrdiff_t s1_dim, typename IdxLin, typename IdxTuple>
+template<ptrdiff_t s1_dim, typename IdxLin, typename IdxTuple>
 RAJA_INLINE TypedLayout<IdxLin, IdxTuple, s1_dim> make_stride_one(
-    TypedLayout<IdxLin, IdxTuple> const &l)
+    TypedLayout<IdxLin, IdxTuple> const& l)
 {
   // strip l to it's base-class type
-  using Base = typename TypedLayout<IdxLin, IdxTuple>::Base;
-  Base const &b = (Base const &)l;
+  using Base    = typename TypedLayout<IdxLin, IdxTuple>::Base;
+  Base const& b = (Base const&)l;
 
   // Use non-typed layout to initialize new typed layout
   return TypedLayout<IdxLin, IdxTuple, s1_dim>(b);
diff --git a/include/RAJA/util/LocalArray.hpp b/include/RAJA/util/LocalArray.hpp
index 50680101d4..db6dd1d042 100644
--- a/include/RAJA/util/LocalArray.hpp
+++ b/include/RAJA/util/LocalArray.hpp
@@ -31,8 +31,7 @@ namespace RAJA
 {
 
 
-
-template<camp::idx_t ... Sizes>
+template<camp::idx_t... Sizes>
 using ParamList = camp::idx_seq<Sizes...>;
 
 /*!
@@ -51,79 +50,85 @@ using ParamList = camp::idx_seq<Sizes...>;
  */
 
 
-namespace internal {
-
-
-
-  template<typename Perm, typename Sizes>
-  struct StaticLayoutHelper;
+namespace internal
+{
 
-  template<camp::idx_t ... Perm, Index_type ...Sizes>
-  struct StaticLayoutHelper<camp::idx_seq<Perm...>, SizeList<Sizes...>>{
-      using type =  StaticLayout<camp::idx_seq<Perm...>, Sizes...>;
-  };
 
-  template<typename Perm, typename Sizes>
-  using getStaticLayoutType = typename StaticLayoutHelper<Perm, Sizes>::type;
+template<typename Perm, typename Sizes>
+struct StaticLayoutHelper;
 
+template<camp::idx_t... Perm, Index_type... Sizes>
+struct StaticLayoutHelper<camp::idx_seq<Perm...>, SizeList<Sizes...>>
+{
+  using type = StaticLayout<camp::idx_seq<Perm...>, Sizes...>;
+};
 
+template<typename Perm, typename Sizes>
+using getStaticLayoutType = typename StaticLayoutHelper<Perm, Sizes>::type;
 
-}
 
+}  // namespace internal
 
-template<typename ValueType, typename Perm, typename Sizes, typename... IndexTypes>
+template<typename ValueType,
+         typename Perm,
+         typename Sizes,
+         typename... IndexTypes>
 using TypedLocalArray =
-    internal::TypedViewBase<ValueType, ValueType *, internal::getStaticLayoutType<Perm, Sizes>, camp::list<IndexTypes...> >;
+    internal::TypedViewBase<ValueType,
+                            ValueType*,
+                            internal::getStaticLayoutType<Perm, Sizes>,
+                            camp::list<IndexTypes...>>;
 
 
 template<typename ValueType, typename Perm, typename Sizes>
 using LocalArray =
-    internal::TypedViewBase<ValueType, ValueType *, internal::getStaticLayoutType<Perm, Sizes>, internal::getDefaultIndexTypes<Perm> >;
-
-
-
-
-
-template<typename AtomicPolicy, typename DataType, typename Perm,
-         typename Sizes, typename ... IndexTypes>
-struct AtomicTypedLocalArray {
-};
-
-template<typename AtomicPolicy, typename DataType, camp::idx_t ... Perm,
-          Index_type ... Sizes, typename ... IndexTypes>
-struct AtomicTypedLocalArray<AtomicPolicy, DataType, camp::idx_seq<Perm ...>,
-                             RAJA::SizeList<Sizes ...>, IndexTypes ...>{
-  DataType *m_arrayPtr = nullptr;
-  using value_type = DataType;
-  using atomic_ref_t = RAJA::AtomicRef<value_type, AtomicPolicy>;
-  using layout_type = RAJA::StaticLayout<camp::idx_seq<Perm ...>, Sizes ...>;
+    internal::TypedViewBase<ValueType,
+                            ValueType*,
+                            internal::getStaticLayoutType<Perm, Sizes>,
+                            internal::getDefaultIndexTypes<Perm>>;
+
+template<typename AtomicPolicy,
+         typename DataType,
+         typename Perm,
+         typename Sizes,
+         typename... IndexTypes>
+struct AtomicTypedLocalArray
+{};
+
+template<typename AtomicPolicy,
+         typename DataType,
+         camp::idx_t... Perm,
+         Index_type... Sizes,
+         typename... IndexTypes>
+struct AtomicTypedLocalArray<AtomicPolicy,
+                             DataType,
+                             camp::idx_seq<Perm...>,
+                             RAJA::SizeList<Sizes...>,
+                             IndexTypes...>
+{
+  DataType* m_arrayPtr = nullptr;
+  using value_type     = DataType;
+  using atomic_ref_t   = RAJA::AtomicRef<value_type, AtomicPolicy>;
+  using layout_type    = RAJA::StaticLayout<camp::idx_seq<Perm...>, Sizes...>;
   static const camp::idx_t NumElem = layout_type::s_size;
 
   RAJA_HOST_DEVICE
-  atomic_ref_t operator()(IndexTypes ... indices) const
+  atomic_ref_t operator()(IndexTypes... indices) const
   {
-    return(atomic_ref_t(&m_arrayPtr[layout_type::s_oper(stripIndexType(indices)
-                                                     ...)]));
+    return (atomic_ref_t(
+        &m_arrayPtr[layout_type::s_oper(stripIndexType(indices)...)]));
   }
 
   RAJA_HOST_DEVICE
+
   RAJA_INLINE
-  constexpr
-  camp::idx_t size() const
-  {
-    return layout_type::s_size;
-  }
+  constexpr camp::idx_t size() const { return layout_type::s_size; }
 
   RAJA_HOST_DEVICE
-  RAJA_INLINE void set_data(DataType * data_ptr){
-    m_arrayPtr = data_ptr;
-  }
+  RAJA_INLINE void set_data(DataType* data_ptr) { m_arrayPtr = data_ptr; }
 };
 
 
-
-
-
 }  // end namespace RAJA
 
 
diff --git a/include/RAJA/util/OffsetLayout.hpp b/include/RAJA/util/OffsetLayout.hpp
index 827515062e..9c96f1a3a9 100644
--- a/include/RAJA/util/OffsetLayout.hpp
+++ b/include/RAJA/util/OffsetLayout.hpp
@@ -37,38 +37,39 @@ namespace RAJA
 namespace internal
 {
 
-template <typename Range, typename IdxLin>
+template<typename Range, typename IdxLin>
 struct OffsetLayout_impl;
 
-template <camp::idx_t... RangeInts, typename IdxLin>
-struct OffsetLayout_impl<camp::idx_seq<RangeInts...>, IdxLin> {
-  using Self = OffsetLayout_impl<camp::idx_seq<RangeInts...>, IdxLin>;
-  using IndexRange = camp::idx_seq<RangeInts...>;
+template<camp::idx_t... RangeInts, typename IdxLin>
+struct OffsetLayout_impl<camp::idx_seq<RangeInts...>, IdxLin>
+{
+  using Self        = OffsetLayout_impl<camp::idx_seq<RangeInts...>, IdxLin>;
+  using IndexRange  = camp::idx_seq<RangeInts...>;
   using IndexLinear = IdxLin;
-  using Base = RAJA::detail::LayoutBase_impl<IndexRange, IdxLin>;
+  using Base        = RAJA::detail::LayoutBase_impl<IndexRange, IdxLin>;
   Base base_;
 
   static constexpr camp::idx_t stride_one_dim = Base::stride_one_dim;
 
   static constexpr size_t n_dims = sizeof...(RangeInts);
-  IdxLin offsets[n_dims]={0}; //If not specified set to zero
+  IdxLin offsets[n_dims]         = {0};  // If not specified set to zero
 
   constexpr RAJA_INLINE OffsetLayout_impl(
       std::array<IdxLin, sizeof...(RangeInts)> begin,
       std::array<IdxLin, sizeof...(RangeInts)> end)
-      : base_{(end[RangeInts] - begin[RangeInts])...},
-        offsets{begin[RangeInts]...}
-  {
-  }
+      : base_ {(end[RangeInts] - begin[RangeInts])...},
+        offsets {begin[RangeInts]...}
+  {}
 
   constexpr RAJA_INLINE RAJA_HOST_DEVICE OffsetLayout_impl(Self const& c)
-      : base_(c.base_), offsets{c.offsets[RangeInts]...}
-  {
-  }
+      : base_(c.base_),
+        offsets {c.offsets[RangeInts]...}
+  {}
 
   void shift(std::array<IdxLin, sizeof...(RangeInts)> shift)
   {
-    for(size_t i=0; i<n_dims; ++i) offsets[i] += shift[i];
+    for (size_t i = 0; i < n_dims; ++i)
+      offsets[i] += shift[i];
   }
 
   template<camp::idx_t N, typename Idx>
@@ -76,39 +77,40 @@ struct OffsetLayout_impl<camp::idx_seq<RangeInts...>, IdxLin> {
   {
     printf("Error at index %d, value %ld is not within bounds [%ld, %ld] \n",
            static_cast<int>(N), static_cast<long int>(idx),
-           static_cast<long int>(offsets[N]), static_cast<long int>(offsets[N] + base_.sizes[N] - 1));
+           static_cast<long int>(offsets[N]),
+           static_cast<long int>(offsets[N] + base_.sizes[N] - 1));
     RAJA_ABORT_OR_THROW("Out of bounds error \n");
   }
 
-  template <camp::idx_t N>
+  template<camp::idx_t N>
   RAJA_INLINE RAJA_HOST_DEVICE void BoundsCheck() const
-  {
-  }
+  {}
 
-  template <camp::idx_t N, typename Idx, typename... Indices>
-  RAJA_INLINE RAJA_HOST_DEVICE void BoundsCheck(Idx idx, Indices... indices) const
+  template<camp::idx_t N, typename Idx, typename... Indices>
+  RAJA_INLINE RAJA_HOST_DEVICE void BoundsCheck(Idx idx,
+                                                Indices... indices) const
   {
-    if(!(offsets[N] <=idx && idx < offsets[N] + base_.sizes[N]))
+    if (!(offsets[N] <= idx && idx < offsets[N] + base_.sizes[N]))
     {
       BoundsCheckError<N>(idx);
     }
     RAJA_UNUSED_VAR(idx);
-    BoundsCheck<N+1>(indices...);
+    BoundsCheck<N + 1>(indices...);
   }
 
-  template <typename... Indices>
-  RAJA_INLINE RAJA_HOST_DEVICE RAJA_BOUNDS_CHECK_constexpr IdxLin operator()(
-      Indices... indices) const
+  template<typename... Indices>
+  RAJA_INLINE RAJA_HOST_DEVICE RAJA_BOUNDS_CHECK_constexpr IdxLin
+  operator()(Indices... indices) const
   {
-#if defined (RAJA_BOUNDS_CHECK_INTERNAL)
+#if defined(RAJA_BOUNDS_CHECK_INTERNAL)
     BoundsCheck<0>(indices...);
 #endif
     return base_((indices - offsets[RangeInts])...);
   }
 
-  template <typename... Indices>
+  template<typename... Indices>
   RAJA_INLINE RAJA_HOST_DEVICE void toIndices(IdxLin linear_index,
-                                              Indices &&... indices) const
+                                              Indices&&... indices) const
   {
     base_.toIndices(linear_index, std::forward<Indices>(indices)...);
     camp::sink((indices = (offsets[RangeInts] + indices))...);
@@ -119,16 +121,15 @@ struct OffsetLayout_impl<camp::idx_seq<RangeInts...>, IdxLin> {
       const std::array<IdxLin, sizeof...(RangeInts)>& offsets_in,
       const Layout<sizeof...(RangeInts), IdxLin>& rhs)
   {
-    OffsetLayout_impl ret{rhs};
+    OffsetLayout_impl ret {rhs};
     camp::sink((ret.offsets[RangeInts] = offsets_in[RangeInts])...);
     return ret;
   }
 
   constexpr RAJA_INLINE RAJA_HOST_DEVICE
   OffsetLayout_impl(const Layout<sizeof...(RangeInts), IdxLin>& rhs)
-      : base_{rhs}
-  {
-  }
+      : base_ {rhs}
+  {}
 
   RAJA_INLINE RAJA_HOST_DEVICE constexpr IdxLin size() const
   {
@@ -141,35 +142,30 @@ struct OffsetLayout_impl<camp::idx_seq<RangeInts...>, IdxLin> {
   }
 
   template<camp::idx_t DIM>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr
-  IndexLinear get_dim_stride() const {
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear get_dim_stride() const
+  {
     return base_.get_dim_stride();
   }
 
   template<camp::idx_t DIM>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr
-  IndexLinear get_dim_size() const {
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear get_dim_size() const
+  {
     return base_.get_dim_size();
   }
 
   template<camp::idx_t DIM>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr
-  IndexLinear get_dim_begin() const {
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear get_dim_begin() const
+  {
     return offsets[DIM];
   }
 };
 
 }  // namespace internal
 
-template <size_t n_dims = 1, typename IdxLin = Index_type>
+template<size_t n_dims = 1, typename IdxLin = Index_type>
 struct OffsetLayout
-    : public internal::OffsetLayout_impl<camp::make_idx_seq_t<n_dims>, IdxLin> {
+    : public internal::OffsetLayout_impl<camp::make_idx_seq_t<n_dims>, IdxLin>
+{
   using Base =
       internal::OffsetLayout_impl<camp::make_idx_seq_t<n_dims>, IdxLin>;
 
@@ -179,76 +175,77 @@ struct OffsetLayout
   constexpr RAJA_INLINE RAJA_HOST_DEVICE OffsetLayout(
       const internal::OffsetLayout_impl<camp::make_idx_seq_t<n_dims>, IdxLin>&
           rhs)
-      : Base{rhs}
-  {
-  }
+      : Base {rhs}
+  {}
 };
 
-//TypedOffsetLayout
-template <typename IdxLin, typename DimTuple>
+// TypedOffsetLayout
+template<typename IdxLin, typename DimTuple>
 struct TypedOffsetLayout;
 
-template <typename IdxLin, typename... DimTypes>
+template<typename IdxLin, typename... DimTypes>
 struct TypedOffsetLayout<IdxLin, camp::tuple<DimTypes...>>
-: public OffsetLayout<sizeof...(DimTypes), strip_index_type_t<IdxLin>>
+    : public OffsetLayout<sizeof...(DimTypes), strip_index_type_t<IdxLin>>
 {
-   using StrippedIdxLin = strip_index_type_t<IdxLin>;
-   using Self = TypedOffsetLayout<IdxLin, camp::tuple<DimTypes...>>;
-   using Base = OffsetLayout<sizeof...(DimTypes), StrippedIdxLin>;
-   using DimArr = std::array<StrippedIdxLin, sizeof...(DimTypes)>;
-   using DimTuple = camp::tuple<DimTypes...>;
-   using IndexLinear = IdxLin;
-
-   // Pull in base coonstructors
- #if 0
+  using StrippedIdxLin = strip_index_type_t<IdxLin>;
+  using Self           = TypedOffsetLayout<IdxLin, camp::tuple<DimTypes...>>;
+  using Base           = OffsetLayout<sizeof...(DimTypes), StrippedIdxLin>;
+  using DimArr         = std::array<StrippedIdxLin, sizeof...(DimTypes)>;
+  using DimTuple       = camp::tuple<DimTypes...>;
+  using IndexLinear    = IdxLin;
+
+  // Pull in base constructors
+#if 0
    // This breaks with nvcc11
  using Base::Base;
- #else
-   using OffsetLayout<sizeof...(DimTypes), StrippedIdxLin>::OffsetLayout;
- #endif
+#else
+  using OffsetLayout<sizeof...(DimTypes), StrippedIdxLin>::OffsetLayout;
+#endif
 
-  RAJA_INLINE RAJA_HOST_DEVICE constexpr IdxLin operator()(DimTypes... indices) const
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IdxLin operator()(
+      DimTypes... indices) const
   {
     return IdxLin(Base::operator()(stripIndexType(indices)...));
   }
 
   RAJA_INLINE RAJA_HOST_DEVICE void toIndices(IdxLin linear_index,
-                                              DimTypes &... indices) const
+                                              DimTypes&... indices) const
   {
-    toIndicesHelper(camp::make_idx_seq_t<sizeof...(DimTypes)>{},
+    toIndicesHelper(camp::make_idx_seq_t<sizeof...(DimTypes)> {},
                     std::forward<IdxLin>(linear_index),
-                    std::forward<DimTypes &>(indices)...);
+                    std::forward<DimTypes&>(indices)...);
   }
 
 private:
-  template <typename... Indices, camp::idx_t... RangeInts>
+  template<typename... Indices, camp::idx_t... RangeInts>
   RAJA_INLINE RAJA_HOST_DEVICE void toIndicesHelper(camp::idx_seq<RangeInts...>,
                                                     IdxLin linear_index,
-                                                    Indices &... indices) const
+                                                    Indices&... indices) const
   {
     StrippedIdxLin locals[sizeof...(DimTypes)];
     Base::toIndices(stripIndexType(linear_index), locals[RangeInts]...);
-    camp::sink((indices = Indices{static_cast<Indices>(locals[RangeInts])})...);
+    camp::sink(
+        (indices = Indices {static_cast<Indices>(locals[RangeInts])})...);
   }
 };
 
-
-template <size_t n_dims, typename IdxLin = Index_type>
+template<size_t n_dims, typename IdxLin = Index_type>
 auto make_offset_layout(const std::array<IdxLin, n_dims>& begin,
                         const std::array<IdxLin, n_dims>& end)
     -> OffsetLayout<n_dims, IdxLin>
 {
-  return OffsetLayout<n_dims, IdxLin>{begin, end};
+  return OffsetLayout<n_dims, IdxLin> {begin, end};
 }
 
-template <size_t Rank, typename IdxLin = Index_type>
+template<size_t Rank, typename IdxLin = Index_type>
 auto make_permuted_offset_layout(const std::array<IdxLin, Rank>& begin,
                                  const std::array<IdxLin, Rank>& end,
                                  const std::array<IdxLin, Rank>& permutation)
     -> decltype(make_offset_layout<Rank, IdxLin>(begin, end))
 {
   std::array<IdxLin, Rank> sizes;
-  for (size_t i = 0; i < Rank; ++i) {
+  for (size_t i = 0; i < Rank; ++i)
+  {
     sizes[i] = end[i] - begin[i];
   }
   return internal::OffsetLayout_impl<camp::make_idx_seq_t<Rank>, IdxLin>::
diff --git a/include/RAJA/util/OffsetOperators.hpp b/include/RAJA/util/OffsetOperators.hpp
index 150aaeee34..f9e7611d95 100644
--- a/include/RAJA/util/OffsetOperators.hpp
+++ b/include/RAJA/util/OffsetOperators.hpp
@@ -26,55 +26,70 @@
 namespace RAJA
 {
 
-template <typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
+template<typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
 struct GetOffsetLeft
 {
-  template < typename new_Ret, typename new_Arg1 = new_Ret, typename new_Arg2 = new_Ret>
+  template<typename new_Ret,
+           typename new_Arg1 = new_Ret,
+           typename new_Arg2 = new_Ret>
   using rebind = GetOffsetLeft<new_Ret, new_Arg1, new_Arg2>;
 
-  template < size_t >
+  template<size_t>
   using rebunch = GetOffsetLeft<Ret, Arg1, Arg2>;
 
-  RAJA_INLINE RAJA_HOST_DEVICE constexpr
-  Ret operator()(Arg1 const& i, Arg1 const& num_i,
-                 Arg2 const& j, Arg2 const& RAJA_UNUSED_ARG(num_j)) const noexcept
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr Ret operator()(
+      Arg1 const& i,
+      Arg1 const& num_i,
+      Arg2 const& j,
+      Arg2 const& RAJA_UNUSED_ARG(num_j)) const noexcept
   {
     return i + j * num_i;
   }
 };
 
-template <typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
+template<typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
 struct GetOffsetRight
 {
-  template < typename new_Ret, typename new_Arg1 = new_Ret, typename new_Arg2 = new_Ret>
+  template<typename new_Ret,
+           typename new_Arg1 = new_Ret,
+           typename new_Arg2 = new_Ret>
   using rebind = GetOffsetRight<new_Ret, new_Arg1, new_Arg2>;
 
-  template < size_t >
+  template<size_t>
   using rebunch = GetOffsetRight<Ret, Arg1, Arg2>;
 
-  RAJA_INLINE RAJA_HOST_DEVICE constexpr
-  Ret operator()(Arg1 const& i, Arg1 const& RAJA_UNUSED_ARG(num_i),
-                 Arg2 const& j, Arg2 const& num_j) const noexcept
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr Ret operator()(
+      Arg1 const& i,
+      Arg1 const& RAJA_UNUSED_ARG(num_i),
+      Arg2 const& j,
+      Arg2 const& num_j) const noexcept
   {
     return i * num_j + j;
   }
 };
 
-template <size_t t_bunch_num_i,
-          typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
+template<size_t t_bunch_num_i,
+         typename Ret,
+         typename Arg1 = Ret,
+         typename Arg2 = Arg1>
 struct GetOffsetLeftBunched
 {
-  template < typename new_Ret, typename new_Arg1 = new_Ret, typename new_Arg2 = new_Ret>
-  using rebind = GetOffsetLeftBunched<t_bunch_num_i, new_Ret, new_Arg1, new_Arg2>;
+  template<typename new_Ret,
+           typename new_Arg1 = new_Ret,
+           typename new_Arg2 = new_Ret>
+  using rebind =
+      GetOffsetLeftBunched<t_bunch_num_i, new_Ret, new_Arg1, new_Arg2>;
 
-  template < size_t new_bunch_num_i >
+  template<size_t new_bunch_num_i>
   using rebunch = GetOffsetLeftBunched<new_bunch_num_i, Ret, Arg1, Arg2>;
 
-  static constexpr Arg1 bunch_num_i{t_bunch_num_i};
+  static constexpr Arg1 bunch_num_i {t_bunch_num_i};
 
-  RAJA_INLINE RAJA_HOST_DEVICE constexpr
-  Ret operator()(Arg1 const& i, Arg1 const& RAJA_UNUSED_ARG(num_i),
-                 Arg2 const& j, Arg2 const& num_j) const noexcept
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr Ret operator()(
+      Arg1 const& i,
+      Arg1 const& RAJA_UNUSED_ARG(num_i),
+      Arg2 const& j,
+      Arg2 const& num_j) const noexcept
   {
     // assert(num_i >= bunch_num_i)
     Arg1 i_inner = i % bunch_num_i;
diff --git a/include/RAJA/util/Operators.hpp b/include/RAJA/util/Operators.hpp
index b4249e7182..b4df6bc93e 100644
--- a/include/RAJA/util/Operators.hpp
+++ b/include/RAJA/util/Operators.hpp
@@ -43,49 +43,54 @@ namespace detail
 {
 
 // truly associative (does not include fp add/multiply)
-struct associative_tag {
-};
+struct associative_tag
+{};
 
 // associative up to floating point rounding differences
-struct fp_associative_tag : associative_tag {
-};
+struct fp_associative_tag : associative_tag
+{};
 
 // get associativity tag appropriate for the type
-template < typename T >
+template<typename T>
 using associative_or_fp_associative_tag =
-  std::conditional_t<std::is_floating_point<std::decay_t<T>>::value,
-                     fp_associative_tag, associative_tag>;
+    std::conditional_t<std::is_floating_point<std::decay_t<T>>::value,
+                       fp_associative_tag,
+                       associative_tag>;
 
-template <typename Arg1, typename Arg2, typename Result>
-struct binary_function {
-  using first_argument_type = Arg1;
+template<typename Arg1, typename Arg2, typename Result>
+struct binary_function
+{
+  using first_argument_type  = Arg1;
   using second_argument_type = Arg2;
-  using result_type = Result;
+  using result_type          = Result;
 };
 
-template <typename Argument, typename Result>
-struct unary_function {
+template<typename Argument, typename Result>
+struct unary_function
+{
   using argument_type = Argument;
-  using result_type = Result;
+  using result_type   = Result;
 };
 
-template <typename Arg1, typename Arg2>
-struct comparison_function : public binary_function<Arg1, Arg2, bool> {
-};
+template<typename Arg1, typename Arg2>
+struct comparison_function : public binary_function<Arg1, Arg2, bool>
+{};
 
 }  // namespace detail
 
 namespace types
 {
 
-template <typename T>
-struct is_unsigned_int {
+template<typename T>
+struct is_unsigned_int
+{
   static constexpr const bool value =
       std::is_unsigned<T>::value && std::is_integral<T>::value;
 };
 
-template <typename T>
-struct is_signed_int {
+template<typename T>
+struct is_signed_int
+{
   static constexpr const bool value =
       !std::is_unsigned<T>::value && std::is_integral<T>::value;
 };
@@ -95,79 +100,92 @@ struct is_signed_int {
         \brief type lookup to return the next largest similar type (or the same
    type)
 */
-template <typename T, bool GPU = false>
-struct larger {
-};
+template<typename T, bool GPU = false>
+struct larger
+{};
 
-template <>
-struct larger<uint8_t> {
+template<>
+struct larger<uint8_t>
+{
   using type = uint16_t;
 };
 
-template <>
-struct larger<uint16_t> {
+template<>
+struct larger<uint16_t>
+{
   using type = uint32_t;
 };
 
-template <>
-struct larger<uint32_t> {
+template<>
+struct larger<uint32_t>
+{
   using type = uint64_t;
 };
 
-template <>
-struct larger<int8_t> {
+template<>
+struct larger<int8_t>
+{
   using type = int16_t;
 };
 
-template <>
-struct larger<int16_t> {
+template<>
+struct larger<int16_t>
+{
   using type = int32_t;
 };
 
-template <>
-struct larger<int32_t> {
+template<>
+struct larger<int32_t>
+{
   using type = int64_t;
 };
 
-template <>
-struct larger<float> {
+template<>
+struct larger<float>
+{
   using type = double;
 };
 
-template <>
-struct larger<double> {
+template<>
+struct larger<double>
+{
   using type = long double;
 };
 
-template <>
-struct larger<double, true> {
+template<>
+struct larger<double, true>
+{
   using type = double;
 };
 
 namespace detail
 {
 
-template <typename T, bool isInt, bool isSigned, bool isFP, bool gpu = false>
-struct largest {
-};
+template<typename T, bool isInt, bool isSigned, bool isFP, bool gpu = false>
+struct largest
+{};
 
-template <typename T>
-struct largest<T, true, false, false> {
+template<typename T>
+struct largest<T, true, false, false>
+{
   using type = uint64_t;
 };
 
-template <typename T>
-struct largest<T, true, true, false> {
+template<typename T>
+struct largest<T, true, true, false>
+{
   using type = int64_t;
 };
 
-template <typename T>
-struct largest<T, false, false, true, false> {
+template<typename T>
+struct largest<T, false, false, true, false>
+{
   using type = long double;
 };
 
-template <typename T>
-struct largest<T, false, false, true, true> {
+template<typename T>
+struct largest<T, false, false, true, true>
+{
   using type = double;
 };
 }  // namespace detail
@@ -176,8 +194,9 @@ struct largest<T, false, false, true, true> {
         \brief type lookup to return largest similar type. If running on GPU,
    pass 'true' as second template argument
 */
-template <typename T, bool gpu = false>
-struct largest {
+template<typename T, bool gpu = false>
+struct largest
+{
   using type = typename detail::largest<T,
                                         std::is_integral<T>::value,
                                         std::is_signed<T>::value,
@@ -185,32 +204,38 @@ struct largest {
                                         gpu>::type;
 };
 
-
-template <typename T>
-struct size_of {
-  enum { value = sizeof(T) };
+template<typename T>
+struct size_of
+{
+  enum
+  {
+    value = sizeof(T)
+  };
 };
 
 namespace detail
 {
 
-template <typename T, typename U, bool lhsLarger>
-struct larger_of {
-};
+template<typename T, typename U, bool lhsLarger>
+struct larger_of
+{};
 
-template <typename T, typename U>
-struct larger_of<T, U, true> {
+template<typename T, typename U>
+struct larger_of<T, U, true>
+{
   using type = T;
 };
 
-template <typename T, typename U>
-struct larger_of<T, U, false> {
+template<typename T, typename U>
+struct larger_of<T, U, false>
+{
   using type = U;
 };
 }  // namespace detail
 
-template <typename T, typename U>
-struct larger_of {
+template<typename T, typename U>
+struct larger_of
+{
   using type = typename detail::
       larger_of<T, U, (size_of<T>::value > size_of<U>::value)>::type;
 };
@@ -218,92 +243,88 @@ struct larger_of {
 }  // namespace types
 
 
-
-template <typename T, typename Enable = void>
+template<typename T, typename Enable = void>
 struct limits;
 
-
 // limits for signed integer types
-template <typename T>
+template<typename T>
 struct limits<T,
-  typename std::enable_if<std::is_integral<T>::value &&
-  !std::is_unsigned<T>::value>::type>
+              typename std::enable_if<std::is_integral<T>::value &&
+                                      !std::is_unsigned<T>::value>::type>
 {
   RAJA_INLINE RAJA_HOST_DEVICE static constexpr T min()
   {
 #ifdef RAJA_COMPILER_MSVC
-#pragma warning( disable : 4309 )
+#pragma warning(disable : 4309)
 #endif
-    return static_cast<T>(1llu << ((8llu * sizeof(T)) - 1llu) );
+    return static_cast<T>(1llu << ((8llu * sizeof(T)) - 1llu));
 #ifdef RAJA_COMPILER_MSVC
-#pragma warning( default : 4309 )
+#pragma warning(default : 4309)
 #endif
   }
+
   RAJA_INLINE RAJA_HOST_DEVICE static constexpr T max()
   {
 #ifdef RAJA_COMPILER_MSVC
-#pragma warning( disable : 4309 )
+#pragma warning(disable : 4309)
 #endif
     return static_cast<T>(~(1llu << ((8llu * sizeof(T)) - 1llu)));
 #ifdef RAJA_COMPILER_MSVC
-#pragma warning( default : 4309 )
+#pragma warning(default : 4309)
 #endif
   }
 };
 
 // limits for signed integer types
-template <typename T>
+template<typename T>
 struct limits<T,
-  typename std::enable_if<std::is_integral<T>::value &&
-  std::is_unsigned<T>::value>::type>
+              typename std::enable_if<std::is_integral<T>::value &&
+                                      std::is_unsigned<T>::value>::type>
 {
   RAJA_INLINE RAJA_HOST_DEVICE static constexpr T min()
   {
     return static_cast<T>(0);
   }
+
   RAJA_INLINE RAJA_HOST_DEVICE static constexpr T max()
   {
 #ifdef RAJA_COMPILER_MSVC
-#pragma warning( disable : 4309 )
+#pragma warning(disable : 4309)
 #endif
     return static_cast<T>(0xFFFFFFFFFFFFFFFF);
 #ifdef RAJA_COMPILER_MSVC
-#pragma warning( default : 4309 )
+#pragma warning(default : 4309)
 #endif
   }
 };
 
+template<>
+struct limits<float>
+{
+  RAJA_INLINE RAJA_HOST_DEVICE static constexpr float min() { return -FLT_MAX; }
 
-template <>
-struct limits<float> {
-  RAJA_INLINE RAJA_HOST_DEVICE static constexpr float min()
-  {
-    return -FLT_MAX;
-  }
-  RAJA_INLINE RAJA_HOST_DEVICE static constexpr float max()
-  {
-    return FLT_MAX;
-  }
+  RAJA_INLINE RAJA_HOST_DEVICE static constexpr float max() { return FLT_MAX; }
 };
 
-template <>
-struct limits<double> {
+template<>
+struct limits<double>
+{
   RAJA_INLINE RAJA_HOST_DEVICE static constexpr double min()
   {
     return -DBL_MAX;
   }
-  RAJA_INLINE RAJA_HOST_DEVICE static constexpr double max() 
-  { 
-     return DBL_MAX; 
-  }
+
+  RAJA_INLINE RAJA_HOST_DEVICE static constexpr double max() { return DBL_MAX; }
 };
 
-template <>
-struct limits<long double> {
+template<>
+struct limits<long double>
+{
   RAJA_INLINE RAJA_HOST_DEVICE static constexpr long double min()
   {
     return -LDBL_MAX;
   }
+
   RAJA_INLINE RAJA_HOST_DEVICE static constexpr long double max()
   {
     return LDBL_MAX;
@@ -312,12 +333,13 @@ struct limits<long double> {
 
 
 #if defined(RAJA_CHECK_LIMITS)
-template <typename T>
+template<typename T>
 constexpr bool check()
 {
   return limits<T>::min() == std::numeric_limits<T>::min() &&
          limits<T>::max() == std::numeric_limits<T>::max();
 }
+
 static_assert(check<char>(), "limits for char is broken");
 static_assert(check<unsigned char>(), "limits for unsigned char is broken");
 static_assert(check<short>(), "limits for short is broken");
@@ -336,82 +358,94 @@ static_assert(check<unsigned long long>(),
 
 // Arithmetic
 
-template <typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
+template<typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
 struct plus : public detail::binary_function<Arg1, Arg2, Ret>,
-              detail::associative_or_fp_associative_tag<Ret> {
+              detail::associative_or_fp_associative_tag<Ret>
+{
   RAJA_HOST_DEVICE constexpr Ret operator()(const Arg1& lhs,
                                             const Arg2& rhs) const
   {
-    return Ret{lhs} + rhs;
+    return Ret {lhs} + rhs;
   }
-  RAJA_HOST_DEVICE static constexpr Ret identity() { return Ret{0}; }
+
+  RAJA_HOST_DEVICE static constexpr Ret identity() { return Ret {0}; }
 };
 
-template <typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
-struct minus : public detail::binary_function<Arg1, Arg2, Ret> {
+template<typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
+struct minus : public detail::binary_function<Arg1, Arg2, Ret>
+{
   RAJA_HOST_DEVICE constexpr Ret operator()(const Arg1& lhs,
                                             const Arg2& rhs) const
   {
-    return Ret{lhs} - rhs;
+    return Ret {lhs} - rhs;
   }
 };
 
-template <typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
+template<typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
 struct multiplies : public detail::binary_function<Arg1, Arg2, Ret>,
-                    detail::associative_or_fp_associative_tag<Ret> {
+                    detail::associative_or_fp_associative_tag<Ret>
+{
 
   RAJA_HOST_DEVICE constexpr Ret operator()(const Arg1& lhs,
                                             const Arg2& rhs) const
   {
-    return Ret{lhs} * rhs;
+    return Ret {lhs} * rhs;
   }
-  RAJA_HOST_DEVICE static constexpr Ret identity() { return Ret{1}; }
+
+  RAJA_HOST_DEVICE static constexpr Ret identity() { return Ret {1}; }
 };
 
-template <typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
-struct divides : public detail::binary_function<Arg1, Arg2, Ret> {
+template<typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
+struct divides : public detail::binary_function<Arg1, Arg2, Ret>
+{
   RAJA_HOST_DEVICE constexpr Ret operator()(const Arg1& lhs,
                                             const Arg2& rhs) const
   {
-    return Ret{lhs} / rhs;
+    return Ret {lhs} / rhs;
   }
 };
 
-template <typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
-struct modulus : public detail::binary_function<Arg1, Arg2, Ret> {
+template<typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
+struct modulus : public detail::binary_function<Arg1, Arg2, Ret>
+{
   RAJA_HOST_DEVICE constexpr Ret operator()(const Arg1& lhs,
                                             const Arg2& rhs) const
   {
-    return Ret{lhs} % rhs;
+    return Ret {lhs} % rhs;
   }
 };
 
 // Conditions
 
-template <typename Arg1, typename Arg2 = Arg1>
+template<typename Arg1, typename Arg2 = Arg1>
 struct logical_and : public detail::comparison_function<Arg1, Arg2>,
-                     detail::associative_tag {
+                     detail::associative_tag
+{
   RAJA_HOST_DEVICE constexpr bool operator()(const Arg1& lhs,
                                              const Arg2& rhs) const
   {
     return lhs && rhs;
   }
+
   RAJA_HOST_DEVICE static constexpr bool identity() { return true; }
 };
 
-template <typename Arg1, typename Arg2 = Arg1>
+template<typename Arg1, typename Arg2 = Arg1>
 struct logical_or : public detail::comparison_function<Arg1, Arg2>,
-                    detail::associative_tag {
+                    detail::associative_tag
+{
   RAJA_HOST_DEVICE constexpr bool operator()(const Arg1& lhs,
                                              const Arg2& rhs) const
   {
     return lhs || rhs;
   }
+
   RAJA_HOST_DEVICE static constexpr bool identity() { return false; }
 };
 
-template <typename T>
-struct logical_not : public detail::unary_function<T, bool> {
+template<typename T>
+struct logical_not : public detail::unary_function<T, bool>
+{
   RAJA_HOST_DEVICE constexpr bool operator()(const T& lhs) const
   {
     return !lhs;
@@ -420,31 +454,33 @@ struct logical_not : public detail::unary_function<T, bool> {
 
 // Bitwise
 
-template <typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
-struct bit_or : public detail::binary_function<Arg1, Arg2, Ret> {
+template<typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
+struct bit_or : public detail::binary_function<Arg1, Arg2, Ret>
+{
   RAJA_HOST_DEVICE constexpr Ret operator()(const Arg1& lhs,
                                             const Arg2& rhs) const
   {
     return lhs | rhs;
   }
 
-RAJA_HOST_DEVICE static constexpr Ret identity() { return Ret{0}; }
+  RAJA_HOST_DEVICE static constexpr Ret identity() { return Ret {0}; }
 };
 
-template <typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
-struct bit_and : public detail::binary_function<Arg1, Arg2, Ret> {
+template<typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
+struct bit_and : public detail::binary_function<Arg1, Arg2, Ret>
+{
   RAJA_HOST_DEVICE constexpr Ret operator()(const Arg1& lhs,
                                             const Arg2& rhs) const
   {
     return lhs & rhs;
   }
 
-RAJA_HOST_DEVICE static constexpr Ret identity() { return ~Ret{0}; }
+  RAJA_HOST_DEVICE static constexpr Ret identity() { return ~Ret {0}; }
 };
 
-
-template <typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
-struct bit_xor : public detail::binary_function<Arg1, Arg2, Ret> {
+template<typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
+struct bit_xor : public detail::binary_function<Arg1, Arg2, Ret>
+{
   RAJA_HOST_DEVICE constexpr Ret operator()(const Arg1& lhs,
                                             const Arg2& rhs) const
   {
@@ -459,28 +495,32 @@ struct bit_xor : public detail::binary_function<Arg1, Arg2, Ret> {
  When this operator is used to cycle through an array
  this ensures that the location of the first min/max is kept.
 */
-template <typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
+template<typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
 struct minimum : public detail::binary_function<Arg1, Arg2, Ret>,
-                 detail::associative_tag {
+                 detail::associative_tag
+{
   RAJA_HOST_DEVICE constexpr Ret operator()(const Arg1& lhs,
                                             const Arg2& rhs) const
   {
     return (rhs < lhs) ? rhs : lhs;
   }
+
   RAJA_HOST_DEVICE static constexpr Ret identity()
   {
     return limits<Ret>::max();
   }
 };
 
-template <typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
+template<typename Ret, typename Arg1 = Ret, typename Arg2 = Arg1>
 struct maximum : public detail::binary_function<Arg1, Arg2, Ret>,
-                 detail::associative_tag {
+                 detail::associative_tag
+{
   RAJA_HOST_DEVICE constexpr Ret operator()(const Arg1& lhs,
                                             const Arg2& rhs) const
   {
     return (lhs < rhs) ? rhs : lhs;
   }
+
   RAJA_HOST_DEVICE static constexpr Ret identity()
   {
     return limits<Ret>::min();
@@ -489,8 +529,9 @@ struct maximum : public detail::binary_function<Arg1, Arg2, Ret>,
 
 // Logical Comparison
 
-template <typename Arg1, typename Arg2 = Arg1>
-struct equal_to : public detail::comparison_function<Arg1, Arg2> {
+template<typename Arg1, typename Arg2 = Arg1>
+struct equal_to : public detail::comparison_function<Arg1, Arg2>
+{
   RAJA_HOST_DEVICE constexpr bool operator()(const Arg1& lhs,
                                              const Arg2& rhs) const
   {
@@ -498,8 +539,9 @@ struct equal_to : public detail::comparison_function<Arg1, Arg2> {
   }
 };
 
-template <typename Arg1, typename Arg2 = Arg1>
-struct not_equal_to : public detail::comparison_function<Arg1, Arg2> {
+template<typename Arg1, typename Arg2 = Arg1>
+struct not_equal_to : public detail::comparison_function<Arg1, Arg2>
+{
   RAJA_HOST_DEVICE constexpr bool operator()(const Arg1& lhs,
                                              const Arg2& rhs) const
   {
@@ -507,8 +549,9 @@ struct not_equal_to : public detail::comparison_function<Arg1, Arg2> {
   }
 };
 
-template <typename Arg1, typename Arg2 = Arg1>
-struct greater : public detail::comparison_function<Arg1, Arg2> {
+template<typename Arg1, typename Arg2 = Arg1>
+struct greater : public detail::comparison_function<Arg1, Arg2>
+{
   RAJA_HOST_DEVICE constexpr bool operator()(const Arg1& lhs,
                                              const Arg2& rhs) const
   {
@@ -516,8 +559,9 @@ struct greater : public detail::comparison_function<Arg1, Arg2> {
   }
 };
 
-template <typename Arg1, typename Arg2 = Arg1>
-struct less : public detail::comparison_function<Arg1, Arg2> {
+template<typename Arg1, typename Arg2 = Arg1>
+struct less : public detail::comparison_function<Arg1, Arg2>
+{
   RAJA_HOST_DEVICE constexpr bool operator()(const Arg1& lhs,
                                              const Arg2& rhs) const
   {
@@ -525,9 +569,9 @@ struct less : public detail::comparison_function<Arg1, Arg2> {
   }
 };
 
-
-template <typename Arg1, typename Arg2 = Arg1>
-struct greater_equal : public detail::comparison_function<Arg1, Arg2> {
+template<typename Arg1, typename Arg2 = Arg1>
+struct greater_equal : public detail::comparison_function<Arg1, Arg2>
+{
   RAJA_HOST_DEVICE constexpr bool operator()(const Arg1& lhs,
                                              const Arg2& rhs) const
   {
@@ -535,8 +579,9 @@ struct greater_equal : public detail::comparison_function<Arg1, Arg2> {
   }
 };
 
-template <typename Arg1, typename Arg2 = Arg1>
-struct less_equal : public detail::comparison_function<Arg1, Arg2> {
+template<typename Arg1, typename Arg2 = Arg1>
+struct less_equal : public detail::comparison_function<Arg1, Arg2>
+{
   RAJA_HOST_DEVICE constexpr bool operator()(const Arg1& lhs,
                                              const Arg2& rhs) const
   {
@@ -546,16 +591,18 @@ struct less_equal : public detail::comparison_function<Arg1, Arg2> {
 
 // Filters
 
-template <typename Ret, typename Orig = Ret>
-struct identity : public detail::unary_function<Orig, Ret> {
+template<typename Ret, typename Orig = Ret>
+struct identity : public detail::unary_function<Orig, Ret>
+{
   RAJA_HOST_DEVICE constexpr Ret operator()(const Orig& lhs) const
   {
     return lhs;
   }
 };
 
-template <typename T, typename U>
-struct project1st : public detail::binary_function<T, U, T> {
+template<typename T, typename U>
+struct project1st : public detail::binary_function<T, U, T>
+{
   RAJA_HOST_DEVICE constexpr T operator()(const T& lhs,
                                           const U& RAJA_UNUSED_ARG(rhs)) const
   {
@@ -563,8 +610,9 @@ struct project1st : public detail::binary_function<T, U, T> {
   }
 };
 
-template <typename T, typename U = T>
-struct project2nd : public detail::binary_function<T, U, U> {
+template<typename T, typename U = T>
+struct project2nd : public detail::binary_function<T, U, U>
+{
   RAJA_HOST_DEVICE constexpr U operator()(const T& RAJA_UNUSED_ARG(lhs),
                                           const U& rhs) const
   {
@@ -574,52 +622,59 @@ struct project2nd : public detail::binary_function<T, U, U> {
 
 // Type Traits
 
-template <typename T>
-struct is_associative {
+template<typename T>
+struct is_associative
+{
   static constexpr const bool value =
       std::is_base_of<detail::associative_tag, T>::value;
 };
 
-template <typename T>
-struct is_fp_associative {
+template<typename T>
+struct is_fp_associative
+{
   static constexpr const bool value =
       std::is_base_of<detail::fp_associative_tag, T>::value;
 };
 
-template <typename Arg1, typename Arg2 = Arg1>
+template<typename Arg1, typename Arg2 = Arg1>
 struct safe_plus
     : public plus<Arg1,
                   Arg2,
                   typename types::larger<
-                      typename types::larger_of<Arg1, Arg2>::type>::type> {
-};
+                      typename types::larger_of<Arg1, Arg2>::type>::type>
+{};
 
 }  // namespace operators
 
 namespace concepts
 {
 
-template <typename Function,
-          typename Return,
-          typename Arg1 = Return,
-          typename Arg2 = Arg1>
+template<typename Function,
+         typename Return,
+         typename Arg1 = Return,
+         typename Arg2 = Arg1>
 struct BinaryFunction
     : DefineConcept(::RAJA::concepts::convertible_to<Return>(
-          camp::val<Function>()(camp::val<Arg1>(), camp::val<Arg2>()))) {
-};
+          camp::val<Function>()(camp::val<Arg1>(), camp::val<Arg2>())))
+{};
 
-template <typename Function, typename Return, typename Arg = Return>
+template<typename Function, typename Return, typename Arg = Return>
 struct UnaryFunction : DefineConcept(::RAJA::concepts::convertible_to<Return>(
-                           camp::val<Function>()(camp::val<Arg>()))) {
-};
+                           camp::val<Function>()(camp::val<Arg>())))
+{};
 
 namespace detail
 {
 
-template <typename Fun, typename Ret, typename T, typename U>
-using is_binary_function = ::RAJA::concepts::requires_<BinaryFunction, Ret, T, U>;
+// Applies the BinaryFunction concept to Fun: Fun(T, U) convertible to Ret.
+// Fun is forwarded first because BinaryFunction takes the callable first.
+template<typename Fun, typename Ret, typename T, typename U>
+using is_binary_function =
+    ::RAJA::concepts::requires_<BinaryFunction, Fun, Ret, T, U>;
 
-template <typename Fun, typename Ret, typename T>
-using is_unary_function = ::RAJA::concepts::requires_<UnaryFunction, Ret, T>;
+// Applies the UnaryFunction concept to Fun: Fun(T) convertible to Ret.
+template<typename Fun, typename Ret, typename T>
+using is_unary_function =
+    ::RAJA::concepts::requires_<UnaryFunction, Fun, Ret, T>;
 }  // namespace detail
 
diff --git a/include/RAJA/util/Permutations.hpp b/include/RAJA/util/Permutations.hpp
index e79e9f2830..e5e286215c 100644
--- a/include/RAJA/util/Permutations.hpp
+++ b/include/RAJA/util/Permutations.hpp
@@ -27,50 +27,51 @@
 namespace RAJA
 {
 
-template <typename Indices>
+template<typename Indices>
 struct as_array;
 
-template <camp::idx_t... Indices>
-struct as_array<camp::idx_seq<Indices...>> {
+template<camp::idx_t... Indices>
+struct as_array<camp::idx_seq<Indices...>>
+{
   static constexpr std::array<Index_type, sizeof...(Indices)> get()
   {
     return {{Indices...}};
   }
 };
 
-using PERM_I = camp::idx_seq<0>;
-using PERM_IJ = camp::idx_seq<0, 1>;
-using PERM_JI = camp::idx_seq<1, 0>;
-using PERM_IJK = camp::idx_seq<0, 1, 2>;
-using PERM_IKJ = camp::idx_seq<0, 2, 1>;
-using PERM_JIK = camp::idx_seq<1, 0, 2>;
-using PERM_JKI = camp::idx_seq<1, 2, 0>;
-using PERM_KIJ = camp::idx_seq<2, 0, 1>;
-using PERM_KJI = camp::idx_seq<2, 1, 0>;
-using PERM_IJKL = camp::idx_seq<0, 1, 2, 3>;
-using PERM_IJLK = camp::idx_seq<0, 1, 3, 2>;
-using PERM_IKJL = camp::idx_seq<0, 2, 1, 3>;
-using PERM_IKLJ = camp::idx_seq<0, 2, 3, 1>;
-using PERM_ILJK = camp::idx_seq<0, 3, 1, 2>;
-using PERM_ILKJ = camp::idx_seq<0, 3, 2, 1>;
-using PERM_JIKL = camp::idx_seq<1, 0, 2, 3>;
-using PERM_JILK = camp::idx_seq<1, 0, 3, 2>;
-using PERM_JKIL = camp::idx_seq<1, 2, 0, 3>;
-using PERM_JKLI = camp::idx_seq<1, 2, 3, 0>;
-using PERM_JLIK = camp::idx_seq<1, 3, 0, 2>;
-using PERM_JLKI = camp::idx_seq<1, 3, 2, 0>;
-using PERM_KIJL = camp::idx_seq<2, 0, 1, 3>;
-using PERM_KILJ = camp::idx_seq<2, 0, 3, 1>;
-using PERM_KJIL = camp::idx_seq<2, 1, 0, 3>;
-using PERM_KJLI = camp::idx_seq<2, 1, 3, 0>;
-using PERM_KLIJ = camp::idx_seq<2, 3, 0, 1>;
-using PERM_KLJI = camp::idx_seq<2, 3, 1, 0>;
-using PERM_LIJK = camp::idx_seq<3, 0, 1, 2>;
-using PERM_LIKJ = camp::idx_seq<3, 0, 2, 1>;
-using PERM_LJIK = camp::idx_seq<3, 1, 0, 2>;
-using PERM_LJKI = camp::idx_seq<3, 1, 2, 0>;
-using PERM_LKIJ = camp::idx_seq<3, 2, 0, 1>;
-using PERM_LKJI = camp::idx_seq<3, 2, 1, 0>;
+using PERM_I     = camp::idx_seq<0>;
+using PERM_IJ    = camp::idx_seq<0, 1>;
+using PERM_JI    = camp::idx_seq<1, 0>;
+using PERM_IJK   = camp::idx_seq<0, 1, 2>;
+using PERM_IKJ   = camp::idx_seq<0, 2, 1>;
+using PERM_JIK   = camp::idx_seq<1, 0, 2>;
+using PERM_JKI   = camp::idx_seq<1, 2, 0>;
+using PERM_KIJ   = camp::idx_seq<2, 0, 1>;
+using PERM_KJI   = camp::idx_seq<2, 1, 0>;
+using PERM_IJKL  = camp::idx_seq<0, 1, 2, 3>;
+using PERM_IJLK  = camp::idx_seq<0, 1, 3, 2>;
+using PERM_IKJL  = camp::idx_seq<0, 2, 1, 3>;
+using PERM_IKLJ  = camp::idx_seq<0, 2, 3, 1>;
+using PERM_ILJK  = camp::idx_seq<0, 3, 1, 2>;
+using PERM_ILKJ  = camp::idx_seq<0, 3, 2, 1>;
+using PERM_JIKL  = camp::idx_seq<1, 0, 2, 3>;
+using PERM_JILK  = camp::idx_seq<1, 0, 3, 2>;
+using PERM_JKIL  = camp::idx_seq<1, 2, 0, 3>;
+using PERM_JKLI  = camp::idx_seq<1, 2, 3, 0>;
+using PERM_JLIK  = camp::idx_seq<1, 3, 0, 2>;
+using PERM_JLKI  = camp::idx_seq<1, 3, 2, 0>;
+using PERM_KIJL  = camp::idx_seq<2, 0, 1, 3>;
+using PERM_KILJ  = camp::idx_seq<2, 0, 3, 1>;
+using PERM_KJIL  = camp::idx_seq<2, 1, 0, 3>;
+using PERM_KJLI  = camp::idx_seq<2, 1, 3, 0>;
+using PERM_KLIJ  = camp::idx_seq<2, 3, 0, 1>;
+using PERM_KLJI  = camp::idx_seq<2, 3, 1, 0>;
+using PERM_LIJK  = camp::idx_seq<3, 0, 1, 2>;
+using PERM_LIKJ  = camp::idx_seq<3, 0, 2, 1>;
+using PERM_LJIK  = camp::idx_seq<3, 1, 0, 2>;
+using PERM_LJKI  = camp::idx_seq<3, 1, 2, 0>;
+using PERM_LKIJ  = camp::idx_seq<3, 2, 0, 1>;
+using PERM_LKJI  = camp::idx_seq<3, 2, 1, 0>;
 using PERM_IJKLM = camp::idx_seq<0, 1, 2, 3, 4>;
 using PERM_IJKML = camp::idx_seq<0, 1, 2, 4, 3>;
 using PERM_IJLKM = camp::idx_seq<0, 1, 3, 2, 4>;
@@ -192,18 +193,17 @@ using PERM_MLJKI = camp::idx_seq<4, 3, 1, 2, 0>;
 using PERM_MLKIJ = camp::idx_seq<4, 3, 2, 0, 1>;
 using PERM_MLKJI = camp::idx_seq<4, 3, 2, 1, 0>;
 
-
-
-
-namespace internal 
+namespace internal
 {
 
 
 template<camp::idx_t I, camp::idx_t J, camp::idx_t N, typename Perm>
 struct CalcInversePermutationElem
 {
-  static constexpr camp::idx_t value = 
-    camp::seq_at<J, Perm>::value == I ? J : CalcInversePermutationElem<I, J+1, N, Perm>::value;
+  static constexpr camp::idx_t value =
+      camp::seq_at<J, Perm>::value == I
+          ? J
+          : CalcInversePermutationElem<I, J + 1, N, Perm>::value;
 };
 
 template<camp::idx_t I, camp::idx_t N, typename Perm>
@@ -213,31 +213,30 @@ struct CalcInversePermutationElem<I, N, N, Perm>
 };
 
 
-
 template<typename Range, typename Perm>
 struct InversePermutationHelper;
 
-template<camp::idx_t ... Range, camp::idx_t ... Perm>
-struct InversePermutationHelper<camp::idx_seq<Range...>, 
-                                camp::idx_seq<Perm...>>
+template<camp::idx_t... Range, camp::idx_t... Perm>
+struct InversePermutationHelper<camp::idx_seq<Range...>, camp::idx_seq<Perm...>>
 {
   static_assert(sizeof...(Range) == sizeof...(Perm), "Fatal Error");
-  using type = camp::idx_seq< 
-    CalcInversePermutationElem<Range, 0, sizeof...(Range), camp::idx_seq<Perm...>>::value ...  
-  >;  
+  using type = camp::idx_seq<
+      CalcInversePermutationElem<Range,
+                                 0,
+                                 sizeof...(Range),
+                                 camp::idx_seq<Perm...>>::value...>;
 };
 
 
-
-} // namespace internal
-
-
+}  // namespace internal
 
 /*!
   Inverts a permutation
 */
 template<typename Perm>
-using invert_permutation = typename internal::InversePermutationHelper<camp::make_idx_seq_t<camp::size<Perm>::value>, Perm>::type;
+using invert_permutation = typename internal::InversePermutationHelper<
+    camp::make_idx_seq_t<camp::size<Perm>::value>,
+    Perm>::type;
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/util/PermutedLayout.hpp b/include/RAJA/util/PermutedLayout.hpp
index 5bb176215b..2b0aad1a7c 100644
--- a/include/RAJA/util/PermutedLayout.hpp
+++ b/include/RAJA/util/PermutedLayout.hpp
@@ -60,41 +60,44 @@ namespace RAJA
  *
  *
  */
-template <size_t Rank, typename IdxLin = Index_type>
+template<size_t Rank, typename IdxLin = Index_type>
 auto make_permuted_layout(std::array<IdxLin, Rank> sizes,
                           std::array<camp::idx_t, Rank> permutation)
     -> Layout<Rank, IdxLin>
 {
   std::array<IdxLin, Rank> strides;
   std::array<IdxLin, Rank> folded_strides;
-  for (size_t i = 0; i < Rank; ++i) {
+  for (size_t i = 0; i < Rank; ++i)
+  {
     // If the size of dimension i is zero, then the stride is zero
     folded_strides[i] = sizes[permutation[i]] ? 1 : 0;
-    for (size_t j = i + 1; j < Rank; ++j) {
+    for (size_t j = i + 1; j < Rank; ++j)
+    {
       folded_strides[i] *= sizes[permutation[j]] ? sizes[permutation[j]] : 1;
     }
   }
 
-  for (size_t i = 0; i < Rank; ++i) {
+  for (size_t i = 0; i < Rank; ++i)
+  {
     strides[permutation[i]] = folded_strides[i];
   }
 
 
   // return Layout<Rank, IdxLin>(sizes, strides);
-  auto ret  = Layout<Rank, IdxLin>();
-  for (size_t i = 0; i < Rank; ++i) {
-    ret.sizes[i] = sizes[i];
-    ret.strides[i] = strides[i];
+  auto ret = Layout<Rank, IdxLin>();
+  for (size_t i = 0; i < Rank; ++i)
+  {
+    ret.sizes[i]       = sizes[i];
+    ret.strides[i]     = strides[i];
     ret.inv_strides[i] = strides[i] ? strides[i] : 1;
-    ret.inv_mods[i] = sizes[i] ? sizes[i] : 1;
+    ret.inv_mods[i]    = sizes[i] ? sizes[i] : 1;
   }
   return ret;
 }
 
-
-template <camp::idx_t... Ints>
+template<camp::idx_t... Ints>
 using Perm = camp::idx_seq<Ints...>;
-template <camp::idx_t N>
+template<camp::idx_t N>
 using MakePerm = typename camp::make_idx_seq<N>::type;
 
 }  // namespace RAJA
diff --git a/include/RAJA/util/PluginContext.hpp b/include/RAJA/util/PluginContext.hpp
index 996836e397..1cee1672c5 100644
--- a/include/RAJA/util/PluginContext.hpp
+++ b/include/RAJA/util/PluginContext.hpp
@@ -11,31 +11,39 @@
+#include <cstdint>
+
 #include "RAJA/policy/PolicyBase.hpp"
 #include "RAJA/internal/get_platform.hpp"
 
-namespace RAJA {
-namespace util {
+namespace RAJA
+{
+namespace util
+{
 
 class KokkosPluginLoader;
 
-struct PluginContext {
-  public:
-    PluginContext(const Platform p) :
-      platform(p) {}
+/// Holds the Platform value supplied at construction.
+struct PluginContext
+{
+public:
+  PluginContext(const Platform p) : platform(p) {}
 
-    Platform platform;
+  Platform platform;
 
-  private:
-    mutable uint64_t kID;
+private:
+  // Default-initialized to zero so it never holds an indeterminate value.
+  // Nothing in this header assigns it; the friend class below presumably does.
+  mutable uint64_t kID = 0;
 
-    friend class KokkosPluginLoader;
+  friend class KokkosPluginLoader;
 };
 
+/// Builds a PluginContext from detail::get_platform<Policy>::value.
 template<typename Policy>
 PluginContext make_context()
 {
-  return PluginContext{detail::get_platform<Policy>::value};
+  return PluginContext {detail::get_platform<Policy>::value};
 }
 
-} // closing brace for util namespace
-} // closing brace for RAJA namespace
+}  // namespace util
+}  // namespace RAJA
 
 #endif
diff --git a/include/RAJA/util/PluginLinker.hpp b/include/RAJA/util/PluginLinker.hpp
index e5b77bd027..5920142759 100644
--- a/include/RAJA/util/PluginLinker.hpp
+++ b/include/RAJA/util/PluginLinker.hpp
@@ -11,14 +11,18 @@
 #include "RAJA/util/RuntimePluginLoader.hpp"
 #include "RAJA/util/KokkosPluginLoader.hpp"
 
-namespace {
-  namespace anonymous_RAJA {
-    struct pluginLinker {
-      inline pluginLinker() {
-        (void)RAJA::util::linkRuntimePluginLoader();
-        (void)RAJA::util::linkKokkosPluginLoader();
-      }
-    } pluginLinker;
+namespace
+{
+namespace anonymous_RAJA
+{
+struct pluginLinker
+{
+  inline pluginLinker()
+  {
+    (void)RAJA::util::linkRuntimePluginLoader();
+    (void)RAJA::util::linkKokkosPluginLoader();
   }
-}
+} pluginLinker;
+}  // namespace anonymous_RAJA
+}  // namespace
 #endif
diff --git a/include/RAJA/util/PluginOptions.hpp b/include/RAJA/util/PluginOptions.hpp
index f0b6a35507..50ed3a1da9 100644
--- a/include/RAJA/util/PluginOptions.hpp
+++ b/include/RAJA/util/PluginOptions.hpp
@@ -10,22 +10,26 @@
 
 #include <string>
 
-namespace RAJA {
-namespace util {
+namespace RAJA
+{
+namespace util
+{
 
+/// Holds a single option string.
 struct PluginOptions
 {
-    PluginOptions(const std::string& newstr) : str(newstr) {};
-    
-    std::string str;
+  PluginOptions(const std::string& newstr) : str(newstr) {}
+
+  std::string str;
 };
 
+/// Returns a PluginOptions whose str member is a copy of newstr.
 inline PluginOptions make_options(const std::string& newstr)
 {
-    return PluginOptions{newstr};
+  return PluginOptions {newstr};
 }
 
-} // namespace util
-} // namespace RAJA
+}  // namespace util
+}  // namespace RAJA
 
 #endif
diff --git a/include/RAJA/util/PluginStrategy.hpp b/include/RAJA/util/PluginStrategy.hpp
index 3935559bba..86f8fd7f6b 100644
--- a/include/RAJA/util/PluginStrategy.hpp
+++ b/include/RAJA/util/PluginStrategy.hpp
@@ -12,33 +12,35 @@
 #include "RAJA/util/PluginOptions.hpp"
 #include "RAJA/util/Registry.hpp"
 
-namespace RAJA {
-namespace util {
+namespace RAJA
+{
+namespace util
+{
 
 class PluginStrategy
 {
-  public:
-    RAJASHAREDDLL_API PluginStrategy();
+public:
+  RAJASHAREDDLL_API PluginStrategy();
 
-    virtual ~PluginStrategy() = default;
+  virtual ~PluginStrategy() = default;
 
-    virtual RAJASHAREDDLL_API void init(const PluginOptions& p);
+  virtual RAJASHAREDDLL_API void init(const PluginOptions& p);
 
-    virtual RAJASHAREDDLL_API void preCapture(const PluginContext& p);
+  virtual RAJASHAREDDLL_API void preCapture(const PluginContext& p);
 
-    virtual RAJASHAREDDLL_API void postCapture(const PluginContext& p);
+  virtual RAJASHAREDDLL_API void postCapture(const PluginContext& p);
 
-    virtual RAJASHAREDDLL_API void preLaunch(const PluginContext& p);
+  virtual RAJASHAREDDLL_API void preLaunch(const PluginContext& p);
 
-    virtual RAJASHAREDDLL_API void postLaunch(const PluginContext& p);
+  virtual RAJASHAREDDLL_API void postLaunch(const PluginContext& p);
 
-    virtual RAJASHAREDDLL_API void finalize();
+  virtual RAJASHAREDDLL_API void finalize();
 };
 
 using PluginRegistry = Registry<PluginStrategy>;
 
-} // closing brace for util namespace
-} // closing brace for RAJA namespace
+}  // namespace util
+}  // namespace RAJA
 
 
 #endif
diff --git a/include/RAJA/util/Registry.hpp b/include/RAJA/util/Registry.hpp
index 579481a6ed..17838a8e3a 100644
--- a/include/RAJA/util/Registry.hpp
+++ b/include/RAJA/util/Registry.hpp
@@ -10,126 +10,158 @@
 
 #include <memory>
+#include <string>
 
-namespace RAJA {
-namespace util {
+namespace RAJA
+{
+namespace util
+{
+
+template<typename T>
+class RegistryEntry
+{
+  std::string Name, Desc;
+  std::shared_ptr<T> object;
+
+public:
+  RegistryEntry(const std::string& N,
+                const std::string& D,
+                std::shared_ptr<T> (*C)())
+      : Name(N),
+        Desc(D),
+        object(C())
+  {}
+
+  const std::string& getName() const { return Name; }
+
+  const std::string& getDesc() const { return Desc; }
+
+  T* get() const { return object.get(); }
+};
+
+/// A global registry used in conjunction with static constructors to make
+/// pluggable components (like targets or garbage collectors) "just work" when
+/// linked with an executable.
+template<typename T>
+class Registry
+{
+public:
+  using type  = T;
+  using entry = RegistryEntry<T>;
+
+  class node;
+  class iterator;
+
+private:
+  Registry() = delete;
+
+  friend class node;
+  static node *Head, *Tail;
+
+public:
+  /// Node in linked list of entries.
+  ///
+  class node
+  {
+    friend class iterator;
+    friend Registry<T>;
+
+    node* Next;
+    const entry& Val;
 
-  template <typename T>
-  class RegistryEntry {
-    std::string Name, Desc;
-    std::shared_ptr<T> object;
+  public:
+    node(const entry& V) : Next(nullptr), Val(V) {}
+  };
+
+  /// Add a node to the Registry: this is the interface between the plugin and
+  /// the executable.
+  ///
+  /// This function is exported by the executable and called by the plugin to
+  /// add a node to the executable's registry. Therefore it's not defined here
+  /// to avoid it being instantiated in the plugin and is instead defined in
+  /// the executable (see RAJA_INSTANTIATE_REGISTRY below).
+  static RAJASHAREDDLL_API void add_node(node* N);
+
+  /// Iterators for registry entries.
+  ///
+  class iterator
+  {
+    const node* Cur;
 
   public:
-    RegistryEntry(const std::string& N, const std::string& D,
-        std::shared_ptr<T> (*C)())
-        : Name(N), Desc(D), object(C()) {}
+    explicit iterator(const node* N) : Cur(N) {}
+
+    bool operator==(const iterator& That) const { return Cur == That.Cur; }
+
+    bool operator!=(const iterator& That) const { return Cur != That.Cur; }
 
-    const std::string& getName() const { return Name; }
-    const std::string& getDesc() const { return Desc; }
-    T* get() const { return object.get(); }
+    iterator& operator++()
+    {
+      Cur = Cur->Next;
+      return *this;
+    }
+
+    const entry& operator*() const { return Cur->Val; }
+
+    const entry* operator->() const { return &Cur->Val; }
   };
 
-  /// A global registry used in conjunction with static constructors to make
-  /// pluggable components (like targets or garbage collectors) "just work" when
-  /// linked with an executable.
-  template <typename T>
-  class Registry {
-  public:
-    using type = T;
-    using entry = RegistryEntry<T>;
+  // begin is not defined here in order to avoid usage of an undefined static
+  // data member, instead it's instantiated by RAJA_INSTANTIATE_REGISTRY.
+  static RAJASHAREDDLL_API iterator begin();
 
-    class node;
-    class iterator;
+  static iterator end() { return iterator(nullptr); }
 
-  private:
-    Registry() = delete;
+  /// A static registration template.
+  template<typename V>
+  class add
+  {
+    entry Entry;
+    node Node;
 
-    friend class node;
-    static node *Head, *Tail;
+    static std::shared_ptr<T> CtorFn() { return std::make_shared<V>(); }
 
   public:
-    /// Node in linked list of entries.
-    ///
-    class node {
-      friend class iterator;
-      friend Registry<T>;
-
-      node *Next;
-      const entry& Val;
-
-    public:
-      node(const entry &V) : Next(nullptr), Val(V) {}
-    };
-
-    /// Add a node to the Registry: this is the interface between the plugin and
-    /// the executable.
-    ///
-    /// This function is exported by the executable and called by the plugin to
-    /// add a node to the executable's registry. Therefore it's not defined here
-    /// to avoid it being instantiated in the plugin and is instead defined in
-    /// the executable (see RAJA_INSTANTIATE_REGISTRY below).
-    static RAJASHAREDDLL_API void add_node(node *N);
-
-    /// Iterators for registry entries.
-    ///
-    class iterator {
-      const node *Cur;
-
-    public:
-      explicit iterator(const node *N) : Cur(N) {}
-
-      bool operator==(const iterator &That) const { return Cur == That.Cur; }
-      bool operator!=(const iterator &That) const { return Cur != That.Cur; }
-      iterator &operator++() { Cur = Cur->Next; return *this; }
-      const entry &operator*() const { return Cur->Val; }
-      const entry *operator->() const { return &Cur->Val; }
-    };
-
-    // begin is not defined here in order to avoid usage of an undefined static
-    // data member, instead it's instantiated by RAJA_INSTANTIATE_REGISTRY.
-    static RAJASHAREDDLL_API iterator begin();
-    static iterator end()   { return iterator(nullptr); }
-
-    /// A static registration template.
-    template <typename V>
-    class add {
-      entry Entry;
-      node Node;
-
-      static std::shared_ptr<T> CtorFn() { return std::make_shared<V>(); }
-
-    public:
-      add(const std::string& Name, const std::string& Desc)
-          : Entry(Name, Desc, CtorFn), Node(Entry) {
-        add_node(&Node);
-      }
-    };
+    add(const std::string& Name, const std::string& Desc)
+        : Entry(Name, Desc, CtorFn),
+          Node(Entry)
+    {
+      add_node(&Node);
+    }
   };
-
-} // closing brace for util namespace
-} // closing brace for RAJA namespace
-
-#define RAJA_INSTANTIATE_REGISTRY(REGISTRY_CLASS) \
-  namespace RAJA { \
-  namespace util { \
-  template<typename T> typename Registry<T>::node *Registry<T>::Head = nullptr;\
-  template<typename T> typename Registry<T>::node *Registry<T>::Tail = nullptr;\
-  template<typename T> \
-  void Registry<T>::add_node(typename Registry<T>::node *N) { \
-    if (Tail) \
-      Tail->Next = N; \
-    else \
-      Head = N; \
-    Tail = N; \
-  } \
-  template<typename T> typename Registry<T>::iterator Registry<T>::begin() { \
-    return iterator(Head); \
-  } \
-  template REGISTRY_CLASS::node *Registry<REGISTRY_CLASS::type>::Head; \
-  template REGISTRY_CLASS::node *Registry<REGISTRY_CLASS::type>::Tail; \
-  template \
-  void Registry<REGISTRY_CLASS::type>::add_node(REGISTRY_CLASS::node*); \
-  template REGISTRY_CLASS::iterator Registry<REGISTRY_CLASS::type>::begin(); \
-  } \
+};
+
+}  // namespace util
+}  // namespace RAJA
+
+#define RAJA_INSTANTIATE_REGISTRY(REGISTRY_CLASS)                              \
+  namespace RAJA                                                               \
+  {                                                                            \
+  namespace util                                                               \
+  {                                                                            \
+  template<typename T>                                                         \
+  typename Registry<T>::node* Registry<T>::Head = nullptr;                     \
+  template<typename T>                                                         \
+  typename Registry<T>::node* Registry<T>::Tail = nullptr;                     \
+  template<typename T>                                                         \
+  void Registry<T>::add_node(typename Registry<T>::node* N)                    \
+  {                                                                            \
+    if (Tail)                                                                  \
+      Tail->Next = N;                                                          \
+    else                                                                       \
+      Head = N;                                                                \
+    Tail = N;                                                                  \
+  }                                                                            \
+  template<typename T>                                                         \
+  typename Registry<T>::iterator Registry<T>::begin()                          \
+  {                                                                            \
+    return iterator(Head);                                                     \
+  }                                                                            \
+  template REGISTRY_CLASS::node* Registry<REGISTRY_CLASS::type>::Head;         \
+  template REGISTRY_CLASS::node* Registry<REGISTRY_CLASS::type>::Tail;         \
+  template void Registry<REGISTRY_CLASS::type>::add_node(                      \
+      REGISTRY_CLASS::node*);                                                  \
+  template REGISTRY_CLASS::iterator Registry<REGISTRY_CLASS::type>::begin();   \
+  }                                                                            \
   }
 
 #endif
diff --git a/include/RAJA/util/RepeatView.hpp b/include/RAJA/util/RepeatView.hpp
index 618913f794..1795ee2e2a 100644
--- a/include/RAJA/util/RepeatView.hpp
+++ b/include/RAJA/util/RepeatView.hpp
@@ -50,83 +50,158 @@ namespace RAJA
  *   unbounded extents
  *
  */
-template < typename T >
+template<typename T>
 struct RepeatView
 {
   struct iterator
   {
     using difference_type = std::ptrdiff_t;
-    using value_type = T;
-    using reference = value_type const&;
+    using value_type      = T;
+    using reference       = value_type const&;
 
     iterator() = default;
 
     constexpr iterator(const T* base, size_t index)
-      : m_value(base), m_index(index)
-    { }
+        : m_value(base),
+          m_index(index)
+    {}
 
     constexpr reference operator*() const noexcept { return *m_value; }
-    constexpr reference operator[](difference_type index) const noexcept { return *(*this + index); }
 
-    constexpr iterator& operator++() { ++m_index; return *this; }
-    constexpr iterator operator++(int) { auto tmp = *this; ++(*this); return tmp; }
-
-    constexpr iterator& operator--() { --m_index; return *this; }
-    constexpr iterator operator--(int) { auto tmp = *this; --(*this); return tmp; }
-
-    constexpr iterator& operator+=(difference_type rhs) { m_index += rhs; return *this; }
-    constexpr iterator& operator-=(difference_type rhs) { m_index -= rhs; return *this; }
+    constexpr reference operator[](difference_type index) const noexcept
+    {
+      return *(*this + index);
+    }
+
+    constexpr iterator& operator++()
+    {
+      ++m_index;
+      return *this;
+    }
+
+    constexpr iterator operator++(int)
+    {
+      auto tmp = *this;
+      ++(*this);
+      return tmp;
+    }
+
+    constexpr iterator& operator--()
+    {
+      --m_index;
+      return *this;
+    }
+
+    constexpr iterator operator--(int)
+    {
+      auto tmp = *this;
+      --(*this);
+      return tmp;
+    }
+
+    constexpr iterator& operator+=(difference_type rhs)
+    {
+      m_index += rhs;
+      return *this;
+    }
+
+    constexpr iterator& operator-=(difference_type rhs)
+    {
+      m_index -= rhs;
+      return *this;
+    }
 
     friend constexpr iterator operator+(iterator lhs, difference_type rhs)
-    { lhs += rhs; return lhs; }
+    {
+      lhs += rhs;
+      return lhs;
+    }
+
     friend constexpr iterator operator+(difference_type lhs, iterator rhs)
-    { rhs += lhs; return rhs; }
+    {
+      rhs += lhs;
+      return rhs;
+    }
 
     friend constexpr iterator operator-(iterator lhs, difference_type rhs)
-    { lhs -= rhs; return lhs; }
-    friend constexpr difference_type operator-(iterator const& lhs, iterator const& rhs)
-    { return static_cast<difference_type>(lhs.m_index) - static_cast<difference_type>(rhs.m_index); }
+    {
+      lhs -= rhs;
+      return lhs;
+    }
+
+    friend constexpr difference_type operator-(iterator const& lhs,
+                                               iterator const& rhs)
+    {
+      return static_cast<difference_type>(lhs.m_index) -
+             static_cast<difference_type>(rhs.m_index);
+    }
 
     friend constexpr bool operator==(iterator const& lhs, iterator const& rhs)
-    { return lhs.m_index == rhs.m_index; }
+    {
+      return lhs.m_index == rhs.m_index;
+    }
+
     friend constexpr bool operator!=(iterator const& lhs, iterator const& rhs)
-    { return !(lhs == rhs); }
+    {
+      return !(lhs == rhs);
+    }
 
     friend constexpr bool operator<(iterator const& lhs, iterator const& rhs)
-    { return lhs.m_index < rhs.m_index; }
+    {
+      return lhs.m_index < rhs.m_index;
+    }
+
     friend constexpr bool operator<=(iterator const& lhs, iterator const& rhs)
-    { return !(rhs < lhs); }
+    {
+      return !(rhs < lhs);
+    }
+
     friend constexpr bool operator>(iterator const& lhs, iterator const& rhs)
-    { return rhs < lhs; }
+    {
+      return rhs < lhs;
+    }
+
     friend constexpr bool operator>=(iterator const& lhs, iterator const& rhs)
-    { return !(lhs < rhs); }
+    {
+      return !(lhs < rhs);
+    }
 
   private:
     const T* m_value = nullptr;
-    size_t m_index = 0;
+    size_t m_index   = 0;
   };
 
   RepeatView() = delete;
 
   constexpr RepeatView(T const& value, size_t bound)
-    : m_bound(bound), m_value(value)
-  { }
+      : m_bound(bound),
+        m_value(value)
+  {}
 
   constexpr RepeatView(T&& value, size_t bound)
-    : m_bound(bound), m_value(std::move(value))
-  { }
+      : m_bound(bound),
+        m_value(std::move(value))
+  {}
 
   constexpr T const& front() const { return m_value; }
+
   constexpr T const& back() const { return m_value; }
-  constexpr T const& operator[](size_t RAJA_UNUSED_ARG(index)) const { return m_value; }
+
+  constexpr T const& operator[](size_t RAJA_UNUSED_ARG(index)) const
+  {
+    return m_value;
+  }
 
   constexpr iterator begin() const { return iterator(&m_value, 0); }
+
   constexpr iterator cbegin() const { return iterator(&m_value, 0); }
 
   constexpr iterator end() const { return iterator(&m_value, m_bound); }
+
   constexpr iterator cend() const { return iterator(&m_value, m_bound); }
 
   constexpr explicit operator bool() const { return m_bound != 0; }
+
   constexpr bool empty() const { return m_bound == 0; }
 
   constexpr size_t size() const { return m_bound; }
diff --git a/include/RAJA/util/RuntimePluginLoader.hpp b/include/RAJA/util/RuntimePluginLoader.hpp
index 3e7fbb165f..289e067b0a 100644
--- a/include/RAJA/util/RuntimePluginLoader.hpp
+++ b/include/RAJA/util/RuntimePluginLoader.hpp
@@ -14,39 +14,40 @@
 #include "RAJA/util/PluginOptions.hpp"
 #include "RAJA/util/PluginStrategy.hpp"
 
-namespace RAJA {
-namespace util {
+namespace RAJA
+{
+namespace util
+{
 
-  class RuntimePluginLoader : public RAJA::util::PluginStrategy
-  {
-    using Parent = RAJA::util::PluginStrategy;
+class RuntimePluginLoader : public RAJA::util::PluginStrategy
+{
+  using Parent = RAJA::util::PluginStrategy;
 
-  public:
-    RuntimePluginLoader();
+public:
+  RuntimePluginLoader();
 
-    void init(const RAJA::util::PluginOptions& p) override;
+  void init(const RAJA::util::PluginOptions& p) override;
 
-    void preCapture(const RAJA::util::PluginContext& p) override;
+  void preCapture(const RAJA::util::PluginContext& p) override;
 
-    void postCapture(const RAJA::util::PluginContext& p) override;
+  void postCapture(const RAJA::util::PluginContext& p) override;
 
-    void preLaunch(const RAJA::util::PluginContext& p) override;
+  void preLaunch(const RAJA::util::PluginContext& p) override;
 
-    void postLaunch(const RAJA::util::PluginContext& p) override;
+  void postLaunch(const RAJA::util::PluginContext& p) override;
 
-    void finalize() override;
+  void finalize() override;
 
-  private:
+private:
+  void initPlugin(const std::string& path);
 
-    void initPlugin(const std::string &path);
-    
-    void initDirectory(const std::string &path);
+  void initDirectory(const std::string& path);
 
-    std::vector<std::unique_ptr<Parent>> plugins;
+  std::vector<std::unique_ptr<Parent>> plugins;
 
-  };  // end RuntimePluginLoader class
+};  // end RuntimePluginLoader class
 
-  void linkRuntimePluginLoader();
+void linkRuntimePluginLoader();
 
 }  // end namespace util
 }  // end namespace RAJA
diff --git a/include/RAJA/util/SoAArray.hpp b/include/RAJA/util/SoAArray.hpp
index 6828bc3b1a..bd29de7efe 100644
--- a/include/RAJA/util/SoAArray.hpp
+++ b/include/RAJA/util/SoAArray.hpp
@@ -35,13 +35,14 @@ namespace detail
  * This is useful for creating a vectorizable data layout and getting
  * coalesced memory accesses or avoiding shared memory bank conflicts in cuda.
  */
-template <typename T, size_t size>
+template<typename T, size_t size>
 class SoAArray
 {
   using value_type = T;
 
 public:
   RAJA_HOST_DEVICE value_type get(size_t i) const { return mem[i]; }
+
   RAJA_HOST_DEVICE void set(size_t i, value_type val) { mem[i] = val; }
 
 private:
@@ -51,11 +52,11 @@ class SoAArray
 /*!
  * @brief Specialization for RAJA::reduce::detail::ValueLoc.
  */
-template <typename T, typename IndexType, bool doing_min, size_t size>
-class SoAArray< ::RAJA::reduce::detail::ValueLoc<T, IndexType, doing_min>, size>
+template<typename T, typename IndexType, bool doing_min, size_t size>
+class SoAArray<::RAJA::reduce::detail::ValueLoc<T, IndexType, doing_min>, size>
 {
-  using value_type = ::RAJA::reduce::detail::ValueLoc<T, IndexType, doing_min>;
-  using first_type = T;
+  using value_type  = ::RAJA::reduce::detail::ValueLoc<T, IndexType, doing_min>;
+  using first_type  = T;
   using second_type = IndexType;
 
 public:
@@ -63,9 +64,10 @@ class SoAArray< ::RAJA::reduce::detail::ValueLoc<T, IndexType, doing_min>, size>
   {
     return value_type(mem[i], mem_idx[i]);
   }
+
   RAJA_HOST_DEVICE void set(size_t i, value_type val)
   {
-    mem[i] = val;
+    mem[i]     = val;
     mem_idx[i] = val.getLoc();
   }
 
diff --git a/include/RAJA/util/SoAPtr.hpp b/include/RAJA/util/SoAPtr.hpp
index 6adea65b80..dfaef29739 100644
--- a/include/RAJA/util/SoAPtr.hpp
+++ b/include/RAJA/util/SoAPtr.hpp
@@ -42,38 +42,38 @@ namespace detail
  * This is useful for creating a vectorizable data layout and getting
  * coalesced memory accesses or avoiding shared memory bank conflicts in cuda.
  */
-template <typename T,
-          typename mempool = RAJA::basic_mempool::MemPool<
-              RAJA::basic_mempool::generic_allocator>,
-          typename accessor = DefaultAccessor >
+template<typename T,
+         typename mempool = RAJA::basic_mempool::MemPool<
+             RAJA::basic_mempool::generic_allocator>,
+         typename accessor = DefaultAccessor>
 class SoAPtr
 {
-  template < typename, typename, typename >
-  friend class SoAPtr; // friend other instantiations of this class
+  template<typename, typename, typename>
+  friend class SoAPtr;  // friend other instantiations of this class
 
 public:
   using value_type = T;
 
-  template < typename rhs_accessor >
+  template<typename rhs_accessor>
   using rebind_accessor = SoAPtr<T, mempool, rhs_accessor>;
 
-  SoAPtr() = default;
-  SoAPtr(SoAPtr const&) = default;
-  SoAPtr(SoAPtr &&) = default;
+  SoAPtr()                         = default;
+  SoAPtr(SoAPtr const&)            = default;
+  SoAPtr(SoAPtr&&)                 = default;
   SoAPtr& operator=(SoAPtr const&) = default;
-  SoAPtr& operator=(SoAPtr &&) = default;
+  SoAPtr& operator=(SoAPtr&&)      = default;
 
   explicit SoAPtr(size_t size)
       : mem(mempool::getInstance().template malloc<value_type>(size))
-  {
-  }
+  {}
 
-  template < typename rhs_accessor,
-             std::enable_if_t<!std::is_same<accessor, rhs_accessor>::value>* = nullptr >
-  RAJA_HOST_DEVICE
-  explicit SoAPtr(SoAPtr<value_type, mempool, rhs_accessor> const& rhs)
-    : mem(rhs.mem)
-  { }
+  template<
+      typename rhs_accessor,
+      std::enable_if_t<!std::is_same<accessor, rhs_accessor>::value>* = nullptr>
+  RAJA_HOST_DEVICE explicit SoAPtr(
+      SoAPtr<value_type, mempool, rhs_accessor> const& rhs)
+      : mem(rhs.mem)
+  {}
 
   SoAPtr& allocate(size_t size)
   {
@@ -90,8 +90,15 @@ class SoAPtr
 
   RAJA_HOST_DEVICE bool allocated() const { return mem != nullptr; }
 
-  RAJA_HOST_DEVICE value_type get(size_t i) const { return accessor::get(mem, i); }
-  RAJA_HOST_DEVICE void set(size_t i, value_type val) { accessor::set(mem, i, val); }
+  RAJA_HOST_DEVICE value_type get(size_t i) const
+  {
+    return accessor::get(mem, i);
+  }
+
+  RAJA_HOST_DEVICE void set(size_t i, value_type val)
+  {
+    accessor::set(mem, i, val);
+  }
 
 private:
   value_type* mem = nullptr;
@@ -100,44 +107,50 @@ class SoAPtr
 /*!
  * @brief Specialization for RAJA::reduce::detail::ValueLoc.
  */
-template <typename T, typename IndexType, bool doing_min, typename mempool, typename accessor>
-class SoAPtr<RAJA::reduce::detail::ValueLoc<T, IndexType, doing_min>, mempool, accessor>
+template<typename T,
+         typename IndexType,
+         bool doing_min,
+         typename mempool,
+         typename accessor>
+class SoAPtr<RAJA::reduce::detail::ValueLoc<T, IndexType, doing_min>,
+             mempool,
+             accessor>
 {
-  using first_type = T;
+  using first_type  = T;
   using second_type = IndexType;
 
-  template < typename, typename, typename >
-  friend class SoAPtr; // fiend other instantiations of this class
+  template<typename, typename, typename>
+  friend class SoAPtr;  // friend other instantiations of this class
 
 public:
   using value_type = RAJA::reduce::detail::ValueLoc<T, IndexType, doing_min>;
 
-  template < typename rhs_accessor >
+  template<typename rhs_accessor>
   using rebind_accessor = SoAPtr<value_type, mempool, rhs_accessor>;
 
-  SoAPtr() = default;
-  SoAPtr(SoAPtr const&) = default;
-  SoAPtr(SoAPtr &&) = default;
+  SoAPtr()                         = default;
+  SoAPtr(SoAPtr const&)            = default;
+  SoAPtr(SoAPtr&&)                 = default;
   SoAPtr& operator=(SoAPtr const&) = default;
-  SoAPtr& operator=(SoAPtr &&) = default;
+  SoAPtr& operator=(SoAPtr&&)      = default;
 
   explicit SoAPtr(size_t size)
       : mem(mempool::getInstance().template malloc<first_type>(size)),
         mem_idx(mempool::getInstance().template malloc<second_type>(size))
-  {
-  }
+  {}
 
-  template < typename rhs_accessor,
-             std::enable_if_t<!std::is_same<accessor, rhs_accessor>::value>* = nullptr >
-  RAJA_HOST_DEVICE
-  explicit SoAPtr(SoAPtr<value_type, mempool, rhs_accessor> const& rhs)
-    : mem(rhs.mem)
-    , mem_idx(rhs.mem_idx)
-  { }
+  template<
+      typename rhs_accessor,
+      std::enable_if_t<!std::is_same<accessor, rhs_accessor>::value>* = nullptr>
+  RAJA_HOST_DEVICE explicit SoAPtr(
+      SoAPtr<value_type, mempool, rhs_accessor> const& rhs)
+      : mem(rhs.mem),
+        mem_idx(rhs.mem_idx)
+  {}
 
   SoAPtr& allocate(size_t size)
   {
-    mem = mempool::getInstance().template malloc<first_type>(size);
+    mem     = mempool::getInstance().template malloc<first_type>(size);
     mem_idx = mempool::getInstance().template malloc<second_type>(size);
     return *this;
   }
@@ -157,6 +170,7 @@ class SoAPtr<RAJA::reduce::detail::ValueLoc<T, IndexType, doing_min>, mempool, a
   {
     return value_type(accessor::get(mem, i), accessor::get(mem_idx, i));
   }
+
   RAJA_HOST_DEVICE void set(size_t i, value_type val)
   {
     accessor::set(mem, i, first_type(val));
@@ -164,51 +178,51 @@ class SoAPtr<RAJA::reduce::detail::ValueLoc<T, IndexType, doing_min>, mempool, a
   }
 
 private:
-  first_type* mem = nullptr;
+  first_type* mem      = nullptr;
   second_type* mem_idx = nullptr;
 };
 
 /*!
  * @brief Specialization for RAJA::expt::ValLoc.
  */
-template <typename T, typename IndexType, typename mempool, typename accessor>
+template<typename T, typename IndexType, typename mempool, typename accessor>
 class SoAPtr<RAJA::expt::ValLoc<T, IndexType>, mempool, accessor>
 {
-  using first_type = T;
+  using first_type  = T;
   using second_type = IndexType;
 
-  template < typename, typename, typename >
-  friend class SoAPtr; // friend other instantiations of this class
+  template<typename, typename, typename>
+  friend class SoAPtr;  // friend other instantiations of this class
 
 public:
   using value_type = RAJA::expt::ValLoc<T, IndexType>;
 
-  template < typename rhs_accessor >
+  template<typename rhs_accessor>
   using rebind_accessor = SoAPtr<value_type, mempool, rhs_accessor>;
 
-  SoAPtr() = default;
-  SoAPtr(SoAPtr const&) = default;
-  SoAPtr(SoAPtr &&) = default;
+  SoAPtr()                         = default;
+  SoAPtr(SoAPtr const&)            = default;
+  SoAPtr(SoAPtr&&)                 = default;
   SoAPtr& operator=(SoAPtr const&) = default;
-  SoAPtr& operator=(SoAPtr &&) = default;
+  SoAPtr& operator=(SoAPtr&&)      = default;
 
   explicit SoAPtr(size_t size)
       : mem(mempool::getInstance().template malloc<first_type>(size)),
         mem_idx(mempool::getInstance().template malloc<second_type>(size))
-  {
-  }
+  {}
 
-  template < typename rhs_accessor,
-             std::enable_if_t<!std::is_same<accessor, rhs_accessor>::value>* = nullptr >
-  RAJA_HOST_DEVICE
-  explicit SoAPtr(SoAPtr<value_type, mempool, rhs_accessor> const& rhs)
-    : mem(rhs.mem)
-    , mem_idx(rhs.mem_idx)
-  { }
+  template<
+      typename rhs_accessor,
+      std::enable_if_t<!std::is_same<accessor, rhs_accessor>::value>* = nullptr>
+  RAJA_HOST_DEVICE explicit SoAPtr(
+      SoAPtr<value_type, mempool, rhs_accessor> const& rhs)
+      : mem(rhs.mem),
+        mem_idx(rhs.mem_idx)
+  {}
 
   SoAPtr& allocate(size_t size)
   {
-    mem = mempool::getInstance().template malloc<first_type>(size);
+    mem     = mempool::getInstance().template malloc<first_type>(size);
     mem_idx = mempool::getInstance().template malloc<second_type>(size);
     return *this;
   }
@@ -228,6 +242,7 @@ class SoAPtr<RAJA::expt::ValLoc<T, IndexType>, mempool, accessor>
   {
     return value_type(accessor::get(mem, i), accessor::get(mem_idx, i));
   }
+
   RAJA_HOST_DEVICE void set(size_t i, value_type val)
   {
     accessor::set(mem, i, val.getVal());
@@ -235,7 +250,7 @@ class SoAPtr<RAJA::expt::ValLoc<T, IndexType>, mempool, accessor>
   }
 
 private:
-  first_type* mem = nullptr;
+  first_type* mem      = nullptr;
   second_type* mem_idx = nullptr;
 };
 
diff --git a/include/RAJA/util/Span.hpp b/include/RAJA/util/Span.hpp
index 2da2e0164c..ba3be9dc4e 100644
--- a/include/RAJA/util/Span.hpp
+++ b/include/RAJA/util/Span.hpp
@@ -55,16 +55,17 @@ namespace RAJA
  *   compile time extents
  *
  */
-template <typename IterType, typename IndexType>
-struct Span {
-  using element_type = typename std::iterator_traits<IterType>::value_type;
-  using value_type = camp::decay<element_type>;
-  using size_type = IndexType;
+template<typename IterType, typename IndexType>
+struct Span
+{
+  using element_type    = typename std::iterator_traits<IterType>::value_type;
+  using value_type      = camp::decay<element_type>;
+  using size_type       = IndexType;
   using difference_type = std::ptrdiff_t;
-  using reference = element_type&;
+  using reference       = element_type&;
   using const_reference = const element_type&;
-  using iterator = IterType;
-  using const_iterator = IterType;
+  using iterator        = IterType;
+  using const_iterator  = IterType;
 
   static_assert(type_traits::is_integral<IndexType>::value,
                 "IndexType must model Integral");
@@ -72,32 +73,63 @@ struct Span {
                 "IterType must model RandomAccessIterator");
 
   RAJA_HOST_DEVICE Span(iterator begin, iterator end)
-      : m_begin{begin}, m_end{end}
-  {
-  }
+      : m_begin {begin},
+        m_end {end}
+  {}
 
   RAJA_HOST_DEVICE Span(iterator begin, size_type size)
-      : m_begin{begin}, m_end{begin + size}
-  {
-  }
+      : m_begin {begin},
+        m_end {begin + size}
+  {}
 
   RAJA_HOST_DEVICE RAJA_INLINE iterator begin() { return m_begin; }
+
   RAJA_HOST_DEVICE RAJA_INLINE iterator end() { return m_end; }
+
   RAJA_HOST_DEVICE RAJA_INLINE const_iterator begin() const { return m_begin; }
+
   RAJA_HOST_DEVICE RAJA_INLINE const_iterator end() const { return m_end; }
+
   RAJA_HOST_DEVICE RAJA_INLINE const_iterator cbegin() const { return m_begin; }
+
   RAJA_HOST_DEVICE RAJA_INLINE const_iterator cend() const { return m_end; }
 
-  RAJA_HOST_DEVICE RAJA_INLINE friend iterator begin(Span& s) { return s.begin(); }
+  RAJA_HOST_DEVICE RAJA_INLINE friend iterator begin(Span& s)
+  {
+    return s.begin();
+  }
+
   RAJA_HOST_DEVICE RAJA_INLINE friend iterator end(Span& s) { return s.end(); }
-  RAJA_HOST_DEVICE RAJA_INLINE friend const_iterator begin(const Span& s) { return s.begin(); }
-  RAJA_HOST_DEVICE RAJA_INLINE friend const_iterator end(const Span& s) { return s.end(); }
-  RAJA_HOST_DEVICE RAJA_INLINE friend const_iterator cbegin(const Span& s) { return s.cbegin(); }
-  RAJA_HOST_DEVICE RAJA_INLINE friend const_iterator cend(const Span& s) { return s.cend(); }
+
+  RAJA_HOST_DEVICE RAJA_INLINE friend const_iterator begin(const Span& s)
+  {
+    return s.begin();
+  }
+
+  RAJA_HOST_DEVICE RAJA_INLINE friend const_iterator end(const Span& s)
+  {
+    return s.end();
+  }
+
+  RAJA_HOST_DEVICE RAJA_INLINE friend const_iterator cbegin(const Span& s)
+  {
+    return s.cbegin();
+  }
+
+  RAJA_HOST_DEVICE RAJA_INLINE friend const_iterator cend(const Span& s)
+  {
+    return s.cend();
+  }
 
   RAJA_HOST_DEVICE RAJA_INLINE reference front() const { return *begin(); }
-  RAJA_HOST_DEVICE RAJA_INLINE reference back() const { return *(end()-1); }
-  RAJA_HOST_DEVICE RAJA_INLINE reference operator[](size_type i) const { return data()[i]; }
+
+  RAJA_HOST_DEVICE RAJA_INLINE reference back() const { return *(end() - 1); }
+
+  RAJA_HOST_DEVICE RAJA_INLINE reference operator[](size_type i) const
+  {
+    return data()[i];
+  }
+
   RAJA_HOST_DEVICE RAJA_INLINE iterator data() const { return m_begin; }
 
   RAJA_HOST_DEVICE RAJA_INLINE size_type size() const
@@ -114,20 +146,23 @@ struct Span {
   {
     return slice(0, count);
   }
+
   RAJA_HOST_DEVICE RAJA_INLINE Span last(size_type count) const
   {
     return slice(size() - count, count);
   }
+
   RAJA_HOST_DEVICE RAJA_INLINE Span subspan(size_type begin,
                                             size_type length) const
   {
     return slice(begin, length);
   }
+
   RAJA_HOST_DEVICE RAJA_INLINE Span slice(size_type begin,
                                           size_type length) const
   {
     auto start = m_begin + begin;
-    auto end = start + length > m_end ? m_end : start + length;
+    auto end   = start + length > m_end ? m_end : start + length;
     return Span(start, end);
   }
 
@@ -156,22 +191,22 @@ struct Span {
  *     RAJA::inclusive_scan_inplace<policy>(my_span);
  *
  */
-template <typename IterType, typename IndexType>
-RAJA_HOST_DEVICE RAJA_INLINE Span<IterType, IndexType> make_span(
-    IterType begin,
-    IndexType size)
+template<typename IterType, typename IndexType>
+RAJA_HOST_DEVICE RAJA_INLINE Span<IterType, IndexType> make_span(IterType begin,
+                                                                 IndexType size)
 {
   return Span<IterType, IndexType>(begin, size);
 }
 
-template <typename Iter>
-RAJA_INLINE auto make_span(Iter &iterable)
+template<typename Iter>
+RAJA_INLINE auto make_span(Iter& iterable)
 {
   using std::begin;
-  using std::end;
   using std::distance;
-  return Span<typename Iter::iterator, decltype(distance(begin(iterable), end(iterable)))>
-    (begin(iterable), end(iterable));
+  using std::end;
+  return Span<typename Iter::iterator,
+              decltype(distance(begin(iterable), end(iterable)))>(
+      begin(iterable), end(iterable));
 }
 
 }  // end namespace RAJA
diff --git a/include/RAJA/util/StaticLayout.hpp b/include/RAJA/util/StaticLayout.hpp
index 8d27980f83..839a091eeb 100644
--- a/include/RAJA/util/StaticLayout.hpp
+++ b/include/RAJA/util/StaticLayout.hpp
@@ -31,8 +31,6 @@
 #include "RAJA/util/Operators.hpp"
 #include "RAJA/util/Permutations.hpp"
 
-
-
 namespace RAJA
 {
 
@@ -40,27 +38,31 @@ namespace detail
 {
 
 
-template <typename IdxLin, typename Range, typename Sizes, typename Strides, typename DimTypeList=void>
+template<typename IdxLin,
+         typename Range,
+         typename Sizes,
+         typename Strides,
+         typename DimTypeList = void>
 struct StaticLayoutBase_impl;
 
-
-template <typename IdxLin,
-          IdxLin... RangeInts,
-          IdxLin... Sizes,
-          IdxLin... Strides>
+template<typename IdxLin,
+         IdxLin... RangeInts,
+         IdxLin... Sizes,
+         IdxLin... Strides>
 struct StaticLayoutBase_impl<IdxLin,
                              camp::int_seq<IdxLin, RangeInts...>,
                              camp::int_seq<IdxLin, Sizes...>,
                              camp::int_seq<IdxLin, Strides...>,
-                             void> {
+                             void>
+{
 
   using IndexLinear = IdxLin;
-  using sizes = camp::int_seq<IdxLin, Sizes...>;
-  using strides = camp::int_seq<IdxLin, Strides...>;
+  using sizes       = camp::int_seq<IdxLin, Sizes...>;
+  using strides     = camp::int_seq<IdxLin, Strides...>;
 
-  static constexpr camp::idx_t stride_one_dim =
-      RAJA::max<camp::idx_t>(
-          (camp::seq_at<RangeInts, strides>::value == 1 ? camp::idx_t(RangeInts) : -1)...);
+  static constexpr camp::idx_t stride_one_dim = RAJA::max<camp::idx_t>(
+      (camp::seq_at<RangeInts, strides>::value == 1 ? camp::idx_t(RangeInts)
+                                                    : -1)...);
 
   static constexpr size_t n_dims = sizeof...(Sizes);
 
@@ -72,12 +74,9 @@ struct StaticLayoutBase_impl<IdxLin,
   RAJA_INLINE static void print()
   {
     camp::sink(printf("StaticLayout: arg%d: size=%d, stride=%d\n",
-                               (int)RangeInts,
-                               (int)Sizes,
-                               (int)Strides)...);
+                      (int)RangeInts, (int)Sizes, (int)Strides)...);
   }
 
-
   /*!
    * Computes a linear space index from specified indices.
    * This is formed by the dot product of the indices and the layout strides.
@@ -85,7 +84,7 @@ struct StaticLayoutBase_impl<IdxLin,
    * @param indices  Indices in the n-dimensional space of this layout
    * @return Linear space index.
    */
-  template <typename... Indices>
+  template<typename... Indices>
   RAJA_INLINE RAJA_HOST_DEVICE constexpr IdxLin operator()(
       Indices... indices) const
   {
@@ -93,23 +92,21 @@ struct StaticLayoutBase_impl<IdxLin,
     return RAJA::sum<IdxLin>((IdxLin(indices * Strides))...);
   }
 
-
-  template <typename... Indices>
-  static RAJA_INLINE RAJA_HOST_DEVICE constexpr IdxLin s_oper(Indices... indices)
+  template<typename... Indices>
+  static RAJA_INLINE RAJA_HOST_DEVICE constexpr IdxLin s_oper(
+      Indices... indices)
   {
     // dot product of strides and indices
     return RAJA::sum<IdxLin>((IdxLin(indices * Strides))...);
   }
 
-
   // Multiply together all of the sizes,
   // replacing 1 for any zero-sized dimensions
   static constexpr IdxLin s_size =
       RAJA::product<IdxLin>((Sizes == IdxLin(0) ? IdxLin(1) : Sizes)...);
 
   // Multiply together all of the sizes
-  static constexpr IdxLin s_size_noproj =
-      RAJA::product<IdxLin>(Sizes...);
+  static constexpr IdxLin s_size_noproj = RAJA::product<IdxLin>(Sizes...);
 
   /*!
    * Computes a size of the layout's space with projections as size 1.
@@ -136,38 +133,31 @@ struct StaticLayoutBase_impl<IdxLin,
     return s_size_noproj;
   }
 
-
   template<camp::idx_t DIM>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr
-  IndexLinear get_dim_stride() const {
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear get_dim_stride() const
+  {
     return camp::seq_at<DIM, strides>::value;
   }
 
   template<camp::idx_t DIM>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr
-  IndexLinear get_dim_size() const {
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear get_dim_size() const
+  {
     return camp::seq_at<DIM, sizes>::value;
   }
 
   template<camp::idx_t DIM>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr
-  IndexLinear get_dim_begin() const {
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear get_dim_begin() const
+  {
     return 0;
   }
-
 };
 
-template <typename IdxLin, IdxLin N, IdxLin Idx, IdxLin... Sizes>
-struct StrideCalculatorIdx {
+template<typename IdxLin, IdxLin N, IdxLin Idx, IdxLin... Sizes>
+struct StrideCalculatorIdx
+{
   static_assert(N == sizeof...(Sizes), "");
 
-  using sizes_seq = camp::int_seq<IdxLin, Sizes...>;
+  using sizes_seq              = camp::int_seq<IdxLin, Sizes...>;
   static constexpr IdxLin size = camp::seq_at<Idx, sizes_seq>::value;
   static constexpr IdxLin size_last =
       StrideCalculatorIdx<IdxLin, N, Idx + 1, Sizes...>::size;
@@ -177,63 +167,71 @@ struct StrideCalculatorIdx {
   static constexpr IdxLin stride = size > 0 ? value : 0;
 };
 
-template <typename IdxLin, IdxLin N, IdxLin... Sizes>
-struct StrideCalculatorIdx<IdxLin, N, N, Sizes...> {
+template<typename IdxLin, IdxLin N, IdxLin... Sizes>
+struct StrideCalculatorIdx<IdxLin, N, N, Sizes...>
+{
   static_assert(N == sizeof...(Sizes), "");
 
-  static constexpr IdxLin size = 1;
-  static constexpr IdxLin value = 1;
+  static constexpr IdxLin size   = 1;
+  static constexpr IdxLin value  = 1;
   static constexpr IdxLin stride = size > 0 ? value : 0;
 };
 
-template <typename IdxLin, typename Range, typename Perm, typename Sizes>
+template<typename IdxLin, typename Range, typename Perm, typename Sizes>
 struct StrideCalculator;
 
-template <typename IdxLin, IdxLin ... Range, camp::idx_t... Perm, IdxLin... Sizes>
+template<typename IdxLin, IdxLin... Range, camp::idx_t... Perm, IdxLin... Sizes>
 struct StrideCalculator<IdxLin,
                         camp::int_seq<IdxLin, Range...>,
                         camp::idx_seq<Perm...>,
-                        camp::int_seq<IdxLin, Sizes...>> {
+                        camp::int_seq<IdxLin, Sizes...>>
+{
   static_assert(sizeof...(Sizes) == sizeof...(Perm), "");
 
-  using sizes = camp::int_seq<IdxLin, Sizes...>;
+  using sizes               = camp::int_seq<IdxLin, Sizes...>;
   static constexpr IdxLin N = sizeof...(Sizes);
-  using range = camp::int_seq<IdxLin, Range...>;
-  using perm = camp::idx_seq<Perm...>;
-  using inv_perm = invert_permutation<perm>;
-
-  using strides_unperm =
-      camp::int_seq<IdxLin, StrideCalculatorIdx<IdxLin, N, Range, camp::seq_at<Perm, sizes>::value...>::stride...>;
-
-  using strides = camp::int_seq<IdxLin, camp::seq_at<camp::seq_at<Range, inv_perm>::value, strides_unperm>::value...>;
+  using range               = camp::int_seq<IdxLin, Range...>;
+  using perm                = camp::idx_seq<Perm...>;
+  using inv_perm            = invert_permutation<perm>;
+
+  using strides_unperm = camp::int_seq<
+      IdxLin,
+      StrideCalculatorIdx<IdxLin,
+                          N,
+                          Range,
+                          camp::seq_at<Perm, sizes>::value...>::stride...>;
+
+  using strides =
+      camp::int_seq<IdxLin,
+                    camp::seq_at<camp::seq_at<Range, inv_perm>::value,
+                                 strides_unperm>::value...>;
 };
 
-
-
-template <typename IdxLin,
-          IdxLin... RangeInts,
-          IdxLin... Sizes,
-          IdxLin... Strides,
-          typename... DimTypes>
+template<typename IdxLin,
+         IdxLin... RangeInts,
+         IdxLin... Sizes,
+         IdxLin... Strides,
+         typename... DimTypes>
 struct StaticLayoutBase_impl<IdxLin,
                              camp::int_seq<IdxLin, RangeInts...>,
                              camp::int_seq<IdxLin, Sizes...>,
                              camp::int_seq<IdxLin, Strides...>,
-                             camp::list<DimTypes...>> {
+                             camp::list<DimTypes...>>
+{
 
 
   using IndexLinear = IdxLin;
   using ranges      = camp::int_seq<IdxLin, RangeInts...>;
   using sizes       = camp::int_seq<IdxLin, Sizes...>;
-  using strides     = camp::int_seq<IdxLin, Strides...>;  
+  using strides     = camp::int_seq<IdxLin, Strides...>;
 
-  using InnerLayout = StaticLayoutBase_impl<IdxLin,ranges,sizes,strides,void>;
+  using InnerLayout =
+      StaticLayoutBase_impl<IdxLin, ranges, sizes, strides, void>;
 
-  static
-  constexpr
-  camp::idx_t stride_one_dim = InnerLayout::stride_one_dim;
+  static constexpr camp::idx_t stride_one_dim = InnerLayout::stride_one_dim;
 
   static constexpr IndexLinear n_dims = sizeof...(DimTypes);
+
   /*!
    * Computes a linear space index from specified indices.
    * This is formed by the dot product of the indices and the layout strides.
@@ -247,8 +245,7 @@ struct StaticLayoutBase_impl<IdxLin,
     return InnerLayout::s_oper(stripIndexType(indices)...);
   }
 
-
-  static constexpr IndexLinear s_size = InnerLayout::s_size;
+  static constexpr IndexLinear s_size        = InnerLayout::s_size;
   static constexpr IndexLinear s_size_noproj = InnerLayout::s_size_noproj;
 
   RAJA_INLINE RAJA_HOST_DEVICE constexpr static IndexLinear size()
@@ -262,71 +259,63 @@ struct StaticLayoutBase_impl<IdxLin,
   }
 
   template<camp::idx_t DIM>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr
-  IndexLinear get_dim_stride() const {
-    return InnerLayout{}.get_dim_stride();
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear get_dim_stride() const
+  {  // DIM-th compile-time stride (cf. get_dim_size() for sizes)
+    return camp::seq_at<DIM, strides>::value;
   }
 
   template<camp::idx_t DIM>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr
-  IndexLinear get_dim_size() const {
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear get_dim_size() const
+  {
     return camp::seq_at<DIM, sizes>::value;
   }
 
   template<camp::idx_t DIM>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  constexpr
-  IndexLinear get_dim_begin() const {
+  RAJA_INLINE RAJA_HOST_DEVICE constexpr IndexLinear get_dim_begin() const
+  {
     return 0;
   }
 
-
   RAJA_INLINE
   static void print() { InnerLayout::print(); }
-
 };
 
-
-
-
-
-template <typename Perm, typename IdxLin, typename Sizes, typename Indexes, typename TypeList>
+template<typename Perm,
+         typename IdxLin,
+         typename Sizes,
+         typename Indexes,
+         typename TypeList>
 struct StaticLayoutMaker
 {
-  using strides = typename detail::StrideCalculator<IdxLin, Indexes, Perm, Sizes>::strides;
-  using type = StaticLayoutBase_impl<IdxLin, Indexes, Sizes, strides,TypeList>;
+  using strides =
+      typename detail::StrideCalculator<IdxLin, Indexes, Perm, Sizes>::strides;
+  using type = StaticLayoutBase_impl<IdxLin, Indexes, Sizes, strides, TypeList>;
 };
 
 
-
 }  // namespace detail
 
-
-template <typename Perm, typename IdxLin, camp::idx_t... Sizes>
+template<typename Perm, typename IdxLin, camp::idx_t... Sizes>
 using StaticLayoutT = typename detail::StaticLayoutMaker<
     Perm,
     IdxLin,
     camp::int_seq<IdxLin, Sizes...>,
     camp::make_int_seq_t<IdxLin, sizeof...(Sizes)>,
-    void
-    >::type;
+    void>::type;
 
-template <typename Perm, camp::idx_t... Sizes>
+template<typename Perm, camp::idx_t... Sizes>
 using StaticLayout = StaticLayoutT<Perm, camp::idx_t, Sizes...>;
 
-template <typename Perm, typename IdxLin, typename TypeList, camp::idx_t... Sizes>
+template<typename Perm,
+         typename IdxLin,
+         typename TypeList,
+         camp::idx_t... Sizes>
 using TypedStaticLayout = typename detail::StaticLayoutMaker<
     Perm,
     IdxLin,
     camp::int_seq<IdxLin, Sizes...>,
     camp::make_int_seq_t<IdxLin, sizeof...(Sizes)>,
-    TypeList
-    >::type;
+    TypeList>::type;
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/util/Timer.hpp b/include/RAJA/util/Timer.hpp
index 8c23a2c74d..6823926d08 100644
--- a/include/RAJA/util/Timer.hpp
+++ b/include/RAJA/util/Timer.hpp
@@ -51,7 +51,7 @@ class BGQTimer
   using ElapsedType = double;
 
 private:
-  using TimeType = timeval;
+  using TimeType     = timeval;
   using DurationType = std::chrono::duration<ElapsedType>;
 
 public:
@@ -104,14 +104,13 @@ class ChronoTimer
   using ElapsedType = double;
 
 private:
-  using ClockType = std::chrono::steady_clock;
-  using TimeType = ClockType::time_point;
+  using ClockType    = std::chrono::steady_clock;
+  using TimeType     = ClockType::time_point;
   using DurationType = std::chrono::duration<ElapsedType>;
 
 public:
   ChronoTimer() : tstart(ClockType::now()), tstop(ClockType::now()), telapsed(0)
-  {
-  }
+  {}
 
   void start() { tstart = ClockType::now(); }
 
@@ -174,7 +173,7 @@ class GettimeTimer
 
   void reset()
   {
-    stime_elapsed = 0;
+    stime_elapsed  = 0;
     nstime_elapsed = 0;
   }
 
@@ -200,6 +199,7 @@ using TimerBase = GettimeTimer;
 #elif defined(RAJA_USE_CLOCK)
 
 #include <time.h>
+
 namespace RAJA
 {
 
@@ -266,9 +266,11 @@ class Timer : public TimerBase
 
 #if defined(RAJA_USE_CALIPER)
   void start(const char* name) { cali::Annotation(name).begin(); }
+
   void stop(const char* name) { cali::Annotation(name).end(); }
 #else
   void start(const char*) { start(); }
+
   void stop(const char*) { stop(); }
 #endif
 };
diff --git a/include/RAJA/util/TypeConvert.hpp b/include/RAJA/util/TypeConvert.hpp
index 5cdc019259..c2cbe65732 100644
--- a/include/RAJA/util/TypeConvert.hpp
+++ b/include/RAJA/util/TypeConvert.hpp
@@ -28,7 +28,6 @@
 
 #include <string.h>
 
-
 namespace RAJA
 {
 namespace util
@@ -38,8 +37,8 @@ namespace util
 /*!
  * Reinterpret any datatype as another datatype of the same size
  */
-template <typename A, typename B>
-RAJA_INLINE RAJA_HOST_DEVICE constexpr B reinterp_A_as_B(A const &a)
+template<typename A, typename B>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr B reinterp_A_as_B(A const& a)
 {
   static_assert(sizeof(A) == sizeof(B), "A and B must be the same size");
 
diff --git a/include/RAJA/util/TypedViewBase.hpp b/include/RAJA/util/TypedViewBase.hpp
index 0d5bed35d6..22f1a5f010 100644
--- a/include/RAJA/util/TypedViewBase.hpp
+++ b/include/RAJA/util/TypedViewBase.hpp
@@ -38,705 +38,786 @@ namespace RAJA
 namespace internal
 {
 
-  template<camp::idx_t, typename T>
-  struct IndexToType{
-      using type = T;
-  };
-
-  template<typename IdxSeq, typename T>
-  struct SequenceToType;
-
-  template<camp::idx_t ... Perm, typename T>
-  struct SequenceToType<camp::idx_seq<Perm...>, T>{
-      using type =  camp::list<typename IndexToType<Perm, T>::type...>;
-  };
-
-  template<typename Perm>
-  using getDefaultIndexTypes = typename SequenceToType<Perm, RAJA::Index_type>::type;
-
-
+template<camp::idx_t, typename T>
+struct IndexToType
+{
+  using type = T;
+};
 
+template<typename IdxSeq, typename T>
+struct SequenceToType;
 
-  //Helpers to convert
-  //layouts -> OffsetLayouts
-  //Typedlayouts -> TypedOffsetLayouts
-  template<typename layout>
-  struct add_offset
-  {
-    using type = RAJA::OffsetLayout<layout::n_dims>;
-  };
+template<camp::idx_t... Perm, typename T>
+struct SequenceToType<camp::idx_seq<Perm...>, T>
+{
+  using type = camp::list<typename IndexToType<Perm, T>::type...>;
+};
 
-  template<typename IdxLin, typename...DimTypes>
-  struct add_offset<RAJA::TypedLayout<IdxLin,camp::tuple<DimTypes...>>>
-  {
-    using type = RAJA::TypedOffsetLayout<IdxLin,camp::tuple<DimTypes...>>;
-  };
+template<typename Perm>
+using getDefaultIndexTypes =
+    typename SequenceToType<Perm, RAJA::Index_type>::type;
 
+// Helpers to convert
+// Layouts -> OffsetLayouts
+// TypedLayouts -> TypedOffsetLayouts
+template<typename layout>
+struct add_offset
+{
+  using type = RAJA::OffsetLayout<layout::n_dims>;
+};
 
+template<typename IdxLin, typename... DimTypes>
+struct add_offset<RAJA::TypedLayout<IdxLin, camp::tuple<DimTypes...>>>
+{
+  using type = RAJA::TypedOffsetLayout<IdxLin, camp::tuple<DimTypes...>>;
+};
 
 
 #if defined(RAJA_ENABLE_VECTORIZATION)
-  namespace detail
-  {
-    /*
-     * Returns the argument number which contains a VectorIndex
-     *
-     * returns -1 if none of the arguments are VectorIndexs
-     */
+namespace detail
+{
+/*
+ * Yields the position, within ARGS, of the argument that is a tensor index
+ * for dimension DIM; candidate positions are combined with RAJA::max.
+ * Yields -1 if no argument is a tensor index for dimension DIM.
+ */
 
-    template<camp::idx_t DIM, typename ARGS, typename IDX_SEQ>
-    struct GetTensorArgIdxExpanded;
+template<camp::idx_t DIM, typename ARGS, typename IDX_SEQ>
+struct GetTensorArgIdxExpanded;
 
-    template<camp::idx_t DIM, typename ... ARGS, camp::idx_t ... IDX>
-    struct GetTensorArgIdxExpanded<DIM, camp::list<ARGS...>, camp::idx_seq<IDX...>> {
+template<camp::idx_t DIM, typename... ARGS, camp::idx_t... IDX>
+struct GetTensorArgIdxExpanded<DIM, camp::list<ARGS...>, camp::idx_seq<IDX...>>
+{
 
-        static constexpr camp::idx_t value =
-            RAJA::max<camp::idx_t>(
-                (internal::expt::isTensorIndex<ARGS>()&&internal::expt::getTensorDim<ARGS>()==DIM ? IDX : -1) ...);
-    };
+  static constexpr camp::idx_t value = RAJA::max<camp::idx_t>(
+      (internal::expt::isTensorIndex<ARGS>() &&
+               internal::expt::getTensorDim<ARGS>() == DIM
+           ? IDX
+           : -1)...);
+};
 
 
-  } // namespace detail
+}  // namespace detail
 #endif
 
 
-
-  /*
-   * Returns the number of arguments which are VectorIndexs
-   */
-  template<typename ... ARGS>
-  struct count_num_tensor_args{
-    static constexpr camp::idx_t value =
+/*
+ * Counts the ARGS that are tensor indices (0 when vectorization is disabled)
+ */
+template<typename... ARGS>
+struct count_num_tensor_args
+{
+  static constexpr camp::idx_t value =
 #if defined(RAJA_ENABLE_VECTORIZATION)
-        RAJA::sum<camp::idx_t>(
-            (internal::expt::isTensorIndex<ARGS>() ? 1 : 0) ...);
+      RAJA::sum<camp::idx_t>(
+          (internal::expt::isTensorIndex<ARGS>() ? 1 : 0)...);
 #else
-        0;  // There should be 0 Tensor indices if not vectorizing.
+      0;  // There should be 0 Tensor indices if not vectorizing.
 #endif
-  };
-  
+};
+
 #if defined(RAJA_ENABLE_VECTORIZATION)
-  /*
-   * Returns which argument has a vector index
-   */
-  template<camp::idx_t DIM, typename ... ARGS>
-  struct GetTensorArgIdx{
-      static constexpr camp::idx_t value =
-          detail::GetTensorArgIdxExpanded<DIM, camp::list<ARGS...>, camp::make_idx_seq_t<sizeof...(ARGS)> >:: value;
-  };
-
-  template<camp::idx_t DIM, typename ... ARGS>
-  struct GetTensorArgIdx<DIM,camp::list<ARGS...>>{
-      static constexpr camp::idx_t value =
-          detail::GetTensorArgIdxExpanded<DIM, camp::list<ARGS...>, camp::make_idx_seq_t<sizeof...(ARGS)> >:: value;
-  };
+/*
+ * Yields the position within ARGS of the tensor index for dimension DIM
+ */
+template<camp::idx_t DIM, typename... ARGS>
+struct GetTensorArgIdx
+{
+  static constexpr camp::idx_t value = detail::GetTensorArgIdxExpanded<
+      DIM,
+      camp::list<ARGS...>,
+      camp::make_idx_seq_t<sizeof...(ARGS)>>::value;
+};
 
-  /*
-   * Returns the beginning index in a vector argument
-   */
-  template<camp::idx_t DIM, typename LAYOUT, typename ... ARGS>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  static constexpr camp::idx_t get_tensor_args_begin(LAYOUT const &layout, ARGS ... args){
-    return RAJA::max<camp::idx_t>(
-        internal::expt::getTensorDim<ARGS>()==DIM
-        ? internal::expt::getTensorBegin<ARGS>(args, layout.template get_dim_begin<GetTensorArgIdx<DIM, ARGS...>::value>())
-        : 0 ...);
-  }
+template<camp::idx_t DIM, typename... ARGS>
+struct GetTensorArgIdx<DIM, camp::list<ARGS...>>
+{
+  static constexpr camp::idx_t value = detail::GetTensorArgIdxExpanded<
+      DIM,
+      camp::list<ARGS...>,
+      camp::make_idx_seq_t<sizeof...(ARGS)>>::value;
+};
 
-  /*
-   * Returns the number of elements in the vector argument
-   */
-  template<camp::idx_t DIM, typename LAYOUT, typename ... ARGS>
-  RAJA_INLINE
-  RAJA_HOST_DEVICE
-  static constexpr camp::idx_t get_tensor_args_size(LAYOUT const &layout, ARGS ... args){
-    return RAJA::max<camp::idx_t>(
-        internal::expt::getTensorDim<ARGS>()==DIM
-        ? internal::expt::getTensorSize<ARGS>(args, layout.template get_dim_size<GetTensorArgIdx<DIM, ARGS...>::value>())
-        : 0 ...);
-  }
+/*
+ * Returns the beginning index in a vector argument
+ */
+template<camp::idx_t DIM, typename LAYOUT, typename... ARGS>
+RAJA_INLINE RAJA_HOST_DEVICE static constexpr camp::idx_t get_tensor_args_begin(
+    LAYOUT const& layout,
+    ARGS... args)
+{
+  return RAJA::max<camp::idx_t>(
+      internal::expt::getTensorDim<ARGS>() == DIM
+          ? internal::expt::getTensorBegin<ARGS>(
+                args, layout.template get_dim_begin<
+                          GetTensorArgIdx<DIM, ARGS...>::value>())
+          : 0 ...);
+}
+
+/*
+ * Returns the number of elements in the vector argument
+ */
+template<camp::idx_t DIM, typename LAYOUT, typename... ARGS>
+RAJA_INLINE RAJA_HOST_DEVICE static constexpr camp::idx_t get_tensor_args_size(
+    LAYOUT const& layout,
+    ARGS... args)
+{
+  return RAJA::max<camp::idx_t>(
+      internal::expt::getTensorDim<ARGS>() == DIM
+          ? internal::expt::getTensorSize<ARGS>(
+                args, layout.template get_dim_size<
+                          GetTensorArgIdx<DIM, ARGS...>::value>())
+          : 0 ...);
+}
 #endif
 
 
-  namespace detail {
+namespace detail
+{
 
-  /*!
-   * Provides conversion of view data to a return type.
-   *
-   * For scalars, this just returns the scalar.
-   *
-   * In the future development, this may return SIMD vectors or matrices using
-   * class specializations.
-   */
-  template<typename VecSeq, typename Args, typename ElementType, typename PointerType, typename LinIdx, typename LayoutType>
-  struct ViewReturnHelper;
+/*!
+ * Provides conversion of view data to a return type.
+ *
+ * For scalars, this just returns the scalar.
+ *
+ * Future development may return SIMD vectors or matrices through class
+ * specializations.
+ */
+template<typename VecSeq,
+         typename Args,
+         typename ElementType,
+         typename PointerType,
+         typename LinIdx,
+         typename LayoutType>
+struct ViewReturnHelper;
+
+/*
+ * Specialization for Scalar return types
+ */
+template<typename... Args,
+         typename ElementType,
+         typename PointerType,
+         typename LinIdx,
+         typename LayoutType>
+struct ViewReturnHelper<camp::idx_seq<>,
+                        camp::list<Args...>,
+                        ElementType,
+                        PointerType,
+                        LinIdx,
+                        LayoutType>
+{
+  using return_type = ElementType&;
 
+  RAJA_INLINE
 
-  /*
-   * Specialization for Scalar return types
-   */
-  template<typename ... Args, typename ElementType, typename PointerType, typename LinIdx, typename LayoutType>
-  struct ViewReturnHelper<camp::idx_seq<>, camp::list<Args...>, ElementType, PointerType, LinIdx, LayoutType>
+  RAJA_HOST_DEVICE
+  static constexpr return_type make_return(LayoutType const& layout,
+                                           PointerType const& data,
+                                           Args const&... args)
   {
-      using return_type = ElementType &;
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      constexpr
-      return_type make_return(LayoutType const &layout, PointerType const &data, Args const &... args){
-        return data[stripIndexType(layout(args...))];
-      }
-  };
+    return data[stripIndexType(layout(args...))];
+  }
+};
 
 
 #if defined(RAJA_ENABLE_VECTORIZATION)
-  /*
-   * Specialization for Tensor return types
-   */
-  template<camp::idx_t VecHead, camp::idx_t ... VecSeq, typename ... Args, typename ElementType, typename PointerType, typename LinIdx, typename LayoutType>
-  struct ViewReturnHelper<camp::idx_seq<VecHead,VecSeq...>, camp::list<Args...>, ElementType, PointerType, LinIdx, LayoutType>
-  {
-
-      static constexpr camp::idx_t s_num_dims = sizeof...(VecSeq) + 1;
-
-      // This is the stride-one dimensions w.r.t. the tensor not the View
-      // For example:
-      //  For a vector, s_stride_one_dim is either 0 (packed) or -1 (strided)
-      //  For a matrix, s_stride_one_dim is either:
-      //                 -1 neither row nor column are packed
-      //                 0 rows are stride-one
-      //                 1 columns are stride-one
-      static constexpr camp::idx_t s_stride_one_dim =
-          RAJA::max<camp::idx_t>(
-                  (GetTensorArgIdx<VecHead,Args...>::value == LayoutType::stride_one_dim ? VecHead : -1 ),
-                  (GetTensorArgIdx<VecSeq, Args...>::value == LayoutType::stride_one_dim ? VecSeq  : -1 )...
-          );
-
-
-      using tensor_reg_type = typename camp::at_v<camp::list<Args...>, GetTensorArgIdx<0, Args...>::value>::tensor_type;
-      using ref_type = internal::expt::TensorRef<ElementType*, LinIdx, internal::expt::TENSOR_MULTIPLE, s_num_dims, s_stride_one_dim>;
-      using return_type = internal::expt::ET::TensorLoadStore<tensor_reg_type, ref_type>;
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      constexpr
-      return_type make_return(LayoutType const &layout, PointerType const &data, Args const &... args){
-
-        return return_type(ref_type{
-          // data pointer
-          &data[0] + layout(internal::expt::isTensorIndex<Args>() ? LinIdx{0} : (LinIdx)stripIndexType(internal::expt::stripTensorIndexByValue(args))...),
-          // strides
-          {
-              (LinIdx)layout.template get_dim_stride<GetTensorArgIdx<VecHead,Args...>::value>(),
-              (LinIdx)layout.template get_dim_stride<GetTensorArgIdx<VecSeq, Args...>::value>()...
-          },
-          // tile
-          {
-              // begin
-              {
-                  (LinIdx)(get_tensor_args_begin<VecHead>(layout, args...)),
-                  (LinIdx)(get_tensor_args_begin<VecSeq> (layout, args...))...
-              },
-
-              // size
-              {
-                  (LinIdx)get_tensor_args_size<VecHead>(layout, args...),
-                  (LinIdx)get_tensor_args_size<VecSeq> (layout, args...)...
-              }
-          }
-        });
-      }
-  };
-
-
-
+/*
+ * Specialization for Tensor return types
+ */
+template<camp::idx_t VecHead,
+         camp::idx_t... VecSeq,
+         typename... Args,
+         typename ElementType,
+         typename PointerType,
+         typename LinIdx,
+         typename LayoutType>
+struct ViewReturnHelper<camp::idx_seq<VecHead, VecSeq...>,
+                        camp::list<Args...>,
+                        ElementType,
+                        PointerType,
+                        LinIdx,
+                        LayoutType>
+{
 
+  static constexpr camp::idx_t s_num_dims = sizeof...(VecSeq) + 1;
+
+  // Stride-one dimension expressed w.r.t. the tensor, not the View: the
+  // tensor dimension whose View argument sits at LayoutType::stride_one_dim.
+  //  For a vector, s_stride_one_dim is either 0 (packed) or -1 (strided)
+  //  For a matrix, s_stride_one_dim is one of:
+  //                 -1 neither rows nor columns are packed
+  //                 0 rows are stride-one
+  //                 1 columns are stride-one
+  static constexpr camp::idx_t s_stride_one_dim = RAJA::max<camp::idx_t>(
+      (GetTensorArgIdx<VecHead, Args...>::value == LayoutType::stride_one_dim
+           ? VecHead
+           : -1),
+      (GetTensorArgIdx<VecSeq, Args...>::value == LayoutType::stride_one_dim
+           ? VecSeq
+           : -1)...);
+
+
+  using tensor_reg_type =
+      typename camp::at_v<camp::list<Args...>,
+                          GetTensorArgIdx<0, Args...>::value>::tensor_type;
+  using ref_type = internal::expt::TensorRef<ElementType*,
+                                             LinIdx,
+                                             internal::expt::TENSOR_MULTIPLE,
+                                             s_num_dims,
+                                             s_stride_one_dim>;
+  using return_type =
+      internal::expt::ET::TensorLoadStore<tensor_reg_type, ref_type>;
 
-  /*
-   * Specialization for Tensor return types and static layout types
-   */
-  template<
-      camp::idx_t VecHead, camp::idx_t ... VecSeq,
-      typename ... INDEX_TYPES,
-      typename ElementType, typename PointerType, typename LinIdx,
-      LinIdx... RangeInts, LinIdx... SizeInts, LinIdx... StrideInts,
-      typename DIM_LIST
-  >
-  struct ViewReturnHelper<
-      camp::idx_seq<VecHead,VecSeq...>,
-      camp::list<RAJA::expt::StaticTensorIndex<INDEX_TYPES>...>,
-      ElementType, PointerType,
-      LinIdx,
-      RAJA::detail::StaticLayoutBase_impl<
-          LinIdx,
-          camp::int_seq<LinIdx,RangeInts...>,
-          camp::int_seq<LinIdx,SizeInts...>,
-          camp::int_seq<LinIdx,StrideInts...>,
-          DIM_LIST
-      >
-  > {
-      static constexpr camp::idx_t s_num_dims = sizeof...(VecSeq) + 1;
-
-      using index_list = camp::list<RAJA::expt::StaticTensorIndex<INDEX_TYPES>...>;
-
-      using range_seq  = camp::int_seq<LinIdx,RangeInts... >;
-      using size_seq   = camp::int_seq<LinIdx,SizeInts...  >;
-      using stride_seq = camp::int_seq<LinIdx,StrideInts...>;
-      using LayoutType = RAJA::detail::StaticLayoutBase_impl<LinIdx,range_seq,size_seq,stride_seq,DIM_LIST>;
-
-      // This is the stride-one dimensions w.r.t. the tensor not the View
-      // For example:
-      //  For a vector, s_stride_one_dim is either 0 (packed) or -1 (strided)
-      //  For a matrix, s_stride_one_dim is either:
-      //                 -1 neither row nor column are packed
-      //                 0 rows are stride-one
-      //                 1 columns are stride-one
-      static constexpr camp::idx_t s_stride_one_dim =
-          RAJA::max<camp::idx_t>(
-                  (GetTensorArgIdx<VecHead,index_list>::value == LayoutType::stride_one_dim ? VecHead : -1 ),
-                  (GetTensorArgIdx<VecSeq, index_list>::value == LayoutType::stride_one_dim ? VecSeq  : -1 )...
-          );
-
-
-
-
-      using new_begin_seq = camp::int_seq<
-                LinIdx,
-                (LinIdx)get_tensor_args_begin<VecHead>(LayoutType(), RAJA::expt::StaticTensorIndex<INDEX_TYPES>()...),
-                (LinIdx)get_tensor_args_begin<VecSeq >(LayoutType(), RAJA::expt::StaticTensorIndex<INDEX_TYPES>()...)...
-            >;
-      using new_size_seq  = camp::int_seq<
-                LinIdx,
-                (LinIdx)get_tensor_args_size <VecHead>(LayoutType(), RAJA::expt::StaticTensorIndex<INDEX_TYPES>()...),
-                (LinIdx)get_tensor_args_size <VecSeq >(LayoutType(), RAJA::expt::StaticTensorIndex<INDEX_TYPES>()...)...
-            >;
-
-      using new_begin_type = internal::expt::StaticIndexArray<new_begin_seq>;
-      using new_size_type  = internal::expt::StaticIndexArray<new_size_seq >;
-
-
-      using tensor_reg_type = typename camp::at_v<index_list, GetTensorArgIdx<0, index_list>::value>::tensor_type;
-      using ref_type = internal::expt::StaticTensorRef<ElementType*, LinIdx, internal::expt::TENSOR_MULTIPLE,stride_seq,new_begin_seq,new_size_seq, s_stride_one_dim>;
-      using return_type = internal::expt::ET::TensorLoadStore<tensor_reg_type, ref_type>;
-
-
-      RAJA_INLINE
-      RAJA_HOST_DEVICE
-      static
-      constexpr
-      return_type make_return(LayoutType const &layout, PointerType const &data, RAJA::expt::StaticTensorIndex<INDEX_TYPES> const &... args){
-
-        return return_type(ref_type{
-          // data pointer
-          &data[0] + layout(internal::expt::isTensorIndex<typename RAJA::expt::StaticTensorIndex<INDEX_TYPES>::base_type>() ? LinIdx{0} : (LinIdx)stripIndexType(internal::expt::stripTensorIndexByValue(args))...),
-          // strides
-          typename ref_type::stride_type(),
-          // tile
-          {
-              new_begin_type(),
-              new_size_type()
-          }
-        });
-      }
-  };
-#endif
+  RAJA_INLINE
 
+  RAJA_HOST_DEVICE
+  static constexpr return_type make_return(LayoutType const& layout,
+                                           PointerType const& data,
+                                           Args const&... args)
+  {
 
-  } // namespace detail
+    return return_type(ref_type {
+        // data pointer
+        &data[0] +
+            layout(internal::expt::isTensorIndex<Args>()
+                       ? LinIdx {0}
+                       : (LinIdx)stripIndexType(
+                             internal::expt::stripTensorIndexByValue(args))...),
+        // strides
+        {(LinIdx)layout.template get_dim_stride<
+             GetTensorArgIdx<VecHead, Args...>::value>(),
+         (LinIdx)layout.template get_dim_stride<
+             GetTensorArgIdx<VecSeq, Args...>::value>()...},
+        // tile
+        {// begin
+         {(LinIdx)(get_tensor_args_begin<VecHead>(layout, args...)),
+          (LinIdx)(get_tensor_args_begin<VecSeq>(layout, args...))...},
+
+         // size
+         {(LinIdx)get_tensor_args_size<VecHead>(layout, args...),
+          (LinIdx)get_tensor_args_size<VecSeq>(layout, args...)...}}});
+  }
+};
 
+/*
+ * Specialization for Tensor return types and static layout types
+ */
+template<camp::idx_t VecHead,
+         camp::idx_t... VecSeq,
+         typename... INDEX_TYPES,
+         typename ElementType,
+         typename PointerType,
+         typename LinIdx,
+         LinIdx... RangeInts,
+         LinIdx... SizeInts,
+         LinIdx... StrideInts,
+         typename DIM_LIST>
+struct ViewReturnHelper<
+    camp::idx_seq<VecHead, VecSeq...>,
+    camp::list<RAJA::expt::StaticTensorIndex<INDEX_TYPES>...>,
+    ElementType,
+    PointerType,
+    LinIdx,
+    RAJA::detail::StaticLayoutBase_impl<LinIdx,
+                                        camp::int_seq<LinIdx, RangeInts...>,
+                                        camp::int_seq<LinIdx, SizeInts...>,
+                                        camp::int_seq<LinIdx, StrideInts...>,
+                                        DIM_LIST>>
+{
+  static constexpr camp::idx_t s_num_dims = sizeof...(VecSeq) + 1;
+
+  using index_list = camp::list<RAJA::expt::StaticTensorIndex<INDEX_TYPES>...>;
+
+  using range_seq  = camp::int_seq<LinIdx, RangeInts...>;
+  using size_seq   = camp::int_seq<LinIdx, SizeInts...>;
+  using stride_seq = camp::int_seq<LinIdx, StrideInts...>;
+  using LayoutType = RAJA::detail::
+      StaticLayoutBase_impl<LinIdx, range_seq, size_seq, stride_seq, DIM_LIST>;
+
+  // Stride-one dimension expressed w.r.t. the tensor, not the View: the
+  // tensor dimension whose View argument sits at LayoutType::stride_one_dim.
+  //  For a vector, s_stride_one_dim is either 0 (packed) or -1 (strided)
+  //  For a matrix, s_stride_one_dim is one of:
+  //                 -1 neither rows nor columns are packed
+  //                 0 rows are stride-one
+  //                 1 columns are stride-one
+  static constexpr camp::idx_t s_stride_one_dim = RAJA::max<camp::idx_t>(
+      (GetTensorArgIdx<VecHead, index_list>::value == LayoutType::stride_one_dim
+           ? VecHead
+           : -1),
+      (GetTensorArgIdx<VecSeq, index_list>::value == LayoutType::stride_one_dim
+           ? VecSeq
+           : -1)...);
+
+
+  using new_begin_seq =
+      camp::int_seq<LinIdx,
+                    (LinIdx)get_tensor_args_begin<VecHead>(
+                        LayoutType(),
+                        RAJA::expt::StaticTensorIndex<INDEX_TYPES>()...),
+                    (LinIdx)get_tensor_args_begin<VecSeq>(
+                        LayoutType(),
+                        RAJA::expt::StaticTensorIndex<INDEX_TYPES>()...)...>;
+  using new_size_seq =
+      camp::int_seq<LinIdx,
+                    (LinIdx)get_tensor_args_size<VecHead>(
+                        LayoutType(),
+                        RAJA::expt::StaticTensorIndex<INDEX_TYPES>()...),
+                    (LinIdx)get_tensor_args_size<VecSeq>(
+                        LayoutType(),
+                        RAJA::expt::StaticTensorIndex<INDEX_TYPES>()...)...>;
+
+  using new_begin_type = internal::expt::StaticIndexArray<new_begin_seq>;
+  using new_size_type  = internal::expt::StaticIndexArray<new_size_seq>;
+
+
+  using tensor_reg_type =
+      typename camp::at_v<index_list,
+                          GetTensorArgIdx<0, index_list>::value>::tensor_type;
+  using ref_type =
+      internal::expt::StaticTensorRef<ElementType*,
+                                      LinIdx,
+                                      internal::expt::TENSOR_MULTIPLE,
+                                      stride_seq,
+                                      new_begin_seq,
+                                      new_size_seq,
+                                      s_stride_one_dim>;
+  using return_type =
+      internal::expt::ET::TensorLoadStore<tensor_reg_type, ref_type>;
 
-  /*
-   * Computes the return type of a view.
-   *
-   * If any of the arguments are a VectorIndex, it creates a VectorRef
-   * return type.
-   *
-   * Otherwise it produces the usual scalar reference return type
-   */
-  template<typename ElementType, typename PointerType, typename LinIdx, typename LayoutType, typename ... Args>
-  using view_return_type_t =
-      typename detail::ViewReturnHelper<
-        camp::make_idx_seq_t<count_num_tensor_args<Args...>::value>,
-        camp::list<Args...>,
-        ElementType,
-        PointerType,
-        LinIdx,
-        LayoutType>::return_type;
 
-  /*
-   * Creates the return value for a View
-   *
-   * If any of the arguments are a VectorIndex, it creates a VectorRef
-   * return value.
-   *
-   * Otherwise it produces the usual scalar reference return value
-   */
-  template<typename ElementType, typename LinIdx, typename LayoutType, typename PointerType, typename ... Args>
   RAJA_INLINE
+
   RAJA_HOST_DEVICE
-  constexpr
-  view_return_type_t<ElementType, PointerType, LinIdx, LayoutType, Args...>
-  view_make_return_value(LayoutType const &layout, PointerType const &data, Args const &... args){
-    return detail::ViewReturnHelper<
-        camp::make_idx_seq_t<count_num_tensor_args<Args...>::value>,
-        camp::list<Args...>,
-        ElementType,
-        PointerType,
-        LinIdx,
-        LayoutType>::make_return(layout, data, args...);
+  static constexpr return_type make_return(
+      LayoutType const& layout,
+      PointerType const& data,
+      RAJA::expt::StaticTensorIndex<INDEX_TYPES> const&... args)
+  {
+
+    return return_type(ref_type {
+        // data pointer
+        &data[0] +
+            layout(internal::expt::isTensorIndex<
+                       typename RAJA::expt::StaticTensorIndex<
+                           INDEX_TYPES>::base_type>()
+                       ? LinIdx {0}
+                       : (LinIdx)stripIndexType(
+                             internal::expt::stripTensorIndexByValue(args))...),
+        // strides
+        typename ref_type::stride_type(),
+        // tile
+        {new_begin_type(), new_size_type()}});
   }
+};
+#endif
 
-  namespace detail
-  {
 
-  /**
-   * This class will help strip strongly typed indices
-   *
-   * This default implementation static_asserts that Expected==Arg, otherwise
-   * it's an error.  This enforces types for the TypedView.
-   *
-   * Specialization where expected type is same as argument type.
-   * In this case, there is no VectorIndex to unpack, just strip any strongly
-   * typed indices.
-   */
-  template<typename Expected, typename Arg>
-  struct MatchTypedViewArgHelper{
-    static_assert(std::is_convertible<strip_index_type_t<Arg>, strip_index_type_t<Expected>>::value,
-        "Argument isn't compatible");
+}  // namespace detail
 
-    using type = strip_index_type_t<Arg>;
+/*
+ * Computes the return type of a view.
+ *
+ * If any of the arguments is a tensor index, the return type is a
+ * TensorLoadStore expression.
+ *
+ * Otherwise it produces the usual scalar reference return type
+ */
+template<typename ElementType,
+         typename PointerType,
+         typename LinIdx,
+         typename LayoutType,
+         typename... Args>
+using view_return_type_t = typename detail::ViewReturnHelper<
+    camp::make_idx_seq_t<count_num_tensor_args<Args...>::value>,
+    camp::list<Args...>,
+    ElementType,
+    PointerType,
+    LinIdx,
+    LayoutType>::return_type;
+
+/*
+ * Creates the return value for a View
+ *
+ * If any of the arguments is a tensor index, it creates a TensorLoadStore
+ * return value.
+ *
+ * Otherwise it produces the usual scalar reference return value
+ */
+template<typename ElementType,
+         typename LinIdx,
+         typename LayoutType,
+         typename PointerType,
+         typename... Args>
+RAJA_INLINE RAJA_HOST_DEVICE constexpr view_return_type_t<ElementType,
+                                                          PointerType,
+                                                          LinIdx,
+                                                          LayoutType,
+                                                          Args...>
+view_make_return_value(LayoutType const& layout,
+                       PointerType const& data,
+                       Args const&... args)
+{
+  return detail::ViewReturnHelper<
+      camp::make_idx_seq_t<count_num_tensor_args<Args...>::value>,
+      camp::list<Args...>, ElementType, PointerType, LinIdx,
+      LayoutType>::make_return(layout, data, args...);
+}
 
-    static RAJA_HOST_DEVICE RAJA_INLINE
-    constexpr
-    type extract(Arg arg){
-      return stripIndexType(arg);
-    }
-  };
+namespace detail
+{
 
+/**
+ * Helper that strips strongly typed indices from TypedView arguments.
+ *
+ * This primary template static_asserts that Arg, once stripped of its strong
+ * index type, is convertible to the stripped Expected type; anything else is
+ * a compile-time error.  This enforces index types for the TypedView.
+ *
+ * In this case, there is no VectorIndex to unpack, just strip any strongly
+ * typed indices.
+ */
+template<typename Expected, typename Arg>
+struct MatchTypedViewArgHelper
+{
+  static_assert(std::is_convertible<strip_index_type_t<Arg>,
+                                    strip_index_type_t<Expected>>::value,
+                "Argument isn't compatible");
 
-#if defined(RAJA_ENABLE_VECTORIZATION)
-  /**
-   * Specialization where expected type is wrapped in a VectorIndex type
-   *
-   * In this case, there is no VectorIndex to unpack, just strip any strongly
-   * typed indices.
-   */
-  template<typename Expected, typename Arg, typename VectorType, camp::idx_t DIM>
-  struct MatchTypedViewArgHelper<Expected, RAJA::expt::TensorIndex<Arg, VectorType, DIM> >{
+  using type = strip_index_type_t<Arg>;
 
-    static_assert(std::is_convertible<strip_index_type_t<Arg>, strip_index_type_t<Expected>>::value,
-        "Argument isn't compatible");
+  static RAJA_HOST_DEVICE RAJA_INLINE constexpr type extract(Arg arg)
+  {
+    return stripIndexType(arg);
+  }
+};
 
-    using arg_type = strip_index_type_t<Arg>;
 
-    using type = RAJA::expt::TensorIndex<arg_type, VectorType, DIM>;
+#if defined(RAJA_ENABLE_VECTORIZATION)
+/**
+ * Specialization where the argument is wrapped in a TensorIndex type
+ *
+ * The TensorIndex wrapper is kept: only the index it carries is stripped of
+ * strong typing, and the result is re-wrapped using the argument's size().
+ */
+template<typename Expected, typename Arg, typename VectorType, camp::idx_t DIM>
+struct MatchTypedViewArgHelper<Expected,
+                               RAJA::expt::TensorIndex<Arg, VectorType, DIM>>
+{
 
-    static constexpr RAJA_HOST_DEVICE RAJA_INLINE
-    type extract(RAJA::expt::TensorIndex<Arg, VectorType, DIM> vec_arg){
-      return type(stripIndexType(*vec_arg), vec_arg.size());
-    }
-  };
+  static_assert(std::is_convertible<strip_index_type_t<Arg>,
+                                    strip_index_type_t<Expected>>::value,
+                "Argument isn't compatible");
 
-  /**
-   * Specialization where expected type is wrapped in a StaticTensorIndex type
-   *
-   * In this case, there is no StaticTensorIndex to unpack, just strip any strongly
-   * typed indices.
-   */
-  template<typename Expected, typename Arg, typename VectorType, camp::idx_t DIM, Arg BEGIN, strip_index_type_t<Arg> LENGTH>
-  struct MatchTypedViewArgHelper<Expected, RAJA::expt::StaticTensorIndex<RAJA::expt::StaticTensorIndexInner<Arg, VectorType, DIM, BEGIN, LENGTH>> >{
+  using arg_type = strip_index_type_t<Arg>;
 
-    static_assert(std::is_convertible<strip_index_type_t<Arg>, strip_index_type_t<Expected>>::value,
-        "Argument isn't compatible");
+  using type = RAJA::expt::TensorIndex<arg_type, VectorType, DIM>;
 
-    using arg_type = strip_index_type_t<Arg>;
+  static constexpr RAJA_HOST_DEVICE RAJA_INLINE type
+  extract(RAJA::expt::TensorIndex<Arg, VectorType, DIM> vec_arg)
+  {
+    return type(stripIndexType(*vec_arg), vec_arg.size());
+  }
+};
 
-    using type = RAJA::expt::StaticTensorIndex<RAJA::expt::StaticTensorIndexInner<arg_type, VectorType, DIM, BEGIN, LENGTH>>;
+/**
+ * Specialization where the argument is wrapped in a StaticTensorIndex type
+ *
+ * BEGIN and LENGTH are template parameters, so the runtime argument itself
+ * is unused and a default-constructed stripped StaticTensorIndex is returned.
+ */
+template<typename Expected,
+         typename Arg,
+         typename VectorType,
+         camp::idx_t DIM,
+         Arg BEGIN,
+         strip_index_type_t<Arg> LENGTH>
+struct MatchTypedViewArgHelper<
+    Expected,
+    RAJA::expt::StaticTensorIndex<
+        RAJA::expt::
+            StaticTensorIndexInner<Arg, VectorType, DIM, BEGIN, LENGTH>>>
+{
 
-    static constexpr RAJA_HOST_DEVICE RAJA_INLINE
-    type extract(RAJA::expt::StaticTensorIndex<RAJA::expt::StaticTensorIndexInner<Arg, VectorType, DIM, BEGIN, LENGTH>> RAJA_UNUSED_ARG(vec_arg)){
-      return type();
-    }
-  };
-#endif
+  static_assert(std::is_convertible<strip_index_type_t<Arg>,
+                                    strip_index_type_t<Expected>>::value,
+                "Argument isn't compatible");
 
-  } //namespace detail
+  using arg_type = strip_index_type_t<Arg>;
 
+  using type = RAJA::expt::StaticTensorIndex<
+      RAJA::expt::
+          StaticTensorIndexInner<arg_type, VectorType, DIM, BEGIN, LENGTH>>;
 
-  template<typename Expected, typename Arg>
-  RAJA_HOST_DEVICE
-  RAJA_INLINE
-  constexpr
-  typename detail::MatchTypedViewArgHelper<Expected, Arg>::type
-  match_typed_view_arg(Arg const &arg)
+  static constexpr RAJA_HOST_DEVICE RAJA_INLINE type extract(
+      RAJA::expt::StaticTensorIndex<
+          RAJA::expt::
+              StaticTensorIndexInner<Arg, VectorType, DIM, BEGIN, LENGTH>>
+          RAJA_UNUSED_ARG(vec_arg))
   {
-    return detail::MatchTypedViewArgHelper<Expected, Arg>::extract(arg);
+    return type();
   }
+};
+#endif
 
+}  // namespace detail
 
+template<typename Expected, typename Arg>
+RAJA_HOST_DEVICE RAJA_INLINE constexpr
+    typename detail::MatchTypedViewArgHelper<Expected, Arg>::type
+    match_typed_view_arg(Arg const& arg)
+{
+  return detail::MatchTypedViewArgHelper<Expected, Arg>::extract(arg);
+}
 
-template <typename ValueType,
-          typename PointerType,
-          typename LayoutType>
-class ViewBase {
+template<typename ValueType, typename PointerType, typename LayoutType>
+class ViewBase
+{
 
-  public:
-    using value_type = ValueType;
-    using pointer_type = PointerType;
-    using layout_type = LayoutType;
-    using linear_index_type = typename layout_type::IndexLinear;
-    using nc_value_type = typename std::remove_const<value_type>::type;
-    using nc_pointer_type = typename std::add_pointer<typename std::remove_const<
-        typename std::remove_pointer<pointer_type>::type>::type>::type;
+public:
+  using value_type        = ValueType;
+  using pointer_type      = PointerType;
+  using layout_type       = LayoutType;
+  using linear_index_type = typename layout_type::IndexLinear;
+  using nc_value_type     = typename std::remove_const<value_type>::type;
+  using nc_pointer_type = typename std::add_pointer<typename std::remove_const<
+      typename std::remove_pointer<pointer_type>::type>::type>::type;
 
-    using Self = ViewBase<value_type, pointer_type, layout_type>;
-    using NonConstView = ViewBase<nc_value_type, nc_pointer_type, layout_type>;
+  using Self         = ViewBase<value_type, pointer_type, layout_type>;
+  using NonConstView = ViewBase<nc_value_type, nc_pointer_type, layout_type>;
 
-    using shifted_layout_type = typename add_offset<layout_type>::type;
-    using ShiftedView = ViewBase<value_type, pointer_type, shifted_layout_type>;
+  using shifted_layout_type = typename add_offset<layout_type>::type;
+  using ShiftedView = ViewBase<value_type, pointer_type, shifted_layout_type>;
 
-  protected:
-    pointer_type m_data;
-    layout_type const m_layout;
+protected:
+  pointer_type m_data;
+  layout_type const m_layout;
 
-  public:
+public:
+  /*
+   * Defaulted operators (AJK):
+   *
+   * OpenMP Target currently needs the View classes to be trivially copyable,
+   * which means that we need to use the default ctor's and assignment
+   * operators.
+   *
+   * These defaulted operators cause issues with some versions of CUDA, so
+   * in the case that CUDA is enabled, we switch to explicitly defined
+   * operators.
+   */
+#if (defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_CLANG_CUDA))
+  RAJA_HOST_DEVICE
+  RAJA_INLINE
+  constexpr ViewBase() {};
 
+  // Copy ctor for CUDA builds; initializers listed in declaration order.
+  RAJA_HOST_DEVICE RAJA_INLINE ViewBase(ViewBase const& c)
+      : m_data(c.m_data),
+        m_layout(c.m_layout)
+  {}
 
-    /*
-     * Defaulted operators (AJK):
-     *
-     * OpenMP Target currently needs the View classes to be trivially copyable,
-     * which means that we need to use the default ctor's and assignment
-     * operators.
-     *
-     * These defaulted operators cause issues with some versions of CUDA, so
-     * in the case that CUDA is enabled, we switch to explicitly defined
-     * operators.
-     */
-#if (defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_CLANG_CUDA))
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr ViewBase(){};
-
-    RAJA_HOST_DEVICE
-    RAJA_INLINE ViewBase(ViewBase const &c)
-      : m_layout(c.m_layout), m_data(c.m_data)
-    {
-    }
-
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    ViewBase &operator=(ViewBase const &c)
-    {
-      m_layout = c.m_layout;
-      m_data = c.m_data;
-    }
+  // NOTE(review): m_layout is declared `layout_type const`, so this operator
+  // cannot be instantiated as written -- confirm whether const should go.
+  RAJA_HOST_DEVICE RAJA_INLINE ViewBase& operator=(ViewBase const& c)
+  {
+    m_layout = c.m_layout;
+    m_data   = c.m_data;
+    return *this;  // was missing: flowing off the end is undefined behavior
+  }
 #else
-    constexpr ViewBase() = default;
-    RAJA_INLINE constexpr ViewBase(ViewBase const &) = default;
-    RAJA_INLINE constexpr ViewBase(ViewBase &&) = default;
-    RAJA_INLINE ViewBase& operator=(ViewBase const &) = default;
-    RAJA_INLINE ViewBase& operator=(ViewBase &&) = default;
+  constexpr ViewBase()                             = default;
+  RAJA_INLINE constexpr ViewBase(ViewBase const&)  = default;
+  RAJA_INLINE constexpr ViewBase(ViewBase&&)       = default;
+  RAJA_INLINE ViewBase& operator=(ViewBase const&) = default;
+  RAJA_INLINE ViewBase& operator=(ViewBase&&)      = default;
 
 #endif
 
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr
-    ViewBase(pointer_type data, layout_type &&layout) :
-    m_data(data), m_layout(layout)
-    {
-    }
-
-    template <typename... Args>
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr
-    ViewBase(pointer_type data, Args... dim_sizes) :
-    m_data(data), m_layout(dim_sizes...)
-    {
-    }
-
-
-    template <bool IsConstView = std::is_const<value_type>::value>
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr
-    ViewBase(typename std::enable_if<IsConstView, NonConstView>::type const &rhs) :
-    m_data(rhs.get_data()), m_layout(rhs.get_layout())
-    {
-    }
-
-
-    RAJA_HOST_DEVICE
-    RAJA_INLINE void set_data(PointerType data_ptr){
-      m_data = data_ptr;
-    }
-
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr
-    pointer_type const &get_data() const
-    {
-      return m_data;
-    }
-
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr
-    layout_type const &get_layout() const
-    {
-      return m_layout;
-    }
-
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr
-    linear_index_type size() const
-    {
-      return m_layout.size();
-    }
-
-
-    template<camp::idx_t DIM>
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr
-    linear_index_type get_dim_size() const
-    {
-      return m_layout.template get_dim_size<DIM>();
-    }
-
-
-    template <typename... Args>
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr
-    view_return_type_t<value_type, pointer_type, linear_index_type, layout_type, Args...>
-    operator()(Args... args) const
-    {
-      return view_make_return_value<value_type, linear_index_type>(m_layout, m_data, args...);
-    }
-
-
-
-    /*
-     * Compatibility note (AJK):
-     * We are using variadic arguments even though operator[] takes exactly 1 argument
-     * This gets around a template instantiation bug in CUDA/nvcc 9.1, which seems to have
-     * been fixed in CUDA 9.2+
-     */
-    template <typename ... Args>
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr
-    view_return_type_t<value_type, pointer_type, linear_index_type, layout_type, Args...>
-    operator[](Args ... args) const
-    {
-      return view_make_return_value<value_type, linear_index_type>(m_layout, m_data, args...);
-    }
-
-
-
-    template <size_t n_dims = layout_type::n_dims, typename IdxLin = linear_index_type>
-    RAJA_INLINE
-    ShiftedView shift(const std::array<IdxLin, n_dims>& shift)
-    {
-      static_assert(n_dims==layout_type::n_dims, "Dimension mismatch in view shift");
-
-      shifted_layout_type shift_layout(m_layout);
-      shift_layout.shift(shift);
-
-      return ShiftedView(m_data, shift_layout);
-    }
+  RAJA_HOST_DEVICE
 
-};
+  RAJA_INLINE
+  constexpr ViewBase(pointer_type data, layout_type&& layout)
+      : m_data(data),
+        m_layout(layout)
+  {}
+
+  template<typename... Args>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr ViewBase(pointer_type data,
+                                                  Args... dim_sizes)
+      : m_data(data),
+        m_layout(dim_sizes...)
+  {}
+
+  template<bool IsConstView = std::is_const<value_type>::value>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr ViewBase(
+      typename std::enable_if<IsConstView, NonConstView>::type const& rhs)
+      : m_data(rhs.get_data()),
+        m_layout(rhs.get_layout())
+  {}
 
+  RAJA_HOST_DEVICE
+  RAJA_INLINE void set_data(PointerType data_ptr) { m_data = data_ptr; }
 
-template <typename ValueType,
-        typename PointerType,
-        typename LayoutType,
-        typename IndexTypes>
-class TypedViewBase;
+  RAJA_HOST_DEVICE
 
-template <typename ValueType,
-          typename PointerType,
-          typename LayoutType,
-          typename... IndexTypes>
-class TypedViewBase<ValueType, PointerType, LayoutType, camp::list<IndexTypes...>> :
-  public ViewBase<ValueType, PointerType, LayoutType>
-{
+  RAJA_INLINE
+  constexpr pointer_type const& get_data() const { return m_data; }
 
-  public:
-    using value_type = ValueType;
-    using pointer_type = PointerType;
-    using layout_type = LayoutType;
-    using linear_index_type = typename layout_type::IndexLinear;
-    using nc_value_type = typename std::remove_const<value_type>::type;
-    using nc_pointer_type = typename std::add_pointer<typename std::remove_const<
-        typename std::remove_pointer<pointer_type>::type>::type>::type;
+  RAJA_HOST_DEVICE
 
-    using Base = ViewBase<ValueType, PointerType, LayoutType>;
-    using Self = TypedViewBase<value_type, pointer_type, layout_type, camp::list<IndexTypes...> >;
-    using NonConstView = TypedViewBase<nc_value_type, nc_pointer_type, layout_type, camp::list<IndexTypes...> >;
+  RAJA_INLINE
+  constexpr layout_type const& get_layout() const { return m_layout; }
 
-    using shifted_layout_type = typename add_offset<layout_type>::type;
-    using ShiftedView = TypedViewBase<value_type, pointer_type, shifted_layout_type, camp::list<IndexTypes...> >;
+  RAJA_HOST_DEVICE
 
-    static constexpr size_t n_dims = sizeof...(IndexTypes);
+  RAJA_INLINE
+  constexpr linear_index_type size() const { return m_layout.size(); }
 
-    using Base::Base;
+  template<camp::idx_t DIM>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr linear_index_type get_dim_size() const
+  {
+    return m_layout.template get_dim_size<DIM>();
+  }
 
-    template <typename... Args>
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr
-    view_return_type_t<value_type, pointer_type, linear_index_type, layout_type, Args...>
-    operator()(Args... args) const
-    {
-      return view_make_return_value<value_type, linear_index_type>(Base::m_layout, Base::m_data, match_typed_view_arg<IndexTypes>(args)...);
-    }
+  template<typename... Args>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr view_return_type_t<value_type,
+                                                            pointer_type,
+                                                            linear_index_type,
+                                                            layout_type,
+                                                            Args...>
+  operator()(Args... args) const
+  {
+    return view_make_return_value<value_type, linear_index_type>(
+        m_layout, m_data, args...);
+  }
 
+  /*
+   * Compatibility note (AJK):
+   * We are using variadic arguments even though operator[] takes exactly 1
+   * argument. This gets around a template instantiation bug in CUDA/nvcc 9.1,
+   * which seems to have been fixed in CUDA 9.2+
+   */
+  template<typename... Args>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr view_return_type_t<value_type,
+                                                            pointer_type,
+                                                            linear_index_type,
+                                                            layout_type,
+                                                            Args...>
+  operator[](Args... args) const
+  {
+    return view_make_return_value<value_type, linear_index_type>(
+        m_layout, m_data, args...);
+  }
 
+  template<size_t n_dims   = layout_type::n_dims,
+           typename IdxLin = linear_index_type>
+  RAJA_INLINE ShiftedView shift(const std::array<IdxLin, n_dims>& shift)
+  {
+    static_assert(n_dims == layout_type::n_dims,
+                  "Dimension mismatch in view shift");
 
-    /*
-     * Compatibility note (AJK):
-     * We are using variadic arguments even though operator[] takes exactly 1 argument
-     * This gets around a template instantiation bug in CUDA/nvcc 9.1, which seems to have
-     * been fixed in CUDA 9.2+
-     */
-    template <typename ... Args>
-    RAJA_HOST_DEVICE
-    RAJA_INLINE
-    constexpr
-    view_return_type_t<value_type, pointer_type, linear_index_type, layout_type, Args...>
-    operator[](Args ... args) const
-    {
-      return view_make_return_value<value_type, linear_index_type>(Base::m_layout, Base::m_data, match_typed_view_arg<IndexTypes>(args)...);
-    }
+    shifted_layout_type shift_layout(m_layout);
+    shift_layout.shift(shift);
 
+    return ShiftedView(m_data, shift_layout);
+  }
+};
 
 
-    template <size_t n_dims = sizeof...(IndexTypes), typename IdxLin = linear_index_type>
-    RAJA_INLINE
-    ShiftedView shift(const std::array<IdxLin, n_dims>& shift)
-    {
-      static_assert(n_dims==layout_type::n_dims, "Dimension mismatch in view shift");
+template<typename ValueType,
+         typename PointerType,
+         typename LayoutType,
+         typename IndexTypes>
+class TypedViewBase;
 
-      shifted_layout_type shift_layout(Base::get_layout());
-      shift_layout.shift(shift);
+template<typename ValueType,
+         typename PointerType,
+         typename LayoutType,
+         typename... IndexTypes>
+class TypedViewBase<ValueType,
+                    PointerType,
+                    LayoutType,
+                    camp::list<IndexTypes...>>
+    : public ViewBase<ValueType, PointerType, LayoutType>
+{
 
-      return ShiftedView(Base::get_data(), shift_layout);
-    }
+public:
+  using value_type        = ValueType;
+  using pointer_type      = PointerType;
+  using layout_type       = LayoutType;
+  using linear_index_type = typename layout_type::IndexLinear;
+  using nc_value_type     = typename std::remove_const<value_type>::type;
+  using nc_pointer_type = typename std::add_pointer<typename std::remove_const<
+      typename std::remove_pointer<pointer_type>::type>::type>::type;
+
+  using Base         = ViewBase<ValueType, PointerType, LayoutType>;
+  using Self         = TypedViewBase<value_type,
+                             pointer_type,
+                             layout_type,
+                             camp::list<IndexTypes...>>;
+  using NonConstView = TypedViewBase<nc_value_type,
+                                     nc_pointer_type,
+                                     layout_type,
+                                     camp::list<IndexTypes...>>;
+
+  using shifted_layout_type = typename add_offset<layout_type>::type;
+  using ShiftedView         = TypedViewBase<value_type,
+                                    pointer_type,
+                                    shifted_layout_type,
+                                    camp::list<IndexTypes...>>;
+
+  static constexpr size_t n_dims = sizeof...(IndexTypes);
+
+  using Base::Base;
+
+  template<typename... Args>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr view_return_type_t<value_type,
+                                                            pointer_type,
+                                                            linear_index_type,
+                                                            layout_type,
+                                                            Args...>
+  operator()(Args... args) const
+  {
+    return view_make_return_value<value_type, linear_index_type>(
+        Base::m_layout, Base::m_data,
+        match_typed_view_arg<IndexTypes>(args)...);
+  }
 
-};
+  /*
+   * Compatibility note (AJK):
+   * We are using variadic arguments even though operator[] takes exactly 1
+   * argument. This gets around a template instantiation bug in CUDA/nvcc 9.1,
+   * which seems to have been fixed in CUDA 9.2+
+   */
+  template<typename... Args>
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr view_return_type_t<value_type,
+                                                            pointer_type,
+                                                            linear_index_type,
+                                                            layout_type,
+                                                            Args...>
+  operator[](Args... args) const
+  {
+    return view_make_return_value<value_type, linear_index_type>(
+        Base::m_layout, Base::m_data,
+        match_typed_view_arg<IndexTypes>(args)...);
+  }
 
+  template<size_t n_dims   = sizeof...(IndexTypes),
+           typename IdxLin = linear_index_type>
+  RAJA_INLINE ShiftedView shift(const std::array<IdxLin, n_dims>& shift)
+  {
+    static_assert(n_dims == layout_type::n_dims,
+                  "Dimension mismatch in view shift");
+
+    shifted_layout_type shift_layout(Base::get_layout());
+    shift_layout.shift(shift);
+
+    return ShiftedView(Base::get_data(), shift_layout);
+  }
+};
 
 
-} // namespace internal
+}  // namespace internal
 
 }  // namespace RAJA
 
diff --git a/include/RAJA/util/View.hpp b/include/RAJA/util/View.hpp
index fcaee67f98..2b06af69d5 100644
--- a/include/RAJA/util/View.hpp
+++ b/include/RAJA/util/View.hpp
@@ -32,220 +32,205 @@
 namespace RAJA
 {
 
-//Helpers to convert
-//layouts -> OffsetLayouts
-//Typedlayouts -> TypedOffsetLayouts
+// Helpers to convert
+// layouts -> OffsetLayouts
+// Typedlayouts -> TypedOffsetLayouts
 template<typename layout>
 struct add_offset
 {
   using type = RAJA::OffsetLayout<layout::n_dims>;
 };
 
-template<typename IdxLin, typename...DimTypes>
-struct add_offset<RAJA::TypedLayout<IdxLin,camp::tuple<DimTypes...>>>
+template<typename IdxLin, typename... DimTypes>
+struct add_offset<RAJA::TypedLayout<IdxLin, camp::tuple<DimTypes...>>>
 {
-  using type = RAJA::TypedOffsetLayout<IdxLin,camp::tuple<DimTypes...>>;
+  using type = RAJA::TypedOffsetLayout<IdxLin, camp::tuple<DimTypes...>>;
 };
 
-template <typename ValueType,
-          typename LayoutType,
-          typename PointerType = ValueType *>
-using View =
-    internal::ViewBase<ValueType, PointerType, LayoutType>;
+template<typename ValueType,
+         typename LayoutType,
+         typename PointerType = ValueType*>
+using View = internal::ViewBase<ValueType, PointerType, LayoutType>;
 
 
+template<typename ValueType, typename LayoutType, typename... IndexTypes>
+using TypedView = internal::
+    TypedViewBase<ValueType, ValueType*, LayoutType, camp::list<IndexTypes...>>;
 
-template <typename ValueType, typename LayoutType, typename... IndexTypes>
-using TypedView =
-    internal::TypedViewBase<ValueType, ValueType *, LayoutType, camp::list<IndexTypes...> >;
-
-
-
-
-
-template <typename IndexType, typename ValueType>
-RAJA_INLINE View<ValueType, Layout<1, IndexType, 0> > make_view(
-    ValueType *ptr)
+template<typename IndexType, typename ValueType>
+RAJA_INLINE View<ValueType, Layout<1, IndexType, 0>> make_view(ValueType* ptr)
 {
-  return View<ValueType, Layout<1, IndexType, 0> >(ptr, 1);
+  return View<ValueType, Layout<1, IndexType, 0>>(ptr, 1);
 }
 
-template <size_t n_dims, typename IndexType, typename ValueType, typename... IndexTypes>
-RAJA_INLINE View<ValueType, IndexLayout<n_dims, IndexType, IndexTypes...> > make_index_view(
-    ValueType *ptr, IndexLayout<n_dims, IndexType, IndexTypes...> index_layout)
+template<size_t n_dims,
+         typename IndexType,
+         typename ValueType,
+         typename... IndexTypes>
+RAJA_INLINE View<ValueType, IndexLayout<n_dims, IndexType, IndexTypes...>>
+make_index_view(ValueType* ptr,
+                IndexLayout<n_dims, IndexType, IndexTypes...> index_layout)
 {
-  return View<ValueType, IndexLayout<n_dims, IndexType, IndexTypes...> >(ptr, index_layout);
+  return View<ValueType, IndexLayout<n_dims, IndexType, IndexTypes...>>(
+      ptr, index_layout);
 }
 
-
 // select certain indices from a tuple, given a curated index sequence
 // returns linear index of layout(ar...)
-template <typename Lay, typename Tup, camp::idx_t... Idxs>
-RAJA_HOST_DEVICE RAJA_INLINE 
-auto selecttuple( Lay lyout, Tup&& tup, camp::idx_seq<Idxs...> ) ->
-  decltype(
-            lyout(
-              camp::get<Idxs>(std::forward<Tup>(tup))...
-            )
-          )
-{ 
-  return lyout(
-                camp::get<Idxs>(std::forward<Tup>(tup))...
-              );
+template<typename Lay, typename Tup, camp::idx_t... Idxs>
+RAJA_HOST_DEVICE RAJA_INLINE auto selecttuple(Lay lyout,
+                                              Tup&& tup,
+                                              camp::idx_seq<Idxs...>)
+    -> decltype(lyout(camp::get<Idxs>(std::forward<Tup>(tup))...))
+{
+  return lyout(camp::get<Idxs>(std::forward<Tup>(tup))...);
 }
 
 // sequence combiner
-template <typename Seq1, typename Seq2>
+template<typename Seq1, typename Seq2>
 struct cat_seq;
 
-template <camp::idx_t... Idxs1, camp::idx_t... Idxs2>
-struct cat_seq  < camp::idx_seq<Idxs1...>,
-                  camp::idx_seq<Idxs2...>
-                >
+template<camp::idx_t... Idxs1, camp::idx_t... Idxs2>
+struct cat_seq<camp::idx_seq<Idxs1...>, camp::idx_seq<Idxs2...>>
 {
   using type = camp::idx_seq<Idxs1..., Idxs2...>;
 };
 
-template <typename Seq1, typename Seq2>
+template<typename Seq1, typename Seq2>
 using cat_seq_t = typename cat_seq<Seq1, Seq2>::type;
 
 // sequence offsetter
-template <camp::idx_t Offset, typename Seq>
+template<camp::idx_t Offset, typename Seq>
 struct offset_seq;
 
-template <camp::idx_t Offset, camp::idx_t... Idxs>
+template<camp::idx_t Offset, camp::idx_t... Idxs>
 struct offset_seq<Offset, camp::idx_seq<Idxs...>>
 {
-  using type = camp::idx_seq<(Idxs+Offset)...>;
+  using type = camp::idx_seq<(Idxs + Offset)...>;
 };
 
-template <camp::idx_t Offset, typename Seq>
+template<camp::idx_t Offset, typename Seq>
 using offset_seq_t = typename offset_seq<Offset, Seq>::type;
 
 // remove the Nth index in a parameter pack
 // returns linear index of layout(ar...)
-template <typename Lay, RAJA::Index_type Nth = 0, typename Tup>
-RAJA_HOST_DEVICE RAJA_INLINE auto removenth( Lay lyout, Tup&& tup ) ->
-  decltype( selecttuple<Lay>(
-              lyout,
-              std::forward<Tup>(tup),
-              cat_seq_t<  camp::make_idx_seq_t<Nth>,  // sequence up to Nth
-                          offset_seq_t<
-                            Nth+1,  // after Nth
-                            camp::make_idx_seq_t<camp::tuple_size<Tup>::value - Nth-1>
-                          > // sequence after Nth
-                       >{}
-            )
-          )
+template<typename Lay, RAJA::Index_type Nth = 0, typename Tup>
+RAJA_HOST_DEVICE RAJA_INLINE auto removenth(Lay lyout, Tup&& tup)
+    -> decltype(selecttuple<Lay>(
+        lyout,
+        std::forward<Tup>(tup),
+        cat_seq_t<
+            camp::make_idx_seq_t<Nth>,  // sequence up to Nth
+            offset_seq_t<Nth + 1,       // after Nth
+                         camp::make_idx_seq_t<camp::tuple_size<Tup>::value -
+                                              Nth - 1>>  // sequence after Nth
+            > {}))
 {
   return selecttuple<Lay>(
-              lyout,
-              std::forward<Tup>(tup),
-              cat_seq_t<  camp::make_idx_seq_t<Nth>,  // sequence up to Nth
-                          offset_seq_t<
-                            Nth+1,  // after Nth
-                            camp::make_idx_seq_t<camp::tuple_size<Tup>::value - Nth-1>
-                          > // sequence after Nth
-                       >{}
-          );
+      lyout, std::forward<Tup>(tup),
+      cat_seq_t<camp::make_idx_seq_t<Nth>,  // sequence up to Nth
+                offset_seq_t<Nth + 1,       // after Nth
+                             camp::make_idx_seq_t<camp::tuple_size<Tup>::value -
+                                                  Nth - 1>>  // sequence after
+                                                             // Nth
+                > {});
 }
 
-
-
-
-// P2Pidx represents the array-of-pointers index. This allows the position of the
-// index into the array-of-pointers to be moved around in the MultiView operator();
-// see the operator overload.
-// Default of 0 means that the p2p index is in the 0th position.
-template <typename ValueType,
-          typename LayoutType,
-          RAJA::Index_type P2Pidx = 0,
-          typename PointerType = ValueType **,
-          typename NonConstPointerType =
-              camp::type::ptr::add< // adds *
-                camp::type::ptr::add<
-                  camp::type::cv::rem<  // removes cv
-                    camp::type::ptr::rem<
-                      camp::type::ptr::rem<PointerType>  // removes *
-                    >
-                  >
-                >
-              >
-          >
-struct MultiView {
-  using value_type = ValueType;
-  using pointer_type = PointerType;
-  using layout_type = LayoutType;
-  using nc_value_type = camp::decay<value_type>;
+// P2Pidx represents the array-of-pointers index. This allows the position of
+// the index into the array-of-pointers to be moved around in the MultiView
+// operator(); see the operator overload. Default of 0 means that the p2p index
+// is in the 0th position.
+template<typename ValueType,
+         typename LayoutType,
+         RAJA::Index_type P2Pidx      = 0,
+         typename PointerType         = ValueType**,
+         typename NonConstPointerType = camp::type::ptr::add<  // adds *
+             camp::type::ptr::add<camp::type::cv::rem<         // removes cv
+                 camp::type::ptr::rem<camp::type::ptr::rem<PointerType>  // removes
+                                                                         // *
+                                      >>>>>
+struct MultiView
+{
+  using value_type      = ValueType;
+  using pointer_type    = PointerType;
+  using layout_type     = LayoutType;
+  using nc_value_type   = camp::decay<value_type>;
   using nc_pointer_type = NonConstPointerType;
-  using NonConstView = MultiView<nc_value_type, layout_type, P2Pidx, nc_pointer_type>;
+  using NonConstView =
+      MultiView<nc_value_type, layout_type, P2Pidx, nc_pointer_type>;
 
   layout_type const layout;
   nc_pointer_type data;
 
-  template <typename... Args>
+  template<typename... Args>
   RAJA_INLINE constexpr MultiView(pointer_type data_ptr, Args... dim_sizes)
-      : layout(dim_sizes...), data(data_ptr)
-  {
-  }
+      : layout(dim_sizes...),
+        data(data_ptr)
+  {}
 
-  RAJA_INLINE constexpr MultiView(pointer_type data_ptr, layout_type &&layout)
-      : layout(layout), data(data_ptr)
-  {
-  }
+  RAJA_INLINE constexpr MultiView(pointer_type data_ptr, layout_type&& layout)
+      : layout(layout),
+        data(data_ptr)
+  {}
 
-  RAJA_INLINE constexpr MultiView(MultiView const &) = default;
-  RAJA_INLINE constexpr MultiView(MultiView &&) = default;
-  RAJA_INLINE MultiView& operator=(MultiView const &) = default;
-  RAJA_INLINE MultiView& operator=(MultiView &&) = default;
+  RAJA_INLINE constexpr MultiView(MultiView const&)  = default;
+  RAJA_INLINE constexpr MultiView(MultiView&&)       = default;
+  RAJA_INLINE MultiView& operator=(MultiView const&) = default;
+  RAJA_INLINE MultiView& operator=(MultiView&&)      = default;
 
-  template <bool IsConstView = std::is_const<value_type>::value>
+  template<bool IsConstView = std::is_const<value_type>::value>
   RAJA_INLINE constexpr MultiView(
-      typename std::enable_if<IsConstView, NonConstView>::type const &rhs)
+      typename std::enable_if<IsConstView, NonConstView>::type const& rhs)
       : layout(rhs.layout),
         data(rhs.data)
-  {
-  }
+  {}
 
   RAJA_INLINE void set_data(pointer_type data_ptr) { data = data_ptr; }
 
-  template <size_t n_dims=layout_type::n_dims, typename IdxLin = Index_type>
-  RAJA_INLINE RAJA::MultiView<ValueType, typename add_offset<layout_type>::type, P2Pidx>
-  shift(const std::array<IdxLin, n_dims>& shift)
+  template<size_t n_dims = layout_type::n_dims, typename IdxLin = Index_type>
+  RAJA_INLINE RAJA::
+      MultiView<ValueType, typename add_offset<layout_type>::type, P2Pidx>
+      shift(const std::array<IdxLin, n_dims>& shift)
   {
-    static_assert(n_dims==layout_type::n_dims, "Dimension mismatch in view shift");
+    static_assert(n_dims == layout_type::n_dims,
+                  "Dimension mismatch in view shift");
 
     typename add_offset<layout_type>::type shift_layout(layout);
     shift_layout.shift(shift);
 
-    return RAJA::MultiView<ValueType, typename add_offset<layout_type>::type, P2Pidx>(data, shift_layout);
+    return RAJA::MultiView<ValueType, typename add_offset<layout_type>::type,
+                           P2Pidx>(data, shift_layout);
   }
 
   // Moving the position of the index into the array-of-pointers
   // is set by P2Pidx, which is defaulted to 0.
   // making this specifically typed would require unpacking the layout,
   // this is easier to maintain
-  template <typename... Args>
-  RAJA_HOST_DEVICE RAJA_INLINE value_type &operator()(Args... ar) const
+  template<typename... Args>
+  RAJA_HOST_DEVICE RAJA_INLINE value_type& operator()(Args... ar) const
   {
-    auto pidx = stripIndexType( camp::get<P2Pidx>( camp::forward_as_tuple( ar... ) ) );
+    auto pidx =
+        stripIndexType(camp::get<P2Pidx>(camp::forward_as_tuple(ar...)));
 
-    if ( pidx < 0 )
+    if (pidx < 0)
     {
-      RAJA_ABORT_OR_THROW( "Negative index while accessing array of pointers.\n" );
+      RAJA_ABORT_OR_THROW(
+          "Negative index while accessing array of pointers.\n");
     }
-    
-    auto idx = stripIndexType( removenth<LayoutType, P2Pidx>( layout, camp::forward_as_tuple( ar... ) ) );
+
+    auto idx = stripIndexType(
+        removenth<LayoutType, P2Pidx>(layout, camp::forward_as_tuple(ar...)));
     return data[pidx][idx];
   }
 };
 
-template <typename ViewType, typename AtomicPolicy = RAJA::auto_atomic>
-struct AtomicViewWrapper {
-  using base_type = ViewType;
+template<typename ViewType, typename AtomicPolicy = RAJA::auto_atomic>
+struct AtomicViewWrapper
+{
+  using base_type    = ViewType;
   using pointer_type = typename base_type::pointer_type;
-  using value_type = typename base_type::value_type;
-  using atomic_type = RAJA::AtomicRef<value_type, AtomicPolicy>;
+  using value_type   = typename base_type::value_type;
+  using atomic_type  = RAJA::AtomicRef<value_type, AtomicPolicy>;
 
   base_type base_;
 
@@ -254,43 +239,42 @@ struct AtomicViewWrapper {
 
   RAJA_INLINE void set_data(pointer_type data_ptr) { base_.set_data(data_ptr); }
 
-  template <typename... ARGS>
-  RAJA_HOST_DEVICE RAJA_INLINE atomic_type operator()(ARGS &&... args) const
+  template<typename... ARGS>
+  RAJA_HOST_DEVICE RAJA_INLINE atomic_type operator()(ARGS&&... args) const
   {
     return atomic_type(&base_.operator()(std::forward<ARGS>(args)...));
   }
 };
 
-
 /*
  * Specialized AtomicViewWrapper for seq_atomic that acts as pass-thru
  * for performance
  */
-template <typename ViewType>
-struct AtomicViewWrapper<ViewType, RAJA::seq_atomic> {
-  using base_type = ViewType;
+template<typename ViewType>
+struct AtomicViewWrapper<ViewType, RAJA::seq_atomic>
+{
+  using base_type    = ViewType;
   using pointer_type = typename base_type::pointer_type;
-  using value_type = typename base_type::value_type;
-  using atomic_type = RAJA::AtomicRef<value_type, RAJA::seq_atomic>;
+  using value_type   = typename base_type::value_type;
+  using atomic_type  = RAJA::AtomicRef<value_type, RAJA::seq_atomic>;
 
   base_type base_;
 
   RAJA_INLINE
-  constexpr explicit AtomicViewWrapper(ViewType const &view) : base_{view} {}
+  constexpr explicit AtomicViewWrapper(ViewType const& view) : base_ {view} {}
 
   RAJA_INLINE void set_data(pointer_type data_ptr) { base_.set_data(data_ptr); }
 
-  template <typename... ARGS>
-  RAJA_HOST_DEVICE RAJA_INLINE value_type &operator()(ARGS &&... args) const
+  template<typename... ARGS>
+  RAJA_HOST_DEVICE RAJA_INLINE value_type& operator()(ARGS&&... args) const
   {
     return base_.operator()(std::forward<ARGS>(args)...);
   }
 };
 
-
-template <typename AtomicPolicy, typename ViewType>
+template<typename AtomicPolicy, typename ViewType>
 RAJA_INLINE AtomicViewWrapper<ViewType, AtomicPolicy> make_atomic_view(
-    ViewType const &view)
+    ViewType const& view)
 {
 
   return RAJA::AtomicViewWrapper<ViewType, AtomicPolicy>(view);
diff --git a/include/RAJA/util/align.hpp b/include/RAJA/util/align.hpp
index 7103ecb152..23ccbee14c 100644
--- a/include/RAJA/util/align.hpp
+++ b/include/RAJA/util/align.hpp
@@ -32,16 +32,20 @@ void* align(size_t alignment, size_t size, void*& ptr, size_t& space)
 {
 
 #ifdef RAJA_COMPILER_MSVC
-#pragma warning( disable : 4146 )  // Force msvc to ignore subtracting from signed number warning
+#pragma warning(disable : 4146)  // Force msvc to ignore subtracting from signed
+                                 // number warning
 #endif
   void* r = nullptr;
-  if (size <= space) {
+  if (size <= space)
+  {
     char* p1 = static_cast<char*>(ptr);
     char* p2 = reinterpret_cast<char*>(
-        reinterpret_cast<size_t>(p1 + (static_cast<ptrdiff_t>(alignment) - 1)) & -alignment);
+        reinterpret_cast<size_t>(p1 + (static_cast<ptrdiff_t>(alignment) - 1)) &
+        -alignment);
     size_t d = static_cast<size_t>(p2 - p1);
-    if (d <= space - size) {
-      r = p2;
+    if (d <= space - size)
+    {
+      r   = p2;
       ptr = r;
       space -= d;
     }
@@ -49,9 +53,9 @@ void* align(size_t alignment, size_t size, void*& ptr, size_t& space)
   return r;
 
 #ifdef RAJA_COMPILER_MSVC
-#pragma warning( default : 4146 )  // Force msvc to ignore subtracting from signed number warning
+#pragma warning(default : 4146)  // Force msvc to ignore subtracting from signed
+                                 // number warning
 #endif
-
 }
 
 }  // end namespace RAJA
diff --git a/include/RAJA/util/basic_mempool.hpp b/include/RAJA/util/basic_mempool.hpp
index f0208ccbd3..4c0dcebc0a 100644
--- a/include/RAJA/util/basic_mempool.hpp
+++ b/include/RAJA/util/basic_mempool.hpp
@@ -54,27 +54,28 @@ namespace detail
 class MemoryArena
 {
 public:
-  using free_type = std::map<void*, void*>;
+  using free_type       = std::map<void*, void*>;
   using free_value_type = typename free_type::value_type;
-  using used_type = std::map<void*, void*>;
+  using used_type       = std::map<void*, void*>;
   using used_value_type = typename used_type::value_type;
 
   MemoryArena(void* ptr, size_t size)
-    : m_allocation{ ptr, static_cast<char*>(ptr)+size },
-      m_free_space(),
-      m_used_space()
+      : m_allocation {ptr, static_cast<char*>(ptr) + size},
+        m_free_space(),
+        m_used_space()
   {
-     m_free_space[ptr] = static_cast<char*>(ptr)+size ;
-    if (m_allocation.begin == nullptr) {
+    m_free_space[ptr] = static_cast<char*>(ptr) + size;
+    if (m_allocation.begin == nullptr)
+    {
       fprintf(stderr, "Attempt to create MemoryArena with no memory");
       std::abort();
     }
   }
 
-  MemoryArena(MemoryArena const&) = delete;
+  MemoryArena(MemoryArena const&)            = delete;
   MemoryArena& operator=(MemoryArena const&) = delete;
 
-  MemoryArena(MemoryArena&&) = default;
+  MemoryArena(MemoryArena&&)            = default;
   MemoryArena& operator=(MemoryArena&&) = default;
 
   size_t capacity()
@@ -90,21 +91,22 @@ class MemoryArena
   void* get(size_t nbytes, size_t alignment)
   {
     void* ptr_out = nullptr;
-    if (capacity() >= nbytes) {
+    if (capacity() >= nbytes)
+    {
       free_type::iterator end = m_free_space.end();
-      for (free_type::iterator iter = m_free_space.begin(); iter != end;
-           ++iter) {
+      for (free_type::iterator iter = m_free_space.begin(); iter != end; ++iter)
+      {
 
         void* adj_ptr = iter->first;
         size_t cap =
             static_cast<char*>(iter->second) - static_cast<char*>(adj_ptr);
 
-        if (::RAJA::align(alignment, nbytes, adj_ptr, cap)) {
+        if (::RAJA::align(alignment, nbytes, adj_ptr, cap))
+        {
 
           ptr_out = adj_ptr;
 
-          remove_free_chunk(iter,
-                            adj_ptr,
+          remove_free_chunk(iter, adj_ptr,
                             static_cast<char*>(adj_ptr) + nbytes);
 
           add_used_chunk(adj_ptr, static_cast<char*>(adj_ptr) + nbytes);
@@ -118,29 +120,35 @@ class MemoryArena
 
   bool give(void* ptr)
   {
-    if (m_allocation.begin <= ptr && ptr < m_allocation.end) {
+    if (m_allocation.begin <= ptr && ptr < m_allocation.end)
+    {
 
       used_type::iterator found = m_used_space.find(ptr);
 
-      if (found != m_used_space.end()) {
+      if (found != m_used_space.end())
+      {
 
         add_free_chunk(found->first, found->second);
 
         m_used_space.erase(found);
-
-      } else {
+      }
+      else
+      {
         fprintf(stderr, "Invalid free %p", ptr);
         std::abort();
       }
 
       return true;
-    } else {
+    }
+    else
+    {
       return false;
     }
   }
 
 private:
-  struct memory_chunk {
+  struct memory_chunk
+  {
     void* begin;
     void* end;
   };
@@ -152,19 +160,23 @@ class MemoryArena
     free_type::iterator next = m_free_space.lower_bound(begin);
 
     // check if prev exists
-    if (next != m_free_space.begin()) {
+    if (next != m_free_space.begin())
+    {
       // check if prev can cover [begin, end)
       free_type::iterator prev = next;
       --prev;
-      if (prev->second == begin) {
+      if (prev->second == begin)
+      {
         // extend prev to cover [begin, end)
         prev->second = end;
 
         // check if prev can cover next too
-        if (next != invl) {
+        if (next != invl)
+        {
           assert(next->first != begin);
 
-          if (next->first == end) {
+          if (next->first == end)
+          {
             // extend prev to cover next too
             prev->second = next->second;
 
@@ -176,12 +188,14 @@ class MemoryArena
       }
     }
 
-    if (next != invl) {
+    if (next != invl)
+    {
       assert(next->first != begin);
 
-      if (next->first == end) {
+      if (next->first == end)
+      {
         // extend next to cover [begin, end)
-        m_free_space.insert(next, free_value_type{begin, next->second});
+        m_free_space.insert(next, free_value_type {begin, next->second});
         m_free_space.erase(next);
 
         return;
@@ -190,38 +204,42 @@ class MemoryArena
 
     // no free space adjacent to this chunk, add seperate free chunk [begin,
     // end)
-    m_free_space.insert(next, free_value_type{begin, end});
+    m_free_space.insert(next, free_value_type {begin, end});
   }
 
   void remove_free_chunk(free_type::iterator iter, void* begin, void* end)
   {
 
-    void* ptr = iter->first;
+    void* ptr     = iter->first;
     void* ptr_end = iter->second;
 
     // fixup m_free_space, shrinking and adding chunks as needed
-    if (ptr != begin) {
+    if (ptr != begin)
+    {
 
       // shrink end of current free region to [ptr, begin)
       iter->second = begin;
 
-      if (end != ptr_end) {
+      if (end != ptr_end)
+      {
 
         // insert free region [end, ptr_end) after current free region
         free_type::iterator next = iter;
         ++next;
-        m_free_space.insert(next, free_value_type{end, ptr_end});
+        m_free_space.insert(next, free_value_type {end, ptr_end});
       }
-
-    } else if (end != ptr_end) {
+    }
+    else if (end != ptr_end)
+    {
 
       // shrink beginning of current free region to [end, ptr_end)
       free_type::iterator next = iter;
       ++next;
-      m_free_space.insert(next, free_value_type{end, ptr_end});
+      m_free_space.insert(next, free_value_type {end, ptr_end});
       m_free_space.erase(iter);
-
-    } else {
+    }
+    else
+    {
 
       // can not reuse current region, erase
       m_free_space.erase(iter);
@@ -231,7 +249,7 @@ class MemoryArena
   void add_used_chunk(void* begin, void* end)
   {
     // simply inserts a chunk of memory into used_space
-    m_used_space.insert(used_value_type{begin, end});
+    m_used_space.insert(used_value_type {begin, end});
   }
 
   memory_chunk m_allocation;
@@ -241,7 +259,6 @@ class MemoryArena
 
 } /* end namespace detail */
 
-
 /*! \class MemPool
  ******************************************************************************
  *
@@ -282,7 +299,7 @@ class MemoryArena
  *
  ******************************************************************************
  */
-template <typename allocator_t>
+template<typename allocator_t>
 class MemPool
 {
 public:
@@ -290,16 +307,17 @@ class MemPool
 
   static inline MemPool<allocator_t>& getInstance()
   {
-    static MemPool<allocator_t> pool{};
+    static MemPool<allocator_t> pool {};
     return pool;
   }
 
   static const size_t default_default_arena_size = 32ull * 1024ull * 1024ull;
 
   MemPool()
-      : m_arenas(), m_default_arena_size(default_default_arena_size), m_alloc()
-  {
-  }
+      : m_arenas(),
+        m_default_arena_size(default_default_arena_size),
+        m_alloc()
+  {}
 
   ~MemPool()
   {
@@ -308,7 +326,6 @@ class MemPool
     // So no more cuda calls here
   }
 
-
   /// Free all backing allocations, even if they are currently in use
   void free_chunks()
   {
@@ -316,7 +333,8 @@ class MemPool
     lock_guard<omp::mutex> lock(m_mutex);
 #endif
 
-    while (!m_arenas.empty()) {
+    while (!m_arenas.empty())
+    {
       void* allocation_ptr = m_arenas.front().get_allocation();
       m_alloc.free(allocation_ptr);
       m_arenas.pop_front();
@@ -338,34 +356,38 @@ class MemPool
     lock_guard<omp::mutex> lock(m_mutex);
 #endif
 
-    size_t prev_size = m_default_arena_size;
+    size_t prev_size     = m_default_arena_size;
     m_default_arena_size = new_size;
     return prev_size;
   }
 
-  template <typename T>
+  template<typename T>
   T* malloc(size_t nTs, size_t alignment = alignof(T))
   {
 #if defined(RAJA_ENABLE_OPENMP)
     lock_guard<omp::mutex> lock(m_mutex);
 #endif
 
-    const size_t size = nTs * sizeof(T);
-    void* ptr = nullptr;
+    const size_t size                  = nTs * sizeof(T);
+    void* ptr                          = nullptr;
     arena_container_type::iterator end = m_arenas.end();
     for (arena_container_type::iterator iter = m_arenas.begin(); iter != end;
-         ++iter) {
+         ++iter)
+    {
       ptr = iter->get(size, alignment);
-      if (ptr != nullptr) {
+      if (ptr != nullptr)
+      {
         break;
       }
     }
 
-    if (ptr == nullptr) {
+    if (ptr == nullptr)
+    {
       const size_t alloc_size =
           std::max(size + alignment, m_default_arena_size);
       void* arena_ptr = m_alloc.malloc(alloc_size);
-      if (arena_ptr != nullptr) {
+      if (arena_ptr != nullptr)
+      {
         m_arenas.emplace_front(arena_ptr, alloc_size);
         ptr = m_arenas.front().get(size, alignment);
       }
@@ -380,16 +402,19 @@ class MemPool
     lock_guard<omp::mutex> lock(m_mutex);
 #endif
 
-    void* ptr = const_cast<void*>(cptr);
+    void* ptr                          = const_cast<void*>(cptr);
     arena_container_type::iterator end = m_arenas.end();
     for (arena_container_type::iterator iter = m_arenas.begin(); iter != end;
-         ++iter) {
-      if (iter->give(ptr)) {
+         ++iter)
+    {
+      if (iter->give(ptr))
+      {
         ptr = nullptr;
         break;
       }
     }
-    if (ptr != nullptr) {
+    if (ptr != nullptr)
+    {
       fprintf(stderr, "Unknown pointer %p", ptr);
     }
   }
@@ -407,7 +432,8 @@ class MemPool
 };
 
 //! example allocator for basic_mempool using malloc/free
-struct generic_allocator {
+struct generic_allocator
+{
 
   // returns a valid pointer on success, nullptr on failure
   void* malloc(size_t nbytes) { return std::malloc(nbytes); }
diff --git a/include/RAJA/util/concepts.hpp b/include/RAJA/util/concepts.hpp
index 4372993949..fa8f71c27d 100644
--- a/include/RAJA/util/concepts.hpp
+++ b/include/RAJA/util/concepts.hpp
@@ -32,19 +32,19 @@ namespace concepts
 {
 using namespace camp::concepts;
 
-template <typename From, typename To>
+template<typename From, typename To>
 struct ConvertibleTo
-  : DefineConcept(::RAJA::concepts::convertible_to<To>(camp::val<From>())) {
-};
+    : DefineConcept(::RAJA::concepts::convertible_to<To>(camp::val<From>()))
+{};
 
-}
+}  // namespace concepts
 
 namespace type_traits
 {
 using namespace camp::type_traits;
 
 DefineTypeTraitFromConcept(convertible_to, concepts::ConvertibleTo);
-}
+}  // namespace type_traits
 
 }  // end namespace RAJA
 
diff --git a/include/RAJA/util/for_each.hpp b/include/RAJA/util/for_each.hpp
index 25783b2a0a..2b907ed370 100644
--- a/include/RAJA/util/for_each.hpp
+++ b/include/RAJA/util/for_each.hpp
@@ -39,10 +39,12 @@ namespace detail
 // runtime loop applying func to each element in the range in order
 RAJA_SUPPRESS_HD_WARN
 template<typename Iter, typename UnaryFunc>
-RAJA_HOST_DEVICE RAJA_INLINE
-UnaryFunc for_each(Iter begin, Iter end, UnaryFunc func)
+RAJA_HOST_DEVICE RAJA_INLINE UnaryFunc for_each(Iter begin,
+                                                Iter end,
+                                                UnaryFunc func)
 {
-  for (; begin != end; ++begin) {
+  for (; begin != end; ++begin)
+  {
     func(*begin);
   }
 
@@ -51,12 +53,12 @@ UnaryFunc for_each(Iter begin, Iter end, UnaryFunc func)
 
 // compile time expansion applying func to a each type in the list in order
 RAJA_SUPPRESS_HD_WARN
-template <typename UnaryFunc, typename... Ts>
-RAJA_HOST_DEVICE RAJA_INLINE
-UnaryFunc for_each_type(camp::list<Ts...> const&, UnaryFunc func)
+template<typename UnaryFunc, typename... Ts>
+RAJA_HOST_DEVICE RAJA_INLINE UnaryFunc for_each_type(camp::list<Ts...> const&,
+                                                     UnaryFunc func)
 {
   // braced init lists are evaluated in order
-  int seq_unused_array[] = {0, (func(Ts{}), 0)...};
+  int seq_unused_array[] = {0, (func(Ts {}), 0)...};
   RAJA_UNUSED_VAR(seq_unused_array);
 
   return func;
@@ -64,9 +66,10 @@ UnaryFunc for_each_type(camp::list<Ts...> const&, UnaryFunc func)
 
 // compile time expansion applying func to a each type in the tuple in order
 RAJA_SUPPRESS_HD_WARN
-template <typename Tuple, typename UnaryFunc, camp::idx_t... Is>
-RAJA_HOST_DEVICE RAJA_INLINE
-UnaryFunc for_each_tuple(Tuple&& t, UnaryFunc func, camp::idx_seq<Is...>)
+template<typename Tuple, typename UnaryFunc, camp::idx_t... Is>
+RAJA_HOST_DEVICE RAJA_INLINE UnaryFunc for_each_tuple(Tuple&& t,
+                                                      UnaryFunc func,
+                                                      camp::idx_seq<Is...>)
 {
   using camp::get;
   // braced init lists are evaluated in order
@@ -78,16 +81,15 @@ UnaryFunc for_each_tuple(Tuple&& t, UnaryFunc func, camp::idx_seq<Is...>)
 
 }  // namespace detail
 
-
 /*!
   \brief Apply func to all the elements in the given range in order
   using a sequential for loop in O(N) operations and O(1) extra memory
     see https://en.cppreference.com/w/cpp/algorithm/for_each
 */
 RAJA_SUPPRESS_HD_WARN
-template <typename Container, typename UnaryFunc>
+template<typename Container, typename UnaryFunc>
 RAJA_HOST_DEVICE RAJA_INLINE
-concepts::enable_if_t<UnaryFunc, type_traits::is_range<Container>>
+    concepts::enable_if_t<UnaryFunc, type_traits::is_range<Container>>
     for_each(Container&& c, UnaryFunc func)
 {
   using std::begin;
@@ -101,24 +103,24 @@ concepts::enable_if_t<UnaryFunc, type_traits::is_range<Container>>
   using a compile-time expansion in O(N) operations and O(1) extra memory
 */
 RAJA_SUPPRESS_HD_WARN
-template <typename UnaryFunc, typename... Ts>
-RAJA_HOST_DEVICE RAJA_INLINE
-UnaryFunc for_each_type(camp::list<Ts...> const& c, UnaryFunc func)
+template<typename UnaryFunc, typename... Ts>
+RAJA_HOST_DEVICE RAJA_INLINE UnaryFunc for_each_type(camp::list<Ts...> const& c,
+                                                     UnaryFunc func)
 {
   return detail::for_each_type(c, std::move(func));
 }
 
 /*!
-  \brief Apply func to each object in the given tuple or tuple like type in order
-  using a compile-time expansion in O(N) operations and O(1) extra memory
+  \brief Apply func to each object in the given tuple or tuple like type in
+  order using a compile-time expansion in O(N) operations and O(1) extra memory
 */
 RAJA_SUPPRESS_HD_WARN
-template <typename Tuple, typename UnaryFunc>
-RAJA_HOST_DEVICE RAJA_INLINE
-UnaryFunc for_each_tuple(Tuple&& t, UnaryFunc func)
+template<typename Tuple, typename UnaryFunc>
+RAJA_HOST_DEVICE RAJA_INLINE UnaryFunc for_each_tuple(Tuple&& t, UnaryFunc func)
 {
-  return detail::for_each_tuple(std::forward<Tuple>(t), std::move(func),
-      camp::make_idx_seq_t<std::tuple_size<camp::decay<Tuple>>::value>{});
+  return detail::for_each_tuple(
+      std::forward<Tuple>(t), std::move(func),
+      camp::make_idx_seq_t<std::tuple_size<camp::decay<Tuple>>::value> {});
 }
 
 }  // namespace RAJA
diff --git a/include/RAJA/util/macros.hpp b/include/RAJA/util/macros.hpp
index 9ddb5bebb7..a4275bbf69 100644
--- a/include/RAJA/util/macros.hpp
+++ b/include/RAJA/util/macros.hpp
@@ -33,16 +33,16 @@
 // We need a better solution than this as it is a pain to manage
 // this stuff in an application.
 //
-#if (defined(RAJA_ENABLE_CUDA) && defined(__CUDA_ARCH__)) \
-  || (defined(RAJA_ENABLE_HIP) && defined(__HIP_DEVICE_COMPILE__)) \
-  || (defined(RAJA_ENABLE_SYCL) && defined(__SYCL_DEVICE_ONLY__))
+#if (defined(RAJA_ENABLE_CUDA) && defined(__CUDA_ARCH__)) ||                   \
+    (defined(RAJA_ENABLE_HIP) && defined(__HIP_DEVICE_COMPILE__)) ||           \
+    (defined(RAJA_ENABLE_SYCL) && defined(__SYCL_DEVICE_ONLY__))
 #define RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE
 #endif
 
 #if defined(RAJA_ENABLE_CUDA) && defined(__CUDACC__)
 #define RAJA_HOST_DEVICE __host__ __device__
-#define RAJA_DEVICE __device__
-#define RAJA_HOST __host__
+#define RAJA_DEVICE      __device__
+#define RAJA_HOST        __host__
 
 #if defined(RAJA_ENABLE_CLANG_CUDA)
 #define RAJA_SUPPRESS_HD_WARN
@@ -52,8 +52,8 @@
 
 #elif defined(RAJA_ENABLE_HIP) && defined(__HIPCC__)
 #define RAJA_HOST_DEVICE __host__ __device__
-#define RAJA_DEVICE __device__
-#define RAJA_HOST __host__
+#define RAJA_DEVICE      __device__
+#define RAJA_HOST        __host__
 #define RAJA_SUPPRESS_HD_WARN
 
 #define RAJA_USE_HIP_INTRINSICS
@@ -114,10 +114,9 @@
  * \endcode
  *******************************************************************************
  */
-template <typename... T>
-RAJA_HOST_DEVICE RAJA_INLINE void RAJA_UNUSED_VAR(T &&...) noexcept
-{
-}
+template<typename... T>
+RAJA_HOST_DEVICE RAJA_INLINE void RAJA_UNUSED_VAR(T&&...) noexcept
+{}
 
 /*!
  * \def RAJA_STRINGIFY_HELPER(x)
@@ -133,7 +132,7 @@ RAJA_HOST_DEVICE RAJA_INLINE void RAJA_UNUSED_VAR(T &&...) noexcept
  */
 #define RAJA_STRINGIFY_MACRO(x) RAJA_STRINGIFY_HELPER(x)
 
-#define RAJA_DIVIDE_CEILING_INT(dividend, divisor) \
+#define RAJA_DIVIDE_CEILING_INT(dividend, divisor)                             \
   (((dividend) + (divisor)-1) / (divisor))
 
 /*!
@@ -141,27 +140,26 @@ RAJA_HOST_DEVICE RAJA_INLINE void RAJA_UNUSED_VAR(T &&...) noexcept
  * Used in forall and launch
  */
 #if defined(RAJA_ENABLE_OPENMP)
-#define RAJA_OMP_DECLARE_REDUCTION_COMBINE \
-      _Pragma(" omp declare reduction( combine \
+#define RAJA_OMP_DECLARE_REDUCTION_COMBINE                                     \
+  _Pragma(" omp declare reduction( combine \
         : typename std::remove_reference<decltype(f_params)>::type \
-        : RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(omp_out, omp_in) ) ")\
-        //initializer(omp_priv = omp_in) ")
+        : RAJA::expt::ParamMultiplexer::combine<EXEC_POL>(omp_out, omp_in) ) ")  // initializer(omp_priv = omp_in) ")
 #endif
 
 
 RAJA_HOST_DEVICE
-inline void RAJA_ABORT_OR_THROW(const char *str)
+inline void RAJA_ABORT_OR_THROW(const char* str)
 {
 #if defined(__SYCL_DEVICE_ONLY__)
-  //segfault here ran into linking problems
-  *((volatile char *)0) = 0;  // write to address 0
+  // segfault here ran into linking problems
+  *((volatile char*)0) = 0;  // write to address 0
 #else
-  printf ( "%s\n", str );
+  printf("%s\n", str);
 #if defined(RAJA_ENABLE_TARGET_OPENMP) && (_OPENMP >= 201511)
   // seg faulting here instead of calling std::abort for omp target
-  *((volatile char *)0) = 0;  // write to address 0
+  *((volatile char*)0) = 0;  // write to address 0
 #elif defined(__CUDA_ARCH__)
-  asm ("trap;");
+  asm("trap;");
 
 #elif defined(__HIP_DEVICE_COMPILE__)
   abort();
@@ -169,10 +167,11 @@ inline void RAJA_ABORT_OR_THROW(const char *str)
 #else
 #ifdef RAJA_COMPILER_MSVC
   fflush(stdout);
-  char *value;
+  char* value;
   size_t len;
   bool no_except = false;
-  if(_dupenv_s(&value, &len, "RAJA_NO_EXCEPT") == 0 && value != nullptr){
+  if (_dupenv_s(&value, &len, "RAJA_NO_EXCEPT") == 0 && value != nullptr)
+  {
     no_except = true;
     free(value);
   }
@@ -182,9 +181,12 @@ inline void RAJA_ABORT_OR_THROW(const char *str)
 #endif
 
   fflush(stdout);
-  if (no_except) {
+  if (no_except)
+  {
     std::abort();
-  } else {
+  }
+  else
+  {
     throw std::runtime_error(str);
   }
 #endif
@@ -202,7 +204,7 @@ inline void RAJA_ABORT_OR_THROW(const char *str)
  */
 
 #if (__cplusplus >= 201402L)
-#define RAJA_HAS_CXX14 1
+#define RAJA_HAS_CXX14                    1
 #define RAJA_HAS_CXX_ATTRIBUTE_DEPRECATED 1
 #elif defined(__has_cpp_attribute)
 #if __has_cpp_attribute(deprecated)
@@ -212,7 +214,7 @@ inline void RAJA_ABORT_OR_THROW(const char *str)
 
 #if defined(RAJA_HAS_CXX_ATTRIBUTE_DEPRECATED)
 // When using a C++14 compiler, use the standard-specified deprecated attribute
-#define RAJA_DEPRECATE(Msg) [[deprecated(Msg)]]
+#define RAJA_DEPRECATE(Msg)       [[deprecated(Msg)]]
 #define RAJA_DEPRECATE_ALIAS(Msg) [[deprecated(Msg)]]
 
 #elif defined(_MSC_VER)
diff --git a/include/RAJA/util/math.hpp b/include/RAJA/util/math.hpp
index 99d7bc192e..9bfb201024 100644
--- a/include/RAJA/util/math.hpp
+++ b/include/RAJA/util/math.hpp
@@ -34,14 +34,14 @@ namespace RAJA
     For zero or negative n return 0
 
 */
-template < typename T,
-           std::enable_if_t<std::is_integral<T>::value>* = nullptr >
-RAJA_HOST_DEVICE RAJA_INLINE
-constexpr T log2(T n) noexcept
+template<typename T, std::enable_if_t<std::is_integral<T>::value>* = nullptr>
+RAJA_HOST_DEVICE RAJA_INLINE constexpr T log2(T n) noexcept
 {
   T result = 0;
-  if (n > 0) {
-    while(n >>= 1) {
+  if (n > 0)
+  {
+    while (n >>= 1)
+    {
       ++result;
     }
   }
@@ -57,13 +57,12 @@ constexpr T log2(T n) noexcept
         if n is not a power of 2, return the next greater power of 2
       if n is negative, return 0
 */
-template < typename T,
-           std::enable_if_t<std::is_integral<T>::value>* = nullptr >
-RAJA_HOST_DEVICE RAJA_INLINE
-constexpr T next_pow2(T n) noexcept
+template<typename T, std::enable_if_t<std::is_integral<T>::value>* = nullptr>
+RAJA_HOST_DEVICE RAJA_INLINE constexpr T next_pow2(T n) noexcept
 {
   --n;
-  for (size_t s = 1; s < CHAR_BIT*sizeof(T); s *= 2) {
+  for (size_t s = 1; s < CHAR_BIT * sizeof(T); s *= 2)
+  {
     n |= n >> s;
   }
   ++n;
@@ -71,7 +70,8 @@ constexpr T next_pow2(T n) noexcept
 }
 
 /*!
-    \brief "round down" to the largest power of 2 that is less than or equal to n
+    \brief "round down" to the largest power of 2 that is less than or equal to
+   n
 
     For an integer n,
       if n is negative, return 0
@@ -79,13 +79,12 @@ constexpr T next_pow2(T n) noexcept
         if n is a power of 2, return n
         else return the largest power of 2 that is less than n
 */
-template < typename T,
-           std::enable_if_t<std::is_integral<T>::value>* = nullptr >
-RAJA_HOST_DEVICE RAJA_INLINE
-constexpr T prev_pow2(T n) noexcept
+template<typename T, std::enable_if_t<std::is_integral<T>::value>* = nullptr>
+RAJA_HOST_DEVICE RAJA_INLINE constexpr T prev_pow2(T n) noexcept
 {
-  if ( n < 0 ) return 0;
-  for (size_t s = 1; s < CHAR_BIT*sizeof(T); s *= 2) {
+  if (n < 0) return 0;
+  for (size_t s = 1; s < CHAR_BIT * sizeof(T); s *= 2)
+  {
     n |= n >> s;
   }
   return n - (n >> 1);
@@ -94,12 +93,14 @@ constexpr T prev_pow2(T n) noexcept
 /*!
     \brief compute lhs mod rhs where lhs is non-negative and rhs is a power of 2
 */
-template < typename L, typename R,
-           std::enable_if_t<std::is_integral<L>::value && std::is_integral<R>::value>* = nullptr >
-RAJA_HOST_DEVICE RAJA_INLINE
-constexpr auto power_of_2_mod(L lhs, R rhs) noexcept
+template<typename L,
+         typename R,
+         std::enable_if_t<std::is_integral<L>::value &&
+                          std::is_integral<R>::value>* = nullptr>
+RAJA_HOST_DEVICE RAJA_INLINE constexpr auto power_of_2_mod(L lhs,
+                                                           R rhs) noexcept
 {
-  return lhs & (rhs-R(1));
+  return lhs & (rhs - R(1));
 }
 
 }  // namespace RAJA
diff --git a/include/RAJA/util/mutex.hpp b/include/RAJA/util/mutex.hpp
index a955b27915..bff1fe4a7e 100644
--- a/include/RAJA/util/mutex.hpp
+++ b/include/RAJA/util/mutex.hpp
@@ -39,10 +39,10 @@ class mutex
 
   mutex() { omp_init_lock(&m_lock); }
 
-  mutex(const mutex&) = delete;
-  mutex(mutex&&) = delete;
+  mutex(const mutex&)            = delete;
+  mutex(mutex&&)                 = delete;
   mutex& operator=(const mutex&) = delete;
-  mutex& operator=(mutex&&) = delete;
+  mutex& operator=(mutex&&)      = delete;
 
   void lock() { omp_set_lock(&m_lock); }
 
@@ -62,16 +62,16 @@ class mutex
 #endif  // closing endif for if defined(RAJA_ENABLE_OPENMP)
 
 //! class providing functionality of std::lock_guard
-template <typename mutex_type>
+template<typename mutex_type>
 class lock_guard
 {
 public:
   explicit lock_guard(mutex_type& m) : m_mutex(m) { m_mutex.lock(); }
 
-  lock_guard(const lock_guard&) = delete;
-  lock_guard(lock_guard&&) = delete;
+  lock_guard(const lock_guard&)            = delete;
+  lock_guard(lock_guard&&)                 = delete;
   lock_guard& operator=(const lock_guard&) = delete;
-  lock_guard& operator=(lock_guard&&) = delete;
+  lock_guard& operator=(lock_guard&&)      = delete;
 
   ~lock_guard() { m_mutex.unlock(); }
 
diff --git a/include/RAJA/util/plugins.hpp b/include/RAJA/util/plugins.hpp
index d5f42efde0..248a8e8d6b 100644
--- a/include/RAJA/util/plugins.hpp
+++ b/include/RAJA/util/plugins.hpp
@@ -18,103 +18,88 @@
 #include "RAJA/util/KokkosPluginLoader.hpp"
 #endif
 
-namespace RAJA {
-namespace util {
+namespace RAJA
+{
+namespace util
+{
 
-template <typename T>
-RAJA_INLINE auto trigger_updates_before(T&& item)
-  -> typename std::remove_reference<T>::type
+template<typename T>
+RAJA_INLINE auto trigger_updates_before(T&& item) ->
+    typename std::remove_reference<T>::type
 {
   return item;
 }
 
 RAJA_INLINE
-void
-callPreCapturePlugins(const PluginContext& p)
+void callPreCapturePlugins(const PluginContext& p)
 {
-  for (auto plugin = PluginRegistry::begin();
-      plugin != PluginRegistry::end();
-      ++plugin)
+  for (auto plugin = PluginRegistry::begin(); plugin != PluginRegistry::end();
+       ++plugin)
   {
     (*plugin).get()->preCapture(p);
   }
 }
 
 RAJA_INLINE
-void
-callPostCapturePlugins(const PluginContext& p)
+void callPostCapturePlugins(const PluginContext& p)
 {
-  for (auto plugin = PluginRegistry::begin();
-      plugin != PluginRegistry::end();
-      ++plugin)
+  for (auto plugin = PluginRegistry::begin(); plugin != PluginRegistry::end();
+       ++plugin)
   {
     (*plugin).get()->postCapture(p);
   }
 }
 
 RAJA_INLINE
-void
-callPreLaunchPlugins(const PluginContext& p)
+void callPreLaunchPlugins(const PluginContext& p)
 {
-  for (auto plugin = PluginRegistry::begin();
-      plugin != PluginRegistry::end();
-      ++plugin)
+  for (auto plugin = PluginRegistry::begin(); plugin != PluginRegistry::end();
+       ++plugin)
   {
     (*plugin).get()->preLaunch(p);
   }
 }
 
 RAJA_INLINE
-void
-callPostLaunchPlugins(const PluginContext& p)
+void callPostLaunchPlugins(const PluginContext& p)
 {
-  for (auto plugin = PluginRegistry::begin();
-      plugin != PluginRegistry::end();
-      ++plugin)
+  for (auto plugin = PluginRegistry::begin(); plugin != PluginRegistry::end();
+       ++plugin)
   {
     (*plugin).get()->postLaunch(p);
   }
 }
 
 RAJA_INLINE
-void
-callInitPlugins(const PluginOptions p)
+void callInitPlugins(const PluginOptions p)
 {
-  for (auto plugin = PluginRegistry::begin(); 
-      plugin != PluginRegistry::end();
-      ++plugin)
+  for (auto plugin = PluginRegistry::begin(); plugin != PluginRegistry::end();
+       ++plugin)
   {
     (*plugin).get()->init(p);
   }
 }
 
 RAJA_INLINE
-void
-init_plugins(const std::string& path)
-{   
+void init_plugins(const std::string& path)
+{
   callInitPlugins(make_options(path));
 }
 
 RAJA_INLINE
-void
-init_plugins()
-{   
-  callInitPlugins(make_options(""));
-}
+void init_plugins() { callInitPlugins(make_options("")); }
 
 RAJA_INLINE
-void
-finalize_plugins()
-{   
-  for (auto plugin = PluginRegistry::begin(); 
-    plugin != PluginRegistry::end();
-    ++plugin)
+void finalize_plugins()
+{
+  for (auto plugin = PluginRegistry::begin(); plugin != PluginRegistry::end();
+       ++plugin)
   {
     (*plugin).get()->finalize();
   }
 }
 
-} // closing brace for util namespace
-} // closing brace for RAJA namespace
+}  // namespace util
+}  // namespace RAJA
 
 #endif
diff --git a/include/RAJA/util/reduce.hpp b/include/RAJA/util/reduce.hpp
index 6d0c28f861..cb1cb4045e 100644
--- a/include/RAJA/util/reduce.hpp
+++ b/include/RAJA/util/reduce.hpp
@@ -41,31 +41,27 @@ namespace detail
 /*!
     \brief Reduce class that does a reduction with a left fold.
 */
-template <typename T, typename BinaryOp>
+template<typename T, typename BinaryOp>
 struct LeftFoldReduce
 {
-  RAJA_HOST_DEVICE RAJA_INLINE
-  constexpr explicit LeftFoldReduce(T init = BinaryOp::identity(),
-                                      BinaryOp op = BinaryOp{}) noexcept
-    : m_op(std::move(op))
-    , m_accumulated_value(std::move(init))
-  {
-
-  }
-
-  LeftFoldReduce(LeftFoldReduce const&) = delete;
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr explicit LeftFoldReduce(
+      T init      = BinaryOp::identity(),
+      BinaryOp op = BinaryOp {}) noexcept
+      : m_op(std::move(op)),
+        m_accumulated_value(std::move(init))
+  {}
+
+  LeftFoldReduce(LeftFoldReduce const&)            = delete;
   LeftFoldReduce& operator=(LeftFoldReduce const&) = delete;
-  LeftFoldReduce(LeftFoldReduce &&) = delete;
-  LeftFoldReduce& operator=(LeftFoldReduce &&) = delete;
+  LeftFoldReduce(LeftFoldReduce&&)                 = delete;
+  LeftFoldReduce& operator=(LeftFoldReduce&&)      = delete;
 
   ~LeftFoldReduce() = default;
 
-
   /*!
       \brief reset the combined value of the reducer to the identity
   */
-  RAJA_HOST_DEVICE RAJA_INLINE
-  void clear() noexcept
+  RAJA_HOST_DEVICE RAJA_INLINE void clear() noexcept
   {
     m_accumulated_value = BinaryOp::identity();
   }
@@ -73,8 +69,7 @@ struct LeftFoldReduce
   /*!
       \brief return the combined value and clear the reducer
   */
-  RAJA_HOST_DEVICE RAJA_INLINE
-  T get_and_clear()
+  RAJA_HOST_DEVICE RAJA_INLINE T get_and_clear()
   {
     T accumulated_value = std::move(m_accumulated_value);
 
@@ -86,17 +81,12 @@ struct LeftFoldReduce
   /*!
       \brief return the combined value
   */
-  RAJA_HOST_DEVICE RAJA_INLINE
-  T get()
-  {
-    return m_accumulated_value;
-  }
+  RAJA_HOST_DEVICE RAJA_INLINE T get() { return m_accumulated_value; }
 
   /*!
       \brief combine a value into the reducer
   */
-  RAJA_HOST_DEVICE RAJA_INLINE
-  void combine(T val)
+  RAJA_HOST_DEVICE RAJA_INLINE void combine(T val)
   {
     m_accumulated_value = m_op(std::move(m_accumulated_value), std::move(val));
   }
@@ -109,50 +99,49 @@ struct LeftFoldReduce
 /*!
     \brief Reduce class that does a reduction with a binary tree.
 */
-template <typename T, typename BinaryOp, typename SizeType = size_t,
-          SizeType t_num_levels = CHAR_BIT*sizeof(SizeType)>
+template<typename T,
+         typename BinaryOp,
+         typename SizeType     = size_t,
+         SizeType t_num_levels = CHAR_BIT * sizeof(SizeType)>
 struct BinaryTreeReduce
 {
   static_assert(std::is_unsigned<SizeType>::value, "SizeType must be unsigned");
-  static_assert(t_num_levels <= CHAR_BIT*sizeof(SizeType), "SizeType must be large enough to act at a bitset for num_levels");
+  static_assert(
+      t_num_levels <= CHAR_BIT * sizeof(SizeType),
+      "SizeType must be large enough to act at a bitset for num_levels");
 
   static constexpr SizeType num_levels = t_num_levels;
 
-  RAJA_HOST_DEVICE RAJA_INLINE
-  constexpr explicit BinaryTreeReduce(T init = BinaryOp::identity(),
-                                      BinaryOp op = BinaryOp{}) noexcept
-    : m_op(std::move(op))
+  RAJA_HOST_DEVICE RAJA_INLINE constexpr explicit BinaryTreeReduce(
+      T init      = BinaryOp::identity(),
+      BinaryOp op = BinaryOp {}) noexcept
+      : m_op(std::move(op))
   {
     combine(std::move(init));
   }
 
-  BinaryTreeReduce(BinaryTreeReduce const&) = delete;
+  BinaryTreeReduce(BinaryTreeReduce const&)            = delete;
   BinaryTreeReduce& operator=(BinaryTreeReduce const&) = delete;
-  BinaryTreeReduce(BinaryTreeReduce &&) = delete;
-  BinaryTreeReduce& operator=(BinaryTreeReduce &&) = delete;
-
-  RAJA_HOST_DEVICE RAJA_INLINE
-  ~BinaryTreeReduce()
-  {
-    clear();
-  }
+  BinaryTreeReduce(BinaryTreeReduce&&)                 = delete;
+  BinaryTreeReduce& operator=(BinaryTreeReduce&&)      = delete;
 
+  RAJA_HOST_DEVICE RAJA_INLINE ~BinaryTreeReduce() { clear(); }
 
   /*!
       \brief reset the combined value of the reducer to the identity
   */
-  RAJA_HOST_DEVICE RAJA_INLINE
-  void clear() noexcept
+  RAJA_HOST_DEVICE RAJA_INLINE void clear() noexcept
   {
     // destroy all values on the tree stack and reset count to 0
-    for (SizeType level = 0, mask = 1; m_count; ++level, mask <<= 1) {
+    for (SizeType level = 0, mask = 1; m_count; ++level, mask <<= 1)
+    {
 
-      if (m_count & mask) {
+      if (m_count & mask)
+      {
 
         get_value(level)->~T();
 
         m_count ^= mask;
-
       }
     }
   }
@@ -160,15 +149,16 @@ struct BinaryTreeReduce
   /*!
       \brief return the combined value and clear the reducer
   */
-  RAJA_HOST_DEVICE RAJA_INLINE
-  T get_and_clear()
+  RAJA_HOST_DEVICE RAJA_INLINE T get_and_clear()
   {
     // accumulate all values
     T value = BinaryOp::identity();
 
-    for (SizeType level = 0, mask = 1; m_count; ++level, mask <<= 1) {
+    for (SizeType level = 0, mask = 1; m_count; ++level, mask <<= 1)
+    {
 
-      if (m_count & mask) {
+      if (m_count & mask)
+      {
 
         value = m_op(std::move(value), std::move(*get_value(level)));
         get_value(level)->~T();
@@ -183,15 +173,17 @@ struct BinaryTreeReduce
   /*!
       \brief return the combined value
   */
-  RAJA_HOST_DEVICE RAJA_INLINE
-  T get()
+  RAJA_HOST_DEVICE RAJA_INLINE T get()
   {
     // accumulate all values
     T value = BinaryOp::identity();
 
-    for (SizeType count = m_count, level = 0, mask = 1; count; ++level, mask <<= 1) {
+    for (SizeType count = m_count, level = 0, mask = 1; count;
+         ++level, mask <<= 1)
+    {
 
-      if (count & mask) {
+      if (count & mask)
+      {
 
         value = m_op(std::move(value), *get_value(level));
 
@@ -205,20 +197,19 @@ struct BinaryTreeReduce
   /*!
       \brief combine a value into the reducer
   */
-  RAJA_HOST_DEVICE RAJA_INLINE
-  void combine(T value)
+  RAJA_HOST_DEVICE RAJA_INLINE void combine(T value)
   {
     // accumulate values and store in the first unused level found
     // clear values from used levels along the way
     SizeType level = 0;
-    for (SizeType mask = 1; m_count & mask; ++level, mask <<= 1) {
+    for (SizeType mask = 1; m_count & mask; ++level, mask <<= 1)
+    {
 
       value = m_op(std::move(*get_value(level)), std::move(value));
       get_value(level)->~T();
-
     }
 
-    new(get_storage(level)) T(std::move(value));
+    new (get_storage(level)) T(std::move(value));
 
     ++m_count;
   }
@@ -234,14 +225,12 @@ struct BinaryTreeReduce
   // values or is unused and has no value.
   std::aligned_storage_t<sizeof(T), alignof(T)> m_tree_stack[num_levels];
 
-  RAJA_HOST_DEVICE RAJA_INLINE
-  void* get_storage(SizeType level)
+  RAJA_HOST_DEVICE RAJA_INLINE void* get_storage(SizeType level)
   {
     return &m_tree_stack[level];
   }
 
-  RAJA_HOST_DEVICE RAJA_INLINE
-  T* get_value(SizeType level)
+  RAJA_HOST_DEVICE RAJA_INLINE T* get_value(SizeType level)
   {
 #if __cplusplus >= 201703L && !defined(RAJA_GPU_DEVICE_COMPILE_PASS_ACTIVE)
     // TODO: check that launder is supported in device code
@@ -252,31 +241,26 @@ struct BinaryTreeReduce
   }
 };
 
-
-template <typename T, typename BinaryOp>
-using HighAccuracyReduce = std::conditional_t<
-    RAJA::operators::is_fp_associative<T>::value,
-      BinaryTreeReduce<T, BinaryOp>,
-      LeftFoldReduce<T, BinaryOp>>;
-
+template<typename T, typename BinaryOp>
+using HighAccuracyReduce =
+    std::conditional_t<RAJA::operators::is_fp_associative<T>::value,
+                       BinaryTreeReduce<T, BinaryOp>,
+                       LeftFoldReduce<T, BinaryOp>>;
 
 /*!
     \brief Combine into a single value using a left fold with the given
            operation using O(N) operations and O(1) memory
 */
-template <typename Iter, typename T, typename BinaryOp>
-RAJA_HOST_DEVICE RAJA_INLINE
-T left_fold_reduce(Iter begin,
-                   Iter end,
-                   T init,
-                   BinaryOp op)
+template<typename Iter, typename T, typename BinaryOp>
+RAJA_HOST_DEVICE RAJA_INLINE T
+left_fold_reduce(Iter begin, Iter end, T init, BinaryOp op)
 {
   LeftFoldReduce<T, BinaryOp> reducer(std::move(init), std::move(op));
 
-  for (; begin != end; ++begin) {
+  for (; begin != end; ++begin)
+  {
 
     reducer.combine(*begin);
-
   }
 
   return reducer.get_and_clear();
@@ -289,44 +273,38 @@ T left_fold_reduce(Iter begin,
     This is more accurate than sequentially adding into a single value for
     floating point types.
 */
-template <typename Iter, typename T, typename BinaryOp>
-RAJA_HOST_DEVICE RAJA_INLINE
-T binary_tree_reduce(Iter begin,
-                     Iter end,
-                     T init,
-                     BinaryOp op)
+template<typename Iter, typename T, typename BinaryOp>
+RAJA_HOST_DEVICE RAJA_INLINE T
+binary_tree_reduce(Iter begin, Iter end, T init, BinaryOp op)
 {
   using std::distance;
   using SizeType = std::make_unsigned_t<decltype(distance(begin, end))>;
-  BinaryTreeReduce<T, BinaryOp, SizeType> reducer(std::move(init), std::move(op));
+  BinaryTreeReduce<T, BinaryOp, SizeType> reducer(std::move(init),
+                                                  std::move(op));
 
-  for (; begin != end; ++begin) {
+  for (; begin != end; ++begin)
+  {
 
     reducer.combine(*begin);
-
   }
 
   return reducer.get_and_clear();
 }
 
-
 /*!
     \brief reducer that uses a high accuracy implementation when round-off error
     is a concern, or a faster algorithm with it is not a concern
 */
-template <typename Iter, typename T, typename BinaryOp>
-RAJA_HOST_DEVICE RAJA_INLINE
-T high_accuracy_reduce(Iter begin,
-                        Iter end,
-                        T init,
-                        BinaryOp op)
+template<typename Iter, typename T, typename BinaryOp>
+RAJA_HOST_DEVICE RAJA_INLINE T
+high_accuracy_reduce(Iter begin, Iter end, T init, BinaryOp op)
 {
   HighAccuracyReduce<T, BinaryOp> reducer(std::move(init), std::move(op));
 
-  for (; begin != end; ++begin) {
+  for (; begin != end; ++begin)
+  {
 
     reducer.combine(*begin);
-
   }
 
   return reducer.get_and_clear();
@@ -339,19 +317,22 @@ T high_accuracy_reduce(Iter begin,
   using a left fold algorithm in O(N) operations and O(1) extra memory
     see https://en.cppreference.com/w/cpp/algorithm/accumulate
 */
-template <typename Container,
-          typename T = detail::ContainerVal<Container>,
-          typename BinaryOp = operators::plus<T>>
+template<typename Container,
+         typename T        = detail::ContainerVal<Container>,
+         typename BinaryOp = operators::plus<T>>
 RAJA_HOST_DEVICE RAJA_INLINE
-concepts::enable_if_t<T, type_traits::is_range<Container>>
-    accumulate(Container&& c, T init = BinaryOp::identity(), BinaryOp op = BinaryOp{})
+    concepts::enable_if_t<T, type_traits::is_range<Container>>
+    accumulate(Container&& c,
+               T init      = BinaryOp::identity(),
+               BinaryOp op = BinaryOp {})
 {
   using std::begin;
   using std::end;
   static_assert(type_traits::is_binary_function<BinaryOp, T, T, T>::value,
                 "BinaryOp must model BinaryFunction");
 
-  return detail::left_fold_reduce(begin(c), end(c), std::move(init), std::move(op));
+  return detail::left_fold_reduce(begin(c), end(c), std::move(init),
+                                  std::move(op));
 }
 
 /*!
@@ -359,19 +340,22 @@ concepts::enable_if_t<T, type_traits::is_range<Container>>
   using a binary tree algorithm in O(N) operations and O(lg(N)) extra memory
     see https://en.cppreference.com/w/cpp/algorithm/reduce
 */
-template <typename Container,
-          typename T = detail::ContainerVal<Container>,
-          typename BinaryOp = operators::plus<T>>
+template<typename Container,
+         typename T        = detail::ContainerVal<Container>,
+         typename BinaryOp = operators::plus<T>>
 RAJA_HOST_DEVICE RAJA_INLINE
-concepts::enable_if_t<T, type_traits::is_range<Container>>
-    binary_tree_reduce(Container&& c, T init = BinaryOp::identity(), BinaryOp op = BinaryOp{})
+    concepts::enable_if_t<T, type_traits::is_range<Container>>
+    binary_tree_reduce(Container&& c,
+                       T init      = BinaryOp::identity(),
+                       BinaryOp op = BinaryOp {})
 {
   using std::begin;
   using std::end;
   static_assert(type_traits::is_binary_function<BinaryOp, T, T, T>::value,
                 "BinaryOp must model BinaryFunction");
 
-  return detail::binary_tree_reduce(begin(c), end(c), std::move(init), std::move(op));
+  return detail::binary_tree_reduce(begin(c), end(c), std::move(init),
+                                    std::move(op));
 }
 
 /*!
@@ -380,19 +364,22 @@ concepts::enable_if_t<T, type_traits::is_range<Container>>
   concern
     see https://en.cppreference.com/w/cpp/algorithm/reduce
 */
-template <typename Container,
-          typename T = detail::ContainerVal<Container>,
-          typename BinaryOp = operators::plus<T>>
+template<typename Container,
+         typename T        = detail::ContainerVal<Container>,
+         typename BinaryOp = operators::plus<T>>
 RAJA_HOST_DEVICE RAJA_INLINE
-concepts::enable_if_t<T, type_traits::is_range<Container>>
-    high_accuracy_reduce(Container&& c, T init = BinaryOp::identity(), BinaryOp op = BinaryOp{})
+    concepts::enable_if_t<T, type_traits::is_range<Container>>
+    high_accuracy_reduce(Container&& c,
+                         T init      = BinaryOp::identity(),
+                         BinaryOp op = BinaryOp {})
 {
   using std::begin;
   using std::end;
   static_assert(type_traits::is_binary_function<BinaryOp, T, T, T>::value,
                 "BinaryOp must model BinaryFunction");
 
-  return detail::high_accuracy_reduce(begin(c), end(c), std::move(init), std::move(op));
+  return detail::high_accuracy_reduce(begin(c), end(c), std::move(init),
+                                      std::move(op));
 }
 
 }  // namespace RAJA
diff --git a/include/RAJA/util/resource.hpp b/include/RAJA/util/resource.hpp
index 28a476d951..0ca490869b 100644
--- a/include/RAJA/util/resource.hpp
+++ b/include/RAJA/util/resource.hpp
@@ -37,145 +37,213 @@
 namespace RAJA
 {
 
-  namespace resources
-  {
-  using namespace camp::resources;
+namespace resources
+{
+using namespace camp::resources;
 
-  template<typename e>
-  struct get_resource{
-    using type = camp::resources::Host;
-  };
+template<typename e>
+struct get_resource
+{
+  using type = camp::resources::Host;
+};
 
-  template<Platform>
-  struct get_resource_from_platform{
-    using type = camp::resources::Host;
-  };
+template<Platform>
+struct get_resource_from_platform
+{
+  using type = camp::resources::Host;
+};
 
-  template<typename ExecPol>
-  using resource_from_pol_t = typename get_resource_from_platform<detail::get_platform<ExecPol>::value>::type;
+template<typename ExecPol>
+using resource_from_pol_t = typename get_resource_from_platform<
+    detail::get_platform<ExecPol>::value>::type;
 
-  template<typename ExecPol>
-  constexpr resource_from_pol_t<ExecPol> get_default_resource() {
-    return resource_from_pol_t<ExecPol>::get_default();
-  }
+template<typename ExecPol>
+constexpr resource_from_pol_t<ExecPol> get_default_resource()
+{
+  return resource_from_pol_t<ExecPol>::get_default();
+}
 
 #if defined(RAJA_CUDA_ACTIVE)
-  template<>
-  struct get_resource_from_platform<Platform::cuda>{
-    using type = camp::resources::Cuda;
-  };
-
-  template<typename IterationMapping, typename IterationGetter,
-           typename Concretizer, size_t BLOCKS_PER_SM, bool Async>
-  struct get_resource<::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>>{
-    using type = camp::resources::Cuda;
-  };
-
-  template <bool Async, int num_threads, size_t BLOCKS_PER_SM>
-  struct get_resource<::RAJA::policy::cuda::cuda_launch_explicit_t<Async, num_threads, BLOCKS_PER_SM>>{
-    using type = camp::resources::Cuda;
-  };
-
-  template<typename ISetIter, typename IterationMapping, typename IterationGetter,
-           typename Concretizer, size_t BLOCKS_PER_SM, bool Async>
-  struct get_resource<ExecPolicy<ISetIter, ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping, IterationGetter, Concretizer, BLOCKS_PER_SM, Async>>>{
-    using type = camp::resources::Cuda;
-  };
+template<>
+struct get_resource_from_platform<Platform::cuda>
+{
+  using type = camp::resources::Cuda;
+};
+
+template<typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         size_t BLOCKS_PER_SM,
+         bool Async>
+struct get_resource<::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                             IterationGetter,
+                                                             Concretizer,
+                                                             BLOCKS_PER_SM,
+                                                             Async>>
+{
+  using type = camp::resources::Cuda;
+};
+
+template<bool Async, int num_threads, size_t BLOCKS_PER_SM>
+struct get_resource<::RAJA::policy::cuda::cuda_launch_explicit_t<Async,
+                                                                 num_threads,
+                                                                 BLOCKS_PER_SM>>
+{
+  using type = camp::resources::Cuda;
+};
+
+template<typename ISetIter,
+         typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         size_t BLOCKS_PER_SM,
+         bool Async>
+struct get_resource<
+    ExecPolicy<ISetIter,
+               ::RAJA::policy::cuda::cuda_exec_explicit<IterationMapping,
+                                                        IterationGetter,
+                                                        Concretizer,
+                                                        BLOCKS_PER_SM,
+                                                        Async>>>
+{
+  using type = camp::resources::Cuda;
+};
 #endif
 
 #if defined(RAJA_HIP_ACTIVE)
-  template<>
-  struct get_resource_from_platform<Platform::hip>{
-    using type = camp::resources::Hip;
-  };
-
-  template<typename IterationMapping, typename IterationGetter,
-           typename Concretizer, bool Async>
-  struct get_resource<::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>>{
-    using type = camp::resources::Hip;
-  };
-
-  template <bool Async, int num_threads>
-  struct get_resource<::RAJA::policy::hip::hip_launch_t<Async, num_threads>>{
-    using type = camp::resources::Hip;
-  };
-
-  template<typename ISetIter, typename IterationMapping, typename IterationGetter,
-           typename Concretizer, bool Async>
-  struct get_resource<ExecPolicy<ISetIter, ::RAJA::policy::hip::hip_exec<IterationMapping, IterationGetter, Concretizer, Async>>>{
-    using type = camp::resources::Hip;
-  };
+template<>
+struct get_resource_from_platform<Platform::hip>
+{
+  using type = camp::resources::Hip;
+};
+
+template<typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         bool Async>
+struct get_resource<
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async>>
+{
+  using type = camp::resources::Hip;
+};
+
+template<bool Async, int num_threads>
+struct get_resource<::RAJA::policy::hip::hip_launch_t<Async, num_threads>>
+{
+  using type = camp::resources::Hip;
+};
+
+template<typename ISetIter,
+         typename IterationMapping,
+         typename IterationGetter,
+         typename Concretizer,
+         bool Async>
+struct get_resource<ExecPolicy<
+    ISetIter,
+    ::RAJA::policy::hip::
+        hip_exec<IterationMapping, IterationGetter, Concretizer, Async>>>
+{
+  using type = camp::resources::Hip;
+};
 #endif
 
 #if defined(RAJA_SYCL_ACTIVE)
-  template<>
-  struct get_resource_from_platform<Platform::sycl>{
-    using type = camp::resources::Sycl;
-  };
-
-  template<size_t BlockSize, bool Async>
-  struct get_resource<::RAJA::policy::sycl::sycl_exec<BlockSize, Async>>{
-    using type = camp::resources::Sycl;
-  };
-
-  template <bool Async, int num_threads>
-  struct get_resource<::RAJA::policy::sycl::sycl_launch_t<Async, num_threads>>{
-    using type = camp::resources::Sycl;
-  };
-
-  template<typename ISetIter, size_t BlockSize, bool Async>
-  struct get_resource<ExecPolicy<ISetIter, ::RAJA::policy::sycl::sycl_exec<BlockSize, Async>>>{
-    using type = camp::resources::Sycl;
-  };
+template<>
+struct get_resource_from_platform<Platform::sycl>
+{
+  using type = camp::resources::Sycl;
+};
+
+template<size_t BlockSize, bool Async>
+struct get_resource<::RAJA::policy::sycl::sycl_exec<BlockSize, Async>>
+{
+  using type = camp::resources::Sycl;
+};
+
+template<bool Async, int num_threads>
+struct get_resource<::RAJA::policy::sycl::sycl_launch_t<Async, num_threads>>
+{
+  using type = camp::resources::Sycl;
+};
+
+template<typename ISetIter, size_t BlockSize, bool Async>
+struct get_resource<
+    ExecPolicy<ISetIter, ::RAJA::policy::sycl::sycl_exec<BlockSize, Async>>>
+{
+  using type = camp::resources::Sycl;
+};
 #endif
 
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
-  template<>
-  struct get_resource_from_platform<Platform::omp_target>{
-    using type = camp::resources::Omp;
-  };
-
-  template<>
-  struct get_resource<::RAJA::policy::omp::omp_target_parallel_for_exec_nt>{
-    using type = camp::resources::Omp;
-  };
-
-  template<size_t ThreadsPerTeam>
-  struct get_resource<::RAJA::policy::omp::omp_target_parallel_for_exec<ThreadsPerTeam>>{
-    using type = camp::resources::Omp;
-  };
-
-  template<typename ISetIter>
-  struct get_resource<ExecPolicy<ISetIter, ::RAJA::policy::omp::omp_target_parallel_for_exec_nt>>{
-    using type = camp::resources::Omp;
-  };
-
-  template<typename ISetIter, size_t ThreadsPerTeam>
-  struct get_resource<ExecPolicy<ISetIter, ::RAJA::policy::omp::omp_target_parallel_for_exec<ThreadsPerTeam>>>{
-    using type = camp::resources::Omp;
-  };
+template<>
+struct get_resource_from_platform<Platform::omp_target>
+{
+  using type = camp::resources::Omp;
+};
+
+template<>
+struct get_resource<::RAJA::policy::omp::omp_target_parallel_for_exec_nt>
+{
+  using type = camp::resources::Omp;
+};
+
+template<size_t ThreadsPerTeam>
+struct get_resource<
+    ::RAJA::policy::omp::omp_target_parallel_for_exec<ThreadsPerTeam>>
+{
+  using type = camp::resources::Omp;
+};
+
+template<typename ISetIter>
+struct get_resource<
+    ExecPolicy<ISetIter, ::RAJA::policy::omp::omp_target_parallel_for_exec_nt>>
+{
+  using type = camp::resources::Omp;
+};
+
+template<typename ISetIter, size_t ThreadsPerTeam>
+struct get_resource<ExecPolicy<
+    ISetIter,
+    ::RAJA::policy::omp::omp_target_parallel_for_exec<ThreadsPerTeam>>>
+{
+  using type = camp::resources::Omp;
+};
 #endif
 
-  } // end namespace resources
+}  // end namespace resources
+
+namespace type_traits
+{
+template<typename T>
+struct is_resource : std::false_type
+{};
 
-  namespace type_traits
-  {
-    template <typename T> struct is_resource : std::false_type {};
-    template <> struct is_resource<resources::Host> : std::true_type {};
+template<>
+struct is_resource<resources::Host> : std::true_type
+{};
 #if defined(RAJA_CUDA_ACTIVE)
-    template <> struct is_resource<resources::Cuda> : std::true_type {};
+template<>
+struct is_resource<resources::Cuda> : std::true_type
+{};
 #endif
 #if defined(RAJA_HIP_ACTIVE)
-    template <> struct is_resource<resources::Hip> : std::true_type {};
+template<>
+struct is_resource<resources::Hip> : std::true_type
+{};
 #endif
 #if defined(RAJA_SYCL_ACTIVE)
-    template <> struct is_resource<resources::Sycl> : std::true_type {};
+template<>
+struct is_resource<resources::Sycl> : std::true_type
+{};
 #endif
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
-    template <> struct is_resource<resources::Omp> : std::true_type {};
+template<>
+struct is_resource<resources::Omp> : std::true_type
+{};
 #endif
-  } // end namespace type_traits
+}  // end namespace type_traits
 
 }  // end namespace RAJA
 
-#endif //RAJA_resources_HPP#
+#endif  // RAJA_resources_HPP
diff --git a/include/RAJA/util/sort.hpp b/include/RAJA/util/sort.hpp
index bbec03dfe1..e392e848f3 100644
--- a/include/RAJA/util/sort.hpp
+++ b/include/RAJA/util/sort.hpp
@@ -39,38 +39,42 @@ namespace detail
     \brief unstable partition given range inplace using predicate function
     and using O(N) predicate evaluations and O(1) memory
 */
-template <typename Iter, typename Predicate>
-RAJA_HOST_DEVICE RAJA_INLINE
-Iter
-partition(Iter begin,
-          Iter end,
-          Predicate pred)
+template<typename Iter, typename Predicate>
+RAJA_HOST_DEVICE RAJA_INLINE Iter partition(Iter begin,
+                                            Iter end,
+                                            Predicate pred)
 {
   using ::RAJA::safe_iter_swap;
 
-  if (begin == end) {
+  if (begin == end)
+  {
     return begin;
   }
 
   // advance to first false
   Iter first_false = begin;
-  for (; first_false != end; ++first_false) {
+  for (; first_false != end; ++first_false)
+  {
 
-    if (!pred(first_false)) {
+    if (!pred(first_false))
+    {
       break;
     }
   }
 
   // return if none were false
-  if (first_false == end) {
+  if (first_false == end)
+  {
     return first_false;
   }
 
   // advance through rest of list to find the next true
-  for (Iter next_true = RAJA::next(first_false); next_true != end; ++next_true) {
+  for (Iter next_true = RAJA::next(first_false); next_true != end; ++next_true)
+  {
 
     // find the end of a range of falses [first_false, next_true)
-    if (pred(next_true)) {
+    if (pred(next_true))
+    {
 
       // shift the known range of falses forward
       // by swapping the true to the beginning of the range
@@ -86,34 +90,38 @@ partition(Iter begin,
     \brief stable insertion sort given range inplace using comparison function
     and using O(N^2) comparisons and O(1) memory
 */
-template <typename Iter, typename Compare>
-RAJA_HOST_DEVICE RAJA_INLINE
-void
-insertion_sort(Iter begin,
-               Iter end,
-               Compare comp)
+template<typename Iter, typename Compare>
+RAJA_HOST_DEVICE RAJA_INLINE void insertion_sort(Iter begin,
+                                                 Iter end,
+                                                 Compare comp)
 {
   using ::RAJA::safe_iter_swap;
 
-  if (begin == end) {
+  if (begin == end)
+  {
     return;
   }
 
   // for each unsorted item in the right side of the range
-  for (Iter next_unsorted = RAJA::next(begin); next_unsorted != end; ++next_unsorted) {
+  for (Iter next_unsorted = RAJA::next(begin); next_unsorted != end;
+       ++next_unsorted)
+  {
 
     // insert unsorted item into the sorted left side of the range
-    for (Iter to_insert = next_unsorted; to_insert != begin; --to_insert) {
+    for (Iter to_insert = next_unsorted; to_insert != begin; --to_insert)
+    {
 
       Iter next_sorted = RAJA::prev(to_insert);
 
       // compare with next item to left
-      if (comp(*to_insert, *next_sorted)) {
+      if (comp(*to_insert, *next_sorted))
+      {
 
         // swap down if should be before
         safe_iter_swap(next_sorted, to_insert);
-
-      } else {
+      }
+      else
+      {
 
         // stop if in correct position
         break;
@@ -125,20 +133,16 @@ insertion_sort(Iter begin,
 /*!
     \brief get number of strides for shell sort
 */
-RAJA_HOST_DEVICE RAJA_INLINE
-constexpr size_t num_shell_strides()
-{
-  return 39;
-}
+RAJA_HOST_DEVICE RAJA_INLINE constexpr size_t num_shell_strides() { return 39; }
 
 /*!
     \brief get strides for shell sort
 */
-RAJA_HOST_DEVICE RAJA_INLINE
-constexpr long long unsigned get_shell_stride(int i)
+RAJA_HOST_DEVICE RAJA_INLINE constexpr long long unsigned get_shell_stride(
+    int i)
 {
   using array_type = long long unsigned[num_shell_strides()];
-  return (array_type{
+  return (array_type {
       // strides from M. Ciura 2001
       1llu, 4llu, 10llu, 23llu, 57llu, 132llu, 301llu, 701llu, 1750llu,
       // extended up to 2^47 with strides[n] = floor(2.25*strides[n-1])
@@ -147,35 +151,36 @@ constexpr long long unsigned get_shell_stride(int i)
       149109795llu, 335497038llu, 754868335llu, 1698453753llu, 3821520944llu,
       8598422124llu, 19346449779llu, 43529512002llu, 97941402004llu,
       220368154509llu, 495828347645llu, 1115613782201llu, 2510131009952llu,
-      5647794772392llu, 12707538237882llu, 28591961035234llu, 64331912329276llu
-    })[i];
+      5647794772392llu, 12707538237882llu, 28591961035234llu,
+      64331912329276llu})[i];
 }
 
 /*!
     \brief unstable shell sort given range inplace using comparison function
     and using O(N^?) comparisons and O(1) memory
 */
-template <typename Iter, typename Compare>
-RAJA_HOST_DEVICE RAJA_INLINE
-void
-shell_sort(Iter begin,
-           Iter end,
-           Compare comp)
+template<typename Iter, typename Compare>
+RAJA_HOST_DEVICE RAJA_INLINE void shell_sort(Iter begin, Iter end, Compare comp)
 {
   using ::RAJA::safe_iter_swap;
   using diff_type = ::RAJA::detail::IterDiff<Iter>;
 
   diff_type n = end - begin;
 
-  if (n <= static_cast<diff_type>(1)) {
+  if (n <= static_cast<diff_type>(1))
+  {
     return;
-  } else if (get_shell_stride(1) < static_cast<unsigned long long>(n)) {
+  }
+  else if (get_shell_stride(1) < static_cast<unsigned long long>(n))
+  {
 
     int i_stride = 2;
     // find first stride larger than n
     constexpr int num_strides = num_shell_strides();
-    for (; i_stride < num_strides; ++i_stride) {
-      if (get_shell_stride(i_stride) >= static_cast<unsigned long long>(n)) {
+    for (; i_stride < num_strides; ++i_stride)
+    {
+      if (get_shell_stride(i_stride) >= static_cast<unsigned long long>(n))
+      {
         break;
       }
     }
@@ -184,25 +189,32 @@ shell_sort(Iter begin,
 
     // for each stride size smaller than n, largest to smallest, not including 1
     // sort strided ranges with stride stride
-    for (; i_stride > 0; --i_stride) {
+    for (; i_stride > 0; --i_stride)
+    {
       diff_type stride = static_cast<diff_type>(get_shell_stride(i_stride));
 
       // for each unsorted item in the right side of each strided range
-      for (diff_type i_next_unsorted = stride; i_next_unsorted != n; ++i_next_unsorted) {
+      for (diff_type i_next_unsorted = stride; i_next_unsorted != n;
+           ++i_next_unsorted)
+      {
 
         // insert unsorted item into the sorted left side of the strided range
-        for (diff_type i_to_insert = i_next_unsorted; i_to_insert >= stride; i_to_insert -= stride) {
+        for (diff_type i_to_insert = i_next_unsorted; i_to_insert >= stride;
+             i_to_insert -= stride)
+        {
 
-          Iter to_insert = begin + i_to_insert;
+          Iter to_insert   = begin + i_to_insert;
           Iter next_sorted = to_insert - stride;
 
           // compare with next item to left
-          if (comp(*to_insert, *next_sorted)) {
+          if (comp(*to_insert, *next_sorted))
+          {
 
             // swap down if should be before
             safe_iter_swap(next_sorted, to_insert);
-
-          } else {
+          }
+          else
+          {
 
             // stop if in correct position
             break;
@@ -221,13 +233,11 @@ shell_sort(Iter begin,
     using comparison function
     and using O(lg(N)) comparisons and O(1) memory
 */
-template <typename Iter, typename Compare>
-RAJA_HOST_DEVICE RAJA_INLINE
-void
-heapify(Iter begin,
-        Iter root,
-        Iter end,
-        Compare comp)
+template<typename Iter, typename Compare>
+RAJA_HOST_DEVICE RAJA_INLINE void heapify(Iter begin,
+                                          Iter root,
+                                          Iter end,
+                                          Compare comp)
 {
   using RAJA::safe_iter_swap;
 
@@ -235,24 +245,28 @@ heapify(Iter begin,
 
   // heapify the root node into place
   // until this is a max heap again
-  for (auto i = root - begin; 2*i+1 < N; i = root - begin) {
+  for (auto i = root - begin; 2 * i + 1 < N; i = root - begin)
+  {
 
     // find the max item amongst the root, left child, and right child
     Iter maxit = root;
 
     // left child
-    Iter child = begin + 2*i+1;
-    if (comp(*maxit, *child)) {
+    Iter child = begin + 2 * i + 1;
+    if (comp(*maxit, *child))
+    {
       maxit = child;
     }
 
     // right child
     ++child;
-    if (child != end && comp(*maxit, *child)) {
+    if (child != end && comp(*maxit, *child))
+    {
       maxit = child;
     }
 
-    if (maxit == root) {
+    if (maxit == root)
+    {
       // root is the max, done
       break;
     }
@@ -268,25 +282,23 @@ heapify(Iter begin,
     \brief unstable heap sort given range inplace using comparison function
     and using O(N*lg(N)) comparisons and O(1) memory
 */
-template <typename Iter, typename Compare>
-RAJA_HOST_DEVICE inline
-void
-heap_sort(Iter begin,
-          Iter end,
-          Compare comp)
+template<typename Iter, typename Compare>
+RAJA_HOST_DEVICE inline void heap_sort(Iter begin, Iter end, Compare comp)
 {
   using RAJA::safe_iter_swap;
 
   auto N = end - begin;
 
-  if (N < 2) {
+  if (N < 2)
+  {
     // already sorted
     return;
   }
 
   // make range into a max heap by
   // going through nodes with children one-by-one in reverse order
-  for (Iter root = begin + (N-1)/2; root != begin; --root) {
+  for (Iter root = begin + (N - 1) / 2; root != begin; --root)
+  {
     // heapify a sub-heap
     heapify(begin, root, end, comp);
   }
@@ -294,7 +306,8 @@ heap_sort(Iter begin,
   heapify(begin, begin, end, comp);
 
   // remove one element from max heap repeatedly until sorted
-  for (--end; begin != end; --end) {
+  for (--end; begin != end; --end)
+  {
 
     // swap max element into sorted position at end of heap
     safe_iter_swap(begin, end);
@@ -324,13 +337,11 @@ struct intro_sort_insertion_sort_cutoff
     \brief unstable intro sort given range inplace using comparison function
     and using O(N*lg(N)) comparisons and O(lg(N)) memory, with limited depth.
 */
-template <typename Iter, typename Compare>
-RAJA_HOST_DEVICE inline
-void
-intro_sort_depth(Iter begin,
-                 Iter end,
-                 Compare comp,
-                 unsigned depth)
+template<typename Iter, typename Compare>
+RAJA_HOST_DEVICE inline void intro_sort_depth(Iter begin,
+                                              Iter end,
+                                              Compare comp,
+                                              unsigned depth)
 {
   using RAJA::safe_iter_swap;
   using diff_type = ::RAJA::detail::IterDiff<Iter>;
@@ -341,57 +352,58 @@ intro_sort_depth(Iter begin,
   constexpr diff_type insertion_sort_cutoff =
       static_cast<diff_type>(intro_sort_insertion_sort_cutoff::get());
 
-  if (N < 2) {
+  if (N < 2)
+  {
 
     // already sorted
-
-  } else if (N < insertion_sort_cutoff) {
+  }
+  else if (N < insertion_sort_cutoff)
+  {
 
     // use insertion sort for small inputs
     detail::insertion_sort(begin, end, comp);
-
-  } else if (depth == 0) {
+  }
+  else if (depth == 0)
+  {
 
     // use heap sort if recurse too deep
     detail::heap_sort(begin, end, comp);
-
-  } else {
+  }
+  else
+  {
 
     // use quick sort
     // choose pivot with median of 3 (N >= insertion_sort_cutoff)
-    Iter mid = begin + N/2;
-    Iter last = end-1;
-    Iter pivot = comp(*begin, *mid)
-                    ? ( comp(*mid, *last)
-                           ? mid
-                           : ( comp(*begin, *last)
-                                  ? last
-                                  : begin ) )
-                    : ( comp(*mid, *last)
-                           ? ( comp(*begin, *last)
-                                  ? begin
-                                  : last )
-                           : mid );
+    Iter mid  = begin + N / 2;
+    Iter last = end - 1;
+    Iter pivot =
+        comp(*begin, *mid)
+            ? (comp(*mid, *last) ? mid : (comp(*begin, *last) ? last : begin))
+            : (comp(*mid, *last) ? (comp(*begin, *last) ? begin : last) : mid);
 
     // swap pivot to last
-    if (pivot != last) {
+    if (pivot != last)
+    {
       safe_iter_swap(pivot, last);
       pivot = last;
     }
 
     // partition
-    mid = partition(begin, last, [&](Iter it){ return comp(*it, *pivot); });
+    mid = partition(begin, last, [&](Iter it) {
+      return comp(*it, *pivot);
+    });
 
     // swap pivot to sorted position
-    if (mid != pivot) {
+    if (mid != pivot)
+    {
       safe_iter_swap(mid, pivot);
       pivot = mid;
     }
 
     // recurse to sort first and second parts, ignoring already sorted pivot
     // by construction pivot is always in the range [begin, last]
-    detail::intro_sort_depth(begin, pivot, comp, depth-1);
-    detail::intro_sort_depth(RAJA::next(pivot), end, comp, depth-1);
+    detail::intro_sort_depth(begin, pivot, comp, depth - 1);
+    detail::intro_sort_depth(RAJA::next(pivot), end, comp, depth - 1);
   }
 }
 
@@ -399,21 +411,19 @@ intro_sort_depth(Iter begin,
     \brief unstable intro sort given range inplace using comparison function
     and using O(N*lg(N)) comparisons and O(lg(N)) memory
 */
-template <typename Iter, typename Compare>
-RAJA_HOST_DEVICE inline
-void
-intro_sort(Iter begin,
-           Iter end,
-           Compare comp)
+template<typename Iter, typename Compare>
+RAJA_HOST_DEVICE inline void intro_sort(Iter begin, Iter end, Compare comp)
 {
   auto N = end - begin;
 
   // set max depth to 2*lg(N)
-  unsigned max_depth = 2*RAJA::log2(N);
+  unsigned max_depth = 2 * RAJA::log2(N);
 
 #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
-  // limit max_depth statically in device code to allow compiler to remove recursion
-  if (max_depth > detail::intro_sort_device_max_depth::get()) {
+  // limit max_depth statically in device code to allow compiler to remove
+  // recursion
+  if (max_depth > detail::intro_sort_device_max_depth::get())
+  {
     max_depth = detail::intro_sort_device_max_depth::get();
   }
 #endif
@@ -425,26 +435,21 @@ intro_sort(Iter begin,
     \brief merge a range with midpoint using comparison function
     with local range/2 copy
 */
-template <typename Iter, typename Compare>
-void
-RAJA_INLINE
-inplace_merge(  Iter first,
-                Iter middle,
-                Iter last,
-                Compare comp  )
+template<typename Iter, typename Compare>
+void RAJA_INLINE inplace_merge(Iter first, Iter middle, Iter last, Compare comp)
 {
-  using diff_type = RAJA::detail::IterDiff<Iter>;
+  using diff_type  = RAJA::detail::IterDiff<Iter>;
   using value_type = RAJA::detail::IterVal<Iter>;
 
   diff_type copylen = middle - first;
 
-  if ( first == middle || middle == last )
+  if (first == middle || middle == last)
   {
     // at least one side empty, already sorted
     return;
   }
 
-  if ( !comp(*middle, *(middle-1)) )
+  if (!comp(*middle, *(middle - 1)))
   {
     // everything already in order, done
     return;
@@ -455,37 +460,39 @@ inplace_merge(  Iter first,
   buf_deleter_type buf_deleter;
 
   std::unique_ptr<value_type, buf_deleter_type&> copy_buf(
-      RAJA::allocate_aligned_type<value_type>( RAJA::DATA_ALIGN, copylen * sizeof(value_type) ),
+      RAJA::allocate_aligned_type<value_type>(RAJA::DATA_ALIGN,
+                                              copylen * sizeof(value_type)),
       buf_deleter);
 
   value_type* copyarr = copy_buf.get();
 
   // check memory allocation worked
-  if (copyarr == nullptr) {
-    RAJA_ABORT_OR_THROW( "inplace_merge temporary memory allocation failed" );
+  if (copyarr == nullptr)
+  {
+    RAJA_ABORT_OR_THROW("inplace_merge temporary memory allocation failed");
   }
 
   // move construct input into buffer storage
   // use buf_deleter.size as index to keep track of objects constructed
-  for ( diff_type& cc = buf_deleter.size; cc < copylen; ++cc )
+  for (diff_type& cc = buf_deleter.size; cc < copylen; ++cc)
   {
-    new(&copyarr[cc]) value_type(std::move(first[cc]));
+    new (&copyarr[cc]) value_type(std::move(first[cc]));
   }
 
   // merge
-  for ( diff_type cur = 0; cur < copylen; )
+  for (diff_type cur = 0; cur < copylen;)
   {
-    if ( middle >= last ) // moved all second half, put copy into remainder
+    if (middle >= last)  // moved all second half, put copy into remainder
     {
-      std::move( copyarr+cur, copyarr+copylen, first );
+      std::move(copyarr + cur, copyarr + copylen, first);
       break;
     }
-    else if ( first == middle ) // everything prior to middle is sorted, done
+    else if (first == middle)  // everything prior to middle is sorted, done
     {
       break;
     }
 
-    if ( comp(*middle, copyarr[cur]) )
+    if (comp(*middle, copyarr[cur]))
     {
       *first = std::move(*middle);
       ++middle;
@@ -504,48 +511,47 @@ inplace_merge(  Iter first,
     \brief merge given two ranges using comparison function
     while copies are outside, somewhat follows STL API
 */
-template <typename Iter1, typename Iter2, typename OutIter, typename Compare>
-//constexpr OutIter // <-- std:: return value
-void
-RAJA_INLINE
-merge_like_std( Iter1 first1,
-                Iter1 last1,
-                Iter2 first2,
-                Iter2 last2,
-                OutIter d_first,  // using this as direct access to result
-                Compare comp)
+template<typename Iter1, typename Iter2, typename OutIter, typename Compare>
+// constexpr OutIter // <-- std:: return value
+void RAJA_INLINE
+merge_like_std(Iter1 first1,
+               Iter1 last1,
+               Iter2 first2,
+               Iter2 last2,
+               OutIter d_first,  // using this as direct access to result
+               Compare comp)
 {
   using ::RAJA::safe_iter_swap;
 
-  if ( first1 == last2 - 1 )  // should never need to do this
+  if (first1 == last2 - 1)  // should never need to do this
   {
     return;
   }
 
-  if ( (last2 - first1) == 2 ) // only 2 elements, simple swap
+  if ((last2 - first1) == 2)  // only 2 elements, simple swap
   {
-    if ( !comp(*d_first, *(d_first+1)) )
+    if (!comp(*d_first, *(d_first + 1)))
     {
-      safe_iter_swap( d_first, d_first+1 );
+      safe_iter_swap(d_first, d_first + 1);
     }
     return;
   }
 
-  while ( first1 < last1 || first2 < last2 )
+  while (first1 < last1 || first2 < last2)
   {
-    if ( first1 >= last1 ) // first half done
+    if (first1 >= last1)  // first half done
     {
       *d_first = std::move(*first2);
       ++first2;
     }
-    else if ( first2 >= last2 )  // second half done
+    else if (first2 >= last2)  // second half done
     {
       *d_first = std::move(*first1);
       ++first1;
     }
     else  // neither half done
     {
-      if ( comp( *first2, *first1 ) )
+      if (comp(*first2, *first1))
       {
         *d_first = std::move(*first2);
         ++first2;
@@ -567,35 +573,33 @@ merge_like_std( Iter1 first1,
     \brief stable merge sort given range inplace using comparison function
     and using O(N*lg(N)) comparisons and O(N) memory
 */
-template <typename Iter, typename Compare>
-RAJA_INLINE
-void
-merge_sort(Iter begin,
-           Iter end,
-           Compare comp)
+template<typename Iter, typename Compare>
+RAJA_INLINE void merge_sort(Iter begin, Iter end, Compare comp)
 {
-  using diff_type = RAJA::detail::IterDiff<Iter>;
+  using diff_type  = RAJA::detail::IterDiff<Iter>;
   using value_type = RAJA::detail::IterVal<Iter>;
 
   // iterative mergesort (bottom up) for future parallelism
 
   // min helper
-  auto minlam = [] (diff_type a, diff_type b) {return (a < b) ? a : b;};
+  auto minlam = [](diff_type a, diff_type b) {
+    return (a < b) ? a : b;
+  };
 
   // insertion sort for sizes <= 16
-  diff_type len = end - begin;
+  diff_type len                                    = end - begin;
   static constexpr diff_type insertion_sort_cutoff = 16;
-  if ( len <= insertion_sort_cutoff && len > 0 )
+  if (len <= insertion_sort_cutoff && len > 0)
   {
-    detail::insertion_sort( begin, end, comp );
+    detail::insertion_sort(begin, end, comp);
   }
   else
   {
     // insertion sort on 16-element chunks, then merge
-    for ( diff_type start = 0; start < len; start += insertion_sort_cutoff )
+    for (diff_type start = 0; start < len; start += insertion_sort_cutoff)
     {
-      diff_type lastchunk = minlam( insertion_sort_cutoff, len - start );
-      detail::insertion_sort( begin + start, begin + start + lastchunk, comp );
+      diff_type lastchunk = minlam(insertion_sort_cutoff, len - start);
+      detail::insertion_sort(begin + start, begin + start + lastchunk, comp);
     }
 
     // merge using extra storage
@@ -605,74 +609,86 @@ merge_sort(Iter begin,
     buf_deleter_type buf_deleter;
 
     std::unique_ptr<value_type, buf_deleter_type&> copy_buf(
-        RAJA::allocate_aligned_type<value_type>( RAJA::DATA_ALIGN, len * sizeof(value_type) ),
+        RAJA::allocate_aligned_type<value_type>(RAJA::DATA_ALIGN,
+                                                len * sizeof(value_type)),
         buf_deleter);
 
     value_type* copyarr = copy_buf.get();
 
     // check memory allocation worked
-    if (copyarr == nullptr) {
-      RAJA_ABORT_OR_THROW( "merge_sort temporary memory allocation failed" );
+    if (copyarr == nullptr)
+    {
+      RAJA_ABORT_OR_THROW("merge_sort temporary memory allocation failed");
     }
 
     // move construct input into buffer storage
     // use buf_deleter.size as index to keep track of objects constructed
-    for ( diff_type& cc = buf_deleter.size; cc < len; ++cc )
+    for (diff_type& cc = buf_deleter.size; cc < len; ++cc)
     {
-      new(&copyarr[cc]) value_type(std::move(begin[cc]));
+      new (&copyarr[cc]) value_type(std::move(begin[cc]));
     }
 
     bool copyvalid = true;
-    //for ( diff_type midpoint = 1; midpoint < len; midpoint *= 2 )  // O(log n) loop
-    for ( diff_type midpoint = 16; midpoint < len; midpoint *= 2 )  // O(log n) loop
+    // for ( diff_type midpoint = 1; midpoint < len; midpoint *= 2 )  // O(log
+    // n) loop
+    for (diff_type midpoint = 16; midpoint < len;
+         midpoint *= 2)  // O(log n) loop
     {
-      for ( diff_type start = 0; start < len; start += midpoint * 2 )  // O(n) merging loop (can be parallelized)
+      for (diff_type start = 0; start < len;
+           start += midpoint * 2)  // O(n) merging loop (can be parallelized)
       {
-        diff_type finish = minlam( start + midpoint * 2, len );
-        if ( finish > len )
+        diff_type finish = minlam(start + midpoint * 2, len);
+        if (finish > len)
         {
-          RAJA_ABORT_OR_THROW( "merge_sort invalid finish point" );  // sanity check
+          RAJA_ABORT_OR_THROW(
+              "merge_sort invalid finish point");  // sanity check
         }
 
-        if ( start + midpoint >= len )
+        if (start + midpoint >= len)
         {
           // copy sorted remainder over
-          if ( copyvalid )
+          if (copyvalid)
           {
-            std::move( copyarr + start, copyarr + finish, begin + start );
+            std::move(copyarr + start, copyarr + finish, begin + start);
           }
           else
           {
-            std::move( begin + start, begin + finish, copyarr + start );
+            std::move(begin + start, begin + finish, copyarr + start);
           }
           break;  // skip merge if no second half exists
         }
 
-        if ( copyvalid )  // switch arrays per level of merging to avoid copying back to copyarr
+        if (copyvalid)  // switch arrays per level of merging to avoid copying
+                        // back to copyarr
         {
-          detail::merge_like_std( copyarr + start, copyarr + start + midpoint, copyarr + start + midpoint, copyarr + finish, begin + start, comp );
+          detail::merge_like_std(copyarr + start, copyarr + start + midpoint,
+                                 copyarr + start + midpoint, copyarr + finish,
+                                 begin + start, comp);
         }
         else
         {
-          detail::merge_like_std( begin + start, begin + start + midpoint, begin + start + midpoint, begin + finish, copyarr + start, comp );
+          detail::merge_like_std(begin + start, begin + start + midpoint,
+                                 begin + start + midpoint, begin + finish,
+                                 copyarr + start, comp);
         }
       }
 
-      copyvalid = !copyvalid; // switch arrays per level of merging to avoid copying back to copyarr
+      copyvalid = !copyvalid;  // switch arrays per level of merging to avoid
+                               // copying back to copyarr
     }
 
     // update copy if necessary
-    if ( copyvalid )
+    if (copyvalid)
     {
-      std::move( copyarr, copyarr + len, begin );
+      std::move(copyarr, copyarr + len, begin);
     }
   }
-  //else
+  // else
   //{
-      // Possible TBD: in-place mergesort
-      // Would shift (like insertion sort) when performing merge.
-      // PRO - Can use on GPU, O(1) storage required.
-      // CON - Shifting would cause slowdown O(n^2 log n).
+  //  Possible TBD: in-place mergesort
+  //  Would shift (like insertion sort) when performing merge.
+  //  PRO - Can use on GPU, O(1) storage required.
+  //  CON - Shifting would cause slowdown O(n^2 log n).
   //}
 }
 
@@ -682,12 +698,11 @@ merge_sort(Iter begin,
     \brief stable insertion sort given range inplace using comparison function
     and using O(N^2) comparisons and O(1) memory
 */
-template <typename Container,
-          typename Compare = operators::less<detail::ContainerVal<Container>>>
+template<typename Container,
+         typename Compare = operators::less<detail::ContainerVal<Container>>>
 RAJA_HOST_DEVICE RAJA_INLINE
-concepts::enable_if<type_traits::is_range<Container>>
-insertion_sort(Container&& c,
-               Compare comp = Compare{})
+    concepts::enable_if<type_traits::is_range<Container>>
+    insertion_sort(Container&& c, Compare comp = Compare {})
 {
   using std::begin;
   using std::end;
@@ -700,9 +715,11 @@ insertion_sort(Container&& c,
   auto begin_it = begin(c);
   auto end_it   = end(c);
 
-  if (begin_it != end_it) {
+  if (begin_it != end_it)
+  {
     auto next = begin_it;
-    if (++next != end_it) {
+    if (++next != end_it)
+    {
       detail::insertion_sort(begin_it, end_it, comp);
     }
   }
@@ -712,12 +729,11 @@ insertion_sort(Container&& c,
     \brief unstable shell sort given range inplace using comparison function
     and using O(N^?) comparisons and O(1) memory
 */
-template <typename Container,
-          typename Compare = operators::less<detail::ContainerVal<Container>>>
+template<typename Container,
+         typename Compare = operators::less<detail::ContainerVal<Container>>>
 RAJA_HOST_DEVICE RAJA_INLINE
-concepts::enable_if<type_traits::is_range<Container>>
-shell_sort(Container&& c,
-           Compare comp = Compare{})
+    concepts::enable_if<type_traits::is_range<Container>>
+    shell_sort(Container&& c, Compare comp = Compare {})
 {
   using std::begin;
   using std::end;
@@ -730,9 +746,11 @@ shell_sort(Container&& c,
   auto begin_it = begin(c);
   auto end_it   = end(c);
 
-  if (begin_it != end_it) {
+  if (begin_it != end_it)
+  {
     auto next = begin_it;
-    if (++next != end_it) {
+    if (++next != end_it)
+    {
       detail::shell_sort(begin_it, end_it, comp);
     }
   }
@@ -742,12 +760,11 @@ shell_sort(Container&& c,
     \brief unstable heap sort given range inplace using comparison function
     and using O(N*lg(N)) comparisons and O(1) memory
 */
-template <typename Container,
-          typename Compare = operators::less<detail::ContainerVal<Container>>>
+template<typename Container,
+         typename Compare = operators::less<detail::ContainerVal<Container>>>
 RAJA_HOST_DEVICE RAJA_INLINE
-concepts::enable_if<type_traits::is_range<Container>>
-heap_sort(Container&& c,
-          Compare comp = Compare{})
+    concepts::enable_if<type_traits::is_range<Container>>
+    heap_sort(Container&& c, Compare comp = Compare {})
 {
   using std::begin;
   using std::end;
@@ -760,9 +777,11 @@ heap_sort(Container&& c,
   auto begin_it = begin(c);
   auto end_it   = end(c);
 
-  if (begin_it != end_it) {
+  if (begin_it != end_it)
+  {
     auto next = begin_it;
-    if (++next != end_it) {
+    if (++next != end_it)
+    {
       detail::heap_sort(begin_it, end_it, comp);
     }
   }
@@ -772,12 +791,11 @@ heap_sort(Container&& c,
     \brief unstable intro sort given range inplace using comparison function
     and using O(N*lg(N)) comparisons and O(lg(N)) memory
 */
-template <typename Container,
-          typename Compare = operators::less<detail::ContainerVal<Container>>>
+template<typename Container,
+         typename Compare = operators::less<detail::ContainerVal<Container>>>
 RAJA_HOST_DEVICE RAJA_INLINE
-concepts::enable_if<type_traits::is_range<Container>>
-intro_sort(Container&& c,
-           Compare comp = Compare{})
+    concepts::enable_if<type_traits::is_range<Container>>
+    intro_sort(Container&& c, Compare comp = Compare {})
 {
   using std::begin;
   using std::end;
@@ -790,9 +808,11 @@ intro_sort(Container&& c,
   auto begin_it = begin(c);
   auto end_it   = end(c);
 
-  if (begin_it != end_it) {
+  if (begin_it != end_it)
+  {
     auto next = begin_it;
-    if (++next != end_it) {
+    if (++next != end_it)
+    {
       detail::intro_sort(begin_it, end_it, comp);
     }
   }
@@ -802,12 +822,11 @@ intro_sort(Container&& c,
     \brief stable merge sort given range inplace using comparison function
     and using O(N*lg(N)) comparisons and O(N) memory
 */
-template <typename Container,
-          typename Compare = operators::less<detail::ContainerVal<Container>>>
-RAJA_INLINE
-concepts::enable_if<type_traits::is_range<Container>>
-merge_sort(Container&& c,
-           Compare comp = Compare{})
+template<typename Container,
+         typename Compare = operators::less<detail::ContainerVal<Container>>>
+RAJA_INLINE concepts::enable_if<type_traits::is_range<Container>> merge_sort(
+    Container&& c,
+    Compare comp = Compare {})
 {
   using std::begin;
   using std::end;
@@ -820,9 +839,11 @@ merge_sort(Container&& c,
   auto begin_it = begin(c);
   auto end_it   = end(c);
 
-  if (begin_it != end_it) {
+  if (begin_it != end_it)
+  {
     auto next = begin_it;
-    if (++next != end_it) {
+    if (++next != end_it)
+    {
       detail::merge_sort(begin_it, end_it, comp);
     }
   }
diff --git a/include/RAJA/util/types.hpp b/include/RAJA/util/types.hpp
index 310217bde5..6a95542735 100644
--- a/include/RAJA/util/types.hpp
+++ b/include/RAJA/util/types.hpp
@@ -32,7 +32,6 @@
 
 #include "RAJA/util/macros.hpp"
 
-
 namespace RAJA
 {
 
@@ -41,7 +40,7 @@ namespace RAJA
 ///
 enum named_usage : int
 {
-  ignored = -1,
+  ignored     = -1,
   unspecified = 0
 };
 
@@ -70,13 +69,25 @@ enum struct kernel_sync_requirement : int
 namespace iteration_mapping
 {
 
-struct DirectBase {};
-struct LoopBase {};
-struct ContiguousLoopBase : LoopBase {};
-struct StridedLoopBase : LoopBase {};
-struct UnsizedLoopBase {};
-struct SizedLoopBase {};
-template < size_t t_max_iterations >
+struct DirectBase
+{};
+
+struct LoopBase
+{};
+
+struct ContiguousLoopBase : LoopBase
+{};
+
+struct StridedLoopBase : LoopBase
+{};
+
+struct UnsizedLoopBase
+{};
+
+struct SizedLoopBase
+{};
+
+template<size_t t_max_iterations>
 struct SizedLoopSpecifyingBase : SizedLoopBase
 {
   static constexpr size_t max_iterations = t_max_iterations;
@@ -103,7 +114,8 @@ struct SizedLoopSpecifyingBase : SizedLoopBase
 ///   // 3 -> {3}
 ///   // 4 -> {}
 ///
-struct Direct : DirectBase {};
+struct Direct : DirectBase
+{};
 
 ///
 /// Contiguousloop assumes the loop has fewer iterations than indices and
@@ -130,10 +142,13 @@ struct Direct : DirectBase {};
 ///   // 1 -> {3, 4, 5}
 ///   // 2 -> {6, 7}
 ///
-template < size_t max_iterations >
-struct Contiguousloop : ContiguousLoopBase,
-    std::conditional_t<(max_iterations != named_usage::unspecified),
-                       SizedLoopSpecifyingBase<max_iterations>, UnsizedLoopBase> {};
+template<size_t max_iterations>
+struct Contiguousloop
+    : ContiguousLoopBase,
+      std::conditional_t<(max_iterations != named_usage::unspecified),
+                         SizedLoopSpecifyingBase<max_iterations>,
+                         UnsizedLoopBase>
+{};
 
 ///
 /// StridedLoop assumes the loop has fewer iterations than indices and
@@ -160,18 +175,25 @@ struct Contiguousloop : ContiguousLoopBase,
 ///   // 1 -> {1, 4, 7}
 ///   // 2 -> {2, 5}
 ///
-template < size_t max_iterations >
-struct StridedLoop : StridedLoopBase,
-    std::conditional_t<(max_iterations != named_usage::unspecified),
-                       SizedLoopSpecifyingBase<max_iterations>, UnsizedLoopBase> {};
+template<size_t max_iterations>
+struct StridedLoop
+    : StridedLoopBase,
+      std::conditional_t<(max_iterations != named_usage::unspecified),
+                         SizedLoopSpecifyingBase<max_iterations>,
+                         UnsizedLoopBase>
+{};
 
-} // namespace iteration_mapping
+}  // namespace iteration_mapping
 
 ///
 /// Enumeration used to indicate whether ListSegment object owns data
 /// representing its indices.
 ///
-enum IndexOwnership { Unowned, Owned };
+enum IndexOwnership
+{
+  Unowned,
+  Owned
+};
 
 ///
 /// Type use for all loop indexing in RAJA constructs.
@@ -184,37 +206,34 @@ using Index_type = std::ptrdiff_t;
 ///
 const int UndefinedValue = -9999999;
 
-
 ///
 /// Template list of sizes
 ///
-template <Index_type... Sizes>
-struct SizeList {
-};
-
+template<Index_type... Sizes>
+struct SizeList
+{};
 
 ///
 /// Compile time fraction for use with integral types
 ///
-template <typename int_t, int_t numerator, int_t denominator>
+template<typename int_t, int_t numerator, int_t denominator>
 struct Fraction
 {
   static_assert(denominator != int_t(0), "denominator must not be zero");
 
   using inverse = Fraction<int_t, denominator, numerator>;
 
-  template < typename new_int_t >
-  using rebind = Fraction<new_int_t, new_int_t(numerator), new_int_t(denominator)>;
+  template<typename new_int_t>
+  using rebind =
+      Fraction<new_int_t, new_int_t(numerator), new_int_t(denominator)>;
 
   static constexpr int_t multiply(int_t val) noexcept
   {
     return (val / denominator) * numerator +
            (val % denominator) * numerator / denominator;
   }
-
 };
 
-
 /*!
  ******************************************************************************
  *
@@ -254,7 +273,8 @@ using Complex_type = std::complex<Real_type>;
 // alignment attribute supported for versions > 12
 //
 #if __ICC >= 1300
-using TDRAReal_ptr = Real_type* RAJA_RESTRICT __attribute__((align_value(RAJA::DATA_ALIGN)));
+using TDRAReal_ptr =
+    Real_type* RAJA_RESTRICT __attribute__((align_value(RAJA::DATA_ALIGN)));
 
 using const_TDRAReal_ptr = const TDRAReal_ptr;
 #endif
@@ -262,7 +282,8 @@ using const_TDRAReal_ptr = const TDRAReal_ptr;
 #elif defined(RAJA_COMPILER_GNU)
 
 #elif defined(RAJA_COMPILER_CLANG)
-using TDRAReal_ptr = Real_type* RAJA_RESTRICT __attribute__((aligned(RAJA::DATA_ALIGN)));
+using TDRAReal_ptr =
+    Real_type* RAJA_RESTRICT __attribute__((aligned(RAJA::DATA_ALIGN)));
 
 using const_TDRAReal_ptr = const TDRAReal_ptr;
 
@@ -814,51 +835,51 @@ class RestrictComplexPtr
  ******************************************************************************
  */
 #if defined(RAJA_USE_BARE_PTR)
-using Real_ptr = Real_type*;
+using Real_ptr       = Real_type*;
 using const_Real_ptr = const Real_type*;
 
 #if defined(RAJA_USE_COMPLEX)
-using Complex_ptr = Complex_type*;
+using Complex_ptr       = Complex_type*;
 using const_Complex_ptr = const Complex_type*;
 #endif
 
-using UnalignedReal_ptr = Real_type*;
+using UnalignedReal_ptr       = Real_type*;
 using const_UnalignedReal_ptr = const Real_type*;
 
 #elif defined(RAJA_USE_RESTRICT_PTR)
-using Real_ptr = Real_type* RAJA_RESTRICT;
+using Real_ptr       = Real_type* RAJA_RESTRICT;
 using const_Real_ptr = const Real_type* RAJA_RESTRICT;
 
 #if defined(RAJA_USE_COMPLEX)
-using Complex_ptr = Complex_type* RAJA_RESTRICT;
+using Complex_ptr       = Complex_type* RAJA_RESTRICT;
 using const_Complex_ptr = const Complex_type* RAJA_RESTRICT;
 #endif
 
-using UnalignedReal_ptr = Real_type* RAJA_RESTRICT;
+using UnalignedReal_ptr       = Real_type* RAJA_RESTRICT;
 using const_UnalignedReal_ptr = const Real_type* RAJA_RESTRICT;
 
 #elif defined(RAJA_USE_RESTRICT_ALIGNED_PTR)
-using Real_ptr = TDRAReal_ptr;
+using Real_ptr       = TDRAReal_ptr;
 using const_Real_ptr = const_TDRAReal_ptr;
 
 #if defined(RAJA_USE_COMPLEX)
-using Complex_ptr = Complex_type* RAJA_RESTRICT;
+using Complex_ptr       = Complex_type* RAJA_RESTRICT;
 using const_Complex_ptr = const Complex_type* RAJA_RESTRICT;
 #endif
 
-using UnalignedReal_ptr = Real_type* RAJA_RESTRICT;
+using UnalignedReal_ptr       = Real_type* RAJA_RESTRICT;
 using const_UnalignedReal_ptr = const Real_type* RAJA_RESTRICT;
 
 #elif defined(RAJA_USE_PTR_CLASS)
-using Real_ptr = RestrictAlignedRealPtr;
+using Real_ptr       = RestrictAlignedRealPtr;
 using const_Real_ptr = ConstRestrictAlignedRealPtr;
 
 #if defined(RAJA_USE_COMPLEX)
-using Complex_ptr = RestrictComplexPtr;
+using Complex_ptr       = RestrictComplexPtr;
 using const_Complex_ptr = ConstRestrictComplexPtr;
 #endif
 
-using UnalignedReal_ptr = RestrictRealPtr;
+using UnalignedReal_ptr       = RestrictRealPtr;
 using const_UnalignedReal_ptr = ConstRestrictRealPtr;
 
 #else
@@ -867,34 +888,34 @@ using const_UnalignedReal_ptr = ConstRestrictRealPtr;
 #endif
 
 
-namespace detail {
+namespace detail
+{
 
 /*!
  * \brief Abstracts access to memory using normal memory accesses.
  */
 struct DefaultAccessor
 {
-  template < typename T >
+  template<typename T>
   static RAJA_HOST_DEVICE RAJA_INLINE T get(T* ptr, size_t i)
   {
     return ptr[i];
   }
 
-  template < typename T >
+  template<typename T>
   static RAJA_HOST_DEVICE RAJA_INLINE void set(T* ptr, size_t i, T val)
   {
     ptr[i] = val;
   }
 };
 
-
 /*!
  * \brief Abstracts T into an equal or greater size array of integers whose
  * size is between min_integer_type_size and max_interger_type_size inclusive.
  */
-template <typename T,
-          size_t min_integer_type_size = 1,
-          size_t max_integer_type_size = sizeof(unsigned long long)>
+template<typename T,
+         size_t min_integer_type_size = 1,
+         size_t max_integer_type_size = sizeof(unsigned long long)>
 struct AsIntegerArray
 {
   static_assert(min_integer_type_size <= max_integer_type_size,
@@ -919,11 +940,11 @@ struct AsIntegerArray
                     sizeof(unsigned short) <= max_integer_type_size) ||
                    sizeof(unsigned char) < min_integer_type_size),
                   unsigned short,
-                  std::conditional_t<
-                      ((alignof(T) >= alignof(unsigned char) &&
-                        sizeof(unsigned char) <= max_integer_type_size)),
-                      unsigned char,
-                      void>>>>>;
+                  std::conditional_t<((alignof(T) >= alignof(unsigned char) &&
+                                       sizeof(unsigned char) <=
+                                           max_integer_type_size)),
+                                     unsigned char,
+                                     void>>>>>;
   static_assert(!std::is_same<integer_type, void>::value,
                 "could not find a compatible integer type");
   static_assert(sizeof(integer_type) >= min_integer_type_size,
@@ -956,37 +977,33 @@ struct AsIntegerArray
   }
 };
 
-
 /*!
  * \brief Assign a new value to an object and restore the object's previous
  * value at the end of the current scope.
  */
-template <typename T>
+template<typename T>
 struct ScopedAssignment
 {
   ScopedAssignment(T& val, T const& new_val)
-    : m_ref_to_val(val)
-    , m_prev_val(std::move(val))
+      : m_ref_to_val(val),
+        m_prev_val(std::move(val))
   {
     m_ref_to_val = new_val;
   }
 
   ScopedAssignment(T& val, T&& new_val)
-    : m_ref_to_val(val)
-    , m_prev_val(std::move(val))
+      : m_ref_to_val(val),
+        m_prev_val(std::move(val))
   {
     m_ref_to_val = std::move(new_val);
   }
 
-  ScopedAssignment(ScopedAssignment const&) = delete;
-  ScopedAssignment(ScopedAssignment &&) = delete;
+  ScopedAssignment(ScopedAssignment const&)            = delete;
+  ScopedAssignment(ScopedAssignment&&)                 = delete;
   ScopedAssignment& operator=(ScopedAssignment const&) = delete;
-  ScopedAssignment& operator=(ScopedAssignment &&) = delete;
+  ScopedAssignment& operator=(ScopedAssignment&&)      = delete;
 
-  ~ScopedAssignment()
-  {
-    m_ref_to_val = std::move(m_prev_val);
-  }
+  ~ScopedAssignment() { m_ref_to_val = std::move(m_prev_val); }
 
 private:
   T& m_ref_to_val;
diff --git a/include/RAJA/util/zip.hpp b/include/RAJA/util/zip.hpp
index 1beefeb9cc..589863bf31 100644
--- a/include/RAJA/util/zip.hpp
+++ b/include/RAJA/util/zip.hpp
@@ -37,76 +37,80 @@ namespace RAJA
     \brief ZipIterator class for simultaneously iterating over
     multiple iterators. This is not a standards compliant iterator.
 */
-template < typename ... Iters >
+template<typename... Iters>
 struct ZipIterator
 {
-  static_assert(concepts::all_of<type_traits::is_random_access_iterator<Iters>...>::value,
+  static_assert(
+      concepts::all_of<type_traits::is_random_access_iterator<Iters>...>::value,
       "ZipIterator can only contain random access iterators");
   static_assert(sizeof...(Iters) > 1,
-      "ZipIterator must contain one or more iterators");
+                "ZipIterator must contain one or more iterators");
 
-  using value_type = zip_val<typename std::iterator_traits<Iters>::value_type...>;
+  using value_type =
+      zip_val<typename std::iterator_traits<Iters>::value_type...>;
   using difference_type = std::ptrdiff_t;
-  using pointer = void;
+  using pointer         = void;
   using reference = zip_ref<typename std::iterator_traits<Iters>::reference...>;
-  using creference = zip_ref<const typename std::iterator_traits<Iters>::reference...>;
+  using creference =
+      zip_ref<const typename std::iterator_traits<Iters>::reference...>;
   using iterator_category = std::random_access_iterator_tag;
 
-  RAJA_HOST_DEVICE inline ZipIterator()
-    : m_iterators()
-  {
-  }
+  RAJA_HOST_DEVICE inline ZipIterator() : m_iterators() {}
 
-  template < typename... Args,
-             typename = concepts::enable_if<type_traits::convertible_to<Args&&, Iters>...> >
+  template<typename... Args,
+           typename = concepts::enable_if<
+               type_traits::convertible_to<Args&&, Iters>...>>
   RAJA_HOST_DEVICE inline ZipIterator(Args&&... args)
-    : m_iterators(std::forward<Args>(args)...)
-  {
-  }
+      : m_iterators(std::forward<Args>(args)...)
+  {}
 
   RAJA_HOST_DEVICE inline ZipIterator(const ZipIterator& rhs)
-    : m_iterators(rhs.m_iterators)
-  {
-  }
+      : m_iterators(rhs.m_iterators)
+  {}
+
   RAJA_HOST_DEVICE inline ZipIterator(ZipIterator&& rhs)
-    : m_iterators(std::move(rhs.m_iterators))
-  {
-  }
+      : m_iterators(std::move(rhs.m_iterators))
+  {}
 
   RAJA_HOST_DEVICE inline ZipIterator& operator=(const ZipIterator& rhs)
   {
     m_iterators = rhs.m_iterators;
     return *this;
   }
+
   RAJA_HOST_DEVICE inline ZipIterator& operator=(ZipIterator&& rhs)
   {
     m_iterators = std::move(rhs.m_iterators);
     return *this;
   }
 
-
   RAJA_HOST_DEVICE inline difference_type get_stride() const { return 1; }
 
   RAJA_HOST_DEVICE inline bool operator==(const ZipIterator& rhs) const
   {
     return RAJA::get<0>(m_iterators) == RAJA::get<0>(rhs.m_iterators);
   }
+
   RAJA_HOST_DEVICE inline bool operator!=(const ZipIterator& rhs) const
   {
     return RAJA::get<0>(m_iterators) != RAJA::get<0>(rhs.m_iterators);
   }
+
   RAJA_HOST_DEVICE inline bool operator>(const ZipIterator& rhs) const
   {
-    return RAJA::get<0>(m_iterators) >  RAJA::get<0>(rhs.m_iterators);
+    return RAJA::get<0>(m_iterators) > RAJA::get<0>(rhs.m_iterators);
   }
+
   RAJA_HOST_DEVICE inline bool operator<(const ZipIterator& rhs) const
   {
-    return RAJA::get<0>(m_iterators) <  RAJA::get<0>(rhs.m_iterators);
+    return RAJA::get<0>(m_iterators) < RAJA::get<0>(rhs.m_iterators);
   }
+
   RAJA_HOST_DEVICE inline bool operator>=(const ZipIterator& rhs) const
   {
     return RAJA::get<0>(m_iterators) >= RAJA::get<0>(rhs.m_iterators);
   }
+
   RAJA_HOST_DEVICE inline bool operator<=(const ZipIterator& rhs) const
   {
     return RAJA::get<0>(m_iterators) <= RAJA::get<0>(rhs.m_iterators);
@@ -114,20 +118,23 @@ struct ZipIterator
 
   RAJA_HOST_DEVICE inline ZipIterator& operator++()
   {
-    detail::zip_for_each(m_iterators, detail::PreInc{});
+    detail::zip_for_each(m_iterators, detail::PreInc {});
     return *this;
   }
+
   RAJA_HOST_DEVICE inline ZipIterator& operator--()
   {
-    detail::zip_for_each(m_iterators, detail::PreDec{});
+    detail::zip_for_each(m_iterators, detail::PreDec {});
     return *this;
   }
+
   RAJA_HOST_DEVICE inline ZipIterator operator++(int)
   {
     ZipIterator tmp(*this);
     ++(*this);
     return tmp;
   }
+
   RAJA_HOST_DEVICE inline ZipIterator operator--(int)
   {
     ZipIterator tmp(*this);
@@ -135,16 +142,15 @@ struct ZipIterator
     return tmp;
   }
 
-  RAJA_HOST_DEVICE inline ZipIterator& operator+=(
-      const difference_type& rhs)
+  RAJA_HOST_DEVICE inline ZipIterator& operator+=(const difference_type& rhs)
   {
-    detail::zip_for_each(m_iterators, detail::PlusEq<difference_type>{rhs});
+    detail::zip_for_each(m_iterators, detail::PlusEq<difference_type> {rhs});
     return *this;
   }
-  RAJA_HOST_DEVICE inline ZipIterator& operator-=(
-      const difference_type& rhs)
+
+  RAJA_HOST_DEVICE inline ZipIterator& operator-=(const difference_type& rhs)
   {
-    detail::zip_for_each(m_iterators, detail::MinusEq<difference_type>{rhs});
+    detail::zip_for_each(m_iterators, detail::MinusEq<difference_type> {rhs});
     return *this;
   }
 
@@ -153,6 +159,7 @@ struct ZipIterator
   {
     return RAJA::get<0>(m_iterators) - RAJA::get<0>(rhs.m_iterators);
   }
+
   RAJA_HOST_DEVICE inline ZipIterator operator+(
       const difference_type& rhs) const
   {
@@ -160,6 +167,7 @@ struct ZipIterator
     tmp += rhs;
     return tmp;
   }
+
   RAJA_HOST_DEVICE inline ZipIterator operator-(
       const difference_type& rhs) const
   {
@@ -167,9 +175,9 @@ struct ZipIterator
     tmp -= rhs;
     return tmp;
   }
-  RAJA_HOST_DEVICE friend ZipIterator operator+(
-      difference_type lhs,
-      const ZipIterator& rhs)
+
+  RAJA_HOST_DEVICE friend ZipIterator operator+(difference_type lhs,
+                                                const ZipIterator& rhs)
   {
     ZipIterator tmp(rhs);
     tmp += lhs;
@@ -178,8 +186,9 @@ struct ZipIterator
 
   RAJA_HOST_DEVICE inline reference operator*() const
   {
-    return deref_helper(camp::make_idx_seq_t<sizeof...(Iters)>{});
+    return deref_helper(camp::make_idx_seq_t<sizeof...(Iters)> {});
   }
+
   // TODO:: figure out what to do with this
   // RAJA_HOST_DEVICE inline reference operator->() const
   // {
@@ -190,30 +199,28 @@ struct ZipIterator
     return *((*this) + rhs);
   }
 
-  RAJA_HOST_DEVICE friend inline void safe_iter_swap(ZipIterator lhs, ZipIterator rhs)
+  RAJA_HOST_DEVICE friend inline void safe_iter_swap(ZipIterator lhs,
+                                                     ZipIterator rhs)
   {
-    detail::zip_for_each(lhs.m_iterators, rhs.m_iterators, detail::IterSwap{});
+    detail::zip_for_each(lhs.m_iterators, rhs.m_iterators, detail::IterSwap {});
   }
 
 private:
   zip_val<camp::decay<Iters>...> m_iterators;
 
-  template < camp::idx_t ... Is >
+  template<camp::idx_t... Is>
   RAJA_HOST_DEVICE inline reference deref_helper(camp::idx_seq<Is...>) const
   {
     return reference(*RAJA::get<Is>(m_iterators)...);
   }
 };
 
-
 /*!
     \brief Zip multiple iterators together to iterate them simultaneously with
     a single ZipIterator object.
 */
-template < typename... Args >
-RAJA_HOST_DEVICE
-auto zip(Args&&... args)
-  -> ZipIterator<camp::decay<Args>...>
+template<typename... Args>
+RAJA_HOST_DEVICE auto zip(Args&&... args) -> ZipIterator<camp::decay<Args>...>
 {
   return {std::forward<Args>(args)...};
 }
@@ -222,30 +229,29 @@ auto zip(Args&&... args)
     \brief Zip multiple containers together to iterate them simultaneously with
     ZipIterator objects.
 */
-template <typename... Args>
-RAJA_HOST_DEVICE RAJA_INLINE
-auto zip_span(Args&&... args)
-  -> Span<ZipIterator<detail::ContainerIter<camp::decay<Args>>...>,
-          typename ZipIterator<detail::ContainerIter<camp::decay<Args>>...>::difference_type>
+template<typename... Args>
+RAJA_HOST_DEVICE RAJA_INLINE auto zip_span(Args&&... args)
+    -> Span<ZipIterator<detail::ContainerIter<camp::decay<Args>>...>,
+            typename ZipIterator<
+                detail::ContainerIter<camp::decay<Args>>...>::difference_type>
 {
   using std::begin;
   using std::end;
   return Span<ZipIterator<detail::ContainerIter<camp::decay<Args>>...>,
-              typename ZipIterator<detail::ContainerIter<camp::decay<Args>>...>::difference_type>(
+              typename ZipIterator<detail::ContainerIter<
+                  camp::decay<Args>>...>::difference_type>(
       zip(begin(std::forward<Args>(args))...),
-      zip(  end(std::forward<Args>(args))...));
+      zip(end(std::forward<Args>(args))...));
 }
 
 /*!
     \brief Comparator object that compares the first member
     of tuple like objects.
 */
-template < typename T, typename Compare >
+template<typename T, typename Compare>
 struct CompareFirst
 {
-  RAJA_HOST_DEVICE inline CompareFirst(Compare comp_)
-    : comp(comp_)
-  { }
+  RAJA_HOST_DEVICE inline CompareFirst(Compare comp_) : comp(comp_) {}
 
   RAJA_HOST_DEVICE inline bool operator()(T const& lhs, T const& rhs)
   {
@@ -260,10 +266,8 @@ struct CompareFirst
     \brief Make a comparator to compare first member of tuple
     like objects of type T.
 */
-template < typename T, typename Compare >
-RAJA_HOST_DEVICE
-auto compare_first(Compare comp)
-  -> CompareFirst<T, Compare>
+template<typename T, typename Compare>
+RAJA_HOST_DEVICE auto compare_first(Compare comp) -> CompareFirst<T, Compare>
 {
   return {comp};
 }
diff --git a/include/RAJA/util/zip_tuple.hpp b/include/RAJA/util/zip_tuple.hpp
index d631d4714b..458b5cbcf9 100644
--- a/include/RAJA/util/zip_tuple.hpp
+++ b/include/RAJA/util/zip_tuple.hpp
@@ -31,49 +31,63 @@
 namespace RAJA
 {
 
-template < bool is_val, typename ... Ts >
+template<bool is_val, typename... Ts>
 struct zip_tuple;
 
-template < camp::idx_t I, typename ZT >
+template<camp::idx_t I, typename ZT>
 struct zip_tuple_element;
 
-template < camp::idx_t I, bool is_val, typename ... Ts >
+template<camp::idx_t I, bool is_val, typename... Ts>
 struct zip_tuple_element<I, zip_tuple<is_val, Ts...>>
-  : camp::tuple_element<I, typename zip_tuple<is_val, Ts...>::value_type>
-{ };
+    : camp::tuple_element<I, typename zip_tuple<is_val, Ts...>::value_type>
+{};
 
-template < camp::idx_t I, typename ZT >
+template<camp::idx_t I, typename ZT>
 using zip_tuple_element_t = typename zip_tuple_element<I, ZT>::type;
 
-
 // get function declarations for zip_tuple
 // the reference type returned by get depends on the reference type
 // of the zip_tuple that get is called on
-template < camp::idx_t I, bool is_val, typename ... Ts >
-RAJA_HOST_DEVICE constexpr                         RAJA::zip_tuple_element_t<I, zip_tuple<is_val, Ts...>> &
-get(zip_tuple<is_val, Ts...>      &  z) noexcept
-{ return           z .template get<I>(); }
-template < camp::idx_t I, bool is_val, typename ... Ts >
-RAJA_HOST_DEVICE constexpr                         RAJA::zip_tuple_element_t<I, zip_tuple<is_val, Ts...>> const&
-get(zip_tuple<is_val, Ts...> const&  z) noexcept
-{ return           z .template get<I>(); }
-template < camp::idx_t I, bool is_val, typename ... Ts >
-RAJA_HOST_DEVICE constexpr std::remove_reference_t<RAJA::zip_tuple_element_t<I, zip_tuple<is_val, Ts...>>> &&
-get(zip_tuple<is_val, Ts...>      && z) noexcept
-{ return std::move(z).template get<I>(); }
-template < camp::idx_t I, bool is_val, typename ... Ts >
-RAJA_HOST_DEVICE constexpr std::remove_reference_t<RAJA::zip_tuple_element_t<I, zip_tuple<is_val, Ts...>>> const&&
+template<camp::idx_t I, bool is_val, typename... Ts>
+RAJA_HOST_DEVICE constexpr RAJA::zip_tuple_element_t<I,
+                                                     zip_tuple<is_val, Ts...>>&
+get(zip_tuple<is_val, Ts...>& z) noexcept
+{
+  return z.template get<I>();
+}
+
+template<camp::idx_t I, bool is_val, typename... Ts>
+RAJA_HOST_DEVICE constexpr RAJA::
+    zip_tuple_element_t<I, zip_tuple<is_val, Ts...>> const&
+    get(zip_tuple<is_val, Ts...> const& z) noexcept
+{
+  return z.template get<I>();
+}
+
+template<camp::idx_t I, bool is_val, typename... Ts>
+RAJA_HOST_DEVICE constexpr std::remove_reference_t<
+    RAJA::zip_tuple_element_t<I, zip_tuple<is_val, Ts...>>>&&
+get(zip_tuple<is_val, Ts...>&& z) noexcept
+{
+  return std::move(z).template get<I>();
+}
+
+template<camp::idx_t I, bool is_val, typename... Ts>
+RAJA_HOST_DEVICE constexpr std::remove_reference_t<
+    RAJA::zip_tuple_element_t<I, zip_tuple<is_val, Ts...>>> const&&
 get(zip_tuple<is_val, Ts...> const&& z) noexcept
-{ return std::move(z).template get<I>(); }
+{
+  return std::move(z).template get<I>();
+}
 
 namespace detail
 {
 
 struct PassThrough
 {
-  template < typename T >
+  template<typename T>
   RAJA_HOST_DEVICE RAJA_INLINE auto operator()(T&& t) const
-    -> decltype(std::forward<T>(t))
+      -> decltype(std::forward<T>(t))
   {
     return std::forward<T>(t);
   }
@@ -81,9 +95,9 @@ struct PassThrough
 
 struct Move
 {
-  template < typename T >
+  template<typename T>
   RAJA_HOST_DEVICE RAJA_INLINE auto operator()(T&& t) const
-    -> decltype(std::move(t))
+      -> decltype(std::move(t))
   {
     return std::move(t);
   }
@@ -91,9 +105,9 @@ struct Move
 
 struct PreInc
 {
-  template< typename Iter >
+  template<typename Iter>
   RAJA_HOST_DEVICE inline auto operator()(Iter&& iter) const
-    -> decltype(++std::forward<Iter>(iter))
+      -> decltype(++std::forward<Iter>(iter))
   {
     return ++std::forward<Iter>(iter);
   }
@@ -101,33 +115,35 @@ struct PreInc
 
 struct PreDec
 {
-  template< typename Iter >
+  template<typename Iter>
   RAJA_HOST_DEVICE inline auto operator()(Iter&& iter) const
-    -> decltype(--std::forward<Iter>(iter))
+      -> decltype(--std::forward<Iter>(iter))
   {
     return --std::forward<Iter>(iter);
   }
 };
 
-template < typename difference_type >
+template<typename difference_type>
 struct PlusEq
 {
   const difference_type& rhs;
-  template< typename Iter >
+
+  template<typename Iter>
   RAJA_HOST_DEVICE inline auto operator()(Iter&& iter) const
-    -> decltype(std::forward<Iter>(iter) += rhs)
+      -> decltype(std::forward<Iter>(iter) += rhs)
   {
     return std::forward<Iter>(iter) += rhs;
   }
 };
 
-template < typename difference_type >
+template<typename difference_type>
 struct MinusEq
 {
   const difference_type& rhs;
-  template< typename Iter >
+
+  template<typename Iter>
   RAJA_HOST_DEVICE inline auto operator()(Iter&& iter) const
-    -> decltype(std::forward<Iter>(iter) -= rhs)
+      -> decltype(std::forward<Iter>(iter) -= rhs)
   {
     return std::forward<Iter>(iter) -= rhs;
   }
@@ -135,9 +151,9 @@ struct MinusEq
 
 struct DeRef
 {
-  template< typename Iter >
+  template<typename Iter>
   RAJA_HOST_DEVICE inline auto operator()(Iter&& iter) const
-    -> decltype(*std::forward<Iter>(iter))
+      -> decltype(*std::forward<Iter>(iter))
   {
     return *std::forward<Iter>(iter);
   }
@@ -145,7 +161,7 @@ struct DeRef
 
 struct Swap
 {
-  template< typename T0, typename T1 >
+  template<typename T0, typename T1>
   RAJA_HOST_DEVICE inline int operator()(T0&& t0, T1&& t1) const
   {
     using camp::safe_swap;
@@ -156,7 +172,7 @@ struct Swap
 
 struct IterSwap
 {
-  template< typename T0, typename T1 >
+  template<typename T0, typename T1>
   RAJA_HOST_DEVICE inline int operator()(T0&& t0, T1&& t1) const
   {
     using RAJA::safe_iter_swap;
@@ -165,13 +181,13 @@ struct IterSwap
   }
 };
 
-
 /*!
     \brief Call f on each member of t (f(t)...).
 */
-template < typename Tuple, typename F, camp::idx_t... Is >
-RAJA_HOST_DEVICE inline
-void zip_for_each_impl(Tuple&& t, F&& f, camp::idx_seq<Is...>)
+template<typename Tuple, typename F, camp::idx_t... Is>
+RAJA_HOST_DEVICE inline void zip_for_each_impl(Tuple&& t,
+                                               F&& f,
+                                               camp::idx_seq<Is...>)
 {
   camp::sink(std::forward<F>(f)(RAJA::get<Is>(std::forward<Tuple>(t)))...);
 }
@@ -179,51 +195,57 @@ void zip_for_each_impl(Tuple&& t, F&& f, camp::idx_seq<Is...>)
 /*!
     \brief Call f on each member of t0 and t1 (f(t0, t1)...).
 */
-template < typename Tuple0, typename Tuple1, typename F, camp::idx_t... Is >
-RAJA_HOST_DEVICE inline
-void zip_for_each_impl(Tuple0&& t0, Tuple1&& t1, F&& f, camp::idx_seq<Is...>)
+template<typename Tuple0, typename Tuple1, typename F, camp::idx_t... Is>
+RAJA_HOST_DEVICE inline void zip_for_each_impl(Tuple0&& t0,
+                                               Tuple1&& t1,
+                                               F&& f,
+                                               camp::idx_seq<Is...>)
 {
-  camp::sink(std::forward<F>(f)(RAJA::get<Is>(std::forward<Tuple0>(t0)), RAJA::get<Is>(std::forward<Tuple1>(t1)))...);
+  camp::sink(std::forward<F>(f)(RAJA::get<Is>(std::forward<Tuple0>(t0)),
+                                RAJA::get<Is>(std::forward<Tuple1>(t1)))...);
 }
 
 /*!
     \brief Call f on each member of t (f(t)...).
 */
-template < typename Tuple, typename F >
-RAJA_HOST_DEVICE inline
-void zip_for_each(Tuple&& t, F&& f)
+template<typename Tuple, typename F>
+RAJA_HOST_DEVICE inline void zip_for_each(Tuple&& t, F&& f)
 {
-  zip_for_each_impl(std::forward<Tuple>(t), std::forward<F>(f), typename camp::decay<Tuple>::IdxSeq{});
+  zip_for_each_impl(std::forward<Tuple>(t), std::forward<F>(f),
+                    typename camp::decay<Tuple>::IdxSeq {});
 }
 
 /*!
     \brief Call f on each member of t0 and t1 (f(t0, t1)...).
 */
-template < typename Tuple0, typename Tuple1, typename F >
-RAJA_HOST_DEVICE inline
-void zip_for_each(Tuple0&& t0, Tuple1&& t1, F&& f)
+template<typename Tuple0, typename Tuple1, typename F>
+RAJA_HOST_DEVICE inline void zip_for_each(Tuple0&& t0, Tuple1&& t1, F&& f)
 {
-  static_assert(std::is_same<typename camp::decay<Tuple0>::IdxSeq, typename camp::decay<Tuple1>::IdxSeq>::value,
-      "Tuple0 and Tuple1 must have the same size");
-  zip_for_each_impl(std::forward<Tuple0>(t0), std::forward<Tuple1>(t1), std::forward<F>(f), typename camp::decay<Tuple0>::IdxSeq{});
+  static_assert(std::is_same<typename camp::decay<Tuple0>::IdxSeq,
+                             typename camp::decay<Tuple1>::IdxSeq>::value,
+                "Tuple0 and Tuple1 must have the same size");
+  zip_for_each_impl(std::forward<Tuple0>(t0), std::forward<Tuple1>(t1),
+                    std::forward<F>(f),
+                    typename camp::decay<Tuple0>::IdxSeq {});
 }
 
-} // end namespace detail
+}  // end namespace detail
 
 /*!
     \brief Tuple used by ZipIterator for storing multiple references and values.
-    Acts like a reference to its members allowing copy/move construction/assignment
-    based on the reference type of the zip_tuple.
+    Acts like a reference to its members allowing copy/move
+    construction/assignment based on the reference type of the zip_tuple.
 */
-template < bool is_val, typename ... Ts >
+template<bool is_val, typename... Ts>
 struct zip_tuple
 {
   using value_type = RAJA::tuple<Ts...>;
 
-  template < typename T >
-  using opp_type = typename std::conditional< is_val,
-        typename std::add_lvalue_reference<T>::type,
-        typename std::remove_reference<T>::type >::type;
+  template<typename T>
+  using opp_type =
+      typename std::conditional<is_val,
+                                typename std::add_lvalue_reference<T>::type,
+                                typename std::remove_reference<T>::type>::type;
 
   // zip_tuple type with opposite is_val
   using opp_tuple = zip_tuple<!is_val, opp_type<Ts>...>;
@@ -232,76 +254,125 @@ struct zip_tuple
   using IdxSeq = camp::make_idx_seq_t<sizeof...(Ts)>;
 
   // constructor from types convertible to Ts
-  template < typename ... Os
-           , typename = concepts::enable_if<type_traits::convertible_to<Os&&, Ts>...> >
+  template<
+      typename... Os,
+      typename = concepts::enable_if<type_traits::convertible_to<Os&&, Ts>...>>
   RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(Os&&... os)
-    : m_tuple(std::forward<Os>(os)...) { }
+      : m_tuple(std::forward<Os>(os)...)
+  {}
 
   // assignment from types convertible to Ts
-  template < typename ... Os
-           , typename = concepts::enable_if<type_traits::convertible_to<Os&&, typename std::remove_reference<Ts>::type>...> >
+  template<typename... Os,
+           typename = concepts::enable_if<type_traits::convertible_to<
+               Os&&,
+               typename std::remove_reference<Ts>::type>...>>
   zip_tuple& assign(Os&&... os)
-  { return assign_helper(IdxSeq{}, std::forward<Os>(os)...); }
+  {
+    return assign_helper(IdxSeq {}, std::forward<Os>(os)...);
+  }
 
   // copy and move constructors
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple &      o)
-    : zip_tuple(          o , IdxSeq{}) { }
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple& o) : zip_tuple(o, IdxSeq {})
+  {}
+
   RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple const& o)
-    : zip_tuple(          o , IdxSeq{}) { }
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple &&     o)
-    : zip_tuple(std::move(o), IdxSeq{}) { } // move if is_val, pass-through otherwise
+      : zip_tuple(o, IdxSeq {})
+  {}
+
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple&& o)
+      : zip_tuple(std::move(o), IdxSeq {})
+  {}  // move if is_val, pass-through otherwise
 
   // copy and move assignment operators
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& operator=(zip_tuple &      o)
-  { return assign_helper(          o , IdxSeq{}); }
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& operator=(zip_tuple& o)
+  {
+    return assign_helper(o, IdxSeq {});
+  }
+
   RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& operator=(zip_tuple const& o)
-  { return assign_helper(          o , IdxSeq{}); }
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& operator=(zip_tuple &&     o)
-  { return assign_helper(std::move(o), IdxSeq{}); }
+  {
+    return assign_helper(o, IdxSeq {});
+  }
+
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& operator=(zip_tuple&& o)
+  {
+    return assign_helper(std::move(o), IdxSeq {});
+  }
 
   // copy and move constructors from opp_tuple type zip_tuples
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple &      o)
-    : zip_tuple(          o , IdxSeq{}) { }
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple& o) : zip_tuple(o, IdxSeq {})
+  {}
+
   RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple const& o)
-    : zip_tuple(          o , IdxSeq{}) { }
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple &&     o)
-    : zip_tuple(std::move(o), IdxSeq{}) { } // move if is_val, pass-through otherwise
+      : zip_tuple(o, IdxSeq {})
+  {}
+
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple&& o)
+      : zip_tuple(std::move(o), IdxSeq {})
+  {}  // move if is_val, pass-through otherwise
 
   // copy and move assignment operators from opp_tuple type zip_tuples
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& operator=(opp_tuple &      o)
-  { return assign_helper(          o , IdxSeq{}); }
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& operator=(opp_tuple& o)
+  {
+    return assign_helper(o, IdxSeq {});
+  }
+
   RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& operator=(opp_tuple const& o)
-  { return assign_helper(          o , IdxSeq{}); }
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& operator=(opp_tuple &&     o)
-  { return assign_helper(std::move(o), IdxSeq{}); }
+  {
+    return assign_helper(o, IdxSeq {});
+  }
+
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& operator=(opp_tuple&& o)
+  {
+    return assign_helper(std::move(o), IdxSeq {});
+  }
 
   // get member functions for zip_tuples
   // the reference type returned by get depends on the reference type
   // of the zip_tuple that get is called on
-  template < camp::idx_t I >
-  RAJA_HOST_DEVICE constexpr                         RAJA::tuple_element_t<I, value_type> & get() & noexcept
-  { return RAJA::get<I>(m_tuple); }
-  template < camp::idx_t I >
-  RAJA_HOST_DEVICE constexpr                         RAJA::tuple_element_t<I, value_type> const& get() const& noexcept
-  { return RAJA::get<I>(m_tuple); }
-  template < camp::idx_t I >
-  RAJA_HOST_DEVICE constexpr std::remove_reference_t<RAJA::tuple_element_t<I, value_type>> && get() && noexcept
-  { return std::move(RAJA::get<I>(m_tuple)); }
-  template < camp::idx_t I >
-  RAJA_HOST_DEVICE constexpr std::remove_reference_t<RAJA::tuple_element_t<I, value_type>> const&& get() const&& noexcept
-  { return std::move(RAJA::get<I>(m_tuple)); }
+  template<camp::idx_t I>
+  RAJA_HOST_DEVICE constexpr RAJA::tuple_element_t<I, value_type>&
+  get() & noexcept
+  {
+    return RAJA::get<I>(m_tuple);
+  }
+
+  template<camp::idx_t I>
+  RAJA_HOST_DEVICE constexpr RAJA::tuple_element_t<I, value_type> const& get()
+      const& noexcept
+  {
+    return RAJA::get<I>(m_tuple);
+  }
+
+  template<camp::idx_t I>
+  RAJA_HOST_DEVICE constexpr std::remove_reference_t<
+      RAJA::tuple_element_t<I, value_type>>&&
+  get() && noexcept
+  {
+    return std::move(RAJA::get<I>(m_tuple));
+  }
+
+  template<camp::idx_t I>
+  RAJA_HOST_DEVICE constexpr std::remove_reference_t<
+      RAJA::tuple_element_t<I, value_type>> const&&
+  get() const&& noexcept
+  {
+    return std::move(RAJA::get<I>(m_tuple));
+  }
 
   // safe_swap that calls swap on each pair in the tuple
-  RAJA_HOST_DEVICE friend RAJA_INLINE void safe_swap(zip_tuple& lhs, zip_tuple& rhs)
+  RAJA_HOST_DEVICE friend RAJA_INLINE void safe_swap(zip_tuple& lhs,
+                                                     zip_tuple& rhs)
   {
-    detail::zip_for_each(lhs, rhs, detail::Swap{});
+    detail::zip_for_each(lhs, rhs, detail::Swap {});
   }
 
   // safe_swap for swapping zip_tuples with opposite is_val
   // calls swap on each pair in the tuple
-  RAJA_HOST_DEVICE friend RAJA_INLINE void safe_swap(zip_tuple& lhs, opp_tuple& rhs)
+  RAJA_HOST_DEVICE friend RAJA_INLINE void safe_swap(zip_tuple& lhs,
+                                                     opp_tuple& rhs)
   {
-    detail::zip_for_each(lhs, rhs, detail::Swap{});
+    detail::zip_for_each(lhs, rhs, detail::Swap {});
   }
 
   // allow printing of zip_tuples by printing value_type
@@ -313,67 +384,119 @@ struct zip_tuple
 private:
   // move if is_val is true, otherwise copy in move constructor
   // this allows values to be moved, and references to stay lvalue references
-  using IsValMover = typename std::conditional<is_val, detail::Move, detail::PassThrough>::type;
+  using IsValMover = typename std::
+      conditional<is_val, detail::Move, detail::PassThrough>::type;
 
   value_type m_tuple;
 
   // assignment helper from types convertible to Ts
-  template < typename ... Os, camp::idx_t ... Is >
+  template<typename... Os, camp::idx_t... Is>
   zip_tuple& assign_helper(camp::idx_seq<Is...>, Os&&... os)
-  { camp::sink(get<Is>() = std::forward<Os>(os)...); return *this; }
+  {
+    camp::sink(get<Is>() = std::forward<Os>(os)...);
+    return *this;
+  }
 
   // copy and move constructor helpers
-  template < camp::idx_t ... Is >
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple &      o, camp::idx_seq<Is...>)
-    : zip_tuple(RAJA::get<Is>(             o )...) { }
-  template < camp::idx_t ... Is >
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple const& o, camp::idx_seq<Is...>)
-    : zip_tuple(RAJA::get<Is>(             o )...) { }
-  template < camp::idx_t ... Is >
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple &&     o, camp::idx_seq<Is...>)
-    : zip_tuple(RAJA::get<Is>(IsValMover{}(o))...) { } // move if is_val, pass-through otherwise
+  template<camp::idx_t... Is>
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple& o, camp::idx_seq<Is...>)
+      : zip_tuple(RAJA::get<Is>(o)...)
+  {}
+
+  template<camp::idx_t... Is>
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple const& o,
+                                         camp::idx_seq<Is...>)
+      : zip_tuple(RAJA::get<Is>(o)...)
+  {}
+
+  template<camp::idx_t... Is>
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(zip_tuple&& o, camp::idx_seq<Is...>)
+      : zip_tuple(RAJA::get<Is>(IsValMover {}(o))...)
+  {}  // move if is_val, pass-through otherwise
 
   // copy and move assignment operator helpers
-  template < camp::idx_t ... Is >
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(zip_tuple &      o, camp::idx_seq<Is...>)
-  { if (this != &o) { camp::sink(get<Is>() = RAJA::get<Is>(          o )...); } return *this; }
-  template < camp::idx_t ... Is >
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(zip_tuple const& o, camp::idx_seq<Is...>)
-  { if (this != &o) { camp::sink(get<Is>() = RAJA::get<Is>(          o )...); } return *this; }
-  template < camp::idx_t ... Is >
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(zip_tuple &&     o, camp::idx_seq<Is...>)
-  { if (this != &o) { camp::sink(get<Is>() = RAJA::get<Is>(std::move(o))...); } return *this; }
+  template<camp::idx_t... Is>
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(zip_tuple& o,
+                                                        camp::idx_seq<Is...>)
+  {
+    if (this != &o)
+    {
+      camp::sink(get<Is>() = RAJA::get<Is>(o)...);
+    }
+    return *this;
+  }
+
+  template<camp::idx_t... Is>
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(zip_tuple const& o,
+                                                        camp::idx_seq<Is...>)
+  {
+    if (this != &o)
+    {
+      camp::sink(get<Is>() = RAJA::get<Is>(o)...);
+    }
+    return *this;
+  }
+
+  template<camp::idx_t... Is>
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(zip_tuple&& o,
+                                                        camp::idx_seq<Is...>)
+  {
+    if (this != &o)
+    {
+      camp::sink(get<Is>() = RAJA::get<Is>(std::move(o))...);
+    }
+    return *this;
+  }
 
   // copy and move constructor helpers from opp_tuple type zip_tuples
-  template < camp::idx_t ... Is >
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple &      o, camp::idx_seq<Is...>)
-    : zip_tuple(RAJA::get<Is>(             o )...) { }
-  template < camp::idx_t ... Is >
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple const& o, camp::idx_seq<Is...>)
-    : zip_tuple(RAJA::get<Is>(             o )...) { }
-  template < camp::idx_t ... Is >
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple &&     o, camp::idx_seq<Is...>)
-    : zip_tuple(RAJA::get<Is>(IsValMover{}(o))...) { } // move if is_val, pass-through otherwise
+  template<camp::idx_t... Is>
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple& o, camp::idx_seq<Is...>)
+      : zip_tuple(RAJA::get<Is>(o)...)
+  {}
+
+  template<camp::idx_t... Is>
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple const& o,
+                                         camp::idx_seq<Is...>)
+      : zip_tuple(RAJA::get<Is>(o)...)
+  {}
+
+  template<camp::idx_t... Is>
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple(opp_tuple&& o, camp::idx_seq<Is...>)
+      : zip_tuple(RAJA::get<Is>(IsValMover {}(o))...)
+  {}  // move if is_val, pass-through otherwise
 
   // copy and move assignment operator helpers from opp_tuple type zip_tuples
-  template < camp::idx_t ... Is >
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(opp_tuple &      o, camp::idx_seq<Is...>)
-  { camp::sink(get<Is>() = RAJA::get<Is>(          o )...); return *this; }
-  template < camp::idx_t ... Is >
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(opp_tuple const& o, camp::idx_seq<Is...>)
-  { camp::sink(get<Is>() = RAJA::get<Is>(          o )...); return *this; }
-  template < camp::idx_t ... Is >
-  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(opp_tuple &&     o, camp::idx_seq<Is...>)
-  { camp::sink(get<Is>() = RAJA::get<Is>(std::move(o))...); return *this; }
+  template<camp::idx_t... Is>
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(opp_tuple& o,
+                                                        camp::idx_seq<Is...>)
+  {
+    camp::sink(get<Is>() = RAJA::get<Is>(o)...);
+    return *this;
+  }
+
+  template<camp::idx_t... Is>
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(opp_tuple const& o,
+                                                        camp::idx_seq<Is...>)
+  {
+    camp::sink(get<Is>() = RAJA::get<Is>(o)...);
+    return *this;
+  }
 
+  template<camp::idx_t... Is>
+  RAJA_HOST_DEVICE RAJA_INLINE zip_tuple& assign_helper(opp_tuple&& o,
+                                                        camp::idx_seq<Is...>)
+  {
+    camp::sink(get<Is>() = RAJA::get<Is>(std::move(o))...);
+    return *this;
+  }
 };
 
 // alias zip_ref to zip_tuple capable of storing references (!is_val)
-template < typename ... Ts >
+template<typename... Ts>
 using zip_ref = zip_tuple<false, Ts...>;
 
 // alias zip_val to zip_tuple suitable for storing values (is_val)
-template < typename ... Ts >
+template<typename... Ts>
 using zip_val = zip_tuple<true, Ts...>;
 
 }  // end namespace RAJA
diff --git a/src/AlignedRangeIndexSetBuilders.cpp b/src/AlignedRangeIndexSetBuilders.cpp
index d95859d71d..1fc5b37f27 100644
--- a/src/AlignedRangeIndexSetBuilders.cpp
+++ b/src/AlignedRangeIndexSetBuilders.cpp
@@ -51,7 +51,8 @@ void buildIndexSetAligned(
   if (length == 0) return;
 
   /* only transform relatively large */
-  if (length > range_min_length) {
+  if (length > range_min_length)
+  {
     /* build a rindex array from an index array */
     RAJA::Index_type docount = 0;
     RAJA::Index_type inrange = -1;
@@ -60,30 +61,41 @@ void buildIndexSetAligned(
     /* first, gather statistics */
     /****************************/
 
-    RAJA::Index_type scanVal = indices_in[0];
+    RAJA::Index_type scanVal    = indices_in[0];
     RAJA::Index_type sliceCount = 0;
-    for (RAJA::Index_type ii = 1; ii < length; ++ii) {
+    for (RAJA::Index_type ii = 1; ii < length; ++ii)
+    {
       RAJA::Index_type lookAhead = indices_in[ii];
 
-      if (inrange == -1) {
-        if ((lookAhead == scanVal + 1) && ((scanVal % range_align) == 0)) {
+      if (inrange == -1)
+      {
+        if ((lookAhead == scanVal + 1) && ((scanVal % range_align) == 0))
+        {
           inrange = 1;
-        } else {
+        }
+        else
+        {
           inrange = 0;
         }
       }
 
-      if (lookAhead == scanVal + 1) {
-        if ((inrange == 0) && ((scanVal % range_align) == 0)) {
-          if (sliceCount != 0) {
+      if (lookAhead == scanVal + 1)
+      {
+        if ((inrange == 0) && ((scanVal % range_align) == 0))
+        {
+          if (sliceCount != 0)
+          {
             docount += 1 + sliceCount; /* length + singletons */
           }
-          inrange = 1;
+          inrange    = 1;
           sliceCount = 0;
         }
         ++sliceCount; /* account for scanVal */
-      } else {
-        if (inrange == 1) {
+      }
+      else
+      {
+        if (inrange == 1)
+        {
           /* we can tighten this up by schleping any trailing */
           /* sigletons off into the subsequent singleton */
           /* array.  We would then also need to recheck the */
@@ -93,9 +105,11 @@ void buildIndexSetAligned(
           /* a range array */
           ++sliceCount;
           docount += 2; /* length + begin */
-          inrange = 0;
+          inrange    = 0;
           sliceCount = 0;
-        } else {
+        }
+        else
+        {
           ++sliceCount; /* account for scanVal */
         }
       }
@@ -103,22 +117,29 @@ void buildIndexSetAligned(
       scanVal = lookAhead;
     }  // end loop to gather statistics
 
-    if (inrange != -1) {
-      if (inrange) {
+    if (inrange != -1)
+    {
+      if (inrange)
+      {
         ++sliceCount;
         docount += 2; /* length + begin */
-      } else {
+      }
+      else
+      {
         ++sliceCount;
         docount += 1 + sliceCount; /* length + singletons */
       }
-    } else if (scanVal != -1) {
+    }
+    else if (scanVal != -1)
+    {
       ++sliceCount;
       docount += 2;
     }
     ++docount; /* zero length termination */
 
     /* What is the cutoff criteria for generating the rindex array? */
-    if (docount < (length * (range_align - 1)) / range_align) {
+    if (docount < (length * (range_align - 1)) / range_align)
+    {
       /* The rindex array can either contain a pointer into the */
       /* original index array, *or* it can repack the data from the */
       /* original index array.  Benefits of repacking could include */
@@ -132,33 +153,44 @@ void buildIndexSetAligned(
       RAJA::Index_type dobegin;
       inrange = -1;
 
-      scanVal = indices_in[0];
+      scanVal    = indices_in[0];
       sliceCount = 0;
-      dobegin = scanVal;
-      for (RAJA::Index_type ii = 1; ii < length; ++ii) {
+      dobegin    = scanVal;
+      for (RAJA::Index_type ii = 1; ii < length; ++ii)
+      {
         RAJA::Index_type lookAhead = indices_in[ii];
 
-        if (inrange == -1) {
-          if ((lookAhead == scanVal + 1) && ((scanVal % range_align) == 0)) {
+        if (inrange == -1)
+        {
+          if ((lookAhead == scanVal + 1) && ((scanVal % range_align) == 0))
+          {
             inrange = 1;
-          } else {
+          }
+          else
+          {
             inrange = 0;
             dobegin = ii - 1;
           }
         }
-        if (lookAhead == scanVal + 1) {
-          if ((inrange == 0) && ((scanVal % range_align) == 0)) {
-            if (sliceCount != 0) {
-              iset.push_back(ListSegment(&indices_in[dobegin], sliceCount,
-                                          work_res));
+        if (lookAhead == scanVal + 1)
+        {
+          if ((inrange == 0) && ((scanVal % range_align) == 0))
+          {
+            if (sliceCount != 0)
+            {
+              iset.push_back(
+                  ListSegment(&indices_in[dobegin], sliceCount, work_res));
             }
-            inrange = 1;
-            dobegin = scanVal;
+            inrange    = 1;
+            dobegin    = scanVal;
             sliceCount = 0;
           }
           ++sliceCount; /* account for scanVal */
-        } else {
-          if (inrange == 1) {
+        }
+        else
+        {
+          if (inrange == 1)
+          {
             /* we can tighten this up by schleping any trailing */
             /* sigletons off into the subsequent singleton */
             /* array.  We would then also need to recheck the */
@@ -168,10 +200,12 @@ void buildIndexSetAligned(
             /* a range array */
             ++sliceCount;
             iset.push_back(RangeSegment(dobegin, dobegin + sliceCount));
-            inrange = 0;
+            inrange    = 0;
             sliceCount = 0;
-            dobegin = ii;
-          } else {
+            dobegin    = ii;
+          }
+          else
+          {
             ++sliceCount; /* account for scanVal */
           }
         }
@@ -179,22 +213,32 @@ void buildIndexSetAligned(
         scanVal = lookAhead;
       }  // for (RAJA::Index_type ii ...
 
-      if (inrange != -1) {
-        if (inrange) {
+      if (inrange != -1)
+      {
+        if (inrange)
+        {
           ++sliceCount;
           iset.push_back(RangeSegment(dobegin, dobegin + sliceCount));
-        } else {
+        }
+        else
+        {
           ++sliceCount;
-          iset.push_back(ListSegment(&indices_in[dobegin], sliceCount,
-                                      work_res));
+          iset.push_back(
+              ListSegment(&indices_in[dobegin], sliceCount, work_res));
         }
-      } else if (scanVal != -1) {
+      }
+      else if (scanVal != -1)
+      {
         iset.push_back(ListSegment(&scanVal, 1, work_res));
       }
-    } else {  // !(docount < (length*range_align-1))/range_align)
+    }
+    else
+    {  // !(docount < (length*range_align-1))/range_align)
       iset.push_back(ListSegment(indices_in, length, work_res));
     }
-  } else {  // else !(length > range_min_length)
+  }
+  else
+  {  // else !(length > range_min_length)
     iset.push_back(ListSegment(indices_in, length, work_res));
   }
 }
diff --git a/src/DepGraphNode.cpp b/src/DepGraphNode.cpp
index 176d9e855d..df994ce396 100644
--- a/src/DepGraphNode.cpp
+++ b/src/DepGraphNode.cpp
@@ -29,9 +29,11 @@ void DepGraphNode::print(std::ostream& os) const
      << m_semaphore_reload_value << std::endl;
 
   os << "     num dep tasks = " << m_num_dep_tasks;
-  if (m_num_dep_tasks > 0) {
+  if (m_num_dep_tasks > 0)
+  {
     os << " ( ";
-    for (int jj = 0; jj < m_num_dep_tasks; ++jj) {
+    for (int jj = 0; jj < m_num_dep_tasks; ++jj)
+    {
       os << m_dep_task[jj] << "  ";
     }
     os << " )";
diff --git a/src/KokkosPluginLoader.cpp b/src/KokkosPluginLoader.cpp
index fa05e0faf8..daa29681ad 100644
--- a/src/KokkosPluginLoader.cpp
+++ b/src/KokkosPluginLoader.cpp
@@ -15,43 +15,45 @@
 const uint64_t kokkos_interface_version = 20171029;
 
 RAJA_INLINE
-bool
-isSharedObject(const std::string& filename)
+bool isSharedObject(const std::string& filename)
 {
-  return (filename.size() > 3 && !filename.compare(filename.size() - 3, 3, ".so"));
+  return (filename.size() > 3 &&
+          !filename.compare(filename.size() - 3, 3, ".so"));
 }
 
 template<typename function>
-RAJA_INLINE
-void
-getFunction(void* plugin, std::vector<function>& functions, const char* fname)
+RAJA_INLINE void getFunction(void* plugin,
+                             std::vector<function>& functions,
+                             const char* fname)
 {
-  #ifndef _WIN32
-  function func = (function) dlsym(plugin, fname);
+#ifndef _WIN32
+  function func = (function)dlsym(plugin, fname);
   if (func)
     functions.push_back(func);
   else
     printf("[KokkosPluginLoader]: dlsym failed: %s\n", dlerror());
-  #else
+#else
   RAJA_UNUSED_ARG(plugin);
   RAJA_UNUSED_ARG(functions);
   RAJA_UNUSED_ARG(fname);
-  #endif
+#endif
 }
 
-namespace RAJA {
-namespace util {
+namespace RAJA
+{
+namespace util
+{
 
 KokkosPluginLoader::KokkosPluginLoader()
 {
-  char *env = getenv("KOKKOS_PLUGINS");
+  char* env = getenv("KOKKOS_PLUGINS");
   if (env == nullptr)
   {
     return;
   }
   initDirectory(std::string(env));
 
-  for (auto &func : init_functions)
+  for (auto& func : init_functions)
   {
     func(0, kokkos_interface_version, 0, nullptr);
   }
@@ -59,7 +61,7 @@ KokkosPluginLoader::KokkosPluginLoader()
 
 void KokkosPluginLoader::preLaunch(const RAJA::util::PluginContext& p)
 {
-  for (auto &func : pre_functions)
+  for (auto& func : pre_functions)
   {
     func("", 0, &(p.kID));
   }
@@ -67,7 +69,7 @@ void KokkosPluginLoader::preLaunch(const RAJA::util::PluginContext& p)
 
 void KokkosPluginLoader::postLaunch(const RAJA::util::PluginContext& p)
 {
-  for (auto &func : post_functions)
+  for (auto& func : post_functions)
   {
     func(p.kID);
   }
@@ -75,7 +77,7 @@ void KokkosPluginLoader::postLaunch(const RAJA::util::PluginContext& p)
 
 void KokkosPluginLoader::finalize()
 {
-  for (auto &func : finalize_functions)
+  for (auto& func : finalize_functions)
   {
     func();
   }
@@ -86,10 +88,10 @@ void KokkosPluginLoader::finalize()
 }
 
 // Initialize plugin from a shared object file specified by 'path'.
-void KokkosPluginLoader::initPlugin(const std::string &path)
+void KokkosPluginLoader::initPlugin(const std::string& path)
 {
-  #ifndef _WIN32
-  void *plugin = dlopen(path.c_str(), RTLD_NOW | RTLD_GLOBAL);
+#ifndef _WIN32
+  void* plugin = dlopen(path.c_str(), RTLD_NOW | RTLD_GLOBAL);
   if (!plugin)
   {
     printf("[KokkosPluginLoader]: dlopen failed: %s\n", dlerror());
@@ -98,28 +100,31 @@ void KokkosPluginLoader::initPlugin(const std::string &path)
   // Getting and storing supported kokkos functions.
   getFunction<init_function>(plugin, init_functions, "kokkosp_init_library");
 
-  getFunction<pre_function>(plugin, pre_functions, "kokkosp_begin_parallel_for");
+  getFunction<pre_function>(plugin, pre_functions,
+                            "kokkosp_begin_parallel_for");
 
-  getFunction<post_function>(plugin, post_functions, "kokkosp_end_parallel_for");
+  getFunction<post_function>(plugin, post_functions,
+                             "kokkosp_end_parallel_for");
 
-  getFunction<finalize_function>(plugin, finalize_functions, "kokkosp_finalize_library");
-  #else
+  getFunction<finalize_function>(plugin, finalize_functions,
+                                 "kokkosp_finalize_library");
+#else
   RAJA_UNUSED_ARG(path);
-  #endif
+#endif
 }
 
 // Initialize all plugins in a directory specified by 'path'.
-void KokkosPluginLoader::initDirectory(const std::string &path)
+void KokkosPluginLoader::initDirectory(const std::string& path)
 {
-  #ifndef _WIN32
+#ifndef _WIN32
   if (isSharedObject(path))
   {
     initPlugin(path);
     return;
   }
-  
-  DIR *dir;
-  struct dirent *file;
+
+  DIR* dir;
+  struct dirent* file;
 
   if ((dir = opendir(path.c_str())) != NULL)
   {
@@ -136,14 +141,16 @@ void KokkosPluginLoader::initDirectory(const std::string &path)
   {
     perror("[KokkosPluginLoader]: Could not open plugin directory");
   }
-  #else
+#else
   RAJA_UNUSED_ARG(path);
-  #endif
+#endif
 }
 
 void linkKokkosPluginLoader() {}
 
-} // end namespace util
-} // end namespace RAJA
+}  // end namespace util
+}  // end namespace RAJA
 
-static RAJA::util::PluginRegistry::add<RAJA::util::KokkosPluginLoader> P("KokkosPluginLoader", "Dynamically load plugins ported from the Kokkos library.");
+static RAJA::util::PluginRegistry::add<RAJA::util::KokkosPluginLoader> P(
+    "KokkosPluginLoader",
+    "Dynamically load plugins ported from the Kokkos library.");
diff --git a/src/LockFreeIndexSetBuilders.cpp b/src/LockFreeIndexSetBuilders.cpp
index f9ef1f51c8..3b5f314138 100644
--- a/src/LockFreeIndexSetBuilders.cpp
+++ b/src/LockFreeIndexSetBuilders.cpp
@@ -38,15 +38,14 @@ namespace RAJA
  ******************************************************************************
  *
  * Generate a lock-free "block" index set (planar division) containing
- * range segments. 
+ * range segments.
  *
  ******************************************************************************
  */
-void buildLockFreeBlockIndexset(
-    RAJA::TypedIndexSet<RAJA::RangeSegment>& iset,
-    int fastDim,
-    int midDim,
-    int slowDim)
+void buildLockFreeBlockIndexset(RAJA::TypedIndexSet<RAJA::RangeSegment>& iset,
+                                int fastDim,
+                                int midDim,
+                                int slowDim)
 {
   constexpr int PROFITABLE_ENTITY_THRESHOLD_BLOCK = 100;
 
@@ -56,10 +55,13 @@ void buildLockFreeBlockIndexset(
 
   if ((midDim | slowDim) == 0) /* 1d mesh */
   {
-    if (fastDim / PROFITABLE_ENTITY_THRESHOLD_BLOCK <= 1) {
+    if (fastDim / PROFITABLE_ENTITY_THRESHOLD_BLOCK <= 1)
+    {
       // printf("%d %d\n", 0, fastDim) ;
       iset.push_back(RAJA::RangeSegment(0, fastDim));
-    } else {
+    }
+    else
+    {
       /* This just sets up the schedule -- a truly safe */
       /* execution of this schedule would require a check */
       /* for completion of dependent threads before execution. */
@@ -68,22 +70,28 @@ void buildLockFreeBlockIndexset(
       /* profitability ratio is really bad, but for */
       /* now use the brain dead approach. */
       int numSegments = numThreads * 3;
-      for (int lane = 0; lane < 3; ++lane) {
-        for (int i = lane; i < numSegments; i += 3) {
+      for (int lane = 0; lane < 3; ++lane)
+      {
+        for (int i = lane; i < numSegments; i += 3)
+        {
           RAJA::Index_type start = i * fastDim / numSegments;
-          RAJA::Index_type end = (i + 1) * fastDim / numSegments;
+          RAJA::Index_type end   = (i + 1) * fastDim / numSegments;
           // printf("%d %d\n", start, end) ;
           iset.push_back(RAJA::RangeSegment(start, end));
         }
       }
     }
-  } else if (slowDim == 0) /* 2d mesh */
+  }
+  else if (slowDim == 0) /* 2d mesh */
   {
     int rowsPerSegment = midDim / (3 * numThreads);
-    if (rowsPerSegment == 0) {
+    if (rowsPerSegment == 0)
+    {
       // printf("%d %d\n", 0, fastDim*midDim) ;
       iset.push_back(RAJA::RangeSegment(0, fastDim * midDim));
-    } else {
+    }
+    else
+    {
       /* This just sets up the schedule -- a truly safe */
       /* execution of this schedule would require a check */
       /* for completion of dependent threads before execution. */
@@ -91,13 +99,15 @@ void buildLockFreeBlockIndexset(
       /* We might want to force one thread if the */
       /* profitability ratio is really bad, but for */
       /* now use the brain dead approach. */
-      for (int lane = 0; lane < 3; ++lane) {
-        for (int i = 0; i < numThreads; ++i) {
+      for (int lane = 0; lane < 3; ++lane)
+      {
+        for (int i = 0; i < numThreads; ++i)
+        {
           RAJA::Index_type startRow = i * midDim / numThreads;
-          RAJA::Index_type endRow = (i + 1) * midDim / numThreads;
-          RAJA::Index_type start = startRow * fastDim;
-          RAJA::Index_type end = endRow * fastDim;
-          RAJA::Index_type len = end - start;
+          RAJA::Index_type endRow   = (i + 1) * midDim / numThreads;
+          RAJA::Index_type start    = startRow * fastDim;
+          RAJA::Index_type end      = endRow * fastDim;
+          RAJA::Index_type len      = end - start;
           // printf("%d %d\n", start + (lane  )*len/3,
           //                   start + (lane+1)*len/3  ) ;
           iset.push_back(RAJA::RangeSegment(start + (lane)*len / 3,
@@ -105,7 +115,9 @@ void buildLockFreeBlockIndexset(
         }
       }
     }
-  } else { /* 3d mesh */
+  }
+  else
+  { /* 3d mesh */
 
     // this requires dependence graph - commenting out for now
 
@@ -209,14 +221,14 @@ void buildLockFreeColorIndexset(
     RAJA::Index_type* elemPermutation,
     RAJA::Index_type* ielemPermutation)
 {
-  bool done = false;
+  bool done      = false;
   bool* isMarked = new bool[numEntity];
 
-  RAJA::Index_type numWorkset = 0;
+  RAJA::Index_type numWorkset    = 0;
   RAJA::Index_type* worksetDelim = new RAJA::Index_type[numEntity];
 
   RAJA::Index_type worksetSize = 0;
-  RAJA::Index_type* workset = new RAJA::Index_type[numEntity];
+  RAJA::Index_type* workset    = new RAJA::Index_type[numEntity];
 
   RAJA::Index_type* rangeToDomain =
       new RAJA::Index_type[numEntityRange * numRangePerDomain];
@@ -225,12 +237,15 @@ void buildLockFreeColorIndexset(
   memset(rangeToDomainCount, 0, numEntityRange * sizeof(RAJA::Index_type));
 
   /* create an inverse mapping */
-  for (int i = 0; i < numEntity; ++i) {
-    for (int j = 0; j < numRangePerDomain; ++j) {
-      RAJA::Index_type id = domainToRange[i * numRangePerDomain + j];
+  for (int i = 0; i < numEntity; ++i)
+  {
+    for (int j = 0; j < numRangePerDomain; ++j)
+    {
+      RAJA::Index_type id  = domainToRange[i * numRangePerDomain + j];
       RAJA::Index_type idx = id * numRangePerDomain + rangeToDomainCount[id]++;
       if (idx > numEntityRange * numRangePerDomain ||
-          rangeToDomainCount[id] > numRangePerDomain) {
+          rangeToDomainCount[id] > numRangePerDomain)
+      {
         printf("foiled!\n");
         exit(-1);
       }
@@ -238,30 +253,39 @@ void buildLockFreeColorIndexset(
     }
   }
 
-  while (!done) {
+  while (!done)
+  {
     done = true;
 
-    for (int i = 0; i < numEntity; ++i) {
+    for (int i = 0; i < numEntity; ++i)
+    {
       isMarked[i] = false;
     }
 
-    for (int i = 0; i < worksetSize; ++i) {
+    for (int i = 0; i < worksetSize; ++i)
+    {
       isMarked[workset[i]] = true;
     }
 
-    for (int i = 0; i < numEntity; ++i) {
-      if (isMarked[i] == false) {
+    for (int i = 0; i < numEntity; ++i)
+    {
+      if (isMarked[i] == false)
+      {
         done = false;
-        if (worksetSize >= numEntity) {
+        if (worksetSize >= numEntity)
+        {
           printf("foiled!\n");
           exit(-1);
         }
         workset[worksetSize++] = i;
-        for (int j = 0; j < numRangePerDomain; ++j) {
+        for (int j = 0; j < numRangePerDomain; ++j)
+        {
           RAJA::Index_type id = domainToRange[i * numRangePerDomain + j];
-          for (int k = 0; k < rangeToDomainCount[id]; ++k) {
+          for (int k = 0; k < rangeToDomainCount[id]; ++k)
+          {
             RAJA::Index_type idx = rangeToDomain[id * numRangePerDomain + k];
-            if (idx < 0 || idx >= numEntity) {
+            if (idx < 0 || idx >= numEntity)
+            {
               printf("foiled!\n");
               exit(-1);
             }
@@ -270,7 +294,8 @@ void buildLockFreeColorIndexset(
         }
       }
     }
-    if (done == false) {
+    if (done == false)
+    {
       worksetDelim[numWorkset++] = worksetSize;
     }
   }
@@ -278,45 +303,58 @@ void buildLockFreeColorIndexset(
   delete[] rangeToDomainCount;
   delete[] rangeToDomain;
 
-  if (worksetSize != numEntity) {
+  if (worksetSize != numEntity)
+  {
     printf("foiled!!!\n");
     exit(-1);
   }
 
   /* we may want to create a permutation array here */
-  if (elemPermutation != 0l) {
+  if (elemPermutation != 0l)
+  {
     /* send back permutaion array, and corresponding range segments */
 
     memcpy(elemPermutation, &workset[0], numEntity * sizeof(int));
-    if (ielemPermutation != 0l) {
-      for (int i = 0; i < numEntity; ++i) {
+    if (ielemPermutation != 0l)
+    {
+      for (int i = 0; i < numEntity; ++i)
+      {
         ielemPermutation[elemPermutation[i]] = i;
       }
     }
     RAJA::Index_type end = 0;
-    for (int i = 0; i < numWorkset; ++i) {
+    for (int i = 0; i < numWorkset; ++i)
+    {
       RAJA::Index_type begin = end;
-      end = worksetDelim[i];
+      end                    = worksetDelim[i];
       iset.push_back(RAJA::RangeSegment(begin, end));
     }
-  } else {
+  }
+  else
+  {
     RAJA::Index_type end = 0;
-    for (int i = 0; i < numWorkset; ++i) {
+    for (int i = 0; i < numWorkset; ++i)
+    {
       RAJA::Index_type begin = end;
-      end = worksetDelim[i];
-      bool isRange = true;
-      for (int j = begin + 1; j < end; ++j) {
-        if (workset[j - 1] + 1 != workset[j]) {
+      end                    = worksetDelim[i];
+      bool isRange           = true;
+      for (int j = begin + 1; j < end; ++j)
+      {
+        if (workset[j - 1] + 1 != workset[j])
+        {
           isRange = false;
           break;
         }
       }
-      if (isRange) {
+      if (isRange)
+      {
         iset.push_back(
             RAJA::RangeSegment(workset[begin], workset[end - 1] + 1));
-      } else {
-        iset.push_back(RAJA::ListSegment(&workset[begin], end - begin,
-                                         work_res));
+      }
+      else
+      {
+        iset.push_back(
+            RAJA::ListSegment(&workset[begin], end - begin, work_res));
         // printf("segment %d\n", i) ;
         // for (int j=begin; j<end; ++j) {
         //    printf("%d\n", workset[j]) ;
diff --git a/src/MemUtils_CUDA.cpp b/src/MemUtils_CUDA.cpp
index 85ead614d9..a54b13bcfc 100644
--- a/src/MemUtils_CUDA.cpp
+++ b/src/MemUtils_CUDA.cpp
@@ -24,7 +24,6 @@
 
 #include "RAJA/policy/cuda/raja_cudaerrchk.hpp"
 
-
 namespace RAJA
 {
 
diff --git a/src/MemUtils_HIP.cpp b/src/MemUtils_HIP.cpp
index 97bd82775e..e50823205e 100644
--- a/src/MemUtils_HIP.cpp
+++ b/src/MemUtils_HIP.cpp
@@ -24,7 +24,6 @@
 
 #include "RAJA/policy/hip/raja_hiperrchk.hpp"
 
-
 namespace RAJA
 {
 
diff --git a/src/MemUtils_SYCL.cpp b/src/MemUtils_SYCL.cpp
index 0b5f1b8be6..dcf624a2c2 100644
--- a/src/MemUtils_SYCL.cpp
+++ b/src/MemUtils_SYCL.cpp
@@ -22,7 +22,6 @@
 
 #include "RAJA/policy/sycl/MemUtils_SYCL.hpp"
 
-
 namespace RAJA
 {
 
@@ -49,7 +48,7 @@ syclInfo tl_status;
 #endif
 
 //! State of raja sycl queue synchronization for sycl reducer objects
-std::unordered_map<cl::sycl::queue, bool> g_queue_info_map{
+std::unordered_map<cl::sycl::queue, bool> g_queue_info_map {
     {cl::sycl::queue(), true}};
 
 }  // namespace detail
diff --git a/src/PluginStrategy.cpp b/src/PluginStrategy.cpp
index e39c5718a8..eee0962fc4 100644
--- a/src/PluginStrategy.cpp
+++ b/src/PluginStrategy.cpp
@@ -9,22 +9,24 @@
 
 RAJA_INSTANTIATE_REGISTRY(PluginRegistry);
 
-namespace RAJA {
-namespace util {
+namespace RAJA
+{
+namespace util
+{
 
 PluginStrategy::PluginStrategy() = default;
 
-void PluginStrategy::init(const PluginOptions&) { }
+void PluginStrategy::init(const PluginOptions&) {}
 
-void PluginStrategy::preCapture(const PluginContext&) { }
+void PluginStrategy::preCapture(const PluginContext&) {}
 
-void PluginStrategy::postCapture(const PluginContext&) { }
+void PluginStrategy::postCapture(const PluginContext&) {}
 
-void PluginStrategy::preLaunch(const PluginContext&) { }
+void PluginStrategy::preLaunch(const PluginContext&) {}
 
-void PluginStrategy::postLaunch(const PluginContext&) { }
+void PluginStrategy::postLaunch(const PluginContext&) {}
 
-void PluginStrategy::finalize() { }
+void PluginStrategy::finalize() {}
 
-}
-}
+}  // namespace util
+}  // namespace RAJA
diff --git a/src/RuntimePluginLoader.cpp b/src/RuntimePluginLoader.cpp
index 3da10cda8c..988cac8f44 100644
--- a/src/RuntimePluginLoader.cpp
+++ b/src/RuntimePluginLoader.cpp
@@ -13,18 +13,20 @@
 #endif
 
 RAJA_INLINE
-bool
-isSharedObject(const std::string& filename)
+bool isSharedObject(const std::string& filename)
 {
-  return (filename.size() > 3 && !filename.compare(filename.size() - 3, 3, ".so"));
+  return (filename.size() > 3 &&
+          !filename.compare(filename.size() - 3, 3, ".so"));
 }
 
-namespace RAJA {
-namespace util {
-  
+namespace RAJA
+{
+namespace util
+{
+
 RuntimePluginLoader::RuntimePluginLoader()
 {
-  char *env = ::getenv("RAJA_PLUGINS");
+  char* env = ::getenv("RAJA_PLUGINS");
   if (nullptr == env)
   {
     return;
@@ -35,7 +37,7 @@ RuntimePluginLoader::RuntimePluginLoader()
 void RuntimePluginLoader::init(const RAJA::util::PluginOptions& p)
 {
   initDirectory(p.str);
-  for (auto &plugin : plugins)
+  for (auto& plugin : plugins)
   {
     plugin->init(p);
   }
@@ -43,7 +45,7 @@ void RuntimePluginLoader::init(const RAJA::util::PluginOptions& p)
 
 void RuntimePluginLoader::preCapture(const RAJA::util::PluginContext& p)
 {
-  for (auto &plugin : plugins)
+  for (auto& plugin : plugins)
   {
     plugin->preCapture(p);
   }
@@ -51,7 +53,7 @@ void RuntimePluginLoader::preCapture(const RAJA::util::PluginContext& p)
 
 void RuntimePluginLoader::postCapture(const RAJA::util::PluginContext& p)
 {
-  for (auto &plugin : plugins)
+  for (auto& plugin : plugins)
   {
     plugin->postCapture(p);
   }
@@ -59,7 +61,7 @@ void RuntimePluginLoader::postCapture(const RAJA::util::PluginContext& p)
 
 void RuntimePluginLoader::preLaunch(const RAJA::util::PluginContext& p)
 {
-  for (auto &plugin : plugins)
+  for (auto& plugin : plugins)
   {
     plugin->preLaunch(p);
   }
@@ -67,7 +69,7 @@ void RuntimePluginLoader::preLaunch(const RAJA::util::PluginContext& p)
 
 void RuntimePluginLoader::postLaunch(const RAJA::util::PluginContext& p)
 {
-  for (auto &plugin : plugins)
+  for (auto& plugin : plugins)
   {
     plugin->postLaunch(p);
   }
@@ -75,7 +77,7 @@ void RuntimePluginLoader::postLaunch(const RAJA::util::PluginContext& p)
 
 void RuntimePluginLoader::finalize()
 {
-  for (auto &plugin : plugins)
+  for (auto& plugin : plugins)
   {
     plugin->finalize();
   }
@@ -83,42 +85,44 @@ void RuntimePluginLoader::finalize()
 }
 
 // Initialize plugin from a shared object file specified by 'path'.
-void RuntimePluginLoader::initPlugin(const std::string &path)
+void RuntimePluginLoader::initPlugin(const std::string& path)
 {
-  #ifndef _WIN32
-  void *plugin = dlopen(path.c_str(), RTLD_NOW | RTLD_GLOBAL);
+#ifndef _WIN32
+  void* plugin = dlopen(path.c_str(), RTLD_NOW | RTLD_GLOBAL);
   if (!plugin)
   {
     printf("[RuntimePluginLoader]: dlopen failed: %s\n", dlerror());
   }
 
-  RuntimePluginLoader::Parent *(*getPlugin)() = (RuntimePluginLoader::Parent * (*)()) dlsym(plugin, "getPlugin");
+  RuntimePluginLoader::Parent* (*getPlugin)() =
+      (RuntimePluginLoader::Parent * (*)()) dlsym(plugin, "getPlugin");
 
   if (getPlugin)
   {
-    plugins.push_back(std::unique_ptr<RuntimePluginLoader::Parent>(getPlugin()));
+    plugins.push_back(
+        std::unique_ptr<RuntimePluginLoader::Parent>(getPlugin()));
   }
   else
   {
     printf("[RuntimePluginLoader]: dlsym failed: %s\n", dlerror());
   }
-  #else
+#else
   RAJA_UNUSED_ARG(path);
-  #endif
+#endif
 }
 
 // Initialize all plugins in a directory specified by 'path'.
-void RuntimePluginLoader::initDirectory(const std::string &path)
+void RuntimePluginLoader::initDirectory(const std::string& path)
 {
-  #ifndef _WIN32
+#ifndef _WIN32
   if (isSharedObject(path))
   {
     initPlugin(path);
     return;
   }
-  
-  DIR *dir;
-  struct dirent *file;
+
+  DIR* dir;
+  struct dirent* file;
 
   if ((dir = opendir(path.c_str())) != NULL)
   {
@@ -135,14 +139,16 @@ void RuntimePluginLoader::initDirectory(const std::string &path)
   {
     perror("[RuntimePluginLoader]: Could not open plugin directory");
   }
-  #else
+#else
   RAJA_UNUSED_ARG(path);
-  #endif
+#endif
 }
 
 void linkRuntimePluginLoader() {}
 
-} // end namespace util
-} // end namespace RAJA
+}  // end namespace util
+}  // end namespace RAJA
 
-static RAJA::util::PluginRegistry::add<RAJA::util::RuntimePluginLoader> P("RuntimePluginLoader", "Dynamically load RAJA plugins.");
+static RAJA::util::PluginRegistry::add<RAJA::util::RuntimePluginLoader> P(
+    "RuntimePluginLoader",
+    "Dynamically load RAJA plugins.");
diff --git a/src/TensorStats.cpp b/src/TensorStats.cpp
index b650b691f9..d34a6c8159 100644
--- a/src/TensorStats.cpp
+++ b/src/TensorStats.cpp
@@ -10,18 +10,18 @@
 
 int RAJA::tensor_stats::indent = 0;
 
-camp::idx_t RAJA::tensor_stats::num_vector_copy = 0;
-camp::idx_t RAJA::tensor_stats::num_vector_copy_ctor = 0;
+camp::idx_t RAJA::tensor_stats::num_vector_copy           = 0;
+camp::idx_t RAJA::tensor_stats::num_vector_copy_ctor      = 0;
 camp::idx_t RAJA::tensor_stats::num_vector_broadcast_ctor = 0;
 
-camp::idx_t RAJA::tensor_stats::num_vector_load_packed = 0;
-camp::idx_t RAJA::tensor_stats::num_vector_load_packed_n = 0;
-camp::idx_t RAJA::tensor_stats::num_vector_load_strided = 0;
+camp::idx_t RAJA::tensor_stats::num_vector_load_packed    = 0;
+camp::idx_t RAJA::tensor_stats::num_vector_load_packed_n  = 0;
+camp::idx_t RAJA::tensor_stats::num_vector_load_strided   = 0;
 camp::idx_t RAJA::tensor_stats::num_vector_load_strided_n = 0;
 
-camp::idx_t RAJA::tensor_stats::num_vector_store_packed = 0;
-camp::idx_t RAJA::tensor_stats::num_vector_store_packed_n = 0;
-camp::idx_t RAJA::tensor_stats::num_vector_store_strided = 0;
+camp::idx_t RAJA::tensor_stats::num_vector_store_packed    = 0;
+camp::idx_t RAJA::tensor_stats::num_vector_store_packed_n  = 0;
+camp::idx_t RAJA::tensor_stats::num_vector_store_strided   = 0;
 camp::idx_t RAJA::tensor_stats::num_vector_store_strided_n = 0;
 
 camp::idx_t RAJA::tensor_stats::num_vector_broadcast = 0;
@@ -29,38 +29,39 @@ camp::idx_t RAJA::tensor_stats::num_vector_broadcast = 0;
 camp::idx_t RAJA::tensor_stats::num_vector_get = 0;
 camp::idx_t RAJA::tensor_stats::num_vector_set = 0;
 
-camp::idx_t RAJA::tensor_stats::num_vector_add = 0;
+camp::idx_t RAJA::tensor_stats::num_vector_add      = 0;
 camp::idx_t RAJA::tensor_stats::num_vector_subtract = 0;
 camp::idx_t RAJA::tensor_stats::num_vector_multiply = 0;
-camp::idx_t RAJA::tensor_stats::num_vector_divide = 0;
+camp::idx_t RAJA::tensor_stats::num_vector_divide   = 0;
 
 camp::idx_t RAJA::tensor_stats::num_vector_fma = 0;
 camp::idx_t RAJA::tensor_stats::num_vector_fms = 0;
 
-camp::idx_t RAJA::tensor_stats::num_vector_sum = 0;
-camp::idx_t RAJA::tensor_stats::num_vector_max = 0;
-camp::idx_t RAJA::tensor_stats::num_vector_min = 0;
+camp::idx_t RAJA::tensor_stats::num_vector_sum  = 0;
+camp::idx_t RAJA::tensor_stats::num_vector_max  = 0;
+camp::idx_t RAJA::tensor_stats::num_vector_min  = 0;
 camp::idx_t RAJA::tensor_stats::num_vector_vmax = 0;
 camp::idx_t RAJA::tensor_stats::num_vector_vmin = 0;
-camp::idx_t RAJA::tensor_stats::num_vector_dot = 0;
+camp::idx_t RAJA::tensor_stats::num_vector_dot  = 0;
 
-camp::idx_t RAJA::tensor_stats::num_matrix_mm_mult_row_row = 0;
+camp::idx_t RAJA::tensor_stats::num_matrix_mm_mult_row_row    = 0;
 camp::idx_t RAJA::tensor_stats::num_matrix_mm_multacc_row_row = 0;
-camp::idx_t RAJA::tensor_stats::num_matrix_mm_mult_col_col = 0;
+camp::idx_t RAJA::tensor_stats::num_matrix_mm_mult_col_col    = 0;
 camp::idx_t RAJA::tensor_stats::num_matrix_mm_multacc_col_col = 0;
 
-void RAJA::tensor_stats::resetVectorStats(){
-  num_vector_copy = 0;
-  num_vector_copy_ctor = 0;
+void RAJA::tensor_stats::resetVectorStats()
+{
+  num_vector_copy           = 0;
+  num_vector_copy_ctor      = 0;
   num_vector_broadcast_ctor = 0;
 
-  num_vector_load_packed = 0;
-  num_vector_load_packed_n = 0;
-  num_vector_load_strided = 0;
-  num_vector_load_strided_n = 0;
-  num_vector_store_packed = 0;
-  num_vector_store_packed_n = 0;
-  num_vector_store_strided = 0;
+  num_vector_load_packed     = 0;
+  num_vector_load_packed_n   = 0;
+  num_vector_load_strided    = 0;
+  num_vector_load_strided_n  = 0;
+  num_vector_store_packed    = 0;
+  num_vector_store_packed_n  = 0;
+  num_vector_store_strided   = 0;
   num_vector_store_strided_n = 0;
 
   num_vector_broadcast = 0;
@@ -68,29 +69,34 @@ void RAJA::tensor_stats::resetVectorStats(){
   num_vector_get = 0;
   num_vector_set = 0;
 
-  num_vector_add = 0;
+  num_vector_add      = 0;
   num_vector_subtract = 0;
   num_vector_multiply = 0;
-  num_vector_divide = 0;
+  num_vector_divide   = 0;
 
-  num_vector_fma = 0;
-  num_vector_fms = 0;
-  num_vector_sum = 0;
-  num_vector_max = 0;
-  num_vector_min = 0;
+  num_vector_fma  = 0;
+  num_vector_fms  = 0;
+  num_vector_sum  = 0;
+  num_vector_max  = 0;
+  num_vector_min  = 0;
   num_vector_vmax = 0;
   num_vector_vmin = 0;
-  num_vector_dot = 0;
+  num_vector_dot  = 0;
 
-  num_matrix_mm_mult_row_row = 0;
+  num_matrix_mm_mult_row_row    = 0;
   num_matrix_mm_multacc_row_row = 0;
-  num_matrix_mm_mult_col_col = 0;
+  num_matrix_mm_mult_col_col    = 0;
   num_matrix_mm_multacc_col_col = 0;
 }
 
-#define PRINT_STAT(STAT) if(STAT){printf("  %-32s   %ld\n", #STAT, STAT);}
+#define PRINT_STAT(STAT)                                                       \
+  if (STAT)                                                                    \
+  {                                                                            \
+    printf("  %-32s   %ld\n", #STAT, STAT);                                    \
+  }
 
-void RAJA::tensor_stats::printVectorStats(){
+void RAJA::tensor_stats::printVectorStats()
+{
 
   printf("RAJA SIMD Register Statistics:\n");
 
@@ -129,5 +135,4 @@ void RAJA::tensor_stats::printVectorStats(){
   PRINT_STAT(num_matrix_mm_multacc_row_row);
   PRINT_STAT(num_matrix_mm_mult_col_col);
   PRINT_STAT(num_matrix_mm_multacc_col_col);
-
 }